Raw File
/*============================================================================
epub2txt v2
epub2txt.c
Copyright (c)2020-2024 Kevin Boone, GPL v3.0
============================================================================*/
#define _GNU_SOURCE 1
#define _POSIX_C_SOURCE 200809L // For strdup, realpath, mkdtemp, and other POSIX functions
#include <stdio.h>
#include <stdlib.h> // For mkdtemp, realpath, malloc, free, getenv
#include <string.h> // For strdup, strcmp, strerror, strrchr, strndup
#include <unistd.h> // For access, getpid
#include <sys/types.h>
#include <sys/stat.h>
#include <errno.h>
#include <stdarg.h> // Required for asprintf prototype and its usage
#include <limits.h> // For PATH_MAX
#ifndef __APPLE__
#include <malloc.h>
#endif
// Explicitly declare asprintf if the system headers aren't providing it
#if defined(__GNUC__) && !defined(asprintf)
extern int asprintf(char **strp, const char *fmt, ...)
__attribute__((__format__(__printf__, 2, 3)));
#elif !defined(asprintf)
extern int asprintf(char **strp, const char *fmt, ...);
#endif
// Explicitly declare mkdtemp if not found via stdlib.h with feature test macros
// This is unusual on macOS but will satisfy the "undeclared function" error.
#ifndef mkdtemp // Check if it's a macro first (unlikely for mkdtemp)
extern char *mkdtemp(char *template);
#endif
#include "epub2txt.h"
#include "log.h"
#include "list.h"
#include "custom_string.h"
#include "sxmlc.h"
#include "xhtml.h"
#include "util.h"
// APPNAME is defined by the Makefile compiler arguments, e.g., -DAPPNAME=\"epub2txt\"
static char *tempdir = NULL;
/*============================================================================
epub2txt_unescape_html
============================================================================*/
static char *epub2txt_unescape_html (const char *s)
{
typedef enum {MODE_ANY=0, MODE_AMP=1} Mode;
Mode mode = MODE_ANY;
String *out = string_create_empty();
WString *in = wstring_create_from_utf8(s);
WString *ent = wstring_create_empty();
int i, l = wstring_length (in);
for (i = 0; i < l; i++)
{
const uint32_t *ws = wstring_wstr (in);
uint32_t c = ws[i]; // Changed from int to uint32_t
if (mode == MODE_AMP)
{
if (c == ';')
{
WString *trans = xhtml_translate_entity (ent);
char *trans_utf8 = wstring_to_utf8 (trans);
string_append (out, trans_utf8);
free (trans_utf8);
wstring_destroy (trans);
wstring_clear (ent);
mode = MODE_ANY;
}
else
{
wstring_append_c (ent, c);
}
}
else
{
if (c == '&')
mode = MODE_AMP;
else
// Assuming string_append_c in custom_string.h correctly handles uint32_t for UTF-8
string_append_c (out, c);
}
}
wstring_destroy (ent);
wstring_destroy (in);
char *ret = strdup (string_cstr (out));
string_destroy (out);
return ret;
}
/*============================================================================
epub2txt_format_meta
============================================================================*/
static void epub2txt_format_meta (const Epub2TxtOptions *options,
const char *key, const char *text)
{
if (text)
{
char *ss = epub2txt_unescape_html (text);
char *s = NULL; // Initialize to NULL
asprintf (&s, "%s: %s", key, ss);
char *error = NULL;
xhtml_utf8_to_stdout (s, options, &error);
if (error) free (error);
if (s) free (s); // Check if s was allocated
free (ss);
}
}
/*============================================================================
epub2txt_dump_metadata
============================================================================*/
static List *epub2txt_dump_metadata (const char *opf_canonical_path,
const Epub2TxtOptions *options, char **error)
{
IN
List *ret = NULL;
String *buff = NULL;
if (string_create_from_utf8_file (opf_canonical_path, &buff, error))
{
const char *buff_cstr = string_cstr (buff);
log_debug ("Read OPF, size %d from %s", string_length (buff), opf_canonical_path);
XMLNode *metadata_node = NULL; // Renamed variable
XMLDoc doc;
XMLDoc_init (&doc);
if (XMLDoc_parse_buffer_DOM (buff_cstr, APPNAME, &doc))
{
XMLNode *root = XMLDoc_root (&doc);
if (root && root->children)
{
int i, l = root->n_children;
for (i = 0; i < l; i++)
{
XMLNode *r1 = root->children[i];
if (strcmp (r1->tag, "metadata") == 0 || strstr (r1->tag, ":metadata"))
{
metadata_node = r1;
if (metadata_node && metadata_node->children)
{
int j, l2 = metadata_node->n_children;
for (j = 0; j < l2; j++)
{
XMLNode *r2 = metadata_node->children[j];
const char *mdtag = r2->tag;
const char *mdtext = r2->text;
if (!mdtext) continue;
if (strstr (mdtag, "creator"))
epub2txt_format_meta (options, "Creator", mdtext);
else if (strstr (mdtag, "publisher"))
epub2txt_format_meta (options, "Publisher", mdtext);
else if (strstr (mdtag, "contributor"))
epub2txt_format_meta (options, "Contributor", mdtext);
else if (strstr (mdtag, "identifier"))
epub2txt_format_meta (options, "Identifier", mdtext);
else if (strstr (mdtag, "date"))
{
char *mdate = strdup (mdtext);
char *p = strchr (mdate, '-');
if (p) *p = 0;
epub2txt_format_meta (options, "Date", mdate);
free (mdate);
}
else if (strstr (mdtag, "description"))
epub2txt_format_meta (options, "Description", mdtext);
else if (strstr (mdtag, "subject"))
epub2txt_format_meta (options, "Subject", mdtext);
else if (strstr (mdtag, "language"))
epub2txt_format_meta (options, "Language", mdtext);
else if (strstr (mdtag, "title"))
epub2txt_format_meta (options, "Title", mdtext);
else if (strstr (mdtag, "meta") && options->calibre)
{
// More robust Calibre metadata parsing
char *meta_name_attr = NULL;
char *meta_content_attr = NULL;
int k, nattrs = r2->n_attributes;
for (k = 0; k < nattrs; k++) {
if (strcmp(r2->attributes[k].name, "name") == 0 || strcmp(r2->attributes[k].name, "property") == 0) {
meta_name_attr = r2->attributes[k].value;
} else if (strcmp(r2->attributes[k].name, "content") == 0) {
meta_content_attr = r2->attributes[k].value;
}
}
if (meta_name_attr && meta_content_attr) {
if (strcmp(meta_name_attr, "calibre:series") == 0) {
epub2txt_format_meta(options, "Calibre series", meta_content_attr);
} else if (strcmp(meta_name_attr, "calibre:series_index") == 0) {
char *s = strdup(meta_content_attr);
char *p = strchr(s, '.');
if (p) *p = 0;
epub2txt_format_meta(options, "Calibre series index", s);
free(s);
} else if (strcmp(meta_name_attr, "calibre:title_sort") == 0) {
epub2txt_format_meta(options, "Calibre title sort", meta_content_attr);
}
}
}
}
}
break; // Found metadata node
}
}
} else {
log_warning("Root element or its children are NULL in OPF: %s", opf_canonical_path);
}
XMLDoc_free (&doc);
}
else
{
// Error already contains "Can't parse OPF XML" from XMLDoc_parse_buffer_DOM
// or asprintf (error, "Can't parse OPF XML from %s", opf_canonical_path);
}
string_destroy (buff);
}
OUT
return ret;
}
/*============================================================================
epub2txt_get_items
============================================================================*/
List *epub2txt_get_items (const char *opf_canonical_path, char **error)
{
IN
List *ret = NULL;
String *buff = NULL;
if (string_create_from_utf8_file (opf_canonical_path, &buff, error))
{
const char *buff_cstr = string_cstr (buff);
log_debug ("Read OPF for spine items, size %d from %s", string_length (buff), opf_canonical_path);
BOOL got_manifest = FALSE;
XMLNode *manifest_node = NULL; // Renamed
XMLDoc doc;
XMLDoc_init (&doc);
if (XMLDoc_parse_buffer_DOM (buff_cstr, APPNAME, &doc))
{
XMLNode *root = XMLDoc_root (&doc);
int l_root_children = 0;
if (root && root->children)
{
l_root_children = root->n_children;
for (int i = 0; i < l_root_children; i++)
{
XMLNode *r1 = root->children[i];
if (strcmp (r1->tag, "manifest") == 0 || strstr (r1->tag, ":manifest"))
{
manifest_node = r1;
got_manifest = TRUE;
break;
}
}
}
else
{
log_warning ("'%s' has no root element or children -- corrupt EPUB?", opf_canonical_path);
}
if (!got_manifest || !manifest_node || !manifest_node->children)
{
asprintf (error, "File %s has no valid manifest or manifest children", opf_canonical_path);
string_destroy(buff);
XMLDoc_free(&doc);
OUT
return NULL;
}
ret = list_create_strings();
if (root && root->children)
{
for (int i = 0; i < l_root_children; i++)
{
XMLNode *r1 = root->children[i];
if (strcmp (r1->tag, "spine") == 0 || strstr (r1->tag, ":spine"))
{
if (r1->children)
{
int j, l_spine_children = r1->n_children;
for (j = 0; j < l_spine_children; j++)
{
XMLNode *itemref_node = r1->children[j]; // itemref
if (itemref_node->attributes)
{
int k, nattrs_itemref = itemref_node->n_attributes;
for (k = 0; k < nattrs_itemref; k++)
{
char *attr_name_itemref = itemref_node->attributes[k].name;
if (strcmp (attr_name_itemref, "idref") == 0)
{
char *idref_value = itemref_node->attributes[k].value;
int m, l_manifest_children = manifest_node->n_children;
for (m = 0; m < l_manifest_children; m++)
{
XMLNode *manifest_item_node = manifest_node->children[m];
if (manifest_item_node->attributes)
{
int n, nattrs_manifest_item = manifest_item_node->n_attributes;
for (n = 0; n < nattrs_manifest_item; n++)
{
char *attr_name_manifest = manifest_item_node->attributes[n].name;
char *attr_val_manifest = manifest_item_node->attributes[n].value;
if (strcmp (attr_name_manifest, "id") == 0 &&
idref_value != NULL && attr_val_manifest != NULL && // Add NULL checks
strcmp (attr_val_manifest, idref_value) == 0)
{
for (int p = 0; p < nattrs_manifest_item; p++)
{
if (strcmp (manifest_item_node->attributes[p].name, "href") == 0)
{
char *decoded_href = decode_url (manifest_item_node->attributes[p].value);
list_append (ret, decoded_href);
break;
}
}
break;
}
}
}
}
break;
}
}
}
}
}
break;
}
}
}
XMLDoc_free (&doc);
}
else
{
// Error from XMLDoc_parse_buffer_DOM
}
string_destroy (buff);
}
OUT
return ret;
}
/*============================================================================
epub2txt_get_root_file
============================================================================*/
String *epub2txt_get_root_file (const char *container_xml_path, char **error)
{
IN
String *ret = NULL;
String *buff = NULL;
if (string_create_from_utf8_file (container_xml_path, &buff, error))
{
const char *buff_cstr = string_cstr (buff);
log_debug ("Read container.xml, size %d from %s", string_length (buff), container_xml_path);
XMLDoc doc;
XMLDoc_init (&doc);
if (XMLDoc_parse_buffer_DOM (buff_cstr, APPNAME, &doc))
{
XMLNode *root = XMLDoc_root (&doc);
if (root && root->children)
{
int i, l = root->n_children;
for (i = 0; i < l; i++)
{
XMLNode *r1 = root->children[i];
if (strcmp (r1->tag, "rootfiles") == 0)
{
if (r1->children)
{
XMLNode *rootfiles_node = r1;
int j, l2 = rootfiles_node->n_children;
for (j = 0; j < l2; j++)
{
XMLNode *rootfile_node = rootfiles_node->children[j]; // Renamed
if (strcmp (rootfile_node->tag, "rootfile") == 0)
{
if (rootfile_node->attributes)
{
int k, nattrs = rootfile_node->n_attributes;
for (k = 0; k < nattrs; k++)
{
char *attr_name = rootfile_node->attributes[k].name;
char *attr_value = rootfile_node->attributes[k].value;
if (strcmp (attr_name, "full-path") == 0)
{
ret = string_create (attr_value);
break;
}
}
}
if (ret) break;
}
}
}
if (ret) break;
}
}
} else {
log_warning("Root element or its children are NULL in %s", container_xml_path);
}
if (ret == NULL) { // If still NULL after checking all children
// Avoid overwriting previous error from string_create_from_utf8_file or XMLDoc_parse_buffer_DOM
if (*error == NULL) {
asprintf (error, "%s does not specify a root file via full-path attribute", container_xml_path);
}
}
XMLDoc_free (&doc);
}
else
{
// Error from XMLDoc_parse_buffer_DOM, *error should be set
}
string_destroy (buff);
}
OUT
return ret;
}
/*============================================================================
epub2txt_cleanup
============================================================================*/
void epub2txt_cleanup (void)
{
if (tempdir)
{
log_debug ("Deleting temporary directory: %s", tempdir);
run_command ((const char *[]){"rm", "-rf", tempdir, NULL}, FALSE);
free (tempdir); // tempdir was allocated by strdup from template or asprintf
tempdir = NULL;
}
}
/*============================================================================
epub2txt_do_file
============================================================================*/
void epub2txt_do_file (const char *file, const Epub2TxtOptions *options,
char **error)
{
IN
*error = NULL;
log_debug ("epub2txt_do_file: %s", file);
if (access (file, R_OK) == 0)
{
log_debug ("File access OK");
char *tempbase;
if (!(tempbase = getenv("TMPDIR")) && !(tempbase = getenv("TMP")))
tempbase = "/tmp";
log_debug ("tempbase is: %s", tempbase);
if (tempdir != NULL) { // Should be cleaned up by atexit or previous call's end
log_warning("tempdir was not NULL (%s), implies prior cleanup issue or re-entry.", tempdir);
// Forcing cleanup here might be too aggressive if an atexit handler is also registered
// free(tempdir); tempdir = NULL; // Or call epub2txt_cleanup carefully.
}
char temp_dir_template[PATH_MAX]; // PATH_MAX from <limits.h>
// snprintf is safer than sprintf
snprintf(temp_dir_template, PATH_MAX, "%s/epub2txt.%d.XXXXXX", tempbase, getpid());
temp_dir_template[PATH_MAX - 1] = '\0'; // Ensure null termination if PATH_MAX is hit
if (mkdtemp(temp_dir_template) == NULL) { // mkdtemp is from <stdlib.h>
asprintf(error, "Can't create temporary directory using template %s: %s", temp_dir_template, strerror(errno));
return; // tempdir (global) is still NULL
}
tempdir = strdup(temp_dir_template); // Assign to global tempdir
if (tempdir == NULL) {
asprintf(error, "Failed to strdup temporary directory path: %s", strerror(errno));
// Attempt to remove the created directory if we can't store its path
rmdir(temp_dir_template); // Best effort, might fail if not empty
return;
}
log_debug ("tempdir created: %s", tempdir);
log_debug ("Running unzip command");
int unzip_status = run_command ((const char *[]){"unzip", "-o", "-qq", file, "-d", tempdir, NULL}, TRUE);
if (unzip_status != 0) {
asprintf(error, "Unzip command failed for %s with status %d", file, unzip_status);
epub2txt_cleanup(); // Clean up the created tempdir
return;
}
log_debug ("Unzip finished");
log_debug ("Fix permissions: %s", tempdir);
run_command((const char *[]){"chmod", "-R", "u+rwX,go+rX,go-w", tempdir, NULL}, FALSE);
log_debug ("Permissions fixed");
char *container_xml_path_str;
asprintf (&container_xml_path_str, "%s/META-INF/container.xml", tempdir);
if (!container_xml_path_str) { /* Malloc error */ *error = strdup("asprintf failed for container_xml_path"); epub2txt_cleanup(); return; }
log_debug ("Container.xml path is: %s", container_xml_path_str);
String *rootfile_relative_path = epub2txt_get_root_file (container_xml_path_str, error);
free(container_xml_path_str);
if (*error == NULL && rootfile_relative_path != NULL)
{
log_debug ("OPF rootfile relative path from container.xml: %s", string_cstr(rootfile_relative_path));
char *opf_constructed_path;
asprintf (&opf_constructed_path, "%s/%s", tempdir, string_cstr(rootfile_relative_path));
if (!opf_constructed_path) { /* Malloc error */ /* ... cleanup ... */ string_destroy(rootfile_relative_path); epub2txt_cleanup(); return; }
char *opf_canonical = realpath (opf_constructed_path, NULL);
free (opf_constructed_path);
char *tempdir_canonical_for_check = realpath(tempdir, NULL);
if (tempdir_canonical_for_check == NULL) {
asprintf(error, "Failed to resolve temporary directory path '%s': %s", tempdir, strerror(errno));
string_destroy(rootfile_relative_path);
if (opf_canonical) free(opf_canonical);
epub2txt_cleanup();
return;
}
if (opf_canonical == NULL || !is_subpath(tempdir_canonical_for_check, opf_canonical))
{
if (opf_canonical == NULL)
asprintf (error, "Bad OPF rootfile (relative: %s): realpath failed: %s", string_cstr(rootfile_relative_path), strerror (errno));
else
asprintf (error, "Bad OPF rootfile path \"%s\": outside EPUB container (resolved temp dir: %s)", opf_canonical, tempdir_canonical_for_check);
free(tempdir_canonical_for_check);
string_destroy(rootfile_relative_path);
if (opf_canonical) free(opf_canonical);
epub2txt_cleanup();
return;
}
free(tempdir_canonical_for_check);
log_debug("Canonical OPF path: %s", opf_canonical);
char *content_dir = strdup (opf_canonical);
if (!content_dir) { /* Malloc error */
asprintf(error, "strdup failed for content_dir");
string_destroy(rootfile_relative_path);
free(opf_canonical);
epub2txt_cleanup();
return;
}
char *last_slash = strrchr (content_dir, '/');
if (last_slash) {
*last_slash = '\0';
} else {
// This case means opf_canonical has no '/', which is unlikely for an absolute path
// unless it's in the root of the filesystem. Default to "." or a copy of tempdir.
free(content_dir);
content_dir = strdup(tempdir); // Content is in the root of the temp extraction
if (!content_dir) { /* Malloc error */ /* ... cleanup ...*/ string_destroy(rootfile_relative_path); free(opf_canonical); epub2txt_cleanup(); return; }
}
log_debug ("Content directory is: %s", content_dir);
if (options->meta)
{
epub2txt_dump_metadata (opf_canonical, options, error);
if (*error)
{
log_warning ("Error during metadata dump: %s (continuing with text)", *error);
free (*error);
*error = NULL;
}
}
if (!options->notext && *error == NULL)
{
List *spine_items = epub2txt_get_items (opf_canonical, error);
if (*error == NULL && spine_items != NULL)
{
log_debug ("EPUB spine has %d items", list_length (spine_items));
int i, l = list_length (spine_items);
for (i = 0; i < l; i++)
{
const char *item_rel_path = (const char *)list_get (spine_items, i);
char *item_constr_path;
asprintf (&item_constr_path, "%s/%s", content_dir, item_rel_path);
if (!item_constr_path) { /* Malloc error */ continue; } // Skip item
char *item_canon_path = realpath (item_constr_path, NULL);
free (item_constr_path);
if (item_canon_path == NULL || !is_subpath (content_dir, item_canon_path))
{
if (item_canon_path == NULL)
log_warning ("Skipping EPUB spine item \"%s\": invalid path (realpath: %s)",
item_rel_path, strerror(errno));
else
log_warning ("Skipping EPUB spine item \"%s\" (%s): outside content directory (%s)",
item_rel_path, item_canon_path, content_dir);
if(item_canon_path) free(item_canon_path);
continue;
}
if (options->section_separator)
printf ("%s\n", options->section_separator);
xhtml_file_to_stdout (item_canon_path, options, error);
free(item_canon_path);
if (*error) {
log_warning("Error processing spine item %s: %s (continuing)", item_rel_path, *error);
free(*error);
*error = NULL;
}
}
list_destroy (spine_items);
}
else if (*error) {
log_warning("Could not get spine items: %s", *error);
// *error is kept for main to report
} else { // spine_items is NULL but no error
log_warning("Spine items list is NULL but no specific error reported by epub2txt_get_items.");
}
}
free (content_dir);
free (opf_canonical);
}
else if (*error) {
// Error from epub2txt_get_root_file or rootfile_relative_path is NULL
// *error is already set
} else { // rootfile_relative_path is NULL, but no *error set by epub2txt_get_root_file
asprintf(error, "Failed to get OPF root file path from container.xml (it was NULL).");
}
if (rootfile_relative_path) string_destroy (rootfile_relative_path);
epub2txt_cleanup();
}
else
{
asprintf (error, "File not found or not readable: %s", file);
}
OUT
}
Generated by GNU Enscript 1.6.6, and GophHub 1.3.