/* Raw File */
1 /*============================================================================
2 epub2txt v2
3 epub2txt.c
4 Copyright (c)2020-2024 Kevin Boone, GPL v3.0
5 ============================================================================*/
6
7 #define _GNU_SOURCE 1
8 #define _POSIX_C_SOURCE 200809L // For strdup, realpath, mkdtemp, and other POSIX functions
9
10 #include <stdio.h>
11 #include <stdlib.h> // For mkdtemp, realpath, malloc, free, getenv
12 #include <string.h> // For strdup, strcmp, strerror, strrchr, strndup
13 #include <unistd.h> // For access, getpid
14 #include <sys/types.h>
15 #include <sys/stat.h>
16 #include <errno.h>
17 #include <stdarg.h> // Required for asprintf prototype and its usage
18 #include <limits.h> // For PATH_MAX
19
20 #ifndef __APPLE__
21 #include <malloc.h>
22 #endif
23
// Fallback prototype for asprintf, for systems whose headers do not expose it
// under the feature-test macros defined above. The GCC-compatible variant
// additionally requests printf-style checking of the format arguments.
#if defined(__GNUC__) && !defined(asprintf)
extern int asprintf(char **strp, const char *fmt, ...)
__attribute__((__format__(__printf__, 2, 3)));
#elif !defined(asprintf)
extern int asprintf(char **strp, const char *fmt, ...);
#endif

// Fallback prototype for mkdtemp, for the case where <stdlib.h> does not
// declare it under the feature-test macros above (reported on macOS as an
// "undeclared function" error).
#ifndef mkdtemp // Skip the prototype only if mkdtemp is a macro (unlikely)
extern char *mkdtemp(char *template);
#endif
37
38 #include "epub2txt.h"
39 #include "log.h"
40 #include "list.h"
41
42 #include "custom_string.h"
43 #include "sxmlc.h"
44 #include "xhtml.h"
45 #include "util.h"
46
47 // APPNAME is defined by the Makefile compiler arguments, e.g., -DAPPNAME=\"epub2txt\"
48
// Path of the temporary directory that holds the unpacked EPUB, or NULL when
// there is none. epub2txt_do_file() stores a heap copy (strdup) here;
// epub2txt_cleanup() deletes the directory, frees the string and resets it.
static char *tempdir = NULL;
50
51 /*============================================================================
52 epub2txt_unescape_html
53 ============================================================================*/
54 static char *epub2txt_unescape_html (const char *s)
55 {
56 typedef enum {MODE_ANY=0, MODE_AMP=1} Mode;
57 Mode mode = MODE_ANY;
58 String *out = string_create_empty();
59 WString *in = wstring_create_from_utf8(s);
60 WString *ent = wstring_create_empty();
61 int i, l = wstring_length (in);
62 for (i = 0; i < l; i++)
63 {
64 const uint32_t *ws = wstring_wstr (in);
65 uint32_t c = ws[i]; // Changed from int to uint32_t
66 if (mode == MODE_AMP)
67 {
68 if (c == ';')
69 {
70 WString *trans = xhtml_translate_entity (ent);
71 char *trans_utf8 = wstring_to_utf8 (trans);
72 string_append (out, trans_utf8);
73 free (trans_utf8);
74 wstring_destroy (trans);
75 wstring_clear (ent);
76 mode = MODE_ANY;
77 }
78 else
79 {
80 wstring_append_c (ent, c);
81 }
82 }
83 else
84 {
85 if (c == '&')
86 mode = MODE_AMP;
87 else
88 // Assuming string_append_c in custom_string.h correctly handles uint32_t for UTF-8
89 string_append_c (out, c);
90 }
91 }
92 wstring_destroy (ent);
93 wstring_destroy (in);
94 char *ret = strdup (string_cstr (out));
95 string_destroy (out);
96 return ret;
97 }
98
99 /*============================================================================
100 epub2txt_format_meta
101 ============================================================================*/
102 static void epub2txt_format_meta (const Epub2TxtOptions *options,
103 const char *key, const char *text)
104 {
105 if (text)
106 {
107 char *ss = epub2txt_unescape_html (text);
108 char *s = NULL; // Initialize to NULL
109 asprintf (&s, "%s: %s", key, ss);
110 char *error = NULL;
111 xhtml_utf8_to_stdout (s, options, &error);
112 if (error) free (error);
113 if (s) free (s); // Check if s was allocated
114 free (ss);
115 }
116 }
117
118 /*============================================================================
119 epub2txt_dump_metadata
120 ============================================================================*/
121 static List *epub2txt_dump_metadata (const char *opf_canonical_path,
122 const Epub2TxtOptions *options, char **error)
123 {
124 IN
125 List *ret = NULL;
126 String *buff = NULL;
127 if (string_create_from_utf8_file (opf_canonical_path, &buff, error))
128 {
129 const char *buff_cstr = string_cstr (buff);
130 log_debug ("Read OPF, size %d from %s", string_length (buff), opf_canonical_path);
131 XMLNode *metadata_node = NULL; // Renamed variable
132 XMLDoc doc;
133 XMLDoc_init (&doc);
134 if (XMLDoc_parse_buffer_DOM (buff_cstr, APPNAME, &doc))
135 {
136 XMLNode *root = XMLDoc_root (&doc);
137 if (root && root->children)
138 {
139 int i, l = root->n_children;
140 for (i = 0; i < l; i++)
141 {
142 XMLNode *r1 = root->children[i];
143 if (strcmp (r1->tag, "metadata") == 0 || strstr (r1->tag, ":metadata"))
144 {
145 metadata_node = r1;
146 if (metadata_node && metadata_node->children)
147 {
148 int j, l2 = metadata_node->n_children;
149 for (j = 0; j < l2; j++)
150 {
151 XMLNode *r2 = metadata_node->children[j];
152 const char *mdtag = r2->tag;
153 const char *mdtext = r2->text;
154 if (!mdtext) continue;
155 if (strstr (mdtag, "creator"))
156 epub2txt_format_meta (options, "Creator", mdtext);
157 else if (strstr (mdtag, "publisher"))
158 epub2txt_format_meta (options, "Publisher", mdtext);
159 else if (strstr (mdtag, "contributor"))
160 epub2txt_format_meta (options, "Contributor", mdtext);
161 else if (strstr (mdtag, "identifier"))
162 epub2txt_format_meta (options, "Identifier", mdtext);
163 else if (strstr (mdtag, "date"))
164 {
165 char *mdate = strdup (mdtext);
166 char *p = strchr (mdate, '-');
167 if (p) *p = 0;
168 epub2txt_format_meta (options, "Date", mdate);
169 free (mdate);
170 }
171 else if (strstr (mdtag, "description"))
172 epub2txt_format_meta (options, "Description", mdtext);
173 else if (strstr (mdtag, "subject"))
174 epub2txt_format_meta (options, "Subject", mdtext);
175 else if (strstr (mdtag, "language"))
176 epub2txt_format_meta (options, "Language", mdtext);
177 else if (strstr (mdtag, "title"))
178 epub2txt_format_meta (options, "Title", mdtext);
179 else if (strstr (mdtag, "meta") && options->calibre)
180 {
181 // More robust Calibre metadata parsing
182 char *meta_name_attr = NULL;
183 char *meta_content_attr = NULL;
184 int k, nattrs = r2->n_attributes;
185
186 for (k = 0; k < nattrs; k++) {
187 if (strcmp(r2->attributes[k].name, "name") == 0 || strcmp(r2->attributes[k].name, "property") == 0) {
188 meta_name_attr = r2->attributes[k].value;
189 } else if (strcmp(r2->attributes[k].name, "content") == 0) {
190 meta_content_attr = r2->attributes[k].value;
191 }
192 }
193
194 if (meta_name_attr && meta_content_attr) {
195 if (strcmp(meta_name_attr, "calibre:series") == 0) {
196 epub2txt_format_meta(options, "Calibre series", meta_content_attr);
197 } else if (strcmp(meta_name_attr, "calibre:series_index") == 0) {
198 char *s = strdup(meta_content_attr);
199 char *p = strchr(s, '.');
200 if (p) *p = 0;
201 epub2txt_format_meta(options, "Calibre series index", s);
202 free(s);
203 } else if (strcmp(meta_name_attr, "calibre:title_sort") == 0) {
204 epub2txt_format_meta(options, "Calibre title sort", meta_content_attr);
205 }
206 }
207 }
208 }
209 }
210 break; // Found metadata node
211 }
212 }
213 } else {
214 log_warning("Root element or its children are NULL in OPF: %s", opf_canonical_path);
215 }
216 XMLDoc_free (&doc);
217 }
218 else
219 {
220 // Error already contains "Can't parse OPF XML" from XMLDoc_parse_buffer_DOM
221 // or asprintf (error, "Can't parse OPF XML from %s", opf_canonical_path);
222 }
223 string_destroy (buff);
224 }
225 OUT
226 return ret;
227 }
228
229 /*============================================================================
230 epub2txt_get_items
231 ============================================================================*/
232 List *epub2txt_get_items (const char *opf_canonical_path, char **error)
233 {
234 IN
235 List *ret = NULL;
236 String *buff = NULL;
237 if (string_create_from_utf8_file (opf_canonical_path, &buff, error))
238 {
239 const char *buff_cstr = string_cstr (buff);
240 log_debug ("Read OPF for spine items, size %d from %s", string_length (buff), opf_canonical_path);
241 BOOL got_manifest = FALSE;
242 XMLNode *manifest_node = NULL; // Renamed
243 XMLDoc doc;
244 XMLDoc_init (&doc);
245 if (XMLDoc_parse_buffer_DOM (buff_cstr, APPNAME, &doc))
246 {
247 XMLNode *root = XMLDoc_root (&doc);
248 int l_root_children = 0;
249
250 if (root && root->children)
251 {
252 l_root_children = root->n_children;
253 for (int i = 0; i < l_root_children; i++)
254 {
255 XMLNode *r1 = root->children[i];
256 if (strcmp (r1->tag, "manifest") == 0 || strstr (r1->tag, ":manifest"))
257 {
258 manifest_node = r1;
259 got_manifest = TRUE;
260 break;
261 }
262 }
263 }
264 else
265 {
266 log_warning ("'%s' has no root element or children -- corrupt EPUB?", opf_canonical_path);
267 }
268
269 if (!got_manifest || !manifest_node || !manifest_node->children)
270 {
271 asprintf (error, "File %s has no valid manifest or manifest children", opf_canonical_path);
272 string_destroy(buff);
273 XMLDoc_free(&doc);
274 OUT
275 return NULL;
276 }
277
278 ret = list_create_strings();
279
280 if (root && root->children)
281 {
282 for (int i = 0; i < l_root_children; i++)
283 {
284 XMLNode *r1 = root->children[i];
285 if (strcmp (r1->tag, "spine") == 0 || strstr (r1->tag, ":spine"))
286 {
287 if (r1->children)
288 {
289 int j, l_spine_children = r1->n_children;
290 for (j = 0; j < l_spine_children; j++)
291 {
292 XMLNode *itemref_node = r1->children[j]; // itemref
293 if (itemref_node->attributes)
294 {
295 int k, nattrs_itemref = itemref_node->n_attributes;
296 for (k = 0; k < nattrs_itemref; k++)
297 {
298 char *attr_name_itemref = itemref_node->attributes[k].name;
299 if (strcmp (attr_name_itemref, "idref") == 0)
300 {
301 char *idref_value = itemref_node->attributes[k].value;
302 int m, l_manifest_children = manifest_node->n_children;
303 for (m = 0; m < l_manifest_children; m++)
304 {
305 XMLNode *manifest_item_node = manifest_node->children[m];
306 if (manifest_item_node->attributes)
307 {
308 int n, nattrs_manifest_item = manifest_item_node->n_attributes;
309 for (n = 0; n < nattrs_manifest_item; n++)
310 {
311 char *attr_name_manifest = manifest_item_node->attributes[n].name;
312 char *attr_val_manifest = manifest_item_node->attributes[n].value;
313 if (strcmp (attr_name_manifest, "id") == 0 &&
314 idref_value != NULL && attr_val_manifest != NULL && // Add NULL checks
315 strcmp (attr_val_manifest, idref_value) == 0)
316 {
317 for (int p = 0; p < nattrs_manifest_item; p++)
318 {
319 if (strcmp (manifest_item_node->attributes[p].name, "href") == 0)
320 {
321 char *decoded_href = decode_url (manifest_item_node->attributes[p].value);
322 list_append (ret, decoded_href);
323 break;
324 }
325 }
326 break;
327 }
328 }
329 }
330 }
331 break;
332 }
333 }
334 }
335 }
336 }
337 break;
338 }
339 }
340 }
341 XMLDoc_free (&doc);
342 }
343 else
344 {
345 // Error from XMLDoc_parse_buffer_DOM
346 }
347 string_destroy (buff);
348 }
349 OUT
350 return ret;
351 }
352
353 /*============================================================================
354 epub2txt_get_root_file
355 ============================================================================*/
356 String *epub2txt_get_root_file (const char *container_xml_path, char **error)
357 {
358 IN
359 String *ret = NULL;
360 String *buff = NULL;
361 if (string_create_from_utf8_file (container_xml_path, &buff, error))
362 {
363 const char *buff_cstr = string_cstr (buff);
364 log_debug ("Read container.xml, size %d from %s", string_length (buff), container_xml_path);
365 XMLDoc doc;
366 XMLDoc_init (&doc);
367 if (XMLDoc_parse_buffer_DOM (buff_cstr, APPNAME, &doc))
368 {
369 XMLNode *root = XMLDoc_root (&doc);
370 if (root && root->children)
371 {
372 int i, l = root->n_children;
373 for (i = 0; i < l; i++)
374 {
375 XMLNode *r1 = root->children[i];
376 if (strcmp (r1->tag, "rootfiles") == 0)
377 {
378 if (r1->children)
379 {
380 XMLNode *rootfiles_node = r1;
381 int j, l2 = rootfiles_node->n_children;
382 for (j = 0; j < l2; j++)
383 {
384 XMLNode *rootfile_node = rootfiles_node->children[j]; // Renamed
385 if (strcmp (rootfile_node->tag, "rootfile") == 0)
386 {
387 if (rootfile_node->attributes)
388 {
389 int k, nattrs = rootfile_node->n_attributes;
390 for (k = 0; k < nattrs; k++)
391 {
392 char *attr_name = rootfile_node->attributes[k].name;
393 char *attr_value = rootfile_node->attributes[k].value;
394 if (strcmp (attr_name, "full-path") == 0)
395 {
396 ret = string_create (attr_value);
397 break;
398 }
399 }
400 }
401 if (ret) break;
402 }
403 }
404 }
405 if (ret) break;
406 }
407 }
408 } else {
409 log_warning("Root element or its children are NULL in %s", container_xml_path);
410 }
411 if (ret == NULL) { // If still NULL after checking all children
412 // Avoid overwriting previous error from string_create_from_utf8_file or XMLDoc_parse_buffer_DOM
413 if (*error == NULL) {
414 asprintf (error, "%s does not specify a root file via full-path attribute", container_xml_path);
415 }
416 }
417 XMLDoc_free (&doc);
418 }
419 else
420 {
421 // Error from XMLDoc_parse_buffer_DOM, *error should be set
422 }
423 string_destroy (buff);
424 }
425 OUT
426 return ret;
427 }
428
429 /*============================================================================
430 epub2txt_cleanup
431 ============================================================================*/
432 void epub2txt_cleanup (void)
433 {
434 if (tempdir)
435 {
436 log_debug ("Deleting temporary directory: %s", tempdir);
437 run_command ((const char *[]){"rm", "-rf", tempdir, NULL}, FALSE);
438 free (tempdir); // tempdir was allocated by strdup from template or asprintf
439 tempdir = NULL;
440 }
441 }
442
443 /*============================================================================
444 epub2txt_do_file
445 ============================================================================*/
446 void epub2txt_do_file (const char *file, const Epub2TxtOptions *options,
447 char **error)
448 {
449 IN
450 *error = NULL;
451
452 log_debug ("epub2txt_do_file: %s", file);
453 if (access (file, R_OK) == 0)
454 {
455 log_debug ("File access OK");
456
457 char *tempbase;
458 if (!(tempbase = getenv("TMPDIR")) && !(tempbase = getenv("TMP")))
459 tempbase = "/tmp";
460 log_debug ("tempbase is: %s", tempbase);
461
462 if (tempdir != NULL) { // Should be cleaned up by atexit or previous call's end
463 log_warning("tempdir was not NULL (%s), implies prior cleanup issue or re-entry.", tempdir);
464 // Forcing cleanup here might be too aggressive if an atexit handler is also registered
465 // free(tempdir); tempdir = NULL; // Or call epub2txt_cleanup carefully.
466 }
467
468 char temp_dir_template[PATH_MAX]; // PATH_MAX from <limits.h>
469 // snprintf is safer than sprintf
470 snprintf(temp_dir_template, PATH_MAX, "%s/epub2txt.%d.XXXXXX", tempbase, getpid());
471 temp_dir_template[PATH_MAX - 1] = '\0'; // Ensure null termination if PATH_MAX is hit
472
473 if (mkdtemp(temp_dir_template) == NULL) { // mkdtemp is from <stdlib.h>
474 asprintf(error, "Can't create temporary directory using template %s: %s", temp_dir_template, strerror(errno));
475 return; // tempdir (global) is still NULL
476 }
477 tempdir = strdup(temp_dir_template); // Assign to global tempdir
478 if (tempdir == NULL) {
479 asprintf(error, "Failed to strdup temporary directory path: %s", strerror(errno));
480 // Attempt to remove the created directory if we can't store its path
481 rmdir(temp_dir_template); // Best effort, might fail if not empty
482 return;
483 }
484 log_debug ("tempdir created: %s", tempdir);
485
486 log_debug ("Running unzip command");
487 int unzip_status = run_command ((const char *[]){"unzip", "-o", "-qq", file, "-d", tempdir, NULL}, TRUE);
488 if (unzip_status != 0) {
489 asprintf(error, "Unzip command failed for %s with status %d", file, unzip_status);
490 epub2txt_cleanup(); // Clean up the created tempdir
491 return;
492 }
493
494 log_debug ("Unzip finished");
495 log_debug ("Fix permissions: %s", tempdir);
496 run_command((const char *[]){"chmod", "-R", "u+rwX,go+rX,go-w", tempdir, NULL}, FALSE);
497 log_debug ("Permissions fixed");
498
499 char *container_xml_path_str;
500 asprintf (&container_xml_path_str, "%s/META-INF/container.xml", tempdir);
501 if (!container_xml_path_str) { /* Malloc error */ *error = strdup("asprintf failed for container_xml_path"); epub2txt_cleanup(); return; }
502 log_debug ("Container.xml path is: %s", container_xml_path_str);
503
504 String *rootfile_relative_path = epub2txt_get_root_file (container_xml_path_str, error);
505 free(container_xml_path_str);
506
507 if (*error == NULL && rootfile_relative_path != NULL)
508 {
509 log_debug ("OPF rootfile relative path from container.xml: %s", string_cstr(rootfile_relative_path));
510
511 char *opf_constructed_path;
512 asprintf (&opf_constructed_path, "%s/%s", tempdir, string_cstr(rootfile_relative_path));
513 if (!opf_constructed_path) { /* Malloc error */ /* ... cleanup ... */ string_destroy(rootfile_relative_path); epub2txt_cleanup(); return; }
514
515 char *opf_canonical = realpath (opf_constructed_path, NULL);
516 free (opf_constructed_path);
517
518 char *tempdir_canonical_for_check = realpath(tempdir, NULL);
519 if (tempdir_canonical_for_check == NULL) {
520 asprintf(error, "Failed to resolve temporary directory path '%s': %s", tempdir, strerror(errno));
521 string_destroy(rootfile_relative_path);
522 if (opf_canonical) free(opf_canonical);
523 epub2txt_cleanup();
524 return;
525 }
526
527 if (opf_canonical == NULL || !is_subpath(tempdir_canonical_for_check, opf_canonical))
528 {
529 if (opf_canonical == NULL)
530 asprintf (error, "Bad OPF rootfile (relative: %s): realpath failed: %s", string_cstr(rootfile_relative_path), strerror (errno));
531 else
532 asprintf (error, "Bad OPF rootfile path \"%s\": outside EPUB container (resolved temp dir: %s)", opf_canonical, tempdir_canonical_for_check);
533
534 free(tempdir_canonical_for_check);
535 string_destroy(rootfile_relative_path);
536 if (opf_canonical) free(opf_canonical);
537 epub2txt_cleanup();
538 return;
539 }
540 free(tempdir_canonical_for_check);
541
542 log_debug("Canonical OPF path: %s", opf_canonical);
543
544 char *content_dir = strdup (opf_canonical);
545 if (!content_dir) { /* Malloc error */
546 asprintf(error, "strdup failed for content_dir");
547 string_destroy(rootfile_relative_path);
548 free(opf_canonical);
549 epub2txt_cleanup();
550 return;
551 }
552 char *last_slash = strrchr (content_dir, '/');
553 if (last_slash) {
554 *last_slash = '\0';
555 } else {
556 // This case means opf_canonical has no '/', which is unlikely for an absolute path
557 // unless it's in the root of the filesystem. Default to "." or a copy of tempdir.
558 free(content_dir);
559 content_dir = strdup(tempdir); // Content is in the root of the temp extraction
560 if (!content_dir) { /* Malloc error */ /* ... cleanup ...*/ string_destroy(rootfile_relative_path); free(opf_canonical); epub2txt_cleanup(); return; }
561 }
562 log_debug ("Content directory is: %s", content_dir);
563
564 if (options->meta)
565 {
566 epub2txt_dump_metadata (opf_canonical, options, error);
567 if (*error)
568 {
569 log_warning ("Error during metadata dump: %s (continuing with text)", *error);
570 free (*error);
571 *error = NULL;
572 }
573 }
574
575 if (!options->notext && *error == NULL)
576 {
577 List *spine_items = epub2txt_get_items (opf_canonical, error);
578 if (*error == NULL && spine_items != NULL)
579 {
580 log_debug ("EPUB spine has %d items", list_length (spine_items));
581 int i, l = list_length (spine_items);
582 for (i = 0; i < l; i++)
583 {
584 const char *item_rel_path = (const char *)list_get (spine_items, i);
585 char *item_constr_path;
586 asprintf (&item_constr_path, "%s/%s", content_dir, item_rel_path);
587 if (!item_constr_path) { /* Malloc error */ continue; } // Skip item
588
589 char *item_canon_path = realpath (item_constr_path, NULL);
590 free (item_constr_path);
591
592 if (item_canon_path == NULL || !is_subpath (content_dir, item_canon_path))
593 {
594 if (item_canon_path == NULL)
595 log_warning ("Skipping EPUB spine item \"%s\": invalid path (realpath: %s)",
596 item_rel_path, strerror(errno));
597 else
598 log_warning ("Skipping EPUB spine item \"%s\" (%s): outside content directory (%s)",
599 item_rel_path, item_canon_path, content_dir);
600 if(item_canon_path) free(item_canon_path);
601 continue;
602 }
603
604 if (options->section_separator)
605 printf ("%s\n", options->section_separator);
606
607 xhtml_file_to_stdout (item_canon_path, options, error);
608 free(item_canon_path);
609 if (*error) {
610 log_warning("Error processing spine item %s: %s (continuing)", item_rel_path, *error);
611 free(*error);
612 *error = NULL;
613 }
614 }
615 list_destroy (spine_items);
616 }
617 else if (*error) {
618 log_warning("Could not get spine items: %s", *error);
619 // *error is kept for main to report
620 } else { // spine_items is NULL but no error
621 log_warning("Spine items list is NULL but no specific error reported by epub2txt_get_items.");
622 }
623 }
624 free (content_dir);
625 free (opf_canonical);
626 }
627 else if (*error) {
628 // Error from epub2txt_get_root_file or rootfile_relative_path is NULL
629 // *error is already set
630 } else { // rootfile_relative_path is NULL, but no *error set by epub2txt_get_root_file
631 asprintf(error, "Failed to get OPF root file path from container.xml (it was NULL).");
632 }
633
634 if (rootfile_relative_path) string_destroy (rootfile_relative_path);
635 epub2txt_cleanup();
636 }
637 else
638 {
639 asprintf (error, "File not found or not readable: %s", file);
640 }
641
642 OUT
643 }
/* Generated by GNU Enscript 1.6.6, and GophHub 1.3. */