1 /*============================================================================ 2 epub2txt v2 3 epub2txt.c 4 Copyright (c)2020-2024 Kevin Boone, GPL v3.0 5 ============================================================================*/ 6 7 #define _GNU_SOURCE 1 8 #define _POSIX_C_SOURCE 200809L // For strdup, realpath, mkdtemp, and other POSIX functions 9 10 #include 11 #include // For mkdtemp, realpath, malloc, free, getenv 12 #include // For strdup, strcmp, strerror, strrchr, strndup 13 #include // For access, getpid 14 #include 15 #include 16 #include 17 #include // Required for asprintf prototype and its usage 18 #include // For PATH_MAX 19 20 #ifndef __APPLE__ 21 #include 22 #endif 23 24 // Explicitly declare asprintf if the system headers aren't providing it 25 #if defined(__GNUC__) && !defined(asprintf) 26 extern int asprintf(char **strp, const char *fmt, ...) 27 __attribute__((__format__(__printf__, 2, 3))); 28 #elif !defined(asprintf) 29 extern int asprintf(char **strp, const char *fmt, ...); 30 #endif 31 32 // Explicitly declare mkdtemp if not found via stdlib.h with feature test macros 33 // This is unusual on macOS but will satisfy the "undeclared function" error. 34 #ifndef mkdtemp // Check if it's a macro first (unlikely for mkdtemp) 35 extern char *mkdtemp(char *template); 36 #endif 37 38 #include "epub2txt.h" 39 #include "log.h" 40 #include "list.h" 41 42 #include "custom_string.h" 43 #include "sxmlc.h" 44 #include "xhtml.h" 45 #include "util.h" 46 47 // APPNAME is defined by the Makefile compiler arguments, e.g., -DAPPNAME=\"epub2txt\" 48 49 static char *tempdir = NULL; 50 51 /*============================================================================ 52 epub2txt_unescape_html 53 ============================================================================*/ 54 static char *epub2txt_unescape_html (const char *s) 55 { 56 typedef enum {MODE_ANY=0, MODE_AMP=1} Mode; 57 Mode mode = MODE_ANY; 58 String *out = string_create_empty(); 59 WString *in = wstring_create_from_utf8(s); 60 WString *ent = wstring_create_empty(); 61 int i, l = wstring_length (in); 62 for (i = 0; i < l; i++) 63 { 64 const uint32_t *ws = wstring_wstr (in); 65 uint32_t c = ws[i]; // Changed from int to uint32_t 66 if (mode == MODE_AMP) 67 { 68 if (c == ';') 69 { 70 WString *trans = xhtml_translate_entity (ent); 71 char *trans_utf8 = wstring_to_utf8 (trans); 72 string_append (out, trans_utf8); 73 free (trans_utf8); 74 wstring_destroy (trans); 75 wstring_clear (ent); 76 mode = MODE_ANY; 77 } 78 else 79 { 80 wstring_append_c (ent, c); 81 } 82 } 83 else 84 { 85 if (c == '&') 86 mode = MODE_AMP; 87 else 88 // Assuming string_append_c in custom_string.h correctly handles uint32_t for UTF-8 89 string_append_c (out, c); 90 } 91 } 92 wstring_destroy (ent); 93 wstring_destroy (in); 94 char *ret = strdup (string_cstr (out)); 95 string_destroy (out); 96 return ret; 97 } 98 99 /*============================================================================ 100 epub2txt_format_meta 101 ============================================================================*/ 102 static void epub2txt_format_meta (const Epub2TxtOptions *options, 103 const char *key, const char *text) 104 { 105 if (text) 106 { 107 char *ss = epub2txt_unescape_html (text); 108 char *s = NULL; // Initialize to NULL 109 asprintf (&s, "%s: %s", key, ss); 110 char *error = NULL; 111 xhtml_utf8_to_stdout (s, options, &error); 112 if (error) free (error); 113 if (s) free (s); // Check if s was allocated 114 free (ss); 115 } 116 } 117 118 /*============================================================================ 119 epub2txt_dump_metadata 120 ============================================================================*/ 121 static List *epub2txt_dump_metadata (const char *opf_canonical_path, 122 const Epub2TxtOptions *options, char **error) 123 { 124 IN 125 List *ret = NULL; 126 String *buff = NULL; 127 if (string_create_from_utf8_file (opf_canonical_path, &buff, error)) 128 { 129 const char *buff_cstr = string_cstr (buff); 130 log_debug ("Read OPF, size %d from %s", string_length (buff), opf_canonical_path); 131 XMLNode *metadata_node = NULL; // Renamed variable 132 XMLDoc doc; 133 XMLDoc_init (&doc); 134 if (XMLDoc_parse_buffer_DOM (buff_cstr, APPNAME, &doc)) 135 { 136 XMLNode *root = XMLDoc_root (&doc); 137 if (root && root->children) 138 { 139 int i, l = root->n_children; 140 for (i = 0; i < l; i++) 141 { 142 XMLNode *r1 = root->children[i]; 143 if (strcmp (r1->tag, "metadata") == 0 || strstr (r1->tag, ":metadata")) 144 { 145 metadata_node = r1; 146 if (metadata_node && metadata_node->children) 147 { 148 int j, l2 = metadata_node->n_children; 149 for (j = 0; j < l2; j++) 150 { 151 XMLNode *r2 = metadata_node->children[j]; 152 const char *mdtag = r2->tag; 153 const char *mdtext = r2->text; 154 if (!mdtext) continue; 155 if (strstr (mdtag, "creator")) 156 epub2txt_format_meta (options, "Creator", mdtext); 157 else if (strstr (mdtag, "publisher")) 158 epub2txt_format_meta (options, "Publisher", mdtext); 159 else if (strstr (mdtag, "contributor")) 160 epub2txt_format_meta (options, "Contributor", mdtext); 161 else if (strstr (mdtag, "identifier")) 162 epub2txt_format_meta (options, "Identifier", mdtext); 163 else if (strstr (mdtag, "date")) 164 { 165 char *mdate = strdup (mdtext); 166 char *p = strchr (mdate, '-'); 167 if (p) *p = 0; 168 epub2txt_format_meta (options, "Date", mdate); 169 free (mdate); 170 } 171 else if (strstr (mdtag, "description")) 172 epub2txt_format_meta (options, "Description", mdtext); 173 else if (strstr (mdtag, "subject")) 174 epub2txt_format_meta (options, "Subject", mdtext); 175 else if (strstr (mdtag, "language")) 176 epub2txt_format_meta (options, "Language", mdtext); 177 else if (strstr (mdtag, "title")) 178 epub2txt_format_meta (options, "Title", mdtext); 179 else if (strstr (mdtag, "meta") && options->calibre) 180 { 181 // More robust Calibre metadata parsing 182 char *meta_name_attr = NULL; 183 char *meta_content_attr = NULL; 184 int k, nattrs = r2->n_attributes; 185 186 for (k = 0; k < nattrs; k++) { 187 if (strcmp(r2->attributes[k].name, "name") == 0 || strcmp(r2->attributes[k].name, "property") == 0) { 188 meta_name_attr = r2->attributes[k].value; 189 } else if (strcmp(r2->attributes[k].name, "content") == 0) { 190 meta_content_attr = r2->attributes[k].value; 191 } 192 } 193 194 if (meta_name_attr && meta_content_attr) { 195 if (strcmp(meta_name_attr, "calibre:series") == 0) { 196 epub2txt_format_meta(options, "Calibre series", meta_content_attr); 197 } else if (strcmp(meta_name_attr, "calibre:series_index") == 0) { 198 char *s = strdup(meta_content_attr); 199 char *p = strchr(s, '.'); 200 if (p) *p = 0; 201 epub2txt_format_meta(options, "Calibre series index", s); 202 free(s); 203 } else if (strcmp(meta_name_attr, "calibre:title_sort") == 0) { 204 epub2txt_format_meta(options, "Calibre title sort", meta_content_attr); 205 } 206 } 207 } 208 } 209 } 210 break; // Found metadata node 211 } 212 } 213 } else { 214 log_warning("Root element or its children are NULL in OPF: %s", opf_canonical_path); 215 } 216 XMLDoc_free (&doc); 217 } 218 else 219 { 220 // Error already contains "Can't parse OPF XML" from XMLDoc_parse_buffer_DOM 221 // or asprintf (error, "Can't parse OPF XML from %s", opf_canonical_path); 222 } 223 string_destroy (buff); 224 } 225 OUT 226 return ret; 227 } 228 229 /*============================================================================ 230 epub2txt_get_items 231 ============================================================================*/ 232 List *epub2txt_get_items (const char *opf_canonical_path, char **error) 233 { 234 IN 235 List *ret = NULL; 236 String *buff = NULL; 237 if (string_create_from_utf8_file (opf_canonical_path, &buff, error)) 238 { 239 const char *buff_cstr = string_cstr (buff); 240 log_debug ("Read OPF for spine items, size %d from %s", string_length (buff), opf_canonical_path); 241 BOOL got_manifest = FALSE; 242 XMLNode *manifest_node = NULL; // Renamed 243 XMLDoc doc; 244 XMLDoc_init (&doc); 245 if (XMLDoc_parse_buffer_DOM (buff_cstr, APPNAME, &doc)) 246 { 247 XMLNode *root = XMLDoc_root (&doc); 248 int l_root_children = 0; 249 250 if (root && root->children) 251 { 252 l_root_children = root->n_children; 253 for (int i = 0; i < l_root_children; i++) 254 { 255 XMLNode *r1 = root->children[i]; 256 if (strcmp (r1->tag, "manifest") == 0 || strstr (r1->tag, ":manifest")) 257 { 258 manifest_node = r1; 259 got_manifest = TRUE; 260 break; 261 } 262 } 263 } 264 else 265 { 266 log_warning ("'%s' has no root element or children -- corrupt EPUB?", opf_canonical_path); 267 } 268 269 if (!got_manifest || !manifest_node || !manifest_node->children) 270 { 271 asprintf (error, "File %s has no valid manifest or manifest children", opf_canonical_path); 272 string_destroy(buff); 273 XMLDoc_free(&doc); 274 OUT 275 return NULL; 276 } 277 278 ret = list_create_strings(); 279 280 if (root && root->children) 281 { 282 for (int i = 0; i < l_root_children; i++) 283 { 284 XMLNode *r1 = root->children[i]; 285 if (strcmp (r1->tag, "spine") == 0 || strstr (r1->tag, ":spine")) 286 { 287 if (r1->children) 288 { 289 int j, l_spine_children = r1->n_children; 290 for (j = 0; j < l_spine_children; j++) 291 { 292 XMLNode *itemref_node = r1->children[j]; // itemref 293 if (itemref_node->attributes) 294 { 295 int k, nattrs_itemref = itemref_node->n_attributes; 296 for (k = 0; k < nattrs_itemref; k++) 297 { 298 char *attr_name_itemref = itemref_node->attributes[k].name; 299 if (strcmp (attr_name_itemref, "idref") == 0) 300 { 301 char *idref_value = itemref_node->attributes[k].value; 302 int m, l_manifest_children = manifest_node->n_children; 303 for (m = 0; m < l_manifest_children; m++) 304 { 305 XMLNode *manifest_item_node = manifest_node->children[m]; 306 if (manifest_item_node->attributes) 307 { 308 int n, nattrs_manifest_item = manifest_item_node->n_attributes; 309 for (n = 0; n < nattrs_manifest_item; n++) 310 { 311 char *attr_name_manifest = manifest_item_node->attributes[n].name; 312 char *attr_val_manifest = manifest_item_node->attributes[n].value; 313 if (strcmp (attr_name_manifest, "id") == 0 && 314 idref_value != NULL && attr_val_manifest != NULL && // Add NULL checks 315 strcmp (attr_val_manifest, idref_value) == 0) 316 { 317 for (int p = 0; p < nattrs_manifest_item; p++) 318 { 319 if (strcmp (manifest_item_node->attributes[p].name, "href") == 0) 320 { 321 char *decoded_href = decode_url (manifest_item_node->attributes[p].value); 322 list_append (ret, decoded_href); 323 break; 324 } 325 } 326 break; 327 } 328 } 329 } 330 } 331 break; 332 } 333 } 334 } 335 } 336 } 337 break; 338 } 339 } 340 } 341 XMLDoc_free (&doc); 342 } 343 else 344 { 345 // Error from XMLDoc_parse_buffer_DOM 346 } 347 string_destroy (buff); 348 } 349 OUT 350 return ret; 351 } 352 353 /*============================================================================ 354 epub2txt_get_root_file 355 ============================================================================*/ 356 String *epub2txt_get_root_file (const char *container_xml_path, char **error) 357 { 358 IN 359 String *ret = NULL; 360 String *buff = NULL; 361 if (string_create_from_utf8_file (container_xml_path, &buff, error)) 362 { 363 const char *buff_cstr = string_cstr (buff); 364 log_debug ("Read container.xml, size %d from %s", string_length (buff), container_xml_path); 365 XMLDoc doc; 366 XMLDoc_init (&doc); 367 if (XMLDoc_parse_buffer_DOM (buff_cstr, APPNAME, &doc)) 368 { 369 XMLNode *root = XMLDoc_root (&doc); 370 if (root && root->children) 371 { 372 int i, l = root->n_children; 373 for (i = 0; i < l; i++) 374 { 375 XMLNode *r1 = root->children[i]; 376 if (strcmp (r1->tag, "rootfiles") == 0) 377 { 378 if (r1->children) 379 { 380 XMLNode *rootfiles_node = r1; 381 int j, l2 = rootfiles_node->n_children; 382 for (j = 0; j < l2; j++) 383 { 384 XMLNode *rootfile_node = rootfiles_node->children[j]; // Renamed 385 if (strcmp (rootfile_node->tag, "rootfile") == 0) 386 { 387 if (rootfile_node->attributes) 388 { 389 int k, nattrs = rootfile_node->n_attributes; 390 for (k = 0; k < nattrs; k++) 391 { 392 char *attr_name = rootfile_node->attributes[k].name; 393 char *attr_value = rootfile_node->attributes[k].value; 394 if (strcmp (attr_name, "full-path") == 0) 395 { 396 ret = string_create (attr_value); 397 break; 398 } 399 } 400 } 401 if (ret) break; 402 } 403 } 404 } 405 if (ret) break; 406 } 407 } 408 } else { 409 log_warning("Root element or its children are NULL in %s", container_xml_path); 410 } 411 if (ret == NULL) { // If still NULL after checking all children 412 // Avoid overwriting previous error from string_create_from_utf8_file or XMLDoc_parse_buffer_DOM 413 if (*error == NULL) { 414 asprintf (error, "%s does not specify a root file via full-path attribute", container_xml_path); 415 } 416 } 417 XMLDoc_free (&doc); 418 } 419 else 420 { 421 // Error from XMLDoc_parse_buffer_DOM, *error should be set 422 } 423 string_destroy (buff); 424 } 425 OUT 426 return ret; 427 } 428 429 /*============================================================================ 430 epub2txt_cleanup 431 ============================================================================*/ 432 void epub2txt_cleanup (void) 433 { 434 if (tempdir) 435 { 436 log_debug ("Deleting temporary directory: %s", tempdir); 437 run_command ((const char *[]){"rm", "-rf", tempdir, NULL}, FALSE); 438 free (tempdir); // tempdir was allocated by strdup from template or asprintf 439 tempdir = NULL; 440 } 441 } 442 443 /*============================================================================ 444 epub2txt_do_file 445 ============================================================================*/ 446 void epub2txt_do_file (const char *file, const Epub2TxtOptions *options, 447 char **error) 448 { 449 IN 450 *error = NULL; 451 452 log_debug ("epub2txt_do_file: %s", file); 453 if (access (file, R_OK) == 0) 454 { 455 log_debug ("File access OK"); 456 457 char *tempbase; 458 if (!(tempbase = getenv("TMPDIR")) && !(tempbase = getenv("TMP"))) 459 tempbase = "/tmp"; 460 log_debug ("tempbase is: %s", tempbase); 461 462 if (tempdir != NULL) { // Should be cleaned up by atexit or previous call's end 463 log_warning("tempdir was not NULL (%s), implies prior cleanup issue or re-entry.", tempdir); 464 // Forcing cleanup here might be too aggressive if an atexit handler is also registered 465 // free(tempdir); tempdir = NULL; // Or call epub2txt_cleanup carefully. 466 } 467 468 char temp_dir_template[PATH_MAX]; // PATH_MAX from 469 // snprintf is safer than sprintf 470 snprintf(temp_dir_template, PATH_MAX, "%s/epub2txt.%d.XXXXXX", tempbase, getpid()); 471 temp_dir_template[PATH_MAX - 1] = '\0'; // Ensure null termination if PATH_MAX is hit 472 473 if (mkdtemp(temp_dir_template) == NULL) { // mkdtemp is from 474 asprintf(error, "Can't create temporary directory using template %s: %s", temp_dir_template, strerror(errno)); 475 return; // tempdir (global) is still NULL 476 } 477 tempdir = strdup(temp_dir_template); // Assign to global tempdir 478 if (tempdir == NULL) { 479 asprintf(error, "Failed to strdup temporary directory path: %s", strerror(errno)); 480 // Attempt to remove the created directory if we can't store its path 481 rmdir(temp_dir_template); // Best effort, might fail if not empty 482 return; 483 } 484 log_debug ("tempdir created: %s", tempdir); 485 486 log_debug ("Running unzip command"); 487 int unzip_status = run_command ((const char *[]){"unzip", "-o", "-qq", file, "-d", tempdir, NULL}, TRUE); 488 if (unzip_status != 0) { 489 asprintf(error, "Unzip command failed for %s with status %d", file, unzip_status); 490 epub2txt_cleanup(); // Clean up the created tempdir 491 return; 492 } 493 494 log_debug ("Unzip finished"); 495 log_debug ("Fix permissions: %s", tempdir); 496 run_command((const char *[]){"chmod", "-R", "u+rwX,go+rX,go-w", tempdir, NULL}, FALSE); 497 log_debug ("Permissions fixed"); 498 499 char *container_xml_path_str; 500 asprintf (&container_xml_path_str, "%s/META-INF/container.xml", tempdir); 501 if (!container_xml_path_str) { /* Malloc error */ *error = strdup("asprintf failed for container_xml_path"); epub2txt_cleanup(); return; } 502 log_debug ("Container.xml path is: %s", container_xml_path_str); 503 504 String *rootfile_relative_path = epub2txt_get_root_file (container_xml_path_str, error); 505 free(container_xml_path_str); 506 507 if (*error == NULL && rootfile_relative_path != NULL) 508 { 509 log_debug ("OPF rootfile relative path from container.xml: %s", string_cstr(rootfile_relative_path)); 510 511 char *opf_constructed_path; 512 asprintf (&opf_constructed_path, "%s/%s", tempdir, string_cstr(rootfile_relative_path)); 513 if (!opf_constructed_path) { /* Malloc error */ /* ... cleanup ... */ string_destroy(rootfile_relative_path); epub2txt_cleanup(); return; } 514 515 char *opf_canonical = realpath (opf_constructed_path, NULL); 516 free (opf_constructed_path); 517 518 char *tempdir_canonical_for_check = realpath(tempdir, NULL); 519 if (tempdir_canonical_for_check == NULL) { 520 asprintf(error, "Failed to resolve temporary directory path '%s': %s", tempdir, strerror(errno)); 521 string_destroy(rootfile_relative_path); 522 if (opf_canonical) free(opf_canonical); 523 epub2txt_cleanup(); 524 return; 525 } 526 527 if (opf_canonical == NULL || !is_subpath(tempdir_canonical_for_check, opf_canonical)) 528 { 529 if (opf_canonical == NULL) 530 asprintf (error, "Bad OPF rootfile (relative: %s): realpath failed: %s", string_cstr(rootfile_relative_path), strerror (errno)); 531 else 532 asprintf (error, "Bad OPF rootfile path \"%s\": outside EPUB container (resolved temp dir: %s)", opf_canonical, tempdir_canonical_for_check); 533 534 free(tempdir_canonical_for_check); 535 string_destroy(rootfile_relative_path); 536 if (opf_canonical) free(opf_canonical); 537 epub2txt_cleanup(); 538 return; 539 } 540 free(tempdir_canonical_for_check); 541 542 log_debug("Canonical OPF path: %s", opf_canonical); 543 544 char *content_dir = strdup (opf_canonical); 545 if (!content_dir) { /* Malloc error */ 546 asprintf(error, "strdup failed for content_dir"); 547 string_destroy(rootfile_relative_path); 548 free(opf_canonical); 549 epub2txt_cleanup(); 550 return; 551 } 552 char *last_slash = strrchr (content_dir, '/'); 553 if (last_slash) { 554 *last_slash = '\0'; 555 } else { 556 // This case means opf_canonical has no '/', which is unlikely for an absolute path 557 // unless it's in the root of the filesystem. Default to "." or a copy of tempdir. 558 free(content_dir); 559 content_dir = strdup(tempdir); // Content is in the root of the temp extraction 560 if (!content_dir) { /* Malloc error */ /* ... cleanup ...*/ string_destroy(rootfile_relative_path); free(opf_canonical); epub2txt_cleanup(); return; } 561 } 562 log_debug ("Content directory is: %s", content_dir); 563 564 if (options->meta) 565 { 566 epub2txt_dump_metadata (opf_canonical, options, error); 567 if (*error) 568 { 569 log_warning ("Error during metadata dump: %s (continuing with text)", *error); 570 free (*error); 571 *error = NULL; 572 } 573 } 574 575 if (!options->notext && *error == NULL) 576 { 577 List *spine_items = epub2txt_get_items (opf_canonical, error); 578 if (*error == NULL && spine_items != NULL) 579 { 580 log_debug ("EPUB spine has %d items", list_length (spine_items)); 581 int i, l = list_length (spine_items); 582 for (i = 0; i < l; i++) 583 { 584 const char *item_rel_path = (const char *)list_get (spine_items, i); 585 char *item_constr_path; 586 asprintf (&item_constr_path, "%s/%s", content_dir, item_rel_path); 587 if (!item_constr_path) { /* Malloc error */ continue; } // Skip item 588 589 char *item_canon_path = realpath (item_constr_path, NULL); 590 free (item_constr_path); 591 592 if (item_canon_path == NULL || !is_subpath (content_dir, item_canon_path)) 593 { 594 if (item_canon_path == NULL) 595 log_warning ("Skipping EPUB spine item \"%s\": invalid path (realpath: %s)", 596 item_rel_path, strerror(errno)); 597 else 598 log_warning ("Skipping EPUB spine item \"%s\" (%s): outside content directory (%s)", 599 item_rel_path, item_canon_path, content_dir); 600 if(item_canon_path) free(item_canon_path); 601 continue; 602 } 603 604 if (options->section_separator) 605 printf ("%s\n", options->section_separator); 606 607 xhtml_file_to_stdout (item_canon_path, options, error); 608 free(item_canon_path); 609 if (*error) { 610 log_warning("Error processing spine item %s: %s (continuing)", item_rel_path, *error); 611 free(*error); 612 *error = NULL; 613 } 614 } 615 list_destroy (spine_items); 616 } 617 else if (*error) { 618 log_warning("Could not get spine items: %s", *error); 619 // *error is kept for main to report 620 } else { // spine_items is NULL but no error 621 log_warning("Spine items list is NULL but no specific error reported by epub2txt_get_items."); 622 } 623 } 624 free (content_dir); 625 free (opf_canonical); 626 } 627 else if (*error) { 628 // Error from epub2txt_get_root_file or rootfile_relative_path is NULL 629 // *error is already set 630 } else { // rootfile_relative_path is NULL, but no *error set by epub2txt_get_root_file 631 asprintf(error, "Failed to get OPF root file path from container.xml (it was NULL)."); 632 } 633 634 if (rootfile_relative_path) string_destroy (rootfile_relative_path); 635 epub2txt_cleanup(); 636 } 637 else 638 { 639 asprintf (error, "File not found or not readable: %s", file); 640 } 641 642 OUT 643 }