"""Update Project Gutenberg GUTINDEX files with newly posted eBooks.

Scans /home/DONE for new NNNNN.zip files, queries metadata for each new
eBook via query_one_book.py, and splices the collected results into fresh
copies of GUTINDEX.2025 and GUTINDEX.ALL written to the current working
directory (the originals under /home/hhelpers are snapshotted first and
never modified).
"""

import os
import re
import shutil
import subprocess
import sys
import zipfile
from datetime import datetime, timedelta
from pathlib import Path


def create_zip_from_file(input_file, output_zip):
    """Create a zip archive containing a single file.

    The file is stored under its base name only (no directory components
    inside the archive). Errors are reported to stdout, not raised.
    """
    try:
        with zipfile.ZipFile(output_zip, "w", compression=zipfile.ZIP_DEFLATED) as zipf:
            # Store only the base name inside the archive.
            zipf.write(input_file, arcname=Path(input_file).name)
        print(f"Created zip archive: {output_zip}")
    except Exception as e:
        print(f"Failed to create zip file {output_zip}: {e}")


def snapshot_gutindex(source_path, snapshot_dir="snapshots"):
    """Copy source_path into snapshot_dir as "<basename>-YYYYmmdd-HHMMSS".

    Creates snapshot_dir if needed; copy2 preserves file metadata.
    Errors are reported to stdout, not raised.
    """
    try:
        Path(snapshot_dir).mkdir(parents=True, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        snapshot_path = Path(snapshot_dir) / f"{Path(source_path).name}-{timestamp}"
        shutil.copy2(source_path, snapshot_path)
        print(f"Snapshot created: {snapshot_path}")
    except Exception as e:
        print(f"Failed to create snapshot of {source_path}: {e}")


def find_highest_gutenberg_id(path, threshold=72574):
    """Return the highest 5-digit eBook id (>= threshold) that ends a line.

    A line qualifies only when it ends with whitespace, exactly five
    digits, and optional trailing whitespace. Returns None when no such
    id is found or the file cannot be read.
    """
    pattern = re.compile(r"\s+(\d{5})\s*$")
    max_id = None
    try:
        with open(path, "r", encoding="utf-8") as infile:
            for line in infile:
                match = pattern.search(line)
                if match:
                    number = int(match.group(1))
                    if number >= threshold and (max_id is None or number > max_id):
                        max_id = number
    except FileNotFoundError:
        print(f"Error: {path} not found.")
    except Exception as e:
        print(f"An error occurred while processing {path}: {e}")
    return max_id


def find_zips_to_process(startat="70000"):
    """Return sorted names of candidate NNNNN.zip files from /home/DONE.

    A candidate is a regular file named exactly five digits plus ".zip",
    numbered >= startat, and last modified more than 12 hours ago — the
    grace period gives the catalog team time to finish with new files.
    Returns None when nothing matches or the directory cannot be scanned.
    """
    try:
        all_files = list(Path("/home/DONE").iterdir())
        cutoff = datetime.now() - timedelta(hours=12)
        filtered = []
        for entry in all_files:
            if not entry.is_file():
                continue
            # Only files named like "12345.zip" are eligible.
            if re.match(r"^\d{5}\.zip$", entry.name) is None:
                continue
            stem = entry.name.rsplit(".", 1)[0]
            if int(stem) < int(startat):
                continue  # below the starting eBook number
            # Skip files modified within the 12-hour grace period.
            mtime = datetime.fromtimestamp(entry.stat().st_mtime)
            if mtime > cutoff:
                continue
            filtered.append(entry.name)
        zip_files_sorted = sorted(filtered, key=lambda x: int(x.rsplit(".", 1)[0]))
        if zip_files_sorted:
            return zip_files_sorted
        print("No matching zip files found in filtered set.")
        return None
    except Exception as e:
        print(f"Error scanning /home/DONE: {e}")
        return None


def adjust_posting_date_line(current_time):
    """Build the "Posting Dates" banner for current_time minus 36 hours.

    The range always starts on day 1 of the same month as the end date.
    """
    adjusted = current_time - timedelta(hours=36)
    month = adjusted.strftime("%b")
    return (
        f"~ ~ ~ ~ Posting Dates for the below eBooks: "
        f"1 {month} {adjusted.year} to {adjusted.day} {month} {adjusted.year} ~ ~ ~ ~"
    )


def adjust_updated_to_line(line, current_time):
    """Return a fresh "Updated to <Month> <day>, <year>" line.

    `line` (the original text being replaced) is accepted for interface
    compatibility but not used. The date is current_time minus 36 hours;
    the day is rendered without zero-padding via plain formatting because
    strftime("%-d") is a glibc-only extension and not portable.
    """
    adjusted = current_time - timedelta(hours=36)
    # e.g. "Updated to July 23, 2025"
    return f"Updated to {adjusted.strftime('%B')} {adjusted.day}, {adjusted.year}"


def remove_consecutive_blank_lines(lines):
    """Collapse runs of blank (whitespace-only) lines to a single line."""
    result = []
    prev_blank = False
    for line in lines:
        is_blank = line.strip() == ""
        # Keep the line unless both it and the previous one are blank.
        if not (is_blank and prev_blank):
            result.append(line)
        prev_blank = is_blank
    return result


def insert_results_and_update_metadata(gutindex_path, results_path, start_id, output_path):
    """Splice new entries into a GUTINDEX file and refresh its date lines.

    Reads gutindex_path and writes output_path with CRLF line endings,
    applying three edits, each at most once:
      * the first "Updated to ..." line is replaced with a fresh date,
      * the first "~ ~ ~ ~ Posting Dates ..." banner is regenerated,
      * the contents of results_path are inserted immediately above the
        first line ending in start_id (the newest entry we already have).
    Consecutive blank lines are collapsed before writing. Missing anchors
    produce warnings; errors are reported to stdout, not raised.
    """
    pattern_id = re.compile(rf"\s+{start_id}\s*$")
    pattern_header = re.compile(r"^~ ~ ~ ~ Posting Dates for the below eBooks:.*~ ~ ~ ~$")
    pattern_updated = re.compile(r"^Updated to .*")
    try:
        with open(gutindex_path, "r", encoding="utf-8") as infile:
            lines = infile.readlines()
        with open(results_path, "r", encoding="utf-8") as results_file:
            results_lines = results_file.readlines()

        output_lines = []
        inserted = False
        header_updated = False
        updated_line_modified = False
        now = datetime.now()

        for line in lines:
            original = line.rstrip("\r\n")
            if not updated_line_modified and pattern_updated.match(original):
                output_lines.append(adjust_updated_to_line(original, now) + "\r\n")
                updated_line_modified = True
                continue
            if not header_updated and pattern_header.match(original):
                output_lines.append(adjust_posting_date_line(now) + "\r\n")
                header_updated = True
                continue
            if not inserted and pattern_id.search(original):
                # New entries go just above the newest existing entry,
                # normalized to CRLF endings.
                for rline in results_lines:
                    output_lines.append(rline.rstrip("\r\n") + "\r\n")
                inserted = True
            output_lines.append(original + "\r\n")

        # newline="" so the explicit \r\n endings are written verbatim.
        with open(output_path, "w", encoding="utf-8", newline="") as outfile:
            outfile.writelines(remove_consecutive_blank_lines(output_lines))

        if inserted:
            print(f"Inserted results above line ending in {start_id}.")
        else:
            print(f"Warning: No line ending in {start_id} found. Results not inserted.")
        if header_updated:
            print("Updated posting date header.")
        else:
            print("Warning: Posting date header not found.")
        if updated_line_modified:
            print("Updated 'Updated to ...' line.")
        else:
            print("Warning: 'Updated to' line not found.")
    except Exception as e:
        print(f"Error modifying GUTINDEX.2025: {e}")


def main():
    """Drive the full update: snapshot, scan, query, splice, package."""
    input_2025 = "/home/hhelpers/GUTINDEX.2025"
    input_all = "/home/hhelpers/GUTINDEX.ALL"
    output_2025 = "GUTINDEX.2025.new"
    output_all = "GUTINDEX.ALL.new"

    # Snapshot both index files before doing anything else.
    snapshot_gutindex(input_2025)
    snapshot_gutindex(input_all)

    # Find the newest eBook id already indexed; candidates start above it.
    last_we_have = find_highest_gutenberg_id(input_2025)
    if last_we_have is None:
        print("No valid starting Gutenberg ID found.")
        return
    start = last_we_have + 1  # lowest eBook number that is a candidate

    zips_to_process = find_zips_to_process(startat=str(start))
    if not zips_to_process:
        print("No new zips to process")
        return

    # Start with a fresh results file.
    if os.path.exists("results.txt"):
        os.remove("results.txt")

    print(zips_to_process)

    # Process highest numbers first so results.txt ends up newest-at-top,
    # matching the GUTINDEX ordering. ("zip_name", not "zip", to avoid
    # shadowing the builtin.)
    for zip_name in reversed(zips_to_process):
        print(f"processing: {zip_name}")
        ebook = int(zip_name.rsplit(".", 1)[0])
        try:
            result = subprocess.run(
                ["python3", "query_one_book.py", str(ebook)],
                check=True,
                capture_output=True,
                text=True,
            )
            with open("results.txt", "a", encoding="utf-8") as outfile:
                outfile.write(result.stdout)
                outfile.write("\n")
        except subprocess.CalledProcessError as e:
            print("Error while running query_one_book.py:")
            print(e.stderr)
            print("Exiting...")
            sys.exit(1)

    # Apply all edits to working copies in the current directory
    # (never to the originals under ~hhelpers).
    insert_results_and_update_metadata(input_2025, "results.txt", last_we_have, output_2025)
    insert_results_and_update_metadata(input_all, "results.txt", last_we_have, output_all)

    # Package the new GUTINDEX.ALL for distribution.
    create_zip_from_file(output_all, "GUTINDEX.zip.new")


if __name__ == "__main__":
    main()