"""Update Project Gutenberg GUTINDEX files with newly posted eBooks.

Scans /home/DONE for new NNNNN.zip files, queries metadata for each new
eBook via query_one_book.py, and splices the collected results into fresh
copies of GUTINDEX.2025 and GUTINDEX.ALL written to the current working
directory (the originals under /home/hhelpers are snapshotted first and
never modified).
"""

import os
import re
import shutil
import subprocess
import sys
import zipfile
from datetime import datetime, timedelta
from pathlib import Path


def create_zip_from_file(input_file, output_zip):
    """Create a zip archive containing a single file.

    The file is stored under its base name only (no directory components
    inside the archive). Errors are reported to stdout, not raised.
    """
    try:
        with zipfile.ZipFile(output_zip, "w", compression=zipfile.ZIP_DEFLATED) as zipf:
            # Store only the base name inside the archive.
            zipf.write(input_file, arcname=Path(input_file).name)
        print(f"Created zip archive: {output_zip}")
    except Exception as e:
        print(f"Failed to create zip file {output_zip}: {e}")


def snapshot_gutindex(source_path, snapshot_dir="snapshots"):
    """Copy source_path into snapshot_dir as "<basename>-YYYYmmdd-HHMMSS".

    Creates snapshot_dir if needed; copy2 preserves file metadata.
    Errors are reported to stdout, not raised.
    """
    try:
        Path(snapshot_dir).mkdir(parents=True, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        snapshot_path = Path(snapshot_dir) / f"{Path(source_path).name}-{timestamp}"
        shutil.copy2(source_path, snapshot_path)
        print(f"Snapshot created: {snapshot_path}")
    except Exception as e:
        print(f"Failed to create snapshot of {source_path}: {e}")


def find_highest_gutenberg_id(path, threshold=72574):
    """Return the highest 5-digit eBook id (>= threshold) that ends a line.

    A line qualifies only when it ends with whitespace, exactly five
    digits, and optional trailing whitespace. Returns None when no such
    id is found or the file cannot be read.
    """
    pattern = re.compile(r"\s+(\d{5})\s*$")
    max_id = None
    try:
        with open(path, "r", encoding="utf-8") as infile:
            for line in infile:
                match = pattern.search(line)
                if match:
                    number = int(match.group(1))
                    if number >= threshold and (max_id is None or number > max_id):
                        max_id = number
    except FileNotFoundError:
        print(f"Error: {path} not found.")
    except Exception as e:
        print(f"An error occurred while processing {path}: {e}")
    return max_id


def find_zips_to_process(startat="70000"):
    """Return sorted names of candidate NNNNN.zip files from /home/DONE.

    A candidate is a regular file named exactly five digits plus ".zip",
    numbered >= startat, and last modified more than 12 hours ago — the
    grace period gives the catalog team time to finish with new files.
    Returns None when nothing matches or the directory cannot be scanned.
    """
    try:
        all_files = list(Path("/home/DONE").iterdir())
        cutoff = datetime.now() - timedelta(hours=12)
        filtered = []
        for entry in all_files:
            if not entry.is_file():
                continue
            # Only files named like "12345.zip" are eligible.
            if re.match(r"^\d{5}\.zip$", entry.name) is None:
                continue
            stem = entry.name.rsplit(".", 1)[0]
            if int(stem) < int(startat):
                continue  # below the starting eBook number
            # Skip files modified within the 12-hour grace period.
            mtime = datetime.fromtimestamp(entry.stat().st_mtime)
            if mtime > cutoff:
                continue
            filtered.append(entry.name)
        zip_files_sorted = sorted(filtered, key=lambda x: int(x.rsplit(".", 1)[0]))
        if zip_files_sorted:
            return zip_files_sorted
        print("No matching zip files found in filtered set.")
        return None
    except Exception as e:
        print(f"Error scanning /home/DONE: {e}")
        return None


def adjust_posting_date_line(current_time):
    """Build the "Posting Dates" banner for current_time minus 36 hours.

    The range always starts on day 1 of the same month as the end date.
    """
    adjusted = current_time - timedelta(hours=36)
    month = adjusted.strftime("%b")
    return (
        f"~ ~ ~ ~ Posting Dates for the below eBooks: "
        f"1 {month} {adjusted.year} to {adjusted.day} {month} {adjusted.year} ~ ~ ~ ~"
    )


def adjust_updated_to_line(line, current_time):
    """Return a fresh "Updated to <Month> <day>, <year>" line.

    `line` (the original text being replaced) is accepted for interface
    compatibility but not used. The date is current_time minus 36 hours;
    the day is rendered without zero-padding via plain formatting because
    strftime("%-d") is a glibc-only extension and not portable.
    """
    adjusted = current_time - timedelta(hours=36)
    # e.g. "Updated to July 23, 2025"
    return f"Updated to {adjusted.strftime('%B')} {adjusted.day}, {adjusted.year}"


def remove_consecutive_blank_lines(lines):
    """Collapse runs of blank (whitespace-only) lines to a single line."""
    result = []
    prev_blank = False
    for line in lines:
        is_blank = line.strip() == ""
        # Keep the line unless both it and the previous one are blank.
        if not (is_blank and prev_blank):
            result.append(line)
        prev_blank = is_blank
    return result


def insert_results_and_update_metadata(gutindex_path, results_path, start_id, output_path):
    """Splice new entries into a GUTINDEX file and refresh its date lines.

    Reads gutindex_path and writes output_path with CRLF line endings,
    applying three edits, each at most once:
      * the first "Updated to ..." line is replaced with a fresh date,
      * the first "~ ~ ~ ~ Posting Dates ..." banner is regenerated,
      * the contents of results_path are inserted immediately above the
        first line ending in start_id (the newest entry we already have).
    Consecutive blank lines are collapsed before writing. Missing anchors
    produce warnings; errors are reported to stdout, not raised.
    """
    pattern_id = re.compile(rf"\s+{start_id}\s*$")
    pattern_header = re.compile(r"^~ ~ ~ ~ Posting Dates for the below eBooks:.*~ ~ ~ ~$")
    pattern_updated = re.compile(r"^Updated to .*")
    try:
        with open(gutindex_path, "r", encoding="utf-8") as infile:
            lines = infile.readlines()
        with open(results_path, "r", encoding="utf-8") as results_file:
            results_lines = results_file.readlines()

        output_lines = []
        inserted = False
        header_updated = False
        updated_line_modified = False
        now = datetime.now()

        for line in lines:
            original = line.rstrip("\r\n")
            if not updated_line_modified and pattern_updated.match(original):
                output_lines.append(adjust_updated_to_line(original, now) + "\r\n")
                updated_line_modified = True
                continue
            if not header_updated and pattern_header.match(original):
                output_lines.append(adjust_posting_date_line(now) + "\r\n")
                header_updated = True
                continue
            if not inserted and pattern_id.search(original):
                # New entries go just above the newest existing entry,
                # normalized to CRLF endings.
                for rline in results_lines:
                    output_lines.append(rline.rstrip("\r\n") + "\r\n")
                inserted = True
            output_lines.append(original + "\r\n")

        # newline="" so the explicit \r\n endings are written verbatim.
        with open(output_path, "w", encoding="utf-8", newline="") as outfile:
            outfile.writelines(remove_consecutive_blank_lines(output_lines))

        if inserted:
            print(f"Inserted results above line ending in {start_id}.")
        else:
            print(f"Warning: No line ending in {start_id} found. Results not inserted.")
        if header_updated:
            print("Updated posting date header.")
        else:
            print("Warning: Posting date header not found.")
        if updated_line_modified:
            print("Updated 'Updated to ...' line.")
        else:
            print("Warning: 'Updated to' line not found.")
    except Exception as e:
        print(f"Error modifying GUTINDEX.2025: {e}")


def main():
    """Drive the full update: snapshot, scan, query, splice, package."""
    input_2025 = "/home/hhelpers/GUTINDEX.2025"
    input_all = "/home/hhelpers/GUTINDEX.ALL"
    output_2025 = "GUTINDEX.2025.new"
    output_all = "GUTINDEX.ALL.new"

    # Snapshot both index files before doing anything else.
    snapshot_gutindex(input_2025)
    snapshot_gutindex(input_all)

    # Find the newest eBook id already indexed; candidates start above it.
    last_we_have = find_highest_gutenberg_id(input_2025)
    if last_we_have is None:
        print("No valid starting Gutenberg ID found.")
        return
    start = last_we_have + 1  # lowest eBook number that is a candidate

    zips_to_process = find_zips_to_process(startat=str(start))
    if not zips_to_process:
        print("No new zips to process")
        return

    # Start with a fresh results file.
    if os.path.exists("results.txt"):
        os.remove("results.txt")

    print(zips_to_process)

    # Process highest numbers first so results.txt ends up newest-at-top,
    # matching the GUTINDEX ordering. ("zip_name", not "zip", to avoid
    # shadowing the builtin.)
    for zip_name in reversed(zips_to_process):
        print(f"processing: {zip_name}")
        ebook = int(zip_name.rsplit(".", 1)[0])
        try:
            result = subprocess.run(
                ["python3", "query_one_book.py", str(ebook)],
                check=True,
                capture_output=True,
                text=True,
            )
            with open("results.txt", "a", encoding="utf-8") as outfile:
                outfile.write(result.stdout)
                outfile.write("\n")
        except subprocess.CalledProcessError as e:
            print("Error while running query_one_book.py:")
            print(e.stderr)
            print("Exiting...")
            sys.exit(1)

    # Apply all edits to working copies in the current directory
    # (never to the originals under ~hhelpers).
    insert_results_and_update_metadata(input_2025, "results.txt", last_we_have, output_2025)
    insert_results_and_update_metadata(input_all, "results.txt", last_we_have, output_all)

    # Package the new GUTINDEX.ALL for distribution.
    create_zip_from_file(output_all, "GUTINDEX.zip.new")


if __name__ == "__main__":
    main()