#!/usr/bin/awk -f
# epub2txt.awk by Ben Collver <bencollver@tilde.pink>

# basename(str)
#
# Return str with its final ".ext" suffix removed.  The directory part
# is kept, so "OEBPS/ch1.xhtml" becomes "OEBPS/ch1".  The extension may
# not contain a path separator, so a dot inside a directory name
# ("a.b/file") is not mistaken for the start of an extension.
#
# stem is a local scratch variable; callers pass only str.

function basename(str,    stem) {
    stem = str
    sub(/\.[^.\/\\][^.\/\\]*$/, "", stem)
    return stem
}

# copy(src, dst)
#
# Announce the copy on stdout, then run the platform copy command
# (global cmd_copy, chosen in main) on src and dst.  Both paths are
# double-quoted, like the other external commands in this script, so
# that names containing spaces reach the command as single arguments.
# The exit status of the command is not examined.

function copy(src, dst,    cmd) {
    print "Copy " src " to " dst " ..."
    cmd = sprintf("%s \"%s\" \"%s\"", cmd_copy, src, dst)
    system(cmd)
    return
}

# create_spine()
#
# Write spine.html, an HTML index with one link per spine entry, and
# then copy it to html_out.  Reads the globals title, author, spine[],
# spine_count, opf_dir and has_ncx.
#
# The label for each entry is taken from the first source that yields
# a non-empty string:
#   1. NCX label whose href equals the item href (only if has_ncx)
#   2. NCX label whose href starts with the item href (only if has_ncx)
#   3. heading text scraped from the content file itself
#   4. the manifest id
#
# The output file is closed before the loop and again at the end of
# every iteration because get_item_label_scrape() runs external
# commands, and on DOS all files must be closed before that happens.

function create_spine(    href, id, k, label, link, out) {
    out = "spine.html"
    # ">" starts the file afresh; everything after appends with ">>"
    print "<!DOCTYPE html>" >out
    print "<html>" >>out
    print "<head>" >>out
    print "<meta http-equiv=\"Content-Type\" " \
        "content=\"text/html; charset=UTF-8\">" >>out
    print "<title>Spine contents for " title "</title>" >>out
    print "<style type=\"text/css\">" >>out
    print ".content ul {padding:0px}" >>out
    print ".content li {" >>out
    print "border-top:1px solid black;" >>out
    print "padding:0px 0px 15px 0px;" >>out
    print "}" >>out
    print ".content li a {text-decoration:none}" >>out
    print_header_css(out)
    print "</style>" >>out
    print "</head>" >>out
    print "<body>" >>out
    print "<div class=\"header\">" >>out
    print_header_spine(out)
    print "</div>" >>out
    print "<div class=\"clear\"></div>" >>out
    print "<div class=\"content\">" >>out
    print "<h1>" title " by " author "</h1>" >>out
    print "<ul>" >>out
    # close before the loop: label scraping below runs external commands
    close(out)
    for (k = 1; k <= spine_count; k++) {
        id = spine[k]
        # progress indicator
        dit()
        href = get_item_href_by_id(id)
        if (length(href) == 0) {
            print "WARN: Unable to find item " id " in manifest."
            print "Omitting from index."
            continue
        }
        link = dirjoin(opf_dir, href)
        if (has_ncx) {
            # len 0 asks for an exact href match
            label = get_item_label_by_href(href, 0)
            if (length(label) == 0) {
                # if no label, try fuzzy match
                label = get_item_label_by_href(href, length(href))
            }
        } else {
            label = ""
        }
        if (length(label) == 0) {
            # if no label, try scraping from content
            # DOS: Make sure all files are closed before this point
            label = get_item_label_scrape(link)
        }
        if (length(label) == 0) {
            # last resort: show the manifest id
            label = id
        }
        label = normalize(label)
        print "<li>" >>out
        print "<a href=\"" link "\">" label "</a>" >>out
        print "</li>" >>out
        # close again so the next iteration may run external commands
        close(out)
    }
    print "</ul>" >>out
    print "</div>" >>out
    print "</body>" >>out
    print "</html>" >>out
    close(out)
    # publish the finished index under the name set in main (html_out)
    copy(out, html_out)
    return
}

# create_text()
#
# Produce the plain text edition of the book under txt_dir:
#
#   1. Create txt_dir and delete output left over from an earlier run.
#   2. Copy every manifest image to txt_dir/<N>.<ext>, where N is the
#      image's position in img_hrefs[] and <ext> is at most three
#      characters ("jpeg" becomes "jpg").
#   3. Run webdump on every spine item and append the result to
#      utf8_out, rewriting image markers and dropping references to
#      other files of the EPUB (see print_txt_img and print_txt_ref).
#   4. Convert utf8_out to code page 437 with utf8tocp, transliterate
#      the \uXXXX escapes found in the result, and write the text
#      wrapped at 70 columns to dos_out.
#
# NOTE(review): the \uXXXX escapes are presumably what utf8tocp emits
# for characters that have no CP437 equivalent -- confirm against
# utf8tocp.
#
# Reads the globals txt_dir, utf8_out, dos_out, opf_dir, spine[],
# spine_count, img_hrefs[], img_count and the cmd_* command names.
# All working variables are locals.

function create_text(     base, cmd, count, cp437_out, ext, fn_txt, href,
     i, id, img_num, k, last_len, last_tok, link, parts, short,
     short_ext, tok)
{
    cp437_out = "index.437"

    system(cmd_md " " txt_dir)

    del(cp437_out)
    del(dos_out)
    del(utf8_out)

    # copy images to short file names
    for (img_num = 1; img_num <= img_count; img_num++) {
        href = img_hrefs[img_num]
        link = dirjoin(opf_dir, href)
        if (!file_exists(link)) {
            # not next to the .opf file: try the usual content dirs
            link = "OEBPS/" href
            if (!file_exists(link)) {
                link = "OPS/" href
            }
        }
        base = basename(href)
        # +2 skips the last character of base and the dot
        ext = substr(href, length(base) + 2)
        if (ext == "jpeg") {
            short_ext = "jpg"
        } else {
            short_ext = substr(ext, 1, 3)
        }
        short = txt_dir "/" img_num "." short_ext
        copy(link, short)
    }

    # convert content items to text
    for (k = 1; k <= spine_count; k++) {
        id = spine[k]
        dit()
        href = get_item_href_by_id(id)
        if (length(href) == 0) {
            print "WARN: Unable to find item " id " in manifest."
            print "Omitting from " utf8_out "."
            continue
        }
        link = dirjoin(opf_dir, href)
        fn_txt = basename(link) ".txt"

        close(utf8_out)

        # DOS: Make sure all files are closed before this point
        cmd = sprintf("%s -I <%s >%s", cmd_webdump, link, fn_txt)
        system(cmd)
        while ((getline <fn_txt) > 0) {
            $0 = trimright($0)
            # every token but the last was terminated by "]"
            count = split($0, parts, /\]/)
            if (count == 0) {
                print "" >>utf8_out
                continue
            }

            last_tok = parts[count]
            last_len = length(last_tok)

            for (i = 1; i < count; i++) {
                tok = parts[i]
                if (match(tok, /\[img:/)) {
                    print_txt_img(utf8_out, tok, RSTART, RLENGTH)
                } else if (match(tok, /\[/)) {
                    print_txt_ref(utf8_out, tok, RSTART, RLENGTH)
                } else {
                    printf "%s", tok >>utf8_out
                }
                # skip space if not needed after penultimate token
                if (i + 1 < count || last_len > 0) {
                    printf " " >>utf8_out
                }
            }

            # the last token should never contain "["
            # if it does, then the line has mis-matched square brackets
            # for this reason, process the last token AFTER the loop

            print last_tok >>utf8_out
        }
        close(fn_txt)
    }
    close(utf8_out)

    system(cmd_utf8tocp " 437 " utf8_out " >" cp437_out)
    # transliterate certain Unicode characters to ASCII
    while ((getline <cp437_out) > 0) {
        gsub(/\\u00A9/, "(C)")
        gsub(/\\u00AE/, "(R)")
        gsub(/\\u00BE/, "3/4")
        gsub(/\\u00C1/, "A")
        gsub(/\\u00C8/, "E")
        gsub(/\\u00CA/, "E")
        gsub(/\\u00D7/, "x")
        gsub(/\\u00D8/, "*0*")
        gsub(/\\u00F8/, "o")
        gsub(/\\u0107/, "c")
        gsub(/\\u0161/, "s")
        gsub(/\\u0219/, "s")
        gsub(/\\u02DC/, "~")
        gsub(/\\u0396/, "Z")
        gsub(/\\u039A/, "K")
        gsub(/\\u03B8/, "\xE9")
        gsub(/\\u03C7/, "X")
        gsub(/\\u2011/, "-")
        gsub(/\\u2013/, "-")
        gsub(/\\u2014/, "--")
        gsub(/\\u2018/, "`")
        gsub(/\\u2019/, "'")
        gsub(/\\u201[CD]/, "\"")
        gsub(/\\u2022/, "\xFA")
        gsub(/\\u2026/, "...")
        gsub(/\\u202F/, " ")
        gsub(/\\u2032/, "'")
        gsub(/\\u2033/, "\"")
        gsub(/\\u2122/, "(TM)")
        gsub(/\\u21D0/, "<=")
        gsub(/\\u21D2/, "=>")
        gsub(/\\u2190/, "<-")
        gsub(/\\u2191/, "^")
        gsub(/\\u2192/, "->")
        gsub(/\\u2212/, "-")
        gsub(/\\u2227/, "/\\")
        gsub(/\\u2228/, "\\/")
        gsub(/\\u222B/, "\xF4")
        gsub(/\\u2260/, "!=")
        gsub(/\\u226B/, ">>")
        gsub(/\\u22EF/, "...")
        gsub(/\\u27F5/, "<-")
        # U+27F6 is the long RIGHTWARDS arrow, the mirror of U+27F5
        gsub(/\\u27F6/, "->")
        print_wrap(dos_out, $0, 70)
    }
    close(dos_out)
    close(cp437_out)
    return
}

# create_toc()
#
# Write toc.html, an HTML page with one link per NCX navigation point
# (toc[1..tocs], labelled through label_by_href[]).  Nothing is
# written when the EPUB has no NCX file.

function create_toc(     entry, n, page, text, url) {
    # EPUB3 has a TOC built-in to the content
    # https://www.w3.org/TR/epub-33/#sec-nav-toc
    if (has_ncx) {
        page = "toc.html"

        # document head
        print "<!DOCTYPE html>" > page
        print "<html>" >> page
        print "<head>" >> page
        print "<meta http-equiv=\"Content-Type\" " \
            "content=\"text/html; charset=UTF-8\">" >> page
        print "<title>Table of contents for " title "</title>" >> page
        print "<style type=\"text/css\">" >> page
        print ".content ul {padding:0px}" >> page
        print ".content li {" >> page
        print "border-top:1px solid black;" >> page
        print "padding:0px 0px 15px 0px;" >> page
        print "}" >> page
        print ".content li a {text-decoration:none}" >> page
        print_header_css(page)
        print "</style>" >> page
        print "</head>" >> page

        # navigation header and page title
        print "<body>" >> page
        print "<div class=\"header\">" >> page
        print_header_toc(page)
        print "</div>" >> page
        print "<div class=\"clear\"></div>" >> page
        print "<div class=\"content\">" >> page
        print "<h1>" title " by " author "</h1>" >> page

        # one list item per navigation point, in NCX order
        print "<ul>" >> page
        n = 1
        while (n <= tocs) {
            entry = toc[n]
            text = normalize(label_by_href[entry])
            url = dirjoin(opf_dir, entry)
            print "<li>" >> page
            print "<a href=\"" url "\">" text "</a>" >> page
            print "</li>" >> page
            n++
        }
        print "</ul>" >> page

        # document tail
        print "</div>" >> page
        print "</body>" >> page
        print "</html>" >> page
        close(page)
    }
    return
}

# del(name)
#
# Announce the deletion on stdout, then run the platform delete
# command (global cmd_del) on name.  The path handed to the command
# goes through dospath() first.

function del(name,     target) {
    target = dospath(name)
    print "Deleting " name " ..."
    system(cmd_del " " target)
}

# detect_dos()
#
# Return 1 when the environment defines COMSPEC, which this script
# takes as the sign that it is running under DOS; otherwise return 0.
# The value of COMSPEC is not inspected.

function detect_dos() {
    return ("COMSPEC" in ENVIRON) ? 1 : 0
}

# dirjoin(dir, name)
#
# Join a directory and a file name with "/".  The directory "." is
# dropped so that the result stays a plain relative name.

function dirjoin(dir, name) {
    if (dir == ".") {
        return name
    }
    return dir "/" name
}

# dirname(str)
#
# Return str without its last path component.  Both "/" and "\" count
# as separators.  A name without any separator yields ".".

function dirname(str,    dir) {
    dir = str
    sub(/[\/\\][^\/\\]*$/, "", dir)
    if (dir == str) {
        # nothing was removed: str had no directory part
        return "."
    }
    return dir
}

# dit()
#
# Progress indicator: write one dot to standard output and flush it
# so that it shows up immediately.

function dit() {
    printf "%s", "."
    fflush("/dev/stdout")
}

# dospath(str)
#
# On DOS (global is_dos set) return str with every "/" replaced by a
# backslash.  On other systems return str unchanged.

function dospath(str) {
    if (!is_dos) {
        return str
    }
    gsub(/\//, "\\", str)
    return str
}

# file_exists(name)
#
# Return 1 when name can be opened for reading, 0 otherwise.
#
# The probe reads into a local variable so that the caller's $0 and
# fields are left alone.  getline yields -1 only when the file cannot
# be opened, so an existing but empty file (getline yields 0) counts
# as present.  An empty name is rejected up front because gawk treats
# a redirection from "" as a fatal error.

function file_exists(name,    line, status) {
    if (length(name) == 0) {
        return 0
    }
    status = (getline line < name)
    close(name)
    return (status >= 0) ? 1 : 0
}

# find_break()
#
# CP437 data gums up gawk regex in UTF-8 locales, so this function
# scans character by character instead of using a regex.
#
# Return the position of the last space in str, or 0 when str holds
# no space.  print_wrap() uses that position to break a string into
# hard wrapped lines.

function find_break(str,    i) {
    for (i = length(str); i > 0; i--) {
        if (substr(str, i, 1) == " ") {
            return i
        }
    }
    return 0
}

# get_html_file_by_href(href)
#
# Return the first entry of htmlfile[1..htmlfiles] whose path ends
# with href, or "" when there is none.

function get_html_file_by_href(href,     fn, i, off) {
    # substr(fn, length(fn) - off) is the last length(href) characters
    off = length(href) - 1
    for (i = 1; i <= htmlfiles; i++) {
        fn = htmlfile[i]
        if (substr(fn, length(fn) - off) == href) {
            return fn
        }
    }
    return ""
}

# get_img_num_by_href(href)
#
# Return the index of the first entry of img_hrefs[1..img_count] whose
# path ends with href (an identical path also ends with href), or the
# empty string when there is none.
#
# Note that the "not found" value is "" and not 0: a caller has to
# test it with length() or "!", because "" == 0 is false in awk.

function get_img_num_by_href(href,    img_href, img_num, off) {
    # substr(s, length(s) - off) is the last length(href) characters
    off = length(href) - 1
    for (img_num = 1; img_num <= img_count; img_num++) {
        img_href = img_hrefs[img_num]
        if (substr(img_href, length(img_href) - off) == href) {
            return img_num
        }
    }
    return ""
}

# get_item_href_by_id(id)
#
# Return the manifest href recorded for id in item_href_by_id[], or ""
# when the manifest has no such id.  The membership test avoids
# creating an empty array element for an unknown id.

function get_item_href_by_id(id) {
    if (id in item_href_by_id) {
        return item_href_by_id[id]
    }
    return ""
}

# get_item_id_by_href(href)
#
# Return the manifest id for href, or "" when nothing matches.
#
# An href that is itself a key of item_id_by_href[] always wins.
# Only when there is no such key does the function fall back to the
# first manifest href that ends with href; "for (k in ...)" visits the
# keys in no particular order, so that fallback is only well defined
# when a single manifest entry ends with href.

function get_item_id_by_href(href,     k, off) {
    if (href in item_id_by_href) {
        return item_id_by_href[href]
    }
    # substr(k, length(k) - off) is the last length(href) characters
    off = length(href) - 1
    for (k in item_id_by_href) {
        if (substr(k, length(k) - off) == href) {
            return item_id_by_href[k]
        }
    }
    return ""
}

# get_item_label_by_href(href, len)
#
# Return the NCX label of the first entry in toc[1..tocs] that matches
# href, or "" when none does.
#
# With len == 0 the whole toc entry has to equal href.  With len > 0
# only the first len characters of the toc entry are compared, which
# lets an entry such as "ch1.html#sec2" match "ch1.html".

function get_item_label_by_href(href, len,    i, k, str) {
    for (i = 1; i <= tocs; i++) {
        k = toc[i]
        str = (len > 0) ? substr(k, 1, len) : k
        if (str == href) {
            return label_by_href[k]
        }
    }
    return ""
}

# get_item_label_scrape(file)
#
# Derive a label from the content of file: run it through the xmlrem
# and xml2tsv commands (leaving itemlbl.xml and itemlbl.tsv behind)
# and return the second column of the first row whose first column
# matches an h1..h4 text() path.  Returns "" when no such row exists.
#
# Side effects: sets FS to a tab, and reading the TSV replaces $0 and
# the fields.

function get_item_label_scrape(file,    cmd, label) {
    label = ""

    # DOS: Make sure all files are closed before this point
    cmd = sprintf("%s \"%s\" >itemlbl.xml", cmd_xmlrem, file)
    system(cmd)
    cmd = sprintf("%s itemlbl.xml >itemlbl.tsv", cmd_xml2tsv)
    system(cmd)

    FS = "\t"
    while ((getline <"itemlbl.tsv") > 0) {
        if ($1 ~ /\/h[1-4]\/.*text\(\)/) {
            # first heading text wins
            label = $2
            break
        }
    }
    close("itemlbl.tsv")
    return label
}

# normalize(str)
#
# Return str with the two-character escape sequences \n, \r and \t
# (a literal backslash followed by the letter) removed and with
# leading and trailing spaces stripped.

function normalize(str,    text) {
    text = str
    # each regex matches a literal backslash plus one letter
    gsub(/\\n/, "", text)
    gsub(/\\r/, "", text)
    gsub(/\\t/, "", text)
    sub(/^  */, "", text)
    sub(/ * $/, "", text)
    return text
}

# parse_epub()
#
# Load the book description from opf.tsv and, when has_ncx is set,
# the navigation points from ncx.tsv.  Both files are tab separated:
# column 1 is a path into the XML document, column 2 the value.
#
# Attribute rows are remembered as they arrive and are committed when
# the row for the owning element itself (path ending in "item",
# "meta" or "navPoint") is read.
#
# Globals written:
#   author, title            normalized dc:creator and dc:title
#   item_id_by_href[]        manifest href -> id
#   item_href_by_id[]        manifest id -> href
#   item_type_by_id[]        manifest id -> media type
#   img_hrefs[1..img_count]  hrefs of manifest items of type image/*
#   spine[1..spine_count]    idrefs in reading order
#   toc[], label_by_href[]   NCX entries, appended after index tocs
#                            (main sets tocs to 0 before calling)
#   FS                       set to a tab
#
# The NCX docAuthor is used only when the OPF named no author, and is
# normalized the same way as the OPF author.

function parse_epub(     cover_id, man_href, man_id, man_type, \
    meta_content, meta_name, nav_id, nav_lbl, nav_src)
{
    cover_id = ""
    author = ""
    img_count = 0
    man_href = ""
    man_id = ""
    man_type = ""
    meta_content = ""
    meta_name = ""
    spine_count = 0
    title = ""
    FS = "\t"
    while ((getline <"opf.tsv") > 0) {
        if ($1 ~ /metadata\/dc:creator\/text\(\)$/) {
            author = normalize($2)
        } else if ($1 ~ /metadata\/dc:title\/text\(\)$/) {
            title = normalize($2)
        } else if ($1 ~ /metadata\/meta\[@name]$/) {
            meta_name = $2
        } else if ($1 ~ /metadata\/meta\[@content]$/) {
            meta_content = $2
        } else if ($1 ~ /metadata\/meta$/) {
            # cover_id is recorded here but not read anywhere in
            # this function
            if (length(meta_name) > 0 && length(meta_content) > 0) {
                cover_id = meta_content
            }
            meta_name = ""
            meta_content = ""
        } else if ($1 ~ /manifest\/item\[@href]$/) {
            man_href = $2
        } else if ($1 ~ /manifest\/item\[@id]$/) {
            man_id = $2
        } else if ($1 ~ /manifest\/item\[@media-type]$/) {
            man_type = $2
        } else if ($1 ~ /manifest\/item$/) {
            # element row: commit the attributes collected so far
            item_id_by_href[man_href] = man_id
            item_type_by_id[man_id] = man_type
            item_href_by_id[man_id] = man_href
            if (man_type ~ /^image\//) {
                img_count++
                img_hrefs[img_count] = man_href
            }
            man_href = ""
            man_id = ""
            man_type = ""
        } else if ($1 ~ /spine\/itemref\[@idref\]$/) {
            spine_count++
            spine[spine_count] = $2
        }
    }
    close("opf.tsv")

    if (!has_ncx) {
        return
    }

    nav_id = ""
    nav_lbl = ""
    nav_src = ""
    while ((getline <"ncx.tsv") > 0) {
        if ($1 ~ /ncx\/docAuthor\/text\/text\(\)/) {
            if (length(author) == 0) {
                author = normalize($2)
            }
        } else if ($1 ~ /navPoint\[@id]$/) {
            nav_id = $2
        } else if ($1 ~ /navPoint\/navLabel\/text\/text\(\)$/) {
            nav_lbl = $2
        } else if ($1 ~ /navPoint\/content\[@src]$/) {
            nav_src = $2
        } else if ($1 ~ /navPoint$/) {
            # element row: keep the entry only if it is complete
            if (length(nav_id) > 0 &&
                length(nav_lbl) > 0 &&
                length(nav_src) > 0)
            {
                tocs++
                toc[tocs] = nav_src
                label_by_href[nav_src] = nav_lbl
            }
            nav_id = ""
            nav_lbl = ""
            nav_src = ""
        }
    }
    close("ncx.tsv")
    return
}

# print_doseol(out, str)
#
# Append str plus a line terminator to the file out.  On DOS (global
# is_dos set) a bare "\n" is written; elsewhere "\r\n" is written
# explicitly.

function print_doseol(out, str,    eol) {
    eol = is_dos ? "\n" : "\r\n"
    printf "%s%s", str, eol >>out
    return
}

# print_header_css(out)
#
# Append to the file out the style rules used by the navigation
# header (the "clear" and "header" classes), one line per print.

function print_header_css(out,    i, n, rule) {
    n = 0
    rule[++n] = ".clear {clear:both}"
    rule[++n] = ".header ul {"
    rule[++n] = "list-style:none;"
    rule[++n] = "margin-left:0px;"
    rule[++n] = "padding-left:0px;"
    rule[++n] = "text-indent:0px;"
    rule[++n] = "}"
    rule[++n] = ".header li {"
    rule[++n] = "border:1px solid black;"
    rule[++n] = "float:left;"
    rule[++n] = "width:200px;"
    rule[++n] = "}"
    rule[++n] = ".header li.selected {"
    rule[++n] = "font-weight:bold;"
    rule[++n] = "}"
    for (i = 1; i <= n; i++) {
        print rule[i] >>out
    }
    return
}

# print_header_spine(out)
#
# Append to the file out the navigation list shown on the spine page,
# with "Spine contents" marked as selected.  Writes nothing when the
# EPUB has no NCX file.

function print_header_spine(out) {
    if (!has_ncx) {
        return
    }
    print "<ul>" >> out
    print "<li class=\"selected\">Spine contents</li>" >> out
    print "<li><a href=\"toc.html\">Table of contents</a></li>" >> out
    print "</ul>" >> out
}

# print_header_toc(out)
#
# Append to the file out the navigation list shown on the table of
# contents page, with "Table of contents" marked as selected.

function print_header_toc(out,    i, n, row) {
    n = 0
    row[++n] = "<ul>"
    row[++n] = "<li><a href=\"spine.html\">Spine contents</a></li>"
    row[++n] = "<li class=\"selected\">Table of contents</li>"
    row[++n] = "</ul>"
    for (i = 1; i <= n; i++) {
        print row[i] >>out
    }
    return
}

# function print_txt_img()
#
# IN:  webdump:  label[img: url
#      (the closing "]" was already removed by the caller's split)
# OUT: markdown: ![label](url)
#
# out  file to append to
# buf  one token of webdump output containing "[img:"
# pos  position of "[img:" within buf (RSTART of the caller's match)
# len  length of that match; not used
#
# A url containing ":" is taken to be remote and is written as is.
# A local image is written as "<N>.<ext>", the short name under which
# create_text() copied manifest image number N into txt_dir.  An image
# that is missing from the manifest is reported, copied into txt_dir
# under a name built from the last 8 characters of its file name, and
# referred to by that name.

function print_txt_img(out, buf, pos, len,     base, before, ext, img, \
    img_num, link, short, short_base, short_ext)
{
    before = substr(buf, 1, pos-1)
    # skip the 5 characters of "[img:" and the space that follows
    img = substr(buf, pos+6)

    # for remote images, refer to original image URL

    if (img ~ /:/) {
        printf "![%s](%s)", before, img >>out
        return
    }

    # for local images, refer to short 8.3 file names in plaintext output

    while (img ~ /^\.\.\//) {
        sub(/^\.\.\//, "", img)
    }

    img_num = get_img_num_by_href(img)

    base = basename(img)
    # +2 skips the last character of base and the dot
    ext = substr(img, length(base) + 2)
    if (ext == "jpeg") {
        short_ext = "jpg"
    } else {
        short_ext = substr(ext, 1, 3)
    }

    # There should always be an img_num, but just in case...
    # The lookup reports "not found" as "", and "" == 0 is false in
    # awk, so the length is tested as well.
    if (length(img_num) == 0 || img_num == 0) {
        print "WARN: Unable to find image in manifest: " img

        # keep only the file name, then at most its last 8 characters
        short_base = base
        sub(/^.*[\/\\]/, "", short_base)
        if (length(short_base) > 8) {
            short_base = substr(short_base, length(short_base) - 7)
        }
        short = short_base "." short_ext

        link = dirjoin(opf_dir, img)
        if (!file_exists(link)) {
            link = "OEBPS/" img
            if (!file_exists(link)) {
                 link = "OPS/" img
            }
        }

        copy(link, txt_dir "/" short)
    } else {
        short = img_num "." short_ext
    }

    printf "![%s](%s)", before, short >>out
    return
}

# function print_txt_ref()
#
# Decides whether the bracketed part of a token points at a file of
# the EPUB.  Such references are dropped; anything else is kept.
#
# IN:  text1 [link to .html/.xhtml file within EPUB content]
# OUT: text1
#
# IN:  text1 [text2]
# OUT: text1 [text2]
#
# out  file to append to
# buf  one token of webdump output containing "["
# pos  position of "[" within buf
# len  not used

function print_txt_ref(out, buf, pos, len,    gap, hit, lead, target, text)
{
    lead = substr(buf, 1, pos-1)
    text = substr(buf, pos+1)

    # reduce the bracket content to a bare path: no "../", no "#frag"
    target = text
    while (target ~ /^\.\.\//) {
        sub(/^\.\.\//, "", target)
    }
    sub(/#.*/, "", target)

    # look in the manifest first, then among the HTML files found
    hit = get_item_id_by_href(target)
    if (length(hit) == 0) {
        hit = get_html_file_by_href(target)
    }

    if (length(hit)) {
        # a reference: keep only the text in front of it
        printf "%s", lead >>out
        return
    }

    # not a reference: restore the bracket, separated by one space
    gap = " "
    if (lead ~ / $/) {
        gap = ""
    }
    printf "%s%s[%s]", lead, gap, text >>out
    return
}

# print_wrap(out, str, wraplen)
#
# Append str to the file out, broken into lines at spaces so that each
# line is at most wraplen characters long where possible.  The space
# at which a line is broken is dropped.
#
# A word longer than wraplen cannot be broken; it is written on a line
# of its own (up to the next space) and wrapping carries on with the
# text after it.  Only plain string functions are used, no regex.

function print_wrap(out, str, wraplen,     buf, pos, rest) {
    buf = str
    while (length(buf) > wraplen) {
        # last space within the first wraplen characters
        pos = find_break(substr(buf, 1, wraplen))
        if (pos == 0) {
            # over-long word: break at the first space after it
            rest = index(substr(buf, wraplen + 1), " ")
            if (rest == 0) {
                # no space left at all: emit the remainder as is
                break
            }
            pos = wraplen + rest
        }
        print_doseol(out, substr(buf, 1, pos-1))
        buf = substr(buf, pos+1)
    }
    print_doseol(out, buf)
    return
}

# process_html(file)
#
# Rewrite file in place, replacing every "text/x-oeb1-css" with
# "text/css".  The original content is first copied to a backup named
# after file with its extension replaced by ".bak", and the new
# content is produced from that backup.
#
# If the backup cannot be read after the copy, file is left untouched:
# truncating it at that point would destroy the only copy.

function process_html(file,     bak) {
    bak = basename(file) ".bak"
    copy(file, bak)
    if (!file_exists(bak)) {
        print "WARN: Unable to read backup " bak "; leaving " file " unchanged."
        return
    }
    truncate(file)
    while ((getline <bak) > 0) {
        gsub(/text\/x-oeb1-css/, "text/css")
        print >>file
    }
    close(bak)
    close(file)
    return
}

# trim(str)
#
# Return str without leading and trailing spaces.  Only the space
# character is stripped, not tabs.

function trim(str,    text) {
    text = str
    sub(/^  */, "", text)
    sub(/ * $/, "", text)
    return text
}

# trimright(str)
#
# Return str without trailing spaces.  Leading spaces are kept.

function trimright(str,    text) {
    text = str
    sub(/ * $/, "", text)
    return text
}

# truncate(file)
#
# Empty file (creating it if necessary) by opening it for writing and
# writing nothing.  The file is not closed here.

function truncate(file) {
    printf("") > file
}

# usage()
#
# Print the command line help and stop the program.  main() calls this
# when the argument count is wrong, so the exit status is non-zero,
# like the other error exits in this script.

function usage() {
    print "Usage: epub2txt.awk book.epub"
    print ""
    print "Converts book.epub into plain text."
    print ""
    print "This script extracts to the current directory."
    print "Run the script from the desired directory."
    exit(1)
}

# main()
#
# Program entry point, called from BEGIN.  Steps:
#
#   1. Set the output names and pick the external command names for
#      DOS or for other systems (see detect_dos).
#   2. Require exactly one argument, the EPUB file, and unpack it into
#      the current directory.
#   3. List the unpacked files and pick out the first .opf file, the
#      first .ncx file, and the HTML/XML content files that live in a
#      subdirectory.
#   4. Convert the .ncx and .opf files to opf.tsv / ncx.tsv with the
#      xmlrem and xml2tsv commands.
#   5. Fix the CSS media type in the content files, parse the TSV
#      files, and write the HTML indexes and the plain text output.
#
# Exits with status 1 when no .opf file is found, otherwise with 0.

function main(     cmd, i) {
    html_out = "index.html"

    # Use plaintxt directory to avoid file name collisions.
    # Formerly, i copied the images to 8.3 names under "images" or "img"
    # but both of those are commonly used subdirectories in EPUB files.

    txt_dir = "plaintxt"
    utf8_out = txt_dir "/index.txt"
    dos_out = txt_dir "/index.dos"

    is_dos = detect_dos()

    # external commands, by platform
    if (is_dos) {
        cmd_copy = "gnucp.exe -f"
        cmd_del = "redir.exe -e NUL -o NUL del"
        cmd_find = "gnufind.exe"
        cmd_md = "md"
        cmd_unzip = "unzip.exe"
        cmd_utf8tocp = "utf8tocp.com"
        cmd_webdump = "webdump.exe"
        # Note, close all open files before running xmlrem.bat and
        # xml2tsv.bat.  Otherwise, if this script is run with
        # GAWK.EXE it will result in FAT corruption and trash the
        # users disk.
        cmd_xmlrem = "xmlrem.bat"
        cmd_xml2tsv = "xml2tsv.bat"
    } else {
        cmd_copy = "cp -f"
        cmd_del = "rm -f"
        cmd_find = "find"
        cmd_md = "mkdir -p"
        cmd_unzip = "unzip"
        cmd_utf8tocp = "utf8tocp"
        cmd_webdump = "webdump"
        cmd_xmlrem = "xmlrem.awk"
        cmd_xml2tsv = "xml2tsv.awk"
    }

    # ARGV[0] is the program name, so ARGC is 2 for one argument
    if (ARGC != 2) {
        usage()
    }

    # NOTE(review): base is assigned here but not read again in main
    base = basename(ARGV[1])

    # unpack the EPUB into the current directory
    epub = ARGV[1]
    cmd = sprintf("%s -o \"%s\"", cmd_unzip, epub)
    system(cmd)

    has_ncx = 0
    ncx_xml = ""
    opf_xml = ""
    htmlfiles = 0
    del("files.txt")

    print "Finding files in EPUB ..."
    system(cmd_find " . -type f >files.txt")

    print "Filtering file list ..."
    while ((getline <"files.txt") > 0) {
        # remove leading ./
        gsub(/^\.\//, "")

        if (/\/\./) {
            # skip hidden files
            continue
        }

        # match file types without regard to case
        lfile = tolower($0)
        if (lfile ~ /meta-inf/) {
            # skip files under meta-inf/ directory
            continue
        }

        # only the first .opf and the first .ncx file are used
        if (lfile ~ /\.opf$/ && length(opf_xml) == 0) {
            opf_xml = $0
        } else if (lfile ~ /\.ncx$/ && length(ncx_xml) == 0) {
            ncx_xml = $0
            has_ncx = 1
        } else if (lfile ~ /\.x?html?$/ || lfile ~ /\.xml$/) {
            # content files in the current directory itself are left out
            if (dirname(lfile) != ".") {
                htmlfiles++
                htmlfile[htmlfiles] = $0
            }
        }
    }
    close("files.txt")

    if (has_ncx && length(ncx_xml) == 0) {
        print "Error: Couldn't find .ncx file"
        exit(1)
    }
    if (length(opf_xml) == 0) {
        print "Error: Couldn't find .opf file"
        exit(1)
    }

    if (has_ncx) {
        # DOS: Make sure all files are closed before this point
        cmd = sprintf("%s \"%s\" >ncx.rem", cmd_xmlrem, ncx_xml)
        system(cmd)
        cmd = sprintf("%s ncx.rem >ncx.tsv", cmd_xml2tsv)
        system(cmd)
    }

    # DOS: Make sure all files are closed before this point
    # hrefs from the manifest are later joined onto opf_dir
    opf_dir = dirname(opf_xml)
    cmd = sprintf("%s \"%s\" >opf.rem", cmd_xmlrem, opf_xml)
    system(cmd)
    cmd = sprintf("%s opf.rem >opf.tsv", cmd_xml2tsv)
    system(cmd)
    
    print "Fixing CSS in HTML files ..."
    for (i = 1; i <= htmlfiles; i++) {
        process_html(htmlfile[i])
    }

    print "Parsing EPUB ..."
    # parse_epub() appends NCX entries after index tocs
    tocs = 0
    parse_epub()

    print "Creating HTML table of contents ..."
    create_spine()
    create_toc()
    # ends the line of progress dots
    print ""

    print "Converting to plain text ... "
    create_text()
    print ""

    printf "All done.  See %s, %s, or %s\n", html_out, utf8_out, dos_out
    exit(0)
}

# All work happens in BEGIN; main() ends with exit, so awk never goes
# on to read records from the file named on the command line.
BEGIN {
    main()
}
