Check-in by ben on 2024-08-05 23:35:09 Change print_html() to convert archive.org to pharos links in the webdump References section. INSERTED DELETED 67 20 src/web.awk 67 20 TOTAL over 1 changed file Index: src/web.awk ================================================================== --- src/web.awk +++ src/web.awk @@ -107,52 +107,99 @@ } if (type == TYPE_TEXT) { print $0 } } else { - if (match($0, /^ [0-9]+\. /)) { - prefix = substr($0, 0, RLENGTH) - link = substr($0, RLENGTH+1) - if (link !~ /^[a-z]+:/) { - # convert relative link to full URL - relative = link - if (relative ~ /^\/\//) { - link = proto relative - } else if (relative ~ /^\//) { - link = root relative - } else { - link = base "/" relative - } - } - print prefix link - } else { - print $0 - } + print_ref_full($0, base, proto, root) } } close(cmd) unlink(curlcfg) return } -function print_html(html, cmd, work) { +function print_html(html, cmd, marker, work) { work = gettemp() gsub(/\\n/, "
", html) print html >work close(work) cmd = sprintf("%s -a -n 3 <%s | %s -ilr -w 60", cmd_strings, work, \ cmd_webdump) + marker = 999999 while ((cmd | getline) > 0) { gsub(/\t/, " ") - print + if (NR < marker) { + if ($0 ~ /^References$/) { + marker = NR + } + print $0 + } else { + print_ref_pharos($0) + } } close(cmd) unlink(work) return } + +# Print the webdump references section, converting relative URLs +# to full URLs + +function print_ref_full(str, base, proto, root, link, prefix, relative) { + if (match(str, /^ [0-9]+\. /)) { + prefix = substr(str, 0, RLENGTH) + link = substr(str, RLENGTH+1) + # convert relative links to full URLs + if (link !~ /^[a-z]+:/) { + # convert relative link to full URL + relative = link + if (relative ~ /^\/\//) { + link = proto relative + } else if (relative ~ /^\//) { + link = root relative + } else { + link = base "/" relative + } + } + print prefix link + } else { + print str + } + return +} + + +# Print the webdump references section, translating archive.org URLs to +# pharos URLs + +function print_ref_pharos(str, id, label, link, prefix, relative, token) { + if (match(str, /^ [0-9]+\. /)) { + prefix = substr(str, 0, RLENGTH) + link = substr(str, RLENGTH+1) + + id = "" + if (match(link, /https?:\/\/(www\.)?archive\.org\/details\//)) { + token = substr(link, RSTART+RLENGTH) + id = substr(token, 1, length(token) - 7) + if (match(id, /[?\/ ]/)) { + id = substr(id, 1, RSTART-1) + } + } + if (length(id) > 0) { + label = prefix id + printf "[1|%s|%s/details/%s|%s|%s]\n", label, cgipath, + id, server, port + } else { + print str + } + } else { + print str + } + return +} function web_init() { TYPE_HEADERS = 2 TYPE_LINKS = 1 TYPE_RAW = 9