# dump(search, type)
#
# Fetch the URL given in `search` with curl and emit it according to `type`:
#   TYPE_HEADERS  run curl with --dump-header and send its output straight
#                 to stdout via system()
#   TYPE_RAW      run curl and send the body straight to stdout via system()
#   TYPE_TEXT     pipe curl through strings and webdump; print the content
#                 lines, then the references section with full URLs
#   TYPE_LINKS    emit a menu of alternative views (via item()), skip the
#                 content lines, then print the references section
#
# An occurrence of "%3F" in `search` is decoded to "?" before validation.
# URLs that do not match the http/https whitelist pattern are rejected with
# an error line and nothing is fetched.
#
# Everything after `type` in the parameter list is a local variable.
function dump(search, type,    base, cmd, curlcfg, is_html, is_image, label, limit, marker, out, parts, proto, root, sel, url) {
    out = ""
    url = search
    gsub(/%3F/, "?", url)

    if (url !~ /^(http|https):\/\/[[:alnum:].-]+(:[0-9]+)*(\/[[:alnum:].,?@~=%%:\/+&_() -]*)*$/) {
        info(out, sprintf("Error: Unacceptable URL \"%s\"", url))
        return
    }

    # Binary-ish outputs and text outputs have separate size limits.
    if (type == TYPE_HEADERS || type == TYPE_RAW) {
        limit = max_bin_size
    } else {
        limit = max_txt_size
    }

    # Use temporary file for curl configuration.
    # This keeps user input separate from shell execution.
    curlcfg = gettemp()
    printf "--connect-timeout 10\n" > curlcfg
    if (limit > 0) {
        # The limit is written with an "M" suffix for curl's --max-filesize.
        printf "--max-filesize %dM\n", limit >> curlcfg
    }
    printf "--max-redirs 0\n" >> curlcfg
    printf "--proto =http,https\n" >> curlcfg
    printf "--show-error\n" >> curlcfg
    printf "--silent\n" >> curlcfg
    printf "--url %s\n", uri_encode(url) >> curlcfg
    printf "--user-agent %s\n", agent >> curlcfg
    if (type == TYPE_HEADERS) {
        # Discard the body and write the response headers to stdout.
        printf "--output /dev/null\n" >> curlcfg
        printf "--dump-header -\n" >> curlcfg
    }
    close(curlcfg)

    if (type == TYPE_HEADERS || type == TYPE_RAW) {
        # No post-processing: curl writes directly to our stdout.
        cmd = sprintf("%s -K %s 2>&1", cmd_curl, curlcfg)
        system(cmd)
        unlink(curlcfg)
        return
    }

    # Use strings command to guard webdump from binary input.
    # Use "strings -a" to avoid security pitfalls.
    cmd = sprintf("%s -K %s 2>&1 | %s -a -n 3 | %s -ilr -w 60", cmd_curl, curlcfg, cmd_strings, cmd_webdump)

    # Parse base out of original URL.
    # Use this to convert relative links to full URLs.
    # webdump has the -b option for this.
    # Do it manually instead to avoid passing user input through the shell.
    split(url, parts, "?")
    base = parts[1]

    # root is "scheme://host[:port]"; proto is "scheme:".
    # substr() positions are 1-based: a start of 0 makes POSIX awks return
    # one character fewer than RLENGTH, truncating the result.
    if (match(base, /^(http|https):\/\/[[:alnum:].-]+(:[0-9]+)*/)) {
        root = substr(base, 1, RLENGTH)
    } else {
        root = ""
    }
    if (match(base, /^(http|https):/)) {
        proto = substr(base, 1, RLENGTH)
    } else {
        proto = ""
    }
    sub(/\/$/, "", base)

    # marker determines where the bottom references section begins
    # line numbers smaller than the marker are content
    # line numbers larger than the marker are referenced links
    marker = 999999

    if (type == TYPE_LINKS) {
        sel = cgipath "/raw?" search
        is_html = detect_html(url)
        is_image = detect_image(url)
        item(out, "9", "Binary download", sel, server, port)
        if (is_image) {
            item(out, "I", "Image view", sel, server, port)
        }
        if (is_html) {
            label = "Source"
        } else {
            label = "Text view"
        }
        item(out, "0", label, sel, server, port)

        if (is_html) {
            label = "HTML view"
        } else {
            label = "Strings"
        }
        sel = cgipath "/text?" search
        item(out, "0", label, sel, server, port)

        sel = cgipath "/debug?" search
        item(out, "0", "Headers", sel, server, port)
        info(out, "")
    }

    # "cmd | getline" sets $0 and increments NR for every line read, so NR
    # can be compared against the marker to tell content from references.
    while ((cmd | getline) > 0) {
        if (NR < marker) {
            if ($0 ~ /^References$/) {
                marker = NR
            }
            if (type == TYPE_TEXT) {
                print
            }
        } else {
            print_ref_full($0, base, proto, root)
        }
    }
    close(cmd)
    unlink(curlcfg)
    return
}

# print_html(out, html)
#
# Render an HTML string as text: literal "\n" sequences in `html` become real
# newlines, the result is written to a temporary file, and that file is piped
# through strings and webdump. Content lines are emitted with info(); lines
# after the "References" heading are handed to print_ref_pharos().
#
# Everything after `html` in the parameter list is a local variable.
function print_html(out, html,    cmd, marker, work) {
    # NOTE(review): the caller-supplied `out` is overwritten here, so its
    # value is never used. Kept as-is to preserve behavior; confirm whether
    # the reset is intended.
    out = ""

    work = gettemp()
    gsub(/\\n/, "\n", html)
    print html >work
    close(work)

    cmd = sprintf("%s -a -n 3 <%s | %s -ilr -w 60", cmd_strings, work, cmd_webdump)

    # Same marker scheme as dump(): lines read before the "References"
    # heading are content, lines read after it are reference links.
    marker = 999999
    while ((cmd | getline) > 0) {
        # Replace both real tabs and literal "\t" sequences with a space.
        gsub(/\t/, " ")
        gsub(/\\t/, " ")
        if (NR < marker) {
            if ($0 ~ /^References$/) {
                marker = NR
            }
            info(out, $0)
        } else {
            print_ref_pharos(out, $0)
        }
    }
    close(cmd)
    unlink(work)
    return
}

# Print the webdump references section, converting relative URLs
# to full URLs
#
#   str    one line of webdump output
#   base   URL without query string or trailing slash
#   proto  URL scheme including the colon, e.g. "https:"
#   root   scheme, host and optional port, e.g. "https://example.org"
#
# Lines that do not look like a numbered reference are emitted unchanged.
function print_ref_full(str, base, proto, root,    link, out, prefix, relative) {
    out = ""
    if (match(str, /^ [0-9]+\. /)) {
        # substr() is 1-based; starting at 0 would drop the last character
        # of the numbered prefix on POSIX awks.
        prefix = substr(str, 1, RLENGTH)
        link = substr(str, RLENGTH + 1)

        # convert relative links to full URLs
        if (link !~ /^[a-z]+:/) {
            relative = link
            if (relative ~ /^\/\//) {
                # protocol-relative: "//host/path"
                link = proto relative
            } else if (relative ~ /^\//) {
                # host-relative: "/path"
                link = root relative
            } else {
                # relative to the base URL
                link = base "/" relative
            }
        }
        info(out, prefix link)
    } else {
        info(out, str)
    }
    return
}

# Print the webdump references section, translating archive.org URLs to
# pharos URLs
#
#   out  passed through unchanged to item() and info()
#   str  one line of webdump output
#
# A numbered reference whose link contains an archive.org "/details/" URL is
# emitted as a type "1" menu item pointing at cgipath "/details?" id. Every
# other line is emitted unchanged with info().
function print_ref_pharos(out, str,    id, label, link, prefix, token) {
    if (match(str, /^ [0-9]+\. /)) {
        # substr() is 1-based; starting at 0 would drop the last character
        # of the numbered prefix on POSIX awks.
        prefix = substr(str, 1, RLENGTH)
        link = substr(str, RLENGTH + 1)
        id = ""

        if (match(link, /https?:\/\/(www\.)?archive\.org\/details\//)) {
            token = substr(link, RSTART+RLENGTH)
            # NOTE(review): the last 7 characters of the token are dropped;
            # the reason is not visible in this file -- confirm.
            id = substr(token, 1, length(token) - 7)
            # Cut the id at the first "?", "/" or space, if any.
            if (match(id, /[?\/ ]/)) {
                id = substr(id, 1, RSTART - 1)
            }
        }

        if (length(id) > 0) {
            label = prefix id
            item(out, "1", label, cgipath "/details?" id, server, port)
        } else {
            info(out, str)
        }
    } else {
        info(out, str)
    }
    return
}

# Define the global TYPE_* constants accepted by dump() as its `type`
# argument.
function web_init() {
    TYPE_HEADERS = 2
    TYPE_LINKS = 1
    TYPE_RAW = 9
    TYPE_TEXT = 0
}