#!/usr/bin/awk -f

# xml2tsv.awk version 8 by Ben Collver
# reads XML from stdin and writes TSV to stdout
#
# Records are split on ">" (RS), so each record is "<text before tag><tag".
# Output lines have the form "<path>\t<value>", where <path> is one of
#   /a/b[@attr]   an attribute value
#   /a/b/text()   accumulated character data of an element
#   /a/b          a self-closing element (empty value)
# Newlines and tabs inside values are written as the two-character
# escapes held in eolstr and tabstr.

# Append str to the global text buffer, separating pieces with eolstr.
# Empty strings are ignored.
function append(str) {
    if (length(str) > 0) {
        if (length(text) == 0) {
            text = str
        } else {
            text = text eolstr str
        }
    }
    return
}

# Print the bytes encoded by the hex string str, two digits per byte.
# Tab (09) and newline (0A) are printed as tabstr / eolstr so the
# output stays on one TSV line.  d is a local.
function hex_decode_print(str,    d) {
    while (length(str) > 0) {
        # The lookup table is keyed by upper-case pairs; fold case so
        # that lower-case hex digits decode too.
        d = toupper(substr(str, 1, 2))
        str = substr(str, 3)
        if (d == "09") {
            printf "%s", tabstr
        } else if (d == "0A") {
            printf "%s", eolstr
        } else {
            printf "%c", hex_decode_ord[d]
        }
    }
    return
}

# Fill hex_decode_ord with "00".."FF" -> 0..255.  i and d are locals.
function hex_init(    i, d) {
    for (i = 0; i <= 255; i++) {
        d = sprintf("%02X", i)
        hex_decode_ord[d] = i
    }
    return
}

# Return str with leading/trailing newlines, carriage returns and
# spaces removed, and with remaining newlines and tabs replaced by
# eolstr / tabstr.  retval is a local.
function normalize(str,    retval) {
    retval = str
    gsub(/^[\n\r ][\n\r ]*/, "", retval)
    gsub(/[\n\r ]*[\n\r ]$/, "", retval)
    gsub(/\r\n/, eolstr, retval)
    gsub(/\n/, eolstr, retval)
    gsub(/\t/, tabstr, retval)
    return retval
}

# Return str with leading newlines and spaces removed.  retval is a local.
function trimleft(str,    retval) {
    retval = str
    gsub(/^[\n ][\n ]*/, "", retval)
    return retval
}

BEGIN {
    eolstr = "\\n"      # literal backslash-n written in place of a newline
    tabstr = "\\t"      # literal backslash-t written in place of a tab
    is_hex = 0
    path = ""
    text = ""
    RS = ">"
    hex_init()
}

/^<\?xml/ {
    # ignore xml header
    next
}

# Closing tag: "<text></name".
# NOTE(review): the pattern, the split, the guard and the error message
# of this rule were missing from the file as received (everything from
# the "<" in the pattern to the ">" that ended the message).  They are
# reconstructed from how tokens[1] / tokens[2] are used below - confirm
# against the upstream script.
/<\// {
    count = split($0, tokens, /</)
    if (count != 2) {
        print "Error: Expected exactly one \"<\" before \">\""
        exit(1)
    }
    str = normalize(tokens[1])
    append(str)
    name = substr(tokens[2], 2)     # drop the leading "/"
    if (name == "awk:cdata") {
        if (length(text) > 0) {
            printf "%s/text()\t", path
            hex_decode_print(text)
            print ""
        }
        is_hex = 0
    } else {
        # The closing name must equal the tail of the current path.
        namelen = length(name)
        pathlen = length(path)
        expected = substr(path, 1 + pathlen - namelen, namelen)
        if (name != expected) {
            printf "Error: Expected \"%s\" closing tag, got \"%s\"\n", expected, name
            exit(1)
        }
        if (length(text) > 0) {
            printf "%s/text()\t%s\n", path, text
        }
        # Pop "/name" off the path.
        path = substr(path, 1, pathlen - namelen - 1)
    }
    # NOTE(review): closed_path is not assigned anywhere in the visible
    # script, so this prints an empty path.  Left as received; confirm
    # whether it should be the path of the element just closed.
    printf "%s\t\n", closed_path
    text = ""
    next
}

# Opening tag, self-closing tag, CDATA section or DOCTYPE: "<text><...".
# NOTE(review): the head of this rule (pattern, is_open default, split,
# guard and error message) was missing from the file as received and is
# reconstructed from the surviving body - confirm against upstream.
/</ {
    is_open = 1
    count = split($0, tokens, /</)
    if (count != 2) {
        print "Error: Expected exactly one \"<\" before \">\""
        exit(1)
    }
    str = normalize(tokens[1])
    append(str)
    str = tokens[2]
    if (str ~ /^!\[CDATA\[/) {
        # Strip the 8-character "![CDATA[" prefix and the "]]" suffix.
        str = normalize(substr(str, 9, length(str) - 10))
        append(str)
        next
    } else if (str ~ /^!DOCTYPE /) {
        # ignore DOCTYPE
        next
    }
    if (match(str, /[\n ]*\/$/)) {
        # discard slash from self-closing tag
        str = substr(str, 1, length(str) - RLENGTH)
        is_open = 0
    }
    if (match(str, /^awk:cdata type="awk:hexBinary"/)) {
        # NOTE(review): is_hex is set here and cleared on close but is
        # not read anywhere in the visible script.
        is_hex = 1
        next
    }
    match(str, /^[^\n ][^\n ]*/)
    name = substr(str, 1, RLENGTH)
    # Trim before testing for attributes so that "<a >" is not reported
    # as a malformed attribute.
    str = trimleft(substr(str, RLENGTH + 1))
    oldpath = path
    path = path "/" name
    while (length(str) > 0) {
        if (match(str, /^[^=]*="[^"]*"/)) {
            quoted = 1
        } else if (match(str, /^[^=]*=[^\n ]*/)) {
            quoted = 0
        } else {
            printf "Apparently malformed attribute: \"%s\"\n", str
            exit(1)
        }
        pair = substr(str, 1, RLENGTH)
        str = trimleft(substr(str, RLENGTH + 1))
        # Split at the FIRST "=" only, so values that themselves
        # contain "=" (e.g. URLs with query strings) stay intact.
        eq = index(pair, "=")
        attr = substr(pair, 1, eq - 1)
        if (quoted) {
            # Skip the opening quote and drop the closing one.
            value = substr(pair, eq + 2, length(pair) - eq - 2)
        } else {
            value = substr(pair, eq + 1)
        }
        printf "%s[@%s]\t%s\n", path, attr, value
    }
    if (!is_open) {
        # Self-closing element: emit it with an empty value and pop it.
        printf "%s\t\n", path
        path = oldpath
    }
    next
}

# Any record without "<" is plain character data.
{
    str = normalize($0)
    append(str)
}