#!/usr/bin/mawk -f
# mwimport.awk: convert xml mediawiki dump to sql
# pv import.xml | ./mwimport.awk >import.sql
# based on mwimport perl script by Robert Bihlmeyer (2007)
# ulcer/sdf.org, 2018, GPL >= 2

# WARNING: run this script with an awk whose "length" counts bytes, not chars,
# e.g. "mawk -f ./mwimport.awk" or "gawk -b -f ./mwimport.awk"

# One tag per line is expected; the schema referenced for development is mw1.27.
# <restrictions>, <minor>, <comment> and the children of <contributor> are
# optional; every other referenced tag is required. A missing closing tag
# will result in data loss.

# Return the content of the single-line element on the current input line,
# or "" when the line holds a self-closing tag. Rewrites $0 as a side effect.
function tag() {
	if (match($0, "^ *<[^/]*/>")) return ""
	sub("^ *<[^>]*>", ""); sub("</[^>]*> *$", "")
	return $0
}

# Return the body of the <name ...>...</name> element that opens on the
# current input line. The body may span several input lines; inner newlines
# are preserved. A self-closing element yields "". Rewrites $0 and consumes
# input up to the line holding the closing tag.
function element_body(name,    body) {
	if ($0 ~ ("^ *<" name "[^>]*/>")) return ""
	sub("^ *<" name "[^>]*>", "")
	body = ""
	# append the current line BEFORE reading the next one, so that the first
	# line is kept and the closing line is added exactly once (below)
	while ($0 !~ ("</" name ">")) {
		body = body $0 "\n"
		line()
	}
	sub("</" name "> *$", "")
	return body $0
}

# Decode the XML entities in s and return it as a single-quoted SQL literal
# (quotes escaped by doubling). Side effect: the global "len" is set to the
# length of the decoded text, measured before SQL quoting.
function textify(s) {
	gsub("&quot;", "\"", s)
	gsub("&lt;", "<", s); gsub("&gt;", ">", s)
	# "&amp;" must be decoded last, otherwise "&amp;lt;" is decoded twice
	gsub("&amp;", "\\&", s)
	len = length(s)
	gsub("'", "''", s)
	return "'" s "'"
}

# Print the buffered rows as one transaction and reset the buffers.
# A statement is only emitted when its buffer holds at least one row.
function flush() {
	print "BEGIN;"
	if (dump_text != "")
		print "REPLACE INTO text(old_id,old_text,old_flags) VALUES " dump_text ";"
	if (dump_page != "")
		print "REPLACE INTO page(page_id,page_namespace,page_title," \
			"page_restrictions,page_is_redirect,page_is_new,page_random," \
			"page_touched,page_latest,page_len) VALUES " dump_page ";"
	if (dump_rev != "")
		print "REPLACE INTO revision(rev_id,rev_page,rev_text_id,rev_comment," \
			"rev_user,rev_user_text,rev_timestamp,rev_minor_edit,rev_deleted," \
			"rev_len,rev_parent_id) VALUES " dump_rev ";"
	print "COMMIT;"
	dump_text = dump_page = dump_rev = ""
}

# Read the next input line into $0; leave the program (running END) on end
# of input or on a read error (getline returns 0 or -1 respectively).
function line() {
	if ((getline) <= 0) exit
}

# Advance the input until the current line matches regex s.
function expect(s) {
	while ($0 !~ s) line()
}

# Return 1 when the current line matches regex s, 0 otherwise.
function try(s) {
	if ($0 ~ s) return 1
	return 0
}

# Parse the next <revision> of the current page and buffer its text and
# revision rows. Reads the global page_id and page_latest (used as the parent
# revision id) and updates page_latest, page_latest_len and latest_start.
# Returns 1 when a revision was consumed, 0 when </page> was reached first.
function revision() {
	# skip whatever precedes the next revision: page-level tags such as
	# <redirect/>, or the <sha1>/</revision> tail of the previous revision
	while ($0 !~ "<revision>" && $0 !~ "</page>") line()
	if (!try("<revision>")) return 0

	rev_contributor_username = rev_contributor_id = rev_minor = 0
	rev_comment = ""
	expect("<id>"); rev_id = tag()
	expect("<timestamp>"); rev_timestamp = tag()
	expect("<contributor")
	# a deleted contributor is written as a self-closing tag without children
	if ($0 !~ "/> *$") {
		line()
		if (try("<username>")) {
			rev_contributor_username = tag()
			expect("<id>"); rev_contributor_id = tag()
		} else if (try("<ip>"))
			rev_contributor_username = tag()
		expect("</contributor>")
	}
	line()
	# <minor/> is an empty marker element: its presence is the value
	if (try("<minor")) {
		rev_minor = 1
		line()
	}
	# the export omits <comment> for revisions without an edit summary
	if (try("<comment")) rev_comment = element_body("comment")
	expect("<text")
	rev_text = element_body("text")

	# process revision data
	rev_text = textify(rev_text)
	rev_len = len
	rev_comment = textify(rev_comment)
	rev_contributor_username = textify(rev_contributor_username)
	gsub("[-:TZ]", "", rev_timestamp)

	if (dump_text != "") dump_text = dump_text ",\n"
	dump_text = dump_text "(" rev_id "," rev_text ",'utf-8')"

	if (dump_rev != "") dump_rev = dump_rev ",\n"
	dump_rev = dump_rev "(" rev_id "," page_id "," rev_id "," rev_comment \
		"," rev_contributor_id "," rev_contributor_username "," rev_timestamp \
		"," rev_minor ",0," rev_len "," page_latest ")"

	page_latest = rev_id
	page_latest_len = rev_len
	# quoted prefix of the text, used by the caller to detect redirects
	latest_start = substr(rev_text, 1, 60)
	return 1
}

# Read <siteinfo> and build ns[key] = namespace name.
BEGIN {
	while ($0 !~ /<\/siteinfo>/) {
		line()
		if ($0 ~ /<namespace /) {
			key = $0; sub(".*key=\"", "", key); sub("\".*", "", key)
			if ($0 ~ "/> *$") {
				# self-closing tag: the unnamed main namespace
				value = ""
			} else {
				value = $0
				sub("[^>]*>", "", value); sub("<.*", "", value)
			}
			ns[key] = value
		}
	}
}

# Each run of this rule consumes one complete <page> element.
{
	expect("<page>")
	pages++
	page_latest = page_latest_len = 0
	page_restrictions = latest_start = ""
	expect("<title>"); page_title = tag()
	expect("<id>"); page_id = tag()
	# collect optional page-level tags up to the first revision
	while ($0 !~ "<revision>" && $0 !~ "</page>") {
		if (try("<restrictions>")) page_restrictions = tag()
		line()
	}
	while (revision()) {}
	expect("</page>")

	# split a leading "Namespace:" prefix off the title
	page_ns = 0
	for (i in ns) {
		if (ns[i] != "" && \
		    substr(page_title, 1, length(ns[i]) + 1) == ns[i] ":") {
			page_ns = i
			page_title = substr(page_title, length(ns[i]) + 2)
			break
		}
	}
	# MediaWiki stores page titles with underscores instead of spaces
	gsub(" ", "_", page_title)
	page_title = textify(page_title)
	page_restrictions = textify(page_restrictions)
	page_redirect = 0
	if (match(latest_start, "^'#(REDIRECT|redirect) ")) page_redirect = 1

	if (dump_page != "") dump_page = dump_page ",\n"
	dump_page = dump_page "(" page_id "," page_ns "," page_title "," \
		page_restrictions "," page_redirect ",0,random(),date()+0," \
		page_latest "," page_latest_len ")"

	if (pages > 100) {
		pages = 0
		flush()
	}
}

END {
	if (pages) flush()
}