#!/usr/bin/mawk -f
# mwimport.awk: convert xml mediawiki dump to sql
#   pv import.xml | ./mwimport.awk >import.sql
# based on mwimport perl script by Robert Bihlmeyer (2007)
# ulcer/sdf.org, 2018, GPL >= 2
#
# WARNING: run this script with awk, which returns "length" in bytes and not chars
# e.g. "mawk -f ./mwimport.awk" or "gawk -b -f ./mwimport.awk"
#
# tag per line is expected. schema referenced for development is mw1.27
# all referenced tags are required, missing closing tag will result in data loss
#
# The parser is a small state machine driven by the globals header, page,
# revision, contributor, comment and text; exactly one of them is non-zero
# while the corresponding XML element is being read.

# Return the text between the opening and the closing tag of the current line.
# A self-closing element (<foo />) yields "".  Modifies $0.
function tag() {
    if (match($0, "^ *<[^/]*/>"))
        return ""
    sub("^ *<[^>]*>", "")
    sub("</[^>]*> *$", "")
    return $0
}

# Decode the XML entities in s and return it as a single-quoted SQL literal.
# Side effect: the global "len" is set to the length of the decoded string
# (before quote doubling); callers use it for rev_len / page_len.
function textify(s) {
    gsub("&quot;", "\"", s)
    gsub("&lt;", "<", s)
    gsub("&gt;", ">", s)
    # &amp; must be decoded last, otherwise "&amp;lt;" would first become
    # "&lt;" and then be decoded a second time into "<".
    gsub("&amp;", "\\&", s)
    len = length(s)
    gsub("'", "''", s)
    return "'" s "'"
}

# Emit the accumulated tuples as one transaction and reset the buffers.
# Each statement is skipped when its buffer is empty so that no
# "VALUES ;" statement is ever produced.
function flush() {
    print "BEGIN;"
    if (dump_text != "")
        print "REPLACE INTO text(old_id,old_text,old_flags) VALUES " dump_text ";"
    if (dump_page != "")
        print "REPLACE INTO page(page_id,page_namespace,page_title," \
            "page_restrictions,page_is_redirect,page_is_new,page_random," \
            "page_touched,page_latest,page_len) VALUES " dump_page ";"
    if (dump_rev != "")
        print "REPLACE INTO revision(rev_id,rev_page,rev_text_id,rev_comment," \
            "rev_user,rev_user_text,rev_timestamp,rev_minor_edit,rev_deleted," \
            "rev_len,rev_parent_id) VALUES " dump_rev ";"
    print "COMMIT;"
    dump_text = dump_page = dump_rev = ""
}

BEGIN { header = 1 }

# multiline values between tags: while "comment" or "text" is set, every
# line is appended to the current value until the closing tag is seen
comment {
    if (match($0, "</comment>")) {
        sub("</comment> *$", "")
        comment = 0
    }
    rev_comment = rev_comment "\n" $0
    next
}

text {
    if (match($0, "</text>")) {
        sub("</text> *$", "")
        text = 0
    }
    rev_text = rev_text "\n" $0
    next
}

# mediawiki: header, namespace, etc.
# Builds ns[key] = namespace name from <namespace key="N" ...>Name</namespace>.
# NOTE(review): the key extraction below was reconstructed; confirm that it
# matches the upstream script.
header && /<namespace key=/ {
    key = $0
    sub(".*key=\"", "", key)
    sub("\".*", "", key)
    if (match($0, "/> *$"))
        value = ""          # self-closing element: the unnamed main namespace
    else
        value = $0
    sub("[^>]*>", "", value)
    sub("<.*", "", value)
    ns[key] = value
    next
}

header && /<\/siteinfo>/ { header = 0; next }

# page
!page && /<page>/ {
    page = 1
    page_latest = 0
    page_latest_len = 0
    latest_start = ""
    page_restrictions = ""
    next
}

page && /<title>/        { page_title = tag(); next }
page && /<id>/           { page_id = tag(); next }
page && /<restrictions>/ { page_restrictions = tag(); next }
page && /<redirect>/     { page_redirect = tag(); next }

page && /<redirect *title/ {
    sub(".*title=\"", "")
    sub("\".*", "")
    page_redirect = $0
    next
}

# each page contains revision
page && /<revision>/ {
    revision = 1; page = 0
    rev_contributor_username = rev_contributor_id = 0
    # reset optional fields so that a revision lacking them does not inherit
    # the (already quoted) values of the previous revision
    rev_comment = rev_text = ""
    rev_minor = 0
    next
}

revision && /<id>/        { rev_id = tag(); next }
revision && /<timestamp>/ { rev_timestamp = tag(); next }
# <minor/> is an empty element; its mere presence marks a minor edit
revision && /<minor *\/?>/ { rev_minor = 1; next }

# each revision contains contributor
revision && /<contributor>/ { contributor = 1; revision = 0; next }

contributor && /<username>/        { rev_contributor_username = tag(); next }
contributor && /<ip>/              { rev_contributor_username = tag(); next }
contributor && /<id>/              { rev_contributor_id = tag(); next }
contributor && /<\/contributor>/   { contributor = 0; revision = 1; next }

# each revision contains comment
revision && /<comment>/ {
    comment = 1
    sub(" *<comment>", "")
    if (match($0, "</comment>")) {
        sub("</comment> *$", "")
        comment = 0
    }
    rev_comment = $0
    next
}

# each revision contains text; the opening tag may carry any attributes
revision && /<text[ >]/ {
    if (match($0, "<text[^>]*/> *$")) {
        # self-closing <text ... />: empty body, no closing tag will follow
        rev_text = ""
        next
    }
    text = 1
    sub("^ *<text[^>]*>", "")
    if (match($0, "</text>")) {
        sub("</text> *$", "")
        text = 0
    }
    rev_text = $0
    next
}

# end of revision: quote the collected values and queue the text/revision tuples
revision && /<\/revision>/ {
    revision = 0; page = 1

    rev_text = textify(rev_text)
    rev_len = len                       # textify() leaves the decoded length in len
    rev_comment = textify(rev_comment)
    rev_contributor_username = textify(rev_contributor_username)
    gsub("[-:TZ]", "", rev_timestamp)   # 2018-01-02T03:04:05Z -> 20180102030405

    if (dump_text != "")
        dump_text = dump_text ",\n"
    dump_text = dump_text "(" rev_id "," rev_text ",'utf-8')"

    rev_minor = rev_minor ? 1 : 0
    if (dump_rev != "")
        dump_rev = dump_rev ",\n"
    # page_latest still holds the previous revision of this page (0 for the
    # first one) and is used as rev_parent_id
    dump_rev = dump_rev "(" rev_id "," page_id "," rev_id "," rev_comment \
        "," rev_contributor_id "," rev_contributor_username "," rev_timestamp \
        "," rev_minor ",0," rev_len "," page_latest ")"

    page_latest = rev_id
    page_latest_len = rev_len
    latest_start = substr(rev_text, 1, 60)
    next
}

# end of page: split the namespace prefix off the title and queue the page tuple
page && /<\/page>/ {
    pages++
    page = 0

    page_ns = 0
    for (i in ns) {
        if (ns[i] == "")
            continue                    # main namespace has no prefix
        if (substr(page_title, 1, length(ns[i]) + 1) == ns[i] ":") {
            page_ns = i
            page_title = substr(page_title, length(ns[i]) + 2)
            break
        }
    }
    gsub(" ", "_", page_title)          # page_title is stored with underscores
    page_title = textify(page_title)
    page_restrictions = textify(page_restrictions)

    # latest_start begins with the opening quote added by textify()
    page_redirect = 0
    if (match(latest_start, "^'#(REDIRECT|redirect) "))
        page_redirect = 1

    if (dump_page != "")
        dump_page = dump_page ",\n"
    dump_page = dump_page "(" page_id "," page_ns "," page_title "," \
        page_restrictions "," page_redirect ",0,random(),date()+0," \
        page_latest "," page_latest_len ")"
    next
}

# write out a batch once more than 100 pages have been collected
pages > 100 { pages = 0; flush() }

END { if (pages) flush() }