#!/usr/bin/mawk -f
# mwimport.awk: convert xml mediawiki dump to sql
# pv import.xml | ./mwimport.awk >import.sql
# based on mwimport perl script by Robert Bihlmeyer (2007)
# ulcer/sdf.org, 2018, GPL >= 2
# WARNING: run this script with an awk whose "length" returns bytes, not chars,
# e.g. "mawk -f ./mwimport.awk" or "gawk -b -f ./mwimport.awk"
# one tag per line is expected. schema referenced for development is mw1.27
# all referenced tags are required; a missing closing tag will result in data loss
# tag(): return the text enclosed by the single XML element on the current
# input line, e.g. "    <id>42</id>" -> "42".
# A lone self-closing element ("<minor/>", "<redirect title=... />") yields "".
# Side effect: unless the element is self-closing, $0 is rewritten to the
# returned value.
function tag() {
  # self-closing element: nothing between tags, so nothing to extract
  if ($0 ~ /^ *<[^>]*\/> *$/) return ""
  # drop leading indentation and the opening tag
  sub("^ *<[^>]*>", "")
  # drop the closing tag only; the pattern must start at "</", otherwise the
  # leftmost-longest match begins at column 1 and eats the value as well
  sub("</[^>]*> *$", "")
  return $0
}
# textify(s): decode the XML character entities in s and return the result as
# a single-quoted SQL string literal (embedded single quotes are doubled).
# Side effect: the global "len" is set to length(s) after decoding and before
# quoting; callers must read it before the next textify() call.
function textify(s) {
  gsub("&lt;", "<", s); gsub("&gt;", ">", s)
  gsub("&quot;", "\"", s)
  # &amp; is decoded last so that "&amp;lt;" becomes "&lt;" and not "<";
  # "\\&" is a literal ampersand in a gsub replacement (bare & = matched text)
  gsub("&amp;", "\\&", s)
  len = length(s)
  gsub("'", "''", s)
  return "'" s "'"
}
# flush(): close the current batch by clearing the three accumulated VALUES
# lists (globals dump_text, dump_page, dump_rev).
# NOTE(review): the print statements that would emit the batch are commented
# out, so as written nothing reaches stdout - confirm whether that is
# intentional before relying on the generated SQL.
function flush() {
# print "BEGIN;"
# print "REPLACE INTO text(old_id,old_text,old_flags) VALUES " dump_text ";"
# if (dump_page) print "REPLACE INTO page(page_id,page_namespace,page_title," \
# "page_restrictions,page_is_redirect,page_is_new,page_random," \
# "page_touched,page_latest,page_len) VALUES " dump_page ";"
# print "REPLACE INTO revision(rev_id,rev_page,rev_text_id,rev_comment," \
# "rev_user,rev_user_text,rev_timestamp,rev_minor_edit,rev_deleted," \
# "rev_len,rev_parent_id) VALUES " dump_rev ";"
# print "COMMIT;"
  # was "dump_dev": the revision list was never cleared, so it kept growing
  # and every batch would have repeated all earlier revisions
  dump_text = dump_page = dump_rev = ""
}
# Parsing starts inside the <siteinfo> header of the dump.
BEGIN { header = 1 }
# multiline values between tags
# While "comment" is set, every input line is a continuation of the current
# <comment> value; the line that carries </comment> ends it.
comment {
  if (match($0, "</comment>")) {
    sub("</comment> *$", "")
    comment = 0
  }
  rev_comment = rev_comment "\n" $0
  next
}
# Same for the revision <text>: append lines until </text> is seen.
text {
  if (match($0, "</text>")) {
    sub("</text> *$", "")
    text = 0
  }
  rev_text = rev_text "\n" $0
  next
}
# mediawiki: header, namespace, etc.
# <namespace key="N" ...>Name</namespace>, or a self-closing element for a
# namespace without a name: remember the name under its key in ns[].
# NOTE(review): the pattern and the key extraction are rebuilt from the
# MediaWiki export format - confirm against a real dump.
header && /<namespace / {
  key = $0
  sub("^.*key=\"", "", key); sub("\".*$", "", key)
  value = ""
  if ($0 !~ /\/> *$/) {
    value = $0
    # strip everything up to the end of the opening tag, then the closing tag
    sub("[^>]*>", "", value); sub("<.*", "", value)
  }
  ns[key] = value
  next
}
# end of the site header: page parsing may begin
header && /<\/siteinfo>/ { header = 0; next }
# page
# State flags: "page" is set while directly inside <page>, "revision" while
# directly inside <revision>, "contributor" while inside <contributor>.
# The same tag name (<id>) is told apart by whichever flag is set.
!page && /<page>/ { page = 1; page_latest = 0; page_restrictions = ""; next }
page && /<title>/ { page_title = tag(); next }
page && /<id>/ { page_id = tag(); next }
page && /<restrictions>/ { page_restrictions = tag(); next }
page && /<redirect / { page_redirect = tag(); next }
page && /<revision>/ {
  revision = 1; page = 0
  rev_contributor_username = rev_contributor_id = 0
  # the minor flag must not carry over from the previous revision
  rev_minor = 0
  next
}
revision && /<id>/ { rev_id = tag(); next }
revision && /<timestamp>/ { rev_timestamp = tag(); next }
# <minor/> is an empty element, so tag() would return "" and the flag could
# never become true; the presence of the element is the information
revision && /<minor/ { rev_minor = 1; next }
# each revision contains contributor
revision && /<contributor>/ { contributor = 1; revision = 0; next }
contributor && /<username>/ { rev_contributor_username = tag(); next }
# anonymous edits carry <ip> instead of <username>
contributor && /<ip>/ { rev_contributor_username = tag(); next }
contributor && /<id>/ { rev_contributor_id = tag(); next }
contributor && /<\/contributor>/ { contributor = 0; revision = 1; next }
# each revision contains comment
# First line of a <comment>: keep what follows the opening tag. If the
# closing tag is on the same line the value is complete, otherwise the
# "comment" continuation rule collects the remaining lines.
revision && /<comment>/ {
  comment = 1
  sub(" *<comment>", "")
  if (match($0, "</comment>")) {
    sub("</comment> *$", "")
    comment = 0
  }
  rev_comment = $0
  next
}
# each revision contains text
# First line of a <text ...> element; same scheme as for the comment.
revision && /<text[ >]/ {
  # a self-closing <text ... /> (empty revision) has no closing tag; without
  # this guard the continuation rule would swallow the rest of the dump
  if ($0 ~ /^ *<text[^>]*\/> *$/) { rev_text = ""; next }
  text = 1
  sub("^ *<text[^>]*>", "")
  if (match($0, "</text>")) {
    sub("</text> *$", "")
    text = 0
  }
  rev_text = $0
  next
}
# </revision>: the revision is complete. Append one row to the text list and
# one to the revision list, and remember it as the page's latest revision.
revision && /<\/revision>/ {
  revision = 0; page = 1
  rev_text = textify(rev_text)
  # textify() leaves the decoded length in the global "len"; copy it before
  # the next textify() call overwrites it
  rev_len = len
  rev_comment = textify(rev_comment)
  rev_contributor_username = textify(rev_contributor_username)
  # 2018-01-02T03:04:05Z -> 20180102030405
  gsub("[-:TZ]", "", rev_timestamp)
  dump_text = dump_text ? dump_text ",\n" : ""
  dump_text = dump_text "(" rev_id "," rev_text ",'utf-8')"
  rev_minor = rev_minor ? 1 : 0
  dump_rev = dump_rev ? dump_rev ",\n" : ""
  # page_latest still holds the id of the previous revision of this page (0
  # for the first one); it fills the last column, rev_parent_id in the column
  # list of flush()
  dump_rev = dump_rev "(" rev_id "," page_id "," rev_id "," rev_comment \
    "," rev_contributor_id "," rev_contributor_username "," rev_timestamp \
    "," rev_minor ",0," rev_len "," page_latest ")"
  page_latest = rev_id
  page_latest_len = rev_len
  # quoted start of the text, inspected for "#REDIRECT" when the page closes;
  # awk strings are 1-based
  latest_start = substr(rev_text, 1, 60)
  # a following revision without <comment> or <text> must not inherit (and
  # re-quote) the values of this one
  rev_text = rev_comment = ""
  next
}
# </page>: the page is complete. Split the namespace prefix off the title,
# derive the redirect flag from the latest revision and append one row to the
# page list.
page && /<\/page>/ {
  pages++
  page = 0
  page_ns = 0
  for (i in ns) {
    # a namespace without a name has no prefix; without this guard a title
    # starting with ":" would be truncated
    if (ns[i] == "") continue
    if (substr(page_title, 1, length(ns[i]) + 1) == ns[i] ":") {
      page_ns = i
      page_title = substr(page_title, length(ns[i]) + 2)
      break
    }
  }
  # MediaWiki stores page_title with underscores in place of spaces
  gsub(" ", "_", page_title)
  page_title = textify(page_title)
  page_restrictions = textify(page_restrictions)
  # latest_start is the quoted text, hence the leading '. The keyword is
  # case-insensitive and the blank before [[ is optional.
  # NOTE(review): localized redirect keywords are not recognised.
  page_redirect = 0
  if (toupper(latest_start) ~ /^'#REDIRECT[ \t]*:?[ \t]*\[\[/) page_redirect = 1
  dump_page = dump_page ? dump_page ",\n" : ""
  # NOTE(review): random() and date()+0 are passed through as SQL expressions
  # for page_random and page_touched - confirm they produce the value formats
  # the target database expects.
  dump_page = dump_page "(" page_id "," page_ns "," page_title "," \
    page_restrictions "," page_redirect ",0,random(),date()+0," \
    page_latest "," page_latest_len ")"
  next
}
# Batch boundary: once more than 100 pages are queued, emit them and restart
# the count. Only lines that no earlier rule consumed with "next" get here.
pages > 100 {
  flush()
  pages = 0
}
# Emit whatever is still queued at end of input.
END {
  if (pages > 0)
    flush()
}