#!/bin/sh
#
# Convert a feed sfeed to mbox and keep a cache of feeds seen
#
# usage: sfeed_mbox_cache file
#
# The feed file is read as TAB-separated records with the fields, in order:
#   timestamp, title, link, content, content_type, id, author
# One mbox message per record is written to stdout.  The link of every
# message written is appended to "<dir>/.<name>.cache" next to the feed file,
# and records whose link is already listed there are skipped on later runs.
#
# Requires an awk that provides strftime() and systime().  Content whose
# content_type is "html" is rendered through "webdump -lrw 72".
#
# Exit status: 1 when no file is given or the cache file cannot be created,
# otherwise the exit status of awk.

# Print the usage line on stderr.
usage() {
  echo "usage: sfeed_mbox_cache file" >&2
}

# Convert the feed file given as $1; see the header comment for details.
main() {
  feed=${1:-}
  if [ -z "${feed}" ]; then
    usage
    return 1
  fi

  feed_cache=$(dirname "${feed}")/.$(basename "${feed}").cache
  # awk reads the cache as its first input file, so it has to exist.
  touch "${feed_cache}" || return 1

  # TZ=UTC0: the Date header below hard-codes "+0000 (UTC)", so the time must
  # really be formatted in UTC.  The mbox "From " line timestamp is therefore
  # in UTC as well.
  TZ=UTC0 awk -v FEED="${feed}" -v FEED_CACHE="${feed_cache}" '
  BEGIN {
    FS = "\t"
    mtime = strftime("%a %b %d %H:%M:%S %Y")

    # Feed name: the feed path with everything up to the last "/" removed.
    feed = FEED
    sub(/.*\//, "", feed)
  }

  # Decode the escapes \n, \t and \\ in s in ONE left-to-right pass, so that
  # an escaped backslash followed by "n" or "t" is not mistaken for an
  # escaped newline or tab.  An escaped tab is rendered as a single space.
  # Any other backslash sequence is left untouched.
  function unescape(s,    out, ch) {
    out = ""
    while (match(s, /\\(n|t|\\)/)) {
      ch = substr(s, RSTART + 1, 1)
      if (ch == "n") {
        ch = "\n"
      } else if (ch == "t") {
        ch = " "
      }
      out = out substr(s, 1, RSTART - 1) ch
      s = substr(s, RSTART + 2)
    }
    return out s
  }

  # Write the message body for content c of type c_type to stdout.
  # "html" content is rendered by webdump first; everything else is passed
  # through as is.
  function print_content(c, c_type,    cmd) {
    c = unescape(c)

    # Quote body lines starting with "From " so that they are not taken
    # for an mbox message separator.
    cmd = "sed \"s/^From />From /\""
    if (c_type == "html") {
      cmd = "webdump -lrw 72 | " cmd
    }

    # The child process writes to the same stdout: flush the headers
    # printed so far so that they stay in front of the body.
    fflush()
    print c | cmd
    close(cmd)
  }

  # First input file: one already seen link per line.
  FILENAME == FEED_CACHE {
    seen[$0] = 1
  }

  # Second input file: the feed itself.
  FILENAME == FEED {
    timestamp = $1
    title = $2
    link = $3
    content = $4
    content_type = $5
    id = $6
    author = $7

    # Use current timestamp if not present
    if (!timestamp) {
      timestamp = systime()
    }

    # Workaround for feeds that have no id
    if (id == "") {
      id = "empty"
    }

    # Workaround for feeds that have empty link fields
    if (link == "") {
      link = id
    }

    if (author == "") {
      author = feed
    }

    # Skip already seen feeds
    if (link in seen) {
      next
    }

    print "From MAILER-DAEMON " mtime
    print "From: \"" author "\" <" feed "@sfeed.invalid>"
    print "Date: " strftime("%a, %d %b %Y %H:%M +0000 (UTC)", timestamp)
    print "Subject: " title
    print "X-RSS-Feed: " feed
    print "X-RSS-Id: " id
    print "X-RSS-URL: " link
    print "X-RSS-Content-Type: " content_type
    print "Content-Type: text/plain; charset=\"utf-8\""
    print "Content-Transfer-Encoding: binary"
    print ""
    print_content(content, content_type)
    print ""
    print ""

    # Remember the link, on disk for later runs and in memory for
    # duplicates within this run.
    print link >> FEED_CACHE
    seen[link] = 1
  }
  ' "${feed_cache}" "${feed}"
}

main "$@"