sfeed_update - sfeed - RSS and Atom parser
 (HTM) git clone git://git.codemadness.org/sfeed
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       sfeed_update (6730B)
       ---
            1 #!/bin/sh
            2 # update feeds, merge with old feeds.
            3 # NOTE: assumes "sfeed_*" executables are in $PATH.
            4 
            # default directory holding the merged feed files; the config file that
            # is sourced later may override it.
            sfeedpath="${HOME}/.sfeed/feeds"

            # upper bound on the number of feeds processed at the same time
            # (handed to xargs -P by runfeeds).
            maxjobs=16
           11 
           # Locate the configuration file and source it (it is a shell script).
           # loadconfig(configfile)
           #   configfile: optional path; when empty "$HOME/.sfeed/sfeedrc" is used.
           # Sets the globals ${config} (path as given) and ${configpath} (path that
           # is actually sourced). Calls die when the file is not a readable regular
           # file.
           loadconfig() {
                   case "$1" in
                   "")
                           # no path given: use the default config location.
                           config="$HOME/.sfeed/sfeedrc"
                           configpath="${config}"
                           ;;
                   *)
                           # path given via argv[1]: resolve it to an absolute path,
                           # which is required for including the file.
                           config="$1"
                           configpath=$(readlink -f "${config}" 2>/dev/null)
                           ;;
                   esac

                   # the config is sourced here so it can override $sfeedpath or functions.
                   if ! { [ -r "${configpath}" ] && [ -f "${configpath}" ]; }; then
                           printf "Configuration file \"%s\" cannot be read.\n" "${config}" >&2
                           echo "See the sfeedrc.example file or the sfeedrc(5) man page for an example." >&2
                           die
                   else
                           . "${configpath}"
                   fi
           }
           35 
           # Print a status line of the form "[HH:MM:SS] name status" to stdout.
           # log(name, s)
           #   name: feed name, padded or truncated to exactly 50 characters.
           #   s:    status text appended after the name.
           log() {
                   printf '%s %-50.50s %s\n' "$(date +'[%H:%M:%S]')" "$1" "$2"
           }
           40 
           # Print a status line like log() does, but to stderr, and flag the run as
           # failed.
           # log_error(name, s)
           #   name: feed name, padded or truncated to exactly 50 characters.
           #   s:    error text appended after the name.
           log_error() {
                   printf '%s %-50.50s %s\n' "$(date +'[%H:%M:%S]')" "$1" "$2" 1>&2
                   # removing the "ok" marker file is how a parallel job reports an
                   # error: the parent checks whether the file still exists.
                   rm -f "${sfeedtmpdir}/ok"
           }
           47 
           # Download a feed (HTTP/HTTPS etc.) with curl and write it to stdout.
           # fetch(name, url, feedfile)
           #   Only url ($2) is used by this default implementation.
           # Returns the exit status of curl.
           fetch() {
                   # redirects are followed with a limit of 0, so a redirect is a failure;
                   # the User-Agent header is removed; HTTP errors fail (--fail);
                   # the whole transfer may take at most 15 seconds; curl's own
                   # diagnostics are discarded.
                   curl --location --max-redirs 0 --header "User-Agent:" \
                           --fail --silent --max-time 15 \
                           "$2" 2>/dev/null
           }
           55 
           # Re-encode stdin from one text encoding to another and write to stdout.
           # convertencoding(name, from, to)
           #   from, to: encoding names passed to iconv. When either one is empty or
           #             both are equal, the input is copied through unchanged.
           convertencoding() {
                   # nothing to convert: pass the data through as-is.
                   if [ -z "$2" ] || [ -z "$3" ] || [ "$2" = "$3" ]; then
                           cat
                           return
                   fi
                   iconv -cs -f "$2" -t "$3" 2> /dev/null
           }
           66 
           # Convert the feed on stdin to the sfeed(5) TSV format on stdout; by
           # default the input is XML and the sfeed program does the conversion.
           # parse(name, feedurl, basesiteurl)
           #   Only basesiteurl ($3) is used here: it is handed to sfeed as its
           #   single argument.
           parse() {
                   sfeed "${3}"
           }
           72 
           # Hook for filtering fields of the parsed feed (stdin to stdout).
           # filter(name, url)
           #   The default implementation copies its input through unchanged.
           filter() {
                   cat
           }
           78 
           # Merge the old and the new TSV file into one de-duplicated stream on stdout.
           # merge(name, oldfile, newfile)
           #   Lines are unique-sorted on fields 6, 2 and 3 (id, title, link).
           # Returns the exit status of sort; its diagnostics are discarded.
           merge() {
                   # The field separator has to be exactly one TAB character. It is
                   # produced with printf rather than written literally, so it cannot
                   # be turned into a run of spaces (which sort rejects as a
                   # multi-character separator) when the file is copied or reformatted.
                   sort -t "$(printf '\t')" -u -k6,6 -k2,2 -k3,3 "$3" "$2" 2>/dev/null
           }
           84 
           # Sort TSV lines from stdin by the timestamp in field 1, newest first.
           # order(name, url)
           # Returns the exit status of sort; its diagnostics are discarded.
           order() {
                   # The field separator has to be exactly one TAB character. It is
                   # produced with printf rather than written literally, so it cannot
                   # be turned into a run of spaces (which sort rejects as a
                   # multi-character separator) when the file is copied or reformatted.
                   sort -t "$(printf '\t')" -k1rn,1 2>/dev/null
           }
           90 
           # Internal handler: fetch one feed and run it through every stage
           # (fetch, encoding conversion, parse, filter, merge, order, copy).
           # _feed(name, feedurl, [basesiteurl], [encoding])
           #   name:        feed name; also the file name, with '/' replaced by '_'.
           #   feedurl:     URL handed to fetch.
           #   basesiteurl: base URL for parse; feedurl is used when it is empty.
           #   encoding:    source encoding; detected with sfeed_xmlenc when empty.
           # Each stage writes an intermediate file below "${sfeedtmpdir}/feeds".
           # Returns 0 on success, or 1 after reporting the failed stage through
           # log_error.
           _feed() {
                   name="$1"
                   feedurl="$2"
                   basesiteurl="$3"
                   encoding="$4"

                   filename="$(printf '%s' "${name}" | tr '/' '_')"
                   sfeedfile="${sfeedpath}/${filename}"
                   tmpfeedfile="${sfeedtmpdir}/feeds/${filename}"

                   # make sure the feed file exists.
                   if [ ! -e "${sfeedfile}" ]; then
                           touch "${sfeedfile}" 2>/dev/null
                   fi

                   fetch "${name}" "${feedurl}" "${sfeedfile}" > "${tmpfeedfile}.fetch" || {
                           log_error "${name}" "FAIL (FETCH)"
                           return 1
                   }

                   # detect the encoding when none was given; an empty result makes
                   # convertencoding pass the data through unchanged.
                   if [ -z "${encoding}" ]; then
                           encoding=$(sfeed_xmlenc < "${tmpfeedfile}.fetch")
                   fi

                   convertencoding "${name}" "${encoding}" "utf-8" < "${tmpfeedfile}.fetch" > "${tmpfeedfile}.utf8" || {
                           log_error "${name}" "FAIL (ENCODING)"
                           return 1
                   }
                   rm -f "${tmpfeedfile}.fetch"

                   # an empty base URL falls back to the feed URL.
                   parse "${name}" "${feedurl}" "${basesiteurl:-${feedurl}}" < "${tmpfeedfile}.utf8" > "${tmpfeedfile}.tsv" || {
                           log_error "${name}" "FAIL (PARSE)"
                           return 1
                   }
                   rm -f "${tmpfeedfile}.utf8"

                   filter "${name}" "${feedurl}" < "${tmpfeedfile}.tsv" > "${tmpfeedfile}.filter" || {
                           log_error "${name}" "FAIL (FILTER)"
                           return 1
                   }
                   rm -f "${tmpfeedfile}.tsv"

                   # no new feed data: the remaining stages can be skipped.
                   [ -s "${tmpfeedfile}.filter" ] || {
                           log "${name}" "OK"
                           return 0
                   }

                   merge "${name}" "${sfeedfile}" "${tmpfeedfile}.filter" > "${tmpfeedfile}.merge" || {
                           log_error "${name}" "FAIL (MERGE)"
                           return 1
                   }
                   rm -f "${tmpfeedfile}.filter"

                   order "${name}" "${feedurl}" < "${tmpfeedfile}.merge" > "${tmpfeedfile}.order" || {
                           log_error "${name}" "FAIL (ORDER)"
                           return 1
                   }
                   rm -f "${tmpfeedfile}.merge"

                   # copy the final result over the feed file.
                   cp "${tmpfeedfile}.order" "${sfeedfile}" || {
                           log_error "${name}" "FAIL (COPY)"
                           return 1
                   }
                   rm -f "${tmpfeedfile}.order"

                   # all stages succeeded.
                   log "${name}" "OK"
                   return 0
           }
          162 
          # Queue one feed for parallel processing: print its job record to stdout.
          # feed(name, feedurl, [basesiteurl], [encoding])
          #   The record consists of ${config}, ${sfeedtmpdir} and the four
          #   arguments, separated by the byte \037 and terminated by a NUL byte.
          feed() {
                  # xargs receives the whole record as a single NUL-terminated
                  # parameter; the child process splits it into fields again. This
                  # allows using xargs with empty fields across many implementations.
                  printf '%s\037' "${config}" "${sfeedtmpdir}" "$1" "$2" "$3"
                  printf '%s\0' "$4"
          }
          173 
          # Delete the temporary directory "${sfeedtmpdir}" and the feed files in it.
          # cleanup()
          cleanup() {
                  rm -r -f "${sfeedtmpdir}"
          }
          179 
          # Clean up temporary files and terminate the script.
          # die(statuscode)
          #   statuscode: exit status to use; 1 when missing or empty.
          die() {
                  if [ -n "$1" ]; then
                          statuscode="$1"
                  else
                          statuscode=1
                  fi
                  # remove temporary files etc. before leaving.
                  cleanup
                  exit "${statuscode}"
          }
          187 
          # Signal handler: record the signal and terminate the running children.
          # sighandler(signo)
          #   signo: number of the received signal, stored in the global ${signo}.
          sighandler() {
                  signo="${1}"
                  # ignore TERM in this process first, because the kill below also
                  # delivers TERM to it.
                  trap -- '' TERM
                  # send TERM to the process group of this shell (negative pid).
                  kill -TERM "-$$"
          }
          196 
          # Placeholder for the "feeds" function that the configuration file is
          # expected to define. Reaching this version means the config did not
          # override it: report that on stderr and exit through die.
          # feeds()
          feeds() {
                  {
                          printf "Configuration file \"%s\" is invalid or does not contain a \"feeds\" function.\n" "${config}"
                          echo "See sfeedrc.example for an example."
                  } >&2
                  die
          }
          203 
          # Collect the job records printed by feeds() and process them in parallel.
          # runfeeds()
          #   Writes the records to "${sfeedtmpdir}/jobs", then lets xargs start this
          #   script (resolved from ${argv0}) once per record, with at most
          #   ${maxjobs} running at a time and SFEED_UPDATE_CHILD=1 in the
          #   environment.
          # Returns the exit status of xargs.
          runfeeds() {
                  if ! feeds > "${sfeedtmpdir}/jobs"; then
                          die
                  fi
                  SFEED_UPDATE_CHILD="1" xargs -0 -x -n 1 -P "${maxjobs}" \
                          "$(readlink -f "${argv0}")" < "${sfeedtmpdir}/jobs"
          }
          211 
          # Entry point of the parent process.
          # main(args...)
          #   $1: optional path of the configuration file (passed to loadconfig).
          # Installs the signal handlers, loads the config, creates the temporary
          # directory and the feed directory, runs all feeds and exits through die.
          # Exit status: signal number + 128 after SIGINT/SIGTERM; otherwise 1 when
          # a job removed the "ok" marker file; otherwise the status of runfeeds.
          main() {
                  # number of the signal received by the parent (0: none).
                  signo=0
                  # SIGINT interrupts and SIGTERM terminates the parent.
                  trap -- 'sighandler 2' INT
                  trap -- 'sighandler 15' TERM

                  # load config file.
                  loadconfig "$1"

                  # the feeds are fetched into a temporary directory.
                  if ! sfeedtmpdir="$(mktemp -d "${TMPDIR:-/tmp}/sfeed_XXXXXX")"; then
                          die
                  fi
                  mkdir -p "${sfeedtmpdir}/feeds"
                  # marker file: a job that fails removes it.
                  if ! touch "${sfeedtmpdir}/ok"; then
                          die
                  fi
                  # make sure the feed directory exists.
                  mkdir -p "${sfeedpath}"

                  # run and process the feeds.
                  runfeeds
                  statuscode=$?

                  # a missing marker file means at least one parallel job failed.
                  if [ ! -f "${sfeedtmpdir}/ok" ]; then
                          statuscode=1
                  fi
                  # on signal SIGINT and SIGTERM exit with signal number + 128.
                  if [ ${signo} -ne 0 ]; then
                          die $((signo+128))
                  fi
                  die ${statuscode}
          }
          238 
          # process a single feed (child mode: SFEED_UPDATE_CHILD=1 is set by
          # runfeeds when it starts this script through xargs).
          # $1 holds one job record with the fields
          #   config, tmpdir, name, feedurl, basesiteurl, encoding
          # separated by the byte \037, as printed by feed().
          if [ "${SFEED_UPDATE_CHILD}" = "1" ]; then
                  [ "$1" = "" ] && exit 0 # must have an argument set
                  # IFS must be the single byte \037 so read splits the record into
                  # its fields. It is generated with printf because a literal control
                  # byte between the quotes is invisible and easily lost, and an empty
                  # IFS would put the whole record into ${_config}. \037 is not
                  # whitespace, so empty fields are preserved.
                  printf '%s\n' "$1" | \
                  while IFS="$(printf '\037')" read -r _config _tmpdir _name _feedurl _basesiteurl _encoding; do
                          loadconfig "${_config}"
                          sfeedtmpdir="${_tmpdir}"
                          _feed "${_name}" "${_feedurl}" "${_basesiteurl}" "${_encoding}"
                          # the loop is part of a pipeline: in shells that run it in
                          # a subshell this only ends the subshell. Failures are
                          # reported to the parent by log_error removing the "ok" file.
                          exit "$?"
                  done
                  exit 0
          fi
          253 
          # ...else parent mode:
          # remember $0 now: in the zsh shell $0 is the name of the function once
          # inside one.
          argv0="$0"
          # run main unless SFEED_UPDATE_INCLUDE=1 is set (presumably so another
          # script can source this file for its functions only).
          if [ "${SFEED_UPDATE_INCLUDE}" != "1" ]; then
                  main "$@"
          fi