sfeed_update - sfeed - RSS and Atom parser
(HTM) git clone git://git.codemadness.org/sfeed
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
sfeed_update (6730B)
---
1 #!/bin/sh
2 # update feeds, merge with old feeds.
3 # NOTE: assumes "sfeed_*" executables are in $PATH.
4
5 # defaults
6 sfeedpath="$HOME/.sfeed/feeds"
7
8 # used for processing feeds concurrently: wait until ${maxjobs} amount of
9 # feeds are finished at a time.
10 maxjobs=16
11
# load config (evaluate shellscript).
# loadconfig(configfile)
# sets the globals ${config} and ${configpath} and sources the file so it
# can override $sfeedpath, $maxjobs or any of the hook functions.
loadconfig() {
	# a config file may be passed as the first argument,
	# otherwise fall back to the default location.
	if [ -n "$1" ]; then
		config="$1"
		# resolve to an absolute path, required for including.
		configpath=$(readlink -f "${config}" 2>/dev/null)
	else
		config="$HOME/.sfeed/sfeedrc"
		configpath="${config}"
	fi

	# source the config here so it can override $sfeedpath or functions.
	if [ -f "${configpath}" ] && [ -r "${configpath}" ]; then
		. "${configpath}"
	else
		printf "Configuration file \"%s\" cannot be read.\n" "${config}" >&2
		echo "See the sfeedrc.example file or the sfeedrc(5) man page for an example." >&2
		die
	fi
}
35
# log(name, s)
# print one status line to stdout: "[HH:MM:SS] <name padded to 50 cols> <s>".
log() {
	printf '[%s] %-50.50s %s\n' \
		"$(date +'%H:%M:%S')" "$1" "$2"
}
40
# log_error(name, s)
# like log(), but write to stderr and flag failure for the parent process.
log_error() {
	printf '[%s] %-50.50s %s\n' \
		"$(date +'%H:%M:%S')" "$1" "$2" >&2
	# removing the "ok" sentinel file marks a non-zero exit status for
	# the whole parallel run (checked by the parent after all jobs).
	rm -f "${sfeedtmpdir}/ok"
}
47
# fetch a feed via HTTP/HTTPS etc.
# fetch(name, url, feedfile)
# writes the raw feed data to stdout. combining --location with
# --max-redirs 0 makes curl error out on any redirect.
fetch() {
	# hide User-Agent, timeout is 15 seconds, fail on HTTP errors.
	curl --location --max-redirs 0 --header "User-Agent:" \
		--fail --silent --max-time 15 "$2" 2>/dev/null
}
55
# convert encoding from one encoding to another.
# convertencoding(name, from, to)
# reads data on stdin and writes the converted data to stdout.
convertencoding() {
	if [ -z "$2" ] || [ -z "$3" ] || [ "$2" = "$3" ]; then
		# nothing to convert: pass the input through unchanged.
		cat
	else
		iconv -cs -f "$2" -t "$3" 2> /dev/null
	fi
}
66
# parse and convert input, by default XML to the sfeed(5) TSV format.
# parse(name, feedurl, basesiteurl)
# reads the feed data on stdin; $3 is passed to sfeed as the base URL.
# may be overridden in the sfeedrc config file.
parse() {
	sfeed "$3"
}
72
# filter fields.
# filter(name, url)
# default: pass the TSV data on stdin through unchanged; the sfeedrc
# config file may override this function to filter or rewrite lines.
filter() {
	cat
}
78
# merge raw files: unique sort by id, title, link.
# merge(name, oldfile, newfile)
# NOTE: the -t argument is a single literal TAB character: the data is
# sfeed(5) TSV, fields 6 (id), 2 (title) and 3 (link) form the
# uniqueness key. new data ($3) is listed before the old file ($2).
merge() {
	sort -t '	' -u -k6,6 -k2,2 -k3,3 "$3" "$2" 2>/dev/null
}
84
# order by timestamp (descending).
# order(name, url)
# reads TSV data on stdin. NOTE: the -t argument is a single literal
# TAB character; field 1 is the timestamp, sorted numerically, newest
# first.
order() {
	sort -t '	' -k1rn,1 2>/dev/null
}
90
# internal handler to fetch and process a feed.
# _feed(name, feedurl, [basesiteurl], [encoding])
# runs the full pipeline for one feed: fetch -> detect/convert encoding
# -> parse -> filter -> merge with the old data -> order -> copy into
# place. on failure the failed stage is logged and 1 is returned.
_feed() {
	name="$1"
	feedurl="$2"
	basesiteurl="$3"
	encoding="$4"

	# the feed name doubles as the filename: replace "/" so the name
	# is always a single path component.
	filename="$(printf '%s' "${name}" | tr '/' '_')"
	# FIX: these referenced "$(unknown)" (a nonexistent command) instead
	# of the just-computed ${filename}.
	sfeedfile="${sfeedpath}/${filename}"
	tmpfeedfile="${sfeedtmpdir}/feeds/${filename}"

	# if file does not exist yet create it.
	[ -e "${sfeedfile}" ] || touch "${sfeedfile}" 2>/dev/null

	if ! fetch "${name}" "${feedurl}" "${sfeedfile}" > "${tmpfeedfile}.fetch"; then
		log_error "${name}" "FAIL (FETCH)"
		return 1
	fi

	# try to detect encoding (if not specified). if detecting the
	# encoding fails assume utf-8.
	[ "${encoding}" = "" ] && encoding=$(sfeed_xmlenc < "${tmpfeedfile}.fetch")

	if ! convertencoding "${name}" "${encoding}" "utf-8" < "${tmpfeedfile}.fetch" > "${tmpfeedfile}.utf8"; then
		log_error "${name}" "FAIL (ENCODING)"
		return 1
	fi
	rm -f "${tmpfeedfile}.fetch"

	# if baseurl is empty then use feedurl.
	if ! parse "${name}" "${feedurl}" "${basesiteurl:-${feedurl}}" < "${tmpfeedfile}.utf8" > "${tmpfeedfile}.tsv"; then
		log_error "${name}" "FAIL (PARSE)"
		return 1
	fi
	rm -f "${tmpfeedfile}.utf8"

	if ! filter "${name}" "${feedurl}" < "${tmpfeedfile}.tsv" > "${tmpfeedfile}.filter"; then
		log_error "${name}" "FAIL (FILTER)"
		return 1
	fi
	rm -f "${tmpfeedfile}.tsv"

	# new feed data is empty: no need for below stages.
	if [ ! -s "${tmpfeedfile}.filter" ]; then
		log "${name}" "OK"
		return 0
	fi

	if ! merge "${name}" "${sfeedfile}" "${tmpfeedfile}.filter" > "${tmpfeedfile}.merge"; then
		log_error "${name}" "FAIL (MERGE)"
		return 1
	fi
	rm -f "${tmpfeedfile}.filter"

	if ! order "${name}" "${feedurl}" < "${tmpfeedfile}.merge" > "${tmpfeedfile}.order"; then
		log_error "${name}" "FAIL (ORDER)"
		return 1
	fi
	rm -f "${tmpfeedfile}.merge"

	# copy the ordered result over the old feed file.
	if ! cp "${tmpfeedfile}.order" "${sfeedfile}"; then
		log_error "${name}" "FAIL (COPY)"
		return 1
	fi
	rm -f "${tmpfeedfile}.order"

	# OK
	log "${name}" "OK"
	return 0
}
162
# fetch and process a feed in parallel.
# feed(name, feedurl, [basesiteurl], [encoding])
# called from the config's feeds() function; does not process anything
# itself, it only emits one job record for xargs (see runfeeds).
feed() {
	# Output job parameters for xargs.
	# Specify fields as a single parameter separated by a NUL byte.
	# The parameter is split into fields later by the child process, this
	# allows using xargs with empty fields across many implementations.
	# field layout: config, tmpdir, name, feedurl, basesiteurl, encoding,
	# joined by the \037 (unit separator) byte, NUL-terminated.
	printf '%s\037%s\037%s\037%s\037%s\037%s\0' \
		"${config}" "${sfeedtmpdir}" "$1" "$2" "$3" "$4"
}
173
# cleanup()
# remove the temporary work directory created by main().
cleanup() {
	# remove temporary directory with feed files.
	rm -rf "${sfeedtmpdir}"
}
179
# die(statuscode)
# clean up temporary files and exit with the given status code.
die() {
	statuscode="${1:-1}" # default: exit 1
	# cleanup temporary files etc.
	cleanup
	exit "${statuscode}"
}
187
# sighandler(signo)
# handle SIGINT/SIGTERM in the parent: record the signal number in the
# global ${signo} (inspected by main after runfeeds returns) and
# terminate the whole process group to stop the running child jobs.
sighandler() {
	signo="$1"
	# ignore TERM signal for myself.
	trap -- "" TERM
	# kill all running children >:D
	kill -TERM -$$
}
196
# feeds()
# placeholder: the sfeedrc config file is expected to override this
# function with one that calls feed() once per feed; reaching this
# default means the config did not define it.
feeds() {
	printf "Configuration file \"%s\" is invalid or does not contain a \"feeds\" function.\n" "${config}" >&2
	echo "See sfeedrc.example for an example." >&2
	die
}
203
# runfeeds()
# collect the job records emitted by feeds()/feed() and run them via
# xargs with up to ${maxjobs} parallel child invocations of this script.
runfeeds() {
	# print feeds for parallel processing with xargs.
	feeds > "${sfeedtmpdir}/jobs" || die
	# each NUL-terminated record becomes one child invocation of this
	# script; SFEED_UPDATE_CHILD=1 selects the child code path below.
	SFEED_UPDATE_CHILD="1" xargs -x -0 -P "${maxjobs}" -n 1 \
		"$(readlink -f "${argv0}")" < "${sfeedtmpdir}/jobs"
}
211
# main(args...)
# parent entry point: install signal handlers, load the config, create
# the temporary work directory and process all feeds in parallel.
main() {
	# signal number received for parent.
	signo=0
	# SIGINT: signal to interrupt parent.
	trap -- "sighandler 2" "INT"
	# SIGTERM: signal to terminate parent.
	trap -- "sighandler 15" "TERM"
	# load config file.
	loadconfig "$1"
	# fetch feeds and store in temporary directory.
	sfeedtmpdir="$(mktemp -d "${TMPDIR:-/tmp}/sfeed_XXXXXX")" || die
	mkdir -p "${sfeedtmpdir}/feeds"
	# "ok" is a sentinel file: log_error removes it from any child job,
	# which forces the exit status check below to report failure.
	touch "${sfeedtmpdir}/ok" || die
	# make sure path exists.
	mkdir -p "${sfeedpath}"
	# run and process the feeds.
	runfeeds
	statuscode=$?

	# check error exit status indicator for parallel jobs.
	[ -f "${sfeedtmpdir}/ok" ] || statuscode=1
	# on signal SIGINT and SIGTERM exit with signal number + 128.
	[ ${signo} -ne 0 ] && die $((signo+128))
	die ${statuscode}
}
238
# process a single feed (child mode, one xargs job per invocation).
# parameters are: config, tmpdir, name, feedurl, basesiteurl, encoding
if [ "${SFEED_UPDATE_CHILD}" = "1" ]; then
	[ "$1" = "" ] && exit 0 # must have an argument set
	# FIX: the fields are separated by the \037 (unit separator) byte;
	# spell the byte out with printf instead of embedding the raw
	# (invisible) character — an empty IFS would not split at all.
	printf '%s\n' "$1" | \
	while IFS="$(printf '\037')" read -r _config _tmpdir _name _feedurl _basesiteurl _encoding; do
		loadconfig "${_config}"
		sfeedtmpdir="${_tmpdir}"
		_feed "${_name}" "${_feedurl}" "${_basesiteurl}" "${_encoding}"
		# NOTE(review): the loop body may run in a pipeline subshell in
		# some shells, so this status may not reach the final exit;
		# failures are also signalled via the "ok" sentinel file.
		exit "$?"
	done
	exit 0
fi
253
254 # ...else parent mode:
255 argv0="$0" # store $0, in the zsh shell $0 is the name of the function.
256 [ "${SFEED_UPDATE_INCLUDE}" = "1" ] || main "$@"