tscrape_update: Sync with sfeed_update - tscrape - twitter scraper
git clone git://git.codemadness.org/tscrape
---
commit e9035ce73e795646108130d6f0e3e4f3be30e46a
parent 8ff19bed65e82fddf4c01543eaa536863b378fc2
Author: Leonardo Taccari <iamleot@gmail.com>
Date: Mon, 17 Dec 2018 18:16:08 +0100
tscrape_update: Sync with sfeed_update
- Handle signals consistently in different shells
- Improve SIGINT handling
- Add a variable for the maximum number of feeds to update concurrently
- Add filter(), order() support per feed (see the example below)
- Don't always exit 1: exit 130 on SIGINT, 0 otherwise
- Fail on feed HTTP redirect
The --http1.0 curl option, which is not present in sfeed_update, was kept to avoid HTTP/2.
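
An example of what this enables: a tscraperc can now override filter()
and order() per feed, as in sfeed_update. A minimal sketch, not taken
from the repository: the feed name and URL are made up, and it assumes,
going by the sort keys in merge() and order(), that field 1 of the
TAB-separated output is the timestamp and field 8 the retweetid (empty
for regular tweets):

# example ~/.tscrape/tscraperc (hypothetical).
# drop retweets for one feed, pass all others through unchanged.
filter() {
	case "$1" in
	"somename")
		awk -F '\t' '$8 == ""' ;;
	*)
		cat ;;
	esac
}

# override the default order: timestamp ascending for every feed.
order() {
	sort -t "$(printf '\t')" -k1n,1
}

feeds() {
	# feed(name, feedurl)
	feed "somename" "https://twitter.com/somename"
}

Falling through to cat for unmatched names keeps the default
pass-through behaviour of the built-in filter().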
Diffstat:
M tscrape_update | 93 +++++++++++++++++++++----------
1 file changed, 65 insertions(+), 28 deletions(-)
---
diff --git a/tscrape_update b/tscrape_update
@@ -5,11 +5,15 @@
# defaults
tscrapepath="$HOME/.tscrape/feeds"
+# used for processing feeds concurrently: wait until ${maxjobs} amount of
+# feeds are finished at a time.
+maxjobs=8
+
# load config (evaluate shellscript).
# loadconfig(configfile)
loadconfig() {
# allow to specify config via argv[1].
- if [ ! x"$1" = x"" ]; then
+ if [ "$1" != "" ]; then
# get absolute path of config file.
config=$(readlink -f "$1")
else
@@ -17,8 +21,7 @@ loadconfig() {
config="$HOME/.tscrape/tscraperc"
fi
- # load config: config is loaded here to be able to override $tscrapepath
- # or functions.
+ # config is loaded here to be able to override $tscrapepath or functions.
if [ -r "${config}" ]; then
. "${config}"
else
@@ -28,46 +31,69 @@ loadconfig() {
fi
}
-# merge raw files.
-# merge(oldfile, newfile)
+# merge raw files: unique sort by id, retweetid.
+# merge(name, oldfile, newfile)
merge() {
- # unique sort by id, retweetid.
- # order by timestamp (desc).
- (sort -t ' ' -u -k5,5 -k8,8 "$1" "$2" 2>/dev/null) |
+ sort -t ' ' -u -k5,5 -k8,8 "$2" "$3" 2>/dev/null
+}
+
+# filter fields.
+# filter(name)
+filter() {
+ cat
+}
+
+# order by timestamp (descending).
+# order(name)
+order() {
sort -t ' ' -k1rn,1
}
# fetch a feed via HTTP/HTTPS etc.
-# fetchfeed(url, name, feedfile)
+# fetchfeed(name, url, feedfile)
fetchfeed() {
- if curl --http1.0 -H 'User-Agent:' -f -s -S --max-time 15 -z "$3" "$1"; then
- printf " OK %s %s\n" "$(date +'%H:%M:%S')" "$2" >&2
+ if curl --http1.0 -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
+ -z "$3" "$2" 2>/dev/null; then
+ printf " OK %s %s\n" "$(date +'%H:%M:%S')" "$1" >&2
else
- printf "FAIL %s %s\n" "$(date +'%H:%M:%S')" "$2" >&2
+ printf "FAIL %s %s\n" "$(date +'%H:%M:%S')" "$1" >&2
fi
}
# fetch and parse feed.
# feed(name, feedurl)
feed() {
+ # wait until ${maxjobs} are finished: throughput using this logic is
+ # non-optimal, but it is simple and portable.
+ [ ${signo} -ne 0 ] && return
+ [ $((curjobs % maxjobs)) -eq 0 ] && wait
+ [ ${signo} -ne 0 ] && return
+ curjobs=$((curjobs + 1))
+
(name="$1"
- tmpfeedfile="${tscrapetmpdir}/${name}"
+ filename="$(printf '%s' "$1" | tr '/' '_')"
+ feedurl="$2"
+ tmpfeedfile="${tscrapetmpdir}/${filename}"
tmpencfile=""
- tscrapefile="${tscrapepath}/$1"
+ tscrapefile="${tscrapepath}/${filename}"
- fetchfeed "$2" "$1" "${tscrapefile}" | tscrape > "${tmpfeedfile}"
+ fetchfeed "${name}" "${feedurl}" "${tscrapefile}" | \
+ tscrape | filter "${name}" > "${tmpfeedfile}"
# get new data and merge with old.
- tscrapefilenew="${tscrapepath}/${name}.new"
+ tscrapefilenew="${tscrapepath}/${filename}.new"
# new feed data is non-empty.
if [ -s "${tmpfeedfile}" ]; then
# if file exists, merge
if [ -e "${tscrapefile}" ]; then
- merge "${tscrapefile}" "${tmpfeedfile}" > "${tscrapefilenew}"
+ merge "${name}" "${tscrapefile}" "${tmpfeedfile}" | \
+ order "${name}" > "${tscrapefilenew}"
+
# overwrite old file with updated file
mv "${tscrapefilenew}" "${tscrapefile}"
else
- merge "/dev/null" "${tmpfeedfile}" > "${tscrapefile}"
+ merge "${name}" "/dev/null" "${tmpfeedfile}" | \
+ order "${name}" > "${tscrapefile}"
fi
fi) &
}
@@ -81,28 +107,39 @@ cleanup() {
rm -rf "${tscrapetmpdir}"
}
+sighandler() {
+ signo="$1"
+ # ignore TERM signal for myself.
+ trap -- "" TERM
+ # kill all running childs >:D
+ kill -TERM -$$
+}
+
feeds() {
echo "Configuration file \"${config}\" is invalid or does not contain a \"feeds\" function." >&2
echo "See tscraperc.example for an example." >&2
}
+# job counter.
+curjobs=0
+# signal number received for parent.
+signo=0
+# SIGINT: signal to interrupt parent.
+trap -- "sighandler 2" "INT"
+# SIGTERM: signal to terminate parent.
+trap -- "sighandler 15" "TERM"
# load config file.
loadconfig "$1"
-# fetch feeds and store in temporary file.
+# fetch feeds and store in temporary directory.
tscrapetmpdir="$(mktemp -d '/tmp/tscrape_XXXXXX')"
-# kill whole current process group on ^C.
-isrunning="1"
-# SIGTERM: signal to terminate parent.
-trap -- "terminated" "15"
-# SIGINT: kill all running childs >:D
-trap -- "kill -TERM -$$" "2"
# make sure path exists.
mkdir -p "${tscrapepath}"
# fetch feeds specified in config file.
feeds
# wait till all feeds are fetched (concurrently).
-wait
+[ ${signo} -eq 0 ] && wait
# cleanup temporary files etc.
cleanup
-# if terminated.
-[ "${isrunning}" = "0" ] && exit 1
+# on signal SIGINT and SIGTERM exit with signal number + 128.
+[ ${signo} -ne 0 ] && exit $((signo+128))
+exit 0
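
Two notes on the new behaviour. The job throttling in feed() is a plain
batch-wait: wait is called whenever the counter of started jobs reaches
a multiple of ${maxjobs}, so feeds are fetched in batches of at most
eight by default. The same pattern in isolation (a sketch, not tscrape
code):

#!/bin/sh
# start 20 background jobs, waiting after every batch of $maxjobs.
maxjobs=8
curjobs=0
i=0
while [ "$i" -lt 20 ]; do
	[ $((curjobs % maxjobs)) -eq 0 ] && wait
	curjobs=$((curjobs + 1))
	(sleep 1; echo "job $i done") &
	i=$((i + 1))
done
wait

And because the script now exits with 0 on success and 128 plus the
signal number on SIGINT or SIGTERM, a caller can tell interruption from
success. A hypothetical wrapper, assuming tscrape_update is in $PATH:

#!/bin/sh
# run the updater with the default config and report how it ended.
tscrape_update
status=$?
case "$status" in
0)	echo "feeds updated" ;;
130)	echo "interrupted (SIGINT: 128 + 2)" >&2 ;;
143)	echo "terminated (SIGTERM: 128 + 15)" >&2 ;;
*)	echo "failed with status $status" >&2 ;;
esac
exit "$status"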