Add a patch to sync tscrape_update with sfeed_update - pkgsrc-localpatches - leot's pkgsrc LOCALPATCHES
(HTM) hg clone https://bitbucket.org/iamleot/pkgsrc-localpatches
(DIR) Log
(DIR) Files
(DIR) Refs
---
(DIR) changeset 07743a30c060a43e3fa9a3959ed163df5d2f8237
(DIR) parent 024501ed3cc79721a8f4d48050a5f3a8f56443ef
(HTM) Author: Leonardo Taccari <iamleot@gmail.com>
Date: Sat, 15 Dec 2018 23:34:47
Add a patch to sync tscrape_update with sfeed_update
Diffstat:
wip/tscrape-git/patch-tscrape_update.patch | 172 +++++++++++++++++++++++++++++
1 files changed, 172 insertions(+), 0 deletions(-)
---
diff -r 024501ed3cc7 -r 07743a30c060 wip/tscrape-git/patch-tscrape_update.patch
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/wip/tscrape-git/patch-tscrape_update.patch Sat Dec 15 23:34:47 2018 +0100
@@ -0,0 +1,172 @@
+tscrape_update: Sync with sfeed_update
+
+- Handle signals consistently in different shells
+- Improve SIGINT handling
+- Add a variable for max amount of feeds to update concurrently
+- Add filter(), order() support per feed
+- Don't always exit 1, exit 130 on SIGINT, exit 0 otherwise
+- Fail on feed HTTP redirect
+
+--- tscrape_update.orig
++++ tscrape_update
+@@ -5,11 +5,15 @@
+ # defaults
+ tscrapepath="$HOME/.tscrape/feeds"
+
++# used for processing feeds concurrently: wait until ${maxjobs} amount of
++# feeds are finished at a time.
++maxjobs=8
++
+ # load config (evaluate shellscript).
+ # loadconfig(configfile)
+ loadconfig() {
+ # allow to specify config via argv[1].
+- if [ ! x"$1" = x"" ]; then
++ if [ "$1" != "" ]; then
+ # get absolute path of config file.
+ config=$(readlink -f "$1")
+ else
+@@ -17,8 +21,7 @@ loadconfig() {
+ config="$HOME/.tscrape/tscraperc"
+ fi
+
+- # load config: config is loaded here to be able to override $tscrapepath
+- # or functions.
++ # config is loaded here to be able to override $tscrapepath or functions.
+ if [ -r "${config}" ]; then
+ . "${config}"
+ else
+@@ -28,46 +31,69 @@ loadconfig() {
+ fi
+ }
+
+-# merge raw files.
++# merge raw files: unique sort by id, retweetid.
+ # merge(oldfile, newfile)
+ merge() {
+- # unique sort by id, retweetid.
+- # order by timestamp (desc).
+- (sort -t ' ' -u -k5,5 -k8,8 "$1" "$2" 2>/dev/null) |
++ sort -t ' ' -u -k5,5 -k8,8 "$1" "$2" 2>/dev/null
++}
++
++# filter fields.
++# filter(name)
++filter() {
++ cat
++}
++
++# order by timestamp (descending).
++# order(name)
++order() {
+ sort -t ' ' -k1rn,1
+ }
+
+ # fetch a feed via HTTP/HTTPS etc.
+-# fetchfeed(url, name, feedfile)
++# fetchfeed(name, url, feedfile)
+ fetchfeed() {
+- if curl --http1.0 -H 'User-Agent:' -f -s -S --max-time 15 -z "$3" "$1"; then
+- printf " OK %s %s\n" "$(date +'%H:%M:%S')" "$2" >&2
++ if curl -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
++ -z "$3" "$2" 2>/dev/null; then
++ printf " OK %s %s\n" "$(date +'%H:%M:%S')" "$1" >&2
+ else
+- printf "FAIL %s %s\n" "$(date +'%H:%M:%S')" "$2" >&2
++ printf "FAIL %s %s\n" "$(date +'%H:%M:%S')" "$1" >&2
+ fi
+ }
+
+ # fetch and parse feed.
+ # feed(name, feedurl)
+ feed() {
++ # wait until ${maxjobs} are finished: throughput using this logic is
++ # non-optimal, but it is simple and portable.
++ [ ${signo} -ne 0 ] && return
++ [ $((curjobs % maxjobs)) -eq 0 ] && wait
++ [ ${signo} -ne 0 ] && return
++ curjobs=$((curjobs + 1))
++
+ (name="$1"
+- tmpfeedfile="${tscrapetmpdir}/${name}"
++ filename="$(printf '%s' "$1" | tr '/' '_')"
++ feedurl="$2"
++ tmpfeedfile="${tscrapetmpdir}/${filename}"
+ tmpencfile=""
+- tscrapefile="${tscrapepath}/$1"
++ tscrapefile="${tscrapepath}/${filename}"
+
+- fetchfeed "$2" "$1" "${tscrapefile}" | tscrape > "${tmpfeedfile}"
++ fetchfeed "${name}" "${feedurl}" "${tscrapefile}" | \
++ tscrape | filter "${name}" > "${tmpfeedfile}"
+
+ # get new data and merge with old.
+- tscrapefilenew="${tscrapepath}/${name}.new"
++ tscrapefilenew="${tscrapepath}/${filename}.new"
+ # new feed data is non-empty.
+ if [ -s "${tmpfeedfile}" ]; then
+ # if file exists, merge
+ if [ -e "${tscrapefile}" ]; then
+- merge "${tscrapefile}" "${tmpfeedfile}" > "${tscrapefilenew}"
++ merge "${name}" "${tscrapefile}" "${tmpfeedfile}" | \
++ order "${name}" > "${tscrapefilenew}"
++
+ # overwrite old file with updated file
+ mv "${tscrapefilenew}" "${tscrapefile}"
+ else
+- merge "/dev/null" "${tmpfeedfile}" > "${tscrapefile}"
++ merge "${name}" "/dev/null" "${tmpfeedfile}" | \
++ order "${name}" > "${tscrapefile}"
+ fi
+ fi) &
+ }
+@@ -81,28 +107,39 @@ cleanup() {
+ rm -rf "${tscrapetmpdir}"
+ }
+
++sighandler() {
++ signo="$1"
++ # ignore TERM signal for myself.
++ trap -- "" TERM
++ # kill all running childs >:D
++ kill -TERM -$$
++}
++
+ feeds() {
+ echo "Configuration file \"${config}\" is invalid or does not contain a \"feeds\" function." >&2
+ echo "See tscraperc.example for an example." >&2
+ }
+
++# job counter.
++curjobs=0
++# signal number received for parent.
++signo=0
++# SIGINT: signal to interrupt parent.
++trap -- "sighandler 2" "INT"
++# SIGTERM: signal to terminate parent.
++trap -- "sighandler 15" "TERM"
+ # load config file.
+ loadconfig "$1"
+-# fetch feeds and store in temporary file.
++# fetch feeds and store in temporary directory.
+ tscrapetmpdir="$(mktemp -d '/tmp/tscrape_XXXXXX')"
+-# kill whole current process group on ^C.
+-isrunning="1"
+-# SIGTERM: signal to terminate parent.
+-trap -- "terminated" "15"
+-# SIGINT: kill all running childs >:D
+-trap -- "kill -TERM -$$" "2"
+ # make sure path exists.
+ mkdir -p "${tscrapepath}"
+ # fetch feeds specified in config file.
+ feeds
+ # wait till all feeds are fetched (concurrently).
+-wait
++[ ${signo} -eq 0 ] && wait
+ # cleanup temporary files etc.
+ cleanup
+-# if terminated.
+-[ "${isrunning}" = "0" ] && exit 1
++# on signal SIGINT and SIGTERM exit with signal number + 128.
++[ ${signo} -ne 0 ] && exit $((signo+128))
++exit 0