tscrape_update: Sync with sfeed_update - tscrape - twitter scraper
git clone git://git.codemadness.org/tscrape
---
commit e9035ce73e795646108130d6f0e3e4f3be30e46a
parent 8ff19bed65e82fddf4c01543eaa536863b378fc2
Author: Leonardo Taccari <iamleot@gmail.com>
Date: Mon, 17 Dec 2018 18:16:08 +0100
tscrape_update: Sync with sfeed_update
- Handle signals consistently in different shells
- Improve SIGINT handling
- Add a variable for the maximum number of feeds to update concurrently
- Add filter(), order() support per feed (see the example below)
- Don't always exit 1: exit 130 on SIGINT, 0 otherwise
- Fail on feed HTTP redirect
The --http1.0 curl option, which is not present in sfeed_update, was kept to avoid HTTP/2.
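
An example of what this enables: a tscraperc can now override filter()
and order() per feed, as in sfeed_update. A minimal sketch, not taken
from the repository: the feed name and URL are made up, and it assumes,
going by the sort keys in merge() and order(), that field 1 of the
TAB-separated output is the timestamp and field 8 the retweetid (empty
for regular tweets):

# example ~/.tscrape/tscraperc (hypothetical).
# drop retweets for one feed, pass all others through unchanged.
filter() {
	case "$1" in
	"somename")
		awk -F '\t' '$8 == ""' ;;
	*)
		cat ;;
	esac
}

# override the default order: timestamp ascending for every feed.
order() {
	sort -t "$(printf '\t')" -k1n,1
}

feeds() {
	# feed(name, feedurl)
	feed "somename" "https://twitter.com/somename"
}

Falling through to cat for unmatched names keeps the default
pass-through behaviour of the built-in filter().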
Diffstat:
M tscrape_update | 93 +++++++++++++++++++++----------
1 file changed, 65 insertions(+), 28 deletions(-)
---
diff --git a/tscrape_update b/tscrape_update
@@ -5,11 +5,15 @@
# defaults
tscrapepath="$HOME/.tscrape/feeds"
+# used for processing feeds concurrently: wait until ${maxjobs} amount of
+# feeds are finished at a time.
+maxjobs=8
+
# load config (evaluate shellscript).
# loadconfig(configfile)
loadconfig() {
# allow to specify config via argv[1].
- if [ ! x"$1" = x"" ]; then
+ if [ "$1" != "" ]; then
# get absolute path of config file.
config=$(readlink -f "$1")
else
@@ -17,8 +21,7 @@ loadconfig() {
config="$HOME/.tscrape/tscraperc"
fi
- # load config: config is loaded here to be able to override $tscrapepath
- # or functions.
+ # config is loaded here to be able to override $tscrapepath or functions.
if [ -r "${config}" ]; then
. "${config}"
else
@@ -28,46 +31,69 @@ loadconfig() {
fi
}
-# merge raw files.
-# merge(oldfile, newfile)
+# merge raw files: unique sort by id, retweetid.
+# merge(name, oldfile, newfile)
merge() {
- # unique sort by id, retweetid.
- # order by timestamp (desc).
- (sort -t ' ' -u -k5,5 -k8,8 "$1" "$2" 2>/dev/null) |
+ sort -t ' ' -u -k5,5 -k8,8 "$2" "$3" 2>/dev/null
+}
+
+# filter fields.
+# filter(name)
+filter() {
+ cat
+}
+
+# order by timestamp (descending).
+# order(name)
+order() {
sort -t ' ' -k1rn,1
}
# fetch a feed via HTTP/HTTPS etc.
-# fetchfeed(url, name, feedfile)
+# fetchfeed(name, url, feedfile)
fetchfeed() {
- if curl --http1.0 -H 'User-Agent:' -f -s -S --max-time 15 -z "$3" "$1"; then
- printf " OK %s %s\n" "$(date +'%H:%M:%S')" "$2" >&2
+ if curl --http1.0 -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
+ -z "$3" "$2" 2>/dev/null; then
+ printf " OK %s %s\n" "$(date +'%H:%M:%S')" "$1" >&2
else
- printf "FAIL %s %s\n" "$(date +'%H:%M:%S')" "$2" >&2
+ printf "FAIL %s %s\n" "$(date +'%H:%M:%S')" "$1" >&2
fi
}
# fetch and parse feed.
# feed(name, feedurl)
feed() {
+ # wait until ${maxjobs} are finished: throughput using this logic is
+ # non-optimal, but it is simple and portable.
+ [ ${signo} -ne 0 ] && return
+ [ $((curjobs % maxjobs)) -eq 0 ] && wait
+ [ ${signo} -ne 0 ] && return
+ curjobs=$((curjobs + 1))
+
(name="$1"
- tmpfeedfile="${tscrapetmpdir}/${name}"
+ filename="$(printf '%s' "$1" | tr '/' '_')"
+ feedurl="$2"
+ tmpfeedfile="${tscrapetmpdir}/${filename}"
tmpencfile=""
- tscrapefile="${tscrapepath}/$1"
+ tscrapefile="${tscrapepath}/${filename}"
- fetchfeed "$2" "$1" "${tscrapefile}" | tscrape > "${tmpfeedfile}"
+ fetchfeed "${name}" "${feedurl}" "${tscrapefile}" | \
+ tscrape | filter "${name}" > "${tmpfeedfile}"
# get new data and merge with old.
- tscrapefilenew="${tscrapepath}/${name}.new"
+ tscrapefilenew="${tscrapepath}/${filename}.new"
# new feed data is non-empty.
if [ -s "${tmpfeedfile}" ]; then
# if file exists, merge
if [ -e "${tscrapefile}" ]; then
- merge "${tscrapefile}" "${tmpfeedfile}" > "${tscrapefilenew}"
+ merge "${name}" "${tscrapefile}" "${tmpfeedfile}" | \
+ order "${name}" > "${tscrapefilenew}"
+
# overwrite old file with updated file
mv "${tscrapefilenew}" "${tscrapefile}"
else
- merge "/dev/null" "${tmpfeedfile}" > "${tscrapefile}"
+ merge "${name}" "/dev/null" "${tmpfeedfile}" | \
+ order "${name}" > "${tscrapefile}"
fi
fi) &
}
@@ -81,28 +107,39 @@ cleanup() {
rm -rf "${tscrapetmpdir}"
}
+sighandler() {
+ signo="$1"
+ # ignore TERM signal for myself.
+ trap -- "" TERM
+ # kill all running childs >:D
+ kill -TERM -$$
+}
+
feeds() {
echo "Configuration file \"${config}\" is invalid or does not contain a \"feeds\" function." >&2
echo "See tscraperc.example for an example." >&2
}
+# job counter.
+curjobs=0
+# signal number received for parent.
+signo=0
+# SIGINT: signal to interrupt parent.
+trap -- "sighandler 2" "INT"
+# SIGTERM: signal to terminate parent.
+trap -- "sighandler 15" "TERM"
# load config file.
loadconfig "$1"
-# fetch feeds and store in temporary file.
+# fetch feeds and store in temporary directory.
tscrapetmpdir="$(mktemp -d '/tmp/tscrape_XXXXXX')"
-# kill whole current process group on ^C.
-isrunning="1"
-# SIGTERM: signal to terminate parent.
-trap -- "terminated" "15"
-# SIGINT: kill all running childs >:D
-trap -- "kill -TERM -$$" "2"
# make sure path exists.
mkdir -p "${tscrapepath}"
# fetch feeds specified in config file.
feeds
# wait till all feeds are fetched (concurrently).
-wait
+[ ${signo} -eq 0 ] && wait
# cleanup temporary files etc.
cleanup
-# if terminated.
-[ "${isrunning}" = "0" ] && exit 1
+# on signal SIGINT and SIGTERM exit with signal number + 128.
+[ ${signo} -ne 0 ] && exit $((signo+128))
+exit 0
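
Two notes on the new behaviour. The job throttling in feed() is a plain
batch-wait: wait is called whenever the counter of started jobs reaches
a multiple of ${maxjobs}, so feeds are fetched in batches of at most
eight by default. The same pattern in isolation (a sketch, not tscrape
code):

#!/bin/sh
# start 20 background jobs, waiting after every batch of $maxjobs.
maxjobs=8
curjobs=0
i=0
while [ "$i" -lt 20 ]; do
	[ $((curjobs % maxjobs)) -eq 0 ] && wait
	curjobs=$((curjobs + 1))
	(sleep 1; echo "job $i done") &
	i=$((i + 1))
done
wait

And because the script now exits with 0 on success and 128 plus the
signal number on SIGINT or SIGTERM, a caller can tell interruption from
success. A hypothetical wrapper, assuming tscrape_update is in $PATH:

#!/bin/sh
# run the updater with the default config and report how it ended.
tscrape_update
status=$?
case "$status" in
0)	echo "feeds updated" ;;
130)	echo "interrupted (SIGINT: 128 + 2)" >&2 ;;
143)	echo "terminated (SIGTERM: 128 + 15)" >&2 ;;
*)	echo "failed with status $status" >&2 ;;
esac
exit "$status"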