tscrape_update improvements - tscrape - twitter scraper
 (HTM) git clone git://git.codemadness.org/tscrape
       ---
 (DIR) commit db47c97bea3370886d011a2c950ead2551cf3fbc
 (DIR) parent 5e6e62cf3522747a7c4573736d774503ff139a12
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Sun, 12 May 2019 19:20:49 +0200
       
       tscrape_update improvements
       
       - Better checking and verbose logging (on failure) of each stage:
         fetch, convert, filter, merge, order; see the sketch after this list.
         This makes sure the output is not corrupted on out-of-memory,
         disk-space or other resource-limit failures.
         - This also has the added advantage that fewer (piped) processes run
           at the same time.
         - Remove each intermediate file as soon as it is no longer needed, to
           preserve space in /tmp (/tmp is often mounted as mfs/tmpfs).
       - Rename fetchfeed to fetch.
       - Add a logging function (which can be overridden); use a more logical
         log format (pun intended).
       - Code-style: order overridable functions in execution order.
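       
       Each stage in the new feed() follows the same pattern: run the stage
       with its input and output in separate temporary files, log a failure
       and stop if it exits non-zero, and remove the stage's input file once
       it has been consumed successfully. A minimal sketch of one such stage
       ("dostage" and the "STAGE" label are hypothetical placeholders; the
       variables are the script's own):
       
               if ! dostage < "${tmpfeedfile}.prev" > "${tmpfeedfile}.next"; then
                       log "${name}" "FAIL (STAGE)"
                       return
               fi
               # the stage's input file is no longer needed: free space in /tmp.
               rm -f "${tmpfeedfile}.prev"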
       
       Diffstat:
         M tscrape_update                      |     104 ++++++++++++++++++++-----------
       
       1 file changed, 69 insertions(+), 35 deletions(-)
       ---
 (DIR) diff --git a/tscrape_update b/tscrape_update
       @@ -31,10 +31,17 @@ loadconfig() {
                fi
        }
        
       -# merge raw files: unique sort by id, retweetid.
       -# merge(name, oldfile, newfile)
       -merge() {
       -        sort -t '        ' -u -k5,5 -k8,8 "$2" "$3" 2>/dev/null
       +# log(name,s)
       +log() {
       +        printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2" >&2
       +}
       +
       +# fetch a feed via HTTP/HTTPS etc.
       +# fetch(name, url, feedfile)
       +fetch() {
       +        # fail on redirects, hide User-Agent, timeout is 15 seconds.
       +        curl -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
       +                "$2" 2>/dev/null
        }
        
        # filter fields.
       @@ -49,15 +56,10 @@ order() {
                sort -t '        ' -k1rn,1
        }
        
       -# fetch a feed via HTTP/HTTPS etc.
       -# fetchfeed(name, url, feedfile)
       -fetchfeed() {
       -        if curl --http1.0 -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
       -                "$2" 2>/dev/null; then
       -                printf "  OK %s %s\n" "$(date +'%H:%M:%S')" "$1" >&2
       -        else
       -                printf "FAIL %s %s\n" "$(date +'%H:%M:%S')" "$1" >&2
       -        fi
       +# merge raw files: unique sort by id, retweetid.
       +# merge(name, oldfile, newfile)
       +merge() {
       +        sort -t '        ' -u -k5,5 -k8,8 "$2" "$3" 2>/dev/null
        }
        
        # fetch and parse feed.
       @@ -73,33 +75,65 @@ feed() {
                (name="$1"
                filename="$(printf '%s' "$1" | tr '/' '_')"
                feedurl="$2"
       -        tmpfeedfile="${tscrapetmpdir}/${filename}"
       -        tmpencfile=""
       +
                tscrapefile="${tscrapepath}/${filename}"
       +        tmpfeedfile="${tscrapetmpdir}/${filename}"
       +
       +        if ! fetch "${name}" "${feedurl}" "${tscrapefile}" > "${tmpfeedfile}.fetch"; then
       +                log "${name}" "FAIL (FETCH)"
       +                return
       +        fi
       +
       +        if ! tscrape < "${tmpfeedfile}.fetch" > "${tmpfeedfile}.tsv"; then
       +                log "${name}" "FAIL (CONVERT)"
       +                return
       +        fi
       +        rm -f "${tmpfeedfile}.fetch"
       +
       +        if ! filter "${name}" < "${tmpfeedfile}.tsv" > "${tmpfeedfile}.filter"; then
       +                log "${name}" "FAIL (FILTER)"
       +                return
       +        fi
       +        rm -f "${tmpfeedfile}.tsv"
       +
       +        # new feed data is empty: no need to run the stages below.
       +        if [ ! -s "${tmpfeedfile}.filter" ]; then
       +                log "${name}" "OK"
       +                return
       +        fi
       +
       +        # if the file does not exist yet, "merge" with /dev/null.
       +        if [ -e "${tscrapefile}" ]; then
       +                oldfile="${tscrapefile}"
       +        else
       +                oldfile="/dev/null"
       +        fi
       +
       +        if ! merge "${name}" "${oldfile}" "${tmpfeedfile}.filter" > "${tmpfeedfile}.merge"; then
       +                log "${name}" "FAIL (MERGE)"
       +                return
       +        fi
       +        rm -f "${tmpfeedfile}.filter"
       +
       +        if ! order "${name}" < "${tmpfeedfile}.merge" > "${tmpfeedfile}.order"; then
       +                log "${name}" "FAIL (ORDER)"
       +                return
       +        fi
       +        rm -f "${tmpfeedfile}.merge"
       +
       +        # atomic move.
       +        if ! mv "${tmpfeedfile}.order" "${tscrapefile}"; then
       +                log "${name}" "FAIL (MOVE)"
       +                return
       +        fi
        
       -        fetchfeed "${name}" "${feedurl}" "${tscrapefile}" | \
       -                tscrape | filter "${name}" > "${tmpfeedfile}"
       -
       -        # get new data and merge with old.
       -        tscrapefilenew="${tscrapepath}/${filename}.new"
       -        # new feed data is non-empty.
       -        if [ -s "${tmpfeedfile}" ]; then
       -                # if file exists, merge
       -                if [ -e "${tscrapefile}" ]; then
       -                        merge "${name}" "${tscrapefile}" "${tmpfeedfile}" | \
       -                                order "${name}" > "${tscrapefilenew}"
       -
       -                        # overwrite old file with updated file
       -                        mv "${tscrapefilenew}" "${tscrapefile}"
       -                else
       -                        merge "${name}" "/dev/null" "${tmpfeedfile}" | \
       -                                order "${name}" > "${tscrapefile}"
       -                fi
       -        fi) &
       +        # OK
       +        log "${name}" "OK"
       +        ) &
        }
        
        cleanup() {
       -        # remove temporary files
       +        # remove temporary directory with files.
                rm -rf "${tscrapetmpdir}"
        }
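       
       Since the commit makes log() one of the overridable functions, it can
       be redefined from the configuration file read by loadconfig(), like the
       other hooks. A minimal sketch of such an override (the full ISO-8601
       date stamp is illustrative, not the script's default):
       
               # override log(name, s): include the date, not just the time of day.
               log() {
                       printf '[%s] %-50.50s %s\n' \
                               "$(date +'%Y-%m-%dT%H:%M:%S')" "$1" "$2" >&2
               }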