tscrape_update improvements - tscrape - twitter scraper
(HTM) git clone git://git.codemadness.org/tscrape
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit db47c97bea3370886d011a2c950ead2551cf3fbc
(DIR) parent 5e6e62cf3522747a7c4573736d774503ff139a12
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Sun, 12 May 2019 19:20:49 +0200
tscrape_update improvements
- Better checking and verbose logging (on failure) of each stage:
fetchfeed, filter, merge, order, convertencoding. This makes sure on out-of-memory,
disk-space or other resource limits the output is not corrupted.
- This also has the added advantage it runs less processes (piped) at the same
time.
- Clear previous unneeded file to preserve space in /tmp
(/tmp is often mounted as mfs/tmpfs).
- Rename fetchfeed to fetch.
- Add logging function (able to override), use more logical logging format (pun
intended).
- Code-style: order overridable functions in execution order.
Diffstat:
M tscrape_update | 104 ++++++++++++++++++++-----------
1 file changed, 69 insertions(+), 35 deletions(-)
---
(DIR) diff --git a/tscrape_update b/tscrape_update
@@ -31,10 +31,17 @@ loadconfig() {
fi
}
-# merge raw files: unique sort by id, retweetid.
-# merge(name, oldfile, newfile)
-merge() {
- sort -t ' ' -u -k5,5 -k8,8 "$2" "$3" 2>/dev/null
+# log(name,s)
+log() {
+ printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2" >&2
+}
+
+# fetch a feed via HTTP/HTTPS etc.
+# fetch(name, url, feedfile)
+fetch() {
+ # fail on redirects, hide User-Agent, timeout is 15 seconds.
+ curl -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
+ "$2" 2>/dev/null
}
# filter fields.
@@ -49,15 +56,10 @@ order() {
sort -t ' ' -k1rn,1
}
-# fetch a feed via HTTP/HTTPS etc.
-# fetchfeed(name, url, feedfile)
-fetchfeed() {
- if curl --http1.0 -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
- "$2" 2>/dev/null; then
- printf " OK %s %s\n" "$(date +'%H:%M:%S')" "$1" >&2
- else
- printf "FAIL %s %s\n" "$(date +'%H:%M:%S')" "$1" >&2
- fi
+# merge raw files: unique sort by id, retweetid.
+# merge(name, oldfile, newfile)
+merge() {
+ sort -t ' ' -u -k5,5 -k8,8 "$2" "$3" 2>/dev/null
}
# fetch and parse feed.
@@ -73,33 +75,65 @@ feed() {
(name="$1"
filename="$(printf '%s' "$1" | tr '/' '_')"
feedurl="$2"
- tmpfeedfile="${tscrapetmpdir}/${filename}"
- tmpencfile=""
+
tscrapefile="${tscrapepath}/${filename}"
+ tmpfeedfile="${tscrapetmpdir}/${filename}"
+
+ if ! fetch "${name}" "${feedurl}" "${tscrapefile}" > "${tmpfeedfile}.fetch"; then
+ log "${name}" "FAIL (FETCH)"
+ return
+ fi
+
+ if ! tscrape < "${tmpfeedfile}.fetch" > "${tmpfeedfile}.tsv"; then
+ log "${name}" "FAIL (CONVERT)"
+ return
+ fi
+ rm -f "${tmpfeedfile}.fetch"
+
+ if ! filter "${name}" < "${tmpfeedfile}.tsv" > "${tmpfeedfile}.filter"; then
+ log "${name}" "FAIL (FILTER)"
+ return
+ fi
+ rm -f "${tmpfeedfile}.tsv"
+
+ # new feed data is empty: no need for below stages.
+ if [ ! -s "${tmpfeedfile}.filter" ]; then
+ log "${name}" "OK"
+ return
+ fi
+
+ # if file does not exist yet "merge" with /dev/null.
+ if [ -e "${tscrapefile}" ]; then
+ oldfile="${tscrapefile}"
+ else
+ oldfile="/dev/null"
+ fi
+
+ if ! merge "${name}" "${oldfile}" "${tmpfeedfile}.filter" > "${tmpfeedfile}.merge"; then
+ log "${name}" "FAIL (MERGE)"
+ return
+ fi
+ rm -f "${tmpfeedfile}.filter"
+
+ if ! order "${name}" < "${tmpfeedfile}.merge" > "${tmpfeedfile}.order"; then
+ log "${name}" "FAIL (ORDER)"
+ return
+ fi
+ rm -f "${tmpfeedfile}.merge"
+
+ # atomic move.
+ if ! mv "${tmpfeedfile}.order" "${tscrapefile}"; then
+ log "${name}" "FAIL (MOVE)"
+ return
+ fi
- fetchfeed "${name}" "${feedurl}" "${tscrapefile}" | \
- tscrape | filter "${name}" > "${tmpfeedfile}"
-
- # get new data and merge with old.
- tscrapefilenew="${tscrapepath}/${filename}.new"
- # new feed data is non-empty.
- if [ -s "${tmpfeedfile}" ]; then
- # if file exists, merge
- if [ -e "${tscrapefile}" ]; then
- merge "${name}" "${tscrapefile}" "${tmpfeedfile}" | \
- order "${name}" > "${tscrapefilenew}"
-
- # overwrite old file with updated file
- mv "${tscrapefilenew}" "${tscrapefile}"
- else
- merge "${name}" "/dev/null" "${tmpfeedfile}" | \
- order "${name}" > "${tscrapefile}"
- fi
- fi) &
+ # OK
+ log "${name}" "OK"
+ ) &
}
cleanup() {
- # remove temporary files
+ # remove temporary directory with files.
rm -rf "${tscrapetmpdir}"
}