tscrape_update - tscrape - twitter scraper (not working anymore)
git clone git://git.codemadness.org/tscrape
---
tscrape_update (4960B)
---
#!/bin/sh
# update feeds, merge with old feeds.
# NOTE: assumes "tscrape_*" executables are in $PATH.

# defaults
tscrapepath="$HOME/.tscrape/feeds"

# used for processing feeds concurrently: wait until a batch of ${maxjobs}
# feeds has finished before starting the next batch.
maxjobs=8

# Twitter authentication bearer (seems to be static).
bearer="AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw"

# guest token.
token=""

# load config (evaluate shellscript).
# loadconfig(configfile)
loadconfig() {
	# allow specifying the config file via argv[1].
	if [ "$1" != "" ]; then
		# get the absolute path of the config file.
		config=$(readlink -f "$1")
	else
		# default config location.
		config="$HOME/.tscrape/tscraperc"
	fi

	# the config is loaded here to be able to override $tscrapepath or functions.
	if [ -r "${config}" ]; then
		. "${config}"
	else
		echo "Configuration file \"${config}\" does not exist or is not readable." >&2
		echo "See tscraperc.example for an example." >&2
		exit 1
	fi
}
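
# A minimal tscraperc sketch (illustrative, not copied from this repository;
# see tscraperc.example for the real one). loadconfig() sources it, so it can
# override $tscrapepath and the functions below, and it should define feeds(),
# which calls feed(name, twittername):
#
#	#tscrapepath="$HOME/.tscrape/feeds"
#	feeds() {
#		feed "somename" "sometwittername"
#	}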

# log(name, s)
log() {
	printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2" >&2
}

# acquire a guest token.
# guesttoken()
guesttoken() {
	# fail on redirects, hide User-Agent, timeout is 15 seconds.
	curl -X POST -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
		-H "Authorization: Bearer ${bearer}" \
		'https://api.twitter.com/1.1/guest/activate.json' 2>/dev/null | \
		sed -nE 's@.*\{"guest_token":"([^"]*)"\}.*@\1@p'
}
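
# The activate.json response is expected to contain (an assumption inferred
# from the sed expression above, not verified against the live API):
#   {"guest_token":"1234567890"}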

# fetch a feed via HTTP/HTTPS etc.
# fetch(name, twittername, feedfile)
fetch() {
	url="https://api.twitter.com/1.1/statuses/user_timeline.json?screen_name=$2&tweet_mode=extended&count=100&include_rts=1"

	# fail on redirects, hide User-Agent, timeout is 15 seconds.
	curl -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
		-H "Authorization: Bearer ${bearer}" \
		-H "x-guest-token: $token" \
		"${url}" 2>/dev/null
}

# filter fields (default: pass the TSV through unchanged; override in tscraperc).
# filter(name)
filter() {
	cat
}
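
# Example filter() override for tscraperc (a sketch: it assumes field 8 of
# the TSV is the retweet id, per the merge() comment below): drop retweets.
#
#	filter() {
#		awk -F '\t' '$8 == ""'
#	}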

# merge raw files: unique sort by id, retweetid.
# NOTE: the -t argument is a literal TAB character.
# merge(name, oldfile, newfile)
merge() {
	sort -t '	' -u -k5,5 -k8,8 "$2" "$3" 2>/dev/null
}
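
# TSV layout assumed here (inferred from the sort keys above and below, not
# re-checked against tscrape output): field 1 = UNIX timestamp,
# field 5 = tweet id, field 8 = retweet id.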

# order by timestamp (descending); field 1 is the UNIX timestamp.
# NOTE: the -t argument is a literal TAB character.
# order(name)
order() {
	sort -t '	' -k1rn,1
}

# fetch and parse a feed, then run the pipeline:
# fetch -> tscrape -> filter -> merge -> order -> copy.
# feed(name, feedurl) - here feedurl is the twitter screen name.
feed() {
	# wait until ${maxjobs} are finished: this will stall the queue if an
	# item is slow, but it is portable.
	[ ${signo} -ne 0 ] && return
	[ $((curjobs % maxjobs)) -eq 0 ] && wait
	[ ${signo} -ne 0 ] && return
	curjobs=$((curjobs + 1))

	(name="$1"
	filename="$(printf '%s' "$1" | tr '/' '_')"
	feedurl="$2"

	tscrapefile="${tscrapepath}/${filename}"
	tmpfeedfile="${tscrapetmpdir}/${filename}"

	if ! fetch "${name}" "${feedurl}" "${tscrapefile}" > "${tmpfeedfile}.fetch"; then
		log "${name}" "FAIL (FETCH)"
		return
	fi

	if ! tscrape < "${tmpfeedfile}.fetch" > "${tmpfeedfile}.tsv"; then
		log "${name}" "FAIL (CONVERT)"
		return
	fi
	rm -f "${tmpfeedfile}.fetch"

	if ! filter "${name}" < "${tmpfeedfile}.tsv" > "${tmpfeedfile}.filter"; then
		log "${name}" "FAIL (FILTER)"
		return
	fi
	rm -f "${tmpfeedfile}.tsv"

	# new feed data is empty: no need for the stages below.
	if [ ! -s "${tmpfeedfile}.filter" ]; then
		log "${name}" "OK"
		return
	fi

	# if the file does not exist yet, "merge" with /dev/null.
	if [ -e "${tscrapefile}" ]; then
		oldfile="${tscrapefile}"
	else
		oldfile="/dev/null"
	fi

	if ! merge "${name}" "${oldfile}" "${tmpfeedfile}.filter" > "${tmpfeedfile}.merge"; then
		log "${name}" "FAIL (MERGE)"
		return
	fi
	rm -f "${tmpfeedfile}.filter"

	if ! order "${name}" < "${tmpfeedfile}.merge" > "${tmpfeedfile}.order"; then
		log "${name}" "FAIL (ORDER)"
		return
	fi
	rm -f "${tmpfeedfile}.merge"

	# copy the result over the old feed file.
	if ! cp "${tmpfeedfile}.order" "${tscrapefile}"; then
		log "${name}" "FAIL (COPY)"
		return
	fi
	rm -f "${tmpfeedfile}.order"

	# OK
	log "${name}" "OK"
	) &
}

cleanup() {
	# remove the temporary directory with files.
	rm -rf "${tscrapetmpdir}"
}

sighandler() {
	signo="$1"
	# ignore the TERM signal for myself.
	trap -- "" TERM
	# kill all running children (-$$ signals our process group) >:D
	kill -TERM -$$
}

# fallback feeds(): normally overridden by the feeds() defined in tscraperc.
feeds() {
	echo "Configuration file \"${config}\" is invalid or does not contain a \"feeds\" function." >&2
	echo "See tscraperc.example for an example." >&2
}

# get the guest token.
token=$(guesttoken)
if [ -z "${token}" ]; then
	echo "Failed to acquire guest token" >&2
	exit 1
fi

# job counter.
curjobs=0
# signal number received by the parent.
signo=0
# SIGINT: signal to interrupt the parent.
trap -- "sighandler 2" "INT"
# SIGTERM: signal to terminate the parent.
trap -- "sighandler 15" "TERM"
# load the config file.
loadconfig "$1"
# fetch feeds and store them in a temporary directory.
tscrapetmpdir="$(mktemp -d '/tmp/tscrape_XXXXXX')"
# make sure the feeds path exists.
mkdir -p "${tscrapepath}"
# fetch the feeds specified in the config file.
feeds
# wait until all feeds are fetched (concurrently).
[ ${signo} -eq 0 ] && wait
# clean up temporary files etc.
cleanup
# on signal SIGINT or SIGTERM exit with the signal number + 128.
[ ${signo} -ne 0 ] && exit $((signo + 128))
exit 0
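
# Example usage (paths are illustrative):
#	tscrape_update                        # uses $HOME/.tscrape/tscraperc
#	tscrape_update /path/to/my/tscraperc  # explicit config file as argv[1]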