tscrape_update - tscrape - twitter scraper (not working anymore)
git clone git://git.codemadness.org/tscrape
---
tscrape_update (4960B)
---
#!/bin/sh
# update feeds, merge with old feeds.
# NOTE: assumes "tscrape_*" executables are in $PATH.

# defaults
tscrapepath="$HOME/.tscrape/feeds"

# used for processing feeds concurrently: wait until a batch of ${maxjobs}
# feeds has finished before starting the next batch.
maxjobs=8

# Twitter authentication bearer (seems to be static).
bearer="AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw"

# guest token.
token=""

# load config (evaluate shellscript).
# loadconfig(configfile)
loadconfig() {
	# allow specifying the config file via argv[1].
	if [ "$1" != "" ]; then
		# get the absolute path of the config file.
		config=$(readlink -f "$1")
	else
		# default config location.
		config="$HOME/.tscrape/tscraperc"
	fi

	# the config is loaded here to be able to override $tscrapepath or functions.
	if [ -r "${config}" ]; then
		. "${config}"
	else
		echo "Configuration file \"${config}\" does not exist or is not readable." >&2
		echo "See tscraperc.example for an example." >&2
		exit 1
	fi
}
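
# A minimal tscraperc sketch (illustrative, not copied from this repository;
# see tscraperc.example for the real one). loadconfig() sources it, so it can
# override $tscrapepath and the functions below, and it should define feeds(),
# which calls feed(name, twittername):
#
#	#tscrapepath="$HOME/.tscrape/feeds"
#	feeds() {
#		feed "somename" "sometwittername"
#	}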

# log(name, s)
log() {
	printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2" >&2
}

# acquire a guest token.
# guesttoken()
guesttoken() {
	# fail on redirects, hide User-Agent, timeout is 15 seconds.
	curl -X POST -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
		-H "Authorization: Bearer ${bearer}" \
		'https://api.twitter.com/1.1/guest/activate.json' 2>/dev/null | \
		sed -nE 's@.*\{"guest_token":"([^"]*)"\}.*@\1@p'
}
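
# The activate.json response is expected to contain (an assumption inferred
# from the sed expression above, not verified against the live API):
#   {"guest_token":"1234567890"}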

# fetch a feed via HTTP/HTTPS etc.
# fetch(name, twittername, feedfile)
fetch() {
	url="https://api.twitter.com/1.1/statuses/user_timeline.json?screen_name=$2&tweet_mode=extended&count=100&include_rts=1"

	# fail on redirects, hide User-Agent, timeout is 15 seconds.
	curl -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
		-H "Authorization: Bearer ${bearer}" \
		-H "x-guest-token: $token" \
		"${url}" 2>/dev/null
}

# filter fields (default: pass the TSV through unchanged; override in tscraperc).
# filter(name)
filter() {
	cat
}
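
# Example filter() override for tscraperc (a sketch: it assumes field 8 of
# the TSV is the retweet id, per the merge() comment below): drop retweets.
#
#	filter() {
#		awk -F '\t' '$8 == ""'
#	}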

# merge raw files: unique sort by id, retweetid.
# NOTE: the -t argument is a literal TAB character.
# merge(name, oldfile, newfile)
merge() {
	sort -t '	' -u -k5,5 -k8,8 "$2" "$3" 2>/dev/null
}
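
# TSV layout assumed here (inferred from the sort keys above and below, not
# re-checked against tscrape output): field 1 = UNIX timestamp,
# field 5 = tweet id, field 8 = retweet id.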

# order by timestamp (descending); field 1 is the UNIX timestamp.
# NOTE: the -t argument is a literal TAB character.
# order(name)
order() {
	sort -t '	' -k1rn,1
}

# fetch and parse a feed, then run the pipeline:
# fetch -> tscrape -> filter -> merge -> order -> copy.
# feed(name, feedurl) - here feedurl is the twitter screen name.
feed() {
	# wait until ${maxjobs} are finished: this will stall the queue if an
	# item is slow, but it is portable.
	[ ${signo} -ne 0 ] && return
	[ $((curjobs % maxjobs)) -eq 0 ] && wait
	[ ${signo} -ne 0 ] && return
	curjobs=$((curjobs + 1))

	(name="$1"
	filename="$(printf '%s' "$1" | tr '/' '_')"
	feedurl="$2"

	tscrapefile="${tscrapepath}/${filename}"
	tmpfeedfile="${tscrapetmpdir}/${filename}"

	if ! fetch "${name}" "${feedurl}" "${tscrapefile}" > "${tmpfeedfile}.fetch"; then
		log "${name}" "FAIL (FETCH)"
		return
	fi

	if ! tscrape < "${tmpfeedfile}.fetch" > "${tmpfeedfile}.tsv"; then
		log "${name}" "FAIL (CONVERT)"
		return
	fi
	rm -f "${tmpfeedfile}.fetch"

	if ! filter "${name}" < "${tmpfeedfile}.tsv" > "${tmpfeedfile}.filter"; then
		log "${name}" "FAIL (FILTER)"
		return
	fi
	rm -f "${tmpfeedfile}.tsv"

	# new feed data is empty: no need for the stages below.
	if [ ! -s "${tmpfeedfile}.filter" ]; then
		log "${name}" "OK"
		return
	fi

	# if the file does not exist yet, "merge" with /dev/null.
	if [ -e "${tscrapefile}" ]; then
		oldfile="${tscrapefile}"
	else
		oldfile="/dev/null"
	fi

	if ! merge "${name}" "${oldfile}" "${tmpfeedfile}.filter" > "${tmpfeedfile}.merge"; then
		log "${name}" "FAIL (MERGE)"
		return
	fi
	rm -f "${tmpfeedfile}.filter"

	if ! order "${name}" < "${tmpfeedfile}.merge" > "${tmpfeedfile}.order"; then
		log "${name}" "FAIL (ORDER)"
		return
	fi
	rm -f "${tmpfeedfile}.merge"

	# copy the result over the old feed file.
	if ! cp "${tmpfeedfile}.order" "${tscrapefile}"; then
		log "${name}" "FAIL (COPY)"
		return
	fi
	rm -f "${tmpfeedfile}.order"

	# OK
	log "${name}" "OK"
	) &
}

cleanup() {
	# remove the temporary directory with files.
	rm -rf "${tscrapetmpdir}"
}

sighandler() {
	signo="$1"
	# ignore the TERM signal for myself.
	trap -- "" TERM
	# kill all running children (-$$ signals our process group) >:D
	kill -TERM -$$
}

# fallback feeds(): normally overridden by the feeds() defined in tscraperc.
feeds() {
	echo "Configuration file \"${config}\" is invalid or does not contain a \"feeds\" function." >&2
	echo "See tscraperc.example for an example." >&2
}

# get the guest token.
token=$(guesttoken)
if [ -z "${token}" ]; then
	echo "Failed to acquire guest token" >&2
	exit 1
fi

# job counter.
curjobs=0
# signal number received by the parent.
signo=0
# SIGINT: signal to interrupt the parent.
trap -- "sighandler 2" "INT"
# SIGTERM: signal to terminate the parent.
trap -- "sighandler 15" "TERM"
# load the config file.
loadconfig "$1"
# fetch feeds and store them in a temporary directory.
tscrapetmpdir="$(mktemp -d '/tmp/tscrape_XXXXXX')"
# make sure the feeds path exists.
mkdir -p "${tscrapepath}"
# fetch the feeds specified in the config file.
feeds
# wait until all feeds are fetched (concurrently).
[ ${signo} -eq 0 ] && wait
# clean up temporary files etc.
cleanup
# on signal SIGINT or SIGTERM exit with the signal number + 128.
[ ${signo} -ne 0 ] && exit $((signo + 128))
exit 0
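
# Example usage (paths are illustrative):
#	tscrape_update                        # uses $HOME/.tscrape/tscraperc
#	tscrape_update /path/to/my/tscraperc  # explicit config file as argv[1]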