#!/bin/sh
#
# Generate tscrape(1) output based on JSON user timeline returned by
# twitter_timeline or similar.
#
# Reads the JSON timeline on stdin (it is piped through json2tsv) and writes
# one TAB-separated line per tweet to stdout with these fields, in order:
#   timestamp, username, fullname, text, id,
#   item username, item fullname, retweet id, pinned flag (0 or 1)
#

#
# Convert json2tsv output (key TAB type TAB value) on stdin into tscrape(1)
# lines on stdout.
#
# NOTE: the awk program below is enclosed in single quotes, so it must not
# contain an apostrophe anywhere, not even in comments.
#
tsv2tscrape() {
  awk -F '\t' '
#
# Return a copy of s with all occurrences of substring old replaced by new.
# The input is scanned once from left to right and inserted text is never
# rescanned, so this terminates even when new itself contains old.
# An empty old matches nothing and s is returned unchanged.
#
function replace(s, old, new,    out, i, oldlen) {
    oldlen = length(old)
    if (oldlen == 0) {
        return s
    }
    out = ""
    while ((i = index(s, old)) > 0) {
        out = out substr(s, 1, i - 1) new
        s = substr(s, i + oldlen)
    }
    return out s
}

#
# Decode the HTML entities listed in the BEGIN table.
# "&amp;" is decoded last so that an escaped ampersand followed by entity
# text (for example "&amp;gt;") yields the literal "&gt;" instead of being
# decoded twice.
# FIXME: Other entities can be present and should be replaced as well
#
function decode_entities(s,    k) {
    if (index(s, "&") == 0) {
        return s
    }
    for (k = 1; k <= ent_count; k++) {
        s = replace(s, "&" ent_name[k] ";", ent_char[k])
    }
    return replace(s, "&amp;", "&")
}

#
# Undo the backslash escapes found in values: \n and \t become a space and
# \\ becomes a single backslash. Done in one left-to-right pass so that an
# escaped backslash followed by "n" or "t" is not mistaken for \n or \t.
# Any other escape (or a trailing lone backslash) is kept as is.
#
function unescape(s,    out, i, c) {
    out = ""
    while ((i = index(s, "\\")) > 0) {
        c = substr(s, i + 1, 1)
        if (c == "n" || c == "t") {
            c = " "
        } else if (c != "\\") {
            c = "\\" c
        }
        out = out substr(s, 1, i - 1) c
        s = substr(s, i + 2)
    }
    return out s
}

#
# Convert a created_at value to seconds since the Unix epoch.
# The layout "Wed Oct 10 20:19:24 +0000 2018" is parsed directly in awk,
# which needs no external command. Anything else falls back to date(1).
# Returns "" when the fallback yields no output.
#
function to_epoch(s,    f, hms, mon, y, m, d, era, yoe, doy, days, off, cmd, ts) {
    if (s ~ /^[A-Z][a-z][a-z] [A-Z][a-z][a-z] [0-9][0-9]? [0-9][0-9]:[0-9][0-9]:[0-9][0-9] [+-][0-9][0-9][0-9][0-9] [0-9][0-9][0-9][0-9]$/) {
        split(s, f, " ")
        split(f[4], hms, ":")
        # Month names sit at offsets 1, 4, 7, ... of the lookup string.
        mon = index("JanFebMarAprMayJunJulAugSepOctNovDec", f[2])
        if (mon % 3 == 1) {
            m = (mon + 2) / 3
            d = f[3] + 0
            # Days since 1970-01-01 of a proleptic Gregorian date; the year
            # is counted from March so the leap day falls at its end.
            y = f[6] - (m <= 2)
            era = int(y / 400)
            yoe = y - era * 400
            doy = int((153 * (m > 2 ? m - 3 : m + 9) + 2) / 5) + d - 1
            days = era * 146097 + yoe * 365 + int(yoe / 4) - int(yoe / 100) + doy - 719468
            # UTC offset in seconds, subtracted to get back to UTC.
            off = (substr(f[5], 2, 2) * 60 + substr(f[5], 4, 2)) * 60
            if (substr(f[5], 1, 1) == "-") {
                off = -off
            }
            # %.0f avoids integer-width limits of %d in some awk versions.
            return sprintf("%.0f", days * 86400 + hms[1] * 3600 + hms[2] * 60 + hms[3] - off)
        }
    }

    # Only keep characters that are harmless inside the quoted shell word.
    gsub(/[^A-Za-z0-9:+ ]/, "", s)
    # XXX: this is not portable (NetBSD date(1))
    cmd = sprintf("date -ud \"%s\" +%%s", s)
    ts = ""
    if ((cmd | getline ts) <= 0) {
        ts = ""
    }
    close(cmd)
    return ts
}

#
# Record target as the replacement for short URL short. Several media items
# sharing one short URL are joined with a space.
#
function add_url(short, target) {
    urls[short] = (urls[short] != "") ? urls[short] " " target : target
}

#
# Store the tweet collected so far under item_id and reset the per-tweet
# state. The item username and fullname default to those of the timeline
# user when no retweeted user was seen.
#
function push_tweet(    u) {
    if (item_username == "") {
        item_username = username
    }
    if (item_fullname == "") {
        item_fullname = fullname
    }

    tweets_id[n++] = item_id

    # Expand short URLs first, while the text is still in its raw form.
    for (u in urls) {
        text = replace(text, u, urls[u])
    }
    text = unescape(decode_entities(text))

    tweets[item_id,"timestamp"] = timestamp
    tweets[item_id,"username"] = username
    tweets[item_id,"fullname"] = fullname
    tweets[item_id,"text"] = text
    tweets[item_id,"item_username"] = item_username
    tweets[item_id,"item_fullname"] = item_fullname
    tweets[item_id,"item_retweetid"] = item_retweetid
    tweets[item_id,"item_pinned"] = int(pinned_tweets[item_id])

    timestamp = ""
    username = ""
    fullname = ""
    text = ""
    item_username = ""
    item_fullname = ""
    item_retweetid = ""
    delete urls
    pending = 0
}

BEGIN {
    n = 0
    ent_count = split("gt lt Aring Auml Euml Iuml Ouml Uuml aring auml euml iuml ouml uuml", ent_name, " ")
    split("> < Å Ä Ë Ï Ö Ü å ä ë ï ö ü", ent_char, " ")
}

# created_at starts a new tweet: flush the one collected so far. An explicit
# flag is used so that a tweet whose timestamp could not be determined is
# still emitted instead of being merged into the next one.
$1 == "[].created_at" {
    if (pending) {
        push_tweet()
    }
    pending = 1
    timestamp = to_epoch($3)
}

$1 == "[].id_str" {
    item_id = $3
}

# The text of a retweeted status comes later in the input and so replaces
# the text of the retweet itself.
$1 ~ /^\[\](\.retweeted_status)?\.full_text$/ {
    text = $3
}

$1 == "[].user.name" {
    fullname = $3
}

$1 == "[].user.screen_name" {
    username = $3
}

# URLs collected so far belong to the outer text; drop them.
$1 == "[].retweeted_status" {
    delete urls
}

$1 == "[].retweeted_status.id_str" {
    item_retweetid = $3
}

$1 == "[].retweeted_status.user.name" {
    item_fullname = $3
}

$1 == "[].retweeted_status.user.screen_name" {
    item_username = $3
}

$1 == "[].user.pinned_tweet_ids[]" {
    pinned_tweets[$3] = 1
}

$1 ~ /^\[\](\.retweeted_status)?\.entities\.urls\[\]\.url$/ {
    url = $3
}

$1 ~ /^\[\](\.retweeted_status)?\.entities\.urls\[\]\.expanded_url$/ {
    urls[url] = $3
}

$1 ~ /^\[\](\.retweeted_status)?\.extended_entities\.media\[\]\.media_url_https$/ {
    media_url = $3
}

$1 ~ /^\[\](\.retweeted_status)?\.extended_entities\.media\[\]\.url$/ {
    url = $3
}

$1 ~ /^\[\](\.retweeted_status)?\.extended_entities\.media\[\]\.expanded_url$/ {
    expanded_url = $3
}

# The type key is handled last for a media item: photos link to the image
# itself, everything else links to the expanded URL.
$1 ~ /^\[\](\.retweeted_status)?\.extended_entities\.media\[\]\.type$/ {
    if ($3 == "photo") {
        add_url(url, media_url)
    } else {
        add_url(url, expanded_url)
    }
}

END {
    # Flush the last tweet, but do not invent one when nothing was read.
    if (pending || item_id != "") {
        push_tweet()
    }
    for (i = 0; i < n; i++) {
        printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d\n",
            tweets[tweets_id[i],"timestamp"],
            tweets[tweets_id[i],"username"],
            tweets[tweets_id[i],"fullname"],
            tweets[tweets_id[i],"text"],
            tweets_id[i],
            tweets[tweets_id[i],"item_username"],
            tweets[tweets_id[i],"item_fullname"],
            tweets[tweets_id[i],"item_retweetid"],
            tweets[tweets_id[i],"item_pinned"])
    }
}
'
}

json2tsv | tsv2tscrape