#!/bin/sh
#
# Convert a JSON array of Twitter tweet objects (read from stdin) to
# TAB-separated lines, one tweet per line, with the fields:
#
#	timestamp username fullname text id item_username item_fullname
#	item_retweetid item_pinned
#
# Mostly compatible with the tscrape that scraped HTML; some TODOs are left:
#
#  - The text of a tweet may contain HTML entities.  Decode them.
#  - The text of a tweet may now contain `\n'.  Decide whether they should
#    be converted to whitespace or whether the tscrape_* tools should be
#    taught about them.
#
# The `created_at' date is converted to seconds since the Epoch inside awk,
# so the non-portable `-d' option of date(1) is no longer needed.
#

#
# Read json2tsv(1) output (path, type, value separated by TABs) on stdin and
# write one TAB-separated line per tweet on stdout.
#
tweets2tsv()
{
	awk -F '\t' '
#
# Return a copy of s with all occurrences of substring old replaced by new.
# Text inserted from new is never rescanned, so the loop terminates even
# when new itself contains old.  An empty old leaves s unchanged.
#
function replace(s, old, new,    out, i, oldlen)
{
	oldlen = length(old)
	if (oldlen == 0)
		return s

	out = ""
	while ((i = index(s, old)) > 0) {
		out = out substr(s, 1, i - 1) new
		s = substr(s, i + oldlen)
	}
	return out s
}

#
# Convert a date in the created_at format, e.g.
# "Wed Oct 10 20:19:24 +0000 2018", to seconds since the Epoch.
# Return the empty string if s does not have that format.
#
function created_at_to_epoch(s,    f, hms, pos, y, m, d, days, off)
{
	if (split(s, f, " ") != 6 || split(f[4], hms, ":") != 3)
		return ""

	# Every month abbreviation starts at position 1, 4, 7, ... below.
	pos = index("JanFebMarAprMayJunJulAugSepOctNovDec", f[2])
	if (length(f[2]) != 3 || pos % 3 != 1)
		return ""

	if (f[3] !~ /^[0-9]+$/ || f[6] !~ /^[0-9]+$/ ||
	    f[5] !~ /^[+-][0-9][0-9][0-9][0-9]$/ ||
	    hms[1] !~ /^[0-9]+$/ || hms[2] !~ /^[0-9]+$/ ||
	    hms[3] !~ /^[0-9]+$/)
		return ""

	y = f[6] + 0
	m = (pos + 2) / 3
	d = f[3] + 0

	# Count months from March (0) to February (11) of the following
	# year, so that a leap day is the last day of the counted year.
	if (m <= 2) {
		y -= 1
		m += 9
	} else {
		m -= 3
	}

	# Days between 1970-01-01 and the given Gregorian date.
	days = 365 * y + int(y / 4) - int(y / 100) + int(y / 400)
	days += int((153 * m + 2) / 5) + d - 1 - 719468

	# UTC offset in seconds; the clock time given is that much ahead
	# of UTC.
	off = substr(f[5], 2, 2) * 3600 + substr(f[5], 4, 2) * 60
	if (substr(f[5], 1, 1) == "-")
		off = -off

	return days * 86400 + hms[1] * 3600 + hms[2] * 60 + hms[3] - off
}

#
# Store the tweet collected so far in tweets[] (keyed by its id), remember
# its id in tweets_id[] to keep the input order, and reset the per-tweet
# state for the next one.
#
function push_tweet(    u)
{
	# A tweet that is not a retweet is attributed to its own author.
	if (item_username == "")
		item_username = username
	if (item_fullname == "")
		item_fullname = fullname

	tweets_id[ntweets++] = item_id

	# Expand the shortened URLs in the text.  A URL without a known
	# expansion is left as it is.
	for (u in urls) {
		if (urls[u] != "")
			text = replace(text, u, urls[u])
	}

	tweets[item_id,"timestamp"] = timestamp
	tweets[item_id,"username"] = username
	tweets[item_id,"fullname"] = fullname
	tweets[item_id,"text"] = text
	tweets[item_id,"item_username"] = item_username
	tweets[item_id,"item_fullname"] = item_fullname
	tweets[item_id,"item_retweetid"] = item_retweetid
	tweets[item_id,"item_pinned"] = int(pinned_tweets[item_id])

	timestamp = ""
	username = ""
	fullname = ""
	text = ""
	item_username = ""
	item_fullname = ""
	item_retweetid = ""
	delete urls
}

# A top-level created_at starts a new tweet: flush the previous one, if any.
# The in_tweet flag (not the timestamp) marks the boundary, so a tweet whose
# date cannot be parsed is still emitted, with an empty timestamp.
$1 == "[].created_at" {
	if (in_tweet)
		push_tweet()
	in_tweet = 1
	timestamp = created_at_to_epoch($3)
}

$1 == "[].id" {
	item_id = $3
}

# For a retweet the full text of the retweeted status, which comes later
# in the input, overrides the text of the retweet itself.
$1 ~ /^\[\](\.retweeted_status)?\.full_text$/ {
	text = $3
}

$1 == "[].user.name" {
	fullname = $3
}

$1 == "[].user.screen_name" {
	username = $3
}

$1 == "[].retweeted_status.id" {
	item_retweetid = $3
}

$1 == "[].retweeted_status.user.name" {
	item_fullname = $3
}

$1 == "[].retweeted_status.user.screen_name" {
	item_username = $3
}

$1 == "[].user.pinned_tweets_ids[]" {
	pinned_tweets[$3] = 1
}

# Remember every shortened URL; its expansion follows in expanded_url.
$1 ~ /^\[\](\.retweeted_status)?\.entities\.urls\[\]\.url$/ {
	url = $3
	urls[url] = ""
}

$1 ~ /^\[\](\.retweeted_status)?\.entities\.urls\[\]\.expanded_url$/ {
	urls[url] = $3
}

# Media: by default the shortened URL expands to the media file itself.
$1 ~ /^\[\](\.retweeted_status)?\.extended_entities\.media\[\]\.media_url_https$/ {
	media_url = $3
}

$1 ~ /^\[\](\.retweeted_status)?\.extended_entities\.media\[\]\.url$/ {
	url = $3
	urls[url] = media_url
}

$1 ~ /^\[\](\.retweeted_status)?\.extended_entities\.media\[\]\.expanded_url$/ {
	if (media_url ~ /video/) {
		# Prefer video to the URL instead of the image
		urls[url] = $3
	}
}

END {
	# Flush the last tweet; print nothing at all if no tweet was seen.
	if (in_tweet)
		push_tweet()

	for (i = 0; i < ntweets; i++) {
		id = tweets_id[i]
		printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d\n",
		    tweets[id,"timestamp"],
		    tweets[id,"username"],
		    tweets[id,"fullname"],
		    tweets[id,"text"],
		    id,
		    tweets[id,"item_username"],
		    tweets[id,"item_fullname"],
		    tweets[id,"item_retweetid"],
		    tweets[id,"item_pinned"])
	}
}
'
}

json2tsv | tweets2tsv