#!/bin/sh

#
# Mostly compatible with tscrape that scraped HTML, some TODOs left:
#
#  - The text of a tweet may contain HTML entities.  Decode them.
#  - The text of a tweet may now contain `\n'.  Decide whether they should
#    be converted to whitespace or whether the tscrape_* tools should be
#    taught about them.
#  - The `-d' option of date(1) is only supported by NetBSD date(1).
#    Write a program that does just the equivalent, and only for the
#    `created_at' date format used by Twitter.
#

json2tsv |
awk -F '\t' '
#
# Return a copy of s with all occurrences of substring old replaced by new.
#
# The string is scanned left to right and the scan resumes after each
# inserted copy of new, so the function terminates even when new itself
# contains old (e.g. an expanded URL that embeds its shortened form).
# An empty old matches nothing and s is returned unchanged.
#
# pos, len and out are local variables.
#
function replace(s, old, new,    pos, len, out) {
	len = length(old)
	if (len == 0) {
		# index() with an empty needle is not consistent across awk
		# implementations; never loop on it.
		return s
	}

	out = ""
	while ((pos = index(s, old)) > 0) {
		# Move everything up to and including this match to out and
		# continue with the not yet scanned remainder only.
		out = out substr(s, 1, pos - 1) new
		s = substr(s, pos + len)
	}

	return out s
}

#
# Store the tweet accumulated in the global per-tweet variables (timestamp,
# username, fullname, text, item_id, item_username, item_fullname,
# item_retweetid and urls) in the tweets array keyed by item_id, append
# item_id to tweets_id and reset the per-tweet variables for the next tweet.
#
# u is a local variable.
#
function push_tweet(    u)
{
	# When no retweeted_status user was seen, the item author defaults to
	# the tweet author.  Compare against "" explicitly: a numeric-looking
	# name such as "0" is false in a boolean context and would be
	# mistaken for a missing one.
	if (item_username == "") {
		item_username = username
	}
	if (item_fullname == "") {
		item_fullname = fullname
	}

	tweets_id[n++] = item_id

	# Substitute the collected URLs in the text.  A URL for which no
	# replacement was seen is kept as is instead of being erased.
	for (u in urls) {
		if (urls[u] != "") {
			text = replace(text, u, urls[u])
		}
	}

	tweets[item_id,"timestamp"] = timestamp
	tweets[item_id,"username"] = username
	tweets[item_id,"fullname"] = fullname
	tweets[item_id,"text"] = text
	tweets[item_id,"item_username"] = item_username
	tweets[item_id,"item_fullname"] = item_fullname
	tweets[item_id,"item_retweetid"] = item_retweetid
	# int() maps an id missing from pinned_tweets to 0.
	tweets[item_id,"item_pinned"] = int(pinned_tweets[item_id])

	# item_id is deliberately left alone: it is overwritten by the next
	# "[].id" record.
	timestamp = ""
	username = ""
	fullname = ""
	text = ""
	item_username = ""
	item_fullname = ""
	item_retweetid = ""
	delete urls
}

#
# Convert a Twitter created_at date such as "Wed Oct 10 20:19:24 +0000 2018"
# to seconds since the Epoch, returned as a decimal string.
# Only the UTC form (+0000 offset) from 1970 onwards is handled; "" is
# returned for anything else so that the caller can fall back to date(1).
#
# f, t, m, d, y, era, yoe, doy and days are local variables.
#
function created_at_epoch(s,    f, t, m, d, y, era, yoe, doy, days) {
	if (s !~ /^[A-Za-z]+ [A-Za-z]+ [0-9][0-9]? [0-9][0-9]:[0-9][0-9]:[0-9][0-9] \+0000 [0-9][0-9][0-9][0-9]$/) {
		return ""
	}
	split(s, f, " ")
	split(f[4], t, ":")

	# Month names are 3 characters wide, so a valid one is found at
	# position 1, 4, 7, ... of the table.
	m = index("JanFebMarAprMayJunJulAugSepOctNovDec", f[2])
	if (length(f[2]) != 3 || m % 3 != 1) {
		return ""
	}
	m = (m + 2) / 3
	d = f[3] + 0
	y = f[6] + 0
	if (y < 1970 || d < 1 || d > 31 ||
	    t[1] + 0 > 23 || t[2] + 0 > 59 || t[3] + 0 > 59) {
		return ""
	}

	# Days since 1970-01-01 in the Gregorian calendar.  Years are counted
	# from March so that the leap day is the last day of the year.
	if (m <= 2) {
		y--
		m += 9
	} else {
		m -= 3
	}
	era = int(y / 400)
	yoe = y - era * 400
	doy = int((153 * m + 2) / 5) + d - 1
	days = era * 146097 + yoe * 365 + int(yoe / 4) - int(yoe / 100) + doy - 719468

	# %.0f: print the integral value in full whatever CONVFMT is.
	return sprintf("%.0f", days * 86400 + t[1] * 3600 + t[2] * 60 + t[3])
}

#
# Start of a new tweet: flush the previous one, if any, and compute the
# timestamp of the new one.  This relies on created_at being seen before
# the other members of the same tweet.
#
$1 == "[].created_at" {
	# A previous tweet is pending as soon as its id was seen.  Do not
	# test timestamp here: it stays empty when the date conversion
	# fails and all the tweets would then be merged into a single one.
	if (item_id != "") {
		push_tweet()
	}

	# Keep only the characters of the expected date format, this also
	# makes created_at safe to be embedded in a shell command.
	created_at = $3
	gsub(/[^A-Za-z0-9:+ ]/, "", created_at)

	timestamp = created_at_epoch(created_at)
	if (timestamp == "") {
		# Unexpected format, let date(1) try.
		# XXX: this is not portable (NetBSD date(1))
		cmd = sprintf("date -ud \"%s\" +%%s", created_at)
		cmd | getline timestamp
		close(cmd)
	}
}

#
# Collect the scalar members of the current tweet.  $1 is matched as the
# JSON path of the record and $3 is taken as its value; a path matches at
# most one of the branches below.
#
{
	if ($1 == "[].id") {
		item_id = $3
	} else if ($1 ~ /^\[\](\.retweeted_status)?\.full_text$/) {
		# Both the tweet and its retweeted_status set the text, the
		# one seen last wins.
		text = $3
	} else if ($1 == "[].user.name") {
		fullname = $3
	} else if ($1 == "[].user.screen_name") {
		username = $3
	} else if ($1 == "[].retweeted_status.id") {
		item_retweetid = $3
	} else if ($1 == "[].retweeted_status.user.name") {
		item_fullname = $3
	} else if ($1 == "[].retweeted_status.user.screen_name") {
		item_username = $3
	} else if ($1 == "[].user.pinned_tweets_ids[]") {
		# Remembered for the whole run, looked up by push_tweet().
		pinned_tweets[$3] = 1
	}
}

#
# Fill the urls array of the current tweet: it maps each URL found in the
# `url` members to the string that replaces it in the text.  url and
# media_url remember the most recently seen URL and media URL for the
# records that follow them.
#
{
	if ($1 ~ /^\[\](\.retweeted_status)?\.entities\.urls\[\]\.url$/) {
		# Register the URL, the expanded_url branch fills it in.
		urls[$3] = ""
		url = $3
	} else if ($1 ~ /^\[\](\.retweeted_status)?\.entities\.urls\[\]\.expanded_url$/) {
		urls[url] = $3
	} else if ($1 ~ /^\[\](\.retweeted_status)?\.extended_entities\.media\[\]\.media_url_https$/) {
		media_url = $3
	} else if ($1 ~ /^\[\](\.retweeted_status)?\.extended_entities\.media\[\]\.url$/) {
		# By default a media URL is replaced by the media file itself.
		urls[$3] = media_url
		url = $3
	} else if ($1 ~ /^\[\](\.retweeted_status)?\.extended_entities\.media\[\]\.expanded_url$/ &&
	    media_url ~ /video/) {
		# For a video use the expanded URL rather than the URL of
		# its image.
		urls[url] = $3
	}
}

#
# Flush the last pending tweet, then print all the collected tweets in
# input order, one per line, as TAB-separated fields: timestamp, username,
# fullname, text, id, item_username, item_fullname, item_retweetid and
# pinned flag.
#
END {
	# No id was ever seen when the input contains no tweet: there is
	# nothing to flush then and no all-empty record must be printed.
	if (item_id != "") {
		push_tweet()
	}

	for (i = 0; i < n; i++) {
		id = tweets_id[i]
		printf("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d\n",
		    tweets[id,"timestamp"],
		    tweets[id,"username"],
		    tweets[id,"fullname"],
		    tweets[id,"text"],
		    id,
		    tweets[id,"item_username"],
		    tweets[id,"item_fullname"],
		    tweets[id,"item_retweetid"],
		    tweets[id,"item_pinned"])
	}
}
'