work-in-progress: support the new Twitter site - tscrape - twitter scraper
(HTM) git clone git://git.codemadness.org/tscrape
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit c3e76b0f57c58b284cd13ce008c082525c8ee28a
(DIR) parent 663dab7d9883a291ed570a743fb89a16e1a01d85
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Fri, 5 Jun 2020 14:51:58 +0200
work-in-progress: support the new Twitter site
Scraping doesn't work anymore. Use the Twitter JSON API.
Major thanks to leot for helping with this.
Diffstat:
M Makefile | 18 +++++++++---------
M README | 28 ++++++++++++----------------
A json.c | 313 +++++++++++++++++++++++++++++++
A json.h | 26 ++++++++++++++++++++++++++
M tscrape.c | 591 +++++++++++++++++++------------
M tscrape_plain.c | 2 +-
M tscrape_update | 31 +++++++++++++++++++++++++++++--
M tscraperc.example | 8 ++++----
M util.c | 40 -------------------------------
M util.h | 2 --
D xml.c | 451 -------------------------------
D xml.h | 49 -------------------------------
12 files changed, 755 insertions(+), 804 deletions(-)
---
(DIR) diff --git a/Makefile b/Makefile
@@ -25,17 +25,17 @@ SCRIPTS = \
SRC = ${BIN:=.c}
HDR = \
util.h\
- xml.h
+ json.h
LIBUTIL = libutil.a
LIBUTILSRC = \
util.c
LIBUTILOBJ = ${LIBUTILSRC:.c=.o}
-LIBXML = libxml.a
-LIBXMLSRC = \
- xml.c
-LIBXMLOBJ = ${LIBXMLSRC:.c=.o}
+LIBJSON = libjson.a
+LIBJSONSRC = \
+ json.c
+LIBJSONOBJ = ${LIBJSONSRC:.c=.o}
COMPATSRC = \
strlcat.c\
@@ -44,7 +44,7 @@ COMPATOBJ =\
strlcat.o\
strlcpy.o
-LIB = ${LIBUTIL} ${LIBXML} ${COMPATOBJ}
+LIB = ${LIBUTIL} ${LIBJSON} ${COMPATOBJ}
MAN1 = ${BIN:=.1}\
${SCRIPTS:=.1}
@@ -59,7 +59,7 @@ all: $(BIN)
${BIN}: ${LIB} ${@:=.o}
-OBJ = ${SRC:.c=.o} ${LIBXMLOBJ} ${LIBUTILOBJ} ${COMPATOBJ}
+OBJ = ${SRC:.c=.o} ${LIBJSONOBJ} ${LIBUTILOBJ} ${COMPATOBJ}
${OBJ}: ${HDR}
@@ -73,7 +73,7 @@ ${LIBUTIL}: ${LIBUTILOBJ}
${AR} rc $@ $?
${RANLIB} $@
-${LIBXML}: ${LIBXMLOBJ}
+${LIBJSON}: ${LIBJSONOBJ}
${AR} rc $@ $?
${RANLIB} $@
@@ -81,7 +81,7 @@ dist:
rm -rf "${NAME}-${VERSION}"
mkdir -p "${NAME}-${VERSION}"
cp -f ${MAN1} ${MAN5} ${DOC} ${HDR} \
- ${SRC} ${LIBXMLSRC} ${LIBUTILSRC} ${COMPATSRC} ${SCRIPTS} \
+ ${SRC} ${LIBJSONSRC} ${LIBUTILSRC} ${COMPATSRC} ${SCRIPTS} \
Makefile \
tscraperc.example style.css \
"${NAME}-${VERSION}"
(DIR) diff --git a/README b/README
@@ -1,13 +1,16 @@
tscrape
-------
-Twitter feed HTML scraper.
+Twitter feed parser.
-It scrapes HTML from stdin and outputs it to a TAB-separated format that can be
-easier parsed with various (UNIX) tools. There are formatting programs included
-to convert this TAB-separated format to various other formats. There are also
-some programs and scripts included to import and export OPML and to fetch,
-filter, merge and order items.
+It parses JSON from stdin and outputs it to a TAB-separated format that can be
+processed more easily with various (UNIX) tools. There are formatting programs
+included to convert this TAB-separated format to various other formats. There
+are also some programs and scripts included to import and export OPML and to
+fetch, filter, merge and order items.
+
+The name tscrape stems from its original purpose: scraping the HTML of the
+Twitter page. It now uses the contents of the JSON API instead.
Build and install
@@ -20,20 +23,13 @@ $ make
Usage
-----
- curl -H 'User-Agent:' -s 'https://twitter.com/namehere' | tscrape
-
-or
-
- ftp -o - -U '' 'https://twitter.com/namehere' 2>/dev/null | tscrape
-
-or
-
- hurl 'https://twitter.com/namehere' | tscrape
+* Create a tscraperc configuration file in ~/.tscrape/tscraperc, see tscraperc.example.
+* Run tscrape_update
Using sfeed to convert the tscrape TSV output to an Atom feed:
- hurl 'https://twitter.com/namehere' | tscrape | \
+ tscrape < ~/.tscrape/feeds/name | \
awk 'BEGIN { OFS = FS = "\t"; }
{
print $1 OFS $4 OFS "https://twitter.com/" $6 "/status/" $5 \
(DIR) diff --git a/json.c b/json.c
@@ -0,0 +1,313 @@
+#include <ctype.h>
+#include <errno.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define GETNEXT getchar
+
+#include "json.h"
+
+static int
+codepointtoutf8(long r, char *s)
+{
+ if (r == 0) {
+ return 0; /* NUL byte */
+ } else if (r <= 0x7F) {
+ /* 1 byte: 0aaaaaaa */
+ s[0] = r;
+ return 1;
+ } else if (r <= 0x07FF) {
+ /* 2 bytes: 00000aaa aabbbbbb */
+ s[0] = 0xC0 | ((r & 0x0007C0) >> 6); /* 110aaaaa */
+ s[1] = 0x80 | (r & 0x00003F); /* 10bbbbbb */
+ return 2;
+ } else if (r <= 0xFFFF) {
+ /* 3 bytes: aaaabbbb bbcccccc */
+ s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
+ s[1] = 0x80 | ((r & 0x000FC0) >> 6); /* 10bbbbbb */
+ s[2] = 0x80 | (r & 0x00003F); /* 10cccccc */
+ return 3;
+ } else {
+ /* 4 bytes: 000aaabb bbbbcccc ccdddddd */
+ s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
+ s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
+ s[2] = 0x80 | ((r & 0x000FC0) >> 6); /* 10cccccc */
+ s[3] = 0x80 | (r & 0x00003F); /* 10dddddd */
+ return 4;
+ }
+}
+
+static int
+hexdigit(int c)
+{
+ if (c >= '0' && c <= '9')
+ return c - '0';
+ else if (c >= 'a' && c <= 'f')
+ return 10 + (c - 'a');
+ else if (c >= 'A' && c <= 'F')
+ return 10 + (c - 'A');
+ return 0;
+}
+
+static int
+capacity(char **value, size_t *sz, size_t cur, size_t inc)
+{
+ size_t need, newsiz;
+ char *newp;
+
+ /* check for addition overflow */
+ if (cur > SIZE_MAX - inc) {
+ errno = EOVERFLOW;
+ return -1;
+ }
+ need = cur + inc;
+
+ if (need > *sz) {
+ if (need > SIZE_MAX / 2) {
+ newsiz = SIZE_MAX;
+ } else {
+ for (newsiz = *sz < 64 ? 64 : *sz; newsiz <= need; newsiz *= 2)
+ ;
+ }
+ if (!(newp = realloc(*value, newsiz)))
+ return -1; /* up to caller to free *value */
+ *value = newp;
+ *sz = newsiz;
+ }
+ return 0;
+}
+
+#define EXPECT_VALUE "{[\"-0123456789tfn"
+#define EXPECT_STRING "\""
+#define EXPECT_END "}],"
+#define EXPECT_OBJECT_STRING EXPECT_STRING "}"
+#define EXPECT_OBJECT_KEY ":"
+#define EXPECT_ARRAY_VALUE EXPECT_VALUE "]"
+
+#define JSON_INVALID() do { ret = JSON_ERROR_INVALID; goto end; } while (0);
+
+int
+parsejson(void (*cb)(struct json_node *, size_t, const char *))
+{
+ struct json_node nodes[JSON_MAX_NODE_DEPTH] = { 0 };
+ size_t depth = 0, p = 0, len, sz = 0;
+ long cp, hi, lo;
+ char pri[128], *str = NULL;
+ int c, i, escape, iskey = 0, ret = JSON_ERROR_MEM;
+ const char *expect = EXPECT_VALUE;
+
+ if (capacity(&(nodes[0].name), &(nodes[0].namesiz), 0, 1) == -1)
+ goto end;
+ nodes[0].name[0] = '\0';
+
+ while (1) {
+ c = GETNEXT();
+handlechr:
+ if (c == EOF)
+ break;
+
+ /* skip JSON white-space, (NOTE: no \v, \f, \b etc) */
+ if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
+ continue;
+
+ if (!c || !strchr(expect, c))
+ JSON_INVALID();
+
+ switch (c) {
+ case ':':
+ iskey = 0;
+ expect = EXPECT_VALUE;
+ break;
+ case '"':
+ nodes[depth].type = JSON_TYPE_STRING;
+ escape = 0;
+ len = 0;
+ while (1) {
+ c = GETNEXT();
+chr:
+ /* EOF or control char: 0x7f is not defined as a control char in RFC8259 */
+ if (c < 0x20)
+ JSON_INVALID();
+
+ if (escape) {
+escchr:
+ escape = 0;
+ switch (c) {
+ case '"': /* FALLTHROUGH */
+ case '\\':
+ case '/': break;
+ case 'b': c = '\b'; break;
+ case 'f': c = '\f'; break;
+ case 'n': c = '\n'; break;
+ case 'r': c = '\r'; break;
+ case 't': c = '\t'; break;
+ case 'u': /* hex hex hex hex */
+ if (capacity(&str, &sz, len, 4) == -1)
+ goto end;
+ for (i = 12, cp = 0; i >= 0; i -= 4) {
+ if ((c = GETNEXT()) == EOF || !isxdigit(c))
+ JSON_INVALID(); /* invalid code point */
+ cp |= (hexdigit(c) << i);
+ }
+ /* RFC8259 - 7. Strings - surrogates.
+ * 0xd800 - 0xdb7f - high surrogates */
+ if (cp >= 0xd800 && cp <= 0xdb7f) {
+ if ((c = GETNEXT()) != '\\') {
+ len += codepointtoutf8(cp, &str[len]);
+ goto chr;
+ }
+ if ((c = GETNEXT()) != 'u') {
+ len += codepointtoutf8(cp, &str[len]);
+ goto escchr;
+ }
+ for (hi = cp, i = 12, lo = 0; i >= 0; i -= 4) {
+ if ((c = GETNEXT()) == EOF || !isxdigit(c))
+ JSON_INVALID(); /* invalid code point */
+ lo |= (hexdigit(c) << i);
+ }
+ /* 0xdc00 - 0xdfff - low surrogates */
+ if (lo >= 0xdc00 && lo <= 0xdfff) {
+ cp = (hi << 10) + lo - 56613888; /* - offset */
+ } else {
+ /* handle graceful: raw invalid output bytes */
+ len += codepointtoutf8(hi, &str[len]);
+ if (capacity(&str, &sz, len, 4) == -1)
+ goto end;
+ len += codepointtoutf8(lo, &str[len]);
+ continue;
+ }
+ }
+ len += codepointtoutf8(cp, &str[len]);
+ continue;
+ default:
+ JSON_INVALID(); /* invalid escape char */
+ }
+ if (capacity(&str, &sz, len, 1) == -1)
+ goto end;
+ str[len++] = c;
+ } else if (c == '\\') {
+ escape = 1;
+ } else if (c == '"') {
+ if (capacity(&str, &sz, len, 1) == -1)
+ goto end;
+ str[len++] = '\0';
+
+ if (iskey) {
+ /* copy string as key, including NUL byte */
+ if (capacity(&(nodes[depth].name), &(nodes[depth].namesiz), len, 1) == -1)
+ goto end;
+ memcpy(nodes[depth].name, str, len);
+ } else {
+ cb(nodes, depth + 1, str);
+ }
+ break;
+ } else {
+ if (capacity(&str, &sz, len, 1) == -1)
+ goto end;
+ str[len++] = c;
+ }
+ }
+ if (iskey)
+ expect = EXPECT_OBJECT_KEY;
+ else
+ expect = EXPECT_END;
+ break;
+ case '[':
+ case '{':
+ if (depth + 1 >= JSON_MAX_NODE_DEPTH)
+ JSON_INVALID(); /* too deep */
+
+ nodes[depth].index = 0;
+ if (c == '[') {
+ nodes[depth].type = JSON_TYPE_ARRAY;
+ expect = EXPECT_ARRAY_VALUE;
+ } else if (c == '{') {
+ iskey = 1;
+ nodes[depth].type = JSON_TYPE_OBJECT;
+ expect = EXPECT_OBJECT_STRING;
+ }
+
+ cb(nodes, depth + 1, "");
+
+ depth++;
+ nodes[depth].index = 0;
+ if (capacity(&(nodes[depth].name), &(nodes[depth].namesiz), 0, 1) == -1)
+ goto end;
+ nodes[depth].name[0] = '\0';
+ break;
+ case ']':
+ case '}':
+ if (!depth ||
+ (c == ']' && nodes[depth - 1].type != JSON_TYPE_ARRAY) ||
+ (c == '}' && nodes[depth - 1].type != JSON_TYPE_OBJECT))
+ JSON_INVALID(); /* unbalanced nodes */
+
+ nodes[--depth].index++;
+ expect = EXPECT_END;
+ break;
+ case ',':
+ if (!depth)
+ JSON_INVALID(); /* unbalanced nodes */
+
+ nodes[depth - 1].index++;
+ if (nodes[depth - 1].type == JSON_TYPE_OBJECT) {
+ iskey = 1;
+ expect = EXPECT_STRING;
+ } else {
+ expect = EXPECT_VALUE;
+ }
+ break;
+ case 't': /* true */
+ if (GETNEXT() != 'r' || GETNEXT() != 'u' || GETNEXT() != 'e')
+ JSON_INVALID();
+ nodes[depth].type = JSON_TYPE_BOOL;
+ cb(nodes, depth + 1, "true");
+ expect = EXPECT_END;
+ break;
+ case 'f': /* false */
+ if (GETNEXT() != 'a' || GETNEXT() != 'l' || GETNEXT() != 's' ||
+ GETNEXT() != 'e')
+ JSON_INVALID();
+ nodes[depth].type = JSON_TYPE_BOOL;
+ cb(nodes, depth + 1, "false");
+ expect = EXPECT_END;
+ break;
+ case 'n': /* null */
+ if (GETNEXT() != 'u' || GETNEXT() != 'l' || GETNEXT() != 'l')
+ JSON_INVALID();
+ nodes[depth].type = JSON_TYPE_NULL;
+ cb(nodes, depth + 1, "null");
+ expect = EXPECT_END;
+ break;
+ default: /* number */
+ nodes[depth].type = JSON_TYPE_NUMBER;
+ p = 0;
+ pri[p++] = c;
+ expect = EXPECT_END;
+ while (1) {
+ c = GETNEXT();
+ if (c == EOF ||
+ !c || !strchr("0123456789eE+-.", c) ||
+ p + 1 >= sizeof(pri)) {
+ pri[p] = '\0';
+ cb(nodes, depth + 1, pri);
+ goto handlechr; /* do not read next char, handle this */
+ } else {
+ pri[p++] = c;
+ }
+ }
+ }
+ }
+ if (depth)
+ JSON_INVALID(); /* unbalanced nodes */
+
+ ret = 0; /* success */
+end:
+ for (depth = 0; depth < sizeof(nodes) / sizeof(nodes[0]); depth++)
+ free(nodes[depth].name);
+ free(str);
+
+ return ret;
+}
(DIR) diff --git a/json.h b/json.h
@@ -0,0 +1,26 @@
+#include <stddef.h>
+
+enum JSONType {
+ JSON_TYPE_ARRAY = 'a',
+ JSON_TYPE_OBJECT = 'o',
+ JSON_TYPE_STRING = 's',
+ JSON_TYPE_BOOL = 'b',
+ JSON_TYPE_NULL = '?',
+ JSON_TYPE_NUMBER = 'n'
+};
+
+enum JSONError {
+ JSON_ERROR_MEM = -2,
+ JSON_ERROR_INVALID = -1
+};
+
+#define JSON_MAX_NODE_DEPTH 64
+
+struct json_node {
+ enum JSONType type;
+ char *name;
+ size_t namesiz;
+ size_t index; /* count/index for array or object type */
+};
+
+int parsejson(void (*cb)(struct json_node *, size_t, const char *));
(DIR) diff --git a/tscrape.c b/tscrape.c
@@ -2,107 +2,52 @@
#include <ctype.h>
#include <err.h>
+#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>
+#include <time.h>
#include <unistd.h>
-#include "xml.h"
+#include "json.h"
#include "util.h"
#define STRP(s) s,sizeof(s)-1
-/* states */
-enum {
- Item = 1,
- Stream = 2,
- Header = 4,
- Timestamp = 8,
- Text = 16
+/* a tweet */
+struct tweet {
+ char fullname[1024];
+ int ispinned;
+ char itemusername[1024];
+ char itemfullname[1024];
+ char full_text[4096];
+ char username[1024];
+ time_t timestamp;
+ char datatime[16];
+ char itemid[64];
+ char retweetid[64];
+
+ struct tweet *next;
};
-/* data */
-static char fullname[1024];
-static int ispinned;
-static char itemusername[1024];
-static char itemfullname[1024];
-static char timestamp[16];
-static char text[4096];
-static char username[1024];
-
-static char classname[256];
-static char datatime[16];
-static char itemid[64];
-static char retweetid[64];
-static int state;
-static XMLParser p;
-
-static const char *ignorestate, *endtag;
-static int (*getnext)(void);
-
-/* return a space for all data until some case-insensitive string occurs. This
- is used to parse incorrect HTML/XML that contains unescaped HTML in script
- or style tags. If you see some </script> tag in a CDATA or comment
- section then e-mail W3C and tell them the web is too complex. */
-static inline int
-getnext_ignore(void)
-{
- int c;
-
- if ((c = getnext()) == EOF)
- return EOF;
+/* url entities and their replacements */
+struct url {
+ char url[256];
+ size_t url_len;
+ char expanded_url[1024];
- if (tolower(c) == tolower((unsigned char)*ignorestate)) {
- ignorestate++;
- if (*ignorestate == '\0') {
- p.getnext = getnext; /* restore */
- return c;
- }
- } else {
- ignorestate = endtag;
- }
-
- return ' ';
-}
-
-static void
-printtweet(void)
-{
- char buf[32];
- time_t t;
-
- if (parsetime(timestamp, &t, buf, sizeof(buf)) != -1)
- printf("%lld", (long long)t);
- putchar('\t');
- printescape(username);
- putchar('\t');
- printescape(fullname);
- putchar('\t');
- printescape(text);
- putchar('\t');
- printescape(itemid);
- putchar('\t');
- printescape(itemusername);
- putchar('\t');
- printescape(itemfullname);
- putchar('\t');
- printescape(retweetid);
- putchar('\t');
- printf("%d", ispinned);
- putchar('\n');
-}
+ struct url *next;
+};
-static int
-isclassmatch(const char *classes, const char *clss, size_t len)
-{
- const char *p;
+static struct tweet *tweets, *tc;
+static struct url *urls, *uc;
+static char url[256];
- if (!(p = strstr(classes, clss)))
- return 0;
- return (p == classes || isspace((unsigned char)p[-1])) &&
- (isspace((unsigned char)p[len]) || !p[len]);
-}
+#define MAX_PINNED 5
+static char pinnedids[MAX_PINNED][64];
+static size_t npinned;
+#if 0
/* convert XML and some HTML entities */
static int
html_entitytostr(const char *s, char *buf, size_t bufsiz)
@@ -115,192 +60,378 @@ html_entitytostr(const char *s, char *buf, size_t bufsiz)
return (ssize_t)strlcpy(buf, " ", bufsiz);
return len;
}
+#endif
-static void
-xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
+long long
+datetounix(long long year, int mon, int day, int hour, int min, int sec)
{
- if (!strcmp(t, "p"))
- state &= ~Text;
- else if (!strcmp(t, "span"))
- state &= ~(Timestamp);
+ static const int secs_through_month[] = {
+ 0, 31 * 86400, 59 * 86400, 90 * 86400,
+ 120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400,
+ 243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 };
+ int is_leap = 0, cycles, centuries = 0, leaps = 0, rem;
+ long long t;
+
+ if (year - 2ULL <= 136) {
+ leaps = (year - 68) >> 2;
+ if (!((year - 68) & 3)) {
+ leaps--;
+ is_leap = 1;
+ } else {
+ is_leap = 0;
+ }
+ t = 31536000 * (year - 70) + 86400 * leaps;
+ } else {
+ cycles = (year - 100) / 400;
+ rem = (year - 100) % 400;
+ if (rem < 0) {
+ cycles--;
+ rem += 400;
+ }
+ if (!rem) {
+ is_leap = 1;
+ } else {
+ if (rem >= 300)
+ centuries = 3, rem -= 300;
+ else if (rem >= 200)
+ centuries = 2, rem -= 200;
+ else if (rem >= 100)
+ centuries = 1, rem -= 100;
+ if (rem) {
+ leaps = rem / 4U;
+ rem %= 4U;
+ is_leap = !rem;
+ }
+ }
+ leaps += 97 * cycles + 24 * centuries - is_leap;
+ t = (year - 100) * 31536000LL + leaps * 86400LL + 946684800 + 86400;
+ }
+ t += secs_through_month[mon];
+ if (is_leap && mon >= 2)
+ t += 86400;
+ t += 86400LL * (day - 1);
+ t += 3600LL * hour;
+ t += 60LL * min;
+ t += sec;
+
+ return t;
}
-static void
-xmltagstart(XMLParser *x, const char *t, size_t tl)
+/* parse time format: "Wed May 27 04:12:34 +0000 2020"
+ assumes tz offset is "+0000" */
+static int
+parsetime(const char *s, time_t *tp)
{
- classname[0] = '\0';
+ static char *mons[] = {
+ "Jan", "Feb", "Mar", "Apr", "May", "Jun",
+ "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
+ };
+ int year, mon = 0, mday, hour, min, sec, i;
+ char tzbuf[6], monbuf[4], wdaybuf[4];
+
+ for (; *s && isspace((unsigned char)*s); s++)
+ ;
+ i = sscanf(s, "%3s %3s %02d %02d:%02d:%02d %5s %4d",
+ wdaybuf, monbuf, &mday, &hour, &min, &sec, tzbuf, &year);
+ if (i != 8)
+ return -1;
+ for (i = 0; i < sizeof(mons) / sizeof(mons[0]); i++) {
+ if (!strcmp(mons[i], monbuf)) {
+ mon = i + 1;
+ break;
+ }
+ }
+ if (mon == 0)
+ return -1;
+
+ /* invalid range */
+ if (year < 0 || year > 9999 ||
+ mon < 1 || mon > 12 ||
+ mday < 1 || mday > 31 ||
+ hour < 0 || hour > 23 ||
+ min < 0 || min> 59 ||
+ sec < 0 || sec > 59)
+ return -1;
+
+ if (tp)
+ *tp = datetounix(year - 1900, mon - 1, mday, hour, min, sec);
+ return 0;
}
static void
-xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
+printescape(const char *s)
{
- /* temporary replace the callback except the reader and end of tag
- restore the context once we receive the same ignored tag in the
- end tag handler */
- if (!strcasecmp(t, "script")) {
- ignorestate = endtag = "</script>";
- getnext = x->getnext; /* for restore */
- x->getnext = getnext_ignore;
- return;
- } else if (!strcasecmp(t, "style")) {
- ignorestate = endtag = "</style>";
- getnext = x->getnext; /* for restore */
- x->getnext = getnext_ignore;
- return;
+ for (; *s; s++) {
+ if (!iscntrl((unsigned char)*s))
+ putchar(*s);
}
-
- if (!strcmp(t, "p") && isclassmatch(classname, STRP("js-tweet-text"))) {
- if (state & (Item | Stream | Header))
- state |= Text;
- } else if (!strcmp(t, "div") &&
- isclassmatch(classname, STRP("stream-item-footer"))) {
- if (text[0] && username[0])
- printtweet();
- state = 0;
- } else if (!strcmp(t, "li") &&
- isclassmatch(classname, STRP("js-stream-item"))) {
- if (state & Item)
- return;
- state |= Item;
- datatime[0] = text[0] = timestamp[0] = itemfullname[0] = '\0';
- itemid[0] = itemusername[0] = retweetid[0] = '\0';
- ispinned = 0;
- if (isclassmatch(classname, STRP("js-pinned")))
- ispinned = 1;
- } else if (state & Item) {
- if (!strcmp(t, "div") &&
- isclassmatch(classname, STRP("js-stream-tweet"))) {
- state &= ~(Text|Header);
- state |= Stream;
- } else if (!strcmp(t, "a") &&
- isclassmatch(classname, STRP("js-action-profile"))) {
- state |= Header;
- } else if (!strcmp(t, "span") &&
- isclassmatch(classname, STRP("js-short-timestamp"))) {
- state |= Timestamp;
- strlcpy(timestamp, datatime, sizeof(timestamp));
- datatime[0] = '\0';
- }
- }
- if ((state & Text) && !strcmp(t, "a") && !isspace((unsigned char)text[0]))
- strlcat(text, " ", sizeof(text));
}
+/* print text and expand urls */
static void
-xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
- const char *v, size_t vl)
+printexpand(const char *s)
{
- /* NOTE: assumes classname attribute is set before data-* in current tag */
- if (!state && !strcmp(t, "div") && isclassmatch(classname, STRP("user-actions"))) {
- if (!strcmp(a, "data-screen-name")) {
- strlcat(username, " ", sizeof(username));
- strlcat(username, v, sizeof(username));
- } else if (!strcmp(a, "data-name")) {
- strlcat(fullname, " ", sizeof(fullname));
- strlcat(fullname, v, sizeof(fullname));
- }
- }
-
- if (!strcmp(a, "class")) {
- strlcat(classname, v, sizeof(classname));
- } else if (state & Item) {
- if (!strcmp(t, "div")) {
- if (!strcmp(a, "data-item-id"))
- strlcpy(itemid, v, sizeof(itemid));
- else if (!strcmp(a, "data-retweet-id"))
- strlcpy(retweetid, v, sizeof(retweetid));
-
- if (isclassmatch(classname, STRP("js-stream-tweet"))) {
- if (!strcmp(a, "data-screen-name")) {
- strlcat(itemusername, " ", sizeof(itemusername));
- strlcat(itemusername, v, sizeof(itemusername));
- } else if (!strcmp(a, "data-name")) {
- strlcat(itemfullname, " ", sizeof(itemfullname));
- strlcat(itemfullname, v, sizeof(itemfullname));
- }
+ struct url *u;
+
+ for (; *s; s++) {
+ if (iscntrl((unsigned char)*s))
+ continue;
+ for (u = urls; u; u = u->next) {
+ if (!strncmp(s, u->url, u->url_len)) {
+ s += u->url_len;
+ printescape(u->expanded_url);
+ break;
}
- } else if (!strcmp(t, "span") && !strcmp(a, "data-time")) {
- /* UNIX timestamp */
- strlcpy(datatime, v, sizeof(datatime));
- }
- /* NOTE: can be <div data-image-url>. */
- if (!strcmp(a, "data-image-url")) {
- strlcat(text, " ", sizeof(text));
- strlcat(text, v, sizeof(text));
- }
-
- /* indication it has a video */
- if (itemid[0] && !strcmp(a, "data-playable-media-url")) {
- strlcat(text, " ", sizeof(text));
- strlcat(text, "https://twitter.com/i/videos/", sizeof(text));
- strlcat(text, itemid, sizeof(text));
}
+ if (!u)
+ putchar(*s);
}
}
static void
-xmlattrentity(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
- const char *v, size_t vl)
+printtweet(struct tweet *t)
{
- char buf[16];
- int len;
+ if (t->timestamp != -1)
+ printf("%lld", (long long)t->timestamp);
+ putchar('\t');
+ printescape(t->username);
+ putchar('\t');
+ printescape(t->fullname);
+ putchar('\t');
+ printexpand(t->full_text);
+ putchar('\t');
+ printescape(t->itemid);
+ putchar('\t');
+ if (t->itemusername[0])
+ printescape(t->itemusername);
+ else
+ printescape(t->username);
+ putchar('\t');
+ if (t->itemfullname[0])
+ printescape(t->itemfullname);
+ else
+ printescape(t->fullname);
+ putchar('\t');
+ printescape(t->retweetid);
+ putchar('\t');
+ printf("%d", t->ispinned);
+ putchar('\n');
+}
- if (!state)
+void
+addpinned(const char *str)
+{
+ if (npinned + 1 >= MAX_PINNED)
return;
- if ((len = html_entitytostr(v, buf, sizeof(buf))) > 0)
- xmlattr(x, t, tl, a, al, buf, (size_t)len);
- else
- xmlattr(x, t, tl, a, al, v, vl);
+ strlcpy(pinnedids[npinned], str, sizeof(pinnedids[0]));
+ npinned++;
}
-static void
-xmldata(XMLParser *x, const char *d, size_t dl)
+void
+addtweet(void)
{
- if (state & Text) {
- if (!isclassmatch(classname, STRP("u-hidden")))
- strlcat(text, d, sizeof(text));
- }
+ struct tweet *t;
+
+ if (!(t = calloc(1, sizeof(*t))))
+ err(1, "calloc");
+ t->timestamp = -1;
+ if (tweets)
+ tc = tc->next = t;
+ else
+ tweets = tc = t;
}
-static void
-xmldataentity(XMLParser *x, const char *d, size_t dl)
+void
+addurl(const char *url, const char *expanded_url)
{
- char buf[16];
- int len;
+ struct url *u;
- if (!(state & Text))
- return;
- if ((len = html_entitytostr(d, buf, sizeof(buf))) > 0)
- xmldata(x, buf, (size_t)len);
+ if (!(u = calloc(1, sizeof(*u))))
+ err(1, "calloc");
+ strlcpy(u->url, url, sizeof(u->url));
+ u->url_len = strlen(u->url);
+ strlcpy(u->expanded_url, expanded_url, sizeof(u->expanded_url));
+
+ if (urls)
+ uc = uc->next = u;
else
- xmldata(x, d, dl);
+ urls = uc = u;
}
-static void
-xmlcdata(XMLParser *x, const char *d, size_t dl)
+void
+processnodes(struct json_node *nodes, size_t depth, const char *str)
{
- xmldata(x, d, dl);
+ if (depth == 2 &&
+ nodes[0].type == JSON_TYPE_ARRAY &&
+ nodes[1].type == JSON_TYPE_OBJECT) {
+ addtweet();
+ }
+
+ if (tc) {
+ if (depth == 3 &&
+ nodes[0].type == JSON_TYPE_ARRAY &&
+ nodes[1].type == JSON_TYPE_OBJECT &&
+ nodes[2].type == JSON_TYPE_STRING) {
+ if (!strcmp(nodes[2].name, "created_at")) {
+ parsetime(str, &tc->timestamp);
+ } else if (!strcmp(nodes[2].name, "id_str")) {
+ strlcpy(tc->itemid, str, sizeof(tc->itemid));
+ } else if (!strcmp(nodes[2].name, "full_text")) {
+ /* if set by retweet text don't override */
+ if (!tc->full_text[0])
+ strlcpy(tc->full_text, str, sizeof(tc->full_text));
+ }
+ }
+ if (depth == 4 &&
+ nodes[0].type == JSON_TYPE_ARRAY &&
+ nodes[1].type == JSON_TYPE_OBJECT &&
+ nodes[2].type == JSON_TYPE_OBJECT &&
+ !strcmp(nodes[2].name, "user")) {
+ if (nodes[3].type == JSON_TYPE_STRING) {
+ if (!strcmp(nodes[3].name, "name")) {
+ strlcpy(tc->fullname, str, sizeof(tc->fullname));
+ } else if (!strcmp(nodes[3].name, "screen_name")) {
+ strlcpy(tc->username, str, sizeof(tc->username));
+ }
+ }
+ }
+
+ if (depth == 4 &&
+ nodes[0].type == JSON_TYPE_ARRAY &&
+ nodes[1].type == JSON_TYPE_OBJECT &&
+ nodes[2].type == JSON_TYPE_OBJECT &&
+ nodes[3].type == JSON_TYPE_STRING &&
+ !strcmp(nodes[2].name, "retweeted_status")) {
+ if (!strcmp(nodes[3].name, "id_str")) {
+// printf("DEBUG: retweet: id: %s\n", str);
+ strlcpy(tc->retweetid, str, sizeof(tc->retweetid));
+ } else if (!strcmp(nodes[3].name, "full_text")) {
+ strlcpy(tc->full_text, str, sizeof(tc->full_text));
+// printf("DEBUG: retweet: full_text: %s\n", str);
+ }
+ }
+
+ if (depth == 5 &&
+ nodes[0].type == JSON_TYPE_ARRAY &&
+ nodes[1].type == JSON_TYPE_OBJECT &&
+ nodes[2].type == JSON_TYPE_OBJECT &&
+ nodes[3].type == JSON_TYPE_OBJECT &&
+ nodes[4].type == JSON_TYPE_STRING &&
+ !strcmp(nodes[2].name, "retweeted_status") &&
+ !strcmp(nodes[3].name, "user")) {
+ if (!strcmp(nodes[4].name, "name")) {
+ strlcpy(tc->itemfullname, str, sizeof(tc->itemfullname));
+// printf("DEBUG: retweeted_status.user.name: %s\n", str);
+ } else if (!strcmp(nodes[4].name, "screen_name")) {
+ strlcpy(tc->itemusername, str, sizeof(tc->itemusername));
+// printf("DEBUG: retweeted_status.user.screen_name: %s\n", str);
+ }
+ }
+ }
+
+ if (depth == 5 &&
+ nodes[0].type == JSON_TYPE_ARRAY &&
+ nodes[1].type == JSON_TYPE_OBJECT &&
+ nodes[2].type == JSON_TYPE_OBJECT &&
+ !strcmp(nodes[2].name, "user")) {
+ if (nodes[3].type == JSON_TYPE_ARRAY &&
+ !strcmp(nodes[3].name, "pinned_tweet_ids")) {
+ if (nodes[4].type == JSON_TYPE_NUMBER) {
+ addpinned(str);
+// printf("DEBUG: pinned_tweets_ids[%zu]: %s\n",
+// nodes[4].index, str);
+ }
+ }
+ }
+
+ if (depth == 6 &&
+ nodes[0].type == JSON_TYPE_ARRAY &&
+ nodes[1].type == JSON_TYPE_OBJECT &&
+ nodes[2].type == JSON_TYPE_OBJECT &&
+ nodes[3].type == JSON_TYPE_ARRAY &&
+ nodes[4].type == JSON_TYPE_OBJECT &&
+ nodes[5].type == JSON_TYPE_STRING &&
+ !strcmp(nodes[2].name, "entities") &&
+ !strcmp(nodes[3].name, "urls")) {
+ if (!strcmp(nodes[5].name, "url")) {
+// printf("DEBUG: url: %s\n", str);
+ strlcpy(url, str, sizeof(url));
+ } else if (!strcmp(nodes[5].name, "expanded_url")) {
+// printf("DEBUG: expanded_url: %s\n", str);
+ /* assumes "expanded_url" is specified after "url" */
+ addurl(url, str);
+ url[0] = '\0';
+ }
+ }
+
+ /* [].entities.media[].url */
+ if (depth == 6 &&
+ nodes[0].type == JSON_TYPE_ARRAY &&
+ nodes[1].type == JSON_TYPE_OBJECT &&
+ nodes[2].type == JSON_TYPE_OBJECT &&
+ nodes[3].type == JSON_TYPE_ARRAY &&
+ nodes[4].type == JSON_TYPE_OBJECT &&
+ nodes[5].type == JSON_TYPE_STRING &&
+ !strcmp(nodes[2].name, "entities") &&
+ !strcmp(nodes[3].name, "media")) {
+ if (!strcmp(nodes[5].name, "url")) {
+// printf("DEBUG: url: %s\n", str);
+ strlcpy(url, str, sizeof(url));
+ } else if (!strcmp(nodes[5].name, "expanded_url")) {
+// printf("DEBUG: expanded_url: %s\n", str);
+ /* assumes "expanded_url" is specified after "url" */
+ addurl(url, str);
+ url[0] = '\0';
+ }
+ }
+
+// TODO: retweeted.status.entities.urls[]
+#if 0
+ if (depth == 6 &&
+ nodes[0].type == JSON_TYPE_ARRAY &&
+ nodes[1].type == JSON_TYPE_OBJECT &&
+ nodes[2].type == JSON_TYPE_OBJECT &&
+ nodes[3].type == JSON_TYPE_OBJECT &&
+ nodes[4].type == JSON_TYPE_ARRAY &&
+ nodes[5].type == JSON_TYPE_STRING &&
+ !strcmp(nodes[2].name, "retweeted_status") &&
+ !strcmp(nodes[3].name, "entities") &&
+ !strcmp(nodes[4].name, "urls")) {
+ if (!strcmp(nodes[5].name, "url")) {
+ printf("DEBUG: url: %s\n", str);
+ } else if (!strcmp(nodes[5].name, "expanded_url")) {
+ printf("DEBUG: expanded_url: %s\n", str);
+ }
+ }
+#endif
}
int
main(void)
{
+ struct tweet *t;
+ size_t i;
+ int r;
+
if (pledge("stdio", NULL) == -1)
err(1, "pledge");
- /* handlers */
- p.xmlattr = xmlattr;
- p.xmlattrentity = xmlattrentity;
- p.xmlcdata = xmlcdata;
- p.xmldata = xmldata;
- p.xmldataentity = xmldataentity;
- p.xmltagstart = xmltagstart;
- p.xmltagend = xmltagend;
- p.xmltagstartparsed = xmltagstartparsed;
- /* reader (stdin) */
- p.getnext = getchar;
-
- xml_parse(&p);
+ r = parsejson(processnodes);
+ if (r != 0)
+ errx(1, "invalid JSON");
+
+ // TODO: TEST: make sure the last tweet is printed too (addtweet() logic).
+ for (t = tweets; t; t = t->next) {
+ /* check for pinned tweets */
+ for (i = 0; i < npinned; i++) {
+ if (!strcmp(t->itemid, pinnedids[i])) {
+// printf("DEBUG: pinned: %s\n", pinnedids[i]);
+ t->ispinned = 1;
+ break;
+ }
+ }
+ printtweet(t);
+ }
return 0;
}
(DIR) diff --git a/tscrape_plain.c b/tscrape_plain.c
@@ -51,7 +51,7 @@ printfeed(FILE *fp, const char *feedname)
printutf8pad(stdout, fields[FieldItemFullname], 25, ' ');
fputs(" ", stdout);
- printescape(fields[FieldText]);
+ fputs(fields[FieldText], stdout);
putchar('\n');
}
}
(DIR) diff --git a/tscrape_update b/tscrape_update
@@ -9,6 +9,12 @@ tscrapepath="$HOME/.tscrape/feeds"
# feeds are finished at a time.
maxjobs=8
+# Twitter authentication bearer (seems to be static).
+bearer="AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA"
+
+# guest token.
+token=""
+
# load config (evaluate shellscript).
# loadconfig(configfile)
loadconfig() {
@@ -36,12 +42,26 @@ log() {
printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2" >&2
}
+# acquire guest token.
+# guesttoken()
+guesttoken() {
+ # fail on redirects, hide User-Agent, timeout is 15 seconds.
+ curl -X POST -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
+ -H "Authorization: Bearer ${bearer}" \
+ 'https://api.twitter.com/1.1/guest/activate.json' 2>/dev/null | \
+ sed -nE 's@.*{"guest_token":"([^"]*)"}.*@\1@p'
+}
+
# fetch a feed via HTTP/HTTPS etc.
-# fetch(name, url, feedfile)
+# fetch(name, twittername, feedfile)
fetch() {
+ url="https://api.twitter.com/1.1/statuses/user_timeline.json?screen_name=$2&tweet_mode=extended&count=50&include_rts=1"
+
# fail on redirects, hide User-Agent, timeout is 15 seconds.
curl -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
- "$2" 2>/dev/null
+ -H "Authorization: Bearer ${bearer}" \
+ -H "x-guest-token: $token" \
+ "${url}" 2>/dev/null
}
# filter fields.
@@ -151,6 +171,13 @@ feeds() {
echo "See tscraperc.example for an example." >&2
}
+# get guest token.
+token=$(guesttoken)
+if [ -z "${token}" ]; then
+ echo "Failed to acquire guest token" >&2
+ exit 1
+fi
+
# job counter.
curjobs=0
# signal number received for parent.
(DIR) diff --git a/tscraperc.example b/tscraperc.example
@@ -2,8 +2,8 @@
# list of feeds to fetch:
feeds() {
- # feed <name> <feedurl>
- feed "Rich Felker" "https://twitter.com/richfelker"
- feed "Internet of shit" "https://twitter.com/internetofshit"
- feed "Donald Trump" "https://twitter.com/realdonaldtrump"
+ # feed <name> <twittername>
+ feed "Rich Felker" "richfelker"
+ feed "Internet of shit" "internetofshit"
+ feed "Donald Trump" "realdonaldtrump"
}
(DIR) diff --git a/util.c b/util.c
@@ -106,43 +106,3 @@ printutf8pad(FILE *fp, const char *s, size_t len, int pad)
for (; col < len; ++col)
putc(pad, fp);
}
-
-void
-printescape(const char *s)
-{
- int r;
- const char *e;
-
- /* strip leading and trailing white-space */
- for (; *s && isspace((unsigned char)*s); s++)
- ;
- for (e = s + strlen(s); e > s && isspace((unsigned char)*(e - 1)); e--)
- ;
-
- for (r = 0; *s && s < e; s++) {
- if (iscntrl((unsigned char)*s) || isspace((unsigned char)*s)) {
- r = 1;
- continue;
- }
- if (r) {
- r = 0;
- putchar(' ');
- }
- putchar(*s);
- }
-}
-
-int
-parsetime(const char *s, time_t *t, char *buf, size_t bufsiz)
-{
- struct tm *tm;
-
- if (strtotime(s, t))
- return -1;
- if (!(tm = localtime(t)))
- return -1;
- if (!strftime(buf, bufsiz, "%Y-%m-%d %H:%M", tm))
- return -1;
-
- return 0;
-}
(DIR) diff --git a/util.h b/util.h
@@ -30,8 +30,6 @@ enum {
};
size_t parseline(char *, char *[FieldLast]);
-int parsetime(const char *, time_t *, char *, size_t);
-void printescape(const char *);
void printutf8pad(FILE *, const char *, size_t, int);
int strtotime(const char *, time_t *);
void xmlencode(const char *, FILE *);
(DIR) diff --git a/xml.c b/xml.c
@@ -1,451 +0,0 @@
-#include <ctype.h>
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "xml.h"
-
-static void
-xml_parseattrs(XMLParser *x)
-{
- size_t namelen = 0, valuelen;
- int c, endsep, endname = 0, valuestart = 0;
-
- while ((c = GETNEXT()) != EOF) {
- if (isspace(c)) {
- if (namelen)
- endname = 1;
- continue;
- } else if (c == '?')
- ; /* ignore */
- else if (c == '=') {
- x->name[namelen] = '\0';
- valuestart = 1;
- endname = 1;
- } else if (namelen && ((endname && !valuestart && isalpha(c)) || (c == '>' || c == '/'))) {
- /* attribute without value */
- x->name[namelen] = '\0';
- if (x->xmlattrstart)
- x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
- if (x->xmlattr)
- x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
- if (x->xmlattrend)
- x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
- endname = 0;
- x->name[0] = c;
- namelen = 1;
- } else if (namelen && valuestart) {
- /* attribute with value */
- if (x->xmlattrstart)
- x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
-
- valuelen = 0;
- if (c == '\'' || c == '"') {
- endsep = c;
- } else {
- endsep = ' '; /* isspace() */
- goto startvalue;
- }
-
- while ((c = GETNEXT()) != EOF) {
-startvalue:
- if (c == '&') { /* entities */
- x->data[valuelen] = '\0';
- /* call data function with data before entity if there is data */
- if (valuelen && x->xmlattr)
- x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
- x->data[0] = c;
- valuelen = 1;
- while ((c = GETNEXT()) != EOF) {
- if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c))))
- break;
- if (valuelen < sizeof(x->data) - 1)
- x->data[valuelen++] = c;
- else {
- /* entity too long for buffer, handle as normal data */
- x->data[valuelen] = '\0';
- if (x->xmlattr)
- x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
- x->data[0] = c;
- valuelen = 1;
- break;
- }
- if (c == ';') {
- x->data[valuelen] = '\0';
- if (x->xmlattrentity)
- x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
- valuelen = 0;
- break;
- }
- }
- } else if (c != endsep && !(endsep == ' ' && (c == '>' || isspace(c)))) {
- if (valuelen < sizeof(x->data) - 1) {
- x->data[valuelen++] = c;
- } else {
- x->data[valuelen] = '\0';
- if (x->xmlattr)
- x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
- x->data[0] = c;
- valuelen = 1;
- }
- }
- if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) {
- x->data[valuelen] = '\0';
- if (x->xmlattr)
- x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
- if (x->xmlattrend)
- x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
- break;
- }
- }
- namelen = endname = valuestart = 0;
- } else if (namelen < sizeof(x->name) - 1) {
- x->name[namelen++] = c;
- }
- if (c == '>') {
- break;
- } else if (c == '/') {
- x->isshorttag = 1;
- x->name[0] = '\0';
- namelen = 0;
- }
- }
-}
-
-static void
-xml_parsecomment(XMLParser *x)
-{
- size_t datalen = 0, i = 0;
- int c;
-
- if (x->xmlcommentstart)
- x->xmlcommentstart(x);
- while ((c = GETNEXT()) != EOF) {
- if (c == '-' || c == '>') {
- if (x->xmlcomment && datalen) {
- x->data[datalen] = '\0';
- x->xmlcomment(x, x->data, datalen);
- datalen = 0;
- }
- }
-
- if (c == '-') {
- if (++i > 2) {
- if (x->xmlcomment)
- for (; i > 2; i--)
- x->xmlcomment(x, "-", 1);
- i = 2;
- }
- continue;
- } else if (c == '>' && i == 2) {
- if (x->xmlcommentend)
- x->xmlcommentend(x);
- return;
- } else if (i) {
- if (x->xmlcomment) {
- for (; i > 0; i--)
- x->xmlcomment(x, "-", 1);
- }
- i = 0;
- }
-
- if (datalen < sizeof(x->data) - 1) {
- x->data[datalen++] = c;
- } else {
- x->data[datalen] = '\0';
- if (x->xmlcomment)
- x->xmlcomment(x, x->data, datalen);
- x->data[0] = c;
- datalen = 1;
- }
- }
-}
-
-static void
-xml_parsecdata(XMLParser *x)
-{
- size_t datalen = 0, i = 0;
- int c;
-
- if (x->xmlcdatastart)
- x->xmlcdatastart(x);
- while ((c = GETNEXT()) != EOF) {
- if (c == ']' || c == '>') {
- if (x->xmlcdata && datalen) {
- x->data[datalen] = '\0';
- x->xmlcdata(x, x->data, datalen);
- datalen = 0;
- }
- }
-
- if (c == ']') {
- if (++i > 2) {
- if (x->xmlcdata)
- for (; i > 2; i--)
- x->xmlcdata(x, "]", 1);
- i = 2;
- }
- continue;
- } else if (c == '>' && i == 2) {
- if (x->xmlcdataend)
- x->xmlcdataend(x);
- return;
- } else if (i) {
- if (x->xmlcdata)
- for (; i > 0; i--)
- x->xmlcdata(x, "]", 1);
- i = 0;
- }
-
- if (datalen < sizeof(x->data) - 1) {
- x->data[datalen++] = c;
- } else {
- x->data[datalen] = '\0';
- if (x->xmlcdata)
- x->xmlcdata(x, x->data, datalen);
- x->data[0] = c;
- datalen = 1;
- }
- }
-}
-
-static int
-codepointtoutf8(long r, char *s)
-{
- if (r == 0) {
- return 0; /* NUL byte */
- } else if (r <= 0x7F) {
- /* 1 byte: 0aaaaaaa */
- s[0] = r;
- return 1;
- } else if (r <= 0x07FF) {
- /* 2 bytes: 00000aaa aabbbbbb */
- s[0] = 0xC0 | ((r & 0x0007C0) >> 6); /* 110aaaaa */
- s[1] = 0x80 | (r & 0x00003F); /* 10bbbbbb */
- return 2;
- } else if (r <= 0xFFFF) {
- /* 3 bytes: aaaabbbb bbcccccc */
- s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
- s[1] = 0x80 | ((r & 0x000FC0) >> 6); /* 10bbbbbb */
- s[2] = 0x80 | (r & 0x00003F); /* 10cccccc */
- return 3;
- } else {
- /* 4 bytes: 000aaabb bbbbcccc ccdddddd */
- s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
- s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
- s[2] = 0x80 | ((r & 0x000FC0) >> 6); /* 10cccccc */
- s[3] = 0x80 | (r & 0x00003F); /* 10dddddd */
- return 4;
- }
-}
-
-static int
-namedentitytostr(const char *e, char *buf, size_t bufsiz)
-{
- static const struct {
- const char *entity;
- int c;
- } entities[] = {
- { "amp;", '&' },
- { "lt;", '<' },
- { "gt;", '>' },
- { "apos;", '\'' },
- { "quot;", '"' },
- };
- size_t i;
-
- /* buffer is too small */
- if (bufsiz < 2)
- return -1;
-
- for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
- if (!strcmp(e, entities[i].entity)) {
- buf[0] = entities[i].c;
- buf[1] = '\0';
- return 1;
- }
- }
- return -1;
-}
-
-static int
-numericentitytostr(const char *e, char *buf, size_t bufsiz)
-{
- long l;
- int len;
- char *end;
-
- /* buffer is too small */
- if (bufsiz < 5)
- return -1;
-
- errno = 0;
- /* hex (16) or decimal (10) */
- if (*e == 'x')
- l = strtol(++e, &end, 16);
- else
- l = strtol(e, &end, 10);
- /* invalid value or not a well-formed entity or invalid code point */
- if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff)
- return -1;
- len = codepointtoutf8(l, buf);
- buf[len] = '\0';
-
- return len;
-}
-
-/* convert named- or numeric entity string to buffer string
- * returns byte-length of string or -1 on failure. */
-int
-xml_entitytostr(const char *e, char *buf, size_t bufsiz)
-{
- /* doesn't start with & */
- if (e[0] != '&')
- return -1;
- /* numeric entity */
- if (e[1] == '#')
- return numericentitytostr(e + 2, buf, bufsiz);
- else /* named entity */
- return namedentitytostr(e + 1, buf, bufsiz);
-}
-
-void
-xml_parse(XMLParser *x)
-{
- size_t datalen, tagdatalen;
- int c, isend;
-
- while ((c = GETNEXT()) != EOF && c != '<')
- ; /* skip until < */
-
- while (c != EOF) {
- if (c == '<') { /* parse tag */
- if ((c = GETNEXT()) == EOF)
- return;
-
- if (c == '!') { /* cdata and comments */
- for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
- /* NOTE: sizeof(x->data) must be atleast sizeof("[CDATA[") */
- if (tagdatalen <= sizeof("[CDATA[") - 1)
- x->data[tagdatalen++] = c;
- if (c == '>')
- break;
- else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
- (x->data[0] == '-')) {
- xml_parsecomment(x);
- break;
- } else if (c == '[') {
- if (tagdatalen == sizeof("[CDATA[") - 1 &&
- !strncmp(x->data, "[CDATA[", tagdatalen)) {
- xml_parsecdata(x);
- break;
- }
- }
- }
- } else {
- /* normal tag (open, short open, close), processing instruction. */
- x->tag[0] = c;
- x->taglen = 1;
- x->isshorttag = isend = 0;
-
- /* treat processing instruction as shorttag, don't strip "?" prefix. */
- if (c == '?') {
- x->isshorttag = 1;
- } else if (c == '/') {
- if ((c = GETNEXT()) == EOF)
- return;
- x->tag[0] = c;
- isend = 1;
- }
-
- while ((c = GETNEXT()) != EOF) {
- if (c == '/')
- x->isshorttag = 1; /* short tag */
- else if (c == '>' || isspace(c)) {
- x->tag[x->taglen] = '\0';
- if (isend) { /* end tag, starts with </ */
- if (x->xmltagend)
- x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
- x->tag[0] = '\0';
- x->taglen = 0;
- } else {
- /* start tag */
- if (x->xmltagstart)
- x->xmltagstart(x, x->tag, x->taglen);
- if (isspace(c))
- xml_parseattrs(x);
- if (x->xmltagstartparsed)
- x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
- }
- /* call tagend for shortform or processing instruction */
- if (x->isshorttag) {
- if (x->xmltagend)
- x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
- x->tag[0] = '\0';
- x->taglen = 0;
- }
- break;
- } else if (x->taglen < sizeof(x->tag) - 1)
- x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
- }
- }
- } else {
- /* parse tag data */
- datalen = 0;
- if (x->xmldatastart)
- x->xmldatastart(x);
- while ((c = GETNEXT()) != EOF) {
- if (c == '&') {
- if (datalen) {
- x->data[datalen] = '\0';
- if (x->xmldata)
- x->xmldata(x, x->data, datalen);
- }
- x->data[0] = c;
- datalen = 1;
- while ((c = GETNEXT()) != EOF) {
- if (c == '<')
- break;
- if (datalen < sizeof(x->data) - 1)
- x->data[datalen++] = c;
- else {
- /* entity too long for buffer, handle as normal data */
- x->data[datalen] = '\0';
- if (x->xmldata)
- x->xmldata(x, x->data, datalen);
- x->data[0] = c;
- datalen = 1;
- break;
- }
- if (c == ';') {
- x->data[datalen] = '\0';
- if (x->xmldataentity)
- x->xmldataentity(x, x->data, datalen);
- datalen = 0;
- break;
- }
- }
- } else if (c != '<') {
- if (datalen < sizeof(x->data) - 1) {
- x->data[datalen++] = c;
- } else {
- x->data[datalen] = '\0';
- if (x->xmldata)
- x->xmldata(x, x->data, datalen);
- x->data[0] = c;
- datalen = 1;
- }
- }
- if (c == '<') {
- x->data[datalen] = '\0';
- if (x->xmldata && datalen)
- x->xmldata(x, x->data, datalen);
- if (x->xmldataend)
- x->xmldataend(x);
- break;
- }
- }
- }
- }
-}
(DIR) diff --git a/xml.h b/xml.h
@@ -1,49 +0,0 @@
-#ifndef _XML_H
-#define _XML_H
-
-#include <stdio.h>
-
-typedef struct xmlparser {
- /* handlers */
- void (*xmlattr)(struct xmlparser *, const char *, size_t,
- const char *, size_t, const char *, size_t);
- void (*xmlattrend)(struct xmlparser *, const char *, size_t,
- const char *, size_t);
- void (*xmlattrstart)(struct xmlparser *, const char *, size_t,
- const char *, size_t);
- void (*xmlattrentity)(struct xmlparser *, const char *, size_t,
- const char *, size_t, const char *, size_t);
- void (*xmlcdatastart)(struct xmlparser *);
- void (*xmlcdata)(struct xmlparser *, const char *, size_t);
- void (*xmlcdataend)(struct xmlparser *);
- void (*xmlcommentstart)(struct xmlparser *);
- void (*xmlcomment)(struct xmlparser *, const char *, size_t);
- void (*xmlcommentend)(struct xmlparser *);
- void (*xmldata)(struct xmlparser *, const char *, size_t);
- void (*xmldataend)(struct xmlparser *);
- void (*xmldataentity)(struct xmlparser *, const char *, size_t);
- void (*xmldatastart)(struct xmlparser *);
- void (*xmltagend)(struct xmlparser *, const char *, size_t, int);
- void (*xmltagstart)(struct xmlparser *, const char *, size_t);
- void (*xmltagstartparsed)(struct xmlparser *, const char *,
- size_t, int);
-
-#ifndef GETNEXT
- #define GETNEXT (x)->getnext
- int (*getnext)(void);
-#endif
-
- /* current tag */
- char tag[1024];
- size_t taglen;
- /* current tag is in short form ? <tag /> */
- int isshorttag;
- /* current attribute name */
- char name[1024];
- /* data buffer used for tag data, cdata and attribute data */
- char data[BUFSIZ];
-} XMLParser;
-
-int xml_entitytostr(const char *, char *, size_t);
-void xml_parse(XMLParser *);
-#endif