work-in-progress: support the new Twitter site - tscrape - twitter scraper
 (HTM) git clone git://git.codemadness.org/tscrape
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit c3e76b0f57c58b284cd13ce008c082525c8ee28a
 (DIR) parent 663dab7d9883a291ed570a743fb89a16e1a01d85
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Fri,  5 Jun 2020 14:51:58 +0200
       
       work-in-progress: support the new Twitter site
       
       Scraping doesn't work anymore. Use the Twitter JSON API.
       
       Major thanks to leot for helping with this.
       
       Diffstat:
         M Makefile                            |      18 +++++++++---------
         M README                              |      28 ++++++++++++----------------
         A json.c                              |     313 +++++++++++++++++++++++++++++++
         A json.h                              |      26 ++++++++++++++++++++++++++
         M tscrape.c                           |     591 +++++++++++++++++++------------
         M tscrape_plain.c                     |       2 +-
         M tscrape_update                      |      31 +++++++++++++++++++++++++++++--
         M tscraperc.example                   |       8 ++++----
         M util.c                              |      40 -------------------------------
         M util.h                              |       2 --
         D xml.c                               |     451 -------------------------------
         D xml.h                               |      49 -------------------------------
       
       12 files changed, 755 insertions(+), 804 deletions(-)
       ---
 (DIR) diff --git a/Makefile b/Makefile
       @@ -25,17 +25,17 @@ SCRIPTS = \
        SRC = ${BIN:=.c}
        HDR = \
                util.h\
       -        xml.h
       +        json.h
        
        LIBUTIL = libutil.a
        LIBUTILSRC = \
                util.c
        LIBUTILOBJ = ${LIBUTILSRC:.c=.o}
        
       -LIBXML = libxml.a
       -LIBXMLSRC = \
       -        xml.c
       -LIBXMLOBJ = ${LIBXMLSRC:.c=.o}
       +LIBJSON = libjson.a
       +LIBJSONSRC = \
       +        json.c
       +LIBJSONOBJ = ${LIBJSONSRC:.c=.o}
        
        COMPATSRC = \
                strlcat.c\
       @@ -44,7 +44,7 @@ COMPATOBJ =\
                strlcat.o\
                strlcpy.o
        
       -LIB = ${LIBUTIL} ${LIBXML} ${COMPATOBJ}
       +LIB = ${LIBUTIL} ${LIBJSON} ${COMPATOBJ}
        
        MAN1 = ${BIN:=.1}\
                ${SCRIPTS:=.1}
       @@ -59,7 +59,7 @@ all: $(BIN)
        
        ${BIN}: ${LIB} ${@:=.o}
        
       -OBJ = ${SRC:.c=.o} ${LIBXMLOBJ} ${LIBUTILOBJ} ${COMPATOBJ}
       +OBJ = ${SRC:.c=.o} ${LIBJSONOBJ} ${LIBUTILOBJ} ${COMPATOBJ}
        
        ${OBJ}: ${HDR}
        
       @@ -73,7 +73,7 @@ ${LIBUTIL}: ${LIBUTILOBJ}
                ${AR} rc $@ $?
                ${RANLIB} $@
        
       -${LIBXML}: ${LIBXMLOBJ}
       +${LIBJSON}: ${LIBJSONOBJ}
                ${AR} rc $@ $?
                ${RANLIB} $@
        
       @@ -81,7 +81,7 @@ dist:
                rm -rf "${NAME}-${VERSION}"
                mkdir -p "${NAME}-${VERSION}"
                cp -f ${MAN1} ${MAN5} ${DOC} ${HDR} \
       -                ${SRC} ${LIBXMLSRC} ${LIBUTILSRC} ${COMPATSRC} ${SCRIPTS} \
       +                ${SRC} ${LIBJSONSRC} ${LIBUTILSRC} ${COMPATSRC} ${SCRIPTS} \
                        Makefile \
                        tscraperc.example style.css \
                        "${NAME}-${VERSION}"
 (DIR) diff --git a/README b/README
       @@ -1,13 +1,16 @@
        tscrape
        -------
        
       -Twitter feed HTML scraper.
       +Twitter feed parser.
        
       -It scrapes HTML from stdin and outputs it to a TAB-separated format that can be
       -easier parsed with various (UNIX) tools. There are formatting programs included
       -to convert this TAB-separated format to various other formats. There are also
       -some programs and scripts included to import and export OPML and to fetch,
       -filter, merge and order items.
       +It parses JSON from stdin and outputs it in a TAB-separated format that can be
       +processed more easily with various (UNIX) tools. There are formatting programs
       +included to convert this TAB-separated format to various other formats. There
       +are also some programs and scripts included to import and export OPML and to
       +fetch, filter, merge and order items.
       +
       +The name tscrape remains because the program used to scrape the HTML from the
       +Twitter page.  It now uses the contents of the JSON API instead.
        
        
        Build and install
       @@ -20,20 +23,13 @@ $ make
        Usage
        -----
        
       -        curl -H 'User-Agent:' -s 'https://twitter.com/namehere' | tscrape
       -
       -or
       -
       -        ftp -o - -U '' 'https://twitter.com/namehere' 2>/dev/null | tscrape
       -
       -or
       -
       -        hurl 'https://twitter.com/namehere' | tscrape
       +* Create the configuration file ~/.tscrape/tscraperc; see tscraperc.example.
       +* Run tscrape_update.
        
        
        Using sfeed to convert the tscrape TSV output to an Atom feed:
        
       -        hurl 'https://twitter.com/namehere' | tscrape | \
       +        tscrape < ~/.tscrape/feeds/name | \
                awk 'BEGIN { OFS = FS = "\t"; }
                {
                        print $1 OFS $4 OFS "https://twitter.com/" $6 "/status/" $5  \
 (DIR) diff --git a/json.c b/json.c
       @@ -0,0 +1,313 @@
       +#include <ctype.h>
       +#include <errno.h>
       +#include <stdint.h>
       +#include <stdio.h>
       +#include <stdlib.h>
       +#include <string.h>
       +
       +#define GETNEXT getchar
       +
       +#include "json.h"
       +
       +static int
       +codepointtoutf8(long r, char *s)
       +{
       +        if (r == 0) {
       +                return 0; /* NUL byte */
       +        } else if (r <= 0x7F) {
       +                /* 1 byte: 0aaaaaaa */
       +                s[0] = r;
       +                return 1;
       +        } else if (r <= 0x07FF) {
       +                /* 2 bytes: 00000aaa aabbbbbb */
       +                s[0] = 0xC0 | ((r & 0x0007C0) >>  6); /* 110aaaaa */
       +                s[1] = 0x80 |  (r & 0x00003F);        /* 10bbbbbb */
       +                return 2;
       +        } else if (r <= 0xFFFF) {
       +                /* 3 bytes: aaaabbbb bbcccccc */
       +                s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
       +                s[1] = 0x80 | ((r & 0x000FC0) >>  6); /* 10bbbbbb */
       +                s[2] = 0x80 |  (r & 0x00003F);        /* 10cccccc */
       +                return 3;
       +        } else {
       +                /* 4 bytes: 000aaabb bbbbcccc ccdddddd */
       +                s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
       +                s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
       +                s[2] = 0x80 | ((r & 0x000FC0) >>  6); /* 10cccccc */
       +                s[3] = 0x80 |  (r & 0x00003F);        /* 10dddddd */
       +                return 4;
       +        }
       +}
       +
       +static int
       +hexdigit(int c)
       +{
       +        if (c >= '0' && c <= '9')
       +                return c - '0';
       +        else if (c >= 'a' && c <= 'f')
       +                return 10 + (c - 'a');
       +        else if (c >= 'A' && c <= 'F')
       +                return 10 + (c - 'A');
       +        return 0;
       +}
       +
       +static int
       +capacity(char **value, size_t *sz, size_t cur, size_t inc)
       +{
       +        size_t need, newsiz;
       +        char *newp;
       +
       +        /* check for addition overflow */
       +        if (cur > SIZE_MAX - inc) {
       +                errno = EOVERFLOW;
       +                return -1;
       +        }
       +        need = cur + inc;
       +
       +        if (need > *sz) {
       +                if (need > SIZE_MAX / 2) {
       +                        newsiz = SIZE_MAX;
       +                } else {
       +                        for (newsiz = *sz < 64 ? 64 : *sz; newsiz <= need; newsiz *= 2)
       +                                ;
       +                }
       +                if (!(newp = realloc(*value, newsiz)))
       +                        return -1; /* up to caller to free *value */
       +                *value = newp;
       +                *sz = newsiz;
       +        }
       +        return 0;
       +}
       +
       +#define EXPECT_VALUE         "{[\"-0123456789tfn"
       +#define EXPECT_STRING        "\""
       +#define EXPECT_END           "}],"
       +#define EXPECT_OBJECT_STRING EXPECT_STRING "}"
       +#define EXPECT_OBJECT_KEY    ":"
       +#define EXPECT_ARRAY_VALUE   EXPECT_VALUE "]"
       +
       +#define JSON_INVALID()       do { ret = JSON_ERROR_INVALID; goto end; } while (0);
       +
       +int
       +parsejson(void (*cb)(struct json_node *, size_t, const char *))
       +{
       +        struct json_node nodes[JSON_MAX_NODE_DEPTH] = { 0 };
       +        size_t depth = 0, p = 0, len, sz = 0;
       +        long cp, hi, lo;
       +        char pri[128], *str = NULL;
       +        int c, i, escape, iskey = 0, ret = JSON_ERROR_MEM;
       +        const char *expect = EXPECT_VALUE;
       +
       +        if (capacity(&(nodes[0].name), &(nodes[0].namesiz), 0, 1) == -1)
       +                goto end;
       +        nodes[0].name[0] = '\0';
       +
       +        while (1) {
       +                c = GETNEXT();
       +handlechr:
       +                if (c == EOF)
       +                        break;
       +
       +                /* skip JSON white-space, (NOTE: no \v, \f, \b etc) */
       +                if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
       +                        continue;
       +
       +                if (!c || !strchr(expect, c))
       +                        JSON_INVALID();
       +
       +                switch (c) {
       +                case ':':
       +                        iskey = 0;
       +                        expect = EXPECT_VALUE;
       +                        break;
       +                case '"':
       +                        nodes[depth].type = JSON_TYPE_STRING;
       +                        escape = 0;
       +                        len = 0;
       +                        while (1) {
       +                                c = GETNEXT();
       +chr:
       +                                /* EOF or control char: 0x7f is not defined as a control char in RFC8259 */
       +                                if (c < 0x20)
       +                                        JSON_INVALID();
       +
       +                                if (escape) {
       +escchr:
       +                                        escape = 0;
       +                                        switch (c) {
       +                                        case '"': /* FALLTHROUGH */
       +                                        case '\\':
       +                                        case '/': break;
       +                                        case 'b': c = '\b'; break;
       +                                        case 'f': c = '\f'; break;
       +                                        case 'n': c = '\n'; break;
       +                                        case 'r': c = '\r'; break;
       +                                        case 't': c = '\t'; break;
       +                                        case 'u': /* hex hex hex hex */
       +                                                if (capacity(&str, &sz, len, 4) == -1)
       +                                                        goto end;
       +                                                for (i = 12, cp = 0; i >= 0; i -= 4) {
       +                                                        if ((c = GETNEXT()) == EOF || !isxdigit(c))
       +                                                                JSON_INVALID(); /* invalid code point */
       +                                                        cp |= (hexdigit(c) << i);
       +                                                }
       +                                                /* RFC8259 - 7. Strings - surrogates.
       +                                                 * 0xd800 - 0xdb7f - high surrogates */
       +                                                if (cp >= 0xd800 && cp <= 0xdb7f) {
       +                                                        if ((c = GETNEXT()) != '\\') {
       +                                                                len += codepointtoutf8(cp, &str[len]);
       +                                                                goto chr;
       +                                                        }
       +                                                        if ((c = GETNEXT()) != 'u') {
       +                                                                len += codepointtoutf8(cp, &str[len]);
       +                                                                goto escchr;
       +                                                        }
       +                                                        for (hi = cp, i = 12, lo = 0; i >= 0; i -= 4) {
       +                                                                if ((c = GETNEXT()) == EOF || !isxdigit(c))
       +                                                                        JSON_INVALID(); /* invalid code point */
       +                                                                lo |= (hexdigit(c) << i);
       +                                                        }
       +                                                        /* 0xdc00 - 0xdfff - low surrogates */
       +                                                        if (lo >= 0xdc00 && lo <= 0xdfff) {
       +                                                                cp = (hi << 10) + lo - 56613888; /* - offset */
       +                                                        } else {
       +                                                                /* handle graceful: raw invalid output bytes */
       +                                                                len += codepointtoutf8(hi, &str[len]);
       +                                                                if (capacity(&str, &sz, len, 4) == -1)
       +                                                                        goto end;
       +                                                                len += codepointtoutf8(lo, &str[len]);
       +                                                                continue;
       +                                                        }
       +                                                }
       +                                                len += codepointtoutf8(cp, &str[len]);
       +                                                continue;
       +                                        default:
       +                                                JSON_INVALID(); /* invalid escape char */
       +                                        }
       +                                        if (capacity(&str, &sz, len, 1) == -1)
       +                                                goto end;
       +                                        str[len++] = c;
       +                                } else if (c == '\\') {
       +                                        escape = 1;
       +                                } else if (c == '"') {
       +                                        if (capacity(&str, &sz, len, 1) == -1)
       +                                                goto end;
       +                                        str[len++] = '\0';
       +
       +                                        if (iskey) {
       +                                                /* copy string as key, including NUL byte */
       +                                                if (capacity(&(nodes[depth].name), &(nodes[depth].namesiz), len, 1) == -1)
       +                                                        goto end;
       +                                                memcpy(nodes[depth].name, str, len);
       +                                        } else {
       +                                                cb(nodes, depth + 1, str);
       +                                        }
       +                                        break;
       +                                } else {
       +                                        if (capacity(&str, &sz, len, 1) == -1)
       +                                                goto end;
       +                                        str[len++] = c;
       +                                }
       +                        }
       +                        if (iskey)
       +                                expect = EXPECT_OBJECT_KEY;
       +                        else
       +                                expect = EXPECT_END;
       +                        break;
       +                case '[':
       +                case '{':
       +                        if (depth + 1 >= JSON_MAX_NODE_DEPTH)
       +                                JSON_INVALID(); /* too deep */
       +
       +                        nodes[depth].index = 0;
       +                        if (c == '[') {
       +                                nodes[depth].type = JSON_TYPE_ARRAY;
       +                                expect = EXPECT_ARRAY_VALUE;
       +                        } else if (c == '{') {
       +                                iskey = 1;
       +                                nodes[depth].type = JSON_TYPE_OBJECT;
       +                                expect = EXPECT_OBJECT_STRING;
       +                        }
       +
       +                        cb(nodes, depth + 1, "");
       +
       +                        depth++;
       +                        nodes[depth].index = 0;
       +                        if (capacity(&(nodes[depth].name), &(nodes[depth].namesiz), 0, 1) == -1)
       +                                goto end;
       +                        nodes[depth].name[0] = '\0';
       +                        break;
       +                case ']':
       +                case '}':
       +                        if (!depth ||
       +                           (c == ']' && nodes[depth - 1].type != JSON_TYPE_ARRAY) ||
       +                           (c == '}' && nodes[depth - 1].type != JSON_TYPE_OBJECT))
       +                                JSON_INVALID(); /* unbalanced nodes */
       +
       +                        nodes[--depth].index++;
       +                        expect = EXPECT_END;
       +                        break;
       +                case ',':
       +                        if (!depth)
       +                                JSON_INVALID(); /* unbalanced nodes */
       +
       +                        nodes[depth - 1].index++;
       +                        if (nodes[depth - 1].type == JSON_TYPE_OBJECT) {
       +                                iskey = 1;
       +                                expect = EXPECT_STRING;
       +                        } else {
       +                                expect = EXPECT_VALUE;
       +                        }
       +                        break;
       +                case 't': /* true */
       +                        if (GETNEXT() != 'r' || GETNEXT() != 'u' || GETNEXT() != 'e')
       +                                JSON_INVALID();
       +                        nodes[depth].type = JSON_TYPE_BOOL;
       +                        cb(nodes, depth + 1, "true");
       +                        expect = EXPECT_END;
       +                        break;
       +                case 'f': /* false */
       +                        if (GETNEXT() != 'a' || GETNEXT() != 'l' || GETNEXT() != 's' ||
       +                            GETNEXT() != 'e')
       +                                JSON_INVALID();
       +                        nodes[depth].type = JSON_TYPE_BOOL;
       +                        cb(nodes, depth + 1, "false");
       +                        expect = EXPECT_END;
       +                        break;
       +                case 'n': /* null */
       +                        if (GETNEXT() != 'u' || GETNEXT() != 'l' || GETNEXT() != 'l')
       +                                JSON_INVALID();
       +                        nodes[depth].type = JSON_TYPE_NULL;
       +                        cb(nodes, depth + 1, "null");
       +                        expect = EXPECT_END;
       +                        break;
       +                default: /* number */
       +                        nodes[depth].type = JSON_TYPE_NUMBER;
       +                        p = 0;
       +                        pri[p++] = c;
       +                        expect = EXPECT_END;
       +                        while (1) {
       +                                c = GETNEXT();
       +                                if (c == EOF ||
       +                                    !c || !strchr("0123456789eE+-.", c) ||
       +                                    p + 1 >= sizeof(pri)) {
       +                                        pri[p] = '\0';
       +                                        cb(nodes, depth + 1, pri);
       +                                        goto handlechr; /* do not read next char, handle this */
       +                                } else {
       +                                        pri[p++] = c;
       +                                }
       +                        }
       +                }
       +        }
       +        if (depth)
       +                JSON_INVALID(); /* unbalanced nodes */
       +
       +        ret = 0; /* success */
       +end:
       +        for (depth = 0; depth < sizeof(nodes) / sizeof(nodes[0]); depth++)
       +                free(nodes[depth].name);
       +        free(str);
       +
       +        return ret;
       +}
 (DIR) diff --git a/json.h b/json.h
       @@ -0,0 +1,26 @@
       +#include <stddef.h>
       +
       +enum JSONType {
       +        JSON_TYPE_ARRAY  = 'a',
       +        JSON_TYPE_OBJECT = 'o',
       +        JSON_TYPE_STRING = 's',
       +        JSON_TYPE_BOOL   = 'b',
       +        JSON_TYPE_NULL   = '?',
       +        JSON_TYPE_NUMBER = 'n'
       +};
       +
       +enum JSONError {
       +        JSON_ERROR_MEM     = -2,
       +        JSON_ERROR_INVALID = -1
       +};
       +
       +#define JSON_MAX_NODE_DEPTH 64
       +
       +struct json_node {
       +        enum JSONType type;
       +        char *name;
       +        size_t namesiz;
       +        size_t index; /* count/index for array or object type */
       +};
       +
       +int parsejson(void (*cb)(struct json_node *, size_t, const char *));
 (DIR) diff --git a/tscrape.c b/tscrape.c
       @@ -2,107 +2,52 @@
        
        #include <ctype.h>
        #include <err.h>
       +#include <stdlib.h>
        #include <stdio.h>
        #include <string.h>
        #include <strings.h>
       +#include <time.h>
        #include <unistd.h>
        
       -#include "xml.h"
       +#include "json.h"
        #include "util.h"
        
        #define STRP(s) s,sizeof(s)-1
        
       -/* states */
       -enum {
       -        Item      = 1,
       -        Stream    = 2,
       -        Header    = 4,
       -        Timestamp = 8,
       -        Text      = 16
       +/* a tweet */
       +struct tweet {
       +        char fullname[1024];
       +        int  ispinned;
       +        char itemusername[1024];
       +        char itemfullname[1024];
       +        char full_text[4096];
       +        char username[1024];
       +        time_t timestamp;
       +        char datatime[16];
       +        char itemid[64];
       +        char retweetid[64];
       +
       +        struct tweet *next;
        };
        
       -/* data */
       -static char fullname[1024];
       -static int  ispinned;
       -static char itemusername[1024];
       -static char itemfullname[1024];
       -static char timestamp[16];
       -static char text[4096];
       -static char username[1024];
       -
       -static char      classname[256];
       -static char      datatime[16];
       -static char      itemid[64];
       -static char      retweetid[64];
       -static int       state;
       -static XMLParser p;
       -
       -static const char *ignorestate, *endtag;
       -static int (*getnext)(void);
       -
       -/* return a space for all data until some case-insensitive string occurs. This
       -   is used to parse incorrect HTML/XML that contains unescaped HTML in script
       -   or style tags. If you see some </script> tag in a CDATA or comment
       -   section then e-mail W3C and tell them the web is too complex. */
       -static inline int
       -getnext_ignore(void)
       -{
       -        int c;
       -
       -        if ((c = getnext()) == EOF)
       -                return EOF;
       +/* url entities and their replacements */
       +struct url {
       +        char url[256];
       +        size_t url_len;
       +        char expanded_url[1024];
        
       -        if (tolower(c) == tolower((unsigned char)*ignorestate)) {
       -                ignorestate++;
       -                if (*ignorestate == '\0') {
       -                        p.getnext = getnext; /* restore */
       -                        return c;
       -                }
       -        } else {
       -                ignorestate = endtag;
       -        }
       -
       -        return ' ';
       -}
       -
       -static void
       -printtweet(void)
       -{
       -        char buf[32];
       -        time_t t;
       -
       -        if (parsetime(timestamp, &t, buf, sizeof(buf)) != -1)
       -                printf("%lld", (long long)t);
       -        putchar('\t');
       -        printescape(username);
       -        putchar('\t');
       -        printescape(fullname);
       -        putchar('\t');
       -        printescape(text);
       -        putchar('\t');
       -        printescape(itemid);
       -        putchar('\t');
       -        printescape(itemusername);
       -        putchar('\t');
       -        printescape(itemfullname);
       -        putchar('\t');
       -        printescape(retweetid);
       -        putchar('\t');
       -        printf("%d", ispinned);
       -        putchar('\n');
       -}
       +        struct url *next;
       +};
        
       -static int
       -isclassmatch(const char *classes, const char *clss, size_t len)
       -{
       -        const char *p;
       +static struct tweet *tweets, *tc;
       +static struct url *urls, *uc;
       +static char url[256];
        
       -        if (!(p = strstr(classes, clss)))
       -                return 0;
       -        return (p == classes || isspace((unsigned char)p[-1])) &&
       -                (isspace((unsigned char)p[len]) || !p[len]);
       -}
       +#define MAX_PINNED 5
       +static char pinnedids[MAX_PINNED][64];
       +static size_t npinned;
        
       +#if 0
        /* convert XML and some HTML entities */
        static int
        html_entitytostr(const char *s, char *buf, size_t bufsiz)
       @@ -115,192 +60,378 @@ html_entitytostr(const char *s, char *buf, size_t bufsiz)
                        return (ssize_t)strlcpy(buf, " ", bufsiz);
                return len;
        }
       +#endif
        
       -static void
       -xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
       +long long
       +datetounix(long long year, int mon, int day, int hour, int min, int sec)
        {
       -        if (!strcmp(t, "p"))
       -                state &= ~Text;
       -        else if (!strcmp(t, "span"))
       -                state &= ~(Timestamp);
       +        static const int secs_through_month[] = {
       +                0, 31 * 86400, 59 * 86400, 90 * 86400,
       +                120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400,
       +                243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 };
       +        int is_leap = 0, cycles, centuries = 0, leaps = 0, rem;
       +        long long t;
       +
       +        if (year - 2ULL <= 136) {
       +                leaps = (year - 68) >> 2;
       +                if (!((year - 68) & 3)) {
       +                        leaps--;
       +                        is_leap = 1;
       +                } else {
       +                        is_leap = 0;
       +                }
       +                t = 31536000 * (year - 70) + 86400 * leaps;
       +        } else {
       +                cycles = (year - 100) / 400;
       +                rem = (year - 100) % 400;
       +                if (rem < 0) {
       +                        cycles--;
       +                        rem += 400;
       +                }
       +                if (!rem) {
       +                        is_leap = 1;
       +                } else {
       +                        if (rem >= 300)
       +                                centuries = 3, rem -= 300;
       +                        else if (rem >= 200)
       +                                centuries = 2, rem -= 200;
       +                        else if (rem >= 100)
       +                                centuries = 1, rem -= 100;
       +                        if (rem) {
       +                                leaps = rem / 4U;
       +                                rem %= 4U;
       +                                is_leap = !rem;
       +                        }
       +                }
       +                leaps += 97 * cycles + 24 * centuries - is_leap;
       +                t = (year - 100) * 31536000LL + leaps * 86400LL + 946684800 + 86400;
       +        }
       +        t += secs_through_month[mon];
       +        if (is_leap && mon >= 2)
       +                t += 86400;
       +        t += 86400LL * (day - 1);
       +        t += 3600LL * hour;
       +        t += 60LL * min;
       +        t += sec;
       +
       +        return t;
        }
        
       -static void
       -xmltagstart(XMLParser *x, const char *t, size_t tl)
       +/* parse time format: "Wed May 27 04:12:34 +0000 2020"
       +   assumes tz offset is "+0000" */
       +static int
       +parsetime(const char *s, time_t *tp)
        {
       -        classname[0] = '\0';
       +        static char *mons[] = {
       +                "Jan", "Feb", "Mar", "Apr", "May", "Jun",
       +                "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
       +        };
       +        int year, mon = 0, mday, hour, min, sec, i;
       +        char tzbuf[6], monbuf[4], wdaybuf[4];
       +
       +        for (; *s && isspace((unsigned char)*s); s++)
       +                ;
       +        i = sscanf(s, "%3s %3s %02d %02d:%02d:%02d %5s %4d",
       +                   wdaybuf, monbuf, &mday, &hour, &min, &sec, tzbuf, &year);
       +        if (i != 8)
       +                return -1;
       +        for (i = 0; i < sizeof(mons) / sizeof(mons[0]); i++) {
       +                if (!strcmp(mons[i], monbuf)) {
       +                        mon = i + 1;
       +                        break;
       +                }
       +        }
       +        if (mon == 0)
       +                return -1;
       +
       +        /* invalid range */
       +        if (year < 0 || year > 9999 ||
       +            mon < 1 || mon > 12 ||
       +            mday < 1 || mday > 31 ||
       +            hour < 0 || hour > 23 ||
       +            min < 0 || min> 59 ||
       +            sec < 0 || sec > 59)
       +                return -1;
       +
       +        if (tp)
       +                *tp = datetounix(year - 1900, mon - 1, mday, hour, min, sec);
       +        return 0;
        }
        
        static void
       -xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
       +printescape(const char *s)
        {
       -        /* temporary replace the callback except the reader and end of tag
       -           restore the context once we receive the same ignored tag in the
       -           end tag handler */
       -        if (!strcasecmp(t, "script")) {
       -                ignorestate = endtag = "</script>";
       -                getnext = x->getnext; /* for restore */
       -                x->getnext = getnext_ignore;
       -                return;
       -        } else if (!strcasecmp(t, "style")) {
       -                ignorestate = endtag = "</style>";
       -                getnext = x->getnext; /* for restore */
       -                x->getnext = getnext_ignore;
       -                return;
       +        for (; *s; s++) {
       +                if (!iscntrl((unsigned char)*s))
       +                        putchar(*s);
                }
       -
       -        if (!strcmp(t, "p") && isclassmatch(classname, STRP("js-tweet-text"))) {
       -                if (state & (Item | Stream | Header))
       -                        state |= Text;
       -        } else if (!strcmp(t, "div") &&
       -                   isclassmatch(classname, STRP("stream-item-footer"))) {
       -                if (text[0] && username[0])
       -                        printtweet();
       -                state = 0;
       -        } else if (!strcmp(t, "li") &&
       -                   isclassmatch(classname, STRP("js-stream-item"))) {
       -                if (state & Item)
       -                        return;
       -                state |= Item;
       -                datatime[0] = text[0] = timestamp[0] = itemfullname[0] = '\0';
       -                itemid[0] = itemusername[0] = retweetid[0] = '\0';
       -                ispinned = 0;
       -                if (isclassmatch(classname, STRP("js-pinned")))
       -                        ispinned = 1;
       -        } else if (state & Item) {
       -                if (!strcmp(t, "div") &&
       -                    isclassmatch(classname, STRP("js-stream-tweet"))) {
       -                        state &= ~(Text|Header);
       -                        state |= Stream;
       -                } else if (!strcmp(t, "a") &&
       -                           isclassmatch(classname, STRP("js-action-profile"))) {
       -                        state |= Header;
       -                } else if (!strcmp(t, "span") &&
       -                          isclassmatch(classname, STRP("js-short-timestamp"))) {
       -                        state |= Timestamp;
       -                        strlcpy(timestamp, datatime, sizeof(timestamp));
       -                        datatime[0] = '\0';
       -                }
       -        }
       -        if ((state & Text) && !strcmp(t, "a") && !isspace((unsigned char)text[0]))
       -                strlcat(text, " ", sizeof(text));
        }
        
       +/* print text and expand urls */
        static void
       -xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
       -        const char *v, size_t vl)
       +printexpand(const char *s)
        {
       -        /* NOTE: assumes classname attribute is set before data-* in current tag */
       -        if (!state && !strcmp(t, "div") && isclassmatch(classname, STRP("user-actions"))) {
       -                if (!strcmp(a, "data-screen-name")) {
       -                        strlcat(username, " ", sizeof(username));
       -                        strlcat(username, v, sizeof(username));
       -                } else if (!strcmp(a, "data-name")) {
       -                        strlcat(fullname, " ", sizeof(fullname));
       -                        strlcat(fullname, v, sizeof(fullname));
       -                }
       -        }
       -
       -        if (!strcmp(a, "class")) {
       -                strlcat(classname, v, sizeof(classname));
       -        } else if (state & Item) {
       -                if (!strcmp(t, "div")) {
       -                        if (!strcmp(a, "data-item-id"))
       -                                strlcpy(itemid, v, sizeof(itemid));
       -                        else if (!strcmp(a, "data-retweet-id"))
       -                                strlcpy(retweetid, v, sizeof(retweetid));
       -
       -                        if (isclassmatch(classname, STRP("js-stream-tweet"))) {
       -                                if (!strcmp(a, "data-screen-name")) {
       -                                        strlcat(itemusername, " ", sizeof(itemusername));
       -                                        strlcat(itemusername, v, sizeof(itemusername));
       -                                } else if (!strcmp(a, "data-name")) {
       -                                        strlcat(itemfullname, " ", sizeof(itemfullname));
       -                                        strlcat(itemfullname, v, sizeof(itemfullname));
       -                                }
       +        struct url *u;
       +
       +        for (; *s; s++) {
       +                if (iscntrl((unsigned char)*s))
       +                        continue;
       +                for (u = urls; u; u = u->next) {
       +                        if (!strncmp(s, u->url, u->url_len)) {
       +                                s += u->url_len;
       +                                printescape(u->expanded_url);
       +                                break;
                                }
       -                } else if (!strcmp(t, "span") && !strcmp(a, "data-time")) {
       -                        /* UNIX timestamp */
       -                        strlcpy(datatime, v, sizeof(datatime));
       -                }
       -                /* NOTE: can be <div data-image-url>. */
       -                if (!strcmp(a, "data-image-url")) {
       -                        strlcat(text, " ", sizeof(text));
       -                        strlcat(text, v, sizeof(text));
       -                }
       -
       -                /* indication it has a video */
       -                if (itemid[0] && !strcmp(a, "data-playable-media-url")) {
       -                        strlcat(text, " ", sizeof(text));
       -                        strlcat(text, "https://twitter.com/i/videos/", sizeof(text));
       -                        strlcat(text, itemid, sizeof(text));
                        }
       +                if (!u)
       +                        putchar(*s);
                }
        }
        
        static void
       -xmlattrentity(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
       -              const char *v, size_t vl)
       +printtweet(struct tweet *t)
        {
       -        char buf[16];
       -        int len;
       +        if (t->timestamp != -1)
       +                printf("%lld", (long long)t->timestamp);
       +        putchar('\t');
       +        printescape(t->username);
       +        putchar('\t');
       +        printescape(t->fullname);
       +        putchar('\t');
       +        printexpand(t->full_text);
       +        putchar('\t');
       +        printescape(t->itemid);
       +        putchar('\t');
       +        if (t->itemusername[0])
       +                printescape(t->itemusername);
       +        else
       +                printescape(t->username);
       +        putchar('\t');
       +        if (t->itemfullname[0])
       +                printescape(t->itemfullname);
       +        else
       +                printescape(t->fullname);
       +        putchar('\t');
       +        printescape(t->retweetid);
       +        putchar('\t');
       +        printf("%d", t->ispinned);
       +        putchar('\n');
       +}
        
       -        if (!state)
       +void
       +addpinned(const char *str)
       +{
       +        if (npinned + 1 >= MAX_PINNED)
                        return;
       -        if ((len = html_entitytostr(v, buf, sizeof(buf))) > 0)
       -                xmlattr(x, t, tl, a, al, buf, (size_t)len);
       -        else
       -                xmlattr(x, t, tl, a, al, v, vl);
       +        strlcpy(pinnedids[npinned], str, sizeof(pinnedids[0]));
       +        npinned++;
        }
        
       -static void
       -xmldata(XMLParser *x, const char *d, size_t dl)
       +void
       +addtweet(void)
        {
       -        if (state & Text) {
       -                if (!isclassmatch(classname, STRP("u-hidden")))
       -                        strlcat(text, d, sizeof(text));
       -        }
       +        struct tweet *t;
       +
       +        if (!(t = calloc(1, sizeof(*t))))
       +                err(1, "calloc");
       +        t->timestamp = -1;
       +        if (tweets)
       +                tc = tc->next = t;
       +        else
       +                tweets = tc = t;
        }
        
       -static void
       -xmldataentity(XMLParser *x, const char *d, size_t dl)
       +void
       +addurl(const char *url, const char *expanded_url)
        {
       -        char buf[16];
       -        int len;
       +        struct url *u;
        
       -        if (!(state & Text))
       -                return;
       -        if ((len = html_entitytostr(d, buf, sizeof(buf))) > 0)
       -                xmldata(x, buf, (size_t)len);
       +        if (!(u = calloc(1, sizeof(*u))))
       +                err(1, "calloc");
       +        strlcpy(u->url, url, sizeof(u->url));
       +        u->url_len = strlen(u->url);
       +        strlcpy(u->expanded_url, expanded_url, sizeof(u->expanded_url));
       +
       +        if (urls)
       +                uc = uc->next = u;
                else
       -                xmldata(x, d, dl);
       +                urls = uc = u;
        }
        
       -static void
       -xmlcdata(XMLParser *x, const char *d, size_t dl)
       +void
       +processnodes(struct json_node *nodes, size_t depth, const char *str)
        {
       -        xmldata(x, d, dl);
       +        if (depth == 2 &&
       +            nodes[0].type == JSON_TYPE_ARRAY &&
       +            nodes[1].type == JSON_TYPE_OBJECT) {
       +                addtweet();
       +        }
       +
       +        if (tc) {
       +                if (depth == 3 &&
       +                    nodes[0].type == JSON_TYPE_ARRAY &&
       +                    nodes[1].type == JSON_TYPE_OBJECT &&
       +                    nodes[2].type == JSON_TYPE_STRING) {
       +                        if (!strcmp(nodes[2].name, "created_at")) {
       +                                parsetime(str, &tc->timestamp);
       +                        } else if (!strcmp(nodes[2].name, "id_str")) {
       +                                strlcpy(tc->itemid, str, sizeof(tc->itemid));
       +                        } else if (!strcmp(nodes[2].name, "full_text")) {
       +                                /* if set by retweet text don't override */
       +                                if (!tc->full_text[0])
       +                                        strlcpy(tc->full_text, str, sizeof(tc->full_text));
       +                        }
       +                }
       +                if (depth == 4 &&
       +                    nodes[0].type == JSON_TYPE_ARRAY &&
       +                    nodes[1].type == JSON_TYPE_OBJECT &&
       +                    nodes[2].type == JSON_TYPE_OBJECT &&
       +                    !strcmp(nodes[2].name, "user")) {
       +                        if (nodes[3].type == JSON_TYPE_STRING) {
       +                                if (!strcmp(nodes[3].name, "name")) {
       +                                        strlcpy(tc->fullname, str, sizeof(tc->fullname));
       +                                } else if (!strcmp(nodes[3].name, "screen_name")) {
       +                                        strlcpy(tc->username, str, sizeof(tc->username));
       +                                }
       +                        }
       +                }
       +
       +                if (depth == 4 &&
       +                    nodes[0].type == JSON_TYPE_ARRAY &&
       +                    nodes[1].type == JSON_TYPE_OBJECT &&
       +                    nodes[2].type == JSON_TYPE_OBJECT &&
       +                    nodes[3].type == JSON_TYPE_STRING &&
       +                    !strcmp(nodes[2].name, "retweeted_status")) {
       +                        if (!strcmp(nodes[3].name, "id_str")) {
       +//                                printf("DEBUG: retweet: id: %s\n", str);
       +                                strlcpy(tc->retweetid, str, sizeof(tc->retweetid));
       +                        } else if (!strcmp(nodes[3].name, "full_text")) {
       +                                strlcpy(tc->full_text, str, sizeof(tc->full_text));
       +//                                printf("DEBUG: retweet: full_text: %s\n", str);
       +                        }
       +                }
       +
       +                if (depth == 5 &&
       +                    nodes[0].type == JSON_TYPE_ARRAY &&
       +                    nodes[1].type == JSON_TYPE_OBJECT &&
       +                    nodes[2].type == JSON_TYPE_OBJECT &&
       +                    nodes[3].type == JSON_TYPE_OBJECT &&
       +                    nodes[4].type == JSON_TYPE_STRING &&
       +                    !strcmp(nodes[2].name, "retweeted_status") &&
       +                    !strcmp(nodes[3].name, "user")) {
       +                        if (!strcmp(nodes[4].name, "name")) {
       +                                strlcpy(tc->itemfullname, str, sizeof(tc->itemfullname));
       +//                                printf("DEBUG: retweeted_status.user.name: %s\n", str);
       +                        } else if (!strcmp(nodes[4].name, "screen_name")) {
       +                                strlcpy(tc->itemusername, str, sizeof(tc->itemusername));
       +//                                printf("DEBUG: retweeted_status.user.screen_name: %s\n", str);
       +                        }
       +                }
       +        }
       +
       +        if (depth == 5 &&
       +            nodes[0].type == JSON_TYPE_ARRAY &&
       +            nodes[1].type == JSON_TYPE_OBJECT &&
       +            nodes[2].type == JSON_TYPE_OBJECT &&
       +            !strcmp(nodes[2].name, "user")) {
       +                if (nodes[3].type == JSON_TYPE_ARRAY &&
       +                    !strcmp(nodes[3].name, "pinned_tweet_ids")) {
       +                        if (nodes[4].type == JSON_TYPE_NUMBER) {
       +                                addpinned(str);
       +//                                printf("DEBUG: pinned_tweets_ids[%zu]: %s\n",
       +//                                        nodes[4].index, str);
       +                        }
       +                }
       +        }
       +
       +        if (depth == 6 &&
       +            nodes[0].type == JSON_TYPE_ARRAY &&
       +            nodes[1].type == JSON_TYPE_OBJECT &&
       +            nodes[2].type == JSON_TYPE_OBJECT &&
       +            nodes[3].type == JSON_TYPE_ARRAY &&
       +            nodes[4].type == JSON_TYPE_OBJECT &&
       +            nodes[5].type == JSON_TYPE_STRING &&
       +            !strcmp(nodes[2].name, "entities") &&
       +            !strcmp(nodes[3].name, "urls")) {
       +                if (!strcmp(nodes[5].name, "url")) {
       +//                        printf("DEBUG: url: %s\n", str);
       +                        strlcpy(url, str, sizeof(url));
       +                } else if (!strcmp(nodes[5].name, "expanded_url")) {
       +//                        printf("DEBUG: expanded_url: %s\n", str);
       +                        /* assumes "expanded_url" is specified after "url" */
       +                        addurl(url, str);
       +                        url[0] = '\0';
       +                }
       +        }
       +
       +        /* [].entities.media[].url */
       +        if (depth == 6 &&
       +            nodes[0].type == JSON_TYPE_ARRAY &&
       +            nodes[1].type == JSON_TYPE_OBJECT &&
       +            nodes[2].type == JSON_TYPE_OBJECT &&
       +            nodes[3].type == JSON_TYPE_ARRAY &&
       +            nodes[4].type == JSON_TYPE_OBJECT &&
       +            nodes[5].type == JSON_TYPE_STRING &&
       +            !strcmp(nodes[2].name, "entities") &&
       +            !strcmp(nodes[3].name, "media")) {
       +                if (!strcmp(nodes[5].name, "url")) {
       +//                        printf("DEBUG: url: %s\n", str);
       +                        strlcpy(url, str, sizeof(url));
       +                } else if (!strcmp(nodes[5].name, "expanded_url")) {
       +//                        printf("DEBUG: expanded_url: %s\n", str);
       +                        /* assumes "expanded_url" is specified after "url" */
       +                        addurl(url, str);
       +                        url[0] = '\0';
       +                }
       +        }
       +
       +// TODO: retweeted_status.entities.urls[]
       +#if 0
       +        if (depth == 6 &&
       +            nodes[0].type == JSON_TYPE_ARRAY &&
       +            nodes[1].type == JSON_TYPE_OBJECT &&
       +            nodes[2].type == JSON_TYPE_OBJECT &&
       +            nodes[3].type == JSON_TYPE_OBJECT &&
       +            nodes[4].type == JSON_TYPE_ARRAY &&
       +            nodes[5].type == JSON_TYPE_STRING &&
       +            !strcmp(nodes[2].name, "retweeted_status") &&
       +            !strcmp(nodes[3].name, "entities") &&
       +            !strcmp(nodes[4].name, "urls")) {
       +                if (!strcmp(nodes[5].name, "url")) {
       +                        printf("DEBUG: url: %s\n", str);
       +                } else if (!strcmp(nodes[5].name, "expanded_url")) {
       +                        printf("DEBUG: expanded_url: %s\n", str);
       +                }
       +        }
       +#endif
        }
        
        int
        main(void)
        {
       +        struct tweet *t;
       +        size_t i;
       +        int r;
       +
                if (pledge("stdio", NULL) == -1)
                        err(1, "pledge");
        
       -        /* handlers */
       -        p.xmlattr           = xmlattr;
       -        p.xmlattrentity     = xmlattrentity;
       -        p.xmlcdata          = xmlcdata;
       -        p.xmldata           = xmldata;
       -        p.xmldataentity     = xmldataentity;
       -        p.xmltagstart       = xmltagstart;
       -        p.xmltagend         = xmltagend;
       -        p.xmltagstartparsed = xmltagstartparsed;
       -        /* reader (stdin) */
       -        p.getnext           = getchar;
       -
       -        xml_parse(&p);
       +        r = parsejson(processnodes);
       +        if (r != 0)
       +                errx(1, "invalid JSON");
       +
       +        // TODO: TEST: make sure the last tweet is printed too (addtweet() logic).
       +        for (t = tweets; t; t = t->next) {
       +                /* check for pinned tweets */
       +                for (i = 0; i < npinned; i++) {
       +                        if (!strcmp(t->itemid, pinnedids[i])) {
       +//                                printf("DEBUG: pinned: %s\n", pinnedids[i]);
       +                                t->ispinned = 1;
       +                                break;
       +                        }
       +                }
       +                printtweet(t);
       +        }
        
                return 0;
        }
 (DIR) diff --git a/tscrape_plain.c b/tscrape_plain.c
       @@ -51,7 +51,7 @@ printfeed(FILE *fp, const char *feedname)
        
                        printutf8pad(stdout, fields[FieldItemFullname], 25, ' ');
                        fputs("  ", stdout);
       -                printescape(fields[FieldText]);
       +                fputs(fields[FieldText], stdout);
                        putchar('\n');
                }
        }
 (DIR) diff --git a/tscrape_update b/tscrape_update
       @@ -9,6 +9,12 @@ tscrapepath="$HOME/.tscrape/feeds"
        # feeds are finished at a time.
        maxjobs=8
        
       +# Twitter authentication bearer (seems to be static).
       +bearer="AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA"
       +
       +# guest token.
       +token=""
       +
        # load config (evaluate shellscript).
        # loadconfig(configfile)
        loadconfig() {
       @@ -36,12 +42,26 @@ log() {
                printf '[%s] %-50.50s %s\n' "$(date +'%H:%M:%S')" "$1" "$2" >&2
        }
        
       +# acquire guest token.
       +# guesttoken()
       +guesttoken() {
       +        # fail on redirects, hide User-Agent, timeout is 15 seconds.
       +        curl -X POST -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
       +                -H "Authorization: Bearer ${bearer}" \
       +                'https://api.twitter.com/1.1/guest/activate.json' 2>/dev/null | \
       +                sed -nE 's@.*{"guest_token":"([^"]*)"}.*@\1@p'
       +}
       +
        # fetch a feed via HTTP/HTTPS etc.
       -# fetch(name, url, feedfile)
       +# fetch(name, twittername, feedfile)
        fetch() {
       +        url="https://api.twitter.com/1.1/statuses/user_timeline.json?screen_name=$2&tweet_mode=extended&count=50&include_rts=1"
       +
                # fail on redirects, hide User-Agent, timeout is 15 seconds.
                curl -L --max-redirs 0 -H "User-Agent:" -f -s -m 15 \
       -                "$2" 2>/dev/null
       +                -H "Authorization: Bearer ${bearer}" \
       +                -H "x-guest-token: $token" \
       +                "${url}" 2>/dev/null
        }
        
        # filter fields.
       @@ -151,6 +171,13 @@ feeds() {
                echo "See tscraperc.example for an example." >&2
        }
        
       +# get guest token.
       +token=$(guesttoken)
       +if [ -z "${token}" ]; then
       +        echo "Failed to acquire guest token" >&2
       +        exit 1
       +fi
       +
        # job counter.
        curjobs=0
        # signal number received for parent.
 (DIR) diff --git a/tscraperc.example b/tscraperc.example
       @@ -2,8 +2,8 @@
        
        # list of feeds to fetch:
        feeds() {
       -        # feed <name> <feedurl>
       -        feed "Rich Felker" "https://twitter.com/richfelker"
       -        feed "Internet of shit" "https://twitter.com/internetofshit"
       -        feed "Donald Trump" "https://twitter.com/realdonaldtrump"
       +        # feed <name> <twittername>
       +        feed "Rich Felker" "richfelker"
       +        feed "Internet of shit" "internetofshit"
       +        feed "Donald Trump" "realdonaldtrump"
        }
 (DIR) diff --git a/util.c b/util.c
       @@ -106,43 +106,3 @@ printutf8pad(FILE *fp, const char *s, size_t len, int pad)
                for (; col < len; ++col)
                        putc(pad, fp);
        }
       -
       -void
       -printescape(const char *s)
       -{
       -        int r;
       -        const char *e;
       -
       -        /* strip leading and trailing white-space */
       -        for (; *s && isspace((unsigned char)*s); s++)
       -                ;
       -        for (e = s + strlen(s); e > s && isspace((unsigned char)*(e - 1)); e--)
       -                ;
       -
       -        for (r = 0; *s && s < e; s++) {
       -                if (iscntrl((unsigned char)*s) || isspace((unsigned char)*s)) {
       -                        r = 1;
       -                        continue;
       -                }
       -                if (r) {
       -                        r = 0;
       -                        putchar(' ');
       -                }
       -                putchar(*s);
       -        }
       -}
       -
       -int
       -parsetime(const char *s, time_t *t, char *buf, size_t bufsiz)
       -{
       -        struct tm *tm;
       -
       -        if (strtotime(s, t))
       -                return -1;
       -        if (!(tm = localtime(t)))
       -                return -1;
       -        if (!strftime(buf, bufsiz, "%Y-%m-%d %H:%M", tm))
       -                return -1;
       -
       -        return 0;
       -}
 (DIR) diff --git a/util.h b/util.h
       @@ -30,8 +30,6 @@ enum {
        };
        
        size_t  parseline(char *, char *[FieldLast]);
       -int     parsetime(const char *, time_t *, char *, size_t);
       -void    printescape(const char *);
        void    printutf8pad(FILE *, const char *, size_t, int);
        int     strtotime(const char *, time_t *);
        void    xmlencode(const char *, FILE *);
 (DIR) diff --git a/xml.c b/xml.c
       @@ -1,451 +0,0 @@
       -#include <ctype.h>
       -#include <errno.h>
       -#include <stdio.h>
       -#include <stdlib.h>
       -#include <string.h>
       -
       -#include "xml.h"
       -
       -static void
       -xml_parseattrs(XMLParser *x)
       -{
       -        size_t namelen = 0, valuelen;
       -        int c, endsep, endname = 0, valuestart = 0;
       -
       -        while ((c = GETNEXT()) != EOF) {
       -                if (isspace(c)) {
       -                        if (namelen)
       -                                endname = 1;
       -                        continue;
       -                } else if (c == '?')
       -                        ; /* ignore */
       -                else if (c == '=') {
       -                        x->name[namelen] = '\0';
       -                        valuestart = 1;
       -                        endname = 1;
       -                } else if (namelen && ((endname && !valuestart && isalpha(c)) || (c == '>' || c == '/'))) {
       -                        /* attribute without value */
       -                        x->name[namelen] = '\0';
       -                        if (x->xmlattrstart)
       -                                x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
       -                        if (x->xmlattr)
       -                                x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
       -                        if (x->xmlattrend)
       -                                x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
       -                        endname = 0;
       -                        x->name[0] = c;
       -                        namelen = 1;
       -                } else if (namelen && valuestart) {
       -                        /* attribute with value */
       -                        if (x->xmlattrstart)
       -                                x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen);
       -
       -                        valuelen = 0;
       -                        if (c == '\'' || c == '"') {
       -                                endsep = c;
       -                        } else {
       -                                endsep = ' '; /* isspace() */
       -                                goto startvalue;
       -                        }
       -
       -                        while ((c = GETNEXT()) != EOF) {
       -startvalue:
       -                                if (c == '&') { /* entities */
       -                                        x->data[valuelen] = '\0';
       -                                        /* call data function with data before entity if there is data */
       -                                        if (valuelen && x->xmlattr)
       -                                                x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
       -                                        x->data[0] = c;
       -                                        valuelen = 1;
       -                                        while ((c = GETNEXT()) != EOF) {
       -                                                if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c))))
       -                                                        break;
       -                                                if (valuelen < sizeof(x->data) - 1)
       -                                                        x->data[valuelen++] = c;
       -                                                else {
       -                                                        /* entity too long for buffer, handle as normal data */
       -                                                        x->data[valuelen] = '\0';
       -                                                        if (x->xmlattr)
       -                                                                x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
       -                                                        x->data[0] = c;
       -                                                        valuelen = 1;
       -                                                        break;
       -                                                }
       -                                                if (c == ';') {
       -                                                        x->data[valuelen] = '\0';
       -                                                        if (x->xmlattrentity)
       -                                                                x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
       -                                                        valuelen = 0;
       -                                                        break;
       -                                                }
       -                                        }
       -                                } else if (c != endsep && !(endsep == ' ' && (c == '>' || isspace(c)))) {
       -                                        if (valuelen < sizeof(x->data) - 1) {
       -                                                x->data[valuelen++] = c;
       -                                        } else {
       -                                                x->data[valuelen] = '\0';
       -                                                if (x->xmlattr)
       -                                                        x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
       -                                                x->data[0] = c;
       -                                                valuelen = 1;
       -                                        }
       -                                }
       -                                if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) {
       -                                        x->data[valuelen] = '\0';
       -                                        if (x->xmlattr)
       -                                                x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
       -                                        if (x->xmlattrend)
       -                                                x->xmlattrend(x, x->tag, x->taglen, x->name, namelen);
       -                                        break;
       -                                }
       -                        }
       -                        namelen = endname = valuestart = 0;
       -                } else if (namelen < sizeof(x->name) - 1) {
       -                        x->name[namelen++] = c;
       -                }
       -                if (c == '>') {
       -                        break;
       -                } else if (c == '/') {
       -                        x->isshorttag = 1;
       -                        x->name[0] = '\0';
       -                        namelen = 0;
       -                }
       -        }
       -}
       -
       -static void
       -xml_parsecomment(XMLParser *x)
       -{
       -        size_t datalen = 0, i = 0;
       -        int c;
       -
       -        if (x->xmlcommentstart)
       -                x->xmlcommentstart(x);
       -        while ((c = GETNEXT()) != EOF) {
       -                if (c == '-' || c == '>') {
       -                        if (x->xmlcomment && datalen) {
       -                                x->data[datalen] = '\0';
       -                                x->xmlcomment(x, x->data, datalen);
       -                                datalen = 0;
       -                        }
       -                }
       -
       -                if (c == '-') {
       -                        if (++i > 2) {
       -                                if (x->xmlcomment)
       -                                        for (; i > 2; i--)
       -                                                x->xmlcomment(x, "-", 1);
       -                                i = 2;
       -                        }
       -                        continue;
       -                } else if (c == '>' && i == 2) {
       -                        if (x->xmlcommentend)
       -                                x->xmlcommentend(x);
       -                        return;
       -                } else if (i) {
       -                        if (x->xmlcomment) {
       -                                for (; i > 0; i--)
       -                                        x->xmlcomment(x, "-", 1);
       -                        }
       -                        i = 0;
       -                }
       -
       -                if (datalen < sizeof(x->data) - 1) {
       -                        x->data[datalen++] = c;
       -                } else {
       -                        x->data[datalen] = '\0';
       -                        if (x->xmlcomment)
       -                                x->xmlcomment(x, x->data, datalen);
       -                        x->data[0] = c;
       -                        datalen = 1;
       -                }
       -        }
       -}
       -
       -static void
       -xml_parsecdata(XMLParser *x)
       -{
       -        size_t datalen = 0, i = 0;
       -        int c;
       -
       -        if (x->xmlcdatastart)
       -                x->xmlcdatastart(x);
       -        while ((c = GETNEXT()) != EOF) {
       -                if (c == ']' || c == '>') {
       -                        if (x->xmlcdata && datalen) {
       -                                x->data[datalen] = '\0';
       -                                x->xmlcdata(x, x->data, datalen);
       -                                datalen = 0;
       -                        }
       -                }
       -
       -                if (c == ']') {
       -                        if (++i > 2) {
       -                                if (x->xmlcdata)
       -                                        for (; i > 2; i--)
       -                                                x->xmlcdata(x, "]", 1);
       -                                i = 2;
       -                        }
       -                        continue;
       -                } else if (c == '>' && i == 2) {
       -                        if (x->xmlcdataend)
       -                                x->xmlcdataend(x);
       -                        return;
       -                } else if (i) {
       -                        if (x->xmlcdata)
       -                                for (; i > 0; i--)
       -                                        x->xmlcdata(x, "]", 1);
       -                        i = 0;
       -                }
       -
       -                if (datalen < sizeof(x->data) - 1) {
       -                        x->data[datalen++] = c;
       -                } else {
       -                        x->data[datalen] = '\0';
       -                        if (x->xmlcdata)
       -                                x->xmlcdata(x, x->data, datalen);
       -                        x->data[0] = c;
       -                        datalen = 1;
       -                }
       -        }
       -}
       -
       -static int
       -codepointtoutf8(long r, char *s)
       -{
       -        if (r == 0) {
       -                return 0; /* NUL byte */
       -        } else if (r <= 0x7F) {
       -                /* 1 byte: 0aaaaaaa */
       -                s[0] = r;
       -                return 1;
       -        } else if (r <= 0x07FF) {
       -                /* 2 bytes: 00000aaa aabbbbbb */
       -                s[0] = 0xC0 | ((r & 0x0007C0) >>  6); /* 110aaaaa */
       -                s[1] = 0x80 |  (r & 0x00003F);        /* 10bbbbbb */
       -                return 2;
       -        } else if (r <= 0xFFFF) {
       -                /* 3 bytes: aaaabbbb bbcccccc */
       -                s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
       -                s[1] = 0x80 | ((r & 0x000FC0) >>  6); /* 10bbbbbb */
       -                s[2] = 0x80 |  (r & 0x00003F);        /* 10cccccc */
       -                return 3;
       -        } else {
       -                /* 4 bytes: 000aaabb bbbbcccc ccdddddd */
       -                s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
       -                s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
       -                s[2] = 0x80 | ((r & 0x000FC0) >>  6); /* 10cccccc */
       -                s[3] = 0x80 |  (r & 0x00003F);        /* 10dddddd */
       -                return 4;
       -        }
       -}
       -
       -static int
       -namedentitytostr(const char *e, char *buf, size_t bufsiz)
       -{
       -        static const struct {
       -                const char *entity;
       -                int c;
       -        } entities[] = {
       -                { "amp;",  '&'  },
       -                { "lt;",   '<'  },
       -                { "gt;",   '>'  },
       -                { "apos;", '\'' },
       -                { "quot;", '"'  },
       -        };
       -        size_t i;
       -
       -        /* buffer is too small */
       -        if (bufsiz < 2)
       -                return -1;
       -
       -        for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
       -                if (!strcmp(e, entities[i].entity)) {
       -                        buf[0] = entities[i].c;
       -                        buf[1] = '\0';
       -                        return 1;
       -                }
       -        }
       -        return -1;
       -}
       -
       -static int
       -numericentitytostr(const char *e, char *buf, size_t bufsiz)
       -{
       -        long l;
       -        int len;
       -        char *end;
       -
       -        /* buffer is too small */
       -        if (bufsiz < 5)
       -                return -1;
       -
       -        errno = 0;
       -        /* hex (16) or decimal (10) */
       -        if (*e == 'x')
       -                l = strtol(++e, &end, 16);
       -        else
       -                l = strtol(e, &end, 10);
       -        /* invalid value or not a well-formed entity or invalid code point */
       -        if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff)
       -                return -1;
       -        len = codepointtoutf8(l, buf);
       -        buf[len] = '\0';
       -
       -        return len;
       -}
       -
       -/* convert named- or numeric entity string to buffer string
       - * returns byte-length of string or -1 on failure. */
       -int
       -xml_entitytostr(const char *e, char *buf, size_t bufsiz)
       -{
       -        /* doesn't start with & */
       -        if (e[0] != '&')
       -                return -1;
       -        /* numeric entity */
       -        if (e[1] == '#')
       -                return numericentitytostr(e + 2, buf, bufsiz);
       -        else /* named entity */
       -                return namedentitytostr(e + 1, buf, bufsiz);
       -}
       -
       -void
       -xml_parse(XMLParser *x)
       -{
       -        size_t datalen, tagdatalen;
       -        int c, isend;
       -
       -        while ((c = GETNEXT()) != EOF && c != '<')
       -                ; /* skip until < */
       -
       -        while (c != EOF) {
       -                if (c == '<') { /* parse tag */
       -                        if ((c = GETNEXT()) == EOF)
       -                                return;
       -
       -                        if (c == '!') { /* cdata and comments */
       -                                for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
       -                                        /* NOTE: sizeof(x->data) must be atleast sizeof("[CDATA[") */
       -                                        if (tagdatalen <= sizeof("[CDATA[") - 1)
       -                                                x->data[tagdatalen++] = c;
       -                                        if (c == '>')
       -                                                break;
       -                                        else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
       -                                                        (x->data[0] == '-')) {
       -                                                xml_parsecomment(x);
       -                                                break;
       -                                        } else if (c == '[') {
       -                                                if (tagdatalen == sizeof("[CDATA[") - 1 &&
       -                                                    !strncmp(x->data, "[CDATA[", tagdatalen)) {
       -                                                        xml_parsecdata(x);
       -                                                        break;
       -                                                }
       -                                        }
       -                                }
       -                        } else {
       -                                /* normal tag (open, short open, close), processing instruction. */
       -                                x->tag[0] = c;
       -                                x->taglen = 1;
       -                                x->isshorttag = isend = 0;
       -
       -                                /* treat processing instruction as shorttag, don't strip "?" prefix. */
       -                                if (c == '?') {
       -                                        x->isshorttag = 1;
       -                                } else if (c == '/') {
       -                                        if ((c = GETNEXT()) == EOF)
       -                                                return;
       -                                        x->tag[0] = c;
       -                                        isend = 1;
       -                                }
       -
       -                                while ((c = GETNEXT()) != EOF) {
       -                                        if (c == '/')
       -                                                x->isshorttag = 1; /* short tag */
       -                                        else if (c == '>' || isspace(c)) {
       -                                                x->tag[x->taglen] = '\0';
       -                                                if (isend) { /* end tag, starts with </ */
       -                                                        if (x->xmltagend)
       -                                                                x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
       -                                                        x->tag[0] = '\0';
       -                                                        x->taglen = 0;
       -                                                } else {
       -                                                        /* start tag */
       -                                                        if (x->xmltagstart)
       -                                                                x->xmltagstart(x, x->tag, x->taglen);
       -                                                        if (isspace(c))
       -                                                                xml_parseattrs(x);
       -                                                        if (x->xmltagstartparsed)
       -                                                                x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
       -                                                }
       -                                                /* call tagend for shortform or processing instruction */
       -                                                if (x->isshorttag) {
       -                                                        if (x->xmltagend)
       -                                                                x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
       -                                                        x->tag[0] = '\0';
       -                                                        x->taglen = 0;
       -                                                }
       -                                                break;
       -                                        } else if (x->taglen < sizeof(x->tag) - 1)
       -                                                x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
       -                                }
       -                        }
       -                } else {
       -                        /* parse tag data */
       -                        datalen = 0;
       -                        if (x->xmldatastart)
       -                                x->xmldatastart(x);
       -                        while ((c = GETNEXT()) != EOF) {
       -                                if (c == '&') {
       -                                        if (datalen) {
       -                                                x->data[datalen] = '\0';
       -                                                if (x->xmldata)
       -                                                        x->xmldata(x, x->data, datalen);
       -                                        }
       -                                        x->data[0] = c;
       -                                        datalen = 1;
       -                                        while ((c = GETNEXT()) != EOF) {
       -                                                if (c == '<')
       -                                                        break;
       -                                                if (datalen < sizeof(x->data) - 1)
       -                                                        x->data[datalen++] = c;
       -                                                else {
       -                                                        /* entity too long for buffer, handle as normal data */
       -                                                        x->data[datalen] = '\0';
       -                                                        if (x->xmldata)
       -                                                                x->xmldata(x, x->data, datalen);
       -                                                        x->data[0] = c;
       -                                                        datalen = 1;
       -                                                        break;
       -                                                }
       -                                                if (c == ';') {
       -                                                        x->data[datalen] = '\0';
       -                                                        if (x->xmldataentity)
       -                                                                x->xmldataentity(x, x->data, datalen);
       -                                                        datalen = 0;
       -                                                        break;
       -                                                }
       -                                        }
       -                                } else if (c != '<') {
       -                                        if (datalen < sizeof(x->data) - 1) {
       -                                                x->data[datalen++] = c;
       -                                        } else {
       -                                                x->data[datalen] = '\0';
       -                                                if (x->xmldata)
       -                                                        x->xmldata(x, x->data, datalen);
       -                                                x->data[0] = c;
       -                                                datalen = 1;
       -                                        }
       -                                }
       -                                if (c == '<') {
       -                                        x->data[datalen] = '\0';
       -                                        if (x->xmldata && datalen)
       -                                                x->xmldata(x, x->data, datalen);
       -                                        if (x->xmldataend)
       -                                                x->xmldataend(x);
       -                                        break;
       -                                }
       -                        }
       -                }
       -        }
       -}
 (DIR) diff --git a/xml.h b/xml.h
       @@ -1,49 +0,0 @@
       -#ifndef _XML_H
       -#define _XML_H
       -
       -#include <stdio.h>
       -
       -typedef struct xmlparser {
       -        /* handlers */
       -        void (*xmlattr)(struct xmlparser *, const char *, size_t,
       -              const char *, size_t, const char *, size_t);
       -        void (*xmlattrend)(struct xmlparser *, const char *, size_t,
       -              const char *, size_t);
       -        void (*xmlattrstart)(struct xmlparser *, const char *, size_t,
       -              const char *, size_t);
       -        void (*xmlattrentity)(struct xmlparser *, const char *, size_t,
       -              const char *, size_t, const char *, size_t);
       -        void (*xmlcdatastart)(struct xmlparser *);
       -        void (*xmlcdata)(struct xmlparser *, const char *, size_t);
       -        void (*xmlcdataend)(struct xmlparser *);
       -        void (*xmlcommentstart)(struct xmlparser *);
       -        void (*xmlcomment)(struct xmlparser *, const char *, size_t);
       -        void (*xmlcommentend)(struct xmlparser *);
       -        void (*xmldata)(struct xmlparser *, const char *, size_t);
       -        void (*xmldataend)(struct xmlparser *);
       -        void (*xmldataentity)(struct xmlparser *, const char *, size_t);
       -        void (*xmldatastart)(struct xmlparser *);
       -        void (*xmltagend)(struct xmlparser *, const char *, size_t, int);
       -        void (*xmltagstart)(struct xmlparser *, const char *, size_t);
       -        void (*xmltagstartparsed)(struct xmlparser *, const char *,
       -              size_t, int);
       -
       -#ifndef GETNEXT
       -        #define GETNEXT (x)->getnext
       -        int (*getnext)(void);
       -#endif
       -
       -        /* current tag */
       -        char tag[1024];
       -        size_t taglen;
       -        /* current tag is in short form ? <tag /> */
       -        int isshorttag;
       -        /* current attribute name */
       -        char name[1024];
       -        /* data buffer used for tag data, cdata and attribute data */
       -        char data[BUFSIZ];
       -} XMLParser;
       -
       -int xml_entitytostr(const char *, char *, size_t);
       -void xml_parse(XMLParser *);
       -#endif