parse own username and fullname from data, add item username and fullname - tscrape - twitter scraper
 (HTM) git clone git://git.codemadness.org/tscrape
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 4640420521e94158d80f94202ed40f7dc4a66169
 (DIR) parent f712b91a8db0fb66f7facf349ea859da07717dc7
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Sat, 12 Aug 2017 17:15:41 +0200
       
       parse own username and fullname from data, add item username and fullname
       
       Diffstat:
         M tscrape.c                           |      60 +++++++++++++++++++-------------
         M util.h                              |       3 ++-
       
       2 files changed, 38 insertions(+), 25 deletions(-)
       ---
 (DIR) diff --git a/tscrape.c b/tscrape.c
       @@ -22,17 +22,17 @@ enum {
                Stream    = 2,
                Header    = 4,
                Timestamp = 8,
       -        Text      = 16,
       -        Fullname  = 32,
       -        Username  = 64
       +        Text      = 16
        };
        
        /* data */
        static char fullname[1024];
       +static int  ispinned;
       +static char itemusername[1024];
       +static char itemfullname[1024];
        static char timestamp[16];
        static char text[4096];
        static char username[1024];
       -static int  ispinned;
        
        static char      classname[256];
        static char      datatime[16];
       @@ -50,13 +50,17 @@ printtweet(void)
                if (parsetime(timestamp, &t, buf, sizeof(buf)) != -1)
                        printf("%lld", (long long)t);
                putchar('\t');
       +        printescape(username);
       +        putchar('\t');
       +        printescape(fullname);
       +        putchar('\t');
                printescape(text);
                putchar('\t');
                printescape(itemid);
                putchar('\t');
       -        printescape(username);
       +        printescape(itemusername);
                putchar('\t');
       -        printescape(fullname);
       +        printescape(itemfullname);
                putchar('\t');
                printescape(retweetid);
                putchar('\t');
       @@ -93,9 +97,7 @@ xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
                if (!strcmp(t, "p"))
                        state &= ~Text;
                else if (!strcmp(t, "span"))
       -                state &= ~(Timestamp|Username);
       -        else if (!strcmp(t, "strong"))
       -                state &= ~Fullname;
       +                state &= ~(Timestamp);
        }
        
        static void
       @@ -118,8 +120,8 @@ xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
                        state = 0;
                } else if (!strcmp(t, "li") && isclassmatch(v, STRP("js-stream-item"))) {
                        state |= Item;
       -                datatime[0] = text[0] = timestamp[0] = fullname[0] = '\0';
       -                itemid[0] = username[0] = retweetid[0] = '\0';
       +                datatime[0] = text[0] = timestamp[0] = itemfullname[0] = '\0';
       +                itemid[0] = itemusername[0] = retweetid[0] = '\0';
                        ispinned = 0;
                        if (isclassmatch(v, STRP("js-pinned")))
                                ispinned = 1;
       @@ -129,14 +131,10 @@ xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
                                state |= Stream;
                        } else if (!strcmp(t, "a") && isclassmatch(v, STRP("js-action-profile"))) {
                                state |= Header;
       -                } else if (!strcmp(t, "strong") && isclassmatch(v, STRP("fullname"))) {
       -                        state |= Fullname;
                        } else if (!strcmp(t, "span") && isclassmatch(v, STRP("js-short-timestamp"))) {
                                state |= Timestamp;
                                strlcpy(timestamp, datatime, sizeof(timestamp));
                                datatime[0] = '\0';
       -                } else if (!strcmp(t, "span") && isclassmatch(v, STRP("username"))) {
       -                        state |= Username;
                        }
                }
                if ((state & Text) && !strcmp(t, "a") && !isspace(text[0]))
       @@ -147,6 +145,17 @@ static void
        xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
                const char *v, size_t vl)
        {
       +        /* NOTE: assumes classname attribute is set before data-* in current tag */
       +        if (!state && !strcmp(t, "div") && isclassmatch(classname, STRP("user-actions"))) {
       +                if (!strcmp(a, "data-screen-name")) {
       +                        strlcat(username, " ", sizeof(username));
       +                        strlcat(username, v, sizeof(username));
       +                } else if (!strcmp(a, "data-name")) {
       +                        strlcat(fullname, " ", sizeof(fullname));
       +                        strlcat(fullname, v, sizeof(fullname));
       +                }
       +        }
       +
                if (!strcmp(a, "class")) {
                        strlcat(classname, v, sizeof(classname));
                } else if (state & Item) {
       @@ -155,6 +164,16 @@ xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
                                        strlcpy(itemid, v, sizeof(itemid));
                                else if (!strcmp(a, "data-retweet-id"))
                                        strlcpy(retweetid, v, sizeof(retweetid));
       +
       +                        if (isclassmatch(classname, STRP("js-stream-tweet"))) {
       +                                if (!strcmp(a, "data-screen-name")) {
       +                                        strlcat(itemusername, " ", sizeof(itemusername));
       +                                        strlcat(itemusername, v, sizeof(itemusername));
       +                                } else if (!strcmp(a, "data-name")) {
       +                                        strlcat(itemfullname, " ", sizeof(itemfullname));
       +                                        strlcat(itemfullname, v, sizeof(itemfullname));
       +                                }
       +                        }
                        } else if (!strcmp(t, "span") && !strcmp(a, "data-time")) {
                                /* UNIX timestamp */
                                strlcpy(datatime, v, sizeof(datatime));
       @@ -183,14 +202,7 @@ xmlattrentity(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
        static void
        xmldata(XMLParser *x, const char *d, size_t dl)
        {
       -        if (state & Username) {
       -                if (d[0] == '@')
       -                        strlcat(username, " ", sizeof(username));
       -                strlcat(username, d, sizeof(username));
       -        } else if (state & Fullname) {
       -                strlcat(fullname, " ", sizeof(fullname));
       -                strlcat(fullname, d, sizeof(fullname));
       -        } else if (state & Text) {
       +        if (state & Text) {
                        if (!isclassmatch(classname, STRP("u-hidden")))
                                strlcat(text, d, sizeof(text));
                }
       @@ -202,7 +214,7 @@ xmldataentity(XMLParser *x, const char *d, size_t dl)
                char buf[16];
                ssize_t len;
        
       -        if (!(state & (Text|Username|Fullname)))
       +        if (!(state & Text))
                        return;
                if ((len = html_entitytostr(d, buf, sizeof(buf))) > 0)
                        xmldata(x, buf, (size_t)len);
 (DIR) diff --git a/util.h b/util.h
       @@ -24,8 +24,9 @@ struct feed {
        
        enum {
                FieldUnixTimestamp = 0,
       -        FieldText, FieldItemid,
                FieldUsername, FieldFullname,
       +        FieldText, FieldItemid,
       +        FieldItemUsername, FieldItemFullname,
                FieldRetweetid, FieldIspinned,
                FieldLast
        };