separate parsing and formatting like sfeed - tscrape - twitter scraper
 (HTM) git clone git://git.codemadness.org/tscrape
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit f712b91a8db0fb66f7facf349ea859da07717dc7
 (DIR) parent f0b8be83a871c59f1bd9a99f16bf20ce9df57c22
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Sat, 12 Aug 2017 12:52:23 +0200
       
       separate parsing and formatting like sfeed
       
       - remove formatted timestamp field.
       - add tscrape_plain
       
       Diffstat:
         M Makefile                            |       6 ++++--
         M tscrape.c                           |      76 +------------------------------
         A tscrape_plain.c                     |      91 +++++++++++++++++++++++++++++++
         A util.c                              |     146 +++++++++++++++++++++++++++++++
         A util.h                              |      38 +++++++++++++++++++++++++++++++
       
       5 files changed, 281 insertions(+), 76 deletions(-)
       ---
 (DIR) diff --git a/Makefile b/Makefile
       @@ -3,14 +3,16 @@ include config.mk
        NAME = tscrape
        VERSION = 0.1
        BIN = \
       -        tscrape
       +        tscrape\
       +        tscrape_plain
        
        SRC = ${BIN:=.c}
        
        LIBUTIL = libutil.a
        LIBUTILSRC = \
                strlcat.c\
       -        strlcpy.c
       +        strlcpy.c\
       +        util.c
        LIBUTILOBJ = ${LIBUTILSRC:.c=.o}
        
        LIBXML = libxml.a
 (DIR) diff --git a/tscrape.c b/tscrape.c
       @@ -11,11 +11,8 @@
        #include <time.h>
        #include <unistd.h>
        
       -#ifndef USE_PLEDGE
       -#define pledge(p1,p2) 0
       -#endif
       -
        #include "xml.h"
       +#include "util.h"
        
        #define STRP(s) s,sizeof(s)-1
        
       @@ -30,13 +27,6 @@ enum {
                Username  = 64
        };
        
       -/* for compatibility with libc's that don't have strlcat or strlcpy. The
       - * functions are synced from OpenBSD */
       -#undef strlcat
       -size_t strlcat(char *, const char *, size_t);
       -#undef strlcpy
       -size_t strlcpy(char *, const char *, size_t);
       -
        /* data */
        static char fullname[1024];
        static char timestamp[16];
       @@ -52,75 +42,13 @@ static int       state;
        static XMLParser p;
        
        static void
       -printescape(const char *s)
       -{
       -        size_t i;
       -        const char *e;
       -
       -        /* strip leading and trailing white-space */
       -        for (; *s && isspace(*s); s++)
       -                ;
       -        for (e = s + strlen(s); e > s && isspace(*(e - 1)); e--)
       -                ;
       -
       -        for (i = 0; *s && s < e; s++) {
       -                if (iscntrl(*s) || isspace(*s)) {
       -                        i++;
       -                        continue;
       -                }
       -                if (i) {
       -                        i = 0;
       -                        putchar(' ');
       -                }
       -                putchar(*s);
       -        }
       -}
       -
       -/* Parse time to time_t, assumes time_t is signed. */
       -int
       -strtotime(const char *s, time_t *t)
       -{
       -        long long l;
       -        char *e;
       -
       -        errno = 0;
       -        l = strtoll(s, &e, 10);
       -        if (*s == '\0' || *e != '\0')
       -                return -1;
       -        if (t)
       -                *t = (time_t)l;
       -
       -        return 0;
       -}
       -
       -static int
       -parsetime(const char *s, time_t *t, char *buf, size_t bufsiz)
       -{
       -        struct tm *tm;
       -
       -        if (strtotime(s, t))
       -                return -1;
       -        if (!(tm = localtime(t)))
       -                return -1;
       -        if (!strftime(buf, bufsiz, "%Y-%m-%d %H:%M", tm))
       -                return -1;
       -
       -        return 0;
       -}
       -
       -static void
        printtweet(void)
        {
                char buf[32];
                time_t t;
        
       -        if (parsetime(timestamp, &t, buf, sizeof(buf)) != -1) {
       +        if (parsetime(timestamp, &t, buf, sizeof(buf)) != -1)
                        printf("%lld", (long long)t);
       -                putchar('\t');
       -                fputs(buf, stdout);
       -        } else {
       -                putchar('\t');
       -        }
                putchar('\t');
                printescape(text);
                putchar('\t');
 (DIR) diff --git a/tscrape_plain.c b/tscrape_plain.c
       @@ -0,0 +1,91 @@
       +#include <ctype.h>
       +#include <err.h>
       +#include <locale.h>
       +#include <stdio.h>
       +#include <stdlib.h>
       +#include <string.h>
       +#include <time.h>
       +
       +#include "util.h"
       +
       +static time_t comparetime;
       +static char *line;
       +static size_t linesize;
       +
       +static void
       +printfeed(FILE *fp, const char *feedname)
       +{
       +        char *fields[FieldLast];
       +        struct tm *tm;
       +        time_t parsedtime;
       +        ssize_t linelen;
       +
       +        while ((linelen = getline(&line, &linesize, fp)) > 0) {
       +                if (line[linelen - 1] == '\n')
       +                        line[--linelen] = '\0';
       +                if (!parseline(line, fields))
       +                        break;
       +
       +                parsedtime = 0;
       +                strtotime(fields[FieldUnixTimestamp], &parsedtime);
       +                if (!(tm = localtime(&parsedtime)))
       +                        err(1, "localtime");
       +
       +                if (parsedtime >= comparetime)
       +                        putchar('N');
       +                else
       +                        putchar(' ');
       +                if (fields[FieldRetweetid][0])
       +                        putchar('R');
       +                else
       +                        putchar(' ');
       +                putchar(' ');
       +
       +                if (feedname[0])
       +                        printf("%-15.15s  ", feedname);
       +
       +                fprintf(stdout, "%04d-%02d-%02d %02d:%02d  ",
       +                        tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday,
       +                        tm->tm_hour, tm->tm_min);
       +
       +                printutf8pad(stdout, fields[FieldFullname], 25, ' ');
       +                printescape(fields[FieldText]);
       +                putchar('\n');
       +        }
       +}
       +
       +int
       +main(int argc, char *argv[])
       +{
       +        FILE *fp;
       +        char *name;
       +        int i;
       +
       +        if (pledge("stdio rpath", NULL) == -1)
       +                err(1, "pledge");
       +
       +        setlocale(LC_CTYPE, "");
       +
       +        if (pledge(argc == 1 ? "stdio" : "stdio rpath", NULL) == -1)
       +                err(1, "pledge");
       +
       +        if ((comparetime = time(NULL)) == -1)
       +                err(1, "time");
       +        /* 1 day is old news */
       +        comparetime -= 86400;
       +
       +        if (argc == 1) {
       +                printfeed(stdin, "");
       +        } else {
       +                for (i = 1; i < argc; i++) {
       +                        if (!(fp = fopen(argv[i], "r")))
       +                                err(1, "fopen: %s", argv[i]);
       +                        name = ((name = strrchr(argv[i], '/'))) ? name + 1 : argv[i];
       +                        printfeed(fp, name);
       +                        if (ferror(fp))
       +                                err(1, "ferror: %s", argv[i]);
       +                        fclose(fp);
       +                }
       +        }
       +        return 0;
       +}
 (DIR) diff --git a/util.c b/util.c
       @@ -0,0 +1,146 @@
       +#include <sys/types.h>
       +
       +#include <ctype.h>
       +#include <err.h>
       +#include <errno.h>
       +#include <limits.h>
       +#include <stdarg.h>
       +#include <stdio.h>
       +#include <stdint.h>
       +#include <stdlib.h>
       +#include <string.h>
       +#include <time.h>
       +#include <wchar.h>
       +
       +#include "util.h"
       +
       +/* Split the NUL-terminated string 'line' in-place into TAB-separated fields:
       + * the first FieldLast - 1 TABs are overwritten with a NUL byte.
       + * 'fields' is a list of pointers with a size of FieldLast (must be >0);
       + * the last field keeps the remainder of the line, including any further
       + * TABs, and fields missing from the line point to an empty string.
       + * returns: always FieldLast, missing fields are included in the count. */
       +size_t
       +parseline(char *line, char *fields[FieldLast])
       +{
       +        char *prev, *s;
       +        size_t i;
       +
       +        for (prev = line, i = 0;
       +            (s = strchr(prev, '\t')) && i < FieldLast - 1;
       +            i++) {
       +                *s = '\0';
       +                fields[i] = prev;
       +                prev = s + 1;
       +        }
       +        fields[i++] = prev;
       +        /* make non-parsed fields empty. */
       +        for (; i < FieldLast; i++)
       +                fields[i] = "";
       +
       +        return i;
       +}
       +
       +/* Parse time to time_t, assumes time_t is signed, rejects fractions. */
       +int
       +strtotime(const char *s, time_t *t)
       +{
       +        long long l;
       +        char *e;
       +
       +        errno = 0;
       +        l = strtoll(s, &e, 10);
       +        if (errno || *s == '\0' || *e)
       +                return -1;
       +        /* NOTE: assumes time_t is 64-bit on 64-bit platforms:
       +                 long long (at least 64-bit) is converted to time_t. */
       +        if (t)
       +                *t = (time_t)l;
       +
       +        return 0;
       +}
       +
       +/* Escape characters below as HTML 2.0 / XML 1.0. */
       +void
       +xmlencode(const char *s, FILE *fp)
       +{
       +        for (; *s; s++) {
       +                switch(*s) {
       +                case '<':  fputs("&lt;",   fp); break;
       +                case '>':  fputs("&gt;",   fp); break;
       +                case '\'': fputs("&#39;",  fp); break;
       +                case '&':  fputs("&amp;",  fp); break;
       +                case '"':  fputs("&quot;", fp); break;
       +                default:   fputc(*s, fp);
       +                }
       +        }
       +}
       +
       +/* Print `len' columns of characters. If the string is shorter, pad the rest
       + * with character `pad'; if it is longer, truncate it with an ellipsis. */
       +void
       +printutf8pad(FILE *fp, const char *s, size_t len, int pad)
       +{
       +        wchar_t w;
       +        size_t col = 0, i, slen;
       +        int rl, wc;
       +
       +        if (!len)
       +                return;
       +
       +        slen = strlen(s);
       +        for (i = 0; i < slen && col < len + 1; i += rl) {
       +                if ((rl = mbtowc(&w, &s[i], slen - i < 4 ? slen - i : 4)) <= 0)
       +                        break;
       +                if ((wc = wcwidth(w)) == -1)
       +                        wc = 1;
       +                col += (size_t)wc;
       +                if (col >= len && s[i + rl]) {
       +                        fputs("\xe2\x80\xa6", fp);
       +                        break;
       +                }
       +                fwrite(&s[i], 1, rl, fp);
       +        }
       +        for (; col < len; col++)
       +                putc(pad, fp);
       +}
       +
       +void
       +printescape(const char *s)
       +{
       +        size_t i;
       +        const char *e;
       +
       +        /* strip leading and trailing white-space */
       +        for (; *s && isspace(*s); s++)
       +                ;
       +        for (e = s + strlen(s); e > s && isspace(*(e - 1)); e--)
       +                ;
       +
       +        for (i = 0; *s && s < e; s++) {
       +                if (iscntrl(*s) || isspace(*s)) {
       +                        i++;
       +                        continue;
       +                }
       +                if (i) {
       +                        i = 0;
       +                        putchar(' ');
       +                }
       +                putchar(*s);
       +        }
       +}
       +
       +int
       +parsetime(const char *s, time_t *t, char *buf, size_t bufsiz)
       +{
       +        struct tm *tm;
       +
       +        if (strtotime(s, t))
       +                return -1;
       +        if (!(tm = localtime(t)))
       +                return -1;
       +        if (!strftime(buf, bufsiz, "%Y-%m-%d %H:%M", tm))
       +                return -1;
       +
       +        return 0;
       +}
 (DIR) diff --git a/util.h b/util.h
       @@ -0,0 +1,38 @@
       +#include <stdint.h>
       +#include <time.h>
       +#ifdef USE_PLEDGE
       +#include <unistd.h>
       +#else
       +#define pledge(p1,p2) 0
       +#endif
       +
       +#undef strlcat
       +size_t strlcat(char *, const char *, size_t);
       +#undef strlcpy
       +size_t strlcpy(char *, const char *, size_t);
       +
       +#define ISUTF8(c) (((c) & 0xc0) != 0x80)
       +
       +/* feed info */
       +struct feed {
       +        char *        name;     /* feed name */
       +        unsigned long totalnew; /* amount of new items per feed */
       +        unsigned long total;    /* total items */
       +        time_t        timenewest;
       +        char          timenewestformat[64];
       +};
       +
       +enum {
       +        FieldUnixTimestamp = 0,
       +        FieldText, FieldItemid,
       +        FieldUsername, FieldFullname,
       +        FieldRetweetid, FieldIspinned,
       +        FieldLast
       +};
       +
       +size_t  parseline(char *, char *[FieldLast]);
       +int     parsetime(const char *, time_t *, char *, size_t);
       +void    printescape(const char *);
       +void    printutf8pad(FILE *, const char *, size_t, int);
       +int     strtotime(const char *, time_t *);
       +void    xmlencode(const char *, FILE *);