separate parsing and formatting like sfeed - tscrape - twitter scraper
(HTM) git clone git://git.codemadness.org/tscrape
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit f712b91a8db0fb66f7facf349ea859da07717dc7
(DIR) parent f0b8be83a871c59f1bd9a99f16bf20ce9df57c22
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Sat, 12 Aug 2017 12:52:23 +0200
separate parsing and formatting like sfeed
- remove formatted timestamp field.
- add tscrape_plain
Diffstat:
M Makefile | 6 ++++--
M tscrape.c | 76 +------------------------------
A tscrape_plain.c | 91 +++++++++++++++++++++++++++++++
A util.c | 146 +++++++++++++++++++++++++++++++
A util.h | 38 +++++++++++++++++++++++++++++++
5 files changed, 281 insertions(+), 76 deletions(-)
---
(DIR) diff --git a/Makefile b/Makefile
@@ -3,14 +3,16 @@ include config.mk
NAME = tscrape
VERSION = 0.1
BIN = \
- tscrape
+ tscrape\
+ tscrape_plain
SRC = ${BIN:=.c}
LIBUTIL = libutil.a
LIBUTILSRC = \
strlcat.c\
- strlcpy.c
+ strlcpy.c\
+ util.c
LIBUTILOBJ = ${LIBUTILSRC:.c=.o}
LIBXML = libxml.a
(DIR) diff --git a/tscrape.c b/tscrape.c
@@ -11,11 +11,8 @@
#include <time.h>
#include <unistd.h>
-#ifndef USE_PLEDGE
-#define pledge(p1,p2) 0
-#endif
-
#include "xml.h"
+#include "util.h"
#define STRP(s) s,sizeof(s)-1
@@ -30,13 +27,6 @@ enum {
Username = 64
};
-/* for compatibility with libc's that don't have strlcat or strlcpy. The
- * functions are synced from OpenBSD */
-#undef strlcat
-size_t strlcat(char *, const char *, size_t);
-#undef strlcpy
-size_t strlcpy(char *, const char *, size_t);
-
/* data */
static char fullname[1024];
static char timestamp[16];
@@ -52,75 +42,13 @@ static int state;
static XMLParser p;
static void
-printescape(const char *s)
-{
- size_t i;
- const char *e;
-
- /* strip leading and trailing white-space */
- for (; *s && isspace(*s); s++)
- ;
- for (e = s + strlen(s); e > s && isspace(*(e - 1)); e--)
- ;
-
- for (i = 0; *s && s < e; s++) {
- if (iscntrl(*s) || isspace(*s)) {
- i++;
- continue;
- }
- if (i) {
- i = 0;
- putchar(' ');
- }
- putchar(*s);
- }
-}
-
-/* Parse time to time_t, assumes time_t is signed. */
-int
-strtotime(const char *s, time_t *t)
-{
- long long l;
- char *e;
-
- errno = 0;
- l = strtoll(s, &e, 10);
- if (*s == '\0' || *e != '\0')
- return -1;
- if (t)
- *t = (time_t)l;
-
- return 0;
-}
-
-static int
-parsetime(const char *s, time_t *t, char *buf, size_t bufsiz)
-{
- struct tm *tm;
-
- if (strtotime(s, t))
- return -1;
- if (!(tm = localtime(t)))
- return -1;
- if (!strftime(buf, bufsiz, "%Y-%m-%d %H:%M", tm))
- return -1;
-
- return 0;
-}
-
-static void
printtweet(void)
{
char buf[32];
time_t t;
- if (parsetime(timestamp, &t, buf, sizeof(buf)) != -1) {
+ if (parsetime(timestamp, &t, buf, sizeof(buf)) != -1)
printf("%lld", (long long)t);
- putchar('\t');
- fputs(buf, stdout);
- } else {
- putchar('\t');
- }
putchar('\t');
printescape(text);
putchar('\t');
(DIR) diff --git a/tscrape_plain.c b/tscrape_plain.c
@@ -0,0 +1,91 @@
+#include <ctype.h>
+#include <err.h>
+#include <locale.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "util.h"
+
+static time_t comparetime;
+static char *line;
+static size_t linesize;
+
+static void
+printfeed(FILE *fp, const char *feedname)
+{
+ char *fields[FieldLast];
+ struct tm *tm;
+ time_t parsedtime;
+ ssize_t linelen;
+
+ while ((linelen = getline(&line, &linesize, fp)) > 0) {
+ if (line[linelen - 1] == '\n')
+ line[--linelen] = '\0';
+ if (!parseline(line, fields))
+ break;
+
+ parsedtime = 0;
+ strtotime(fields[FieldUnixTimestamp], &parsedtime);
+ if (!(tm = localtime(&parsedtime)))
+ err(1, "localtime");
+
+ if (parsedtime >= comparetime)
+ putchar('N');
+ else
+ putchar(' ');
+ if (fields[FieldRetweetid][0])
+ putchar('R');
+ else
+ putchar(' ');
+ putchar(' ');
+
+ if (feedname[0])
+ printf("%-15.15s ", feedname);
+
+ fprintf(stdout, "%04d-%02d-%02d %02d:%02d ",
+ tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday,
+ tm->tm_hour, tm->tm_min);
+
+ printutf8pad(stdout, fields[FieldFullname], 25, ' ');
+ printescape(fields[FieldText]);
+ putchar('\n');
+ }
+}
+
+int
+main(int argc, char *argv[])
+{
+ FILE *fp;
+ char *name;
+ int i;
+
+ if (pledge("stdio rpath", NULL) == -1)
+ err(1, "pledge");
+
+ setlocale(LC_CTYPE, "");
+
+ if (pledge(argc == 1 ? "stdio" : "stdio rpath", NULL) == -1)
+ err(1, "pledge");
+
+ if ((comparetime = time(NULL)) == -1)
+ err(1, "time");
+ /* 1 day is old news */
+ comparetime -= 86400;
+
+ if (argc == 1) {
+ printfeed(stdin, "");
+ } else {
+ for (i = 1; i < argc; i++) {
+ if (!(fp = fopen(argv[i], "r")))
+ err(1, "fopen: %s", argv[i]);
+ name = ((name = strrchr(argv[i], '/'))) ? name + 1 : argv[i];
+ printfeed(fp, name);
+ if (ferror(fp))
+ err(1, "ferror: %s", argv[i]);
+ fclose(fp);
+ }
+ }
+ return 0;
+}
(DIR) diff --git a/util.c b/util.c
@@ -0,0 +1,146 @@
+#include <sys/types.h>
+
+#include <ctype.h>
+#include <err.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <wchar.h>
+
+#include "util.h"
+
+/* Split the tab-separated 'line' in-place: each TAB is replaced by a NUL
+ * byte and 'fields' is filled with pointers to the start of each field.
+ * 'fields' is a list of pointers with a size of FieldLast (must be >0).
+ * At most FieldLast fields are split; any tabs left over stay in the last
+ * field. Fields missing from the line are set to the empty string "".
+ * returns: FieldLast, since every field is always assigned. */
+size_t
+parseline(char *line, char *fields[FieldLast])
+{
+ char *prev, *s;
+ size_t i;
+
+ for (prev = line, i = 0;
+ (s = strchr(prev, '\t')) && i < FieldLast - 1;
+ i++) {
+ *s = '\0';
+ fields[i] = prev;
+ prev = s + 1;
+ }
+ fields[i++] = prev;
+ /* make non-parsed fields empty. */
+ for (; i < FieldLast; i++)
+ fields[i] = "";
+
+ return i;
+}
+
+/* Parse time to time_t, assumes time_t is signed, ignores fractions. */
+int
+strtotime(const char *s, time_t *t)
+{
+ long long l;
+ char *e;
+
+ errno = 0;
+ l = strtoll(s, &e, 10);
+ if (errno || *s == '\0' || *e)
+ return -1;
+ /* NOTE: assumes time_t is 64-bit on 64-bit platforms:
+ long long (at least 32-bit) to time_t. */
+ if (t)
+ *t = (time_t)l;
+
+ return 0;
+}
+
+/* Escape characters below as HTML 2.0 / XML 1.0. */
+void
+xmlencode(const char *s, FILE *fp)
+{
+ for (; *s; s++) {
+ switch(*s) {
+ case '<': fputs("&lt;", fp); break;
+ case '>': fputs("&gt;", fp); break;
+ case '\'': fputs("&#39;", fp); break;
+ case '&': fputs("&amp;", fp); break;
+ case '"': fputs("&quot;", fp); break;
+ default: fputc(*s, fp);
+ }
+ }
+}
+
+/* Print `len' columns of characters. A shorter string is padded with the
+ * character `pad'; a longer one is cut off and ended with an ellipsis. */
+void
+printutf8pad(FILE *fp, const char *s, size_t len, int pad)
+{
+ wchar_t w;
+ size_t col = 0, i, slen;
+ int rl, wc;
+
+ if (!len)
+ return;
+
+ slen = strlen(s);
+ for (i = 0; i < slen && col < len + 1; i += rl) {
+ if ((rl = mbtowc(&w, &s[i], slen - i < 4 ? slen - i : 4)) <= 0)
+ break;
+ if ((wc = wcwidth(w)) == -1)
+ wc = 1;
+ col += (size_t)wc;
+ if (col >= len && s[i + rl]) {
+ fputs("\xe2\x80\xa6", fp);
+ break;
+ }
+ fwrite(&s[i], 1, rl, fp);
+ }
+ for (; col < len; col++)
+ putc(pad, fp);
+}
+
+void
+printescape(const char *s)
+{
+ size_t i;
+ const char *e;
+
+ /* strip leading and trailing white-space */
+ for (; *s && isspace(*s); s++)
+ ;
+ for (e = s + strlen(s); e > s && isspace(*(e - 1)); e--)
+ ;
+
+ for (i = 0; *s && s < e; s++) {
+ if (iscntrl(*s) || isspace(*s)) {
+ i++;
+ continue;
+ }
+ if (i) {
+ i = 0;
+ putchar(' ');
+ }
+ putchar(*s);
+ }
+}
+
+int
+parsetime(const char *s, time_t *t, char *buf, size_t bufsiz)
+{
+ struct tm *tm;
+
+ if (strtotime(s, t))
+ return -1;
+ if (!(tm = localtime(t)))
+ return -1;
+ if (!strftime(buf, bufsiz, "%Y-%m-%d %H:%M", tm))
+ return -1;
+
+ return 0;
+}
(DIR) diff --git a/util.h b/util.h
@@ -0,0 +1,38 @@
+#include <stdint.h>
+#include <time.h>
+#ifdef USE_PLEDGE
+#include <unistd.h>
+#else
+#define pledge(p1,p2) 0
+#endif
+
+#undef strlcat
+size_t strlcat(char *, const char *, size_t);
+#undef strlcpy
+size_t strlcpy(char *, const char *, size_t);
+
+#define ISUTF8(c) (((c) & 0xc0) != 0x80)
+
+/* feed info */
+struct feed {
+ char * name; /* feed name */
+ unsigned long totalnew; /* amount of new items per feed */
+ unsigned long total; /* total items */
+ time_t timenewest;
+ char timenewestformat[64];
+};
+
+enum {
+ FieldUnixTimestamp = 0,
+ FieldText, FieldItemid,
+ FieldUsername, FieldFullname,
+ FieldRetweetid, FieldIspinned,
+ FieldLast
+};
+
+size_t parseline(char *, char *[FieldLast]);
+int parsetime(const char *, time_t *, char *, size_t);
+void printescape(const char *);
+void printutf8pad(FILE *, const char *, size_t, int);
+int strtotime(const char *, time_t *);
+void xmlencode(const char *, FILE *);