jf2sfeed.c - jfconvert - JSON Feed (subset) to sfeed or Atom converter
 (HTM) git clone git://git.codemadness.org/jfconvert
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       jf2sfeed.c (14438B)
       ---
            1 #include <errno.h>
            2 #include <stdarg.h>
            3 #include <stdint.h>
            4 #include <stdio.h>
            5 #include <stdlib.h>
            6 #include <string.h>
            7 
            8 #ifdef __OpenBSD__
            9 #include <unistd.h>
           10 #else
           11 #define pledge(a,b) 0
           12 #endif
           13 
           14 #include "json.h"
           15 
           16 /* hint for compilers and static analyzers that a function exits */
           17 #ifndef __dead
           18 #define __dead
           19 #endif
           20 
           21 /* ctype-like macros, but always compatible with ASCII / UTF-8 */
           22 #define ISALPHA(c) ((((unsigned)c) | 32) - 'a' < 26)
           23 #define ISCNTRL(c) ((c) < ' ' || (c) == 0x7f)
           24 #define ISDIGIT(c) (((unsigned)c) - '0' < 10)
           25 #define ISSPACE(c) ((c) == ' ' || ((((unsigned)c) - '\t') < 5))
           26 
           27 /* compare attributes case-sensitively */
           28 #define attrcmp strcmp
           29 
/* Indices into the `fields` array: one slot per value collected for the
 * current feed item. FeedFieldLast is not a field itself, it is the number
 * of fields and is used to size and iterate the array. */
enum {
        FeedFieldTime = 0, FeedFieldTitle, FeedFieldLink, FeedFieldContent,
        FeedFieldId, FeedFieldAuthor, FeedFieldEnclosure, FeedFieldCategory,
        FeedFieldLast
};
           35 
/* Kind of text stored in the content field of the current item.
 * The numeric values are used as indices into `contenttypes`. */
enum ContentType {
        ContentTypeNone  = 0,
        ContentTypePlain = 1,
        ContentTypeHTML  = 2
};
/* name written to the output for each ContentType value */
static const char *contenttypes[] = { "", "plain", "html" };
           42 
/* String data / memory pool: a growable, NUL-terminated byte buffer. */
typedef struct string {
        char   *data;   /* data, may be NULL while nothing has been allocated */
        size_t  len;    /* string length in bytes, without the terminating NUL */
        size_t  bufsiz; /* allocated size of `data` in bytes */
} String;
           49 
static String fields[FeedFieldLast];  /* data for current item, indexed by FeedField* */
static enum ContentType contenttype;  /* content-type for item */
static int itemisopen = 0;            /* non-zero once an item was started and still has to be printed */

/* separator written between the fields of one output line */
static const int FieldSeparator = '\t';
/* separator for multiple values in a field, separator should be 1 byte */
static const char FieldMultiSeparator[] = "|";
           57 
           58 /* print to stderr, print error message of errno and exit().
           59    Unlike BSD err() it does not prefix __progname */
           60 __dead void
           61 err(int exitstatus, const char *fmt, ...)
           62 {
           63         va_list ap;
           64         int saved_errno;
           65 
           66         saved_errno = errno;
           67 
           68         if (fmt) {
           69                 va_start(ap, fmt);
           70                 vfprintf(stderr, fmt, ap);
           71                 va_end(ap);
           72                 fputs(": ", stderr);
           73         }
           74         fprintf(stderr, "%s\n", strerror(saved_errno));
           75 
           76         exit(exitstatus);
           77 }
           78 
           79 /* print to stderr and exit().
           80    Unlike BSD errx() it does not prefix __progname */
           81 __dead void
           82 errx(int exitstatus, const char *fmt, ...)
           83 {
           84         va_list ap;
           85 
           86         if (fmt) {
           87                 va_start(ap, fmt);
           88                 vfprintf(stderr, fmt, ap);
           89                 va_end(ap);
           90         }
           91         fputs("\n", stderr);
           92 
           93         exit(exitstatus);
           94 }
           95 
           96 /* Convert time fields. Returns a signed (at least) 64-bit UNIX timestamp.
           97  * Parameters should be passed as they are in a struct tm and in a valid range:
           98  * that is: year = year - 1900, month = month - 1. */
           99 static long long
          100 datetounix(long long year, int mon, int day, int hour, int min, int sec)
          101 {
          102         /* seconds in a month in a regular (non-leap) year */
          103         static const long secs_through_month[] = {
          104                 0, 31 * 86400, 59 * 86400, 90 * 86400,
          105                 120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400,
          106                 243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 };
          107         int is_leap = 0, cycles, centuries = 0, leaps = 0, rem;
          108         long long t;
          109 
          110         /* optimization: handle common range year 1902 up to and including 2038 */
          111         if (year - 2ULL <= 136) {
          112                 /* amount of leap days relative to 1970: every 4 years */
          113                 leaps = (year / 4) - 17; /* 17 leap years offset for 1902 - 1970 */
          114                 if (!(year & 3)) {
          115                         leaps--;
          116                         is_leap = 1;
          117                 } else {
          118                         is_leap = 0;
          119                 }
          120                 t = 31536000 * (year - 70) + (86400 * leaps); /* 365 * 86400 = 31536000 */
          121         } else {
          122                 /* general leap year calculation:
          123                  * leap years occur mostly every 4 years but every 100 years
          124                  * a leap year is skipped unless the year is divisible by 400 */
          125                 cycles = (year - 100) / 400;
          126                 rem = (year - 100) % 400;
          127                 if (rem < 0) {
          128                         cycles--;
          129                         rem += 400;
          130                 }
          131                 if (!rem) {
          132                         is_leap = 1;
          133                 } else {
          134                         if (rem >= 300) {
          135                                 centuries = 3;
          136                                 rem -= 300;
          137                         } else if (rem >= 200) {
          138                                 centuries = 2;
          139                                 rem -= 200;
          140                         } else if (rem >= 100) {
          141                                 centuries = 1;
          142                                 rem -= 100;
          143                         }
          144                         if (rem) {
          145                                 leaps = rem / 4U;
          146                                 rem %= 4U;
          147                                 is_leap = !rem;
          148                         }
          149                 }
          150                 leaps += (97 * cycles) + (24 * centuries) - is_leap;
          151 
          152                 /* adjust 8 leap days from 1970 up to and including 2000:
          153                  * ((30 * 365) + 8) * 86400 = 946771200 */
          154                 t = ((year - 100) * 31536000LL) + (leaps * 86400LL) + 946771200LL;
          155         }
          156         t += secs_through_month[mon];
          157         if (is_leap && mon >= 2)
          158                 t += 86400;
          159         t += 86400LL * (day - 1);
          160         t += 3600LL * hour;
          161         t += 60LL * min;
          162         t += sec;
          163 
          164         return t;
          165 }
          166 
          167 /* Get timezone from string, return time offset in seconds from UTC. */
          168 static long
          169 gettzoffset(const char *s)
          170 {
          171         const char *p;
          172         long tzhour = 0, tzmin = 0;
          173         size_t i;
          174 
          175         for (; ISSPACE((unsigned char)*s); s++)
          176                 ;
          177         switch (*s) {
          178         case '-': /* offset */
          179         case '+':
          180                 for (i = 0, p = s + 1; i < 2 && ISDIGIT((unsigned char)*p); i++, p++)
          181                         tzhour = (tzhour * 10) + (*p - '0');
          182                 if (*p == ':')
          183                         p++;
          184                 for (i = 0; i < 2 && ISDIGIT((unsigned char)*p); i++, p++)
          185                         tzmin = (tzmin * 10) + (*p - '0');
          186                 return ((tzhour * 3600) + (tzmin * 60)) * (s[0] == '-' ? -1 : 1);
          187         default: /* timezone name */
          188                 break;
          189         }
          190         return 0;
          191 }
          192 
          193 /* Parse time string `s` into the UNIX timestamp `tp`.
          194    Returns 0 on success or -1 on failure. */
          195 static int
          196 parsetime(const char *s, long long *tp)
          197 {
          198         int va[6] = { 0 }, i, v, vi;
          199 
          200         for (; ISSPACE((unsigned char)*s); s++)
          201                 ;
          202 
          203         if (!ISDIGIT((unsigned char)s[0]) ||
          204             !ISDIGIT((unsigned char)s[1]) ||
          205             !ISDIGIT((unsigned char)s[2]) ||
          206             !ISDIGIT((unsigned char)s[3]))
          207                 return -1;
          208 
          209         /* formats "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S" or "%Y%m%d%H%M%S" */
          210         vi = 0;
          211 
          212         /* parse time parts (and possibly remaining date parts) */
          213         for (; *s && vi < 6; vi++) {
          214                 for (i = 0, v = 0; i < ((vi == 0) ? 4 : 2) &&
          215                                    ISDIGIT((unsigned char)*s); s++, i++) {
          216                         v = (v * 10) + (*s - '0');
          217                 }
          218                 va[vi] = v;
          219 
          220                 if ((vi < 2 && *s == '-') ||
          221                     (vi == 2 && (*s == 'T' || ISSPACE((unsigned char)*s))) ||
          222                     (vi > 2 && *s == ':'))
          223                         s++;
          224         }
          225 
          226         /* skip milliseconds in for example: "%Y-%m-%dT%H:%M:%S.000Z" */
          227         if (*s == '.') {
          228                 for (s++; ISDIGIT((unsigned char)*s); s++)
          229                         ;
          230         }
          231 
          232         /* invalid range */
          233         if (va[0] < 0 || va[0] > 9999 ||
          234             va[1] < 1 || va[1] > 12 ||
          235             va[2] < 1 || va[2] > 31 ||
          236             va[3] < 0 || va[3] > 23 ||
          237             va[4] < 0 || va[4] > 59 ||
          238             va[5] < 0 || va[5] > 60) /* allow leap second */
          239                 return -1;
          240 
          241         *tp = datetounix(va[0] - 1900, va[1] - 1, va[2], va[3], va[4], va[5]) -
          242               gettzoffset(s);
          243 
          244         return 0;
          245 }
          246 
          247 /* Handle read or write errors for a FILE * stream */
          248 static void
          249 checkfileerror(FILE *fp, const char *name, int mode)
          250 {
          251         if (mode == 'r' && ferror(fp))
          252                 errx(1, "read error: %s", name);
          253         else if (mode == 'w' && (fflush(fp) || ferror(fp)))
          254                 errx(1, "write error: %s", name);
          255 }
          256 
          257 /* Clear string only; don't free, prevents unnecessary reallocation. */
          258 static void
          259 string_clear(String *s)
          260 {
          261         if (s->data)
          262                 s->data[0] = '\0';
          263         s->len = 0;
          264 }
          265 
          266 static void
          267 string_buffer_realloc(String *s, size_t newlen)
          268 {
          269         size_t alloclen;
          270 
          271         if (newlen > SIZE_MAX / 2) {
          272                 alloclen = SIZE_MAX;
          273         } else {
          274                 for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
          275                         ;
          276         }
          277         if (!(s->data = realloc(s->data, alloclen)))
          278                 err(1, "realloc");
          279         s->bufsiz = alloclen;
          280 }
          281 
          282 /* Append data to String, s->data and data may not overlap. */
          283 static void
          284 string_append(String *s, const char *data, size_t len)
          285 {
          286         if (!len)
          287                 return;
          288 
          289         if (s->len >= SIZE_MAX - len) {
          290                 errno = ENOMEM;
          291                 err(1, "realloc");
          292         }
          293 
          294         /* check if allocation is necessary, never shrink the buffer. */
          295         if (s->len + len >= s->bufsiz)
          296                 string_buffer_realloc(s, s->len + len + 1);
          297         memcpy(s->data + s->len, data, len);
          298         s->len += len;
          299         s->data[s->len] = '\0';
          300 }
          301 
          302 /* Clear and append string */
          303 static void
          304 string_set(String *s, const char *data, size_t len)
          305 {
          306         string_clear(s);
          307         string_append(s, data, len);
          308 }
          309 
          310 /* Print text, encode TABs, newlines and '\', remove other whitespace.
          311  * Remove leading and trailing whitespace. */
          312 static void
          313 string_print_encoded(String *s)
          314 {
          315         const char *p, *e;
          316 
          317         if (!s->data || !s->len)
          318                 return;
          319 
          320         p = s->data;
          321         e = p + s->len;
          322 
          323         for (; *p && p != e; p++) {
          324                 switch (*p) {
          325                 case '\n': putchar('\\'); putchar('n'); break;
          326                 case '\\': putchar('\\'); putchar('\\'); break;
          327                 case '\t': putchar('\\'); putchar('t'); break;
          328                 default:
          329                         /* ignore control chars */
          330                         if (!ISCNTRL((unsigned char)*p))
          331                                 putchar(*p);
          332                         break;
          333                 }
          334         }
          335 }
          336 
          337 /* Print text, replace TABs, carriage return and other whitespace with ' '.
          338  * Other control chars are removed. Remove leading and trailing whitespace. */
          339 static void
          340 string_print(String *s)
          341 {
          342         const char *p, *e;
          343 
          344         if (!s->data || !s->len)
          345                 return;
          346 
          347         p = s->data;
          348         e = s->data + s->len;
          349         for (; *p && p != e; p++) {
          350                 if (ISSPACE((unsigned char)*p))
          351                         putchar(' '); /* any whitespace to space */
          352                 else if (!ISCNTRL((unsigned char)*p))
          353                         /* ignore other control chars */
          354                         putchar(*p);
          355         }
          356 }
          357 
          358 /* Print as UNIX timestamp, print nothing if the time is empty or invalid. */
          359 static void
          360 string_print_timestamp(String *s)
          361 {
          362         long long t;
          363 
          364         if (!s->data || !s->len)
          365                 return;
          366 
          367         if (parsetime(s->data, &t) != -1)
          368                 printf("%lld", t);
          369 }
          370 
          371 static void
          372 printfields(void)
          373 {
          374         string_print_timestamp(&fields[FeedFieldTime]);
          375         putchar(FieldSeparator);
          376         string_print(&fields[FeedFieldTitle]);
          377         putchar(FieldSeparator);
          378         string_print(&fields[FeedFieldLink]);
          379         putchar(FieldSeparator);
          380         string_print_encoded(&fields[FeedFieldContent]);
          381         putchar(FieldSeparator);
          382         fputs(contenttypes[contenttype], stdout);
          383         putchar(FieldSeparator);
          384         string_print(&fields[FeedFieldId]);
          385         putchar(FieldSeparator);
          386         string_print(&fields[FeedFieldAuthor]);
          387         putchar(FieldSeparator);
          388         string_print(&fields[FeedFieldEnclosure]);
          389         putchar(FieldSeparator);
          390         string_print(&fields[FeedFieldCategory]);
          391         putchar('\n');
          392 
          393         if (ferror(stdout)) /* check for errors but do not flush */
          394                 checkfileerror(stdout, "<stdout>", 'w');
          395 }
          396 
          397 static void
          398 newitem(void)
          399 {
          400         size_t i;
          401 
          402         contenttype = ContentTypeNone;
          403         for (i = 0; i < FeedFieldLast; i++)
          404                 string_clear(&fields[i]);
          405 
          406 }
          407 
          408 static void
          409 processnode(struct json_node *nodes, size_t depth, const char *value, size_t valuelen)
          410 {
          411         /* item */
          412         if (depth == 3) {
          413                 if (nodes[0].type == JSON_TYPE_OBJECT &&
          414                     nodes[1].type == JSON_TYPE_ARRAY &&
          415                     nodes[2].type == JSON_TYPE_OBJECT &&
          416                     !attrcmp(nodes[1].name, "items")) {
          417                         if (itemisopen)
          418                                 printfields();
          419                         newitem();
          420                         itemisopen = 1;
          421                 }
          422         }
          423 
          424         /* item attributes */
          425         if (depth == 4) {
          426                 if (nodes[0].type == JSON_TYPE_OBJECT &&
          427                     nodes[1].type == JSON_TYPE_ARRAY &&
          428                     nodes[2].type == JSON_TYPE_OBJECT &&
          429                     !attrcmp(nodes[1].name, "items")) {
          430                         if (!attrcmp(nodes[3].name, "content_html")) {
          431                                 string_set(&fields[FeedFieldContent], value, valuelen);
          432                                 contenttype = ContentTypeHTML;
          433                         } else if (!attrcmp(nodes[3].name, "content_text")) {
          434                                 /* prefer HTML, if summary text is set override it also */
          435                                 if (!fields[FeedFieldContent].len && contenttype != ContentTypeHTML) {
          436                                         string_set(&fields[FeedFieldContent], value, valuelen);
          437                                         contenttype = ContentTypePlain;
          438                                 }
          439                         } else if (!attrcmp(nodes[3].name, "date_published")) {
          440                                 /* published has higher priority than updated */
          441                                 string_set(&fields[FeedFieldTime], value, valuelen);
          442                         } else if (!attrcmp(nodes[3].name, "date_modified")) {
          443                                 if (!fields[FeedFieldTime].len)
          444                                         string_append(&fields[FeedFieldTime], value, valuelen);
          445                         } else if (!attrcmp(nodes[3].name, "id")) {
          446                                 if (!fields[FeedFieldId].len)
          447                                         string_append(&fields[FeedFieldId], value, valuelen);
          448                         } else if (!attrcmp(nodes[3].name, "summary")) {
          449                                 /* only if content_html or content_text is not set yet. */
          450                                 if (!fields[FeedFieldContent].len) {
          451                                         string_append(&fields[FeedFieldContent], value, valuelen);
          452                                         contenttype = ContentTypePlain;
          453                                 }
          454                         } else if (!attrcmp(nodes[3].name, "title")) {
          455                                 if (!fields[FeedFieldTitle].len)
          456                                         string_set(&fields[FeedFieldTitle], value, valuelen);
          457                         } else if (!attrcmp(nodes[3].name, "url")) {
          458                                 if (!fields[FeedFieldLink].len)
          459                                         string_append(&fields[FeedFieldLink], value, valuelen);
          460                         }
          461                 }
          462         }
          463 
          464         if (depth == 5) {
          465                 /* 1.0 author name */
          466                 if (nodes[0].type == JSON_TYPE_OBJECT &&
          467                     nodes[1].type == JSON_TYPE_ARRAY &&
          468                     nodes[2].type == JSON_TYPE_OBJECT &&
          469                     nodes[3].type == JSON_TYPE_OBJECT &&
          470                     nodes[4].type == JSON_TYPE_STRING &&
          471                     !attrcmp(nodes[1].name, "items") &&
          472                     !attrcmp(nodes[3].name, "author") &&
          473                     !attrcmp(nodes[4].name, "name")) {
          474                         if (!fields[FeedFieldAuthor].len)
          475                                 string_append(&fields[FeedFieldAuthor], value, valuelen);
          476                 }
          477 
          478                 /* tags / categories */
          479                 if (nodes[0].type == JSON_TYPE_OBJECT &&
          480                     nodes[1].type == JSON_TYPE_ARRAY &&
          481                     nodes[2].type == JSON_TYPE_OBJECT &&
          482                     nodes[3].type == JSON_TYPE_ARRAY &&
          483                     nodes[4].type == JSON_TYPE_STRING &&
          484                     !attrcmp(nodes[1].name, "items") &&
          485                     !attrcmp(nodes[3].name, "tags")) {
          486                         if (fields[FeedFieldCategory].len)
          487                                 string_append(&fields[FeedFieldCategory], FieldMultiSeparator,
          488                                               sizeof(FieldMultiSeparator) - 1);
          489                         string_append(&fields[FeedFieldCategory], value, valuelen);
          490                 }
          491         }
          492 
          493         if (depth == 6) {
          494                 /* 1.1 author name */
          495                 if (nodes[0].type == JSON_TYPE_OBJECT &&
          496                     nodes[1].type == JSON_TYPE_ARRAY &&
          497                     nodes[2].type == JSON_TYPE_OBJECT &&
          498                     nodes[3].type == JSON_TYPE_ARRAY &&
          499                     nodes[4].type == JSON_TYPE_OBJECT &&
          500                     nodes[5].type == JSON_TYPE_STRING &&
          501                     !attrcmp(nodes[1].name, "items") &&
          502                     !attrcmp(nodes[3].name, "authors") &&
          503                     !attrcmp(nodes[5].name, "name")) {
          504                         if (!fields[FeedFieldAuthor].len)
          505                                 string_append(&fields[FeedFieldAuthor], value, valuelen);
          506                 }
          507 
          508                 /* enclosure attributes */
          509                 if (nodes[0].type == JSON_TYPE_OBJECT &&
          510                     nodes[1].type == JSON_TYPE_ARRAY &&
          511                     nodes[2].type == JSON_TYPE_OBJECT &&
          512                     nodes[3].type == JSON_TYPE_ARRAY &&
          513                     nodes[4].type == JSON_TYPE_OBJECT &&
          514                     (nodes[5].type == JSON_TYPE_STRING || nodes[5].type == JSON_TYPE_NUMBER) &&
          515                     !attrcmp(nodes[1].name, "items") &&
          516                     !attrcmp(nodes[3].name, "attachments") &&
          517                     !attrcmp(nodes[5].name, "url")) {
          518                         if (!fields[FeedFieldEnclosure].len)
          519                                 string_append(&fields[FeedFieldEnclosure], value, valuelen);
          520                 }
          521         }
          522 
          523         if (ferror(stdout)) {
          524                 fprintf(stderr, "write error: <stdout>\n");
          525                 exit(2);
          526         }
          527 }
          528 
          529 int
          530 main(int argc, char *argv[])
          531 {
          532         if (pledge("stdio", NULL) == -1)
          533                 err(1, "pledge");
          534 
          535         switch (parsejson(processnode)) {
          536         case JSON_ERROR_MEM:
          537                 errx(2, "error: cannot allocate enough memory");
          538         case JSON_ERROR_INVALID:
          539                 errx(1, "error: invalid JSON");
          540         }
          541 
          542         if (itemisopen)
          543                 printfields();
          544 
          545         if (ferror(stdin))
          546                 errx(2, "read error: <stdin>");
          547         if (fflush(stdout) || ferror(stdout))
          548                 errx(2, "write error: <stdout>");
          549 
          550         return 0;
          551 }