jf2sfeed.c - jfconvert - JSON Feed (subset) to sfeed or Atom converter
(HTM) git clone git://git.codemadness.org/jfconvert
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
jf2sfeed.c (14438B)
---
1 #include <errno.h>
2 #include <stdarg.h>
3 #include <stdint.h>
4 #include <stdio.h>
5 #include <stdlib.h>
6 #include <string.h>
7
8 #ifdef __OpenBSD__
9 #include <unistd.h>
10 #else
11 #define pledge(a,b) 0
12 #endif
13
14 #include "json.h"
15
16 /* hint for compilers and static analyzers that a function exits */
17 #ifndef __dead
18 #define __dead
19 #endif
20
/* ctype-like macros, but always compatible with ASCII / UTF-8.
 * They do not depend on the locale. The argument is fully parenthesized
 * before it is cast, so compound expressions (for example a conditional
 * expression) are classified as a whole.
 * ISCNTRL and ISSPACE evaluate their argument more than once: pass an
 * expression without side effects. Pass character values as unsigned char,
 * otherwise ISCNTRL treats negative values as control characters. */
#define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)
#define ISCNTRL(c) ((c) < ' ' || (c) == 0x7f)
#define ISDIGIT(c) (((unsigned)(c)) - '0' < 10)
#define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))

/* compare attributes case-sensitively */
#define attrcmp strcmp
29
/* Indices into the per-item `fields` array, one per stored item value.
 * FeedFieldLast is not a field: it is the number of fields. */
enum {
	FeedFieldTime = 0,
	FeedFieldTitle,
	FeedFieldLink,
	FeedFieldContent,
	FeedFieldId,
	FeedFieldAuthor,
	FeedFieldEnclosure,
	FeedFieldCategory,
	FeedFieldLast
};
35
/* Kind of content stored for the current item. The numeric values are used
 * as indices into `contenttypes`, so both must stay in the same order. */
enum ContentType {
	ContentTypeNone  = 0,
	ContentTypePlain = 1,
	ContentTypeHTML  = 2
};

/* text printed for each enum ContentType value */
static const char *contenttypes[] = {
	"",      /* ContentTypeNone */
	"plain", /* ContentTypePlain */
	"html"   /* ContentTypeHTML */
};
42
/* Growable string buffer / memory pool. */
typedef struct string {
	char   *data;   /* buffer holding the text, NULL until first allocation */
	size_t  len;    /* length of the text in bytes, excluding the NUL byte */
	size_t  bufsiz; /* number of bytes allocated for data */
} String;
49
50 static String fields[FeedFieldLast]; /* data for current item */
51 static enum ContentType contenttype; /* content-type for item */
52 static int itemisopen = 0;
53
54 static const int FieldSeparator = '\t';
55 /* separator for multiple values in a field, separator should be 1 byte */
56 static const char FieldMultiSeparator[] = "|";
57
58 /* print to stderr, print error message of errno and exit().
59 Unlike BSD err() it does not prefix __progname */
60 __dead void
61 err(int exitstatus, const char *fmt, ...)
62 {
63 va_list ap;
64 int saved_errno;
65
66 saved_errno = errno;
67
68 if (fmt) {
69 va_start(ap, fmt);
70 vfprintf(stderr, fmt, ap);
71 va_end(ap);
72 fputs(": ", stderr);
73 }
74 fprintf(stderr, "%s\n", strerror(saved_errno));
75
76 exit(exitstatus);
77 }
78
79 /* print to stderr and exit().
80 Unlike BSD errx() it does not prefix __progname */
81 __dead void
82 errx(int exitstatus, const char *fmt, ...)
83 {
84 va_list ap;
85
86 if (fmt) {
87 va_start(ap, fmt);
88 vfprintf(stderr, fmt, ap);
89 va_end(ap);
90 }
91 fputs("\n", stderr);
92
93 exit(exitstatus);
94 }
95
/* Convert broken-down UTC time fields to a UNIX timestamp.
 * Returns a signed (at least) 64-bit number of seconds since 1970-01-01.
 * The parameters use the struct tm conventions: `year` is the year minus
 * 1900 and `mon` is the month minus 1 (0 = January). `day` is 1-based.
 * The values are not validated: `mon` is used directly as an array index,
 * so the caller must pass 0 up to and including 11. */
static long long
datetounix(long long year, int mon, int day, int hour, int min, int sec)
{
	enum { DAYSECS = 86400 };
	/* seconds elapsed at the start of each month in a non-leap year */
	static const long month_start[] = {
		  0L * DAYSECS,  31L * DAYSECS,  59L * DAYSECS,  90L * DAYSECS,
		120L * DAYSECS, 151L * DAYSECS, 181L * DAYSECS, 212L * DAYSECS,
		243L * DAYSECS, 273L * DAYSECS, 304L * DAYSECS, 334L * DAYSECS
	};
	int leapyear = 0, leapdays = 0, quads, cents = 0, yr;
	long long ts;

	if (year - 2ULL <= 136) {
		/* fast path for 1902 up to and including 2038: in this range
		 * every 4th year is a leap year. 17 is the amount of leap
		 * years from 1902 up to 1970. */
		leapdays = (year / 4) - 17;
		leapyear = !(year & 3);
		if (leapyear)
			leapdays--; /* the leap day of this year is added below */
		/* 365 * 86400 = 31536000 */
		ts = 31536000 * (year - 70) + (86400 * leapdays);
	} else {
		/* general case, relative to the year 2000: a leap year occurs
		 * every 4 years, except every 100 years, unless the year is
		 * divisible by 400 */
		quads = (year - 100) / 400;
		yr = (year - 100) % 400;
		if (yr < 0) {
			/* round towards negative infinity */
			quads--;
			yr += 400;
		}
		if (yr == 0) {
			leapyear = 1;
		} else {
			/* yr is in the range 1 up to and including 399 */
			cents = yr / 100;
			yr %= 100;
			if (yr != 0) {
				leapdays = yr / 4;
				leapyear = (yr % 4) == 0;
			}
		}
		/* 97 leap days per 400 years, 24 per remaining century */
		leapdays += (97 * quads) + (24 * cents) - leapyear;

		/* adjust 8 leap days from 1970 up to and including 2000:
		 * ((30 * 365) + 8) * 86400 = 946771200 */
		ts = ((year - 100) * 31536000LL) + (leapdays * 86400LL) +
		     946771200LL;
	}

	ts += month_start[mon];
	/* the leap day only counts from March onwards */
	if (leapyear && mon >= 2)
		ts += DAYSECS;
	ts += 86400LL * (day - 1);
	ts += 3600LL * hour;
	ts += 60LL * min;
	ts += sec;

	return ts;
}
166
/* Get the timezone offset from the string `s`.
 * Leading whitespace is skipped. A numeric offset starts with '+' or '-',
 * followed by at most 2 digits for the hours, an optional ':' and at most
 * 2 digits for the minutes, for example "+0100", "-05:30" or "+2".
 * Returns the offset in seconds from UTC. Anything else, such as a timezone
 * name or an empty string, returns 0. */
static long
gettzoffset(const char *s)
{
	long hours = 0, mins = 0;
	int sign, ndigits;

	while (ISSPACE((unsigned char)*s))
		s++;

	/* not a numeric offset: timezone name or nothing at all */
	if (*s != '-' && *s != '+')
		return 0;

	sign = (*s == '-') ? -1 : 1;
	s++;

	for (ndigits = 0; ndigits < 2 && ISDIGIT((unsigned char)*s); ndigits++, s++)
		hours = (hours * 10) + (*s - '0');
	if (*s == ':')
		s++;
	for (ndigits = 0; ndigits < 2 && ISDIGIT((unsigned char)*s); ndigits++, s++)
		mins = (mins * 10) + (*s - '0');

	return ((hours * 3600) + (mins * 60)) * sign;
}
192
/* Parse the time string `s` and store the UNIX timestamp in `tp`.
 * Supported formats are "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S" and
 * "%Y%m%d%H%M%S", optionally followed by a fraction of a second (which is
 * ignored) and by a timezone offset as handled by gettzoffset().
 * Returns 0 on success. Returns -1 when the string does not start with a
 * 4-digit year or when a part is out of range; `tp` is not written then. */
static int
parsetime(const char *s, long long *tp)
{
	/* inclusive limits for: year, month, day, hour, minute, second */
	static const int lowest[6]  = {    0,  1,  1,  0,  0,  0 };
	static const int highest[6] = { 9999, 12, 31, 23, 59, 60 }; /* allow leap second */
	int parts[6] = { 0 };
	int idx, ndigits, maxdigits, num;

	while (ISSPACE((unsigned char)*s))
		s++;

	/* the string must start with a year of 4 digits */
	for (idx = 0; idx < 4; idx++) {
		if (!ISDIGIT((unsigned char)s[idx]))
			return -1;
	}

	/* read up to 6 numbers: 4 digits for the year, 2 for the others.
	 * Parts that are missing keep the value 0. */
	for (idx = 0; *s && idx < 6; idx++) {
		maxdigits = (idx == 0) ? 4 : 2;
		num = 0;
		for (ndigits = 0; ndigits < maxdigits &&
		     ISDIGIT((unsigned char)*s); ndigits++, s++) {
			num = (num * 10) + (*s - '0');
		}
		parts[idx] = num;

		/* skip at most one separator that fits the position */
		if (idx < 2) {
			if (*s == '-')
				s++; /* between the date parts */
		} else if (idx == 2) {
			if (*s == 'T' || ISSPACE((unsigned char)*s))
				s++; /* between the date and the time */
		} else if (*s == ':') {
			s++; /* between the time parts */
		}
	}

	/* skip milliseconds in for example: "%Y-%m-%dT%H:%M:%S.000Z" */
	if (*s == '.') {
		s++;
		while (ISDIGIT((unsigned char)*s))
			s++;
	}

	/* invalid range */
	for (idx = 0; idx < 6; idx++) {
		if (parts[idx] < lowest[idx] || parts[idx] > highest[idx])
			return -1;
	}

	/* the remainder of the string is the timezone: convert to UTC */
	*tp = datetounix(parts[0] - 1900, parts[1] - 1, parts[2],
	                 parts[3], parts[4], parts[5]) - gettzoffset(s);

	return 0;
}
246
/* Check a FILE * stream for errors and exit with status 1 when it has one.
 * `mode` is 'r' to check for a read error or 'w' to flush the stream and
 * check for a write error. `name` is printed in the error message.
 * Any other `mode` does nothing. */
static void
checkfileerror(FILE *fp, const char *name, int mode)
{
	switch (mode) {
	case 'r':
		if (ferror(fp))
			errx(1, "read error: %s", name);
		break;
	case 'w':
		/* flush first so that pending write errors show up */
		if (fflush(fp) || ferror(fp))
			errx(1, "write error: %s", name);
		break;
	default:
		break;
	}
}
256
257 /* Clear string only; don't free, prevents unnecessary reallocation. */
258 static void
259 string_clear(String *s)
260 {
261 if (s->data)
262 s->data[0] = '\0';
263 s->len = 0;
264 }
265
266 static void
267 string_buffer_realloc(String *s, size_t newlen)
268 {
269 size_t alloclen;
270
271 if (newlen > SIZE_MAX / 2) {
272 alloclen = SIZE_MAX;
273 } else {
274 for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
275 ;
276 }
277 if (!(s->data = realloc(s->data, alloclen)))
278 err(1, "realloc");
279 s->bufsiz = alloclen;
280 }
281
282 /* Append data to String, s->data and data may not overlap. */
283 static void
284 string_append(String *s, const char *data, size_t len)
285 {
286 if (!len)
287 return;
288
289 if (s->len >= SIZE_MAX - len) {
290 errno = ENOMEM;
291 err(1, "realloc");
292 }
293
294 /* check if allocation is necessary, never shrink the buffer. */
295 if (s->len + len >= s->bufsiz)
296 string_buffer_realloc(s, s->len + len + 1);
297 memcpy(s->data + s->len, data, len);
298 s->len += len;
299 s->data[s->len] = '\0';
300 }
301
302 /* Clear and append string */
303 static void
304 string_set(String *s, const char *data, size_t len)
305 {
306 string_clear(s);
307 string_append(s, data, len);
308 }
309
310 /* Print text, encode TABs, newlines and '\', remove other whitespace.
311 * Remove leading and trailing whitespace. */
312 static void
313 string_print_encoded(String *s)
314 {
315 const char *p, *e;
316
317 if (!s->data || !s->len)
318 return;
319
320 p = s->data;
321 e = p + s->len;
322
323 for (; *p && p != e; p++) {
324 switch (*p) {
325 case '\n': putchar('\\'); putchar('n'); break;
326 case '\\': putchar('\\'); putchar('\\'); break;
327 case '\t': putchar('\\'); putchar('t'); break;
328 default:
329 /* ignore control chars */
330 if (!ISCNTRL((unsigned char)*p))
331 putchar(*p);
332 break;
333 }
334 }
335 }
336
337 /* Print text, replace TABs, carriage return and other whitespace with ' '.
338 * Other control chars are removed. Remove leading and trailing whitespace. */
339 static void
340 string_print(String *s)
341 {
342 const char *p, *e;
343
344 if (!s->data || !s->len)
345 return;
346
347 p = s->data;
348 e = s->data + s->len;
349 for (; *p && p != e; p++) {
350 if (ISSPACE((unsigned char)*p))
351 putchar(' '); /* any whitespace to space */
352 else if (!ISCNTRL((unsigned char)*p))
353 /* ignore other control chars */
354 putchar(*p);
355 }
356 }
357
358 /* Print as UNIX timestamp, print nothing if the time is empty or invalid. */
359 static void
360 string_print_timestamp(String *s)
361 {
362 long long t;
363
364 if (!s->data || !s->len)
365 return;
366
367 if (parsetime(s->data, &t) != -1)
368 printf("%lld", t);
369 }
370
371 static void
372 printfields(void)
373 {
374 string_print_timestamp(&fields[FeedFieldTime]);
375 putchar(FieldSeparator);
376 string_print(&fields[FeedFieldTitle]);
377 putchar(FieldSeparator);
378 string_print(&fields[FeedFieldLink]);
379 putchar(FieldSeparator);
380 string_print_encoded(&fields[FeedFieldContent]);
381 putchar(FieldSeparator);
382 fputs(contenttypes[contenttype], stdout);
383 putchar(FieldSeparator);
384 string_print(&fields[FeedFieldId]);
385 putchar(FieldSeparator);
386 string_print(&fields[FeedFieldAuthor]);
387 putchar(FieldSeparator);
388 string_print(&fields[FeedFieldEnclosure]);
389 putchar(FieldSeparator);
390 string_print(&fields[FeedFieldCategory]);
391 putchar('\n');
392
393 if (ferror(stdout)) /* check for errors but do not flush */
394 checkfileerror(stdout, "<stdout>", 'w');
395 }
396
397 static void
398 newitem(void)
399 {
400 size_t i;
401
402 contenttype = ContentTypeNone;
403 for (i = 0; i < FeedFieldLast; i++)
404 string_clear(&fields[i]);
405
406 }
407
408 static void
409 processnode(struct json_node *nodes, size_t depth, const char *value, size_t valuelen)
410 {
411 /* item */
412 if (depth == 3) {
413 if (nodes[0].type == JSON_TYPE_OBJECT &&
414 nodes[1].type == JSON_TYPE_ARRAY &&
415 nodes[2].type == JSON_TYPE_OBJECT &&
416 !attrcmp(nodes[1].name, "items")) {
417 if (itemisopen)
418 printfields();
419 newitem();
420 itemisopen = 1;
421 }
422 }
423
424 /* item attributes */
425 if (depth == 4) {
426 if (nodes[0].type == JSON_TYPE_OBJECT &&
427 nodes[1].type == JSON_TYPE_ARRAY &&
428 nodes[2].type == JSON_TYPE_OBJECT &&
429 !attrcmp(nodes[1].name, "items")) {
430 if (!attrcmp(nodes[3].name, "content_html")) {
431 string_set(&fields[FeedFieldContent], value, valuelen);
432 contenttype = ContentTypeHTML;
433 } else if (!attrcmp(nodes[3].name, "content_text")) {
434 /* prefer HTML, if summary text is set override it also */
435 if (!fields[FeedFieldContent].len && contenttype != ContentTypeHTML) {
436 string_set(&fields[FeedFieldContent], value, valuelen);
437 contenttype = ContentTypePlain;
438 }
439 } else if (!attrcmp(nodes[3].name, "date_published")) {
440 /* published has higher priority than updated */
441 string_set(&fields[FeedFieldTime], value, valuelen);
442 } else if (!attrcmp(nodes[3].name, "date_modified")) {
443 if (!fields[FeedFieldTime].len)
444 string_append(&fields[FeedFieldTime], value, valuelen);
445 } else if (!attrcmp(nodes[3].name, "id")) {
446 if (!fields[FeedFieldId].len)
447 string_append(&fields[FeedFieldId], value, valuelen);
448 } else if (!attrcmp(nodes[3].name, "summary")) {
449 /* only if content_html or content_text is not set yet. */
450 if (!fields[FeedFieldContent].len) {
451 string_append(&fields[FeedFieldContent], value, valuelen);
452 contenttype = ContentTypePlain;
453 }
454 } else if (!attrcmp(nodes[3].name, "title")) {
455 if (!fields[FeedFieldTitle].len)
456 string_set(&fields[FeedFieldTitle], value, valuelen);
457 } else if (!attrcmp(nodes[3].name, "url")) {
458 if (!fields[FeedFieldLink].len)
459 string_append(&fields[FeedFieldLink], value, valuelen);
460 }
461 }
462 }
463
464 if (depth == 5) {
465 /* 1.0 author name */
466 if (nodes[0].type == JSON_TYPE_OBJECT &&
467 nodes[1].type == JSON_TYPE_ARRAY &&
468 nodes[2].type == JSON_TYPE_OBJECT &&
469 nodes[3].type == JSON_TYPE_OBJECT &&
470 nodes[4].type == JSON_TYPE_STRING &&
471 !attrcmp(nodes[1].name, "items") &&
472 !attrcmp(nodes[3].name, "author") &&
473 !attrcmp(nodes[4].name, "name")) {
474 if (!fields[FeedFieldAuthor].len)
475 string_append(&fields[FeedFieldAuthor], value, valuelen);
476 }
477
478 /* tags / categories */
479 if (nodes[0].type == JSON_TYPE_OBJECT &&
480 nodes[1].type == JSON_TYPE_ARRAY &&
481 nodes[2].type == JSON_TYPE_OBJECT &&
482 nodes[3].type == JSON_TYPE_ARRAY &&
483 nodes[4].type == JSON_TYPE_STRING &&
484 !attrcmp(nodes[1].name, "items") &&
485 !attrcmp(nodes[3].name, "tags")) {
486 if (fields[FeedFieldCategory].len)
487 string_append(&fields[FeedFieldCategory], FieldMultiSeparator,
488 sizeof(FieldMultiSeparator) - 1);
489 string_append(&fields[FeedFieldCategory], value, valuelen);
490 }
491 }
492
493 if (depth == 6) {
494 /* 1.1 author name */
495 if (nodes[0].type == JSON_TYPE_OBJECT &&
496 nodes[1].type == JSON_TYPE_ARRAY &&
497 nodes[2].type == JSON_TYPE_OBJECT &&
498 nodes[3].type == JSON_TYPE_ARRAY &&
499 nodes[4].type == JSON_TYPE_OBJECT &&
500 nodes[5].type == JSON_TYPE_STRING &&
501 !attrcmp(nodes[1].name, "items") &&
502 !attrcmp(nodes[3].name, "authors") &&
503 !attrcmp(nodes[5].name, "name")) {
504 if (!fields[FeedFieldAuthor].len)
505 string_append(&fields[FeedFieldAuthor], value, valuelen);
506 }
507
508 /* enclosure attributes */
509 if (nodes[0].type == JSON_TYPE_OBJECT &&
510 nodes[1].type == JSON_TYPE_ARRAY &&
511 nodes[2].type == JSON_TYPE_OBJECT &&
512 nodes[3].type == JSON_TYPE_ARRAY &&
513 nodes[4].type == JSON_TYPE_OBJECT &&
514 (nodes[5].type == JSON_TYPE_STRING || nodes[5].type == JSON_TYPE_NUMBER) &&
515 !attrcmp(nodes[1].name, "items") &&
516 !attrcmp(nodes[3].name, "attachments") &&
517 !attrcmp(nodes[5].name, "url")) {
518 if (!fields[FeedFieldEnclosure].len)
519 string_append(&fields[FeedFieldEnclosure], value, valuelen);
520 }
521 }
522
523 if (ferror(stdout)) {
524 fprintf(stderr, "write error: <stdout>\n");
525 exit(2);
526 }
527 }
528
529 int
530 main(int argc, char *argv[])
531 {
532 if (pledge("stdio", NULL) == -1)
533 err(1, "pledge");
534
535 switch (parsejson(processnode)) {
536 case JSON_ERROR_MEM:
537 errx(2, "error: cannot allocate enough memory");
538 case JSON_ERROR_INVALID:
539 errx(1, "error: invalid JSON");
540 }
541
542 if (itemisopen)
543 printfields();
544
545 if (ferror(stdin))
546 errx(2, "read error: <stdin>");
547 if (fflush(stdout) || ferror(stdout))
548 errx(2, "write error: <stdout>");
549
550 return 0;
551 }