sfeed.c - sfeed - RSS and Atom parser
(HTM) git clone git://git.codemadness.org/sfeed
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
sfeed.c (30132B)
---
1 #include <errno.h>
2 #include <stdint.h>
3 #include <stdio.h>
4 #include <stdlib.h>
5 #include <string.h>
6 #include <strings.h>
7
8 #include "util.h"
9 #include "xml.h"
10
/* true when inside the data of a content field and not in its opening tag */
#define ISINCONTENT(ctx)  (!((ctx).iscontenttag) && (ctx).iscontent)
/* true when in the opening tag of a content field and not yet in its data */
#define ISCONTENTTAG(ctx) ((ctx).iscontenttag && !((ctx).iscontent))

/* feed fields that can hold multiple separated values */
#define ISFEEDFIELDMULTI(t) (FeedFieldCategory == (t))

/* expands to two arguments: the string literal and its byte-length */
#define STRP(s) s, sizeof(s) - 1
19
/* Type of feed being parsed; None until an <item> or <entry> is seen. */
enum FeedType {
	FeedTypeNone, /* 0 */
	FeedTypeRSS,  /* 1 */
	FeedTypeAtom  /* 2 */
};
25
/* Content-type of the content field of an item. */
enum ContentType {
	ContentTypeNone,  /* 0 */
	ContentTypePlain, /* 1 */
	ContentTypeHTML   /* 2 */
};
/* printable name for each ContentType, indexed by the enum value */
static const char *contenttypes[] = {
	[ContentTypeNone]  = "",
	[ContentTypePlain] = "plain",
	[ContentTypeHTML]  = "html"
};
32
/* Growable string buffer; cleared and reused instead of being freed. */
typedef struct string {
	char   *data;   /* buffer holding the NUL-terminated string */
	size_t  len;    /* length of the string in bytes */
	size_t  bufsiz; /* number of bytes allocated for `data` */
} String;
39
/* NOTE: the order of these tags (for content, date, author, etc.) indicates
 * the priority with which they are used, from least to most important. */
enum TagId {
	TagUnknown = 0,

	/* RSS */
	RSSTagDcdate,
	RSSTagPubdate, /* creation date has higher priority */
	RSSTagTitle,
	RSSTagMediaDescription,
	RSSTagDescription,
	RSSTagContentEncoded,
	RSSTagGuid,
	RSSTagGuidPermalinkFalse,
	RSSTagGuidPermalinkTrue,
	/* must be defined after GUID, because it can be a link (isPermaLink) */
	RSSTagLink,
	RSSTagEnclosure,
	RSSTagAuthor,
	RSSTagDccreator,
	RSSTagCategory,

	/* Atom */
	/* creation date has higher priority */
	AtomTagModified,
	AtomTagUpdated,
	AtomTagIssued,
	AtomTagPublished,
	AtomTagTitle,
	AtomTagMediaDescription,
	AtomTagSummary,
	AtomTagContent,
	AtomTagId,
	AtomTagLink,
	AtomTagLinkAlternate,
	AtomTagLinkEnclosure,
	AtomTagAuthor,
	AtomTagAuthorName,
	AtomTagCategory,

	TagLast /* number of tag IDs, not a tag itself */
};
69
70 typedef struct feedtag {
71 char *name; /* name of tag to match */
72 size_t len; /* len of `name` */
73 enum TagId id; /* unique ID */
74 } FeedTag;
75
76 typedef struct field {
77 String str;
78 enum TagId tagid; /* tagid set previously, used for tag priority */
79 } FeedField;
80
/* Indices into FeedContext.fields. */
enum {
	FeedFieldTime = 0,
	FeedFieldTitle,
	FeedFieldLink,
	FeedFieldContent,
	FeedFieldId,
	FeedFieldAuthor,
	FeedFieldEnclosure,
	FeedFieldCategory,
	FeedFieldLast /* number of fields */
};
86
87 typedef struct feedcontext {
88 String *field; /* current FeedItem field String */
89 FeedField fields[FeedFieldLast]; /* data for current item */
90 FeedTag tag; /* unique current parsed tag */
91 int iscontent; /* in content data */
92 int iscontenttag; /* in content tag */
93 enum ContentType contenttype; /* content-type for item */
94 enum FeedType feedtype;
95 int attrcount; /* count item HTML element attributes */
96 } FeedContext;
97
98 static long long datetounix(long long, int, int, int, int, int);
99 static FeedTag * gettag(enum FeedType, const char *, size_t);
100 static long gettzoffset(const char *);
101 static int isattr(const char *, size_t, const char *, size_t);
102 static int istag(const char *, size_t, const char *, size_t);
103 static int parsetime(const char *, long long *);
104 static void printfields(void);
105 static void string_append(String *, const char *, size_t);
106 static void string_buffer_realloc(String *, size_t);
107 static void string_clear(String *);
108 static void string_print_encoded(String *);
109 static void string_print_timestamp(String *);
110 static void string_print_trimmed(String *);
111 static void string_print_trimmed_multi(String *);
112 static void string_print_uri(String *);
113 static void xmlattr(XMLParser *, const char *, size_t, const char *, size_t,
114 const char *, size_t);
115 static void xmlattrentity(XMLParser *, const char *, size_t, const char *,
116 size_t, const char *, size_t);
117 static void xmlattrend(XMLParser *, const char *, size_t, const char *,
118 size_t);
119 static void xmlattrstart(XMLParser *, const char *, size_t, const char *,
120 size_t);
121 static void xmldata(XMLParser *, const char *, size_t);
122 static void xmldataentity(XMLParser *, const char *, size_t);
123 static void xmltagend(XMLParser *, const char *, size_t, int);
124 static void xmltagstart(XMLParser *, const char *, size_t);
125 static void xmltagstartparsed(XMLParser *, const char *, size_t, int);
126
127 /* map tag name to TagId type */
128 /* RSS, keep this in alphabetical order */
129 static const FeedTag rsstags[] = {
130 { STRP("author"), RSSTagAuthor },
131 { STRP("category"), RSSTagCategory },
132 { STRP("content:encoded"), RSSTagContentEncoded },
133 { STRP("dc:creator"), RSSTagDccreator },
134 { STRP("dc:date"), RSSTagDcdate },
135 { STRP("description"), RSSTagDescription },
136 /* RSS: <enclosure url="" />, Atom has <link rel="enclosure" /> */
137 { STRP("enclosure"), RSSTagEnclosure },
138 { STRP("guid"), RSSTagGuid },
139 { STRP("link"), RSSTagLink },
140 { STRP("media:description"), RSSTagMediaDescription },
141 { STRP("pubdate"), RSSTagPubdate },
142 { STRP("title"), RSSTagTitle }
143 };
144
145 /* Atom, keep this in alphabetical order */
146 static const FeedTag atomtags[] = {
147 { STRP("author"), AtomTagAuthor },
148 { STRP("category"), AtomTagCategory },
149 { STRP("content"), AtomTagContent },
150 { STRP("id"), AtomTagId },
151 { STRP("issued"), AtomTagIssued }, /* Atom 0.3 */
152 /* Atom: <link href="" />, RSS has <link></link> */
153 { STRP("link"), AtomTagLink },
154 { STRP("media:description"), AtomTagMediaDescription },
155 { STRP("modified"), AtomTagModified }, /* Atom 0.3 */
156 { STRP("published"), AtomTagPublished },
157 { STRP("summary"), AtomTagSummary },
158 { STRP("title"), AtomTagTitle },
159 { STRP("updated"), AtomTagUpdated }
160 };
161
162 /* special case: nested <author><name> */
163 static const FeedTag atomtagauthor = { STRP("author"), AtomTagAuthor };
164 static const FeedTag atomtagauthorname = { STRP("name"), AtomTagAuthorName };
165
166 /* reference to no / unknown tag */
167 static const FeedTag notag = { STRP(""), TagUnknown };
168
169 /* map TagId type to RSS/Atom field, all tags must be defined */
170 static const int fieldmap[TagLast] = {
171 [TagUnknown] = -1,
172 /* RSS */
173 [RSSTagDcdate] = FeedFieldTime,
174 [RSSTagPubdate] = FeedFieldTime,
175 [RSSTagTitle] = FeedFieldTitle,
176 [RSSTagMediaDescription] = FeedFieldContent,
177 [RSSTagDescription] = FeedFieldContent,
178 [RSSTagContentEncoded] = FeedFieldContent,
179 [RSSTagGuid] = -1,
180 [RSSTagGuidPermalinkFalse] = FeedFieldId,
181 [RSSTagGuidPermalinkTrue] = FeedFieldId, /* special case: both a link and an id */
182 [RSSTagLink] = FeedFieldLink,
183 [RSSTagEnclosure] = FeedFieldEnclosure,
184 [RSSTagAuthor] = FeedFieldAuthor,
185 [RSSTagDccreator] = FeedFieldAuthor,
186 [RSSTagCategory] = FeedFieldCategory,
187 /* Atom */
188 [AtomTagModified] = FeedFieldTime,
189 [AtomTagUpdated] = FeedFieldTime,
190 [AtomTagIssued] = FeedFieldTime,
191 [AtomTagPublished] = FeedFieldTime,
192 [AtomTagTitle] = FeedFieldTitle,
193 [AtomTagMediaDescription] = FeedFieldContent,
194 [AtomTagSummary] = FeedFieldContent,
195 [AtomTagContent] = FeedFieldContent,
196 [AtomTagId] = FeedFieldId,
197 [AtomTagLink] = -1,
198 [AtomTagLinkAlternate] = FeedFieldLink,
199 [AtomTagLinkEnclosure] = FeedFieldEnclosure,
200 [AtomTagAuthor] = -1,
201 [AtomTagAuthorName] = FeedFieldAuthor,
202 [AtomTagCategory] = FeedFieldCategory
203 };
204
205 static const int FieldSeparator = '\t';
206 /* separator for multiple values in a field, separator should be 1 byte */
207 static const char FieldMultiSeparator[] = "|";
208 static struct uri baseuri;
209 static const char *baseurl;
210
211 static FeedContext ctx;
212 static XMLParser parser; /* XML parser state */
213 static String attrispermalink, attrrel, attrtype, tmpstr;
214
215 /* Unique tag(id) for parsed tag name. */
216 static FeedTag *
217 gettag(enum FeedType feedtype, const char *name, size_t namelen)
218 {
219 FeedTag *r;
220 size_t i;
221
222 switch (feedtype) {
223 case FeedTypeRSS:
224 for (i = 0; i < sizeof(rsstags) / sizeof(rsstags[0]); i++) {
225 r = (FeedTag *)&rsstags[i];
226 if (r->len == namelen && !strcasecmp(r->name, name))
227 return r;
228 }
229 break;
230 case FeedTypeAtom:
231 for (i = 0; i < sizeof(atomtags) / sizeof(atomtags[0]); i++) {
232 r = (FeedTag *)&atomtags[i];
233 if (r->len == namelen && !strcasecmp(r->name, name))
234 return r;
235 }
236 break;
237 default:
238 break;
239 }
240
241 return NULL;
242 }
243
/* Return a pointer to the first non-whitespace byte of `s`. */
static char *
ltrim(const char *s)
{
	while (ISSPACE((unsigned char)*s))
		s++;
	return (char *)s;
}
251
/* Return a pointer just past the last non-whitespace byte of `s`; this is `s`
 * itself when the string is empty or has only whitespace. */
static char *
rtrim(const char *s)
{
	const char *end = s + strlen(s);

	while (end > s && ISSPACE((unsigned char)end[-1]))
		end--;
	return (char *)end;
}
261
262 /* Clear string only; don't free, prevents unnecessary reallocation. */
263 static void
264 string_clear(String *s)
265 {
266 if (s->data)
267 s->data[0] = '\0';
268 s->len = 0;
269 }
270
271 static void
272 string_buffer_realloc(String *s, size_t newlen)
273 {
274 size_t alloclen;
275
276 if (newlen > SIZE_MAX / 2) {
277 alloclen = SIZE_MAX;
278 } else {
279 for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
280 ;
281 }
282 if (!(s->data = realloc(s->data, alloclen)))
283 err(1, "realloc");
284 s->bufsiz = alloclen;
285 }
286
287 /* Append data to String, s->data and data may not overlap. */
288 static void
289 string_append(String *s, const char *data, size_t len)
290 {
291 if (!len)
292 return;
293
294 if (s->len >= SIZE_MAX - len) {
295 errno = ENOMEM;
296 err(1, "realloc");
297 }
298
299 /* check if allocation is necessary, never shrink the buffer. */
300 if (s->len + len >= s->bufsiz)
301 string_buffer_realloc(s, s->len + len + 1);
302 memcpy(s->data + s->len, data, len);
303 s->len += len;
304 s->data[s->len] = '\0';
305 }
306
307 /* Print text, encode TABs, newlines and '\', remove other whitespace.
308 * Remove leading and trailing whitespace. */
309 static void
310 string_print_encoded(String *s)
311 {
312 const char *p, *e;
313
314 if (!s->data || !s->len)
315 return;
316
317 p = ltrim(s->data);
318 e = rtrim(p);
319
320 for (; *p && p != e; p++) {
321 switch (*p) {
322 case '\n': putchar('\\'); putchar('n'); break;
323 case '\\': putchar('\\'); putchar('\\'); break;
324 case '\t': putchar('\\'); putchar('t'); break;
325 default:
326 /* ignore control chars */
327 if (!ISCNTRL((unsigned char)*p))
328 putchar(*p);
329 break;
330 }
331 }
332 }
333
/* Print `s` without leading and trailing whitespace: any whitespace in
 * between is printed as a space, other control characters are skipped. */
static void
printtrimmed(const char *s)
{
	const char *cur = ltrim(s);
	const char *end = rtrim(cur);
	unsigned char c;

	while (cur != end && *cur != '\0') {
		c = (unsigned char)*cur++;
		if (ISSPACE(c))
			putchar(' ');
		else if (!ISCNTRL(c))
			putchar(c);
	}
}
349
350 /* Print text, replace TABs, carriage return and other whitespace with ' '.
351 * Other control chars are removed. Remove leading and trailing whitespace. */
352 static void
353 string_print_trimmed(String *s)
354 {
355 if (!s->data || !s->len)
356 return;
357
358 printtrimmed(s->data);
359 }
360
361 /* Print each field with trimmed whitespace, separated by '|'. */
362 static void
363 string_print_trimmed_multi(String *s)
364 {
365 char *p, *e;
366 int c;
367
368 if (!s->data || !s->len)
369 return;
370
371 for (p = s->data; ; p = e + 1) {
372 if ((e = strstr(p, FieldMultiSeparator))) {
373 c = *e;
374 *e = '\0';
375 printtrimmed(p);
376 *e = c; /* restore NUL byte to original character */
377 fputs(FieldMultiSeparator, stdout);
378 } else {
379 printtrimmed(p);
380 break;
381 }
382 }
383 }
384
385 /* Print URL, if it is a relative URL then it uses the global `baseurl`. */
386 static void
387 printuri(char *s)
388 {
389 char link[4096], *p, *e;
390 struct uri newuri, olduri;
391 int c, r = -1;
392
393 p = ltrim(s);
394 e = rtrim(p);
395 c = *e;
396 *e = '\0';
397
398 if (baseurl && !uri_hasscheme(p) &&
399 uri_parse(p, &olduri) != -1 && !olduri.proto[0] &&
400 uri_makeabs(&newuri, &olduri, &baseuri) != -1 && newuri.proto[0])
401 r = uri_format(link, sizeof(link), &newuri);
402
403 if (r >= 0 && (size_t)r < sizeof(link))
404 printtrimmed(link);
405 else
406 printtrimmed(p);
407
408 *e = c; /* restore NUL byte to original character */
409 }
410
411 /* Print URL, if it is a relative URL then it uses the global `baseurl`. */
412 static void
413 string_print_uri(String *s)
414 {
415 if (!s->data || !s->len)
416 return;
417
418 printuri(s->data);
419 }
420
421 /* Print as UNIX timestamp, print nothing if the time is empty or invalid. */
422 static void
423 string_print_timestamp(String *s)
424 {
425 long long t;
426
427 if (!s->data || !s->len)
428 return;
429
430 if (parsetime(s->data, &t) != -1)
431 printf("%lld", t);
432 }
433
/* Convert broken-down time fields to a signed (at least) 64-bit UNIX
 * timestamp. The parameters use the conventions of struct tm and must be in a
 * valid range, that is: year = year - 1900, mon = month - 1. */
static long long
datetounix(long long year, int mon, int day, int hour, int min, int sec)
{
	/* seconds before the start of each month in a regular (non-leap) year */
	static const long month_start[] = {
		  0 * 86400,  31 * 86400,  59 * 86400,  90 * 86400,
		120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400,
		243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400
	};
	int leapyear = 0, leapdays = 0, quadcent, cent = 0, yrs;
	long long t;

	if (year - 2ULL <= 136) {
		/* fast path for the common range 1902 up to and including 2038:
		 * in this range every 4th year is a leap year */
		leapdays = (year / 4) - 17; /* 17 leap years from 1902 to 1970 */
		if ((year & 3) == 0) {
			leapdays--; /* the leap day of this year is added below */
			leapyear = 1;
		}
		t = 31536000 * (year - 70) + (86400 * leapdays); /* 365 days per year */
	} else {
		/* general case, relative to the year 2000: every 4th year is a
		 * leap year, except every 100th unless it is divisible by 400 */
		quadcent = (year - 100) / 400;
		yrs = (year - 100) % 400;
		if (yrs < 0) {
			quadcent--;
			yrs += 400;
		}
		if (yrs == 0) {
			leapyear = 1;
		} else {
			cent = yrs / 100; /* yrs is 1..399, so this is 0..3 */
			yrs %= 100;
			if (yrs != 0) {
				leapdays = yrs / 4;
				leapyear = (yrs % 4 == 0);
			}
		}
		leapdays += (97 * quadcent) + (24 * cent) - leapyear;

		/* 1970 up to 2000 has 8 leap days:
		 * ((30 * 365) + 8) * 86400 = 946771200 */
		t = ((year - 100) * 31536000LL) + (leapdays * 86400LL) + 946771200LL;
	}

	t += month_start[mon];
	if (leapyear && mon >= 2)
		t += 86400; /* past February in a leap year */
	t += 86400LL * (day - 1) + 3600LL * hour + 60LL * min + sec;

	return t;
}
504
/* Get the timezone from the string, return the time offset in seconds from
 * UTC. Accepts a numeric offset ("+hhmm", "-hh:mm") or one of the timezone
 * names below; anything else gives 0.
 * NOTE: only parses timezones in RFC 822, many other timezone names are
 * ambiguous anyway.
 * ANSI and military zones are defined wrong in RFC 822 and are unsupported,
 * see note on RFC 2822 4.3 page 32. */
static long
gettzoffset(const char *s)
{
	static const struct {
		char *name;
		int offset; /* offset from UTC in seconds */
	} zones[] = {
		{ "CDT", -5 * 3600 },
		{ "CST", -6 * 3600 },
		{ "EDT", -4 * 3600 },
		{ "EST", -5 * 3600 },
		{ "MDT", -6 * 3600 },
		{ "MST", -7 * 3600 },
		{ "PDT", -7 * 3600 },
		{ "PST", -8 * 3600 },
	};
	long hours = 0, mins = 0, secs;
	const char *p;
	size_t i, n;

	while (ISSPACE((unsigned char)*s))
		s++;

	if (*s == '-' || *s == '+') {
		/* numeric offset: up to 2 digits for the hours, an optional
		 * ':' and up to 2 digits for the minutes */
		p = s + 1;
		for (n = 0; n < 2 && ISDIGIT((unsigned char)*p); n++, p++)
			hours = (hours * 10) + (*p - '0');
		if (*p == ':')
			p++;
		for (n = 0; n < 2 && ISDIGIT((unsigned char)*p); n++, p++)
			mins = (mins * 10) + (*p - '0');
		secs = (hours * 3600) + (mins * 60);
		return (*s == '-') ? -secs : secs;
	}

	/* timezone name: must be exactly 3 letters */
	for (n = 0; ISALPHA((unsigned char)s[n]); n++)
		;
	if (n == 3) {
		for (i = 0; i < sizeof(zones) / sizeof(zones[0]); i++) {
			if (memcmp(s, zones[i].name, 3) == 0)
				return zones[i].offset;
		}
	}
	return 0;
}
555
556 /* Parse time string `s` into the UNIX timestamp `tp`.
557 * Returns 0 on success or -1 on failure. */
558 static int
559 parsetime(const char *s, long long *tp)
560 {
561 static const struct {
562 char *name;
563 int len;
564 } mons[] = {
565 { STRP("January"), },
566 { STRP("February"), },
567 { STRP("March"), },
568 { STRP("April"), },
569 { STRP("May"), },
570 { STRP("June"), },
571 { STRP("July"), },
572 { STRP("August"), },
573 { STRP("September"), },
574 { STRP("October"), },
575 { STRP("November"), },
576 { STRP("December"), },
577 };
578 int va[6] = { 0 }, i, j, v, vi;
579 size_t m;
580
581 for (; ISSPACE((unsigned char)*s); s++)
582 ;
583 if (!ISDIGIT((unsigned char)*s) && !ISALPHA((unsigned char)*s))
584 return -1;
585
586 if (ISDIGIT((unsigned char)s[0]) &&
587 ISDIGIT((unsigned char)s[1]) &&
588 ISDIGIT((unsigned char)s[2]) &&
589 ISDIGIT((unsigned char)s[3])) {
590 /* formats "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S" or "%Y%m%d%H%M%S" */
591 vi = 0;
592 } else {
593 /* format: "[%a, ]%d %b %Y %H:%M:%S" */
594 /* parse "[%a, ]%d %b %Y " part, then use time parsing as above */
595 for (; ISALPHA((unsigned char)*s); s++)
596 ;
597 for (; ISSPACE((unsigned char)*s); s++)
598 ;
599 if (*s == ',')
600 s++;
601 for (; ISSPACE((unsigned char)*s); s++)
602 ;
603 for (v = 0, i = 0; i < 2 && ISDIGIT((unsigned char)*s); s++, i++)
604 v = (v * 10) + (*s - '0');
605 va[2] = v; /* day */
606 for (; ISSPACE((unsigned char)*s); s++)
607 ;
608 /* end of word month */
609 for (j = 0; ISALPHA((unsigned char)s[j]); j++)
610 ;
611 /* check month name */
612 if (j < 3 || j > 9)
613 return -1; /* month cannot match */
614 for (m = 0; m < sizeof(mons) / sizeof(*mons); m++) {
615 /* abbreviation (3 length) or long name */
616 if ((j == 3 || j == mons[m].len) &&
617 !strncasecmp(mons[m].name, s, j)) {
618 va[1] = m + 1;
619 s += j;
620 break;
621 }
622 }
623 if (m >= 12)
624 return -1; /* no month found */
625 for (; ISSPACE((unsigned char)*s); s++)
626 ;
627 for (v = 0, i = 0; i < 4 && ISDIGIT((unsigned char)*s); s++, i++)
628 v = (v * 10) + (*s - '0');
629 /* obsolete short year: RFC 2822 4.3 */
630 if (i == 2 || i == 3)
631 v += (i == 2 && v >= 0 && v <= 49) ? 2000 : 1900;
632 va[0] = v; /* year */
633 for (; ISSPACE((unsigned char)*s); s++)
634 ;
635 /* parse only regular time part, see below */
636 vi = 3;
637 }
638
639 /* parse time parts (and possibly remaining date parts) */
640 for (; *s && vi < 6; vi++) {
641 for (i = 0, v = 0; i < ((vi == 0) ? 4 : 2) &&
642 ISDIGIT((unsigned char)*s); s++, i++) {
643 v = (v * 10) + (*s - '0');
644 }
645 va[vi] = v;
646
647 if ((vi < 2 && (*s == '-' || *s == '/')) ||
648 (vi == 2 && (*s == 'T' || *s == 't' || ISSPACE((unsigned char)*s))) ||
649 (vi > 2 && *s == ':'))
650 s++;
651 }
652
653 /* skip milliseconds in for example: "%Y-%m-%dT%H:%M:%S.000Z" */
654 if (*s == '.' || *s == ',') {
655 for (s++; ISDIGIT((unsigned char)*s); s++)
656 ;
657 }
658
659 /* invalid range */
660 if (va[0] < 0 || va[0] > 9999 ||
661 va[1] < 1 || va[1] > 12 ||
662 va[2] < 1 || va[2] > 31 ||
663 va[3] < 0 || va[3] > 23 ||
664 va[4] < 0 || va[4] > 59 ||
665 va[5] < 0 || va[5] > 60) /* allow leap second */
666 return -1;
667
668 *tp = datetounix(va[0] - 1900, va[1] - 1, va[2], va[3], va[4], va[5]) -
669 gettzoffset(s);
670
671 return 0;
672 }
673
674 static void
675 printfields(void)
676 {
677 string_print_timestamp(&ctx.fields[FeedFieldTime].str);
678 putchar(FieldSeparator);
679 string_print_trimmed(&ctx.fields[FeedFieldTitle].str);
680 putchar(FieldSeparator);
681 string_print_uri(&ctx.fields[FeedFieldLink].str);
682 putchar(FieldSeparator);
683 string_print_encoded(&ctx.fields[FeedFieldContent].str);
684 putchar(FieldSeparator);
685 fputs(contenttypes[ctx.contenttype], stdout);
686 putchar(FieldSeparator);
687 string_print_trimmed(&ctx.fields[FeedFieldId].str);
688 putchar(FieldSeparator);
689 string_print_trimmed(&ctx.fields[FeedFieldAuthor].str);
690 putchar(FieldSeparator);
691 string_print_uri(&ctx.fields[FeedFieldEnclosure].str);
692 putchar(FieldSeparator);
693 string_print_trimmed_multi(&ctx.fields[FeedFieldCategory].str);
694 putchar('\n');
695
696 if (ferror(stdout)) /* check for errors but do not flush */
697 checkfileerror(stdout, "<stdout>", 'w');
698 }
699
/* Returns 1 if the tag names have the same length and are equal when
 * compared case-insensitively, else 0. Both names must be NUL-terminated. */
static int
istag(const char *name, size_t len, const char *name2, size_t len2)
{
	if (len != len2)
		return 0;
	return strcasecmp(name, name2) == 0;
}
705
/* Returns 1 if the attribute names (or values) have the same length and are
 * equal when compared case-insensitively, else 0. Both strings must be
 * NUL-terminated. */
static int
isattr(const char *name, size_t len, const char *name2, size_t len2)
{
	return (len == len2) ? !strcasecmp(name, name2) : 0;
}
711
712 static void
713 xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
714 const char *v, size_t vl)
715 {
716 /* handles transforming inline XML to data */
717 if (ISINCONTENT(ctx)) {
718 if (ctx.contenttype == ContentTypeHTML)
719 xmldata(p, v, vl);
720 return;
721 }
722
723 if (!ctx.tag.id)
724 return;
725
726 /* content-type may be for Atom: text, xhtml, html or a mime-type.
727 * for MRSS (media:description): plain, html. */
728 if (ISCONTENTTAG(ctx)) {
729 if (isattr(n, nl, STRP("type")))
730 string_append(&attrtype, v, vl);
731 return;
732 }
733
734 if (ctx.feedtype == FeedTypeRSS) {
735 if (ctx.tag.id == RSSTagEnclosure &&
736 isattr(n, nl, STRP("url"))) {
737 string_append(&tmpstr, v, vl);
738 } else if (ctx.tag.id == RSSTagGuid &&
739 isattr(n, nl, STRP("ispermalink"))) {
740 string_append(&attrispermalink, v, vl);
741 }
742 } else if (ctx.feedtype == FeedTypeAtom) {
743 if (ctx.tag.id == AtomTagLink) {
744 if (isattr(n, nl, STRP("rel"))) {
745 string_append(&attrrel, v, vl);
746 } else if (isattr(n, nl, STRP("href"))) {
747 string_append(&tmpstr, v, vl);
748 }
749 } else if (ctx.tag.id == AtomTagCategory &&
750 isattr(n, nl, STRP("term"))) {
751 string_append(&tmpstr, v, vl);
752 }
753 }
754 }
755
756 static void
757 xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
758 const char *data, size_t datalen)
759 {
760 char buf[8];
761 int len;
762
763 /* handles transforming inline XML to data */
764 if (ISINCONTENT(ctx)) {
765 if (ctx.contenttype == ContentTypeHTML)
766 xmldata(p, data, datalen);
767 return;
768 }
769
770 if (!ctx.tag.id)
771 return;
772
773 /* try to translate entity, else just pass as data to
774 * xmlattr handler. */
775 if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0)
776 xmlattr(p, t, tl, n, nl, buf, (size_t)len);
777 else
778 xmlattr(p, t, tl, n, nl, data, datalen);
779 }
780
781 static void
782 xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl)
783 {
784 if (ISINCONTENT(ctx)) {
785 if (ctx.contenttype == ContentTypeHTML) {
786 /* handles transforming inline XML to data */
787 xmldata(p, "\"", 1);
788 ctx.attrcount = 0;
789 }
790 return;
791 }
792 }
793
794 static void
795 xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl)
796 {
797 if (ISINCONTENT(ctx)) {
798 if (ctx.contenttype == ContentTypeHTML) {
799 /* handles transforming inline XML to data */
800 if (!ctx.attrcount)
801 xmldata(p, " ", 1);
802 ctx.attrcount++;
803 xmldata(p, n, nl);
804 xmldata(p, "=\"", 2);
805 }
806 return;
807 }
808
809 if (attrispermalink.len && isattr(n, nl, STRP("ispermalink")))
810 string_clear(&attrispermalink);
811 else if (attrrel.len && isattr(n, nl, STRP("rel")))
812 string_clear(&attrrel);
813 else if (attrtype.len && isattr(n, nl, STRP("type")))
814 string_clear(&attrtype);
815 else if (tmpstr.len &&
816 (isattr(n, nl, STRP("href")) ||
817 isattr(n, nl, STRP("term")) ||
818 isattr(n, nl, STRP("url"))))
819 string_clear(&tmpstr); /* use the last value for multiple attribute values */
820 }
821
822 static void
823 xmldata(XMLParser *p, const char *s, size_t len)
824 {
825 if (!ctx.field)
826 return;
827
828 if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id]))
829 string_append(&tmpstr, s, len);
830 else
831 string_append(ctx.field, s, len);
832 }
833
834 static void
835 xmldataentity(XMLParser *p, const char *data, size_t datalen)
836 {
837 char buf[8];
838 int len;
839
840 if (!ctx.field)
841 return;
842
843 /* try to translate entity, else just pass as data to
844 * xmldata handler. */
845 if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0)
846 xmldata(p, buf, (size_t)len);
847 else
848 xmldata(p, data, datalen);
849 }
850
851 static void
852 xmltagstart(XMLParser *p, const char *t, size_t tl)
853 {
854 const FeedTag *f;
855
856 if (ISINCONTENT(ctx)) {
857 if (ctx.contenttype == ContentTypeHTML) {
858 ctx.attrcount = 0;
859 xmldata(p, "<", 1);
860 xmldata(p, t, tl);
861 }
862 return;
863 }
864
865 /* start of RSS or Atom item / entry */
866 if (ctx.feedtype == FeedTypeNone) {
867 if (istag(t, tl, STRP("entry")))
868 ctx.feedtype = FeedTypeAtom;
869 else if (istag(t, tl, STRP("item")))
870 ctx.feedtype = FeedTypeRSS;
871 return;
872 }
873
874 /* field tagid already set or nested tags. */
875 if (ctx.tag.id) {
876 /* nested <author><name> for Atom */
877 if (ctx.tag.id == AtomTagAuthor &&
878 istag(t, tl, STRP("name"))) {
879 memcpy(&(ctx.tag), &atomtagauthorname, sizeof(ctx.tag));
880 } else {
881 return; /* other nested tags are not allowed: return */
882 }
883 }
884
885 /* in item */
886 if (ctx.tag.id == TagUnknown) {
887 if (!(f = gettag(ctx.feedtype, t, tl)))
888 f = ¬ag;
889 memcpy(&(ctx.tag), f, sizeof(ctx.tag));
890 }
891
892 ctx.iscontenttag = (fieldmap[ctx.tag.id] == FeedFieldContent);
893 string_clear(&attrispermalink);
894 string_clear(&attrrel);
895 string_clear(&attrtype);
896 }
897
898 static void
899 xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
900 {
901 enum TagId tagid;
902
903 if (ISINCONTENT(ctx)) {
904 if (ctx.contenttype == ContentTypeHTML) {
905 if (isshort)
906 xmldata(p, "/>", 2);
907 else
908 xmldata(p, ">", 1);
909 }
910 return;
911 }
912
913 /* set tag type based on its attribute value */
914 if (ctx.tag.id == RSSTagGuid) {
915 /* if empty the default is "true" */
916 if (!attrispermalink.len ||
917 isattr(attrispermalink.data, attrispermalink.len, STRP("true")))
918 ctx.tag.id = RSSTagGuidPermalinkTrue;
919 else
920 ctx.tag.id = RSSTagGuidPermalinkFalse;
921 } else if (ctx.tag.id == AtomTagLink) {
922 /* empty or "alternate": other types could be
923 * "enclosure", "related", "self" or "via" */
924 if (!attrrel.len || isattr(attrrel.data, attrrel.len, STRP("alternate")))
925 ctx.tag.id = AtomTagLinkAlternate;
926 else if (isattr(attrrel.data, attrrel.len, STRP("enclosure")))
927 ctx.tag.id = AtomTagLinkEnclosure;
928 else
929 ctx.tag.id = AtomTagLink; /* unknown */
930 }
931
932 tagid = ctx.tag.id;
933
934 /* map tag type to field: unknown or lesser priority is ignored,
935 * when tags of the same type are repeated only the first is used. */
936 if (fieldmap[tagid] == -1 ||
937 (!ISFEEDFIELDMULTI(fieldmap[tagid]) &&
938 tagid <= ctx.fields[fieldmap[tagid]].tagid)) {
939 return;
940 }
941
942 if (ctx.iscontenttag) {
943 ctx.iscontent = 1;
944 ctx.iscontenttag = 0;
945
946 /* detect content-type based on type attribute */
947 if (attrtype.len) {
948 if (isattr(attrtype.data, attrtype.len, STRP("html")) ||
949 isattr(attrtype.data, attrtype.len, STRP("xhtml")) ||
950 isattr(attrtype.data, attrtype.len, STRP("text/html")) ||
951 isattr(attrtype.data, attrtype.len, STRP("text/xhtml")) ||
952 isattr(attrtype.data, attrtype.len, STRP("application/xhtml+xml")))
953 ctx.contenttype = ContentTypeHTML;
954 else /* unknown: handle as base64 text data */
955 ctx.contenttype = ContentTypePlain;
956 } else {
957 /* default content-type */
958 if (tagid == RSSTagContentEncoded || tagid == RSSTagDescription)
959 ctx.contenttype = ContentTypeHTML;
960 else
961 ctx.contenttype = ContentTypePlain;
962 }
963 }
964
965 ctx.field = &(ctx.fields[fieldmap[tagid]].str);
966 ctx.fields[fieldmap[tagid]].tagid = tagid;
967
968 /* clear field if it is overwritten (with a priority order) for the new
969 * value, if the field can have multiple values then do not clear it. */
970 if (!ISFEEDFIELDMULTI(fieldmap[ctx.tag.id]))
971 string_clear(ctx.field);
972 }
973
974 static void
975 xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
976 {
977 size_t i;
978
979 if (ctx.feedtype == FeedTypeNone)
980 return;
981
982 if (ISINCONTENT(ctx)) {
983 /* not a closed content field */
984 if (!istag(ctx.tag.name, ctx.tag.len, t, tl)) {
985 if (!isshort && ctx.contenttype == ContentTypeHTML) {
986 xmldata(p, "</", 2);
987 xmldata(p, t, tl);
988 xmldata(p, ">", 1);
989 }
990 return;
991 }
992 } else if (ctx.tag.id && istag(ctx.tag.name, ctx.tag.len, t, tl)) {
993 /* matched tag end: close it.
994 * copy also to the link field if the attribute isPermaLink="true"
995 * and it is not set by a tag with higher priority. */
996 if (ctx.tag.id == RSSTagGuidPermalinkTrue && ctx.field &&
997 ctx.tag.id > ctx.fields[FeedFieldLink].tagid) {
998 string_clear(&ctx.fields[FeedFieldLink].str);
999 string_append(&ctx.fields[FeedFieldLink].str,
1000 ctx.field->data, ctx.field->len);
1001 ctx.fields[FeedFieldLink].tagid = ctx.tag.id;
1002 }
1003 } else if (!ctx.tag.id && ((ctx.feedtype == FeedTypeAtom &&
1004 istag(t, tl, STRP("entry"))) || /* Atom */
1005 (ctx.feedtype == FeedTypeRSS &&
1006 istag(t, tl, STRP("item"))))) /* RSS */
1007 {
1008 /* end of RSS or Atom entry / item */
1009 printfields();
1010
1011 /* clear strings */
1012 for (i = 0; i < FeedFieldLast; i++) {
1013 string_clear(&ctx.fields[i].str);
1014 ctx.fields[i].tagid = TagUnknown;
1015 }
1016 ctx.contenttype = ContentTypeNone;
1017 /* allow parsing of Atom and RSS concatenated in one XML stream. */
1018 ctx.feedtype = FeedTypeNone;
1019 } else {
1020 return; /* not end of field */
1021 }
1022
1023 /* temporary string: for fields that cannot be processed
1024 * directly and need more context, for example by its tag
1025 * attributes, like the Atom link rel="alternate|enclosure". */
1026 if (tmpstr.len && ctx.field) {
1027 if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) {
1028 if (ctx.field->len)
1029 string_append(ctx.field, FieldMultiSeparator, 1);
1030 string_append(ctx.field, tmpstr.data, tmpstr.len);
1031 } else {
1032 string_clear(ctx.field);
1033 string_append(ctx.field, tmpstr.data, tmpstr.len);
1034 }
1035 }
1036
1037 /* close field */
1038 string_clear(&tmpstr); /* reuse and clear temporary string */
1039
1040 if (ctx.tag.id == AtomTagAuthorName)
1041 memcpy(&(ctx.tag), &atomtagauthor, sizeof(ctx.tag)); /* outer tag */
1042 else
1043 memcpy(&(ctx.tag), ¬ag, sizeof(ctx.tag));
1044
1045 ctx.iscontent = 0;
1046 ctx.field = NULL;
1047 }
1048
1049 int
1050 main(int argc, char *argv[])
1051 {
1052 if (pledge("stdio", NULL) == -1)
1053 err(1, "pledge");
1054
1055 if (argc > 1) {
1056 if (uri_parse(argv[1], &baseuri) != -1 && baseuri.proto[0])
1057 baseurl = argv[1];
1058 else
1059 errx(1, "baseurl incorrect or too long");
1060 }
1061
1062 memcpy(&(ctx.tag), ¬ag, sizeof(ctx.tag));
1063
1064 parser.xmlattr = xmlattr;
1065 parser.xmlattrentity = xmlattrentity;
1066 parser.xmlattrend = xmlattrend;
1067 parser.xmlattrstart = xmlattrstart;
1068 parser.xmlcdata = xmldata;
1069 parser.xmldata = xmldata;
1070 parser.xmldataentity = xmldataentity;
1071 parser.xmltagend = xmltagend;
1072 parser.xmltagstart = xmltagstart;
1073 parser.xmltagstartparsed = xmltagstartparsed;
1074
1075 /* NOTE: GETNEXT is defined in xml.h for inline optimization */
1076 xml_parse(&parser);
1077
1078 checkfileerror(stdin, "<stdin>", 'r');
1079 checkfileerror(stdout, "<stdout>", 'w');
1080
1081 return 0;
1082 }