codemadness.org

       sfeed.c - sfeed - RSS and Atom parser
 (HTM) git clone git://git.codemadness.org/sfeed
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       sfeed.c (30132B)
       ---
            1 #include <errno.h>
            2 #include <stdint.h>
            3 #include <stdio.h>
            4 #include <stdlib.h>
            5 #include <string.h>
            6 #include <strings.h>
            7 
            8 #include "util.h"
            9 #include "xml.h"
           10 
           /* nonzero when the context is in content data and not in a content tag */
           #define ISINCONTENT(ctx)  (!((ctx).iscontenttag) && (ctx).iscontent)
           /* nonzero when the context is in a content tag and not in content data */
           #define ISCONTENTTAG(ctx) ((ctx).iscontenttag && !((ctx).iscontent))

           /* feed fields that may hold multiple separated values */
           #define ISFEEDFIELDMULTI(t) (FeedFieldCategory == (t))

           /* expands a string literal to two arguments: the string, its byte-length */
           #define STRP(s)           s, sizeof(s) - 1
           19 
           /* feed formats; FeedTypeNone is the zero value */
           enum FeedType {
                   FeedTypeNone = 0,
                   FeedTypeRSS,
                   FeedTypeAtom
           };
           25 
           /* content-type of the item content */
           enum ContentType {
                   ContentTypeNone = 0,
                   ContentTypePlain,
                   ContentTypeHTML
           };
           /* name of each ContentType, indexed by the enum value */
           static const char *contenttypes[] = {
                   [ContentTypeNone]  = "",
                   [ContentTypePlain] = "plain",
                   [ContentTypeHTML]  = "html"
           };
           32 
           /* Growable string: the buffer is kept between uses and only grows. */
           typedef struct string {
                   char   *data;   /* NUL-terminated text, NULL until first allocation */
                   size_t  len;    /* length of the text in bytes, without the NUL */
                   size_t  bufsiz; /* number of bytes allocated for `data` */
           } String;
           39 
           /* NOTE: for tags that fill the same field (content, date, author) the
            *       order below is the priority in which they are used, from the
            *       least important to the most important. */
           enum TagId {
                   TagUnknown = 0,

                   /* RSS */
                   /* dates: the creation date has the higher priority */
                   RSSTagDcdate,
                   RSSTagPubdate,
                   RSSTagTitle,
                   /* content */
                   RSSTagMediaDescription,
                   RSSTagDescription,
                   RSSTagContentEncoded,
                   RSSTagGuid,
                   RSSTagGuidPermalinkFalse,
                   RSSTagGuidPermalinkTrue,
                   /* must be defined after GUID, because it can be a link (isPermaLink) */
                   RSSTagLink,
                   RSSTagEnclosure,
                   /* author */
                   RSSTagAuthor,
                   RSSTagDccreator,
                   RSSTagCategory,

                   /* Atom */
                   /* dates: the creation date has the higher priority */
                   AtomTagModified,
                   AtomTagUpdated,
                   AtomTagIssued,
                   AtomTagPublished,
                   AtomTagTitle,
                   /* content */
                   AtomTagMediaDescription,
                   AtomTagSummary,
                   AtomTagContent,
                   AtomTagId,
                   AtomTagLink,
                   AtomTagLinkAlternate,
                   AtomTagLinkEnclosure,
                   /* author */
                   AtomTagAuthor,
                   AtomTagAuthorName,
                   AtomTagCategory,

                   TagLast /* number of tag ids, not a tag itself */
           };
           69 
           70 typedef struct feedtag {
           71         char       *name; /* name of tag to match */
           72         size_t      len;  /* len of `name` */
           73         enum TagId  id;   /* unique ID */
           74 } FeedTag;
           75 
           76 typedef struct field {
           77         String     str;
           78         enum TagId tagid; /* tagid set previously, used for tag priority */
           79 } FeedField;
           80 
           /* indices into the `fields` array of FeedContext */
           enum {
                   FeedFieldTime = 0,
                   FeedFieldTitle,
                   FeedFieldLink,
                   FeedFieldContent,
                   FeedFieldId,
                   FeedFieldAuthor,
                   FeedFieldEnclosure,
                   FeedFieldCategory,
                   FeedFieldLast /* number of fields */
           };
           86 
           87 typedef struct feedcontext {
           88         String          *field;        /* current FeedItem field String */
           89         FeedField        fields[FeedFieldLast]; /* data for current item */
           90         FeedTag          tag;          /* unique current parsed tag */
           91         int              iscontent;    /* in content data */
           92         int              iscontenttag; /* in content tag */
           93         enum ContentType contenttype;  /* content-type for item */
           94         enum FeedType    feedtype;
           95         int              attrcount;    /* count item HTML element attributes */
           96 } FeedContext;
           97 
           98 static long long datetounix(long long, int, int, int, int, int);
           99 static FeedTag * gettag(enum FeedType, const char *, size_t);
          100 static long gettzoffset(const char *);
          101 static int  isattr(const char *, size_t, const char *, size_t);
          102 static int  istag(const char *, size_t, const char *, size_t);
          103 static int  parsetime(const char *, long long *);
          104 static void printfields(void);
          105 static void string_append(String *, const char *, size_t);
          106 static void string_buffer_realloc(String *, size_t);
          107 static void string_clear(String *);
          108 static void string_print_encoded(String *);
          109 static void string_print_timestamp(String *);
          110 static void string_print_trimmed(String *);
          111 static void string_print_trimmed_multi(String *);
          112 static void string_print_uri(String *);
          113 static void xmlattr(XMLParser *, const char *, size_t, const char *, size_t,
          114                     const char *, size_t);
          115 static void xmlattrentity(XMLParser *, const char *, size_t, const char *,
          116                           size_t, const char *, size_t);
          117 static void xmlattrend(XMLParser *, const char *, size_t, const char *,
          118                        size_t);
          119 static void xmlattrstart(XMLParser *, const char *, size_t, const char *,
          120                          size_t);
          121 static void xmldata(XMLParser *, const char *, size_t);
          122 static void xmldataentity(XMLParser *, const char *, size_t);
          123 static void xmltagend(XMLParser *, const char *, size_t, int);
          124 static void xmltagstart(XMLParser *, const char *, size_t);
          125 static void xmltagstartparsed(XMLParser *, const char *, size_t, int);
          126 
          127 /* map tag name to TagId type */
          128 /* RSS, keep this in alphabetical order */
          129 static const FeedTag rsstags[] = {
          130         { STRP("author"),            RSSTagAuthor            },
          131         { STRP("category"),          RSSTagCategory          },
          132         { STRP("content:encoded"),   RSSTagContentEncoded    },
          133         { STRP("dc:creator"),        RSSTagDccreator         },
          134         { STRP("dc:date"),           RSSTagDcdate            },
          135         { STRP("description"),       RSSTagDescription       },
          136         /* RSS: <enclosure url="" />, Atom has <link rel="enclosure" /> */
          137         { STRP("enclosure"),         RSSTagEnclosure         },
          138         { STRP("guid"),              RSSTagGuid              },
          139         { STRP("link"),              RSSTagLink              },
          140         { STRP("media:description"), RSSTagMediaDescription  },
          141         { STRP("pubdate"),           RSSTagPubdate           },
          142         { STRP("title"),             RSSTagTitle             }
          143 };
          144 
          145 /* Atom, keep this in alphabetical order */
          146 static const FeedTag atomtags[] = {
          147         { STRP("author"),            AtomTagAuthor           },
          148         { STRP("category"),          AtomTagCategory         },
          149         { STRP("content"),           AtomTagContent          },
          150         { STRP("id"),                AtomTagId               },
          151         { STRP("issued"),            AtomTagIssued           }, /* Atom 0.3 */
          152         /* Atom: <link href="" />, RSS has <link></link> */
          153         { STRP("link"),              AtomTagLink             },
          154         { STRP("media:description"), AtomTagMediaDescription },
          155         { STRP("modified"),          AtomTagModified         }, /* Atom 0.3 */
          156         { STRP("published"),         AtomTagPublished        },
          157         { STRP("summary"),           AtomTagSummary          },
          158         { STRP("title"),             AtomTagTitle            },
          159         { STRP("updated"),           AtomTagUpdated          }
          160 };
          161 
          162 /* special case: nested <author><name> */
          163 static const FeedTag atomtagauthor = { STRP("author"), AtomTagAuthor };
          164 static const FeedTag atomtagauthorname = { STRP("name"), AtomTagAuthorName };
          165 
          166 /* reference to no / unknown tag */
          167 static const FeedTag notag = { STRP(""), TagUnknown };
          168 
          169 /* map TagId type to RSS/Atom field, all tags must be defined */
          170 static const int fieldmap[TagLast] = {
          171         [TagUnknown]               = -1,
          172         /* RSS */
          173         [RSSTagDcdate]             = FeedFieldTime,
          174         [RSSTagPubdate]            = FeedFieldTime,
          175         [RSSTagTitle]              = FeedFieldTitle,
          176         [RSSTagMediaDescription]   = FeedFieldContent,
          177         [RSSTagDescription]        = FeedFieldContent,
          178         [RSSTagContentEncoded]     = FeedFieldContent,
          179         [RSSTagGuid]               = -1,
          180         [RSSTagGuidPermalinkFalse] = FeedFieldId,
          181         [RSSTagGuidPermalinkTrue]  = FeedFieldId, /* special case: both a link and an id */
          182         [RSSTagLink]               = FeedFieldLink,
          183         [RSSTagEnclosure]          = FeedFieldEnclosure,
          184         [RSSTagAuthor]             = FeedFieldAuthor,
          185         [RSSTagDccreator]          = FeedFieldAuthor,
          186         [RSSTagCategory]           = FeedFieldCategory,
          187         /* Atom */
          188         [AtomTagModified]          = FeedFieldTime,
          189         [AtomTagUpdated]           = FeedFieldTime,
          190         [AtomTagIssued]            = FeedFieldTime,
          191         [AtomTagPublished]         = FeedFieldTime,
          192         [AtomTagTitle]             = FeedFieldTitle,
          193         [AtomTagMediaDescription]  = FeedFieldContent,
          194         [AtomTagSummary]           = FeedFieldContent,
          195         [AtomTagContent]           = FeedFieldContent,
          196         [AtomTagId]                = FeedFieldId,
          197         [AtomTagLink]              = -1,
          198         [AtomTagLinkAlternate]     = FeedFieldLink,
          199         [AtomTagLinkEnclosure]     = FeedFieldEnclosure,
          200         [AtomTagAuthor]            = -1,
          201         [AtomTagAuthorName]        = FeedFieldAuthor,
          202         [AtomTagCategory]          = FeedFieldCategory
          203 };
          204 
          205 static const int FieldSeparator = '\t';
          206 /* separator for multiple values in a field, separator should be 1 byte */
          207 static const char FieldMultiSeparator[] = "|";
          208 static struct uri baseuri;
          209 static const char *baseurl;
          210 
          211 static FeedContext ctx;
          212 static XMLParser parser; /* XML parser state */
          213 static String attrispermalink, attrrel, attrtype, tmpstr;
          214 
          215 /* Unique tag(id) for parsed tag name. */
          216 static FeedTag *
          217 gettag(enum FeedType feedtype, const char *name, size_t namelen)
          218 {
          219         FeedTag *r;
          220         size_t i;
          221 
          222         switch (feedtype) {
          223         case FeedTypeRSS:
          224                 for (i = 0; i < sizeof(rsstags) / sizeof(rsstags[0]); i++) {
          225                         r = (FeedTag *)&rsstags[i];
          226                         if (r->len == namelen && !strcasecmp(r->name, name))
          227                                 return r;
          228                 }
          229                 break;
          230         case FeedTypeAtom:
          231                 for (i = 0; i < sizeof(atomtags) / sizeof(atomtags[0]); i++) {
          232                         r = (FeedTag *)&atomtags[i];
          233                         if (r->len == namelen && !strcasecmp(r->name, name))
          234                                 return r;
          235                 }
          236                 break;
          237         default:
          238                 break;
          239         }
          240 
          241         return NULL;
          242 }
          243 
          /* Return a pointer to the first non-whitespace byte of `s` (this is the
           * terminating NUL byte when `s` holds only whitespace). */
          static char *
          ltrim(const char *s)
          {
                  while (ISSPACE((unsigned char)*s))
                          s++;

                  return (char *)s;
          }
          251 
          /* Return a pointer just past the last non-whitespace byte of `s`, that
           * is: to the start of the trailing whitespace or to the NUL byte. */
          static char *
          rtrim(const char *s)
          {
                  const char *end = s + strlen(s);

                  while (end > s && ISSPACE((unsigned char)end[-1]))
                          end--;

                  return (char *)end;
          }
          261 
          262 /* Clear string only; don't free, prevents unnecessary reallocation. */
          263 static void
          264 string_clear(String *s)
          265 {
          266         if (s->data)
          267                 s->data[0] = '\0';
          268         s->len = 0;
          269 }
          270 
          271 static void
          272 string_buffer_realloc(String *s, size_t newlen)
          273 {
          274         size_t alloclen;
          275 
          276         if (newlen > SIZE_MAX / 2) {
          277                 alloclen = SIZE_MAX;
          278         } else {
          279                 for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
          280                         ;
          281         }
          282         if (!(s->data = realloc(s->data, alloclen)))
          283                 err(1, "realloc");
          284         s->bufsiz = alloclen;
          285 }
          286 
          287 /* Append data to String, s->data and data may not overlap. */
          288 static void
          289 string_append(String *s, const char *data, size_t len)
          290 {
          291         if (!len)
          292                 return;
          293 
          294         if (s->len >= SIZE_MAX - len) {
          295                 errno = ENOMEM;
          296                 err(1, "realloc");
          297         }
          298 
          299         /* check if allocation is necessary, never shrink the buffer. */
          300         if (s->len + len >= s->bufsiz)
          301                 string_buffer_realloc(s, s->len + len + 1);
          302         memcpy(s->data + s->len, data, len);
          303         s->len += len;
          304         s->data[s->len] = '\0';
          305 }
          306 
          307 /* Print text, encode TABs, newlines and '\', remove other whitespace.
          308  * Remove leading and trailing whitespace. */
          309 static void
          310 string_print_encoded(String *s)
          311 {
          312         const char *p, *e;
          313 
          314         if (!s->data || !s->len)
          315                 return;
          316 
          317         p = ltrim(s->data);
          318         e = rtrim(p);
          319 
          320         for (; *p && p != e; p++) {
          321                 switch (*p) {
          322                 case '\n': putchar('\\'); putchar('n'); break;
          323                 case '\\': putchar('\\'); putchar('\\'); break;
          324                 case '\t': putchar('\\'); putchar('t'); break;
          325                 default:
          326                         /* ignore control chars */
          327                         if (!ISCNTRL((unsigned char)*p))
          328                                 putchar(*p);
          329                         break;
          330                 }
          331         }
          332 }
          333 
          /* Print `s` without leading and trailing whitespace: each whitespace
           * character in between becomes a space and other control characters
           * are dropped. */
          static void
          printtrimmed(const char *s)
          {
                  const char *cur, *end;
                  unsigned char ch;

                  cur = ltrim(s);
                  end = rtrim(cur);

                  for (; cur != end && *cur != '\0'; cur++) {
                          ch = (unsigned char)*cur;

                          if (ISSPACE(ch)) {
                                  putchar(' '); /* any whitespace to space */
                                  continue;
                          }
                          if (ISCNTRL(ch))
                                  continue; /* ignore other control chars */
                          putchar(ch);
                  }
          }
          349 
          350 /* Print text, replace TABs, carriage return and other whitespace with ' '.
          351  * Other control chars are removed. Remove leading and trailing whitespace. */
          352 static void
          353 string_print_trimmed(String *s)
          354 {
          355         if (!s->data || !s->len)
          356                 return;
          357 
          358         printtrimmed(s->data);
          359 }
          360 
          361 /* Print each field with trimmed whitespace, separated by '|'. */
          362 static void
          363 string_print_trimmed_multi(String *s)
          364 {
          365         char *p, *e;
          366         int c;
          367 
          368         if (!s->data || !s->len)
          369                 return;
          370 
          371         for (p = s->data; ; p = e + 1) {
          372                 if ((e = strstr(p, FieldMultiSeparator))) {
          373                         c = *e;
          374                         *e = '\0';
          375                         printtrimmed(p);
          376                         *e = c; /* restore NUL byte to original character */
          377                         fputs(FieldMultiSeparator, stdout);
          378                 } else {
          379                         printtrimmed(p);
          380                         break;
          381                 }
          382         }
          383 }
          384 
          385 /* Print URL, if it is a relative URL then it uses the global `baseurl`. */
          386 static void
          387 printuri(char *s)
          388 {
          389         char link[4096], *p, *e;
          390         struct uri newuri, olduri;
          391         int c, r = -1;
          392 
          393         p = ltrim(s);
          394         e = rtrim(p);
          395         c = *e;
          396         *e = '\0';
          397 
          398         if (baseurl && !uri_hasscheme(p) &&
          399             uri_parse(p, &olduri) != -1 && !olduri.proto[0] &&
          400             uri_makeabs(&newuri, &olduri, &baseuri) != -1 && newuri.proto[0])
          401                 r = uri_format(link, sizeof(link), &newuri);
          402 
          403         if (r >= 0 && (size_t)r < sizeof(link))
          404                 printtrimmed(link);
          405         else
          406                 printtrimmed(p);
          407 
          408         *e = c; /* restore NUL byte to original character */
          409 }
          410 
          411 /* Print URL, if it is a relative URL then it uses the global `baseurl`. */
          412 static void
          413 string_print_uri(String *s)
          414 {
          415         if (!s->data || !s->len)
          416                 return;
          417 
          418         printuri(s->data);
          419 }
          420 
          421 /* Print as UNIX timestamp, print nothing if the time is empty or invalid. */
          422 static void
          423 string_print_timestamp(String *s)
          424 {
          425         long long t;
          426 
          427         if (!s->data || !s->len)
          428                 return;
          429 
          430         if (parsetime(s->data, &t) != -1)
          431                 printf("%lld", t);
          432 }
          433 
          /* Convert broken-down time fields to a signed (at least) 64-bit UNIX
           * timestamp. The fields use the struct tm conventions and must be in a
           * valid range: `year` is the year minus 1900, `mon` is 0 up to and
           * including 11, `day` is the day of the month starting at 1. */
          static long long
          datetounix(long long year, int mon, int day, int hour, int min, int sec)
          {
                  /* seconds from the start of a regular (non-leap) year up to the
                   * start of each month */
                  static const long secs_through_month[] = {
                          0,
                          31 * 86400,
                          59 * 86400,
                          90 * 86400,
                          120 * 86400,
                          151 * 86400,
                          181 * 86400,
                          212 * 86400,
                          243 * 86400,
                          273 * 86400,
                          304 * 86400,
                          334 * 86400
                  };
                  int is_leap = 0, cycles, centuries = 0, leaps = 0, rem;
                  long long t;

                  if (year - 2ULL <= 136) {
                          /* fast path for the common years 1902 up to and including
                           * 2038: in this range every 4th year is a leap year.
                           * The 17 is the offset of leap years for 1902 - 1970. The
                           * leap day of the year itself is not counted here, it is
                           * added below depending on the month. */
                          is_leap = !(year & 3);
                          leaps = (year / 4) - 17 - is_leap;
                          /* 365 * 86400 = 31536000 */
                          t = 31536000 * (year - 70) + (86400 * leaps);
                  } else {
                          /* general case, relative to the year 2000: a leap year
                           * occurs every 4 years, except every 100 years, unless the
                           * year is divisible by 400 */
                          cycles = (year - 100) / 400;
                          rem = (year - 100) % 400;
                          if (rem < 0) {
                                  /* round towards negative infinity */
                                  cycles--;
                                  rem += 400;
                          }

                          if (rem == 0) {
                                  is_leap = 1; /* divisible by 400 */
                          } else {
                                  /* split into whole centuries and years within it */
                                  centuries = rem / 100;
                                  rem -= centuries * 100;
                                  if (rem != 0) {
                                          leaps = rem / 4;
                                          is_leap = (rem % 4) == 0;
                                  }
                          }
                          /* 97 leap days per 400 years, 24 per century */
                          leaps += (97 * cycles) + (24 * centuries) - is_leap;

                          /* adjust 8 leap days from 1970 up to and including 2000:
                           * ((30 * 365) + 8) * 86400 = 946771200 */
                          t = ((year - 100) * 31536000LL) + (leaps * 86400LL) + 946771200LL;
                  }

                  t += secs_through_month[mon];
                  /* the leap day counts from March on */
                  if (is_leap && mon >= 2)
                          t += 86400;
                  t += 86400LL * (day - 1);
                  t += 3600LL * hour;
                  t += 60LL * min;
                  t += sec;

                  return t;
          }
          504 
          /* Get the timezone at the start of `s` (leading whitespace is skipped)
           * and return its offset from UTC in seconds. Supported are a numeric
           * offset such as "+0100" or "-05:30" and the timezone names listed
           * below; anything else gives 0.
           * NOTE: only the timezone names in RFC 822 are parsed, many other
           * timezone names are ambiguous anyway.
           * ANSI and military zones are defined wrong in RFC 822 and are
           * unsupported, see note on RFC 2822 4.3 page 32. */
          static long
          gettzoffset(const char *s)
          {
                  static const struct {
                          char *name;
                          int offset; /* in seconds */
                  } tzones[] = {
                          { "CDT", -5 * 3600 },
                          { "CST", -6 * 3600 },
                          { "EDT", -4 * 3600 },
                          { "EST", -5 * 3600 },
                          { "MDT", -6 * 3600 },
                          { "MST", -7 * 3600 },
                          { "PDT", -7 * 3600 },
                          { "PST", -8 * 3600 },
                  };
                  const char *p;
                  long hours = 0, mins = 0, secs;
                  size_t i, n;

                  while (ISSPACE((unsigned char)*s))
                          s++;

                  if (*s == '+' || *s == '-') {
                          /* numeric offset: at most 2 digits for the hours, an
                           * optional ':' and at most 2 digits for the minutes */
                          p = s + 1;
                          for (n = 0; n < 2 && ISDIGIT((unsigned char)*p); n++, p++)
                                  hours = (hours * 10) + (*p - '0');
                          if (*p == ':')
                                  p++;
                          for (n = 0; n < 2 && ISDIGIT((unsigned char)*p); n++, p++)
                                  mins = (mins * 10) + (*p - '0');

                          secs = (hours * 3600) + (mins * 60);
                          return (*s == '-') ? -secs : secs;
                  }

                  /* timezone name: it must be exactly 3 letters */
                  n = 0;
                  while (ISALPHA((unsigned char)s[n]))
                          n++;
                  if (n != 3)
                          return 0;

                  /* compare timezone and adjust offset relative to UTC */
                  for (i = 0; i < sizeof(tzones) / sizeof(*tzones); i++) {
                          if (memcmp(s, tzones[i].name, 3) == 0)
                                  return tzones[i].offset;
                  }

                  return 0;
          }
          555 
          556 /* Parse time string `s` into the UNIX timestamp `tp`.
          557  * Returns 0 on success or -1 on failure. */
          558 static int
          559 parsetime(const char *s, long long *tp)
          560 {
          561         static const struct {
          562                 char *name;
          563                 int len;
          564         } mons[] = {
          565                 { STRP("January"),   },
          566                 { STRP("February"),  },
          567                 { STRP("March"),     },
          568                 { STRP("April"),     },
          569                 { STRP("May"),       },
          570                 { STRP("June"),      },
          571                 { STRP("July"),      },
          572                 { STRP("August"),    },
          573                 { STRP("September"), },
          574                 { STRP("October"),   },
          575                 { STRP("November"),  },
          576                 { STRP("December"),  },
          577         };
          578         int va[6] = { 0 }, i, j, v, vi;
          579         size_t m;
          580 
          581         for (; ISSPACE((unsigned char)*s); s++)
          582                 ;
          583         if (!ISDIGIT((unsigned char)*s) && !ISALPHA((unsigned char)*s))
          584                 return -1;
          585 
          586         if (ISDIGIT((unsigned char)s[0]) &&
          587             ISDIGIT((unsigned char)s[1]) &&
          588             ISDIGIT((unsigned char)s[2]) &&
          589             ISDIGIT((unsigned char)s[3])) {
          590                 /* formats "%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S" or "%Y%m%d%H%M%S" */
          591                 vi = 0;
          592         } else {
          593                 /* format: "[%a, ]%d %b %Y %H:%M:%S" */
          594                 /* parse "[%a, ]%d %b %Y " part, then use time parsing as above */
          595                 for (; ISALPHA((unsigned char)*s); s++)
          596                         ;
          597                 for (; ISSPACE((unsigned char)*s); s++)
          598                         ;
          599                 if (*s == ',')
          600                         s++;
          601                 for (; ISSPACE((unsigned char)*s); s++)
          602                         ;
          603                 for (v = 0, i = 0; i < 2 && ISDIGIT((unsigned char)*s); s++, i++)
          604                         v = (v * 10) + (*s - '0');
          605                 va[2] = v; /* day */
          606                 for (; ISSPACE((unsigned char)*s); s++)
          607                         ;
          608                 /* end of word month */
          609                 for (j = 0; ISALPHA((unsigned char)s[j]); j++)
          610                         ;
          611                 /* check month name */
          612                 if (j < 3 || j > 9)
          613                         return -1; /* month cannot match */
          614                 for (m = 0; m < sizeof(mons) / sizeof(*mons); m++) {
          615                         /* abbreviation (3 length) or long name */
          616                         if ((j == 3 || j == mons[m].len) &&
          617                             !strncasecmp(mons[m].name, s, j)) {
          618                                 va[1] = m + 1;
          619                                 s += j;
          620                                 break;
          621                         }
          622                 }
          623                 if (m >= 12)
          624                         return -1; /* no month found */
          625                 for (; ISSPACE((unsigned char)*s); s++)
          626                         ;
          627                 for (v = 0, i = 0; i < 4 && ISDIGIT((unsigned char)*s); s++, i++)
          628                         v = (v * 10) + (*s - '0');
          629                 /* obsolete short year: RFC 2822 4.3 */
          630                 if (i == 2 || i == 3)
          631                         v += (i == 2 && v >= 0 && v <= 49) ? 2000 : 1900;
          632                 va[0] = v; /* year */
          633                 for (; ISSPACE((unsigned char)*s); s++)
          634                         ;
          635                 /* parse only regular time part, see below */
          636                 vi = 3;
          637         }
          638 
          639         /* parse time parts (and possibly remaining date parts) */
          640         for (; *s && vi < 6; vi++) {
          641                 for (i = 0, v = 0; i < ((vi == 0) ? 4 : 2) &&
          642                                    ISDIGIT((unsigned char)*s); s++, i++) {
          643                         v = (v * 10) + (*s - '0');
          644                 }
          645                 va[vi] = v;
          646 
          647                 if ((vi < 2 && (*s == '-' || *s == '/')) ||
          648                     (vi == 2 && (*s == 'T' || *s == 't' || ISSPACE((unsigned char)*s))) ||
          649                     (vi > 2 && *s == ':'))
          650                         s++;
          651         }
          652 
          653         /* skip milliseconds in for example: "%Y-%m-%dT%H:%M:%S.000Z" */
          654         if (*s == '.' || *s == ',') {
          655                 for (s++; ISDIGIT((unsigned char)*s); s++)
          656                         ;
          657         }
          658 
          659         /* invalid range */
          660         if (va[0] < 0 || va[0] > 9999 ||
          661             va[1] < 1 || va[1] > 12 ||
          662             va[2] < 1 || va[2] > 31 ||
          663             va[3] < 0 || va[3] > 23 ||
          664             va[4] < 0 || va[4] > 59 ||
          665             va[5] < 0 || va[5] > 60) /* allow leap second */
          666                 return -1;
          667 
          668         *tp = datetounix(va[0] - 1900, va[1] - 1, va[2], va[3], va[4], va[5]) -
          669               gettzoffset(s);
          670 
          671         return 0;
          672 }
          673 
          674 static void
          675 printfields(void)
          676 {
          677         string_print_timestamp(&ctx.fields[FeedFieldTime].str);
          678         putchar(FieldSeparator);
          679         string_print_trimmed(&ctx.fields[FeedFieldTitle].str);
          680         putchar(FieldSeparator);
          681         string_print_uri(&ctx.fields[FeedFieldLink].str);
          682         putchar(FieldSeparator);
          683         string_print_encoded(&ctx.fields[FeedFieldContent].str);
          684         putchar(FieldSeparator);
          685         fputs(contenttypes[ctx.contenttype], stdout);
          686         putchar(FieldSeparator);
          687         string_print_trimmed(&ctx.fields[FeedFieldId].str);
          688         putchar(FieldSeparator);
          689         string_print_trimmed(&ctx.fields[FeedFieldAuthor].str);
          690         putchar(FieldSeparator);
          691         string_print_uri(&ctx.fields[FeedFieldEnclosure].str);
          692         putchar(FieldSeparator);
          693         string_print_trimmed_multi(&ctx.fields[FeedFieldCategory].str);
          694         putchar('\n');
          695 
          696         if (ferror(stdout)) /* check for errors but do not flush */
          697                 checkfileerror(stdout, "<stdout>", 'w');
          698 }
          699 
          700 static int
          701 istag(const char *name, size_t len, const char *name2, size_t len2)
          702 {
          703         return (len == len2 && !strcasecmp(name, name2));
          704 }
          705 
          706 static int
          707 isattr(const char *name, size_t len, const char *name2, size_t len2)
          708 {
          709         return (len == len2 && !strcasecmp(name, name2));
          710 }
          711 
          712 static void
          713 xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
          714         const char *v, size_t vl)
          715 {
          716         /* handles transforming inline XML to data */
          717         if (ISINCONTENT(ctx)) {
          718                 if (ctx.contenttype == ContentTypeHTML)
          719                         xmldata(p, v, vl);
          720                 return;
          721         }
          722 
          723         if (!ctx.tag.id)
          724                 return;
          725 
          726         /* content-type may be for Atom: text, xhtml, html or a mime-type.
          727          * for MRSS (media:description): plain, html. */
          728         if (ISCONTENTTAG(ctx)) {
          729                 if (isattr(n, nl, STRP("type")))
          730                         string_append(&attrtype, v, vl);
          731                 return;
          732         }
          733 
          734         if (ctx.feedtype == FeedTypeRSS) {
          735                 if (ctx.tag.id == RSSTagEnclosure &&
          736                     isattr(n, nl, STRP("url"))) {
          737                         string_append(&tmpstr, v, vl);
          738                 } else if (ctx.tag.id == RSSTagGuid &&
          739                            isattr(n, nl, STRP("ispermalink"))) {
          740                         string_append(&attrispermalink, v, vl);
          741                 }
          742         } else if (ctx.feedtype == FeedTypeAtom) {
          743                 if (ctx.tag.id == AtomTagLink) {
          744                         if (isattr(n, nl, STRP("rel"))) {
          745                                 string_append(&attrrel, v, vl);
          746                         } else if (isattr(n, nl, STRP("href"))) {
          747                                 string_append(&tmpstr, v, vl);
          748                         }
          749                 } else if (ctx.tag.id == AtomTagCategory &&
          750                            isattr(n, nl, STRP("term"))) {
          751                         string_append(&tmpstr, v, vl);
          752                 }
          753         }
          754 }
          755 
          756 static void
          757 xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
          758               const char *data, size_t datalen)
          759 {
          760         char buf[8];
          761         int len;
          762 
          763         /* handles transforming inline XML to data */
          764         if (ISINCONTENT(ctx)) {
          765                 if (ctx.contenttype == ContentTypeHTML)
          766                         xmldata(p, data, datalen);
          767                 return;
          768         }
          769 
          770         if (!ctx.tag.id)
          771                 return;
          772 
          773         /* try to translate entity, else just pass as data to
          774          * xmlattr handler. */
          775         if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0)
          776                 xmlattr(p, t, tl, n, nl, buf, (size_t)len);
          777         else
          778                 xmlattr(p, t, tl, n, nl, data, datalen);
          779 }
          780 
          781 static void
          782 xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl)
          783 {
          784         if (ISINCONTENT(ctx)) {
          785                 if (ctx.contenttype == ContentTypeHTML) {
          786                         /* handles transforming inline XML to data */
          787                         xmldata(p, "\"", 1);
          788                         ctx.attrcount = 0;
          789                 }
          790                 return;
          791         }
          792 }
          793 
          794 static void
          795 xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl)
          796 {
          797         if (ISINCONTENT(ctx)) {
          798                 if (ctx.contenttype == ContentTypeHTML) {
          799                         /* handles transforming inline XML to data */
          800                         if (!ctx.attrcount)
          801                                 xmldata(p, " ", 1);
          802                         ctx.attrcount++;
          803                         xmldata(p, n, nl);
          804                         xmldata(p, "=\"", 2);
          805                 }
          806                 return;
          807         }
          808 
          809         if (attrispermalink.len && isattr(n, nl, STRP("ispermalink")))
          810                 string_clear(&attrispermalink);
          811         else if (attrrel.len && isattr(n, nl, STRP("rel")))
          812                 string_clear(&attrrel);
          813         else if (attrtype.len && isattr(n, nl, STRP("type")))
          814                 string_clear(&attrtype);
          815         else if (tmpstr.len &&
          816             (isattr(n, nl, STRP("href")) ||
          817              isattr(n, nl, STRP("term")) ||
          818              isattr(n, nl, STRP("url"))))
          819                 string_clear(&tmpstr); /* use the last value for multiple attribute values */
          820 }
          821 
          822 static void
          823 xmldata(XMLParser *p, const char *s, size_t len)
          824 {
          825         if (!ctx.field)
          826                 return;
          827 
          828         if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id]))
          829                 string_append(&tmpstr, s, len);
          830         else
          831                 string_append(ctx.field, s, len);
          832 }
          833 
          834 static void
          835 xmldataentity(XMLParser *p, const char *data, size_t datalen)
          836 {
          837         char buf[8];
          838         int len;
          839 
          840         if (!ctx.field)
          841                 return;
          842 
          843         /* try to translate entity, else just pass as data to
          844          * xmldata handler. */
          845         if ((len = xml_entitytostr(data, buf, sizeof(buf))) > 0)
          846                 xmldata(p, buf, (size_t)len);
          847         else
          848                 xmldata(p, data, datalen);
          849 }
          850 
          851 static void
          852 xmltagstart(XMLParser *p, const char *t, size_t tl)
          853 {
          854         const FeedTag *f;
          855 
          856         if (ISINCONTENT(ctx)) {
          857                 if (ctx.contenttype == ContentTypeHTML) {
          858                         ctx.attrcount = 0;
          859                         xmldata(p, "<", 1);
          860                         xmldata(p, t, tl);
          861                 }
          862                 return;
          863         }
          864 
          865         /* start of RSS or Atom item / entry */
          866         if (ctx.feedtype == FeedTypeNone) {
          867                 if (istag(t, tl, STRP("entry")))
          868                         ctx.feedtype = FeedTypeAtom;
          869                 else if (istag(t, tl, STRP("item")))
          870                         ctx.feedtype = FeedTypeRSS;
          871                 return;
          872         }
          873 
          874         /* field tagid already set or nested tags. */
          875         if (ctx.tag.id) {
          876                 /* nested <author><name> for Atom */
          877                 if (ctx.tag.id == AtomTagAuthor &&
          878                     istag(t, tl, STRP("name"))) {
          879                         memcpy(&(ctx.tag), &atomtagauthorname, sizeof(ctx.tag));
          880                 } else {
          881                         return; /* other nested tags are not allowed: return */
          882                 }
          883         }
          884 
          885         /* in item */
          886         if (ctx.tag.id == TagUnknown) {
          887                 if (!(f = gettag(ctx.feedtype, t, tl)))
          888                         f = &notag;
          889                 memcpy(&(ctx.tag), f, sizeof(ctx.tag));
          890         }
          891 
          892         ctx.iscontenttag = (fieldmap[ctx.tag.id] == FeedFieldContent);
          893         string_clear(&attrispermalink);
          894         string_clear(&attrrel);
          895         string_clear(&attrtype);
          896 }
          897 
          898 static void
          899 xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
          900 {
          901         enum TagId tagid;
          902 
          903         if (ISINCONTENT(ctx)) {
          904                 if (ctx.contenttype == ContentTypeHTML) {
          905                         if (isshort)
          906                                 xmldata(p, "/>", 2);
          907                         else
          908                                 xmldata(p, ">", 1);
          909                 }
          910                 return;
          911         }
          912 
          913         /* set tag type based on its attribute value */
          914         if (ctx.tag.id == RSSTagGuid) {
          915                 /* if empty the default is "true" */
          916                 if (!attrispermalink.len ||
          917                     isattr(attrispermalink.data, attrispermalink.len, STRP("true")))
          918                         ctx.tag.id = RSSTagGuidPermalinkTrue;
          919                 else
          920                         ctx.tag.id = RSSTagGuidPermalinkFalse;
          921         } else if (ctx.tag.id == AtomTagLink) {
          922                 /* empty or "alternate": other types could be
          923                  * "enclosure", "related", "self" or "via" */
          924                 if (!attrrel.len || isattr(attrrel.data, attrrel.len, STRP("alternate")))
          925                         ctx.tag.id = AtomTagLinkAlternate;
          926                 else if (isattr(attrrel.data, attrrel.len, STRP("enclosure")))
          927                         ctx.tag.id = AtomTagLinkEnclosure;
          928                 else
          929                         ctx.tag.id = AtomTagLink; /* unknown */
          930         }
          931 
          932         tagid = ctx.tag.id;
          933 
          934         /* map tag type to field: unknown or lesser priority is ignored,
          935          * when tags of the same type are repeated only the first is used. */
          936         if (fieldmap[tagid] == -1 ||
          937             (!ISFEEDFIELDMULTI(fieldmap[tagid]) &&
          938              tagid <= ctx.fields[fieldmap[tagid]].tagid)) {
          939                 return;
          940         }
          941 
          942         if (ctx.iscontenttag) {
          943                 ctx.iscontent = 1;
          944                 ctx.iscontenttag = 0;
          945 
          946                 /* detect content-type based on type attribute */
          947                 if (attrtype.len) {
          948                         if (isattr(attrtype.data, attrtype.len, STRP("html")) ||
          949                             isattr(attrtype.data, attrtype.len, STRP("xhtml")) ||
          950                             isattr(attrtype.data, attrtype.len, STRP("text/html")) ||
          951                             isattr(attrtype.data, attrtype.len, STRP("text/xhtml")) ||
          952                             isattr(attrtype.data, attrtype.len, STRP("application/xhtml+xml")))
          953                                 ctx.contenttype = ContentTypeHTML;
          954                         else /* unknown: handle as base64 text data */
          955                                 ctx.contenttype = ContentTypePlain;
          956                 } else {
          957                         /* default content-type */
          958                         if (tagid == RSSTagContentEncoded || tagid == RSSTagDescription)
          959                                 ctx.contenttype = ContentTypeHTML;
          960                         else
          961                                 ctx.contenttype = ContentTypePlain;
          962                 }
          963         }
          964 
          965         ctx.field = &(ctx.fields[fieldmap[tagid]].str);
          966         ctx.fields[fieldmap[tagid]].tagid = tagid;
          967 
          968         /* clear field if it is overwritten (with a priority order) for the new
          969          * value, if the field can have multiple values then do not clear it. */
          970         if (!ISFEEDFIELDMULTI(fieldmap[ctx.tag.id]))
          971                 string_clear(ctx.field);
          972 }
          973 
          974 static void
          975 xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
          976 {
          977         size_t i;
          978 
          979         if (ctx.feedtype == FeedTypeNone)
          980                 return;
          981 
          982         if (ISINCONTENT(ctx)) {
          983                 /* not a closed content field */
          984                 if (!istag(ctx.tag.name, ctx.tag.len, t, tl)) {
          985                         if (!isshort && ctx.contenttype == ContentTypeHTML) {
          986                                 xmldata(p, "</", 2);
          987                                 xmldata(p, t, tl);
          988                                 xmldata(p, ">", 1);
          989                         }
          990                         return;
          991                 }
          992         } else if (ctx.tag.id && istag(ctx.tag.name, ctx.tag.len, t, tl)) {
          993                 /* matched tag end: close it.
          994                  * copy also to the link field if the attribute isPermaLink="true"
          995                  * and it is not set by a tag with higher priority. */
          996                 if (ctx.tag.id == RSSTagGuidPermalinkTrue && ctx.field &&
          997                     ctx.tag.id > ctx.fields[FeedFieldLink].tagid) {
          998                         string_clear(&ctx.fields[FeedFieldLink].str);
          999                         string_append(&ctx.fields[FeedFieldLink].str,
         1000                                       ctx.field->data, ctx.field->len);
         1001                         ctx.fields[FeedFieldLink].tagid = ctx.tag.id;
         1002                 }
         1003         } else if (!ctx.tag.id && ((ctx.feedtype == FeedTypeAtom &&
         1004            istag(t, tl, STRP("entry"))) || /* Atom */
         1005            (ctx.feedtype == FeedTypeRSS &&
         1006            istag(t, tl, STRP("item"))))) /* RSS */
         1007         {
         1008                 /* end of RSS or Atom entry / item */
         1009                 printfields();
         1010 
         1011                 /* clear strings */
         1012                 for (i = 0; i < FeedFieldLast; i++) {
         1013                         string_clear(&ctx.fields[i].str);
         1014                         ctx.fields[i].tagid = TagUnknown;
         1015                 }
         1016                 ctx.contenttype = ContentTypeNone;
         1017                 /* allow parsing of Atom and RSS concatenated in one XML stream. */
         1018                 ctx.feedtype = FeedTypeNone;
         1019         } else {
         1020                 return; /* not end of field */
         1021         }
         1022 
         1023         /* temporary string: for fields that cannot be processed
         1024          * directly and need more context, for example by its tag
         1025          * attributes, like the Atom link rel="alternate|enclosure". */
         1026         if (tmpstr.len && ctx.field) {
         1027                 if (ISFEEDFIELDMULTI(fieldmap[ctx.tag.id])) {
         1028                         if (ctx.field->len)
         1029                                 string_append(ctx.field, FieldMultiSeparator, 1);
         1030                         string_append(ctx.field, tmpstr.data, tmpstr.len);
         1031                 } else {
         1032                         string_clear(ctx.field);
         1033                         string_append(ctx.field, tmpstr.data, tmpstr.len);
         1034                 }
         1035         }
         1036 
         1037         /* close field */
         1038         string_clear(&tmpstr); /* reuse and clear temporary string */
         1039 
         1040         if (ctx.tag.id == AtomTagAuthorName)
         1041                 memcpy(&(ctx.tag), &atomtagauthor, sizeof(ctx.tag)); /* outer tag */
         1042         else
         1043                 memcpy(&(ctx.tag), &notag, sizeof(ctx.tag));
         1044 
         1045         ctx.iscontent = 0;
         1046         ctx.field = NULL;
         1047 }
         1048 
         1049 int
         1050 main(int argc, char *argv[])
         1051 {
         1052         if (pledge("stdio", NULL) == -1)
         1053                 err(1, "pledge");
         1054 
         1055         if (argc > 1) {
         1056                 if (uri_parse(argv[1], &baseuri) != -1 && baseuri.proto[0])
         1057                         baseurl = argv[1];
         1058                 else
         1059                         errx(1, "baseurl incorrect or too long");
         1060         }
         1061 
         1062         memcpy(&(ctx.tag), &notag, sizeof(ctx.tag));
         1063 
         1064         parser.xmlattr = xmlattr;
         1065         parser.xmlattrentity = xmlattrentity;
         1066         parser.xmlattrend = xmlattrend;
         1067         parser.xmlattrstart = xmlattrstart;
         1068         parser.xmlcdata = xmldata;
         1069         parser.xmldata = xmldata;
         1070         parser.xmldataentity = xmldataentity;
         1071         parser.xmltagend = xmltagend;
         1072         parser.xmltagstart = xmltagstart;
         1073         parser.xmltagstartparsed = xmltagstartparsed;
         1074 
         1075         /* NOTE: GETNEXT is defined in xml.h for inline optimization */
         1076         xml_parse(&parser);
         1077 
         1078         checkfileerror(stdin, "<stdin>", 'r');
         1079         checkfileerror(stdout, "<stdout>", 'w');
         1080 
         1081         return 0;
         1082 }