util.c - sfeed - RSS and Atom parser
 (HTM) git clone git://git.codemadness.org/sfeed
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       util.c (9946B)
       ---
            1 #include <errno.h>
            2 #include <stdarg.h>
            3 #include <stdio.h>
            4 #include <stdlib.h>
            5 #include <string.h>
            6 #include <wchar.h>
            7 
            8 #include "util.h"
            9 
           10 /* print to stderr, print error message of errno and exit().
           11  * Unlike BSD err() it does not prefix __progname */
           12 __dead void
           13 err(int exitstatus, const char *fmt, ...)
           14 {
           15         va_list ap;
           16         int saved_errno;
           17 
           18         saved_errno = errno;
           19 
           20         if (fmt) {
           21                 va_start(ap, fmt);
           22                 vfprintf(stderr, fmt, ap);
           23                 va_end(ap);
           24                 fputs(": ", stderr);
           25         }
           26         fprintf(stderr, "%s\n", strerror(saved_errno));
           27 
           28         exit(exitstatus);
           29 }
           30 
           31 /* print to stderr and exit().
           32  * Unlike BSD errx() it does not prefix __progname */
           33 __dead void
           34 errx(int exitstatus, const char *fmt, ...)
           35 {
           36         va_list ap;
           37 
           38         if (fmt) {
           39                 va_start(ap, fmt);
           40                 vfprintf(stderr, fmt, ap);
           41                 va_end(ap);
           42         }
           43         fputs("\n", stderr);
           44 
           45         exit(exitstatus);
           46 }
           47 
           48 /* Handle read or write errors for a FILE * stream */
           49 void
           50 checkfileerror(FILE *fp, const char *name, int mode)
           51 {
           52         if (mode == 'r' && ferror(fp))
           53                 errx(1, "read error: %s", name);
           54         else if (mode == 'w' && (fflush(fp) || ferror(fp)))
           55                 errx(1, "write error: %s", name);
           56 }
           57 
           58 /* strcasestr() included for portability */
           59 char *
           60 strcasestr(const char *h, const char *n)
           61 {
           62         size_t i;
           63 
           64         if (!n[0])
           65                 return (char *)h;
           66 
           67         for (; *h; ++h) {
           68                 for (i = 0; n[i] && TOLOWER((unsigned char)n[i]) ==
           69                             TOLOWER((unsigned char)h[i]); ++i)
           70                         ;
           71                 if (n[i] == '\0')
           72                         return (char *)h;
           73         }
           74 
           75         return NULL;
           76 }
           77 
           78 /* Check if string has a non-empty scheme / protocol part. */
           79 int
           80 uri_hasscheme(const char *s)
           81 {
           82         const char *p = s;
           83 
           84         for (; ISALPHA((unsigned char)*p) || ISDIGIT((unsigned char)*p) ||
           85                        *p == '+' || *p == '-' || *p == '.'; p++)
           86                 ;
           87         /* scheme, except if empty and starts with ":" then it is a path */
           88         return (*p == ':' && p != s);
           89 }
           90 
           91 /* Parse URI string `s` into an uri structure `u`.
           92  * Returns 0 on success or -1 on failure */
           93 int
           94 uri_parse(const char *s, struct uri *u)
           95 {
           96         const char *p = s;
           97         char *endptr;
           98         size_t i;
           99         long l;
          100 
          101         u->proto[0] = u->userinfo[0] = u->host[0] = u->port[0] = '\0';
          102         u->path[0] = u->query[0] = u->fragment[0] = '\0';
          103 
          104         /* protocol-relative */
          105         if (*p == '/' && *(p + 1) == '/') {
          106                 p += 2; /* skip "//" */
          107                 goto parseauth;
          108         }
          109 
          110         /* scheme / protocol part */
          111         for (; ISALPHA((unsigned char)*p) || ISDIGIT((unsigned char)*p) ||
          112                        *p == '+' || *p == '-' || *p == '.'; p++)
          113                 ;
          114         /* scheme, except if empty and starts with ":" then it is a path */
          115         if (*p == ':' && p != s) {
          116                 if (*(p + 1) == '/' && *(p + 2) == '/')
          117                         p += 3; /* skip "://" */
          118                 else
          119                         p++; /* skip ":" */
          120 
          121                 if ((size_t)(p - s) >= sizeof(u->proto))
          122                         return -1; /* protocol too long */
          123                 memcpy(u->proto, s, p - s);
          124                 u->proto[p - s] = '\0';
          125 
          126                 if (*(p - 1) != '/')
          127                         goto parsepath;
          128         } else {
          129                 p = s; /* no scheme format, reset to start */
          130                 goto parsepath;
          131         }
          132 
          133 parseauth:
          134         /* userinfo (username:password) */
          135         i = strcspn(p, "@/?#");
          136         if (p[i] == '@') {
          137                 if (i >= sizeof(u->userinfo))
          138                         return -1; /* userinfo too long */
          139                 memcpy(u->userinfo, p, i);
          140                 u->userinfo[i] = '\0';
          141                 p += i + 1;
          142         }
          143 
          144         /* IPv6 address */
          145         if (*p == '[') {
          146                 /* bracket not found, host too short or too long */
          147                 i = strcspn(p, "]");
          148                 if (p[i] != ']' || i < 3)
          149                         return -1;
          150                 i++; /* including "]" */
          151         } else {
          152                 /* domain / host part, skip until port, path or end. */
          153                 i = strcspn(p, ":/?#");
          154         }
          155         if (i >= sizeof(u->host))
          156                 return -1; /* host too long */
          157         memcpy(u->host, p, i);
          158         u->host[i] = '\0';
          159         p += i;
          160 
          161         /* port */
          162         if (*p == ':') {
          163                 p++;
          164                 if ((i = strcspn(p, "/?#")) >= sizeof(u->port))
          165                         return -1; /* port too long */
          166                 memcpy(u->port, p, i);
          167                 u->port[i] = '\0';
          168                 /* check for valid port: range 1 - 65535, may be empty */
          169                 errno = 0;
          170                 l = strtol(u->port, &endptr, 10);
          171                 if (i && (errno || *endptr || l <= 0 || l > 65535))
          172                         return -1;
          173                 p += i;
          174         }
          175 
          176 parsepath:
          177         /* path */
          178         if ((i = strcspn(p, "?#")) >= sizeof(u->path))
          179                 return -1; /* path too long */
          180         memcpy(u->path, p, i);
          181         u->path[i] = '\0';
          182         p += i;
          183 
          184         /* query */
          185         if (*p == '?') {
          186                 p++;
          187                 if ((i = strcspn(p, "#")) >= sizeof(u->query))
          188                         return -1; /* query too long */
          189                 memcpy(u->query, p, i);
          190                 u->query[i] = '\0';
          191                 p += i;
          192         }
          193 
          194         /* fragment */
          195         if (*p == '#') {
          196                 p++;
          197                 if ((i = strlen(p)) >= sizeof(u->fragment))
          198                         return -1; /* fragment too long */
          199                 memcpy(u->fragment, p, i);
          200                 u->fragment[i] = '\0';
          201         }
          202 
          203         return 0;
          204 }
          205 
          206 /* Transform and try to make the URI `u` absolute using base URI `b` into `a`.
          207  * Follows some of the logic from "RFC 3986 - 5.2.2. Transform References".
          208  * Returns 0 on success, -1 on error or truncation. */
          209 int
          210 uri_makeabs(struct uri *a, struct uri *u, struct uri *b)
          211 {
          212         char *p;
          213         int c;
          214 
          215         strlcpy(a->fragment, u->fragment, sizeof(a->fragment));
          216 
          217         if (u->proto[0] || u->host[0]) {
          218                 strlcpy(a->proto, u->proto[0] ? u->proto : b->proto, sizeof(a->proto));
          219                 strlcpy(a->host, u->host, sizeof(a->host));
          220                 strlcpy(a->userinfo, u->userinfo, sizeof(a->userinfo));
          221                 strlcpy(a->host, u->host, sizeof(a->host));
          222                 strlcpy(a->port, u->port, sizeof(a->port));
          223                 strlcpy(a->path, u->path, sizeof(a->path));
          224                 strlcpy(a->query, u->query, sizeof(a->query));
          225                 return 0;
          226         }
          227 
          228         strlcpy(a->proto, b->proto, sizeof(a->proto));
          229         strlcpy(a->host, b->host, sizeof(a->host));
          230         strlcpy(a->userinfo, b->userinfo, sizeof(a->userinfo));
          231         strlcpy(a->host, b->host, sizeof(a->host));
          232         strlcpy(a->port, b->port, sizeof(a->port));
          233 
          234         if (!u->path[0]) {
          235                 strlcpy(a->path, b->path, sizeof(a->path));
          236         } else if (u->path[0] == '/') {
          237                 strlcpy(a->path, u->path, sizeof(a->path));
          238         } else {
          239                 a->path[0] = (b->host[0] && b->path[0] != '/') ? '/' : '\0';
          240                 a->path[1] = '\0';
          241 
          242                 if ((p = strrchr(b->path, '/'))) {
          243                         c = *(++p);
          244                         *p = '\0'; /* temporary NUL-terminate */
          245                         if (strlcat(a->path, b->path, sizeof(a->path)) >= sizeof(a->path))
          246                                 return -1;
          247                         *p = c; /* restore */
          248                 }
          249                 if (strlcat(a->path, u->path, sizeof(a->path)) >= sizeof(a->path))
          250                         return -1;
          251         }
          252 
          253         if (u->path[0] || u->query[0])
          254                 strlcpy(a->query, u->query, sizeof(a->query));
          255         else
          256                 strlcpy(a->query, b->query, sizeof(a->query));
          257 
          258         return 0;
          259 }
          260 
          261 int
          262 uri_format(char *buf, size_t bufsiz, struct uri *u)
          263 {
          264         return snprintf(buf, bufsiz, "%s%s%s%s%s%s%s%s%s%s%s%s",
          265                 u->proto,
          266                 u->userinfo[0] ? u->userinfo : "",
          267                 u->userinfo[0] ? "@" : "",
          268                 u->host,
          269                 u->port[0] ? ":" : "",
          270                 u->port,
          271                 u->host[0] && u->path[0] && u->path[0] != '/' ? "/" : "",
          272                 u->path,
          273                 u->query[0] ? "?" : "",
          274                 u->query,
          275                 u->fragment[0] ? "#" : "",
          276                 u->fragment);
          277 }
          278 
          279 /* Splits fields in the line buffer by replacing TAB separators with NUL ('\0')
          280  * terminators and assign these fields as pointers. If there are less fields
          281  * than expected then the field is an empty string constant. */
          282 void
          283 parseline(char *line, char *fields[FieldLast])
          284 {
          285         char *prev, *s;
          286         size_t i;
          287 
          288         for (prev = line, i = 0;
          289             (s = strchr(prev, '\t')) && i < FieldLast - 1;
          290             i++) {
          291                 *s = '\0';
          292                 fields[i] = prev;
          293                 prev = s + 1;
          294         }
          295         fields[i++] = prev;
          296         /* make non-parsed fields empty. */
          297         for (; i < FieldLast; i++)
          298                 fields[i] = "";
          299 }
          300 
          301 /* Parse time to time_t, assumes time_t is signed, ignores fractions. */
          302 int
          303 strtotime(const char *s, time_t *t)
          304 {
          305         long long l;
          306         char *e;
          307 
          308         errno = 0;
          309         l = strtoll(s, &e, 10);
          310         if (errno || *s == '\0' || *e)
          311                 return -1;
          312 
          313         /* NOTE: the type long long supports the 64-bit range. If time_t is
          314          * 64-bit it is "2038-ready", otherwise it is truncated/wrapped. */
          315         if (t)
          316                 *t = (time_t)l;
          317 
          318         return 0;
          319 }
          320 
          321 time_t
          322 getcomparetime(void)
          323 {
          324         time_t now, t;
          325         char *p;
          326 
          327         if ((now = time(NULL)) == (time_t)-1)
          328                 return (time_t)-1;
          329 
          330         if ((p = getenv("SFEED_NEW_AGE"))) {
          331                 if (strtotime(p, &t) == -1)
          332                         return (time_t)-1;
          333                 return now - t;
          334         }
          335 
          336         return now - 86400; /* 1 day is old news */
          337 }
          338 
          339 /* Escape characters below as HTML 2.0 / XML 1.0. */
          340 void
          341 xmlencode(const char *s, FILE *fp)
          342 {
          343         for (; *s; ++s) {
          344                 switch (*s) {
          345                 case '<':  fputs("&lt;",   fp); break;
          346                 case '>':  fputs("&gt;",   fp); break;
          347                 case '\'': fputs("&#39;",  fp); break;
          348                 case '&':  fputs("&amp;",  fp); break;
          349                 case '"':  fputs("&quot;", fp); break;
          350                 default:   putc(*s, fp);
          351                 }
          352         }
          353 }
          354 
          355 /* print `len` columns of characters. If string is shorter pad the rest with
          356  * characters `pad`. */
          357 void
          358 printutf8pad(FILE *fp, const char *s, size_t len, int pad)
          359 {
          360         wchar_t wc;
          361         size_t col = 0, i, slen;
          362         int inc, rl, w;
          363 
          364         if (!len)
          365                 return;
          366 
          367         slen = strlen(s);
          368         for (i = 0; i < slen; i += inc) {
          369                 inc = 1; /* next byte */
          370                 if ((unsigned char)s[i] < 32) {
          371                         continue; /* skip control characters */
          372                 } else if ((unsigned char)s[i] >= 127) {
          373                         rl = mbtowc(&wc, s + i, slen - i < 4 ? slen - i : 4);
          374                         inc = rl;
          375                         if (rl < 0) {
          376                                 mbtowc(NULL, NULL, 0); /* reset state */
          377                                 inc = 1; /* invalid, seek next byte */
          378                                 w = 1; /* replacement char is one width */
          379                         } else if ((w = wcwidth(wc)) == -1) {
          380                                 continue;
          381                         }
          382 
          383                         if (col + w > len || (col + w == len && s[i + inc])) {
          384                                 fputs(PAD_TRUNCATE_SYMBOL, fp); /* ellipsis */
          385                                 col++;
          386                                 break;
          387                         } else if (rl < 0) {
          388                                 fputs(UTF_INVALID_SYMBOL, fp); /* replacement */
          389                                 col++;
          390                                 continue;
          391                         }
          392                         fwrite(&s[i], 1, rl, fp);
          393                         col += w;
          394                 } else {
          395                         /* optimization: simple ASCII character */
          396                         if (col + 1 > len || (col + 1 == len && s[i + 1])) {
          397                                 fputs(PAD_TRUNCATE_SYMBOL, fp); /* ellipsis */
          398                                 col++;
          399                                 break;
          400                         }
          401                         putc(s[i], fp);
          402                         col++;
          403                 }
          404 
          405         }
          406         for (; col < len; ++col)
          407                 putc(pad, fp);
          408 }
          409 
          410 /* Counts column width of a character string. */
          411 size_t
          412 colw(const char *s)
          413 {
          414         wchar_t wc;
          415         size_t col = 0, i, slen;
          416         int inc, rl, w;
          417 
          418         slen = strlen(s);
          419         for (i = 0; i < slen; i += inc) {
          420                 inc = 1; /* next byte */
          421                 if ((unsigned char)s[i] < 32) {
          422                         continue;
          423                 } else if ((unsigned char)s[i] >= 127) {
          424                         rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4);
          425                         inc = rl;
          426                         if (rl < 0) {
          427                                 mbtowc(NULL, NULL, 0); /* reset state */
          428                                 inc = 1; /* invalid, seek next byte */
          429                                 w = 1; /* replacement char is one width */
          430                         } else if ((w = wcwidth(wc)) == -1) {
          431                                 continue;
          432                         }
          433                         col += w;
          434                 } else {
          435                         col++;
          436                 }
          437         }
          438         return col;
          439 }