twebdump.c - webdump - [FORK] git://git.codemadness.org/webdump
 (HTM) git clone git://git.z3bra.org/webdump.git
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       twebdump.c (24576B)
       ---
            1 #include <ctype.h>
            2 #include <err.h>
            3 #include <errno.h>
            4 #include <stdio.h>
            5 #include <stdlib.h>
            6 #include <string.h>
            7 #include <strings.h>
            8 #include <unistd.h>
            9 
           10 #include "arg.h"
           11 char *argv0;
           12 
           13 #include "xml.h"
           14 
           15 static XMLParser parser;
           16 
           17 #ifndef __OpenBSD__
           18 #define pledge(p1,p2) 0
           19 #endif
           20 
           21 #undef strlcat
           22 size_t strlcat(char *, const char *, size_t);
           23 #undef strlcpy
           24 size_t strlcpy(char *, const char *, size_t);
           25 
           26 /* Components of a URI as filled in by parseuri(). Every field is a
              NUL-terminated string; an empty string means the part is not set. */
           27 struct uri {
           28         char proto[48];   /* protocol, without the "://" separator */
           29         char host[256];   /* host name, or IPv6 address including its brackets */
           30         char path[2048];  /* path and everything that follows it */
           31         char port[6];     /* numeric port */
           32 };
           33 
           34 /* options */
           35 static int allowansi    = 0;  /* allow ANSI escape codes */
           36 static int showlinkrefs = 0;  /* show link references at the bottom */
           37 static int softlinewrap = 0;  /* soft line-wrapping */
           38 static int termwidth    = 72; /* terminal width */
           39 
           40 /* Singly linked list of link references */
           41 struct linkref {
           42         char *type; /* NOTE(review): presumably the kind of reference - confirm */
           43         char *url;  /* NOTE(review): presumably the link target - confirm */
           44         struct linkref *next; /* next entry in the list */
           45 };
           46 
           47 static struct linkref *links_head; /* NOTE(review): looks like the first list entry - confirm */
           48 static struct linkref *links_cur;  /* NOTE(review): looks like the latest list entry - confirm */
           49 static int linkcount;              /* NOTE(review): presumably the number of entries - confirm */
           50 
              /* Display properties of an element. Each value is a single bit so several
                 of them can be OR-ed together, as done in the tags[] table. */
           51 enum DisplayType {
           52         DisplayUnknown     = 0,
           53         DisplayInline      = 1 << 0,
           54         DisplayInlineBlock = 1 << 1,
           55         DisplayBlock       = 1 << 2,
           56         DisplayNone        = 1 << 3, /* data inside the element is not printed */
           57         DisplayPre         = 1 << 4, /* data is printed without white-space collapsing */
           58         DisplayList        = 1 << 5,
           59         DisplayListOrdered = 1 << 6,
           60         DisplayListItem    = 1 << 7,
           61         DisplayTable       = 1 << 8,
           62         DisplayTableRow    = 1 << 9,
           63         DisplayTableCell   = 1 << 10,
           64         DisplayHeader      = 1 << 11,
           65         DisplayBold        = 1 << 12,
           66         DisplayItalic      = 1 << 13,
           67         DisplayUnderline   = 1 << 14,
           68         DisplayBlink       = 1 << 15, /* lol */
           69         DisplayReverse     = 1 << 16,
           70         DisplayStrike      = 1 << 17,
           71 };
           72 
              /* Static description of a known tag; see the tags[] table. */
           73 struct tag {
           74         const char *name; /* tag name, compared case-insensitively */
           75         enum DisplayType displaytype; /* OR-ed DisplayType flags of the element */
           76         enum DisplayType parenttype; /* display type of the parent the element
                                                       belongs to (e.g. DisplayList for "li"),
                                                       0 when there is none */
           77         int isvoid; /* "void" element: a separate closing tag is ignored */
           78         int isoptional; /* optional to close tag */
           79 };
           80 
              /* One open element on the nodes[] stack. */
           81 struct node {
           82         char tagname[256]; /* lowercased copy of the tag name */
           83         struct tag tag;    /* properties of the element */
           84         size_t nchildren; /* Child nodes for its type */
           85 };
           86 
           87 /* String data / memory pool: a growable, NUL-terminated byte buffer.
              See string_append() and string_clear(). */
           88 typedef struct string {
           89         char   *data;   /* data, NULL until the first allocation */
           90         size_t  len;    /* string length, excluding the terminating NUL */
           91         size_t  bufsiz; /* allocated size */
           92 } String;
           93 
           94 int absuri(char *, size_t, const char *, const char *);
           95 int parseuri(const char *, struct uri *, int);
           96 
           97 static char *basehref = "";
           98 
           99 static char src[4096]; /* src or href attribute */
          100 
          101 static String htmldata;
          102 
          103 /* State for white-space output handling, used as bit flags:
          104    1 = white-space was seen (repeats are suppressed), 2 = other characters on this line
          105    Behaviour:
          106    * White-space data before non-white-space data on a line is ignored.
          107    * Repeated white-space is collapsed: a single space (' ') is emitted.
          108 */
          109 static int whitespace_mode = 0;
          110 static size_t ncharsline = 0;
          111 
          112 #define MAX_DEPTH 256
          113 static struct node nodes[MAX_DEPTH];
          114 static int curnode;
          115 
          116 #if 0
          117 /* TODO: optional tags */
          118 { "body",     0, 0, 0, 1 },
          119 { "colgroup", 0, 0, 0, 1 },
          120 { "dd",       0, 0, 0, 1 },
          121 { "dt",       0, 0, 0, 1 },
          122 { "head",     0, 0, 0, 1 },
          123 { "html",     0, 0, 0, 1 },
          124 { "li",       0, 0, 0, 1 },
          125 { "optgroup", 0, 0, 0, 1 },
          126 { "option",   0, 0, 0, 1 },
          127 { "option",   0, 0, 0, 1 },
          128 { "p",        0, 0, 0, 1 },
          129 { "rp",       0, 0, 0, 1 },
          130 { "rt",       0, 0, 0, 1 },
          131 { "tbody",    0, 0, 0, 1 },
          132 { "td",       0, 0, 0, 1 },
          133 { "tfoot",    0, 0, 0, 1 },
          134 { "th",       0, 0, 0, 1 },
          135 { "thead",    0, 0, 0, 1 },
          136 { "tr",       0, 0, 0, 1 },
          137 #endif
          138 
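          139 /* Known tags. The table must stay sorted by name: findtag() uses bsearch().
              Columns: p = parenttype, v = isvoid, o = isoptional.
              tag          displaytype                       p                v  o */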
          140 static struct tag tags[] = {
          141 { "a",          DisplayInline | DisplayUnderline, 0,               0, 0 },
          142 { "area",       DisplayInline,                    0,               1, 0 },
          143 { "article",    DisplayBlock,                     0,               0, 0 },
          144 { "audio",      DisplayInline | DisplayUnderline, 0,               0, 0 },
          145 { "b",          DisplayInline | DisplayBold,      0,               0, 0 },
          146 { "base",       DisplayInline,                    0,               1, 0 },
          147 { "blink",      DisplayInline | DisplayBlink,     0,               0, 0 },
          148 { "blockquote", DisplayBlock,                     0,               0, 0 },
          149 { "br",         0,                                0,               1, 0 },
          150 { "code",       DisplayPre,                       0,               0, 0 },
          151 { "col",        DisplayInline,                    0,               1, 0 },
          152 { "del",        DisplayInline | DisplayStrike,    0,               0, 0 },
          153 { "div",        DisplayBlock,                     0,               0, 0 },
          154 { "em",         DisplayInline | DisplayItalic,    0,               0, 0 },
          155 { "embed",      DisplayInline,                    0,               1, 0 },
          156 { "footer",     DisplayBlock,                     0,               0, 0 },
          157 { "h1",         DisplayHeader | DisplayBold,      0,               0, 0 },
          158 { "h2",         DisplayHeader | DisplayBold,      0,               0, 0 },
          159 { "h3",         DisplayHeader | DisplayBold,      0,               0, 0 },
          160 { "h4",         DisplayHeader | DisplayBold,      0,               0, 0 },
          161 { "h5",         DisplayHeader | DisplayBold,      0,               0, 0 },
          162 { "h6",         DisplayHeader | DisplayBold,      0,               0, 0 },
          163 { "header",     DisplayBlock,                     0,               0, 0 },
          164 { "hr",         DisplayBlock,                     0,               1, 0 },
          165 { "i",          DisplayInline | DisplayItalic,    0,               0, 0 },
          166 { "img",        DisplayInline | DisplayUnderline, 0,               1, 0 },
          167 { "input",      DisplayInline,                    0,               1, 0 },
          168 { "li",         DisplayListItem,                  DisplayList,     0, 1 },
          169 { "link",       DisplayInline,                    0,               1, 0 },
          170 { "main",       DisplayBlock,                     0,               0, 0 },
          171 { "meta",       DisplayInline,                    0,               1, 0 },
          172 { "nav",        DisplayBlock,                     0,               0, 0 },
          173 { "ol",         DisplayList | DisplayListOrdered, 0,               0, 0 },
          174 { "p",          DisplayBlock,                     0,               0, 1 },
          175 { "param",      DisplayInline,                    0,               1, 0 },
          176 { "pre",        DisplayPre,                       0,               0, 0 },
          177 { "s",          DisplayInline | DisplayStrike,    0,               0, 0 },
          178 { "script",     DisplayNone,                      0,               0, 0 },
          179 { "source",     DisplayInline,                    0,               1, 0 },
          180 { "strike",     DisplayInline | DisplayStrike,    0,               0, 0 },
          181 { "strong",     DisplayInline | DisplayBold,      0,               0, 0 },
          182 { "style",      DisplayNone,                      0,               0, 0 },
          183 { "table",      DisplayTable,                     0,               0, 0 },
          184 { "td",         DisplayTableCell,                 DisplayTableRow, 0, 0 },
          185 { "template",   DisplayNone,                      0,               0, 0 },
          186 { "th",         DisplayTableCell | DisplayBold,   DisplayTableRow, 0, 1 },
          187 { "title",      DisplayBlock,                     0,               0, 0 },
          188 { "tr",         DisplayTableRow,                  DisplayTable,    0, 1 },
          189 { "track",      DisplayInline,                    0,               1, 0 },
          190 { "u",          DisplayInline | DisplayUnderline, 0,               0, 0 },
          191 { "ul",         DisplayList,                      0,               0, 0 },
          192 { "video",      DisplayInline | DisplayUnderline, 0,               0, 0 },
          193 { "wbr",        DisplayInline,                    0,               1, 0 },
          194 };
          195 
          196 static const char *ignorestate, *endtag;
          197 static int (*getnext)(void);
          198 
          199 /* return a space for all data until some case-insensitive string occurs. This
          200    is used to parse incorrect HTML/XML that contains unescaped HTML in script
          201    or style tags. If you see some </script> tag in a CDATA or comment
          202    section then e-mail W3C and tell them the web is too complex. */
          203 static inline int
          204 getnext_ignore(void)
          205 {
          206         int c;
          207 
          208         if ((c = getnext()) == EOF)
          209                 return EOF;
          210 
          211         if (tolower(c) == tolower((unsigned char)*ignorestate)) {
          212                 ignorestate++;
          213                 if (*ignorestate == '\0') {
          214                         parser.getnext = getnext; /* restore */
          215                         return ' ';
          216                 }
          217         } else {
          218                 ignorestate = endtag;
          219         }
          220 
          221         return ' ';
          222 }
          223 
          224 /* Clear string only; don't free, prevents unnecessary reallocation. */
          225 static void
          226 string_clear(String *s)
          227 {
          228         if (s->data)
          229                 s->data[0] = '\0';
          230         s->len = 0;
          231 }
          232 
          233 static void
          234 string_buffer_realloc(String *s, size_t newlen)
          235 {
          236         size_t alloclen;
          237 
          238         for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
          239                 ;
          240         if (!(s->data = realloc(s->data, alloclen)))
          241                 err(1, "realloc");
          242         s->bufsiz = alloclen;
          243 }
          244 
          245 static void
          246 string_append(String *s, const char *data, size_t len)
          247 {
          248         if (!len)
          249                 return;
          250         /* check if allocation is necesary, don't shrink buffer,
          251          * should be more than bufsiz ofcourse. */
          252         if (s->len + len >= s->bufsiz)
          253                 string_buffer_realloc(s, s->len + len + 1);
          254         memcpy(s->data + s->len, data, len);
          255         s->len += len;
          256         s->data[s->len] = '\0';
          257 }
          258 
          259 char *
          260 estrdup(const char *s)
          261 {
          262         char *p;
          263 
          264         if (!(p = strdup(s)))
          265                 err(1, "strdup");
          266         return p;
          267 }
          268 
          269 void *
          270 ecalloc(size_t nmemb, size_t size)
          271 {
          272         void *p;
          273 
          274         if (!(p = calloc(nmemb, size)))
          275                 err(1, "calloc");
          276         return p;
          277 }
          278 
          279 static void
          280 newline(void)
          281 {
          282         putchar('\n');
          283         whitespace_mode &= ~2; /* no characters on this line yet */
          284         ncharsline = 0;
          285 }
          286 
          287 static void
          288 printansi(const char *s)
          289 {
          290         if (!allowansi)
          291                 return;
          292         fputs(s, stdout);
          293 }
          294 
          295 /* print one character safely: no control characters */
          296 static void
          297 printc(int c)
          298 {
          299         if (isspace(c)) {
          300                 whitespace_mode |= 1;
          301         } else {
          302                 if (whitespace_mode == 3) {
          303                         putchar(' ');
          304                         ncharsline++;
          305                 }
          306 
          307                 whitespace_mode = 2;
          308                 if (!iscntrl(c)) {
          309                         putchar(c);
          310                         ncharsline++;
          311                 }
          312         }
          313 
          314         if (softlinewrap) {
          315                 /* TODO: harder line-wrapping on "non-word" characters */
          316                 if (strchr(" \n\t", c) && ncharsline >= termwidth)
          317                         newline();
          318         }
          319 }
          320 
          321 static struct node *
          322 findparenttype(int cur, int findtype)
          323 {
          324         int i;
          325 
          326         for (i = cur; i; i--) {
          327                 if ((nodes[i].tag.displaytype & findtype))
          328                         return &nodes[i];
          329         }
          330         return NULL;
          331 }
          332 
          333 /* Find nearest parent node belonging to type. For example a listitem -> list */
          334 static struct node *
          335 findparentoftype(int cur)
          336 {
          337         if (!nodes[cur].tag.parenttype)
          338                 return NULL;
          339 
          340         return findparenttype(cur, nodes[cur].tag.parenttype);
          341 }
          342 
          343 static void
          344 printsafe(const char *s, size_t len)
          345 {
          346         size_t i;
          347 
          348         for (i = 0; *s && i < len; s++, i++) {
          349                 switch (*s) {
          350                 case '\t':
          351                 case '\n':
          352                         putchar(*s);
          353                         break;
          354                 default:
          355                         if (!iscntrl((unsigned char)*s))
          356                                 putchar(*s);
          357                 }
          358         }
          359 }
          360 
          361 int
          362 parseuri(const char *s, struct uri *u, int rel)
          363 {
          364         const char *p = s, *b;
          365         char *endptr = NULL;
          366         size_t i;
          367         unsigned long l;
          368 
          369         u->proto[0] = u->host[0] = u->path[0] = u->port[0] = '\0';
          370         if (!*s)
          371                 return 0;
          372 
          373         /* prefix is "//", don't read protocol, skip to domain parsing */
          374         if (!strncmp(p, "//", 2)) {
          375                 p += 2; /* skip "//" */
          376         } else {
          377                 /* protocol part */
          378                 for (p = s; *p && (isalpha((unsigned char)*p) || isdigit((unsigned char)*p) ||
          379                                *p == '+' || *p == '-' || *p == '.'); p++)
          380                         ;
          381                 if (!strncmp(p, "://", 3)) {
          382                         if ((size_t)(p - s) >= sizeof(u->proto))
          383                                 return -1; /* protocol too long */
          384                         memcpy(u->proto, s, p - s);
          385                         u->proto[p - s] = '\0';
          386                         p += 3; /* skip "://" */
          387                 } else {
          388                         p = s; /* no protocol format, set to start */
          389                         /* relative url: read rest as path, else as domain */
          390                         if (rel)
          391                                 goto readpath;
          392                 }
          393         }
          394         /* IPv6 address */
          395         if (*p == '[') {
          396                 /* bracket not found or host too long */
          397                 if (!(b = strchr(p, ']')) || (size_t)(b - p) < 3 ||
          398                     (size_t)(b - p) >= sizeof(u->host))
          399                         return -1;
          400                 memcpy(u->host, p, b - p + 1);
          401                 u->host[b - p + 1] = '\0';
          402                 p = b + 1;
          403         } else {
          404                 /* domain / host part, skip until port, path or end. */
          405                 if ((i = strcspn(p, ":/")) >= sizeof(u->host))
          406                         return -1; /* host too long */
          407                 memcpy(u->host, p, i);
          408                 u->host[i] = '\0';
          409                 p = &p[i];
          410         }
          411         /* port */
          412         if (*p == ':') {
          413                 if ((i = strcspn(++p, "/")) >= sizeof(u->port))
          414                         return -1; /* port too long */
          415                 memcpy(u->port, p, i);
          416                 u->port[i] = '\0';
          417                 /* check for valid port: range 1 - 65535 */
          418                 errno = 0;
          419                 l = strtoul(u->port, &endptr, 10);
          420                 if (errno || u->port[0] == '\0' || *endptr ||
          421                     !l || l > 65535)
          422                         return -1;
          423                 p = &p[i];
          424         }
          425 readpath:
          426         if (u->host[0]) {
          427                 p = &p[strspn(p, "/")];
          428                 strlcpy(u->path, "/", sizeof(u->path));
          429         } else {
          430                 /* absolute uri must have a host specified */
          431                 if (!rel)
          432                         return -1;
          433         }
          434         /* treat truncation as an error */
          435         if (strlcat(u->path, p, sizeof(u->path)) >= sizeof(u->path))
          436                 return -1;
          437         return 0;
          438 }
          439 
          440 static int
          441 encodeuri(char *buf, size_t bufsiz, const char *s)
          442 {
          443         static const char *table = "0123456789ABCDEF";
          444         size_t i, b;
          445 
          446         for (i = 0, b = 0; s[i]; i++) {
          447                 if (s[i] == ' ' ||
          448                     (unsigned char)s[i] > 127 ||
          449                     iscntrl((unsigned char)s[i])) {
          450                         if (b + 3 >= bufsiz)
          451                                 return -1;
          452                         buf[b++] = '%';
          453                         buf[b++] = table[((unsigned char)s[i] >> 4) & 15];
          454                         buf[b++] = table[(unsigned char)s[i] & 15];
          455                 } else if (b < bufsiz) {
          456                         buf[b++] = s[i];
          457                 } else {
          458                         return -1;
          459                 }
          460         }
          461         if (b >= bufsiz)
          462                 return -1;
          463         buf[b] = '\0';
          464 
          465         return 0;
          466 }
          467 
          468 /* Get absolute uri; if `link` is relative use `base` to make it absolute.
          469  * the returned string in `buf` is uri encoded, see: encodeuri(). */
          470 int
          471 absuri(char *buf, size_t bufsiz, const char *link, const char *base)
          472 {
          473         struct uri ulink, ubase;
          474         char tmp[4096], *host, *p, *port;
          475         int c, r;
          476         size_t i;
          477 
          478         buf[0] = '\0';
          479         if (parseuri(base, &ubase, 0) == -1 ||
          480             parseuri(link, &ulink, 1) == -1 ||
          481             (!ulink.host[0] && !ubase.host[0]))
          482                 return -1;
          483 
          484         if (!strncmp(link, "//", 2)) {
          485                 host = ulink.host;
          486                 port = ulink.port;
          487         } else {
          488                 host = ulink.host[0] ? ulink.host : ubase.host;
          489                 port = ulink.port[0] ? ulink.port : ubase.port;
          490         }
          491         r = snprintf(tmp, sizeof(tmp), "%s://%s%s%s",
          492                 ulink.proto[0] ?
          493                         ulink.proto :
          494                         (ubase.proto[0] ? ubase.proto : "http"),
          495                 host,
          496                 port[0] ? ":" : "",
          497                 port);
          498         if (r < 0 || (size_t)r >= sizeof(tmp))
          499                 return -1; /* error or truncation */
          500 
          501         /* relative to root */
          502         if (!ulink.host[0] && ulink.path[0] != '/') {
          503                 /* relative to base url path */
          504                 if (ulink.path[0]) {
          505                         if ((p = strrchr(ubase.path, '/'))) {
          506                                 /* temporary null-terminate */
          507                                 c = *(++p);
          508                                 *p = '\0';
          509                                 i = strlcat(tmp, ubase.path, sizeof(tmp));
          510                                 *p = c; /* restore */
          511                                 if (i >= sizeof(tmp))
          512                                         return -1;
          513                         }
          514                 } else if (strlcat(tmp, ubase.path, sizeof(tmp)) >=
          515                            sizeof(tmp)) {
          516                         return -1;
          517                 }
          518         }
          519         if (strlcat(tmp, ulink.path, sizeof(tmp)) >= sizeof(tmp))
          520                 return -1;
          521 
          522         return encodeuri(buf, bufsiz, tmp);
          523 }
          524 
          525 static void
          526 xmlcdata(XMLParser *p, const char *data, size_t datalen)
          527 {
          528         struct node *cur;
          529 
          530         cur = &nodes[curnode];
          531         if (cur->tag.displaytype & DisplayNone)
          532                 return;
          533 
          534         printsafe(data, datalen);
          535 }
          536 
          537 static void
          538 xmldataend(XMLParser *p)
          539 {
          540         struct node *cur;
          541         char *start, *s, *e;
          542 
          543         if (!htmldata.data || !htmldata.len)
          544                 return;
          545 
          546         cur = &nodes[curnode];
          547 
          548         if ((cur->tag.displaytype & DisplayNone)) {
          549                 /* nothing */
          550         } else if ((cur->tag.displaytype & DisplayPre) ||
          551                    findparenttype(curnode, DisplayPre)) {
          552                 /* if <pre> or inside it */
          553                 printsafe(htmldata.data, htmldata.len);
          554         } else {
          555                 start = htmldata.data;
          556                 e = htmldata.data + htmldata.len;
          557 
          558                 for (s = start; s < e; s++)
          559                         printc((unsigned char)*s);
          560         }
          561 
          562         string_clear(&htmldata);
          563 }
          564 
          565 static void
          566 xmldata(XMLParser *p, const char *data, size_t datalen)
          567 {
          568         struct node *cur;
          569 
          570         cur = &nodes[curnode];
          571         if (cur->tag.displaytype & DisplayNone)
          572                 return;
          573 
          574         string_append(&htmldata, data, datalen);
          575 }
          576 
          577 static void
          578 xmldataentity(XMLParser *p, const char *data, size_t datalen)
          579 {
          580         struct node *cur;
          581         char buf[16];
          582         int n;
          583 
          584         cur = &nodes[curnode];
          585         if (cur->tag.displaytype & DisplayNone)
          586                 return;
          587 
          588         n = xml_entitytostr(data, buf, sizeof(buf));
          589         if (n > 0)
          590                 xmldata(p, buf, (size_t)n);
          591         else
          592                 xmldata(p, data, datalen);
          593 }
          594 
          595 int
          596 tagcmp(const void *v1, const void *v2)
          597 {
          598         struct tag *t1 = (struct tag *)v1;
          599         struct tag *t2 = (struct tag *)v2;
          600 
          601         return strcasecmp(t1->name, t2->name);
          602 }
          603 
          604 struct tag *
          605 findtag(const char *t)
          606 {
          607         struct tag find;
          608 
          609         find.name = t;
          610         return bsearch(&find, tags, sizeof(tags) / sizeof(*tags),
          611                 sizeof(*tags), tagcmp);
          612 }
          613 
          614 static void
          615 tagend(struct node *cur)
          616 {
          617         const char *t;
          618         size_t i;
          619 
          620         t = cur->tag.name;
          621 
          622         if (cur->tag.displaytype & DisplayBold)
          623                 printansi("\033[22m"); /* reset bold or faint */
          624         if (cur->tag.displaytype & DisplayItalic)
          625                 printansi("\033[23m"); /* reset italic */
          626         if (cur->tag.displaytype & DisplayUnderline)
          627                 printansi("\033[24m"); /* reset underline */
          628         if (cur->tag.displaytype & DisplayBlink)
          629                 printansi("\033[25m"); /* reset blink */
          630         if (cur->tag.displaytype & DisplayReverse)
          631                 printansi("\033[27m"); /* reset reverse */
          632         if (cur->tag.displaytype & DisplayStrike)
          633                 printansi("\033[29m"); /* reset strike */
          634 
          635         if (cur->tag.displaytype & DisplayBlock) {
          636                 newline();
          637         } else if (cur->tag.displaytype & DisplayPre) {
          638                 newline();
          639         } else if (cur->tag.displaytype & DisplayTable) {
          640                 newline();
          641         } else if (cur->tag.displaytype & DisplayList) {
          642                 newline();
          643         } else if (cur->tag.displaytype & DisplayListItem) {
          644                 newline();
          645         } else if (cur->tag.displaytype & DisplayHeader) {
          646                 newline();
          647 #if 1
          648                 if (t[0] == 'h' && t[1] >= '1' && t[1] <= '6' && t[2] == '\0') {
          649                         if (t[1] >= '3')
          650                                 for (i = 0; i < termwidth; i++)
          651                                         putchar('-');
          652                         else if (t[1] >= '1')
          653                                 for (i = 0; i < termwidth; i++)
          654                                         putchar('=');
          655                         newline();
          656                 }
          657 #endif
          658         }
          659 
          660 }
          661 
          662 static void
          663 xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
          664 {
          665         struct tag *found;
          666         int i;
          667 
          668         /* ignore closing of void elements, like </br>, which is not allowed */
          669         if ((found = findtag(t))) {
          670                 if (!isshort && found->isvoid)
          671                         return;
          672         }
          673 
          674         /* if the current closing tag matches the current open tag */
          675         if (nodes[curnode].tag.name &&
          676             !strcasecmp(nodes[curnode].tag.name, t)) {
          677                 tagend(&nodes[curnode]);
          678                 if (curnode)
          679                         curnode--;
          680         } else {
          681                 /* ... else lookup the first matching start tag. This is also
          682                    for handling optional closing tags */
          683                 for (i = curnode; i > 0; i--) {
          684                         if (nodes[curnode].tag.name &&
          685                             !strcasecmp(nodes[i].tag.name, t)) {
          686                                 tagend(&nodes[i]);
          687                                 curnode = i;
          688                                 break;
          689                         }
          690                 }
          691                 if (curnode)
          692                         curnode--;
          693         }
          694 }
          695 
          696 /* check if the specified tag is closed at some point in the current tree */
          697 static int
          698 istagclosed(int cur)
          699 {
          700         int i;
          701 
          702         if (!cur)
          703                 return 0;
          704         for (i = cur - 1; i > 0; i--) {
          705                 if (!strcasecmp(nodes[i].tag.name, nodes[cur].tag.name))
          706                         return 0;
          707         }
          708         return 1;
          709 }
          710 
          711 static void
          712 xmltagstart(XMLParser *p, const char *t, size_t tl)
          713 {
          714         struct tag *found;
          715         struct node *cur, *parent;
          716         char *s;
          717 
          718         if (curnode >= MAX_DEPTH - 2)
          719                 errx(1, "max tag depth reached: %d\n", curnode);
          720         parent = &nodes[curnode];
          721         curnode++;
          722 
          723         cur = &nodes[curnode];
          724         memset(cur, 0, sizeof(*cur));
          725         /* tag defaults */
          726         cur->tag.displaytype = DisplayInline;
          727         cur->tag.name = cur->tagname;
          728         strlcpy(cur->tagname, t, sizeof(cur->tagname));
          729         /* to lowercase */
          730         for (s = cur->tagname; *s; s++)
          731                 *s = tolower((unsigned char)*s);
          732 
          733         /* match tag */
          734         if ((found = findtag(t))) {
          735                 cur->nchildren = 0;
          736                 memcpy(&(cur->tag), found, sizeof(*found));
          737 
          738                 if (cur->tag.isoptional && curnode && !istagclosed(curnode)) {
          739                         /* if it's an unclosed tag and it has parent (like ol, ul)
          740                            then fake the end tag. */
          741                         tagend(&nodes[curnode]);
          742                 }
          743 
          744                 /* parent tag is hidden, so hide ourself too */
          745                 if (parent->tag.displaytype & DisplayNone)
          746                         cur->tag.displaytype |= DisplayNone;
          747                 return;
          748         }
          749 
          750         src[0] = '\0'; /* reset src, href */
          751 }
          752 
          753 static void
          754 xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
          755 {
          756         struct node *cur, *parent = NULL;
          757         char absurl[1024];
          758         int i;
          759 
          760         /* temporary replace the callback except the reader and end of tag
          761            restore the context once we receive the same ignored tag in the
          762            end tag handler */
          763         if (!strcasecmp(t, "script")) {
          764                 ignorestate = endtag = "</script>";
          765                 getnext = p->getnext; /* for restore */
          766                 p->getnext = getnext_ignore;
          767                 return;
          768         } else if (!strcasecmp(t, "style")) {
          769                 ignorestate = endtag = "</style>";
          770                 getnext = p->getnext; /* for restore */
          771                 p->getnext = getnext_ignore;
          772                 return;
          773         }
          774 
          775         cur = &nodes[curnode];
          776         if (cur->tag.displaytype & DisplayNone)
          777                 return;
          778 
          779         /* show links as reference at the bottom */
          780         if (showlinkrefs && src[0]) {
          781                 absurl[0] = '\0';
          782                 if (!strcasecmp(t, "a")) {
          783                         if (!strncmp(src, "mailto:", sizeof("mailto:") - 1))
          784                                 strlcpy(absurl, src, sizeof(absurl));
          785                         else if (!strncmp(src, "tel:", sizeof("tel:") - 1))
          786                                 strlcpy(absurl, src, sizeof(absurl));
          787                 }
          788                 if (!absurl[0] && absuri(absurl, sizeof(absurl), src, basehref) == -1)
          789                         absurl[0] = '\0';
          790                 if (absurl[0]) {
          791                         if (!links_head)
          792                                 links_cur = links_head = ecalloc(1, sizeof(*links_head));
          793                         else
          794                                 links_cur = links_cur->next = ecalloc(1, sizeof(*links_head));
          795                         links_cur->url = estrdup(absurl);
          796 
          797                         printf(" [%d]", ++linkcount);
          798                         links_cur->type = estrdup(t);
          799                 }
          800                 src[0] = '\0';
          801         }
          802 
          803         /* find first parent node of type and increase child node count */
          804         if (cur->tag.parenttype && (parent = findparentoftype(curnode)))
          805                 parent->nchildren++;
          806 
          807         if (cur->tag.displaytype & DisplayBlock) {
          808                 newline();
          809         } else if (cur->tag.displaytype & DisplayHeader) {
          810                 newline();
          811         } else if (cur->tag.displaytype & DisplayTableRow) {
          812                 newline();
          813         } else if (cur->tag.displaytype & DisplayList) {
          814                 newline();
          815         } else if (cur->tag.displaytype & DisplayTableCell) {
          816                 if (parent && parent->nchildren > 1)
          817                         fputs("\t", stdout);
          818         } else if (cur->tag.displaytype & DisplayListItem) {
          819                 /* indent nested list items */
          820                 for (i = curnode; i; i--) {
          821                         if (nodes[i].tag.displaytype & DisplayListItem)
          822                                 continue;
          823                         if (nodes[i].tag.displaytype & DisplayList)
          824                                 fputs("    ", stdout);
          825                 }
          826                 /* find first parent node and ordered numbers or unordered */
          827                 if (parent) {
          828                         if (parent->tag.displaytype & DisplayListOrdered)
          829                                 printf("%zu. ", parent->nchildren);
          830                         else
          831                                 fputs("\xe2\x80\xa2 ", stdout);
          832                 }
          833         }
          834 
          835         if (cur->tag.displaytype & DisplayBold)
          836                 printansi("\033[1m");
          837         if (cur->tag.displaytype & DisplayItalic)
          838                 printansi("\033[3m");
          839         if (cur->tag.displaytype & DisplayUnderline)
          840                 printansi("\033[4m");
          841         if (cur->tag.displaytype & DisplayBlink)
          842                 printansi("\033[5m");
          843         if (cur->tag.displaytype & DisplayReverse)
          844                 printansi("\033[7m");
          845         if (cur->tag.displaytype & DisplayStrike)
          846                 printansi("\033[9m");
          847 
          848         if (!strcasecmp(t, "hr")) { /* ruler */
          849                 for (i = 0; i < termwidth; i++)
          850                         putchar('-');
          851         } else if (!strcasecmp(t, "br")) {
          852                 newline();
          853         }
          854 
          855         /* autoclose tags, such as <br>, pretend we are <br/> */
          856         if (!isshort && cur->tag.isvoid)
          857                 xmltagend(p, t, tl, 1);
          858 }
          859 
          860 static void
          861 xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name,
          862         size_t namelen, const char *value, size_t valuelen)
          863 {
          864         struct node *cur;
          865 
          866         cur = &nodes[curnode];
          867         if (cur->tag.displaytype & DisplayNone)
          868                 return;
          869 
          870         /* hide tags with attribute aria-hidden or hidden */
          871         if (!strcasecmp(name, "aria-hidden") || !strcasecmp(name, "hidden"))
          872                 cur->tag.displaytype |= DisplayNone;
          873 
          874         if (cur->tag.displaytype & DisplayNone)
          875                 return;
          876 
          877         if (!strcasecmp(tag, "a") && !strcasecmp(name, "href") && valuelen)
          878                 strlcpy(src, value, sizeof(src));
          879 
          880         if ((!strcasecmp(tag, "img") || !strcasecmp(tag, "video") ||
          881              !strcasecmp(tag, "audio")) &&
          882             !strcasecmp(name, "src") && valuelen)
          883                 strlcpy(src, value, sizeof(src));
          884 
          885         /* show img alt attribute as text. */
          886         if (!strcasecmp(tag, "img") && !strcasecmp(name, "alt"))
          887                 printsafe(value, strlen(value));
          888 }
          889 
          890 void
          891 printlinkrefs(void)
          892 {
          893         size_t i;
          894 
          895         if (!links_head)
          896                 return;
          897 
          898         printf("\n\nLink references:\n");
          899 
          900         /* TODO: add title attribute or some basic description? */
          901         for (i = 1, links_cur = links_head; links_cur; links_cur = links_cur->next, i++)
          902                 printf("[%zu] - %s (%s)\n", i, links_cur->url, links_cur->type);
          903 }
          904 
          905 void
          906 usage(void)
          907 {
          908         fprintf(stderr, "%s [-alr] [-b basehref] [-w termwidth]\n", argv0);
          909         exit(1);
          910 }
          911 
          912 int
          913 main(int argc, char **argv)
          914 {
          915         if (pledge("stdio", NULL) < 0)
          916                 err(1, "pledge");
          917 
          918         ARGBEGIN {
          919         case 'a':
          920                 allowansi = !allowansi;
          921                 break;
          922         case 'b':
          923                 basehref = EARGF(usage());
          924                 break;
          925         case 'l':
          926                 showlinkrefs = !showlinkrefs;
          927                 break;
          928         case 'r':
          929                 softlinewrap = !softlinewrap;
          930                 break;
          931         case 'w':
          932                 if ((termwidth = strtol(EARGF(usage()), NULL, 10)) < 1)
          933                         usage();
          934                 break;
          935         default:
          936                 usage();
          937         } ARGEND
          938 
          939         parser.xmlattr = xmlattr;
          940         parser.xmlcdata = xmlcdata;
          941         parser.xmldata = xmldata;
          942         parser.xmldataend = xmldataend;
          943         parser.xmldataentity = xmldataentity;
          944         parser.xmltagstart = xmltagstart;
          945         parser.xmltagend = xmltagend;
          946         parser.xmltagstartparsed = xmltagstartparsed;
          947 
          948         parser.getnext = getchar;
          949         xml_parse(&parser);
          950 
          951         if (showlinkrefs)
          952                 printlinkrefs();
          953 
          954         if (ncharsline)
          955                 putchar('\n');
          956 
          957         printansi("\033[0m"); /* reset all attributes */
          958 
          959         return 0;
          960 }