webdump.c - webdump - HTML to plain-text converter for webpages
 (HTM) git clone git://git.codemadness.org/webdump
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       webdump.c (66824B)
       ---
            1 #include <errno.h>
            2 #include <limits.h>
            3 #include <stdio.h>
            4 #include <stdarg.h>
            5 #include <stdlib.h>
            6 #include <string.h>
            7 #include <strings.h>
            8 #include <unistd.h>
            9 
           10 #include "arg.h"
           11 char *argv0;
           12 
           13 #include "tree.h"
           14 #include "xml.h"
           15 
           16 static XMLParser parser; /* parser state; getnext_ignore() swaps its getnext callback */
           17 
              /* outside OpenBSD pledge() is replaced by a no-op expression that yields 0 */
           18 #ifndef __OpenBSD__
           19 #define pledge(p1,p2) 0
           20 #endif
           21 
              /* own prototypes for strlcat/strlcpy; the #undef first drops any macro of the
                 same name (presumably some platforms define them as macros - confirm) */
           22 #undef strlcat
           23 size_t strlcat(char *, const char *, size_t);
           24 #undef strlcpy
           25 size_t strlcpy(char *, const char *, size_t);
           26 
           /* ctype-like macros, but always compatible with ASCII / UTF-8.
              The argument is fully parenthesized in every use, so an expression such as
              (cond ? 'a' : '1') is classified as a whole.
              ISCNTRL compares the value as given: a negative (signed char) argument
              counts as a control character, so pass unsigned char values.
              ISCNTRL, ISSPACE and TOLOWER may evaluate the argument more than once:
              do not pass expressions with side effects. */
           #define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)
           #define ISCNTRL(c) ((c) < ' ' || (c) == 0x7f)
           #define ISDIGIT(c) (((unsigned)(c)) - '0' < 10)
           #define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))
           #define TOLOWER(c) ((((unsigned)(c)) - 'A' < 26) ? ((c) | 32) : (c))

           /* element count of a real array (not of a pointer) */
           #define LEN(x) (sizeof(x) / sizeof((x)[0]))
           35 
           36 /* URI: components as split by uri_parse(), each stored as a NUL-terminated
                 string in a fixed-size buffer. uri_parse() clears all fields first and
                 returns -1 when proto, userinfo, host or port would not fit. */
           37 struct uri {
           38         char proto[48];     /* scheme including ":" or "://" */
           39         char userinfo[256]; /* username [:password] */
           40         char host[256];     /* host name, or IPv6 address including the brackets */
           41         char port[6];       /* numeric port */
           42         char path[1024];
           43         char query[1024];
           44         char fragment[1024];
           45 };
           46 
           47 /* options: the command-line flag is shown in parentheses, the initializer is
                 the default value */
           48 static int allowansi     = 0;  /* (-a) allow ANSI escape codes */
           49 static int uniqrefs      = 0;  /* (-d) number unique references */
           50 static int showrefinline = 0;  /* (-i) show link reference number inline */
           51 static int showurlinline = 0;  /* (-I) show full link reference inline */
           52 static int showrefbottom = 0;  /* (-l) show link references at the bottom */
           53 static int allowlinewrap = 0;  /* (-r) line-wrapping */
           54 static int termwidth     = 77; /* (-w) terminal width */
           55 static int resources     = 0;  /* (-x) write resources line-by-line to fd 3? */
           56 
           /* display types: one bit each, so several can be OR'ed together (the tags
              table does this, e.g. DisplayInline | DisplayButton) */
           57 enum DisplayType {
           58         DisplayUnknown     = 0,
           59         DisplayInline      = 1 << 0,
           60         DisplayInlineBlock = 1 << 1, /* unused for now */
           61         DisplayBlock       = 1 << 2,
           62         DisplayNone        = 1 << 3,
           63         DisplayPre         = 1 << 4,
           64         DisplayList        = 1 << 5,
           65         DisplayListOrdered = 1 << 6,
           66         DisplayListItem    = 1 << 7,
           67         DisplayTable       = 1 << 8,
           68         DisplayTableRow    = 1 << 9,
           69         DisplayTableCell   = 1 << 10,
           70         DisplayHeader      = 1 << 11,
           71         DisplayDl          = 1 << 12,
           72         DisplayInput       = 1 << 13,
           73         DisplayButton      = 1 << 14,
           74         DisplaySelect      = 1 << 15,
           75         DisplaySelectMulti = 1 << 16,
           76         DisplayOption      = 1 << 17
           77 };
           78 
           79 /* ANSI markup: one bit per text attribute; curmarkup holds the current state */
           80 enum MarkupType {
           81         MarkupNone        = 0,
           82         MarkupBold        = 1 << 0,
           83         MarkupItalic      = 1 << 1,
           84         MarkupUnderline   = 1 << 2,
           85         MarkupBlink       = 1 << 3, /* lol */
           86         MarkupReverse     = 1 << 4,
           87         MarkupStrike      = 1 << 5
           88 };
           89 
           90 /* String data / memory pool: growable byte buffer, see string_append() and
                 string_clear() */
           91 typedef struct string {
           92         char   *data;   /* data: may be NULL until something is appended */
           93         size_t  len;    /* string length in bytes, excluding the terminating NUL */
           94         size_t  bufsiz; /* allocated size */
           95 } String;
           96 
           /* tag ids: numbering starts at 1 and the ids are listed in the same
              (alphabetical) order as the rows of the tags table */
           97 enum TagId { TagA = 1, TagAddress, TagArea, TagArticle, TagAside, TagAudio,
           98         TagB, TagBase, TagBlink, TagBlockquote, TagBody, TagBr, TagButton,
           99         TagCite, TagCol, TagColgroup, TagDatalist, TagDd, TagDel, TagDetails,
          100         TagDfn, TagDir, TagDiv, TagDl, TagDt, TagEm, TagEmbed, TagFieldset,
          101         TagFigcaption, TagFigure, TagFooter, TagForm, TagFrame, TagH1, TagH2,
          102         TagH3, TagH4, TagH5, TagH6, TagHead, TagHeader, TagHr, TagHtml, TagI,
          103         TagIframe, TagImg, TagInput, TagIns, TagLabel, TagLegend, TagLi,
          104         TagLink, TagMain, TagMark, TagMenu, TagMeta, TagNav, TagObject, TagOl,
          105         TagOption, TagP, TagParam, TagPre, TagS, TagScript, TagSearch,
          106         TagSection, TagSelect, TagSource, TagStrike, TagStrong, TagStyle,
          107         TagSummary, TagSvg, TagTable, TagTbody, TagTd, TagTemplate,
          108         TagTextarea, TagTfoot, TagTh, TagThead, TagTitle, TagTr, TagTrack,
          109         TagU, TagUl, TagVar, TagVideo, TagWbr, TagXmp };
          110 
          /* description of a known HTML tag; the tags table has one entry per tag and
             initializes the fields in exactly this order */
          111 struct tag {
          112         const char *name; /* tag name as written in the tags table (lower-case there) */
          113         enum TagId id;
          114         enum DisplayType displaytype; /* DisplayType bits, may be OR'ed */
          115         enum MarkupType markuptype; /* ANSI markup */
          116         enum DisplayType parenttype; /* display type belonging to element */
          117         int isvoid; /* "void" element */
          118         int isoptional; /* optional to close tag */
          119         int margintop; /* newlines when the tag starts */
          120         int marginbottom; /* newlines after the tag ends */
          121         int indent; /* indent in cells */
          122 };
          123 
          /* state kept for an element of the node tree (see the "nodes" array: one
             node per depth level is remembered) */
          124 struct node {
          125         char tagname[256];
          126         struct tag tag; /* a struct tag stored by value (presumably copied from the tags table - confirm) */
          127         size_t nchildren; /* child node count */
          128         size_t visnchildren; /* child node count which are visible */
          129         /* attributes */
          130         char id[256];
          131         char classnames[1024];
          132         int indent; /* indent per node, for formatting */
          133         int hasdata; /* tag contains some data, for formatting */
          134 };
          135 
          /* one element of a selector; it carries the same matching fields as struct
             node (tag name, id, class names) plus an optional index */
          136 struct selectornode {
          137         char tagname[256];
          138         long index; /* index of node to match on: -1 if not matching on index */
          139         /* attributes */
          140         char id[256];
          141         char classnames[1024];
          142 };
          143 
          /* a selector made of up to 32 selectornodes */
          144 struct selector {
          145         const char *text; /* presumably the selector as text - TODO confirm */
          146         struct selectornode nodes[32];
          147         int depth; /* presumably the number of entries used in nodes - TODO confirm */
          148 };
          149 
          150 /* list of selectors: array of selector pointers and its entry count */
          151 struct selectors {
          152         struct selector **selectors;
          153         size_t count;
          154 };
          155 
          156 /* RB tree of link references; entries are ordered by url, see linkrefcmp() */
          157 struct linkref {
          158         char *type;
          159         enum TagId tagid;
          160         char *url; /* key used for ordering in the tree */
          161         int ishidden;
          162         size_t linknr;
          163         RB_ENTRY(linkref) entry; /* tree linkage used by RB_GENERATE() */
          164 };
          165 
          166 /* link references and hidden link references: two arrays of linkref
                 pointers, each with its own count and capacity */
          167 static struct linkref **visrefs;
          168 static size_t nvisrefs, ncapvisrefs; /* visible link count / capacity */
          169 static struct linkref **hiddenrefs;
          170 static size_t nhiddenrefs, ncaphiddenrefs; /* hidden link count / capacity */
          171 
          172 /* compare link by URL for link references RB-tree */
          173 static int
          174 linkrefcmp(struct linkref *r1, struct linkref *r2)
          175 {
          176         return strcmp(r1->url, r2->url);
          177 }
          178 
          /* define the tree head "linkrefhead" and generate the tree functions for
             struct linkref, ordered by linkrefcmp() (RB_* macros presumably come from
             tree.h - confirm) */
          179 RB_HEAD(linkreftree, linkref) linkrefhead = RB_INITIALIZER(&linkrefhead);
          180 RB_GENERATE(linkreftree, linkref, entry, linkrefcmp)
          181 
          /* fixed output strings; NOTE(review): their use is outside this view, going
             by the names they mark list items, checked checkboxes, rulers and checked
             radio buttons - confirm */
          182 static const char *str_bullet_item = "* ";
          183 static const char *str_checkbox_checked = "x";
          184 static const char *str_ruler = "-";
          185 static const char *str_radio_checked = "*";
          186 
          187 /* base href, to make URLs absolute */
          188 static char basehrefdoc[4096]; /* buffer for base href in document, if any */
          189 static int basehrefset; /* base href set and can be used? */
          190 static struct uri base; /* parsed current base href */
          191 
          192 /* buffers for some attributes of the current tag */
          193 static String attr_alt; /* alt attribute */
          194 static String attr_checked; /* checked attribute */
          195 static String attr_class; /* class attribute */
          196 static int attr_class_set; /* class attribute is set already */
          197 static String attr_data; /* data attribute */
          198 static String attr_href; /* href attribute */
          199 static String attr_id; /* id attribute */
          200 static int attr_id_set; /* id attribute is set already */
          201 static String attr_src; /* src attribute */
          202 static String attr_type; /* type attribute */
          203 static String attr_value; /* value attribute */
          204 
          205 static String htmldata; /* buffered HTML data near the current tag */
          206 
          207 /* for white-space output handling:
          208    1 = whitespace emitted (suppress repeated), 2 = other characters on this line
          209    Behaviour:
          210    * White-space data before non-whitespace data in tags is ignored on a line.
          211    * Repeated white-space is collapsed: a single space (' ') is emitted.
          212 */
          213 static int whitespace_mode;
          214 static int nbytesline; /* bytes on this line */
          215 static int ncells; /* current cell/column count */
          216 static int hadnewline; /* count for repeated newlines */
          217 /* flag for skipping initial white-space in tag: for HTML white-space handling */
          218 static int skipinitialws = 1;
              /* a macro, so it can also be used in the constant initializers of the tags
                 table (as -DEFAULT_INDENT) */
          219 #define DEFAULT_INDENT 2
          220 static const int defaultindent = DEFAULT_INDENT; /* default indent / margin */
          221 static int indent; /* indent for the current line, in columns */
          222 /* previous output sequential newlines, used for calculating margins between
          223    elements and reducing excessive newlines */
          224 static int currentnewlines;
          225 
          226 /* buffers for line-wrapping (buffer per word boundary) */
          227 static char rbuf[1024];
          228 static int rbuflen; /* presumably the number of bytes used in rbuf - TODO confirm */
          229 static int rnbufcells; /* pending cell count to add */
          230 
          231 #define MAX_NODE_DEPTH 4096 /* absolute maximum node depth */
          232 static struct node *nodes; /* node tree (one per level is remembered) */
          233 static String *nodes_links; /* keep track of links per node */
          234 static size_t ncapnodes; /* current allocated node capacity */
          235 static int curnode; /* current node depth */
          236 
          237 /* reader / selector mode (-s) */
          238 static int reader_mode;
          239 /* flag if the tags and their children should be ignored in the current context */
          240 static int reader_ignore;
          241 
          242 static enum MarkupType curmarkup; /* current markup state (bold, underline, etc) */
          243 static int linewrap; /* allow linewrap in this context */
          244 
          245 /* selector to match (for -s and -u); NOTE(review): which flag fills sel_show
                 and which fills sel_hide is not visible here - confirm */
          246 static struct selectors *sel_hide, *sel_show;
          247 
          248 /* tags table: needs to be sorted like tagcmp(), alphabetically.
                 The columns follow the field order of struct tag; the single-letter
                 columns are: v = isvoid, o = isoptional, b = margintop (newlines before),
                 a = marginbottom (newlines after), i = indent (in cells). */
          249 
          250 /* tag          id             displaytype                       markup           parent           v  o  b  a  i */
          251 static struct tag tags[] = {
          252 { "a",          TagA,          DisplayInline,                    MarkupUnderline, 0,               0, 0, 0, 0, 0 },
          253 { "address",    TagAddress,    DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          254 { "area",       TagArea,       DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
          255 { "article",    TagArticle,    DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          256 { "aside",      TagAside,      DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          257 { "audio",      TagAudio,      DisplayInline,                    MarkupUnderline, 0,               0, 0, 0, 0, 0 },
          258 { "b",          TagB,          DisplayInline,                    MarkupBold,      0,               0, 0, 0, 0, 0 },
          259 { "base",       TagBase,       DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
          260 { "blink",      TagBlink,      DisplayInline,                    MarkupBlink,     0,               0, 0, 0, 0, 0 },
          261 { "blockquote", TagBlockquote, DisplayBlock,                     0,               0,               0, 0, 0, 0, 2 },
          262 { "body",       TagBody,       DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          263 { "br",         TagBr,         0,                                0,               0,               1, 0, 0, 0, 0 },
          264 { "button",     TagButton,     DisplayInline | DisplayButton,    0,               0,               0, 0, 0, 0, 0 },
          265 { "cite",       TagCite,       DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
          266 { "col",        TagCol,        DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
          267 { "colgroup",   TagColgroup,   DisplayInline,                    0,               0,               0, 1, 0, 0, 0 },
          268 { "datalist",   TagDatalist,   DisplayNone,                      0,               0,               0, 0, 0, 0, 0 },
          269 { "dd",         TagDd,         DisplayBlock,                     0,               0,               0, 1, 0, 0, 4 },
          270 { "del",        TagDel,        DisplayInline,                    MarkupStrike,    0,               0, 0, 0, 0, 0 },
          271 { "details",    TagDetails,    DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          272 { "dfn",        TagDfn,        DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
          273 { "dir",        TagDir,        DisplayList,                      0,               0,               0, 0, 1, 1, 2 },
          274 { "div",        TagDiv,        DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          275 { "dl",         TagDl,         DisplayBlock | DisplayDl,         0,               0,               0, 0, 0, 0, 0 },
          276 { "dt",         TagDt,         DisplayBlock,                     MarkupBold,      0,               0, 1, 0, 0, 0 },
          277 { "em",         TagEm,         DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
          278 { "embed",      TagEmbed,      DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
          279 { "fieldset",   TagFieldset,   DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          280 { "figcaption", TagFigcaption, DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          281 { "figure",     TagFigure,     DisplayBlock,                     0,               0,               0, 0, 1, 1, 4 },
          282 { "footer",     TagFooter,     DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          283 { "form",       TagForm,       DisplayBlock,                     0,               0,               0, 0, 0, 1, 0 },
          284 { "frame",      TagFrame,      DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
          285 { "h1",         TagH1,         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
          286 { "h2",         TagH2,         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
          287 { "h3",         TagH3,         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
          288 { "h4",         TagH4,         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
          289 { "h5",         TagH5,         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
          290 { "h6",         TagH6,         DisplayHeader,                    MarkupBold,      0,               0, 0, 1, 1, -DEFAULT_INDENT },
          291 { "head",       TagHead,       DisplayBlock,                     0,               0,               0, 1, 0, 0, 0 },
          292 { "header",     TagHeader,     DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          293 { "hr",         TagHr,         DisplayBlock,                     0,               0,               1, 0, 0, 0, 0 },
          294 { "html",       TagHtml,       DisplayBlock,                     0,               0,               0, 1, 0, 0, 0 },
          295 { "i",          TagI,          DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
          296 { "iframe",     TagIframe,     DisplayInline,                    0,               0,               0, 0, 0, 0, 0 },
          297 { "img",        TagImg,        DisplayInline,                    MarkupUnderline, 0,               1, 0, 0, 0, 0 },
          298 { "input",      TagInput,      DisplayInput,                     0,               0,               1, 0, 0, 0, 0 },
          299 { "ins",        TagIns,        DisplayInline,                    MarkupUnderline, 0,               0, 0, 0, 0, 0 },
          300 { "label",      TagLabel,      DisplayInline,                    0,               0,               0, 0, 0, 0, 0 },
          301 { "legend",     TagLegend,     DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          302 { "li",         TagLi,         DisplayListItem,                  0,               DisplayList,     0, 1, 0, 0, 0 },
          303 { "link",       TagLink,       DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
          304 { "main",       TagMain,       DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          305 { "mark",       TagMark,       DisplayInline,                    MarkupReverse,   0,               0, 0, 0, 0, 0 },
          306 { "menu",       TagMenu,       DisplayList,                      0,               0,               0, 0, 1, 1, 2 },
          307 { "meta",       TagMeta,       DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
          308 { "nav",        TagNav,        DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          309 { "object",     TagObject,     DisplayInline,                    0,               0,               0, 0, 0, 0, 0 },
          310 { "ol",         TagOl,         DisplayList | DisplayListOrdered, 0,               0,               0, 0, 1, 1, 0 },
          311 { "option",     TagOption,     DisplayInline | DisplayOption,    0,               0,               0, 1, 0, 0, 0 },
          312 { "p",          TagP,          DisplayBlock,                     0,               0,               0, 1, 1, 1, 0 },
          313 { "param",      TagParam,      DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
          314 { "pre",        TagPre,        DisplayPre,                       0,               0,               0, 0, 1, 1, 4 },
          315 { "s",          TagS,          DisplayInline,                    MarkupStrike,    0,               0, 0, 0, 0, 0 },
          316 { "script",     TagScript,     DisplayNone,                      0,               0,               0, 0, 0, 0, 0 },
          317 { "search",     TagSearch,     DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          318 { "section",    TagSection,    DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          319 { "select",     TagSelect,     DisplayInline | DisplaySelect,    0,               0,               0, 0, 0, 0, 0 },
          320 { "source",     TagSource,     DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
          321 { "strike",     TagStrike,     DisplayInline,                    MarkupStrike,    0,               0, 0, 0, 0, 0 },
          322 { "strong",     TagStrong,     DisplayInline,                    MarkupBold,      0,               0, 0, 0, 0, 0 },
          323 { "style",      TagStyle,      DisplayNone,                      0,               0,               0, 0, 0, 0, 0 },
          324 { "summary",    TagSummary,    DisplayBlock,                     0,               0,               0, 0, 0, 0, 0 },
          325 { "svg",        TagSvg,        DisplayNone,                      0,               0,               0, 0, 0, 0, 0 },
          326 { "table",      TagTable,      DisplayTable,                     0,               0,               0, 0, 0, 0, 0 },
          327 { "tbody",      TagTbody,      DisplayInline,                    0,               DisplayTable,    0, 1, 0, 0, 0 },
          328 { "td",         TagTd,         DisplayTableCell,                 0,               DisplayTableRow, 0, 1, 0, 0, 0 },
          329 { "template",   TagTemplate,   DisplayNone,                      0,               0,               0, 0, 0, 0, 0 },
          330 { "textarea",   TagTextarea,   DisplayInline,                    0,               0,               0, 0, 0, 0, 0 },
          331 { "tfoot",      TagTfoot,      DisplayInline,                    0,               DisplayTable,    0, 1, 0, 0, 0 },
          332 { "th",         TagTh,         DisplayTableCell,                 MarkupBold,      DisplayTableRow, 0, 1, 0, 0, 0 },
          333 { "thead",      TagThead,      DisplayInline,                    0,               DisplayTable,    0, 1, 0, 0, 0 },
          334 { "title",      TagTitle,      DisplayBlock,                     0,               0,               0, 0, 0, 1, -DEFAULT_INDENT },
          335 { "tr",         TagTr,         DisplayTableRow,                  0,               DisplayTable,    0, 1, 0, 0, 0 },
          336 { "track",      TagTrack,      DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
          337 { "u",          TagU,          DisplayInline,                    MarkupUnderline, 0,               0, 0, 0, 0, 0 },
          338 { "ul",         TagUl,         DisplayList,                      0,               0,               0, 0, 1, 1, 2 },
          339 { "var",        TagVar,        DisplayInline,                    MarkupItalic,    0,               0, 0, 0, 0, 0 },
          340 { "video",      TagVideo,      DisplayInline,                    MarkupUnderline, 0,               0, 0, 0, 0, 0 },
          341 { "wbr",        TagWbr,        DisplayInline,                    0,               0,               1, 0, 0, 0, 0 },
          342 { "xmp",        TagXmp,        DisplayPre,                       0,               0,               0, 0, 1, 1, 4 }
          343 };
          344 
          345 /* hint for compilers and static analyzers that a function exits; defined as
                 empty here when nothing defined __dead before, so it can always be used
                 on err() and errx() */
          346 #ifndef __dead
          347 #define __dead
          348 #endif
          349 
          350 /* print to stderr, print error message of errno and exit(). */
          351 __dead static void
          352 err(int exitstatus, const char *fmt, ...)
          353 {
          354         va_list ap;
          355         int saved_errno;
          356 
          357         saved_errno = errno;
          358 
          359         fputs("webdump: ", stderr);
          360         if (fmt) {
          361                 va_start(ap, fmt);
          362                 vfprintf(stderr, fmt, ap);
          363                 va_end(ap);
          364                 fputs(": ", stderr);
          365         }
          366         fprintf(stderr, "%s\n", strerror(saved_errno));
          367 
          368         exit(exitstatus);
          369 }
          370 
          371 /* print to stderr and exit(). */
          372 __dead static void
          373 errx(int exitstatus, const char *fmt, ...)
          374 {
          375         va_list ap;
          376 
          377         fputs("webdump: ", stderr);
          378         if (fmt) {
          379                 va_start(ap, fmt);
          380                 vfprintf(stderr, fmt, ap);
          381                 va_end(ap);
          382         }
          383         fputs("\n", stderr);
          384 
          385         exit(exitstatus);
          386 }
          387 
          /* state for getnext_ignore(): endtag is the string searched for, ignorestate
             points at the next character of endtag that has to match, and getnext is
             the reader that getnext_ignore() reads from and restores into
             parser.getnext after a full match */
          388 static const char *ignorestate, *endtag;
          389 static int (*getnext)(void);
          390 
          391 /* return a space for all data until some case-insensitive string occurs. This
          392    is used to parse incorrect HTML/XML that contains unescaped HTML in script
          393    or style tags. If you see some </script> tag in a CDATA or comment
          394    section then e-mail W3C and tell them the web is too complex. */
          395 static inline int
          396 getnext_ignore(void)
          397 {
          398         int c;
          399 
          400         if ((c = getnext()) == EOF)
          401                 return EOF;
          402 
          403         if (TOLOWER((unsigned char)c) == TOLOWER((unsigned char)*ignorestate)) {
          404                 ignorestate++;
          405                 if (*ignorestate == '\0') {
          406                         parser.getnext = getnext; /* restore */
          407                         return ' ';
          408                 }
          409         } else {
          410                 ignorestate = endtag; /* no full match: reset to beginning */
          411         }
          412 
          413         return ' '; /* pretend there is just SPACEs */
          414 }
          415 
          416 /* Clear string only; don't free, prevents unnecessary reallocation. */
          417 static void
          418 string_clear(String *s)
          419 {
          420         if (s->data)
          421                 s->data[0] = '\0';
          422         s->len = 0;
          423 }
          424 
          425 static void
          426 string_buffer_realloc(String *s, size_t newlen)
          427 {
          428         size_t alloclen;
          429 
          430         for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
          431                 ;
          432         if (!(s->data = realloc(s->data, alloclen)))
          433                 err(1, "realloc");
          434         s->bufsiz = alloclen;
          435 }
          436 
          437 static void
          438 string_append(String *s, const char *data, size_t len)
          439 {
          440         if (!len)
          441                 return;
          442         /* check if allocation is necesary, don't shrink buffer,
          443          * should be more than bufsiz ofcourse. */
          444         if (s->len + len >= s->bufsiz)
          445                 string_buffer_realloc(s, s->len + len + 1);
          446         memcpy(s->data + s->len, data, len);
          447         s->len += len;
          448         s->data[s->len] = '\0';
          449 }
          450 
          /* strdup() that never returns NULL: on failure it exits through err(). The
             returned copy is owned by the caller. */
          static char *
          estrdup(const char *s)
          {
                  char *copy = strdup(s);

                  if (copy == NULL)
                          err(1, "strdup");
                  return copy;
          }
          460 
          /* strndup() that never returns NULL: copies at most n bytes of s and exits
             through err() on failure. The returned copy is owned by the caller. */
          static char *
          estrndup(const char *s, size_t n)
          {
                  char *copy = strndup(s, n);

                  if (copy == NULL)
                          err(1, "strndup");
                  return copy;
          }
          470 
          /* realloc() wrapper: a NULL result is treated as fatal and exits through
             err(), so the returned pointer is never NULL. */
          static void *
          erealloc(void *p, size_t siz)
          {
                  void *mem = realloc(p, siz);

                  if (mem == NULL)
                          err(1, "realloc");

                  return mem;
          }
          479 
          /* calloc() wrapper: a NULL result is treated as fatal and exits through
             err(), so the returned pointer is never NULL. */
          static void *
          ecalloc(size_t nmemb, size_t size)
          {
                  void *mem = calloc(nmemb, size);

                  if (mem == NULL)
                          err(1, "calloc");
                  return mem;
          }
          489 
          /* Check if string s starts with a non-empty scheme / protocol part: a run of
             ASCII letters, digits, '+', '-' or '.' directly followed by ':'.
             Returns 1 if so, otherwise 0. A string that starts with ":" has an empty
             scheme and is treated as a path (returns 0). */
          static int
          uri_hasscheme(const char *s)
          {
                  const char *end = s;

                  /* skip over the characters that may form a scheme */
                  while (ISALPHA((unsigned char)*end) || ISDIGIT((unsigned char)*end) ||
                         *end == '+' || *end == '-' || *end == '.')
                          end++;

                  if (end == s)
                          return 0; /* empty scheme */
                  return *end == ':';
          }
          502 
/* Parse the URI string `s` into its separate parts in `u`.
   All fields of `u` are reset to the empty string first; parts that do not
   occur in `s` stay empty.
   Returns 0 on success, or -1 when a part does not fit in its field, an
   IPv6 host is malformed or the port is not valid. On failure the fields
   parsed before the error keep their value. */
static int
uri_parse(const char *s, struct uri *u)
{
        const char *p = s;
        char *endptr;
        size_t i;
        long l;

        u->proto[0] = u->userinfo[0] = u->host[0] = u->port[0] = '\0';
        u->path[0] = u->query[0] = u->fragment[0] = '\0';

        /* protocol-relative */
        if (*p == '/' && *(p + 1) == '/') {
                p += 2; /* skip "//" */
                goto parseauth;
        }

        /* scheme / protocol part */
        for (; ISALPHA((unsigned char)*p) || ISDIGIT((unsigned char)*p) ||
                       *p == '+' || *p == '-' || *p == '.'; p++)
                ;
        /* scheme, except if empty and starts with ":" then it is a path */
        if (*p == ':' && p != s) {
                if (*(p + 1) == '/' && *(p + 2) == '/')
                        p += 3; /* skip "://" */
                else
                        p++; /* skip ":" */

                /* the stored protocol includes the ":" or "://" separator */
                if ((size_t)(p - s) >= sizeof(u->proto))
                        return -1; /* protocol too long */
                memcpy(u->proto, s, p - s);
                u->proto[p - s] = '\0';

                /* only a scheme ending in "://" is followed by an authority
                   part, otherwise the remainder is parsed as the path */
                if (*(p - 1) != '/')
                        goto parsepath;
        } else {
                p = s; /* no scheme format, reset to start */
                goto parsepath;
        }

parseauth:
        /* userinfo (username:password) */
        i = strcspn(p, "@/?#");
        if (p[i] == '@') {
                if (i >= sizeof(u->userinfo))
                        return -1; /* userinfo too long */
                memcpy(u->userinfo, p, i);
                u->userinfo[i] = '\0';
                p += i + 1; /* also skip the "@" */
        }

        /* IPv6 address */
        if (*p == '[') {
                /* bracket not found, host too short or too long */
                i = strcspn(p, "]");
                if (p[i] != ']' || i < 3)
                        return -1;
                i++; /* including "]" */
        } else {
                /* domain / host part, skip until port, path or end. */
                i = strcspn(p, ":/?#");
        }
        if (i >= sizeof(u->host))
                return -1; /* host too long */
        memcpy(u->host, p, i);
        u->host[i] = '\0';
        p += i;

        /* port */
        if (*p == ':') {
                p++;
                if ((i = strcspn(p, "/?#")) >= sizeof(u->port))
                        return -1; /* port too long */
                memcpy(u->port, p, i);
                u->port[i] = '\0';
                /* check for valid port: range 1 - 65535, may be empty */
                errno = 0;
                l = strtol(u->port, &endptr, 10);
                /* `i` is the port length: an empty port skips the checks */
                if (i && (errno || *endptr || l <= 0 || l > 65535))
                        return -1;
                p += i;
        }

parsepath:
        /* path */
        if ((i = strcspn(p, "?#")) >= sizeof(u->path))
                return -1; /* path too long */
        memcpy(u->path, p, i);
        u->path[i] = '\0';
        p += i;

        /* query: stored without the leading "?" */
        if (*p == '?') {
                p++;
                if ((i = strcspn(p, "#")) >= sizeof(u->query))
                        return -1; /* query too long */
                memcpy(u->query, p, i);
                u->query[i] = '\0';
                p += i;
        }

        /* fragment: stored without the leading "#", runs until the end */
        if (*p == '#') {
                p++;
                if ((i = strlen(p)) >= sizeof(u->fragment))
                        return -1; /* fragment too long */
                memcpy(u->fragment, p, i);
                u->fragment[i] = '\0';
        }

        return 0;
}
          615 
          616 /* Transform and try to make the URI `u` absolute using base URI `b` into `a`.
          617    Follows some of the logic from "RFC 3986 - 5.2.2. Transform References".
          618    Returns 0 on success, -1 on error or truncation. */
          619 static int
          620 uri_makeabs(struct uri *a, struct uri *u, struct uri *b)
          621 {
          622         char *p;
          623         int c;
          624 
          625         strlcpy(a->fragment, u->fragment, sizeof(a->fragment));
          626 
          627         if (u->proto[0] || u->host[0]) {
          628                 strlcpy(a->proto, u->proto[0] ? u->proto : b->proto, sizeof(a->proto));
          629                 strlcpy(a->host, u->host, sizeof(a->host));
          630                 strlcpy(a->userinfo, u->userinfo, sizeof(a->userinfo));
          631                 strlcpy(a->host, u->host, sizeof(a->host));
          632                 strlcpy(a->port, u->port, sizeof(a->port));
          633                 strlcpy(a->path, u->path, sizeof(a->path));
          634                 strlcpy(a->query, u->query, sizeof(a->query));
          635                 return 0;
          636         }
          637 
          638         strlcpy(a->proto, b->proto, sizeof(a->proto));
          639         strlcpy(a->host, b->host, sizeof(a->host));
          640         strlcpy(a->userinfo, b->userinfo, sizeof(a->userinfo));
          641         strlcpy(a->host, b->host, sizeof(a->host));
          642         strlcpy(a->port, b->port, sizeof(a->port));
          643 
          644         if (!u->path[0]) {
          645                 strlcpy(a->path, b->path, sizeof(a->path));
          646         } else if (u->path[0] == '/') {
          647                 strlcpy(a->path, u->path, sizeof(a->path));
          648         } else {
          649                 a->path[0] = (b->host[0] && b->path[0] != '/') ? '/' : '\0';
          650                 a->path[1] = '\0';
          651 
          652                 if ((p = strrchr(b->path, '/'))) {
          653                         c = *(++p);
          654                         *p = '\0'; /* temporary NUL-terminate */
          655                         if (strlcat(a->path, b->path, sizeof(a->path)) >= sizeof(a->path))
          656                                 return -1;
          657                         *p = c; /* restore */
          658                 }
          659                 if (strlcat(a->path, u->path, sizeof(a->path)) >= sizeof(a->path))
          660                         return -1;
          661         }
          662 
          663         if (u->path[0] || u->query[0])
          664                 strlcpy(a->query, u->query, sizeof(a->query));
          665         else
          666                 strlcpy(a->query, b->query, sizeof(a->query));
          667 
          668         return 0;
          669 }
          670 
          671 static int
          672 uri_format(char *buf, size_t bufsiz, struct uri *u)
          673 {
          674         return snprintf(buf, bufsiz, "%s%s%s%s%s%s%s%s%s%s%s%s",
          675                 u->proto,
          676                 u->userinfo[0] ? u->userinfo : "",
          677                 u->userinfo[0] ? "@" : "",
          678                 u->host,
          679                 u->port[0] ? ":" : "",
          680                 u->port,
          681                 u->host[0] && u->path[0] && u->path[0] != '/' ? "/" : "",
          682                 u->path,
          683                 u->query[0] ? "?" : "",
          684                 u->query,
          685                 u->fragment[0] ? "#" : "",
          686                 u->fragment);
          687 }
          688 
          689 /* compare tag name (case-insensitive) */
          690 static int
          691 tagcmp(const char *s1, const char *s2)
          692 {
          693         return strcasecmp(s1, s2);
          694 }
          695 
          696 /* compare attribute name (case-insensitive) */
          697 static int
          698 attrcmp(const char *s1, const char *s2)
          699 {
          700         return strcasecmp(s1, s2);
          701 }
          702 
          703 static void
          704 rindent(void)
          705 {
          706         int i, total;
          707 
          708         total = indent + defaultindent;
          709         if (total < 0)
          710                 total = 0;
          711         for (i = 0; i < total; i++)
          712                 putchar(' ');
          713 
          714         nbytesline += total;
          715         ncells += total;
          716 }
          717 
          718 static void
          719 emitmarkup(int markuptype)
          720 {
          721         if (!allowansi)
          722                 return;
          723 
          724         if (!markuptype)
          725                 fputs("\033[0m", stdout); /* reset all attributes */
          726 
          727         /* set */
          728         if (markuptype & MarkupBold)
          729                 fputs("\033[1m", stdout);
          730         if (markuptype & MarkupItalic)
          731                 fputs("\033[3m", stdout);
          732         if (markuptype & MarkupUnderline)
          733                 fputs("\033[4m", stdout);
          734         if (markuptype & MarkupBlink)
          735                 fputs("\033[5m", stdout);
          736         if (markuptype & MarkupReverse)
          737                 fputs("\033[7m", stdout);
          738         if (markuptype & MarkupStrike)
          739                 fputs("\033[9m", stdout);
          740 }
          741 
          742 /* flush remaining buffer (containing a word): used for word-wrap handling */
          743 static void
          744 hflush(void)
          745 {
          746         int i;
          747 
          748         if (!rbuflen)
          749                 return;
          750 
          751         if (!nbytesline) {
          752                 if (curmarkup)
          753                         emitmarkup(0);
          754                 rindent();
          755                 /* emit code again per line, needed for GNU/less -R */
          756                 if (curmarkup)
          757                         emitmarkup(curmarkup);
          758         }
          759 
          760         for (i = 0; i < rbuflen; i++)
          761                 putchar(rbuf[i]);
          762 
          763         nbytesline += rbuflen;
          764         ncells += rnbufcells;
          765         rbuflen = 0;
          766         rnbufcells = 0;
          767 }
          768 
          769 static void
          770 printansi(const char *s)
          771 {
          772         size_t len;
          773 
          774         if (!allowansi)
          775                 return;
          776 
          777         if (linewrap) {
          778                 len = strlen(s);
          779                 if (rbuflen + len + 1 >= sizeof(rbuf))
          780                         hflush();
          781                 if (rbuflen + len + 1 < sizeof(rbuf)) {
          782                         memcpy(rbuf + rbuflen, s, len);
          783                         rbuflen += len;
          784                         /* NOTE: nbytesline and ncells are not counted for markup */
          785                 }
          786         } else {
          787                 fputs(s, stdout);
          788         }
          789 }
          790 
          791 static void
          792 setmarkup(int markuptype)
          793 {
          794         if (!allowansi)
          795                 return;
          796 
          797         /* need change? */
          798         if (curmarkup == markuptype)
          799                 return;
          800 
          801         if (!markuptype) {
          802                 printansi("\033[0m"); /* reset all attributes */
          803                 curmarkup = markuptype;
          804                 return;
          805         }
          806 
          807         /* set */
          808         if (!(curmarkup & MarkupBold) && (markuptype & MarkupBold))
          809                 printansi("\033[1m");
          810         if (!(curmarkup & MarkupItalic) && (markuptype & MarkupItalic))
          811                 printansi("\033[3m");
          812         if (!(curmarkup & MarkupUnderline) && (markuptype & MarkupUnderline))
          813                 printansi("\033[4m");
          814         if (!(curmarkup & MarkupBlink) && (markuptype & MarkupBlink))
          815                 printansi("\033[5m");
          816         if (!(curmarkup & MarkupReverse) && (markuptype & MarkupReverse))
          817                 printansi("\033[7m");
          818         if (!(curmarkup & MarkupStrike) && (markuptype & MarkupStrike))
          819                 printansi("\033[9m");
          820 
          821         /* unset */
          822         if ((curmarkup & MarkupBold) && !(markuptype & MarkupBold))
          823                 printansi("\033[22m"); /* reset bold or faint */
          824         if ((curmarkup & MarkupItalic) && !(markuptype & MarkupItalic))
          825                 printansi("\033[23m"); /* reset italic */
          826         if ((curmarkup & MarkupUnderline) && !(markuptype & MarkupUnderline))
          827                 printansi("\033[24m"); /* reset underline */
          828         if ((curmarkup & MarkupBlink) && !(markuptype & MarkupBlink))
          829                 printansi("\033[25m"); /* reset blink */
          830         if ((curmarkup & MarkupReverse) && !(markuptype & MarkupReverse))
          831                 printansi("\033[27m"); /* reset reverse */
          832         if ((curmarkup & MarkupStrike) && !(markuptype & MarkupStrike))
          833                 printansi("\033[29m"); /* reset strike */
          834 
          835         curmarkup = markuptype;
          836 }
          837 
          838 static void
          839 startmarkup(int markuptype)
          840 {
          841         setmarkup(curmarkup | markuptype);
          842 }
          843 
          844 static void
          845 endmarkup(int markuptype)
          846 {
          847         setmarkup(curmarkup & ~markuptype);
          848 }
          849 
          850 /* rough cell width of a unicode codepoint by counting a unicode codepoint as 1
          851    cell in general.
          852    NOTE: this is of course incorrect since characters can be 2 width aswell,
          853    in the future maybe replace this with wcwidth() or similar */
          854 static int
          855 utfwidth(int c)
          856 {
          857         /* not the start of a codepoint */
          858         if ((c & 0xc0) == 0x80)
          859                 return 0;
          860         /* count TAB as 8 */
          861         if (c == '\t')
          862                 return 8;
          863         return 1;
          864 }
          865 
/* write a character, handling state of repeated newlines, some HTML
   white-space rules, indentation and word-wrapping.
   The current node is always marked as having data. Leading white-space is
   skipped while `skipinitialws` is set and control characters other than
   newline and TAB are dropped. Without line-wrapping the character is
   written directly, with line-wrapping it is collected in the word buffer
   `rbuf`, which is flushed at a newline, white-space or '-'. */
static void
hputchar(int c)
{
        struct node *cur = &nodes[curnode];
        cur->hasdata = 1;

        if (c == '\n') {
                /* previous line had characters, so not a repeated newline */
                if (nbytesline > 0)
                        hadnewline = 0;

                /* start a new line, no chars on this line yet */
                whitespace_mode &= ~2; /* no chars on this line yet */
                nbytesline = 0;
                ncells = 0;

                if (hadnewline)
                        currentnewlines++; /* repeating newlines */
                hadnewline = 1;
        } else {
                hadnewline = 0;
                currentnewlines = 0;
        }

        /* skip initial/leading white-space */
        if (ISSPACE((unsigned char)c)) {
                if (skipinitialws)
                        return;
        } else {
                skipinitialws = 0;
        }

        /* drop control characters, except for newline and TAB */
        if (!(c == '\n' || c == '\t' || !ISCNTRL((unsigned char)c)))
                return;

        /* no line-wrapping: write the character directly */
        if (!linewrap) {
                if (c == '\n') {
                        putchar('\n');
                        nbytesline = 0;
                        ncells = 0;
                } else {
                        /* first character on the line: indent first */
                        if (!nbytesline) {
                                if (curmarkup)
                                        emitmarkup(0);
                                rindent();
                                /* emit code again per line, needed for GNU/less -R */
                                if (curmarkup)
                                        emitmarkup(curmarkup);
                        }
                        putchar(c);
                        nbytesline++;
                        ncells += utfwidth(c);
                }
                return;
        }

        /* really too long: the whole word doesn't even fit, flush it */
        if (ncells + rnbufcells >= termwidth || rbuflen >= sizeof(rbuf) - 1) {
                putchar('\n');
                nbytesline = 0;
                ncells = 0;
                hflush();
        }

        if (c == '\n') {
                /* end the line, a pending word goes onto the next line */
                putchar('\n');
                hflush();
                return;
        } else if (ISSPACE((unsigned char)c) || c == '-') {
                /* end of a word: wrap to the next line if it does not fit,
                   then write the word including this character */
                if (ncells + rnbufcells >= termwidth) {
                        putchar('\n');
                        nbytesline = 0;
                        ncells = 0;
                }
                rbuf[rbuflen++] = c;
                rnbufcells += utfwidth(c);
                hflush();
                return;
        }

        /* part of a word: only buffer it */
        rbuf[rbuflen++] = c;
        rnbufcells += utfwidth(c);
}
          951 
          952 /* calculate indentation of current node depth, using the sum of each
          953    indentation per node */
          954 static int
          955 calcindent(void)
          956 {
          957         int i, n = 0;
          958 
          959         for (i = curnode; i >= 0; i--)
          960                 n += nodes[i].indent;
          961 
          962         return n;
          963 }
          964 
          965 static void
          966 hprint(const char *s)
          967 {
          968         for (; *s; ++s)
          969                 hputchar(*s);
          970 }
          971 
          972 /* printf(), max 256 bytes for now */
          973 static void
          974 hprintf(const char *fmt, ...)
          975 {
          976         va_list ap;
          977         char buf[256];
          978 
          979         va_start(ap, fmt);
          980         vsnprintf(buf, sizeof(buf), fmt, ap);
          981         va_end(ap);
          982 
          983         /* use hprint() formatting logic. */
          984         hprint(buf);
          985 }
          986 
          987 static void
          988 newline(void)
          989 {
          990         if (skipinitialws)
          991                 return;
          992         hputchar('\n');
          993 }
          994 
          995 static int
          996 parentcontainerhasdata(int curtype, int n)
          997 {
          998         int i;
          999 
         1000         for (i = n; i >= 0; i--) {
         1001                 if (nodes[i].tag.displaytype & (DisplayList|DisplayTable))
         1002                         break;
         1003                 if (nodes[i].hasdata)
         1004                         return 1;
         1005         }
         1006 
         1007         return 0;
         1008 }
         1009 
         1010 /* start on a newline for the start of a block element or not */
         1011 static void
         1012 startblock(void)
         1013 {
         1014         hflush();
         1015         whitespace_mode &= ~2; /* no characters on this line yet */
         1016         if (nbytesline <= 0)
         1017                 return;
         1018         if (!hadnewline && curnode >= 0 && nodes[curnode - 1].hasdata)
         1019                 hputchar('\n');
         1020 }
         1021 
         1022 /* start on a newline for the end of a block element or not */
         1023 static void
         1024 endblock(void)
         1025 {
         1026         hflush();
         1027         whitespace_mode &= ~2; /* no characters on this line yet */
         1028         if (nbytesline <= 0)
         1029                 return;
         1030         if (!hadnewline)
         1031                 hputchar('\n');
         1032 }
         1033 
         1034 /* print one character safely: no control characters,
         1035    handle HTML white-space rules */
         1036 static void
         1037 printc(int c)
         1038 {
         1039         if (ISSPACE((unsigned char)c)) {
         1040                 if (whitespace_mode == 2)
         1041                         hputchar(' ');
         1042                 whitespace_mode |= 1;
         1043         } else {
         1044                 whitespace_mode = 2;
         1045                 if (!ISCNTRL((unsigned char)c))
         1046                         hputchar(c);
         1047         }
         1048 }
         1049 
         1050 static void
         1051 printpre(const char *s, size_t len)
         1052 {
         1053         struct node *cur;
         1054         size_t i;
         1055 
         1056         /* reset state of newlines because this data is printed literally */
         1057         hadnewline = 0;
         1058         currentnewlines = 0;
         1059 
         1060         /* skip leading newline */
         1061         i = 0;
         1062         if (skipinitialws) {
         1063                 if (*s == '\n' && i < len) {
         1064                         s++;
         1065                         i++;
         1066                 }
         1067         }
         1068 
         1069         hflush();
         1070 
         1071         skipinitialws = 0;
         1072 
         1073         if (*s) {
         1074                 cur = &nodes[curnode];
         1075                 cur->hasdata = 1;
         1076         }
         1077 
         1078         for (; *s && i < len; s++, i++) {
         1079                 switch (*s) {
         1080                 case '\n':
         1081                         putchar('\n');
         1082                         nbytesline = 0;
         1083                         ncells = 0;
         1084                         break;
         1085                 case '\t':
         1086                         hadnewline = 0;
         1087                         if (!nbytesline) {
         1088                                 if (curmarkup)
         1089                                         emitmarkup(0);
         1090                                 rindent();
         1091                                 /* emit code again per line, needed for GNU/less -R */
         1092                                 if (curmarkup)
         1093                                         emitmarkup(curmarkup);
         1094                         }
         1095 
         1096                         /* TAB to 8 spaces */
         1097                         fputs("        ", stdout);
         1098                         nbytesline += 8;
         1099                         ncells += 8;
         1100                         break;
         1101                 default:
         1102                         if (ISCNTRL((unsigned char)*s))
         1103                                 continue;
         1104 
         1105                         if (!nbytesline) {
         1106                                 if (curmarkup)
         1107                                         emitmarkup(0);
         1108                                 rindent();
         1109                                 /* emit code again per line, needed for GNU/less -R */
         1110                                 if (curmarkup)
         1111                                         emitmarkup(curmarkup);
         1112                         }
         1113 
         1114                         putchar(*s);
         1115                         nbytesline++;
         1116                         /* start of rune: incorrectly assume 1 rune is 1 cell for now */
         1117                         ncells += utfwidth((unsigned char)*s);
         1118                 }
         1119         }
         1120 }
         1121 
         1122 static struct node *
         1123 findparenttype(int cur, int findtype)
         1124 {
         1125         int i;
         1126 
         1127         for (i = cur; i >= 0; i--) {
         1128                 if ((nodes[i].tag.displaytype & findtype))
         1129                         return &nodes[i];
         1130         }
         1131         return NULL;
         1132 }
         1133 
         1134 static int
         1135 isclassmatch(const char *haystack, const char *needle)
         1136 {
         1137         const char *p;
         1138         size_t needlelen;
         1139         size_t matched = 0;
         1140 
         1141         needlelen = strlen(needle);
         1142         for (p = haystack; *p; p++) {
         1143                 if (ISSPACE((unsigned char)*p)) {
         1144                         matched = 0;
         1145                         continue;
         1146                 }
         1147                 if (needle[matched] == *p)
         1148                         matched++;
         1149                 else
         1150                         matched = 0;
         1151                 if (matched == needlelen) {
         1152                         if (*(p + 1) == '\0' || ISSPACE((unsigned char)*(p + 1)))
         1153                                 return 1;
         1154                 }
         1155         }
         1156 
         1157         return 0;
         1158 }
         1159 
/* very limited CSS-like selector, supports: main, main#id, main.class,
   ".class", "#id", "ul li a".
   Compiles the selector text `sel` into the array `nodes`, one entry per
   white-space separated part. A part is an optional tag name followed by
   any of "@index", "#id" and ".class".
   Returns the number of parts, or 0 when a name does not fit in the
   temporary buffer or when there is no room left in `nodes`, of which
   `maxnodes` is the number of entries. */
static int
compileselector(const char *sel, struct selectornode *nodes, size_t maxnodes)
{
        int depth = 0, len;
        long l;
        const char *s, *start;
        char tmp[256];
        int nameset = 0; /* is the tag name of the current part stored? */

        memset(&nodes[0], 0, sizeof(nodes[0]));
        nodes[0].index = -1;

        /* skip leading white-space */
        s = sel;
        for (; *s && ISSPACE((unsigned char)*s); s++)
                ;

        start = s;
        for (; ; s++) {
                /* end of tag */
                if (!nameset &&
                    (*s == '#' || *s == '.' || *s == '@' ||
                     *s == '\0' || ISSPACE((unsigned char)*s))) {
                        nameset = 1;
                        len = s - start; /* tag name */
                        if (len >= sizeof(tmp))
                                return 0;
                        if (len)
                                memcpy(tmp, start, len);
                        tmp[len] = '\0';

                        /* the name is empty for a part such as ".class" */
                        memcpy(nodes[depth].tagname, tmp, len + 1);
                }

                /* end */
                if (*s == '\0' || ISSPACE((unsigned char)*s)) {
                        for (; ISSPACE((unsigned char)*s); s++)
                                ;
                        start = s; /* start of a new tag */
                        depth++;
                        if (depth >= maxnodes)
                                return 0;

                        /* prepare the entry of the next part */
                        nameset = 0;
                        memset(&nodes[depth], 0, sizeof(nodes[depth]));
                        nodes[depth].index = -1;

                        /* end of selector */
                        if (*s == '\0')
                                break;
                }

                /* index */
                if (*s == '@') {
                        len = strcspn(s + 1, ".#@ \t\n");
                        if (len >= sizeof(tmp))
                                return 0;
                        memcpy(tmp, s + 1, len);
                        tmp[len] = '\0';

                        /* a negative number keeps the index at -1 */
                        l = strtol(tmp, NULL, 10);
                        if (l >= 0)
                                nodes[depth].index = l;
                        s += len;
                        start = s + 1;
                        continue;
                }

                /* id */
                if (*s == '#') {
                        len = strcspn(s + 1, ".#@ \t\n");
                        if (len >= sizeof(tmp))
                                return 0;
                        memcpy(tmp, s + 1, len);
                        tmp[len] = '\0';
                        memcpy(nodes[depth].id, tmp, len + 1);
                        s += len;
                        start = s + 1;
                        continue;
                }

                /* class */
                if (*s == '.') {
                        len = strcspn(s + 1, ".#@ \t\n");
                        if (len >= sizeof(tmp))
                                return 0;
                        memcpy(tmp, s + 1, len);
                        tmp[len] = '\0';
                        /* allow only one classname for now */
                        memcpy(nodes[depth].classnames, tmp, len + 1);
                        s += len;
                        start = s + 1;
                        continue;
                }
        }

        return depth;
}
         1259 
         1260 static struct selector *
         1261 newselector(const char *q)
         1262 {
         1263         struct selector *sel;
         1264         int r;
         1265 
         1266         sel = ecalloc(1, sizeof(*sel));
         1267         sel->text = estrdup(q);
         1268 
         1269         r = compileselector(sel->text, sel->nodes, LEN(sel->nodes));
         1270         if (r <= 0) {
         1271                 free(sel);
         1272                 return NULL;
         1273         }
         1274         sel->depth = r;
         1275 
         1276         return sel;
         1277 }
         1278 
         1279 static struct selectors *
         1280 compileselectors(const char *q)
         1281 {
         1282         struct selectors *sels = NULL;
         1283         struct selector *sel;
         1284         const char *start;
         1285         char *qe;
         1286         int count = 0;
         1287         size_t siz;
         1288 
         1289         sels = ecalloc(1, sizeof(*sels));
         1290 
         1291         start = q;
         1292         for (; ; q++) {
         1293                 if (*q == ',' || *q == '\0') {
         1294                         qe = estrndup(start, q - start);
         1295                         sel = newselector(qe);
         1296                         free(qe);
         1297 
         1298                         /* add new selector */
         1299                         siz = (count + 1) * sizeof(struct selector *);
         1300                         sels->selectors = erealloc(sels->selectors, siz);
         1301                         sels->selectors[count] = sel;
         1302                         count++;
         1303 
         1304                         if (*q == '\0')
         1305                                 break;
         1306                         start = q + 1;
         1307                 }
         1308         }
         1309         sels->count = count;
         1310 
         1311         return sels;
         1312 }
         1313 
         1314 /* very limited CSS-like matcher, supports: main, main#id, main.class,
         1315    ".class", "#id", "ul li a" */
         1316 static int
         1317 iscssmatch(struct selector *sel, struct node *root, int maxdepth)
         1318 {
         1319         int d, md = 0;
         1320 
         1321         for (d = 0; d <= maxdepth; d++) {
         1322                 /* tag matched? */
         1323                 if (sel->nodes[md].tagname[0] &&
         1324                     strcasecmp(sel->nodes[md].tagname, root[d].tagname))
         1325                         continue; /* no */
         1326 
         1327                 /* id matched? */
         1328                 if (sel->nodes[md].id[0] && strcmp(sel->nodes[md].id, root[d].id))
         1329                         continue; /* no */
         1330 
         1331                 /* class matched, for now allow only one classname in the selector,
         1332                    matching multiple classnames */
         1333                 if (sel->nodes[md].classnames[0] &&
         1334                     !isclassmatch(root[d].classnames, sel->nodes[md].classnames))
         1335                         continue; /* no */
         1336 
         1337                 /* index matched */
         1338                 if (sel->nodes[md].index != -1 &&
         1339                     (d == 0 ||
         1340                     root[d - 1].nchildren == 0 ||
         1341                     sel->nodes[md].index != root[d - 1].nchildren - 1))
         1342                         continue;
         1343 
         1344                 md++;
         1345                 /* all matched of one selector */
         1346                 if (md == sel->depth)
         1347                         return 1;
         1348         }
         1349 
         1350         return 0;
         1351 }
         1352 
         1353 static int
         1354 iscssmatchany(struct selectors *sels, struct node *root, int maxdepth)
         1355 {
         1356         struct selector *sel;
         1357         int i;
         1358 
         1359         for (i = 0; i < sels->count; i++) {
         1360                 sel = sels->selectors[i];
         1361                 if (iscssmatch(sel, root, maxdepth))
         1362                         return 1;
         1363         }
         1364         return 0;
         1365 }
         1366 
         1367 static void
         1368 handleinlinealt(void)
         1369 {
         1370         struct node *cur;
         1371         char *start, *s, *e;
         1372 
         1373         /* do not show the alt text if the element is hidden */
         1374         cur = &nodes[curnode];
         1375         if (cur->tag.displaytype & DisplayNone)
         1376                 return;
         1377 
         1378         /* show img alt attribute as text. */
         1379         if (attr_alt.len) {
         1380                 start = attr_alt.data;
         1381                 e = attr_alt.data + attr_alt.len;
         1382 
         1383                 for (s = start; s < e; s++)
         1384                         printc((unsigned char)*s);
         1385                 hflush();
         1386         } else if (cur->tag.id == TagImg && !showurlinline) {
         1387                 /* if there is no alt text and no URL is shown inline, then
         1388                    show "[IMG]" to indicate there was an image there */
         1389                 hprint("[IMG]");
         1390         }
         1391 }
         1392 
         1393 /* lookup a link reference by url in the red-black tree */
         1394 static struct linkref *
         1395 findlinkref(const char *url)
         1396 {
         1397         struct linkref find;
         1398 
         1399         find.url = (char *)url;
         1400 
         1401         return RB_FIND(linkreftree, &linkrefhead, &find);
         1402 }
         1403 
         1404 /* add a link reference. Returns the added link reference, or the existing link
         1405    reference if links are deduplicated */
         1406 static struct linkref *
         1407 addlinkref(const char *url, const char *_type, enum TagId tagid, int ishidden)
         1408 {
         1409         struct linkref *link;
         1410         size_t linknr;
         1411 
         1412         /* if links are deduplicates return the existing link */
         1413         if (uniqrefs && (link = findlinkref(url)))
         1414                 return link;
         1415 
         1416         if (tagid == TagA)
         1417                 _type = "link";
         1418 
         1419         link = ecalloc(1, sizeof(*link));
         1420 
         1421         if (!ishidden) {
         1422                 linknr = ++nvisrefs;
         1423                 if (nvisrefs >= ncapvisrefs) {
         1424                         ncapvisrefs += 256; /* greedy alloc */
         1425                         visrefs = erealloc(visrefs, sizeof(*visrefs) * ncapvisrefs);
         1426                 }
         1427                 visrefs[linknr - 1] = link; /* add pointer to list */
         1428         } else {
         1429                 linknr = ++nhiddenrefs;
         1430                 if (nhiddenrefs >= ncaphiddenrefs) {
         1431                         ncaphiddenrefs += 256; /* greedy alloc */
         1432                         hiddenrefs = erealloc(hiddenrefs, sizeof(*hiddenrefs) * ncaphiddenrefs);
         1433                 }
         1434                 hiddenrefs[linknr - 1] = link; /* add pointer to list */
         1435         }
         1436 
         1437         link->url = estrdup(url);
         1438         link->type = estrdup(_type);
         1439         link->tagid = tagid;
         1440         link->ishidden = ishidden;
         1441         link->linknr = linknr;
         1442 
         1443         /* add to tree: the tree is only used for checking unique link references */
         1444         if (uniqrefs)
         1445                 RB_INSERT(linkreftree, &linkrefhead, link);
         1446 
         1447         return link;
         1448 }
         1449 
         1450 static void
         1451 handleinlinelink(void)
         1452 {
         1453         struct uri newuri, olduri;
         1454         struct node *cur;
         1455         char buf[4096], *url;
         1456         int r;
         1457 
         1458         if (!showrefbottom && !showrefinline && !showurlinline && !resources)
         1459                 return; /* there is no need to collect the reference */
         1460 
         1461         if (!attr_href.len && !attr_src.len && !attr_data.len)
         1462                 return; /* there is no reference */
         1463 
         1464         /* by default use the original URL */
         1465         if (attr_src.len)
         1466                 url = attr_src.data;
         1467         else if (attr_href.len)
         1468                 url = attr_href.data;
         1469         else
         1470                 url = attr_data.data;
         1471 
         1472         if (!url)
         1473                 return;
         1474 
         1475         /* Not an absolute URL yet: try to make it absolute.
         1476            If it is not possible use the relative URL */
         1477         if (!uri_hasscheme(url) && basehrefset &&
         1478             uri_parse(url, &olduri) != -1 &&
         1479             uri_makeabs(&newuri, &olduri, &base) != -1 &&
         1480             newuri.proto[0]) {
         1481                 r = uri_format(buf, sizeof(buf), &newuri);
         1482                 if (r >= 0 && (size_t)r < sizeof(buf))
         1483                         url = buf;
         1484         }
         1485 
         1486         if (!url[0])
         1487                 return;
         1488 
         1489         cur = &nodes[curnode];
         1490 
         1491         if (!(cur->tag.displaytype & DisplayNone)) {
         1492                 string_clear(&nodes_links[curnode]);
         1493                 string_append(&nodes_links[curnode], url, strlen(url));
         1494         }
         1495 
         1496         /* add hidden links directly to the reference,
         1497            the order doesn't matter */
         1498         if (cur->tag.displaytype & DisplayNone)
         1499                 addlinkref(url, cur->tag.name, cur->tag.id, 1);
         1500 }
         1501 
         1502 static void
         1503 printlinkrefs(void)
         1504 {
         1505         struct linkref *ref;
         1506         size_t i;
         1507 
         1508         if (!nvisrefs && !nhiddenrefs)
         1509                 return;
         1510 
         1511         if (resources) {
         1512                 for (i = 0; i < nvisrefs; i++) {
         1513                         ref = visrefs[i];
         1514                         dprintf(3, "%s\t%s\n", ref->type, ref->url);
         1515                 }
         1516                 for (i = 0; i < nhiddenrefs; i++) {
         1517                         ref = hiddenrefs[i];
         1518                         dprintf(3, "%s\t%s\n", ref->type, ref->url);
         1519                 }
         1520         }
         1521 
         1522         printf("\nReferences\n\n");
         1523 
         1524         for (i = 0; i < nvisrefs; i++) {
         1525                 ref = visrefs[i];
         1526                 printf(" %zu. %s (%s)\n", ref->linknr, ref->url, ref->type);
         1527         }
         1528 
         1529         if (nhiddenrefs > 0)
         1530                 printf("\n\nHidden references\n\n");
         1531         /* hidden links don't have a link number, just count them */
         1532         for (i = 0; i < nhiddenrefs; i++) {
         1533                 ref = hiddenrefs[i];
         1534                 printf(" %zu. %s (%s)\n", ref->linknr, ref->url, ref->type);
         1535         }
         1536 }
         1537 
         1538 /* size to grow node capacity (greedy) */
         1539 #define NODE_CAP_INC 16
         1540 
         1541 /* increase node depth, allocate space for nodes if needed */
         1542 static void
         1543 incnode(void)
         1544 {
         1545         size_t i;
         1546 
         1547         curnode++;
         1548 
         1549         if (curnode >= MAX_NODE_DEPTH)
         1550                 errx(1, "max node depth reached: %d", curnode);
         1551 
         1552         if (curnode >= ncapnodes) {
         1553                 nodes = erealloc(nodes, sizeof(*nodes) * (ncapnodes + NODE_CAP_INC));
         1554                 nodes_links = erealloc(nodes_links, sizeof(*nodes_links) * (ncapnodes + NODE_CAP_INC));
         1555 
         1556                 /* clear new region */
         1557                 memset(&nodes[ncapnodes], 0, sizeof(*nodes) * NODE_CAP_INC);
         1558                 memset(&nodes_links[ncapnodes], 0, sizeof(*nodes_links) * NODE_CAP_INC);
         1559 
         1560                 for (i = ncapnodes; i < ncapnodes + NODE_CAP_INC; i++) {
         1561                         nodes[i].tag.displaytype = DisplayInline;
         1562                         nodes[i].tag.name = nodes[i].tagname; /* assign to use fixed-size buffer */
         1563                 }
         1564 
         1565                 ncapnodes += NODE_CAP_INC; /* greedy alloc */
         1566         }
         1567 }
         1568 
         1569 static void
         1570 xmldatastart(XMLParser *p)
         1571 {
         1572 }
         1573 
         1574 static void
         1575 xmldataend(XMLParser *p)
         1576 {
         1577         struct node *cur;
         1578         char *start, *s, *e;
         1579 
         1580         if (!htmldata.data || !htmldata.len)
         1581                 return;
         1582 
         1583         cur = &nodes[curnode];
         1584 
         1585         if (reader_ignore || (cur->tag.displaytype & DisplayNone)) {
         1586                 /* print nothing */
         1587         } else if ((cur->tag.displaytype & DisplayPre) ||
         1588                    findparenttype(curnode - 1, DisplayPre)) {
         1589                 printpre(htmldata.data, htmldata.len);
         1590         } else {
         1591                 start = htmldata.data;
         1592                 e = htmldata.data + htmldata.len;
         1593 
         1594                 for (s = start; s < e; s++)
         1595                         printc((unsigned char)*s);
         1596         }
         1597 
         1598         string_clear(&htmldata);
         1599 }
         1600 
         1601 static void
         1602 xmldata(XMLParser *p, const char *data, size_t datalen)
         1603 {
         1604         struct node *cur;
         1605 
         1606         if (reader_ignore)
         1607                 return;
         1608 
         1609         cur = &nodes[curnode];
         1610         if (cur->tag.displaytype & DisplayNone)
         1611                 return;
         1612 
         1613         string_append(&htmldata, data, datalen);
         1614 }
         1615 
         1616 static void
         1617 xmldataentity(XMLParser *p, const char *data, size_t datalen)
         1618 {
         1619         struct node *cur;
         1620         char buf[8];
         1621         int len;
         1622 
         1623         if (reader_ignore)
         1624                 return;
         1625 
         1626         cur = &nodes[curnode];
         1627         if (cur->tag.displaytype & DisplayNone)
         1628                 return;
         1629 
         1630         len = xml_entitytostr(data, buf, sizeof(buf));
         1631         if (len > 0)
         1632                 xmldata(p, buf, (size_t)len);
         1633         else
         1634                 xmldata(p, data, datalen);
         1635 }
         1636 
         1637 static void
         1638 xmlcdatastart(XMLParser *p)
         1639 {
         1640         xmldatastart(p);
         1641 }
         1642 
         1643 static void
         1644 xmlcdataend(XMLParser *p)
         1645 {
         1646         xmldataend(p); /* treat CDATA as data */
         1647 }
         1648 
         1649 static void
         1650 xmlcdata(XMLParser *p, const char *data, size_t datalen)
         1651 {
         1652         xmldata(p, data, datalen); /* treat CDATA as data */
         1653 }
         1654 
         1655 /* lookup function to compare tag name (case-insensitive) for sort functions */
         1656 static int
         1657 findtagcmp(const void *v1, const void *v2)
         1658 {
         1659         struct tag *t1 = (struct tag *)v1;
         1660         struct tag *t2 = (struct tag *)v2;
         1661 
         1662         return strcasecmp(t1->name, t2->name);
         1663 }
         1664 
         1665 /* binary search tag by tag name */
         1666 static struct tag *
         1667 findtag(const char *t)
         1668 {
         1669         struct tag find = { 0 };
         1670 
         1671         find.name = t;
         1672 
         1673         return bsearch(&find, tags, LEN(tags), sizeof(*tags), findtagcmp);
         1674 }
         1675 
         1676 static void
         1677 handleendtag(struct tag *tag)
         1678 {
         1679         int i, marginbottom;
         1680 
         1681         if (tag->displaytype & DisplayNone)
         1682                 return;
         1683         if (reader_ignore)
         1684                 return;
         1685 
         1686         if (tag->displaytype & (DisplayButton | DisplayOption)) {
         1687                 hputchar(']');
         1688                 hflush();
         1689         }
         1690 
         1691         if (tag->displaytype & (DisplayBlock | DisplayHeader | DisplayTable | DisplayTableRow |
         1692                 DisplayList | DisplayListItem | DisplayPre)) {
         1693                 endblock(); /* break line if needed */
         1694         }
         1695 
         1696         /* when a list ends and its not inside a list add an extra bottom margin */
         1697         marginbottom = tag->marginbottom;
         1698 
         1699         if (marginbottom > 0) {
         1700                 if (tag->displaytype & DisplayList) {
         1701                         if (findparenttype(curnode - 1, DisplayList))
         1702                                 marginbottom--;
         1703                 }
         1704         }
         1705 
         1706         if (marginbottom > 0) {
         1707                 hflush();
         1708                 for (i = currentnewlines; i < marginbottom; i++) {
         1709                         putchar('\n');
         1710                         nbytesline = 0;
         1711                         ncells = 0;
         1712                         currentnewlines++;
         1713                 }
         1714                 hadnewline = 1;
         1715         }
         1716 }
         1717 
         1718 static void
         1719 endnode(struct node *cur)
         1720 {
         1721         struct linkref *ref;
         1722         int i, ishidden;
         1723 
         1724         /* set a flag indicating the element and its parent containers have data.
         1725            This is used for some formatting */
         1726         if (cur->hasdata) {
         1727                 for (i = curnode; i >= 0; i--)
         1728                         nodes[i].hasdata = 1;
         1729         }
         1730 
         1731         endmarkup(cur->tag.markuptype);
         1732 
         1733         ishidden = reader_ignore || (cur->tag.displaytype & DisplayNone);
         1734 
         1735         /* add link and show the link number in the visible order */
         1736         if (!ishidden && nodes_links[curnode].len > 0) {
         1737                 ref = addlinkref(nodes_links[curnode].data,
         1738                         cur->tag.name, cur->tag.id, ishidden);
         1739 
         1740                 if (showrefinline || showurlinline) {
         1741                         hflush();
         1742                         startmarkup(MarkupReverse);
         1743                 }
         1744 
         1745                 if (showrefinline)
         1746                         hprintf("[%zu]", ref->linknr);
         1747                 if (showurlinline) {
         1748                         if (ref->tagid == TagA)
         1749                                 hprintf("[%s]", ref->url);
         1750                         else
         1751                                 hprintf("[%s: %s]", ref->type, ref->url);
         1752                 }
         1753                 if (showrefinline || showurlinline) {
         1754                         endmarkup(MarkupReverse);
         1755                         hflush();
         1756                 }
         1757         }
         1758 
         1759         handleendtag(&(cur->tag));
         1760 }
         1761 
         1762 static void
         1763 xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
         1764 {
         1765         struct tag *found, *tag;
         1766         enum TagId child, childs[16];
         1767         size_t nchilds;
         1768         int i, j, k, nchildfound, parenttype;
         1769 
         1770         /* match tag and lookup metadata */
         1771         /* ignore closing of void elements, like </br>, which is not allowed */
         1772         if ((found = findtag(t))) {
         1773                 if (!isshort && found->isvoid)
         1774                         return;
         1775         }
         1776 
         1777         /* TODO: implement more complete optional tag handling.
         1778            in reality the optional tag rules are more complex, see:
         1779            https://html.spec.whatwg.org/multipage/syntax.html#optional-tags */
         1780 
         1781         child = 0;
         1782         nchilds = 0;
         1783         nchildfound = 0;
         1784         parenttype = 0; /* by default, seek until the root */
         1785 
         1786         if (found && found->displaytype & DisplayPre) {
         1787                 skipinitialws = 0; /* do not skip white-space, for margins */
         1788         } else if (found && found->displaytype & DisplayList) {
         1789                 childs[0] = TagLi;
         1790                 nchilds = 1;
         1791                 parenttype = DisplayList;
         1792         } else if (found && found->displaytype & DisplayTableRow) {
         1793                 childs[0] = TagTd;
         1794                 nchilds = 1;
         1795                 parenttype = DisplayTableRow;
         1796         } else if (found && found->displaytype & DisplayTable) {
         1797                 childs[0] = TagTd;
         1798                 nchilds = 1;
         1799                 parenttype = DisplayTable;
         1800         } else if (found && found->displaytype & DisplaySelect) {
         1801                 childs[0] = TagOption;
         1802                 nchilds = 1;
         1803                 parenttype = DisplaySelect;
         1804         } else if (found && found->displaytype & DisplayDl) {
         1805                 childs[0] = TagP;
         1806                 childs[1] = TagDd;
         1807                 childs[2] = TagDt;
         1808                 nchilds = 3;
         1809                 parenttype = DisplayDl;
         1810         } else if (found && found->displaytype & DisplayBlock) {
         1811                 childs[0] = TagP;
         1812                 nchilds = 1;
         1813                 parenttype = 0; /* seek until the root */
         1814         }
         1815 
         1816         if (nchilds > 0) {
         1817                 for (i = curnode; i >= 0; i--) {
         1818                         if (nchildfound)
         1819                                 break;
         1820                         if ((nodes[i].tag.displaytype & parenttype))
         1821                                 break;
         1822                         for (j = 0; j < nchilds; j++) {
         1823                                 child = childs[j];
         1824                                 if (nodes[i].tag.id == child) {
         1825                                         /* fake closing the previous tags */
         1826                                         for (k = curnode; k >= i; k--)
         1827                                                 endnode(&nodes[k]);
         1828                                         curnode = k;
         1829                                         nchildfound = 1;
         1830                                         break;
         1831                                 }
         1832                         }
         1833                 }
         1834         }
         1835 
         1836         /* if the current closing tag matches the current open tag */
         1837         if (nodes[curnode].tag.name &&
         1838             !tagcmp(nodes[curnode].tag.name, t)) {
         1839                 endnode(&nodes[curnode]);
         1840                 if (curnode)
         1841                         curnode--;
         1842         } else {
         1843                 /* ... else lookup the first matching start tag. This is also
         1844                    for handling optional closing tags */
         1845                 tag = NULL;
         1846                 for (i = curnode; i >= 0; i--) {
         1847                         if (nodes[i].tag.name &&
         1848                             !tagcmp(nodes[i].tag.name, t)) {
         1849                                 endnode(&nodes[i]);
         1850                                 curnode = i > 0 ? i - 1 : 0;
         1851                                 tag = &nodes[i].tag;
         1852                                 break;
         1853                         }
         1854                 }
         1855                 /* unmatched closing tag found */
         1856                 if (!tag && found)
         1857                         handleendtag(found);
         1858         }
         1859         indent = calcindent();
         1860 
         1861 #if 0
         1862         /* check if linewrap is enabled, but currently is disabled and needs to
         1863            be restored */
         1864         if (allowlinewrap && !linewrap) {
         1865                 tag = NULL;
         1866                 for (i = curnode; i >= 0; i--) {
         1867                         if (nodes[i].tag.id == TagTable) {
         1868                                 tag = &nodes[i].tag;
         1869                                 break;
         1870                         }
         1871                 }
         1872                 if (!tag)
         1873                         linewrap = allowlinewrap;
         1874         }
         1875 #endif
         1876 
         1877         /* restore markup of the tag we are in now */
         1878         startmarkup(nodes[curnode].tag.markuptype);
         1879 
         1880         /* check if the current node still matches the visible selector */
         1881         if (reader_mode && sel_show && !reader_ignore) {
         1882                 if (!iscssmatchany(sel_show, nodes, curnode)) {
         1883                         reader_ignore = 1;
         1884                         newline();
         1885                 }
         1886         }
         1887 }
         1888 
         1889 static void
         1890 xmltagstart(XMLParser *p, const char *t, size_t tl)
         1891 {
         1892         struct tag *found;
         1893         struct node *cur;
         1894         enum TagId tagid;
         1895         enum TagId child, childs[16];
         1896         size_t nchilds;
         1897         char *s;
         1898         int i, j, k, nchildfound, parenttype;
         1899 
         1900         cur = &nodes[curnode];
         1901 
         1902         string_clear(&attr_alt);
         1903         string_clear(&attr_checked);
         1904         string_clear(&attr_class);
         1905         attr_class_set = 0;
         1906         string_clear(&attr_data);
         1907         string_clear(&attr_href);
         1908         string_clear(&attr_id);
         1909         attr_id_set = 0;
         1910         string_clear(&attr_src);
         1911         string_clear(&attr_type);
         1912         string_clear(&attr_value);
         1913 
         1914         /* match tag and lookup metadata */
         1915         found = findtag(t);
         1916 
         1917         /* TODO: implement more complete optional tag handling.
         1918            in reality the optional tag rules are more complex, see:
         1919            https://html.spec.whatwg.org/multipage/syntax.html#optional-tags */
         1920 
         1921         child = 0;
         1922         nchilds = 0;
         1923         nchildfound = 0;
         1924         parenttype = 0; /* by default, seek until the root */
         1925 
         1926         /* if optional tag <p> is open and a list element is found, close </p>. */
         1927         if (found && found->displaytype & DisplayList) {
         1928                 /* not inside a list */
         1929                 childs[0] = TagP;
         1930                 nchilds = 1;
         1931                 parenttype = DisplayList;
         1932         } else if (found && found->isoptional) {
         1933                 tagid = found->id;
         1934                 if (tagid == TagLi) {
         1935                         childs[0] = TagLi;
         1936                         nchilds = 1;
         1937                         parenttype = DisplayList;
         1938                 } else if (tagid == TagTd) {
         1939                         childs[0] = TagTd;
         1940                         nchilds = 1;
         1941                         parenttype = DisplayTableRow;
         1942                 } else if (tagid == TagTr) {
         1943                         childs[0] = TagTr;
         1944                         nchilds = 1;
         1945                         parenttype = DisplayTable;
         1946                 } else if (tagid == TagP) {
         1947                         childs[0] = TagP;
         1948                         nchilds = 1;
         1949                         parenttype = 0; /* seek until the root */
         1950                 } else if (tagid == TagOption) {
         1951                         childs[0] = TagOption;
         1952                         nchilds = 1;
         1953                         parenttype = DisplaySelect;
         1954                 } else if (tagid == TagDt) {
         1955                         childs[0] = TagDd;
         1956                         nchilds = 1;
         1957                         parenttype = DisplayDl;
         1958                 } else if (tagid == TagDd) {
         1959                         childs[0] = TagDd;
         1960                         childs[1] = TagDt;
         1961                         nchilds = 2;
         1962                         parenttype = DisplayDl;
         1963                 } else if (tagid == cur->tag.id) {
         1964                         /* fake closing the previous tag if it is the same and repeated */
         1965                         xmltagend(p, t, tl, 0);
         1966                 }
         1967         } else if (found && found->displaytype & DisplayBlock) {
         1968                 /* check if we have an open "<p>" tag */
         1969                 childs[0] = TagP;
         1970                 childs[1] = TagDl;
         1971                 nchilds = 2;
         1972                 parenttype = DisplayDl;
         1973         }
         1974 
         1975         if (nchilds > 0) {
         1976                 for (i = curnode; i >= 0; i--) {
         1977                         if (nchildfound)
         1978                                 break;
         1979                         if ((nodes[i].tag.displaytype & parenttype))
         1980                                 break;
         1981                         for (j = 0; j < nchilds; j++) {
         1982                                 child = childs[j];
         1983                                 if (nodes[i].tag.id == child) {
         1984                                         /* fake closing the previous tags */
         1985                                         for (k = curnode; k >= i; k--)
         1986                                                 xmltagend(p, nodes[k].tag.name, strlen(nodes[k].tag.name), 0);
         1987                                         nchildfound = 1;
         1988                                         break;
         1989                                 }
         1990                         }
         1991                 }
         1992         }
         1993 
         1994         incnode();
         1995         string_clear(&nodes_links[curnode]); /* clear possible link reference for this node */
         1996         cur = &nodes[curnode];
         1997         memset(cur, 0, sizeof(*cur)); /* clear / reset node */
         1998         /* tag defaults */
         1999         cur->tag.displaytype = DisplayInline;
         2000         cur->tag.name = cur->tagname; /* assign fixed-size buffer */
         2001         strlcpy(cur->tagname, t, sizeof(cur->tagname));
         2002 
         2003         /* force to lowercase */
         2004         for (s = cur->tagname; *s; s++)
         2005                 *s = TOLOWER((unsigned char)*s);
         2006 
         2007         /* matched tag: copy tag information to current node */
         2008         if (found)
         2009                 memcpy(&(cur->tag), found, sizeof(*found));
         2010 
         2011         /* if parent tag is hidden then hide itself too */
         2012         if (curnode > 0 && (nodes[curnode - 1].tag.displaytype & DisplayNone))
         2013                 cur->tag.displaytype |= DisplayNone;
         2014 }
         2015 
         2016 static void
         2017 xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
         2018 {
         2019         struct tag *found;
         2020         enum TagId tagid;
         2021         struct node *cur, *parent;
         2022         int i, margintop;
         2023 
         2024         /* match tag and lookup metadata */
         2025         tagid = 0;
         2026         if ((found = findtag(t)))
         2027                 tagid = found->id;
         2028 
         2029         /* temporary replace the callback except the reader and end of tag
         2030            restore the context once we receive the same ignored tag in the
         2031            end tag handler */
         2032         if (tagid == TagScript) {
         2033                 ignorestate = endtag = "</script>";
         2034                 getnext = p->getnext; /* for restore */
         2035                 p->getnext = getnext_ignore;
         2036                 xmltagend(p, t, tl, 0); /* fake the call the tag was ended */
         2037                 return;
         2038         } else if (tagid == TagStyle) {
         2039                 ignorestate = endtag = "</style>";
         2040                 getnext = p->getnext; /* for restore */
         2041                 p->getnext = getnext_ignore;
         2042                 xmltagend(p, t, tl, 0); /* fake the call the tag was ended */
         2043                 return;
         2044         }
         2045 
         2046 #if 0
         2047         /* disable line-wrapping inside tables */
         2048         if (tagid == TagTable)
         2049                 linewrap = 0;
         2050 #endif
         2051 
         2052         cur = &nodes[curnode];
         2053 
         2054         /* copy attributes if set */
         2055         if (attr_id.len)
         2056                 strlcpy(cur->id, attr_id.data, sizeof(cur->id));
         2057         else
         2058                 cur->id[0] = '\0';
         2059         if (attr_class.len)
         2060                 strlcpy(cur->classnames, attr_class.data, sizeof(cur->classnames));
         2061         else
         2062                 cur->classnames[0] = '\0';
         2063 
         2064         /* parent node */
         2065         if (curnode > 0) {
         2066                 parent = &nodes[curnode - 1];
         2067                 parent->nchildren++; /* increase child node count */
         2068                 /* count visible childnodes */
         2069                 if (!(cur->tag.displaytype & DisplayNone))
         2070                         parent->visnchildren++;
         2071         } else {
         2072                 parent = NULL;
         2073         }
         2074 
         2075         if (reader_mode && sel_show && reader_ignore &&
         2076             iscssmatchany(sel_show, nodes, curnode))
         2077                 reader_ignore = 0;
         2078 
         2079         /* hide element */
         2080         if (reader_mode && sel_hide &&
         2081             iscssmatchany(sel_hide, nodes, curnode))
         2082                 cur->tag.displaytype |= DisplayNone;
         2083 
         2084         /* indent for this tag */
         2085         cur->indent = cur->tag.indent;
         2086 
         2087         if (!reader_ignore) {
         2088                 /* add link reference, print links and alt text */
         2089                 handleinlinelink();
         2090                 handleinlinealt();
         2091         }
         2092 
         2093         /* <select><option> */
         2094         if (cur->tag.displaytype & DisplayOption) {
         2095                 /* <select multiple>: show all options */
         2096                 if (parent->tag.displaytype & DisplaySelectMulti)
         2097                         cur->tag.displaytype |= DisplayBlock;
         2098                 else if (parent->nchildren > 1) /* show the first item as selected */
         2099                         cur->tag.displaytype |= DisplayNone; /* else hide */
         2100         }
         2101 
         2102         if (cur->tag.displaytype & DisplayNone)
         2103                 return;
         2104 
         2105         if (reader_ignore)
         2106                 return;
         2107 
         2108         indent = calcindent();
         2109 
         2110         if ((cur->tag.displaytype & (DisplayBlock | DisplayHeader | DisplayPre |
         2111                 DisplayTable | DisplayTableRow |
         2112                 DisplayList | DisplayListItem))) {
         2113                 startblock(); /* break line if needed */
         2114         }
         2115 
         2116         if (cur->tag.displaytype & (DisplayButton | DisplayOption)) {
         2117                 hflush();
         2118                 hputchar('[');
         2119         }
         2120 
         2121         margintop = cur->tag.margintop;
         2122         if (cur->tag.displaytype & (DisplayList)) {
         2123                 for (i = curnode - 1; i >= 0; i--) {
         2124                         if (nodes[i].tag.displaytype & DisplayList)
         2125                                 break;
         2126                         if (!(nodes[i].tag.displaytype & DisplayListItem))
         2127                                 continue;
         2128                         if (nodes[i].hasdata && margintop > 0) {
         2129                                 margintop--;
         2130                                 break;
         2131                         }
         2132                 }
         2133         } else if (cur->tag.displaytype & (DisplayBlock|DisplayTable)) {
         2134                 if (!parentcontainerhasdata(cur->tag.displaytype, curnode - 1)) {
         2135                         if (margintop > 0)
         2136                                 margintop--;
         2137                 }
         2138         }
         2139 
         2140         if (margintop > 0) {
         2141                 hflush();
         2142                 for (i = currentnewlines; i < margintop; i++) {
         2143                         putchar('\n');
         2144                         nbytesline = 0;
         2145                         ncells = 0;
         2146                         currentnewlines++;
         2147                 }
         2148                 hadnewline = 1;
         2149         }
         2150 
         2151         if (cur->tag.displaytype & DisplayPre) {
         2152                 skipinitialws = 1;
         2153         } else if (cur->tag.displaytype & DisplayTableCell) {
         2154                 if (parent && parent->visnchildren > 1)
         2155                         hputchar('\t');
         2156         } else if (cur->tag.displaytype & DisplayListItem) {
         2157                 /* find first parent node and ordered numbers or unordered */
         2158                 if (parent) {
         2159                         skipinitialws = 0;
         2160 
         2161                         /* print bullet, add columns to indentation level */
         2162                         if (parent->tag.displaytype & DisplayListOrdered) {
         2163                                 hprintf("%4zu. ", parent->nchildren);
         2164                                 cur->indent = 6;
         2165                                 indent += cur->indent; /* align to number */
         2166                         } else if (parent->tag.displaytype & DisplayList) {
         2167                                 hprint(str_bullet_item);
         2168                                 cur->indent = 2;
         2169                                 indent += 2; /* align to bullet */
         2170                         }
         2171                 }
         2172                 skipinitialws = 0;
         2173         } else if (cur->tag.displaytype & DisplayInput) {
         2174                 if (!attr_type.len) {
         2175                         hprintf("[%-15s]", attr_value.len ? attr_value.data : ""); /* default: text */
         2176                 } else if (!strcasecmp(attr_type.data, "button")) {
         2177                         hprintf("[%s]", attr_value.len ? attr_value.data : "");
         2178                 } else if (!strcasecmp(attr_type.data, "submit")) {
         2179                         hprintf("[%s]", attr_value.len ? attr_value.data : "Submit Query");
         2180                 } else if (!strcasecmp(attr_type.data, "reset")) {
         2181                         hprintf("[%s]", attr_value.len ? attr_value.data : "Reset");
         2182                 } else if (!strcasecmp(attr_type.data, "checkbox")) {
         2183                         hprintf("[%s]",
         2184                                 attr_checked.len &&
         2185                                 !strcasecmp(attr_checked.data, "checked") ? str_checkbox_checked : " ");
         2186                 } else if (!strcasecmp(attr_type.data, "radio")) {
         2187                         hprintf("[%s]",
         2188                                 attr_checked.len &&
         2189                                 !strcasecmp(attr_checked.data, "checked") ? str_radio_checked : " ");
         2190                 } else if (!strcasecmp(attr_type.data, "hidden")) {
         2191                         cur->tag.displaytype |= DisplayNone;
         2192                 } else {
         2193                         /* unrecognized / default case is text */
         2194                         hprintf("[%-15s]", attr_value.len ? attr_value.data : "");
         2195                 }
         2196         }
         2197 
         2198         startmarkup(cur->tag.markuptype);
         2199 
         2200         /* do not count data such as an item bullet as part of the data for
         2201            the node */
         2202         cur->hasdata = 0;
         2203 
         2204         if (tagid == TagHr) { /* ruler */
         2205                 i = termwidth - indent - defaultindent;
         2206                 for (; i > 0; i--)
         2207                         hprint(str_ruler);
         2208                 cur->hasdata = 1; /* treat <hr/> as data */
         2209         } else if (tagid == TagBr) {
         2210                 hflush();
         2211                 hadnewline = 0; /* forced newline */
         2212                 hputchar('\n');
         2213                 cur->hasdata = 1; /* treat <br/> as data */
         2214         }
         2215 
         2216         /* autoclose tags, such as <br>, pretend we are <br/> */
         2217         if (!isshort && cur->tag.isvoid)
         2218                 xmltagend(p, t, tl, 1); /* pretend close of short tag */
         2219 }
         2220 
         2221 static void
         2222 xmlattr(XMLParser *p, const char *t, size_t tl, const char *n,
         2223         size_t nl, const char *v, size_t vl)
         2224 {
         2225         struct node *cur;
         2226         enum TagId tagid;
         2227 
         2228         cur = &nodes[curnode];
         2229         tagid = cur->tag.id;
         2230 
         2231         /* hide tags with attribute aria-hidden or hidden */
         2232         if (!attrcmp(n, "aria-hidden") || !attrcmp(n, "hidden"))
         2233                 cur->tag.displaytype |= DisplayNone;
         2234 
         2235         if (!attr_class_set && !attrcmp(n, "class")) /* use the first set attribute */
         2236                 string_append(&attr_class, v, vl);
         2237         else if (!attr_id_set && !attrcmp(n, "id")) /* use the first set attribute */
         2238                 string_append(&attr_id, v, vl);
         2239         else if (!attrcmp(n, "type"))
         2240                 string_append(&attr_type, v, vl);
         2241         else if (!attrcmp(n, "value"))
         2242                 string_append(&attr_value, v, vl);
         2243 
         2244         /* <base href="..." /> */
         2245         if (!basehrefset && tagid == TagBase && !attrcmp(n, "href"))
         2246                 strlcat(basehrefdoc, v, sizeof(basehrefdoc));
         2247 
         2248         if (tagid == TagA && !attrcmp(n, "href"))
         2249                 string_append(&attr_href, v, vl);
         2250 
         2251         if (tagid == TagSelect && !attrcmp(n, "multiple"))
         2252                 cur->tag.displaytype |= DisplaySelectMulti;
         2253 
         2254         if (tagid == TagObject && !attrcmp(n, "data"))
         2255                 string_append(&attr_data, v, vl);
         2256 
         2257         /* show img alt attribute as text. */
         2258         if (tagid == TagImg && !attrcmp(n, "alt"))
         2259                 string_append(&attr_alt, v, vl);
         2260 
         2261         if (cur->tag.displaytype & DisplayInput && !attrcmp(n, "checked"))
         2262                 string_append(&attr_checked, v, vl);
         2263 
         2264         /* src attribute */
         2265         switch (tagid) {
         2266         case TagAudio:
         2267         case TagEmbed:
         2268         case TagFrame:
         2269         case TagIframe:
         2270         case TagImg:
         2271         case TagSource:
         2272         case TagTrack:
         2273         case TagVideo:
         2274                 if (!attrcmp(n, "src"))
         2275                         string_append(&attr_src, v, vl);
         2276                 break;
         2277         default:
         2278                 break;
         2279         }
         2280 }
         2281 
         2282 static void
         2283 xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n,
         2284         size_t nl, const char *v, size_t vl)
         2285 {
         2286         char buf[8];
         2287         int len;
         2288 
         2289         len = xml_entitytostr(v, buf, sizeof(buf));
         2290         if (len > 0)
         2291                 xmlattr(p, t, tl, n, nl, buf, (size_t)len);
         2292         else
         2293                 xmlattr(p, t, tl, n, nl, v, vl);
         2294 }
         2295 
         2296 static void
         2297 xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n,
         2298         size_t nl)
         2299 {
         2300         struct node *cur;
         2301         enum TagId tagid;
         2302 
         2303         cur = &nodes[curnode];
         2304         tagid = cur->tag.id;
         2305 
         2306         if (!attr_class_set && !attrcmp(n, "class"))
         2307                 attr_class_set = 1;
         2308         else if (!attr_id_set && !attrcmp(n, "id"))
         2309                 attr_id_set = 1;
         2310 
         2311         /* set base URL, if it is set it cannot be overwritten again */
         2312         if (!basehrefset && basehrefdoc[0] &&
         2313             tagid == TagBase && !attrcmp(n, "href"))
         2314                 basehrefset = uri_parse(basehrefdoc, &base) != -1 ? 1 : 0;
         2315 
         2316         /* if attribute checked is set but it has no value then set it to "checked" */
         2317         if (cur->tag.displaytype & DisplayInput && !attrcmp(n, "checked") && !attr_checked.len)
         2318                 string_append(&attr_checked, "checked", sizeof("checked") - 1);
         2319 }
         2320 
         2321 static void
         2322 xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n,
         2323         size_t nl)
         2324 {
         2325         struct node *cur;
         2326         enum TagId tagid;
         2327 
         2328         cur = &nodes[curnode];
         2329         tagid = cur->tag.id;
         2330 
         2331         if (!attrcmp(n, "alt"))
         2332                 string_clear(&attr_alt);
         2333         else if (!attrcmp(n, "checked"))
         2334                 string_clear(&attr_checked);
         2335         else if (!attr_class_set && !attrcmp(n, "class"))
         2336                 string_clear(&attr_class);
         2337         else if (!attrcmp(n, "data"))
         2338                 string_clear(&attr_data);
         2339         else if (!attrcmp(n, "href"))
         2340                 string_clear(&attr_href);
         2341         else if (!attr_id_set && !attrcmp(n, "id"))
         2342                 string_clear(&attr_id);
         2343         else if (!attrcmp(n, "src"))
         2344                 string_clear(&attr_src);
         2345         else if (!attrcmp(n, "type"))
         2346                 string_clear(&attr_type);
         2347         else if (!attrcmp(n, "value"))
         2348                 string_clear(&attr_value);
         2349 
         2350         if (basehrefdoc[0] && tagid == TagBase && !attrcmp(n, "href"))
         2351                 basehrefdoc[0] = '\0';
         2352 }
         2353 
         2354 static void
         2355 usage(void)
         2356 {
         2357         fprintf(stderr, "%s [-8adiIlrx] [-b basehref] [-s selector] [-u selector] [-w termwidth]\n", argv0);
         2358         exit(1);
         2359 }
         2360 
         2361 int
         2362 main(int argc, char **argv)
         2363 {
         2364         char *basehref;
         2365 
         2366         if (pledge("stdio", NULL) < 0)
         2367                 err(1, "pledge");
         2368 
         2369         ARGBEGIN {
         2370         case '8':
         2371                 str_bullet_item = "\xe2\x80\xa2 ";
         2372                 str_ruler = "\xe2\x94\x80"; /* symbol: "light horizontal" */
         2373                 break;
         2374         case 'a':
         2375                 allowansi = !allowansi;
         2376                 break;
         2377         case 'b':
         2378                 basehref = EARGF(usage());
         2379                 if (uri_parse(basehref, &base) == -1 ||
         2380                     !base.proto[0])
         2381                         usage();
         2382                 basehrefset = 1;
         2383                 break;
         2384         case 'd':
         2385                 uniqrefs = !uniqrefs;
         2386                 break;
         2387         case 'i':
         2388                 showrefinline = !showrefinline;
         2389                 break;
         2390         case 'I':
         2391                 showurlinline = !showurlinline;
         2392                 break;
         2393         case 'l':
         2394                 showrefbottom = !showrefbottom;
         2395                 break;
         2396         case 'r':
         2397                 allowlinewrap = !allowlinewrap;
         2398                 break;
         2399         case 's':
         2400                 sel_show = compileselectors(EARGF(usage()));
         2401                 /* switch to reader/selector mode, ignore all data except when matched */
         2402                 reader_mode = 1;
         2403                 reader_ignore = 1;
         2404                 break;
         2405         case 'u':
         2406                 sel_hide = compileselectors(EARGF(usage()));
         2407                 /* switch to reader/selector mode */
         2408                 reader_mode = 1;
         2409                 break;
         2410         case 'w':
         2411                 if ((termwidth = strtol(EARGF(usage()), NULL, 10)) < 1)
         2412                         usage();
         2413                 break;
         2414         case 'x':
         2415                 resources = !resources;
         2416                 break;
         2417         default:
         2418                 usage();
         2419         } ARGEND
         2420 
         2421         linewrap = allowlinewrap;
         2422 
         2423         /* initial nodes */
         2424         ncapnodes = NODE_CAP_INC;
         2425         nodes = ecalloc(ncapnodes, sizeof(*nodes));
         2426         nodes_links = ecalloc(ncapnodes, sizeof(*nodes_links));
         2427 
         2428         parser.xmlattrstart = xmlattrstart;
         2429         parser.xmlattr = xmlattr;
         2430         parser.xmlattrentity = xmlattrentity;
         2431         parser.xmlattrend = xmlattrend;
         2432         parser.xmlcdatastart = xmlcdatastart;
         2433         parser.xmlcdata = xmlcdata;
         2434         parser.xmlcdataend = xmlcdataend;
         2435         parser.xmldatastart = xmldatastart;
         2436         parser.xmldata = xmldata;
         2437         parser.xmldataentity = xmldataentity;
         2438         parser.xmldataend = xmldataend;
         2439         parser.xmltagstart = xmltagstart;
         2440         parser.xmltagstartparsed = xmltagstartparsed;
         2441         parser.xmltagend = xmltagend;
         2442 
         2443         parser.getnext = getchar;
         2444         xml_parse(&parser);
         2445 
         2446         hflush();
         2447         if (ncells > 0)
         2448                 newline();
         2449 
         2450         if (showrefbottom || resources)
         2451                 printlinkrefs();
         2452 
         2453         hflush();
         2454         setmarkup(0);
         2455 
         2456         return 0;
         2457 }