#include #include #include #include #include #include #include #include #include "arg.h" char *argv0; #include "tree.h" #include "xml.h" static XMLParser parser; #ifndef __OpenBSD__ #define pledge(p1,p2) 0 #endif #undef strlcat size_t strlcat(char *, const char *, size_t); #undef strlcpy size_t strlcpy(char *, const char *, size_t); /* ctype-like macros, but always compatible with ASCII / UTF-8 */ #define ISALPHA(c) ((((unsigned)c) | 32) - 'a' < 26) #define ISCNTRL(c) ((c) < ' ' || (c) == 0x7f) #define ISDIGIT(c) (((unsigned)c) - '0' < 10) #define ISSPACE(c) ((c) == ' ' || ((((unsigned)c) - '\t') < 5)) #define TOLOWER(c) ((((unsigned)c) - 'A' < 26) ? ((c) | 32) : (c)) #define LEN(x) (sizeof(x) / sizeof(x[0])) /* URI */ struct uri { char proto[48]; /* scheme including ":" or "://" */ char userinfo[256]; /* username [:password] */ char host[256]; char port[6]; /* numeric port */ char path[1024]; char query[1024]; char fragment[1024]; }; /* options */ static int allowansi = 0; /* (-a) allow ANSI escape codes */ static int uniqrefs = 0; /* (-d) number unique references */ static int showrefinline = 0; /* (-i) show link reference number inline */ static int showurlinline = 0; /* (-I) show full link reference inline */ static int showrefbottom = 0; /* (-l) show link references at the bottom */ static int allowlinewrap = 0; /* (-r) line-wrapping */ static int termwidth = 77; /* (-w) terminal width */ static int resources = 0; /* (-x) write resources line-by-line to fd 3? 
*/ enum DisplayType { DisplayUnknown = 0, DisplayInline = 1 << 0, DisplayInlineBlock = 1 << 1, /* unused for now */ DisplayBlock = 1 << 2, DisplayNone = 1 << 3, DisplayPre = 1 << 4, DisplayList = 1 << 5, DisplayListOrdered = 1 << 6, DisplayListItem = 1 << 7, DisplayTable = 1 << 8, DisplayTableRow = 1 << 9, DisplayTableCell = 1 << 10, DisplayHeader = 1 << 11, DisplayDl = 1 << 12, DisplayInput = 1 << 13, DisplayButton = 1 << 14, DisplaySelect = 1 << 15, DisplaySelectMulti = 1 << 16, DisplayOption = 1 << 17 }; /* ANSI markup */ enum MarkupType { MarkupNone = 0, MarkupBold = 1 << 0, MarkupItalic = 1 << 1, MarkupUnderline = 1 << 2, MarkupBlink = 1 << 3, /* lol */ MarkupReverse = 1 << 4, MarkupStrike = 1 << 5 }; /* String data / memory pool */ typedef struct string { char *data; /* data */ size_t len; /* string length */ size_t bufsiz; /* allocated size */ } String; enum TagId { TagA = 1, TagAddress, TagArea, TagArticle, TagAside, TagAudio, TagB, TagBase, TagBlink, TagBlockquote, TagBody, TagBr, TagButton, TagCite, TagCol, TagColgroup, TagDatalist, TagDd, TagDel, TagDetails, TagDfn, TagDir, TagDiv, TagDl, TagDt, TagEm, TagEmbed, TagFieldset, TagFigcaption, TagFigure, TagFooter, TagForm, TagFrame, TagH1, TagH2, TagH3, TagH4, TagH5, TagH6, TagHead, TagHeader, TagHr, TagHtml, TagI, TagIframe, TagImg, TagInput, TagIns, TagLabel, TagLegend, TagLi, TagLink, TagMain, TagMark, TagMenu, TagMeta, TagNav, TagObject, TagOl, TagOption, TagP, TagParam, TagPre, TagS, TagScript, TagSearch, TagSection, TagSelect, TagSource, TagStrike, TagStrong, TagStyle, TagSummary, TagSvg, TagTable, TagTbody, TagTd, TagTemplate, TagTextarea, TagTfoot, TagTh, TagThead, TagTitle, TagTr, TagTrack, TagU, TagUl, TagVar, TagVideo, TagWbr, TagXmp }; struct tag { const char *name; enum TagId id; enum DisplayType displaytype; enum MarkupType markuptype; /* ANSI markup */ enum DisplayType parenttype; /* display type belonging to element */ int isvoid; /* "void" element */ int isoptional; /* optional to close tag 
*/ int margintop; /* newlines when the tag starts */ int marginbottom; /* newlines after the tag ends */ int indent; /* indent in cells */ }; struct node { char tagname[256]; struct tag tag; size_t nchildren; /* child node count */ size_t visnchildren; /* child node count which are visible */ /* attributes */ char id[256]; char classnames[1024]; int indent; /* indent per node, for formatting */ int hasdata; /* tag contains some data, for formatting */ }; struct selectornode { char tagname[256]; long index; /* index of node to match on: -1 if not matching on index */ /* attributes */ char id[256]; char classnames[1024]; }; struct selector { const char *text; struct selectornode nodes[32]; int depth; }; /* list of selectors */ struct selectors { struct selector **selectors; size_t count; }; /* RB tree of link references */ struct linkref { char *type; enum TagId tagid; char *url; int ishidden; size_t linknr; RB_ENTRY(linkref) entry; }; /* link references and hidden link references */ static struct linkref **visrefs; static size_t nvisrefs, ncapvisrefs; /* visible link count / capacity */ static struct linkref **hiddenrefs; static size_t nhiddenrefs, ncaphiddenrefs; /* hidden link count / capacity */ /* compare link by URL for link references RB-tree */ static int linkrefcmp(struct linkref *r1, struct linkref *r2) { return strcmp(r1->url, r2->url); } RB_HEAD(linkreftree, linkref) linkrefhead = RB_INITIALIZER(&linkrefhead); RB_GENERATE(linkreftree, linkref, entry, linkrefcmp) static const char *str_bullet_item = "* "; static const char *str_checkbox_checked = "x"; static const char *str_ruler = "-"; static const char *str_radio_checked = "*"; /* base href, to make URLs absolute */ static char basehrefdoc[4096]; /* buffer for base href in document, if any */ static int basehrefset; /* base href set and can be used? 
*/ static struct uri base; /* parsed current base href */ /* buffers for some attributes of the current tag */ static String attr_alt; /* alt attribute */ static String attr_checked; /* checked attribute */ static String attr_class; /* class attribute */ static int attr_class_set; /* class attribute is set already */ static String attr_data; /* data attribute */ static String attr_href; /* href attribute */ static String attr_id; /* id attribute */ static int attr_id_set; /* class attribute is set already */ static String attr_src; /* src attribute */ static String attr_type; /* type attribute */ static String attr_value; /* value attribute */ static String htmldata; /* buffered HTML data near the current tag */ /* for white-space output handling: 1 = whitespace emitted (suppress repeated), 2 = other characters on this line Behaviour: * White-space data before non-whitespace data in tags are ignored on a line. * Repeated white-space are ignored: a single space (' ') is emitted. */ static int whitespace_mode; static int nbytesline; /* bytes on this line */ static int ncells; /* current cell/column count */ static int hadnewline; /* count for repeated newlines */ /* flag for skipping initial white-space in tag: for HTML white-space handling */ static int skipinitialws = 1; #define DEFAULT_INDENT 2 static const int defaultindent = DEFAULT_INDENT; /* default indent / margin */ static int indent; /* indent for the current line, in columns */ /* previous output sequential newlines, used for calculating margins between elements and reducing excessive newlines */ static int currentnewlines; /* buffers for line-wrapping (buffer per word boundary) */ static char rbuf[1024]; static int rbuflen; static int rnbufcells; /* pending cell count to add */ #define MAX_NODE_DEPTH 4096 /* absolute maximum node depth */ static struct node *nodes; /* node tree (one per level is remembered) */ static String *nodes_links; /* keep track of links per node */ static size_t ncapnodes; /* 
current allocated node capacity */ static int curnode; /* current node depth */ /* reader / selector mode (-s) */ static int reader_mode; /* flag if the tags and their children should be ignored in the current context */ static int reader_ignore; static enum MarkupType curmarkup; /* current markup state (bold, underline, etc) */ static int linewrap; /* allow linewrap in this context */ /* selector to match (for -s and -u) */ static struct selectors *sel_hide, *sel_show; /* tags table: needs to be sorted like tagcmp(), alphabetically */ /* tag id displaytype markup parent v o b a i */ static struct tag tags[] = { { "a", TagA, DisplayInline, MarkupUnderline, 0, 0, 0, 0, 0, 0 }, { "address", TagAddress, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 }, { "area", TagArea, DisplayInline, 0, 0, 1, 0, 0, 0, 0 }, { "article", TagArticle, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 }, { "aside", TagAside, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 }, { "audio", TagAudio, DisplayInline, MarkupUnderline, 0, 0, 0, 0, 0, 0 }, { "b", TagB, DisplayInline, MarkupBold, 0, 0, 0, 0, 0, 0 }, { "base", TagBase, DisplayInline, 0, 0, 1, 0, 0, 0, 0 }, { "blink", TagBlink, DisplayInline, MarkupBlink, 0, 0, 0, 0, 0, 0 }, { "blockquote", TagBlockquote, DisplayBlock, 0, 0, 0, 0, 0, 0, 2 }, { "body", TagBody, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 }, { "br", TagBr, 0, 0, 0, 1, 0, 0, 0, 0 }, { "button", TagButton, DisplayInline | DisplayButton, 0, 0, 0, 0, 0, 0, 0 }, { "cite", TagCite, DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 }, { "col", TagCol, DisplayInline, 0, 0, 1, 0, 0, 0, 0 }, { "colgroup", TagColgroup, DisplayInline, 0, 0, 0, 1, 0, 0, 0 }, { "datalist", TagDatalist, DisplayNone, 0, 0, 0, 0, 0, 0, 0 }, { "dd", TagDd, DisplayBlock, 0, 0, 0, 1, 0, 0, 4 }, { "del", TagDel, DisplayInline, MarkupStrike, 0, 0, 0, 0, 0, 0 }, { "details", TagDetails, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 }, { "dfn", TagDfn, DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 }, { "dir", TagDir, DisplayList, 0, 0, 0, 0, 1, 1, 2 }, { "div", TagDiv, 
DisplayBlock, 0, 0, 0, 0, 0, 0, 0 }, { "dl", TagDl, DisplayBlock | DisplayDl, 0, 0, 0, 0, 0, 0, 0 }, { "dt", TagDt, DisplayBlock, MarkupBold, 0, 0, 1, 0, 0, 0 }, { "em", TagEm, DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 }, { "embed", TagEmbed, DisplayInline, 0, 0, 1, 0, 0, 0, 0 }, { "fieldset", TagFieldset, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 }, { "figcaption", TagFigcaption, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 }, { "figure", TagFigure, DisplayBlock, 0, 0, 0, 0, 1, 1, 4 }, { "footer", TagFooter, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 }, { "form", TagForm, DisplayBlock, 0, 0, 0, 0, 0, 1, 0 }, { "frame", TagFrame, DisplayInline, 0, 0, 1, 0, 0, 0, 0 }, { "h1", TagH1, DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT }, { "h2", TagH2, DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT }, { "h3", TagH3, DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT }, { "h4", TagH4, DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT }, { "h5", TagH5, DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT }, { "h6", TagH6, DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT }, { "head", TagHead, DisplayBlock, 0, 0, 0, 1, 0, 0, 0 }, { "header", TagHeader, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 }, { "hr", TagHr, DisplayBlock, 0, 0, 1, 0, 0, 0, 0 }, { "html", TagHtml, DisplayBlock, 0, 0, 0, 1, 0, 0, 0 }, { "i", TagI, DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 }, { "iframe", TagIframe, DisplayInline, 0, 0, 0, 0, 0, 0, 0 }, { "img", TagImg, DisplayInline, MarkupUnderline, 0, 1, 0, 0, 0, 0 }, { "input", TagInput, DisplayInput, 0, 0, 1, 0, 0, 0, 0 }, { "ins", TagIns, DisplayInline, MarkupUnderline, 0, 0, 0, 0, 0, 0 }, { "label", TagLabel, DisplayInline, 0, 0, 0, 0, 0, 0, 0 }, { "legend", TagLegend, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 }, { "li", TagLi, DisplayListItem, 0, DisplayList, 0, 1, 0, 0, 0 }, { "link", TagLink, DisplayInline, 0, 0, 1, 0, 0, 0, 0 }, { "main", TagMain, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 }, { "mark", TagMark, DisplayInline, MarkupReverse, 0, 
0, 0, 0, 0, 0 }, { "menu", TagMenu, DisplayList, 0, 0, 0, 0, 1, 1, 2 }, { "meta", TagMeta, DisplayInline, 0, 0, 1, 0, 0, 0, 0 }, { "nav", TagNav, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 }, { "object", TagObject, DisplayInline, 0, 0, 0, 0, 0, 0, 0 }, { "ol", TagOl, DisplayList | DisplayListOrdered, 0, 0, 0, 0, 1, 1, 0 }, { "option", TagOption, DisplayInline | DisplayOption, 0, 0, 0, 1, 0, 0, 0 }, { "p", TagP, DisplayBlock, 0, 0, 0, 1, 1, 1, 0 }, { "param", TagParam, DisplayInline, 0, 0, 1, 0, 0, 0, 0 }, { "pre", TagPre, DisplayPre, 0, 0, 0, 0, 1, 1, 4 }, { "s", TagS, DisplayInline, MarkupStrike, 0, 0, 0, 0, 0, 0 }, { "script", TagScript, DisplayNone, 0, 0, 0, 0, 0, 0, 0 }, { "search", TagSearch, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 }, { "section", TagSection, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 }, { "select", TagSelect, DisplayInline | DisplaySelect, 0, 0, 0, 0, 0, 0, 0 }, { "source", TagSource, DisplayInline, 0, 0, 1, 0, 0, 0, 0 }, { "strike", TagStrike, DisplayInline, MarkupStrike, 0, 0, 0, 0, 0, 0 }, { "strong", TagStrong, DisplayInline, MarkupBold, 0, 0, 0, 0, 0, 0 }, { "style", TagStyle, DisplayNone, 0, 0, 0, 0, 0, 0, 0 }, { "summary", TagSummary, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 }, { "svg", TagSvg, DisplayNone, 0, 0, 0, 0, 0, 0, 0 }, { "table", TagTable, DisplayTable, 0, 0, 0, 0, 0, 0, 0 }, { "tbody", TagTbody, DisplayInline, 0, DisplayTable, 0, 1, 0, 0, 0 }, { "td", TagTd, DisplayTableCell, 0, DisplayTableRow, 0, 1, 0, 0, 0 }, { "template", TagTemplate, DisplayNone, 0, 0, 0, 0, 0, 0, 0 }, { "textarea", TagTextarea, DisplayInline, 0, 0, 0, 0, 0, 0, 0 }, { "tfoot", TagTfoot, DisplayInline, 0, DisplayTable, 0, 1, 0, 0, 0 }, { "th", TagTh, DisplayTableCell, MarkupBold, DisplayTableRow, 0, 1, 0, 0, 0 }, { "thead", TagThead, DisplayInline, 0, DisplayTable, 0, 1, 0, 0, 0 }, { "title", TagTitle, DisplayBlock, 0, 0, 0, 0, 0, 1, -DEFAULT_INDENT }, { "tr", TagTr, DisplayTableRow, 0, DisplayTable, 0, 1, 0, 0, 0 }, { "track", TagTrack, DisplayInline, 0, 0, 1, 0, 0, 0, 0 }, { "u", 
TagU, DisplayInline, MarkupUnderline, 0, 0, 0, 0, 0, 0 }, { "ul", TagUl, DisplayList, 0, 0, 0, 0, 1, 1, 2 }, { "var", TagVar, DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 }, { "video", TagVideo, DisplayInline, MarkupUnderline, 0, 0, 0, 0, 0, 0 }, { "wbr", TagWbr, DisplayInline, 0, 0, 1, 0, 0, 0, 0 }, { "xmp", TagXmp, DisplayPre, 0, 0, 0, 0, 1, 1, 4 } }; /* hint for compilers and static analyzers that a function exits */ #ifndef __dead #define __dead #endif /* print to stderr, print error message of errno and exit(). */ __dead static void err(int exitstatus, const char *fmt, ...) { va_list ap; int saved_errno; saved_errno = errno; fputs("webdump: ", stderr); if (fmt) { va_start(ap, fmt); vfprintf(stderr, fmt, ap); va_end(ap); fputs(": ", stderr); } fprintf(stderr, "%s\n", strerror(saved_errno)); exit(exitstatus); } /* print to stderr and exit(). */ __dead static void errx(int exitstatus, const char *fmt, ...) { va_list ap; fputs("webdump: ", stderr); if (fmt) { va_start(ap, fmt); vfprintf(stderr, fmt, ap); va_end(ap); } fputs("\n", stderr); exit(exitstatus); } static const char *ignorestate, *endtag; static int (*getnext)(void); /* return a space for all data until some case-insensitive string occurs. This is used to parse incorrect HTML/XML that contains unescaped HTML in script or style tags. If you see some tag in a CDATA or comment section then e-mail W3C and tell them the web is too complex. */ static inline int getnext_ignore(void) { int c; if ((c = getnext()) == EOF) return EOF; if (TOLOWER((unsigned char)c) == TOLOWER((unsigned char)*ignorestate)) { ignorestate++; if (*ignorestate == '\0') { parser.getnext = getnext; /* restore */ return ' '; } } else { ignorestate = endtag; /* no full match: reset to beginning */ } return ' '; /* pretend there is just SPACEs */ } /* Clear string only; don't free, prevents unnecessary reallocation. 
*/
static void
string_clear(String *s)
{
	/* keep the allocation, only truncate to the empty string */
	if (s->data)
		s->data[0] = '\0';
	s->len = 0;
}

/* Resize the buffer of `s` to the first size in the sequence 64, 128, 256, ...
   that is greater than `newlen`. Exits through err() when realloc() fails. */
static void
string_buffer_realloc(String *s, size_t newlen)
{
	size_t alloclen;

	for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
		;
	if (!(s->data = realloc(s->data, alloclen)))
		err(1, "realloc");
	s->bufsiz = alloclen;
}

/* Append `len` bytes of `data` to `s` and keep it NUL-terminated.
   Appending zero bytes does nothing (and does not allocate). */
static void
string_append(String *s, const char *data, size_t len)
{
	if (!len)
		return;

	/* check if allocation is necessary, don't shrink buffer,
	 * should be more than bufsiz of course. */
	if (s->len + len >= s->bufsiz)
		string_buffer_realloc(s, s->len + len + 1);
	memcpy(s->data + s->len, data, len);
	s->len += len;
	s->data[s->len] = '\0';
}

/* strdup() which exits through err() on failure */
static char *
estrdup(const char *s)
{
	char *p;

	if (!(p = strdup(s)))
		err(1, "strdup");
	return p;
}

/* strndup() which exits through err() on failure */
static char *
estrndup(const char *s, size_t n)
{
	char *p;

	if (!(p = strndup(s, n)))
		err(1, "strndup");
	return p;
}

/* realloc() which exits through err() when it returns NULL */
static void *
erealloc(void *p, size_t siz)
{
	if (!(p = realloc(p, siz)))
		err(1, "realloc");

	return p;
}

/* calloc() which exits through err() when it returns NULL */
static void *
ecalloc(size_t nmemb, size_t size)
{
	void *p;

	if (!(p = calloc(nmemb, size)))
		err(1, "calloc");
	return p;
}

/* check if string has a non-empty scheme / protocol part */
static int
uri_hasscheme(const char *s)
{
	const char *p = s;

	/* skip the characters that are accepted in a scheme name */
	for (; ISALPHA((unsigned char)*p) || ISDIGIT((unsigned char)*p) ||
	       *p == '+' || *p == '-' || *p == '.'; p++)
		;
	/* scheme, except if empty and starts with ":" then it is a path */
	return (*p == ':' && p != s);
}

/* Split the URI string `s` into its parts and store each part NUL-terminated
   in `u`. All fields of `u` are emptied first.
   Returns 0 on success and -1 when a part does not fit in its field, when an
   IPv6 host has no closing "]" (or the "]" is found before offset 3), or when
   a non-empty port is not a number in the range 1 - 65535. On -1 the fields
   parsed so far remain set in `u`. */
static int
uri_parse(const char *s, struct uri *u)
{
	const char *p = s;
	char *endptr;
	size_t i;
	long l;

	u->proto[0] = u->userinfo[0] = u->host[0] = u->port[0] = '\0';
	u->path[0] = u->query[0] = u->fragment[0] = '\0';

	/* protocol-relative */
	if (*p == '/' && *(p + 1) == '/') {
		p += 2; /* skip "//" */
		goto parseauth;
	}

	/* scheme / protocol part */
	for (; ISALPHA((unsigned char)*p) || ISDIGIT((unsigned char)*p) ||
	       *p == '+' || *p == '-' || *p == '.'; p++)
		;
	/* scheme, except if empty and starts with ":" then it is a path */
	if (*p == ':' && p != s) {
		if (*(p + 1) == '/' && *(p + 2) == '/')
			p += 3; /* skip "://" */
		else
			p++; /* skip ":" */

		if ((size_t)(p - s) >= sizeof(u->proto))
			return -1; /* protocol too long */
		memcpy(u->proto, s, p - s);
		u->proto[p - s] = '\0';

		/* the scheme was not followed by "//": no authority part */
		if (*(p - 1) != '/')
			goto parsepath;
	} else {
		p = s; /* no scheme format, reset to start */
		goto parsepath;
	}

parseauth:
	/* userinfo (username:password) */
	i = strcspn(p, "@/?#");
	if (p[i] == '@') {
		if (i >= sizeof(u->userinfo))
			return -1; /* userinfo too long */
		memcpy(u->userinfo, p, i);
		u->userinfo[i] = '\0';
		p += i + 1;
	}

	/* IPv6 address */
	if (*p == '[') {
		/* bracket not found, host too short or too long */
		i = strcspn(p, "]");
		if (p[i] != ']' || i < 3)
			return -1;
		i++; /* including "]" */
	} else {
		/* domain / host part, skip until port, path or end. */
		i = strcspn(p, ":/?#");
	}
	if (i >= sizeof(u->host))
		return -1; /* host too long */
	memcpy(u->host, p, i);
	u->host[i] = '\0';
	p += i;

	/* port */
	if (*p == ':') {
		p++;
		if ((i = strcspn(p, "/?#")) >= sizeof(u->port))
			return -1; /* port too long */
		memcpy(u->port, p, i);
		u->port[i] = '\0';
		/* check for valid port: range 1 - 65535, may be empty */
		errno = 0;
		l = strtol(u->port, &endptr, 10);
		if (i && (errno || *endptr || l <= 0 || l > 65535))
			return -1;
		p += i;
	}

parsepath:
	/* path */
	if ((i = strcspn(p, "?#")) >= sizeof(u->path))
		return -1; /* path too long */
	memcpy(u->path, p, i);
	u->path[i] = '\0';
	p += i;

	/* query */
	if (*p == '?') {
		p++;
		if ((i = strcspn(p, "#")) >= sizeof(u->query))
			return -1; /* query too long */
		memcpy(u->query, p, i);
		u->query[i] = '\0';
		p += i;
	}

	/* fragment */
	if (*p == '#') {
		p++;
		if ((i = strlen(p)) >= sizeof(u->fragment))
			return -1; /* fragment too long */
		memcpy(u->fragment, p, i);
		u->fragment[i] = '\0';
	}

	return 0;
}

/* Transform and try to make the URI `u` absolute using base URI `b` into `a`.
   Follows some of the logic from "RFC 3986 - 5.2.2. Transform References".
   Returns 0 on success, -1 on error or truncation.
*/ static int uri_makeabs(struct uri *a, struct uri *u, struct uri *b) { char *p; int c; strlcpy(a->fragment, u->fragment, sizeof(a->fragment)); if (u->proto[0] || u->host[0]) { strlcpy(a->proto, u->proto[0] ? u->proto : b->proto, sizeof(a->proto)); strlcpy(a->host, u->host, sizeof(a->host)); strlcpy(a->userinfo, u->userinfo, sizeof(a->userinfo)); strlcpy(a->host, u->host, sizeof(a->host)); strlcpy(a->port, u->port, sizeof(a->port)); strlcpy(a->path, u->path, sizeof(a->path)); strlcpy(a->query, u->query, sizeof(a->query)); return 0; } strlcpy(a->proto, b->proto, sizeof(a->proto)); strlcpy(a->host, b->host, sizeof(a->host)); strlcpy(a->userinfo, b->userinfo, sizeof(a->userinfo)); strlcpy(a->host, b->host, sizeof(a->host)); strlcpy(a->port, b->port, sizeof(a->port)); if (!u->path[0]) { strlcpy(a->path, b->path, sizeof(a->path)); } else if (u->path[0] == '/') { strlcpy(a->path, u->path, sizeof(a->path)); } else { a->path[0] = (b->host[0] && b->path[0] != '/') ? '/' : '\0'; a->path[1] = '\0'; if ((p = strrchr(b->path, '/'))) { c = *(++p); *p = '\0'; /* temporary NUL-terminate */ if (strlcat(a->path, b->path, sizeof(a->path)) >= sizeof(a->path)) return -1; *p = c; /* restore */ } if (strlcat(a->path, u->path, sizeof(a->path)) >= sizeof(a->path)) return -1; } if (u->path[0] || u->query[0]) strlcpy(a->query, u->query, sizeof(a->query)); else strlcpy(a->query, b->query, sizeof(a->query)); return 0; } static int uri_format(char *buf, size_t bufsiz, struct uri *u) { return snprintf(buf, bufsiz, "%s%s%s%s%s%s%s%s%s%s%s%s", u->proto, u->userinfo[0] ? u->userinfo : "", u->userinfo[0] ? "@" : "", u->host, u->port[0] ? ":" : "", u->port, u->host[0] && u->path[0] && u->path[0] != '/' ? "/" : "", u->path, u->query[0] ? "?" : "", u->query, u->fragment[0] ? 
"#" : "", u->fragment); } /* compare tag name (case-insensitive) */ static int tagcmp(const char *s1, const char *s2) { return strcasecmp(s1, s2); } /* compare attribute name (case-insensitive) */ static int attrcmp(const char *s1, const char *s2) { return strcasecmp(s1, s2); } static void rindent(void) { int i, total; total = indent + defaultindent; if (total < 0) total = 0; for (i = 0; i < total; i++) putchar(' '); nbytesline += total; ncells += total; } static void emitmarkup(int markuptype) { if (!allowansi) return; if (!markuptype) fputs("\033[0m", stdout); /* reset all attributes */ /* set */ if (markuptype & MarkupBold) fputs("\033[1m", stdout); if (markuptype & MarkupItalic) fputs("\033[3m", stdout); if (markuptype & MarkupUnderline) fputs("\033[4m", stdout); if (markuptype & MarkupBlink) fputs("\033[5m", stdout); if (markuptype & MarkupReverse) fputs("\033[7m", stdout); if (markuptype & MarkupStrike) fputs("\033[9m", stdout); } /* flush remaining buffer (containing a word): used for word-wrap handling */ static void hflush(void) { int i; if (!rbuflen) return; if (!nbytesline) { if (curmarkup) emitmarkup(0); rindent(); /* emit code again per line, needed for GNU/less -R */ if (curmarkup) emitmarkup(curmarkup); } for (i = 0; i < rbuflen; i++) putchar(rbuf[i]); nbytesline += rbuflen; ncells += rnbufcells; rbuflen = 0; rnbufcells = 0; } static void printansi(const char *s) { size_t len; if (!allowansi) return; if (linewrap) { len = strlen(s); if (rbuflen + len + 1 >= sizeof(rbuf)) hflush(); if (rbuflen + len + 1 < sizeof(rbuf)) { memcpy(rbuf + rbuflen, s, len); rbuflen += len; /* NOTE: nbytesline and ncells are not counted for markup */ } } else { fputs(s, stdout); } } static void setmarkup(int markuptype) { if (!allowansi) return; /* need change? 
*/
	if (curmarkup == markuptype)
		return;

	/* no markup at all anymore: one reset code is enough */
	if (!markuptype) {
		printansi("\033[0m"); /* reset all attributes */
		curmarkup = markuptype;
		return;
	}

	/* set: only the attributes which are not active yet */
	if (!(curmarkup & MarkupBold) && (markuptype & MarkupBold))
		printansi("\033[1m");
	if (!(curmarkup & MarkupItalic) && (markuptype & MarkupItalic))
		printansi("\033[3m");
	if (!(curmarkup & MarkupUnderline) && (markuptype & MarkupUnderline))
		printansi("\033[4m");
	if (!(curmarkup & MarkupBlink) && (markuptype & MarkupBlink))
		printansi("\033[5m");
	if (!(curmarkup & MarkupReverse) && (markuptype & MarkupReverse))
		printansi("\033[7m");
	if (!(curmarkup & MarkupStrike) && (markuptype & MarkupStrike))
		printansi("\033[9m");

	/* unset: only the attributes which are active now */
	if ((curmarkup & MarkupBold) && !(markuptype & MarkupBold))
		printansi("\033[22m"); /* reset bold or faint */
	if ((curmarkup & MarkupItalic) && !(markuptype & MarkupItalic))
		printansi("\033[23m"); /* reset italic */
	if ((curmarkup & MarkupUnderline) && !(markuptype & MarkupUnderline))
		printansi("\033[24m"); /* reset underline */
	if ((curmarkup & MarkupBlink) && !(markuptype & MarkupBlink))
		printansi("\033[25m"); /* reset blink */
	if ((curmarkup & MarkupReverse) && !(markuptype & MarkupReverse))
		printansi("\033[27m"); /* reset reverse */
	if ((curmarkup & MarkupStrike) && !(markuptype & MarkupStrike))
		printansi("\033[29m"); /* reset strike */

	curmarkup = markuptype;
}

/* enable the attributes in `markuptype` in addition to the current markup */
static void
startmarkup(int markuptype)
{
	setmarkup(curmarkup | markuptype);
}

/* disable the attributes in `markuptype`, keep the other current markup */
static void
endmarkup(int markuptype)
{
	setmarkup(curmarkup & ~markuptype);
}

/* rough cell width of a unicode codepoint by counting a unicode codepoint as 1
   cell in general.
   NOTE: this is of course incorrect since characters can be 2 width as well,
   in the future maybe replace this with wcwidth() or similar */
static int
utfwidth(int c)
{
	/* not the start of a codepoint */
	if ((c & 0xc0) == 0x80)
		return 0;
	/* count TAB as 8 */
	if (c == '\t')
		return 8;
	return 1;
}

/* write a character, handling state of repeated newlines, some HTML
   white-space rules, indentation and word-wrapping */
static void
hputchar(int c)
{
	struct node *cur = &nodes[curnode];
	/* set for every call, also when the character is skipped below */
	cur->hasdata = 1;

	if (c == '\n') {
		/* previous line had characters, so not a repeated newline */
		if (nbytesline > 0)
			hadnewline = 0;

		/* start a new line, no chars on this line yet */
		whitespace_mode &= ~2; /* no chars on this line yet */
		nbytesline = 0;
		ncells = 0;

		if (hadnewline)
			currentnewlines++; /* repeating newlines */
		hadnewline = 1;
	} else {
		hadnewline = 0;
		currentnewlines = 0;
	}

	/* skip initial/leading white-space */
	if (ISSPACE((unsigned char)c)) {
		if (skipinitialws)
			return;
	} else {
		skipinitialws = 0;
	}

	/* drop control characters, except for newline and TAB */
	if (!(c == '\n' || c == '\t' || !ISCNTRL((unsigned char)c)))
		return;

	/* no line-wrapping: write directly, without the word buffer */
	if (!linewrap) {
		if (c == '\n') {
			putchar('\n');
			nbytesline = 0;
			ncells = 0;
		} else {
			if (!nbytesline) {
				if (curmarkup)
					emitmarkup(0);
				rindent();
				/* emit code again per line, needed for GNU/less -R */
				if (curmarkup)
					emitmarkup(curmarkup);
			}
			putchar(c);
			nbytesline++;
			ncells += utfwidth(c);
		}
		return;
	}

	/* really too long: the whole word doesn't even fit, flush it */
	if (ncells + rnbufcells >= termwidth || rbuflen >= sizeof(rbuf) - 1) {
		putchar('\n');
		nbytesline = 0;
		ncells = 0;
		hflush();
	}

	if (c == '\n') {
		putchar('\n');
		hflush();
		return;
	} else if (ISSPACE((unsigned char)c) || c == '-') {
		/* word boundary: break the line first if the word would
		   not fit, then write the word including this character */
		if (ncells + rnbufcells >= termwidth) {
			putchar('\n');
			nbytesline = 0;
			ncells = 0;
		}
		rbuf[rbuflen++] = c;
		rnbufcells += utfwidth(c);
		hflush();
		return;
	}

	/* part of a word: buffer it until the next word boundary */
	rbuf[rbuflen++] = c;
	rnbufcells += utfwidth(c);
}

/* calculate indentation of current node depth, using the sum of each
   indentation per node */
static int
calcindent(void)
{
	int i, n =
0; for (i = curnode; i >= 0; i--) n += nodes[i].indent; return n; } static void hprint(const char *s) { for (; *s; ++s) hputchar(*s); } /* printf(), max 256 bytes for now */ static void hprintf(const char *fmt, ...) { va_list ap; char buf[256]; va_start(ap, fmt); vsnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); /* use hprint() formatting logic. */ hprint(buf); } static void newline(void) { if (skipinitialws) return; hputchar('\n'); } static int parentcontainerhasdata(int curtype, int n) { int i; for (i = n; i >= 0; i--) { if (nodes[i].tag.displaytype & (DisplayList|DisplayTable)) break; if (nodes[i].hasdata) return 1; } return 0; } /* start on a newline for the start of a block element or not */ static void startblock(void) { hflush(); whitespace_mode &= ~2; /* no characters on this line yet */ if (nbytesline <= 0) return; if (!hadnewline && curnode >= 0 && nodes[curnode - 1].hasdata) hputchar('\n'); } /* start on a newline for the end of a block element or not */ static void endblock(void) { hflush(); whitespace_mode &= ~2; /* no characters on this line yet */ if (nbytesline <= 0) return; if (!hadnewline) hputchar('\n'); } /* print one character safely: no control characters, handle HTML white-space rules */ static void printc(int c) { if (ISSPACE((unsigned char)c)) { if (whitespace_mode == 2) hputchar(' '); whitespace_mode |= 1; } else { whitespace_mode = 2; if (!ISCNTRL((unsigned char)c)) hputchar(c); } } static void printpre(const char *s, size_t len) { struct node *cur; size_t i; /* reset state of newlines because this data is printed literally */ hadnewline = 0; currentnewlines = 0; /* skip leading newline */ i = 0; if (skipinitialws) { if (*s == '\n' && i < len) { s++; i++; } } hflush(); skipinitialws = 0; if (*s) { cur = &nodes[curnode]; cur->hasdata = 1; } for (; *s && i < len; s++, i++) { switch (*s) { case '\n': putchar('\n'); nbytesline = 0; ncells = 0; break; case '\t': hadnewline = 0; if (!nbytesline) { if (curmarkup) emitmarkup(0); rindent(); /* 
emit code again per line, needed for GNU/less -R */ if (curmarkup) emitmarkup(curmarkup); } /* TAB to 8 spaces */ fputs(" ", stdout); nbytesline += 8; ncells += 8; break; default: if (ISCNTRL((unsigned char)*s)) continue; if (!nbytesline) { if (curmarkup) emitmarkup(0); rindent(); /* emit code again per line, needed for GNU/less -R */ if (curmarkup) emitmarkup(curmarkup); } putchar(*s); nbytesline++; /* start of rune: incorrectly assume 1 rune is 1 cell for now */ ncells += utfwidth((unsigned char)*s); } } } static struct node * findparenttype(int cur, int findtype) { int i; for (i = cur; i >= 0; i--) { if ((nodes[i].tag.displaytype & findtype)) return &nodes[i]; } return NULL; } static int isclassmatch(const char *haystack, const char *needle) { const char *p; size_t needlelen; size_t matched = 0; needlelen = strlen(needle); for (p = haystack; *p; p++) { if (ISSPACE((unsigned char)*p)) { matched = 0; continue; } if (needle[matched] == *p) matched++; else matched = 0; if (matched == needlelen) { if (*(p + 1) == '\0' || ISSPACE((unsigned char)*(p + 1))) return 1; } } return 0; } /* very limited CSS-like selector, supports: main, main#id, main.class, ".class", "#id", "ul li a" */ static int compileselector(const char *sel, struct selectornode *nodes, size_t maxnodes) { int depth = 0, len; long l; const char *s, *start; char tmp[256]; int nameset = 0; memset(&nodes[0], 0, sizeof(nodes[0])); nodes[0].index = -1; s = sel; for (; *s && ISSPACE((unsigned char)*s); s++) ; start = s; for (; ; s++) { /* end of tag */ if (!nameset && (*s == '#' || *s == '.' 
|| *s == '@' || *s == '\0' || ISSPACE((unsigned char)*s))) { nameset = 1; len = s - start; /* tag name */ if (len >= sizeof(tmp)) return 0; if (len) memcpy(tmp, start, len); tmp[len] = '\0'; memcpy(nodes[depth].tagname, tmp, len + 1); } /* end */ if (*s == '\0' || ISSPACE((unsigned char)*s)) { for (; ISSPACE((unsigned char)*s); s++) ; start = s; /* start of a new tag */ depth++; if (depth >= maxnodes) return 0; nameset = 0; memset(&nodes[depth], 0, sizeof(nodes[depth])); nodes[depth].index = -1; /* end of selector */ if (*s == '\0') break; } /* index */ if (*s == '@') { len = strcspn(s + 1, ".#@ \t\n"); if (len >= sizeof(tmp)) return 0; memcpy(tmp, s + 1, len); tmp[len] = '\0'; l = strtol(tmp, NULL, 10); if (l >= 0) nodes[depth].index = l; s += len; start = s + 1; continue; } /* id */ if (*s == '#') { len = strcspn(s + 1, ".#@ \t\n"); if (len >= sizeof(tmp)) return 0; memcpy(tmp, s + 1, len); tmp[len] = '\0'; memcpy(nodes[depth].id, tmp, len + 1); s += len; start = s + 1; continue; } /* class */ if (*s == '.') { len = strcspn(s + 1, ".#@ \t\n"); if (len >= sizeof(tmp)) return 0; memcpy(tmp, s + 1, len); tmp[len] = '\0'; /* allow only one classname for now */ memcpy(nodes[depth].classnames, tmp, len + 1); s += len; start = s + 1; continue; } } return depth; } static struct selector * newselector(const char *q) { struct selector *sel; int r; sel = ecalloc(1, sizeof(*sel)); sel->text = estrdup(q); r = compileselector(sel->text, sel->nodes, LEN(sel->nodes)); if (r <= 0) { free(sel); return NULL; } sel->depth = r; return sel; } static struct selectors * compileselectors(const char *q) { struct selectors *sels = NULL; struct selector *sel; const char *start; char *qe; int count = 0; size_t siz; sels = ecalloc(1, sizeof(*sels)); start = q; for (; ; q++) { if (*q == ',' || *q == '\0') { qe = estrndup(start, q - start); sel = newselector(qe); free(qe); /* add new selector */ siz = (count + 1) * sizeof(struct selector *); sels->selectors = erealloc(sels->selectors, siz); 
sels->selectors[count] = sel; count++; if (*q == '\0') break; start = q + 1; } } sels->count = count; return sels; } /* very limited CSS-like matcher, supports: main, main#id, main.class, ".class", "#id", "ul li a" */ static int iscssmatch(struct selector *sel, struct node *root, int maxdepth) { int d, md = 0; for (d = 0; d <= maxdepth; d++) { /* tag matched? */ if (sel->nodes[md].tagname[0] && strcasecmp(sel->nodes[md].tagname, root[d].tagname)) continue; /* no */ /* id matched? */ if (sel->nodes[md].id[0] && strcmp(sel->nodes[md].id, root[d].id)) continue; /* no */ /* class matched, for now allow only one classname in the selector, matching multiple classnames */ if (sel->nodes[md].classnames[0] && !isclassmatch(root[d].classnames, sel->nodes[md].classnames)) continue; /* no */ /* index matched */ if (sel->nodes[md].index != -1 && (d == 0 || root[d - 1].nchildren == 0 || sel->nodes[md].index != root[d - 1].nchildren - 1)) continue; md++; /* all matched of one selector */ if (md == sel->depth) return 1; } return 0; } static int iscssmatchany(struct selectors *sels, struct node *root, int maxdepth) { struct selector *sel; int i; for (i = 0; i < sels->count; i++) { sel = sels->selectors[i]; if (iscssmatch(sel, root, maxdepth)) return 1; } return 0; } static void handleinlinealt(void) { struct node *cur; char *start, *s, *e; /* do not show the alt text if the element is hidden */ cur = &nodes[curnode]; if (cur->tag.displaytype & DisplayNone) return; /* show img alt attribute as text. 
*/ if (attr_alt.len) { start = attr_alt.data; e = attr_alt.data + attr_alt.len; for (s = start; s < e; s++) printc((unsigned char)*s); hflush(); } else if (cur->tag.id == TagImg && !showurlinline) { /* if there is no alt text and no URL is shown inline, then show "[IMG]" to indicate there was an image there */ hprint("[IMG]"); } } /* lookup a link reference by url in the red-black tree */ static struct linkref * findlinkref(const char *url) { struct linkref find; find.url = (char *)url; return RB_FIND(linkreftree, &linkrefhead, &find); } /* add a link reference. Returns the added link reference, or the existing link reference if links are deduplicated */ static struct linkref * addlinkref(const char *url, const char *_type, enum TagId tagid, int ishidden) { struct linkref *link; size_t linknr; /* if links are deduplicates return the existing link */ if (uniqrefs && (link = findlinkref(url))) return link; if (tagid == TagA) _type = "link"; link = ecalloc(1, sizeof(*link)); if (!ishidden) { linknr = ++nvisrefs; if (nvisrefs >= ncapvisrefs) { ncapvisrefs += 256; /* greedy alloc */ visrefs = erealloc(visrefs, sizeof(*visrefs) * ncapvisrefs); } visrefs[linknr - 1] = link; /* add pointer to list */ } else { linknr = ++nhiddenrefs; if (nhiddenrefs >= ncaphiddenrefs) { ncaphiddenrefs += 256; /* greedy alloc */ hiddenrefs = erealloc(hiddenrefs, sizeof(*hiddenrefs) * ncaphiddenrefs); } hiddenrefs[linknr - 1] = link; /* add pointer to list */ } link->url = estrdup(url); link->type = estrdup(_type); link->tagid = tagid; link->ishidden = ishidden; link->linknr = linknr; /* add to tree: the tree is only used for checking unique link references */ if (uniqrefs) RB_INSERT(linkreftree, &linkrefhead, link); return link; } static void handleinlinelink(void) { struct uri newuri, olduri; struct node *cur; char buf[4096], *url; int r; if (!showrefbottom && !showrefinline && !showurlinline && !resources) return; /* there is no need to collect the reference */ if (!attr_href.len && 
!attr_src.len && !attr_data.len) return; /* there is no reference */ /* by default use the original URL */ if (attr_src.len) url = attr_src.data; else if (attr_href.len) url = attr_href.data; else url = attr_data.data; if (!url) return; /* Not an absolute URL yet: try to make it absolute. If it is not possible use the relative URL */ if (!uri_hasscheme(url) && basehrefset && uri_parse(url, &olduri) != -1 && uri_makeabs(&newuri, &olduri, &base) != -1 && newuri.proto[0]) { r = uri_format(buf, sizeof(buf), &newuri); if (r >= 0 && (size_t)r < sizeof(buf)) url = buf; } if (!url[0]) return; cur = &nodes[curnode]; if (!(cur->tag.displaytype & DisplayNone)) { string_clear(&nodes_links[curnode]); string_append(&nodes_links[curnode], url, strlen(url)); } /* add hidden links directly to the reference, the order doesn't matter */ if (cur->tag.displaytype & DisplayNone) addlinkref(url, cur->tag.name, cur->tag.id, 1); } static void printlinkrefs(void) { struct linkref *ref; size_t i; if (!nvisrefs && !nhiddenrefs) return; if (resources) { for (i = 0; i < nvisrefs; i++) { ref = visrefs[i]; dprintf(3, "%s\t%s\n", ref->type, ref->url); } for (i = 0; i < nhiddenrefs; i++) { ref = hiddenrefs[i]; dprintf(3, "%s\t%s\n", ref->type, ref->url); } } printf("\nReferences\n\n"); for (i = 0; i < nvisrefs; i++) { ref = visrefs[i]; printf(" %zu. %s (%s)\n", ref->linknr, ref->url, ref->type); } if (nhiddenrefs > 0) printf("\n\nHidden references\n\n"); /* hidden links don't have a link number, just count them */ for (i = 0; i < nhiddenrefs; i++) { ref = hiddenrefs[i]; printf(" %zu. 
%s (%s)\n", ref->linknr, ref->url, ref->type); } } /* size to grow node capacity (greedy) */ #define NODE_CAP_INC 16 /* increase node depth, allocate space for nodes if needed */ static void incnode(void) { size_t i; curnode++; if (curnode >= MAX_NODE_DEPTH) errx(1, "max node depth reached: %d", curnode); if (curnode >= ncapnodes) { nodes = erealloc(nodes, sizeof(*nodes) * (ncapnodes + NODE_CAP_INC)); nodes_links = erealloc(nodes_links, sizeof(*nodes_links) * (ncapnodes + NODE_CAP_INC)); /* clear new region */ memset(&nodes[ncapnodes], 0, sizeof(*nodes) * NODE_CAP_INC); memset(&nodes_links[ncapnodes], 0, sizeof(*nodes_links) * NODE_CAP_INC); for (i = 0; i < ncapnodes; i++) nodes[i].tag.name = nodes[i].tagname; /* assign to use fixed-size buffer */ for (i = ncapnodes; i < ncapnodes + NODE_CAP_INC; i++) { nodes[i].tag.displaytype = DisplayInline; nodes[i].tag.name = nodes[i].tagname; /* assign to use fixed-size buffer */ } ncapnodes += NODE_CAP_INC; /* greedy alloc */ } } static void xmldatastart(XMLParser *p) { } static void xmldataend(XMLParser *p) { struct node *cur; char *start, *s, *e; if (!htmldata.data || !htmldata.len) return; cur = &nodes[curnode]; if (reader_ignore || (cur->tag.displaytype & DisplayNone)) { /* print nothing */ } else if ((cur->tag.displaytype & DisplayPre) || findparenttype(curnode - 1, DisplayPre)) { printpre(htmldata.data, htmldata.len); } else { start = htmldata.data; e = htmldata.data + htmldata.len; for (s = start; s < e; s++) printc((unsigned char)*s); } string_clear(&htmldata); } static void xmldata(XMLParser *p, const char *data, size_t datalen) { struct node *cur; if (reader_ignore) return; cur = &nodes[curnode]; if (cur->tag.displaytype & DisplayNone) return; string_append(&htmldata, data, datalen); } static void xmldataentity(XMLParser *p, const char *data, size_t datalen) { struct node *cur; char buf[8]; int len; if (reader_ignore) return; cur = &nodes[curnode]; if (cur->tag.displaytype & DisplayNone) return; len = 
xml_entitytostr(data, buf, sizeof(buf)); if (len > 0) xmldata(p, buf, (size_t)len); else xmldata(p, data, datalen); } static void xmlcdatastart(XMLParser *p) { xmldatastart(p); } static void xmlcdataend(XMLParser *p) { xmldataend(p); /* treat CDATA as data */ } static void xmlcdata(XMLParser *p, const char *data, size_t datalen) { xmldata(p, data, datalen); /* treat CDATA as data */ } /* lookup function to compare tag name (case-insensitive) for sort functions */ static int findtagcmp(const void *v1, const void *v2) { struct tag *t1 = (struct tag *)v1; struct tag *t2 = (struct tag *)v2; return strcasecmp(t1->name, t2->name); } /* binary search tag by tag name */ static struct tag * findtag(const char *t) { struct tag find = { 0 }; find.name = t; return bsearch(&find, tags, LEN(tags), sizeof(*tags), findtagcmp); } static void handleendtag(struct tag *tag) { int i, marginbottom; if (tag->displaytype & DisplayNone) return; if (reader_ignore) return; if (tag->displaytype & (DisplayButton | DisplayOption)) { hputchar(']'); hflush(); } if (tag->displaytype & (DisplayBlock | DisplayHeader | DisplayTable | DisplayTableRow | DisplayList | DisplayListItem | DisplayPre)) { endblock(); /* break line if needed */ } /* when a list ends and its not inside a list add an extra bottom margin */ marginbottom = tag->marginbottom; if (marginbottom > 0) { if (tag->displaytype & DisplayList) { if (findparenttype(curnode - 1, DisplayList)) marginbottom--; } } if (marginbottom > 0) { hflush(); for (i = currentnewlines; i < marginbottom; i++) { putchar('\n'); nbytesline = 0; ncells = 0; currentnewlines++; } hadnewline = 1; } } static void endnode(struct node *cur) { struct linkref *ref; int i, ishidden; /* set a flag indicating the element and its parent containers have data. 
This is used for some formatting */ if (cur->hasdata) { for (i = curnode; i >= 0; i--) nodes[i].hasdata = 1; } endmarkup(cur->tag.markuptype); ishidden = reader_ignore || (cur->tag.displaytype & DisplayNone); /* add link and show the link number in the visible order */ if (!ishidden && nodes_links[curnode].len > 0) { ref = addlinkref(nodes_links[curnode].data, cur->tag.name, cur->tag.id, ishidden); if (showrefinline || showurlinline) { hflush(); startmarkup(MarkupReverse); } if (showrefinline) hprintf("[%zu]", ref->linknr); if (showurlinline) { if (ref->tagid == TagA) hprintf("[%s]", ref->url); else hprintf("[%s: %s]", ref->type, ref->url); } if (showrefinline || showurlinline) { endmarkup(MarkupReverse); hflush(); } } handleendtag(&(cur->tag)); } static void xmltagend(XMLParser *p, const char *t, size_t tl, int isshort) { struct tag *found, *tag; enum TagId childs[16]; size_t nchilds; int i, j, k, nchildfound, parenttype; /* match tag and lookup metadata */ /* ignore closing of void elements, like
, which is not allowed */
	if ((found = findtag(t))) {
		/* a void element is only ended through the short form */
		if (!isshort && found->isvoid)
			return;
	}

	/* TODO: implement more complete optional tag handling.
	   in reality the optional tag rules are more complex, see:
	   https://html.spec.whatwg.org/multipage/syntax.html#optional-tags */

	/* childs: tag ids of still open elements that the closing tag ends
	   implicitly; parenttype: display type at which the search for such
	   an element stops */
	nchilds = 0;
	nchildfound = 0;
	parenttype = 0; /* by default, seek until the root */

	if (found && found->displaytype & DisplayPre) {
		skipinitialws = 0; /* do not skip white-space, for margins */
	} else if (found && found->displaytype & DisplayList) {
		childs[0] = TagLi;
		nchilds = 1;
		parenttype = DisplayList;
	} else if (found && found->displaytype & DisplayTableRow) {
		childs[0] = TagTd;
		nchilds = 1;
		parenttype = DisplayTableRow;
	} else if (found && found->displaytype & DisplayTable) {
		childs[0] = TagTd;
		nchilds = 1;
		parenttype = DisplayTable;
	} else if (found && found->displaytype & DisplaySelect) {
		childs[0] = TagOption;
		nchilds = 1;
		parenttype = DisplaySelect;
	} else if (found && found->displaytype & DisplayDl) {
		childs[0] = TagP;
		childs[1] = TagDd;
		childs[2] = TagDt;
		nchilds = 3;
		parenttype = DisplayDl;
	} else if (found && found->displaytype & DisplayBlock) {
		childs[0] = TagP;
		nchilds = 1;
		parenttype = 0; /* seek until the root */
	}

	if (nchilds > 0) {
		/* walk from the current node towards the root */
		for (i = curnode; i >= 0; i--) {
			if (nchildfound)
				break;
			if ((nodes[i].tag.displaytype & parenttype))
				break;
			for (j = 0; j < nchilds; j++) {
				if (nodes[i].tag.id == childs[j]) {
					/* fake closing the previous tags */
					for (k = curnode; k >= i; k--)
						endnode(&nodes[k]);
					/* k is i - 1 here: continue at the parent
					   of the element that was closed */
					curnode = k;
					nchildfound = 1;
					break;
				}
			}
		}
	}

	/* if the current closing tag matches the current open tag */
	if (nodes[curnode].tag.name &&
	    !tagcmp(nodes[curnode].tag.name, t)) {
		endnode(&nodes[curnode]);
		if (curnode)
			curnode--;
	} else {
		/* ... else lookup the first matching start tag. This is also
		   for handling optional closing tags */
		tag = NULL;
		for (i = curnode; i >= 0; i--) {
			if (nodes[i].tag.name &&
			    !tagcmp(nodes[i].tag.name, t)) {
				endnode(&nodes[i]);
				/* drop to the parent of the matched node,
				   but never below depth 0 */
				curnode = i > 0 ? i - 1 : 0;
				tag = &nodes[i].tag;
				break;
			}
		}
		/* unmatched closing tag found */
		if (!tag && found)
			handleendtag(found);
	}
	indent = calcindent();

#if 0
	/* check if linewrap is enabled, but currently is disabled and needs to
	   be restored */
	if (allowlinewrap && !linewrap) {
		tag = NULL;
		for (i = curnode; i >= 0; i--) {
			if (nodes[i].tag.id == TagTable) {
				tag = &nodes[i].tag;
				break;
			}
		}
		if (!tag)
			linewrap = allowlinewrap;
	}
#endif

	/* restore markup of the tag we are in now */
	startmarkup(nodes[curnode].tag.markuptype);

	/* check if the current node still matches the visible selector */
	if (reader_mode && sel_show && !reader_ignore) {
		if (!iscssmatchany(sel_show, nodes, curnode)) {
			reader_ignore = 1;
			newline();
		}
	}
}

/* Parser callback: a start tag was found, its attributes are not parsed yet.
   Resets the collected attribute values, fake-closes open elements that the
   new tag ends implicitly and sets up a node for the tag one level deeper. */
static void
xmltagstart(XMLParser *p, const char *t, size_t tl)
{
	struct tag *found;
	struct node *cur;
	enum TagId tagid;
	enum TagId childs[16];
	size_t nchilds;
	char *s;
	int i, j, k, nchildfound, parenttype;

	cur = &nodes[curnode];

	/* forget the attribute values collected for the previous tag */
	string_clear(&attr_alt);
	string_clear(&attr_checked);
	string_clear(&attr_class);
	attr_class_set = 0;
	string_clear(&attr_data);
	string_clear(&attr_href);
	string_clear(&attr_id);
	attr_id_set = 0;
	string_clear(&attr_src);
	string_clear(&attr_type);
	string_clear(&attr_value);

	/* match tag and lookup metadata */
	found = findtag(t);

	/* TODO: implement more complete optional tag handling.
	   in reality the optional tag rules are more complex, see:
	   https://html.spec.whatwg.org/multipage/syntax.html#optional-tags */

	nchilds = 0;
	nchildfound = 0;
	parenttype = 0; /* by default, seek until the root */

	/* if optional tag "p"
	   is open and a list element is found, close
	   that open "p"
. */
	/* Pick the tag ids (childs) of open elements that the new start tag
	   ends implicitly and the display type (parenttype) at which the
	   search for them stops. The search itself follows below. */
	if (found && found->displaytype & DisplayList) {
		/* not inside a list */
		childs[0] = TagP;
		nchilds = 1;
		parenttype = DisplayList;
	} else if (found && found->isoptional) {
		tagid = found->id;
		if (tagid == TagLi) {
			/* a new list item ends an open list item */
			childs[0] = TagLi;
			nchilds = 1;
			parenttype = DisplayList;
		} else if (tagid == TagTd) {
			/* a new cell ends an open cell */
			childs[0] = TagTd;
			nchilds = 1;
			parenttype = DisplayTableRow;
		} else if (tagid == TagTr) {
			/* a new row ends an open row */
			childs[0] = TagTr;
			nchilds = 1;
			parenttype = DisplayTable;
		} else if (tagid == TagP) {
			/* a new paragraph ends an open paragraph */
			childs[0] = TagP;
			nchilds = 1;
			parenttype = 0; /* seek until the root */
		} else if (tagid == TagOption) {
			/* a new option ends an open option */
			childs[0] = TagOption;
			nchilds = 1;
			parenttype = DisplaySelect;
		} else if (tagid == TagDt) {
			/* a new term ends an open description */
			childs[0] = TagDd;
			nchilds = 1;
			parenttype = DisplayDl;
		} else if (tagid == TagDd) {
			/* a new description ends an open description or term */
			childs[0] = TagDd;
			childs[1] = TagDt;
			nchilds = 2;
			parenttype = DisplayDl;
		} else if (tagid == cur->tag.id) {
			/* fake closing the previous tag if it is the same and repeated */
			xmltagend(p, t, tl, 0);
		}
	} else if (found && found->displaytype & DisplayBlock) {
		/* the tag ids to look for are assigned right below */
		/* check if we have an open "

" tag */ childs[0] = TagP; childs[1] = TagDl; nchilds = 2; parenttype = DisplayDl; } if (nchilds > 0) { for (i = curnode; i >= 0; i--) { if (nchildfound) break; if ((nodes[i].tag.displaytype & parenttype)) break; for (j = 0; j < nchilds; j++) { if (nodes[i].tag.id == childs[j]) { /* fake closing the previous tags */ for (k = curnode; k >= i; k--) xmltagend(p, nodes[k].tag.name, strlen(nodes[k].tag.name), 0); nchildfound = 1; break; } } } } incnode(); string_clear(&nodes_links[curnode]); /* clear possible link reference for this node */ cur = &nodes[curnode]; memset(cur, 0, sizeof(*cur)); /* clear / reset node */ /* tag defaults */ cur->tag.displaytype = DisplayInline; cur->tag.name = cur->tagname; /* assign fixed-size buffer */ strlcpy(cur->tagname, t, sizeof(cur->tagname)); /* force to lowercase */ for (s = cur->tagname; *s; s++) *s = TOLOWER((unsigned char)*s); /* matched tag: copy tag information to current node */ if (found) memcpy(&(cur->tag), found, sizeof(*found)); /* if parent tag is hidden then hide itself too */ if (curnode > 0 && (nodes[curnode - 1].tag.displaytype & DisplayNone)) cur->tag.displaytype |= DisplayNone; } static void xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort) { struct tag *found; enum TagId tagid; struct node *cur, *parent; int i, margintop; /* match tag and lookup metadata */ tagid = 0; if ((found = findtag(t))) tagid = found->id; /* temporary replace the callback except the reader and end of tag restore the context once we receive the same ignored tag in the end tag handler */ if (tagid == TagScript) { ignorestate = endtag = ""; getnext = p->getnext; /* for restore */ p->getnext = getnext_ignore; xmltagend(p, t, tl, 0); /* fake the call the tag was ended */ return; } else if (tagid == TagStyle) { ignorestate = endtag = ""; getnext = p->getnext; /* for restore */ p->getnext = getnext_ignore; xmltagend(p, t, tl, 0); /* fake the call the tag was ended */ return; } #if 0 /* disable line-wrapping inside tables */ if 
(tagid == TagTable) linewrap = 0; #endif cur = &nodes[curnode]; /* copy attributes if set */ if (attr_id.len) strlcpy(cur->id, attr_id.data, sizeof(cur->id)); else cur->id[0] = '\0'; if (attr_class.len) strlcpy(cur->classnames, attr_class.data, sizeof(cur->classnames)); else cur->classnames[0] = '\0'; /* parent node */ if (curnode > 0) { parent = &nodes[curnode - 1]; parent->nchildren++; /* increase child node count */ /* count visible childnodes */ if (!(cur->tag.displaytype & DisplayNone)) parent->visnchildren++; } else { parent = NULL; } if (reader_mode && sel_show && reader_ignore && iscssmatchany(sel_show, nodes, curnode)) reader_ignore = 0; /* hide element */ if (reader_mode && sel_hide && iscssmatchany(sel_hide, nodes, curnode)) cur->tag.displaytype |= DisplayNone; /* indent for this tag */ cur->indent = cur->tag.indent; if (!reader_ignore) { /* add link reference, print links and alt text */ handleinlinelink(); handleinlinealt(); } /* : show all options */ if (parent->tag.displaytype & DisplaySelectMulti) cur->tag.displaytype |= DisplayBlock; else if (parent->nchildren > 1) /* show the first item as selected */ cur->tag.displaytype |= DisplayNone; /* else hide */ } if (cur->tag.displaytype & DisplayNone) return; if (reader_ignore) return; indent = calcindent(); if ((cur->tag.displaytype & (DisplayBlock | DisplayHeader | DisplayPre | DisplayTable | DisplayTableRow | DisplayList | DisplayListItem))) { startblock(); /* break line if needed */ } if (cur->tag.displaytype & (DisplayButton | DisplayOption)) { hflush(); hputchar('['); } margintop = cur->tag.margintop; if (cur->tag.displaytype & (DisplayList)) { for (i = curnode - 1; i >= 0; i--) { if (nodes[i].tag.displaytype & DisplayList) break; if (!(nodes[i].tag.displaytype & DisplayListItem)) continue; if (nodes[i].hasdata && margintop > 0) { margintop--; break; } } } else if (cur->tag.displaytype & (DisplayBlock|DisplayTable)) { if (!parentcontainerhasdata(cur->tag.displaytype, curnode - 1)) { if (margintop > 
0) margintop--; } } if (margintop > 0) { hflush(); for (i = currentnewlines; i < margintop; i++) { putchar('\n'); nbytesline = 0; ncells = 0; currentnewlines++; } hadnewline = 1; } if (cur->tag.displaytype & DisplayPre) { skipinitialws = 1; } else if (cur->tag.displaytype & DisplayTableCell) { if (parent && parent->visnchildren > 1) hputchar('\t'); } else if (cur->tag.displaytype & DisplayListItem) { /* find first parent node and ordered numbers or unordered */ if (parent) { skipinitialws = 0; /* print bullet, add columns to indentation level */ if (parent->tag.displaytype & DisplayListOrdered) { hprintf("%4zu. ", parent->nchildren); cur->indent = 6; indent += cur->indent; /* align to number */ } else if (parent->tag.displaytype & DisplayList) { hprint(str_bullet_item); cur->indent = 2; indent += 2; /* align to bullet */ } } skipinitialws = 0; } else if (cur->tag.displaytype & DisplayInput) { if (!attr_type.len) { hprintf("[%-15s]", attr_value.len ? attr_value.data : ""); /* default: text */ } else if (!strcasecmp(attr_type.data, "button")) { hprintf("[%s]", attr_value.len ? attr_value.data : ""); } else if (!strcasecmp(attr_type.data, "submit")) { hprintf("[%s]", attr_value.len ? attr_value.data : "Submit Query"); } else if (!strcasecmp(attr_type.data, "reset")) { hprintf("[%s]", attr_value.len ? attr_value.data : "Reset"); } else if (!strcasecmp(attr_type.data, "checkbox")) { hprintf("[%s]", attr_checked.len && !strcasecmp(attr_checked.data, "checked") ? str_checkbox_checked : " "); } else if (!strcasecmp(attr_type.data, "radio")) { hprintf("[%s]", attr_checked.len && !strcasecmp(attr_checked.data, "checked") ? str_radio_checked : " "); } else if (!strcasecmp(attr_type.data, "hidden")) { cur->tag.displaytype |= DisplayNone; } else { /* unrecognized / default case is text */ hprintf("[%-15s]", attr_value.len ? 
attr_value.data : ""); } } startmarkup(cur->tag.markuptype); /* do not count data such as an item bullet as part of the data for the node */ cur->hasdata = 0; if (tagid == TagHr) { /* ruler */ i = termwidth - indent - defaultindent; for (; i > 0; i--) hprint(str_ruler); cur->hasdata = 1; /* treat


as data */ } else if (tagid == TagBr) { hflush(); hadnewline = 0; /* forced newline */ hputchar('\n'); cur->hasdata = 1; /* treat
as data */ } /* autoclose tags, such as
, pretend we are
*/ if (!isshort && cur->tag.isvoid) xmltagend(p, t, tl, 1); /* pretend close of short tag */ } static void xmlattr(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl, const char *v, size_t vl) { struct node *cur; enum TagId tagid; cur = &nodes[curnode]; tagid = cur->tag.id; /* hide tags with attribute aria-hidden or hidden */ if (!attrcmp(n, "aria-hidden") || !attrcmp(n, "hidden")) cur->tag.displaytype |= DisplayNone; if (!attr_class_set && !attrcmp(n, "class")) /* use the first set attribute */ string_append(&attr_class, v, vl); else if (!attr_id_set && !attrcmp(n, "id")) /* use the first set attribute */ string_append(&attr_id, v, vl); else if (!attrcmp(n, "type")) string_append(&attr_type, v, vl); else if (!attrcmp(n, "value")) string_append(&attr_value, v, vl); /* */ if (!basehrefset && tagid == TagBase && !attrcmp(n, "href")) strlcat(basehrefdoc, v, sizeof(basehrefdoc)); if (tagid == TagA && !attrcmp(n, "href")) string_append(&attr_href, v, vl); if (tagid == TagSelect && !attrcmp(n, "multiple")) cur->tag.displaytype |= DisplaySelectMulti; if (tagid == TagObject && !attrcmp(n, "data")) string_append(&attr_data, v, vl); /* show img alt attribute as text. 
*/
	if (tagid == TagImg && !attrcmp(n, "alt"))
		string_append(&attr_alt, v, vl);

	if (cur->tag.displaytype & DisplayInput && !attrcmp(n, "checked"))
		string_append(&attr_checked, v, vl);

	/* src attribute */
	switch (tagid) {
	case TagAudio:
	case TagEmbed:
	case TagFrame:
	case TagIframe:
	case TagImg:
	case TagSource:
	case TagTrack:
	case TagVideo:
		if (!attrcmp(n, "src"))
			string_append(&attr_src, v, vl);
		break;
	default:
		break;
	}
}

/* Parser callback: an entity inside an attribute value.
   Hands the converted text to xmlattr() when the conversion yields
   something, otherwise the entity text as it was found. */
static void
xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl,
	const char *v, size_t vl)
{
	char buf[8];
	const char *val = v;
	size_t vallen = vl;
	int len;

	len = xml_entitytostr(v, buf, sizeof(buf));
	if (len > 0) {
		/* use the converted text instead of the entity */
		val = buf;
		vallen = (size_t)len;
	}

	xmlattr(p, t, tl, n, nl, val, vallen);
}

/* Parser callback: the end of an attribute. */
static void
xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl)
{
	struct node *cur;
	enum TagId tagid;

	cur = &nodes[curnode];
	tagid = cur->tag.id;

	if (!attr_class_set && !attrcmp(n, "class"))
		attr_class_set = 1;
	else if (!attr_id_set && !attrcmp(n, "id"))
		attr_id_set = 1;

	/* set base URL, if it is set it cannot be overwritten again */
	if (!basehrefset && basehrefdoc[0] &&
	    tagid == TagBase && !attrcmp(n, "href"))
		basehrefset = uri_parse(basehrefdoc, &base) != -1 ?
1 : 0; /* if attribute checked is set but it has no value then set it to "checked" */ if (cur->tag.displaytype & DisplayInput && !attrcmp(n, "checked") && !attr_checked.len) string_append(&attr_checked, "checked", sizeof("checked") - 1); } static void xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n, size_t nl) { struct node *cur; enum TagId tagid; cur = &nodes[curnode]; tagid = cur->tag.id; if (!attrcmp(n, "alt")) string_clear(&attr_alt); else if (!attrcmp(n, "checked")) string_clear(&attr_checked); else if (!attr_class_set && !attrcmp(n, "class")) string_clear(&attr_class); else if (!attrcmp(n, "data")) string_clear(&attr_data); else if (!attrcmp(n, "href")) string_clear(&attr_href); else if (!attr_id_set && !attrcmp(n, "id")) string_clear(&attr_id); else if (!attrcmp(n, "src")) string_clear(&attr_src); else if (!attrcmp(n, "type")) string_clear(&attr_type); else if (!attrcmp(n, "value")) string_clear(&attr_value); if (basehrefdoc[0] && tagid == TagBase && !attrcmp(n, "href")) basehrefdoc[0] = '\0'; } static void usage(void) { fprintf(stderr, "%s [-8adiIlrx] [-b basehref] [-s selector] [-u selector] [-w termwidth]\n", argv0); exit(1); } int main(int argc, char **argv) { char *basehref; if (pledge("stdio", NULL) < 0) err(1, "pledge"); ARGBEGIN { case '8': str_bullet_item = "\xe2\x80\xa2 "; str_ruler = "\xe2\x94\x80"; /* symbol: "light horizontal" */ break; case 'a': allowansi = !allowansi; break; case 'b': basehref = EARGF(usage()); if (uri_parse(basehref, &base) == -1 || !base.proto[0]) usage(); basehrefset = 1; break; case 'd': uniqrefs = !uniqrefs; break; case 'i': showrefinline = !showrefinline; break; case 'I': showurlinline = !showurlinline; break; case 'l': showrefbottom = !showrefbottom; break; case 'r': allowlinewrap = !allowlinewrap; break; case 's': sel_show = compileselectors(EARGF(usage())); /* switch to reader/selector mode, ignore all data except when matched */ reader_mode = 1; reader_ignore = 1; break; case 'u': sel_hide = 
compileselectors(EARGF(usage())); /* switch to reader/selector mode */ reader_mode = 1; break; case 'w': if ((termwidth = strtol(EARGF(usage()), NULL, 10)) < 1) usage(); break; case 'x': resources = !resources; break; default: usage(); } ARGEND linewrap = allowlinewrap; /* initial nodes */ ncapnodes = NODE_CAP_INC; nodes = ecalloc(ncapnodes, sizeof(*nodes)); nodes_links = ecalloc(ncapnodes, sizeof(*nodes_links)); parser.xmlattrstart = xmlattrstart; parser.xmlattr = xmlattr; parser.xmlattrentity = xmlattrentity; parser.xmlattrend = xmlattrend; parser.xmlcdatastart = xmlcdatastart; parser.xmlcdata = xmlcdata; parser.xmlcdataend = xmlcdataend; parser.xmldatastart = xmldatastart; parser.xmldata = xmldata; parser.xmldataentity = xmldataentity; parser.xmldataend = xmldataend; parser.xmltagstart = xmltagstart; parser.xmltagstartparsed = xmltagstartparsed; parser.xmltagend = xmltagend; parser.getnext = getchar; xml_parse(&parser); hflush(); if (ncells > 0) newline(); if (showrefbottom || resources) printlinkrefs(); hflush(); setmarkup(0); return 0; }