twebdump.c - webdump - [FORK] git://git.codemadness.org/webdump
(HTM) git clone git://git.z3bra.org/webdump.git
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
twebdump.c (24576B)
---
1 #include <ctype.h>
2 #include <err.h>
3 #include <errno.h>
4 #include <stdio.h>
5 #include <stdlib.h>
6 #include <string.h>
7 #include <strings.h>
8 #include <unistd.h>
9
10 #include "arg.h"
11 char *argv0;
12
13 #include "xml.h"
14
15 static XMLParser parser;
16
17 #ifndef __OpenBSD__
18 #define pledge(p1,p2) 0
19 #endif
20
21 #undef strlcat
22 size_t strlcat(char *, const char *, size_t);
23 #undef strlcpy
24 size_t strlcpy(char *, const char *, size_t);
25
26 /* Components of a URI as split by parseuri(); every field is a NUL-terminated string */
27 struct uri {
28 char proto[48]; /* protocol (scheme), without the "://" separator */
29 char host[256]; /* host name; an IPv6 literal is stored including its brackets */
30 char path[2048]; /* path and everything that follows it in the URI */
31 char port[6]; /* numeric port */
32 };
33
34 /* options, changed by the command-line flags handled in main() */
35 static int allowansi = 0; /* allow ANSI escape codes (-a) */
36 static int showlinkrefs = 0; /* show link references at the bottom (-l) */
37 static int softlinewrap = 0; /* soft line-wrapping (-r) */
38 static int termwidth = 72; /* terminal width in characters (-w) */
39
40 /* singly linked list of link references, in order of appearance */
41 struct linkref {
42 char *type; /* name of the tag the reference came from (allocated copy) */
43 char *url; /* absolute URL, or the mailto:/tel: value as-is (allocated copy) */
44 struct linkref *next; /* next reference or NULL */
45 };
46
47 static struct linkref *links_head; /* first reference, NULL while the list is empty */
48 static struct linkref *links_cur; /* last appended reference; reused as cursor by printlinkrefs() */
49 static int linkcount; /* number of references created so far */
50
51 enum DisplayType { /* bit flags describing how an element is rendered; tags[] ORs them together */
52 DisplayUnknown = 0,
53 DisplayInline = 1 << 0, /* default given to every tag in xmltagstart() */
54 DisplayInlineBlock = 1 << 1, /* not used by any entry in tags[] */
55 DisplayBlock = 1 << 2, /* newline before and after the element */
56 DisplayNone = 1 << 3, /* element and its data are not printed */
57 DisplayPre = 1 << 4, /* data is printed without white-space collapsing */
58 DisplayList = 1 << 5, /* newline before and after the element */
59 DisplayListOrdered = 1 << 6, /* list items are numbered instead of bulleted */
60 DisplayListItem = 1 << 7, /* prefixed with indentation and a bullet or number */
61 DisplayTable = 1 << 8, /* newline after the element */
62 DisplayTableRow = 1 << 9, /* newline before the element */
63 DisplayTableCell = 1 << 10, /* cells after the first are preceded by a tab */
64 DisplayHeader = 1 << 11, /* newline before; h1-h6 get a rule underneath */
65 DisplayBold = 1 << 12, /* ANSI bold on/off around the element */
66 DisplayItalic = 1 << 13, /* ANSI italic on/off around the element */
67 DisplayUnderline = 1 << 14, /* ANSI underline on/off around the element */
68 DisplayBlink = 1 << 15, /* ANSI blink on/off around the element */
69 DisplayReverse = 1 << 16, /* ANSI reverse video on/off; not used by any entry in tags[] */
70 DisplayStrike = 1 << 17, /* ANSI strike-through on/off around the element */
71 };
72
73 struct tag { /* static description of an element, see tags[] */
74 const char *name; /* element name; looked up case-insensitively */
75 enum DisplayType displaytype; /* OR-ed DisplayType flags of the element itself */
76 enum DisplayType parenttype; /* display type of the parent the element belongs to (a list for a list item), 0 for none */
77 int isvoid; /* "void" element: closed automatically right after its start tag */
78 int isoptional; /* optional to close tag */
79 };
80
81 struct node { /* one open element on the nodes[] stack */
82 char tagname[256]; /* lower-cased copy of the tag name as it appeared in the input */
83 struct tag tag; /* copy of the matching tags[] entry, or inline defaults for unknown tags */
84 size_t nchildren; /* number of child nodes belonging to this node, e.g. items of a list */
85 };
86
87 /* Growable NUL-terminated string; the buffer is reused between uses, see string_clear() */
88 typedef struct string {
89 char *data; /* data, NULL until the first append */
90 size_t len; /* string length, excluding the terminating NUL */
91 size_t bufsiz; /* allocated size of data in bytes */
92 } String;
93
94 int absuri(char *, size_t, const char *, const char *);
95 int parseuri(const char *, struct uri *, int);
96
97 static char *basehref = ""; /* base URL used to make relative links absolute (-b) */
98
99 static char src[4096]; /* src or href attribute of the tag being parsed */
100
101 static String htmldata; /* text data collected by xmldata() until xmldataend() prints it */
102
103 /* whitespace_mode is a bit mask for white-space output handling:
104 1 = whitespace seen (repeats are suppressed), 2 = other characters on this line
105 Behaviour:
106 * White-space data before non-whitespace data in tags is ignored on a line.
107 * Repeated white-space is collapsed: a single space (' ') is emitted.
108 */
109 static int whitespace_mode = 0;
110 static size_t ncharsline = 0; /* characters printed by printc() on the current line */
111
112 #define MAX_DEPTH 256
113 static struct node nodes[MAX_DEPTH]; /* stack of open elements; xmltagstart() fills it from index 1 */
114 static int curnode; /* index of the innermost open element in nodes[] */
115
116 #if 0
117 /* TODO: tags with an optional end tag that are not all part of tags[] yet.
   This list is compiled out; same column layout as tags[]. */
118 { "body", 0, 0, 0, 1 },
119 { "colgroup", 0, 0, 0, 1 },
120 { "dd", 0, 0, 0, 1 },
121 { "dt", 0, 0, 0, 1 },
122 { "head", 0, 0, 0, 1 },
123 { "html", 0, 0, 0, 1 },
124 { "li", 0, 0, 0, 1 },
125 { "optgroup", 0, 0, 0, 1 },
126 { "option", 0, 0, 0, 1 },
127 { "option", 0, 0, 0, 1 }, /* NOTE(review): duplicate of the previous entry */
128 { "p", 0, 0, 0, 1 },
129 { "rp", 0, 0, 0, 1 },
130 { "rt", 0, 0, 0, 1 },
131 { "tbody", 0, 0, 0, 1 },
132 { "td", 0, 0, 0, 1 },
133 { "tfoot", 0, 0, 0, 1 },
134 { "th", 0, 0, 0, 1 },
135 { "thead", 0, 0, 0, 1 },
136 { "tr", 0, 0, 0, 1 },
137 #endif
138
139 /* Known elements. Columns: tag name, displaytype, p = parenttype, v = isvoid,
   o = isoptional. The entries must stay sorted by name (case-insensitive):
   findtag() looks them up with bsearch(). */
140 static struct tag tags[] = {
141 { "a", DisplayInline | DisplayUnderline, 0, 0, 0 },
142 { "area", DisplayInline, 0, 1, 0 },
143 { "article", DisplayBlock, 0, 0, 0 },
144 { "audio", DisplayInline | DisplayUnderline, 0, 0, 0 },
145 { "b", DisplayInline | DisplayBold, 0, 0, 0 },
146 { "base", DisplayInline, 0, 1, 0 },
147 { "blink", DisplayInline | DisplayBlink, 0, 0, 0 },
148 { "blockquote", DisplayBlock, 0, 0, 0 },
149 { "br", 0, 0, 1, 0 },
150 { "code", DisplayPre, 0, 0, 0 },
151 { "col", DisplayInline, 0, 1, 0 },
152 { "del", DisplayInline | DisplayStrike, 0, 0, 0 },
153 { "div", DisplayBlock, 0, 0, 0 },
154 { "em", DisplayInline | DisplayItalic, 0, 0, 0 },
155 { "embed", DisplayInline, 0, 1, 0 },
156 { "footer", DisplayBlock, 0, 0, 0 },
157 { "h1", DisplayHeader | DisplayBold, 0, 0, 0 },
158 { "h2", DisplayHeader | DisplayBold, 0, 0, 0 },
159 { "h3", DisplayHeader | DisplayBold, 0, 0, 0 },
160 { "h4", DisplayHeader | DisplayBold, 0, 0, 0 },
161 { "h5", DisplayHeader | DisplayBold, 0, 0, 0 },
162 { "h6", DisplayHeader | DisplayBold, 0, 0, 0 },
163 { "header", DisplayBlock, 0, 0, 0 },
164 { "hr", DisplayBlock, 0, 1, 0 },
165 { "i", DisplayInline | DisplayItalic, 0, 0, 0 },
166 { "img", DisplayInline | DisplayUnderline, 0, 1, 0 },
167 { "input", DisplayInline, 0, 1, 0 },
168 { "li", DisplayListItem, DisplayList, 0, 1 },
169 { "link", DisplayInline, 0, 1, 0 },
170 { "main", DisplayBlock, 0, 0, 0 },
171 { "meta", DisplayInline, 0, 1, 0 },
172 { "nav", DisplayBlock, 0, 0, 0 },
173 { "ol", DisplayList | DisplayListOrdered, 0, 0, 0 },
174 { "p", DisplayBlock, 0, 0, 1 },
175 { "param", DisplayInline, 0, 1, 0 },
176 { "pre", DisplayPre, 0, 0, 0 },
177 { "s", DisplayInline | DisplayStrike, 0, 0, 0 },
178 { "script", DisplayNone, 0, 0, 0 },
179 { "source", DisplayInline, 0, 1, 0 },
180 { "strike", DisplayInline | DisplayStrike, 0, 0, 0 },
181 { "strong", DisplayInline | DisplayBold, 0, 0, 0 },
182 { "style", DisplayNone, 0, 0, 0 },
183 { "table", DisplayTable, 0, 0, 0 },
184 { "td", DisplayTableCell, DisplayTableRow, 0, 0 },
185 { "template", DisplayNone, 0, 0, 0 },
186 { "th", DisplayTableCell | DisplayBold, DisplayTableRow, 0, 1 },
187 { "title", DisplayBlock, 0, 0, 0 },
188 { "tr", DisplayTableRow, DisplayTable, 0, 1 },
189 { "track", DisplayInline, 0, 1, 0 },
190 { "u", DisplayInline | DisplayUnderline, 0, 0, 0 },
191 { "ul", DisplayList, 0, 0, 0 },
192 { "video", DisplayInline | DisplayUnderline, 0, 0, 0 },
193 { "wbr", DisplayInline, 0, 1, 0 },
194 };
195
196 static const char *ignorestate, *endtag;
197 static int (*getnext)(void);
198
199 /* return a space for all data until some case-insensitive string occurs. This
200 is used to parse incorrect HTML/XML that contains unescaped HTML in script
201 or style tags. If you see some </script> tag in a CDATA or comment
202 section then e-mail W3C and tell them the web is too complex. */
203 static inline int
204 getnext_ignore(void)
205 {
206 int c;
207
208 if ((c = getnext()) == EOF)
209 return EOF;
210
211 if (tolower(c) == tolower((unsigned char)*ignorestate)) {
212 ignorestate++;
213 if (*ignorestate == '\0') {
214 parser.getnext = getnext; /* restore */
215 return ' ';
216 }
217 } else {
218 ignorestate = endtag;
219 }
220
221 return ' ';
222 }
223
224 /* Clear string only; don't free, prevents unnecessary reallocation. */
225 static void
226 string_clear(String *s)
227 {
228 if (s->data)
229 s->data[0] = '\0';
230 s->len = 0;
231 }
232
233 static void
234 string_buffer_realloc(String *s, size_t newlen)
235 {
236 size_t alloclen;
237
238 for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
239 ;
240 if (!(s->data = realloc(s->data, alloclen)))
241 err(1, "realloc");
242 s->bufsiz = alloclen;
243 }
244
245 static void
246 string_append(String *s, const char *data, size_t len)
247 {
248 if (!len)
249 return;
250 /* check if allocation is necesary, don't shrink buffer,
251 * should be more than bufsiz ofcourse. */
252 if (s->len + len >= s->bufsiz)
253 string_buffer_realloc(s, s->len + len + 1);
254 memcpy(s->data + s->len, data, len);
255 s->len += len;
256 s->data[s->len] = '\0';
257 }
258
/* strdup() that exits the program with an error message instead of returning
   NULL. The caller owns the returned copy. */
char *
estrdup(const char *s)
{
	char *copy = strdup(s);

	if (copy == NULL)
		err(1, "strdup");
	return copy;
}
268
/* calloc() that exits the program with an error message instead of returning
   NULL. The caller owns the returned zero-filled memory. */
void *
ecalloc(size_t nmemb, size_t size)
{
	void *mem = calloc(nmemb, size);

	if (mem == NULL)
		err(1, "calloc");
	return mem;
}
278
279 static void
280 newline(void)
281 {
282 putchar('\n');
283 whitespace_mode &= ~2; /* no characters on this line yet */
284 ncharsline = 0;
285 }
286
287 static void
288 printansi(const char *s)
289 {
290 if (!allowansi)
291 return;
292 fputs(s, stdout);
293 }
294
295 /* print one character safely: no control characters */
296 static void
297 printc(int c)
298 {
299 if (isspace(c)) {
300 whitespace_mode |= 1;
301 } else {
302 if (whitespace_mode == 3) {
303 putchar(' ');
304 ncharsline++;
305 }
306
307 whitespace_mode = 2;
308 if (!iscntrl(c)) {
309 putchar(c);
310 ncharsline++;
311 }
312 }
313
314 if (softlinewrap) {
315 /* TODO: harder line-wrapping on "non-word" characters */
316 if (strchr(" \n\t", c) && ncharsline >= termwidth)
317 newline();
318 }
319 }
320
321 static struct node *
322 findparenttype(int cur, int findtype)
323 {
324 int i;
325
326 for (i = cur; i; i--) {
327 if ((nodes[i].tag.displaytype & findtype))
328 return &nodes[i];
329 }
330 return NULL;
331 }
332
333 /* Find nearest parent node belonging to type. For example a listitem -> list */
334 static struct node *
335 findparentoftype(int cur)
336 {
337 if (!nodes[cur].tag.parenttype)
338 return NULL;
339
340 return findparenttype(cur, nodes[cur].tag.parenttype);
341 }
342
/* Print at most `len` bytes of `s` to stdout, stopping early at a NUL byte.
   Control characters are dropped, except for tab and newline. */
static void
printsafe(const char *s, size_t len)
{
	size_t i;

	/* the length is checked before the byte is read, so nothing beyond
	   `len` bytes is accessed when the data is not NUL-terminated */
	for (i = 0; i < len && s[i] != '\0'; i++) {
		if (s[i] == '\t' || s[i] == '\n' ||
		    !iscntrl((unsigned char)s[i]))
			putchar(s[i]);
	}
}
360
361 int
362 parseuri(const char *s, struct uri *u, int rel)
363 {
364 const char *p = s, *b;
365 char *endptr = NULL;
366 size_t i;
367 unsigned long l;
368
369 u->proto[0] = u->host[0] = u->path[0] = u->port[0] = '\0';
370 if (!*s)
371 return 0;
372
373 /* prefix is "//", don't read protocol, skip to domain parsing */
374 if (!strncmp(p, "//", 2)) {
375 p += 2; /* skip "//" */
376 } else {
377 /* protocol part */
378 for (p = s; *p && (isalpha((unsigned char)*p) || isdigit((unsigned char)*p) ||
379 *p == '+' || *p == '-' || *p == '.'); p++)
380 ;
381 if (!strncmp(p, "://", 3)) {
382 if ((size_t)(p - s) >= sizeof(u->proto))
383 return -1; /* protocol too long */
384 memcpy(u->proto, s, p - s);
385 u->proto[p - s] = '\0';
386 p += 3; /* skip "://" */
387 } else {
388 p = s; /* no protocol format, set to start */
389 /* relative url: read rest as path, else as domain */
390 if (rel)
391 goto readpath;
392 }
393 }
394 /* IPv6 address */
395 if (*p == '[') {
396 /* bracket not found or host too long */
397 if (!(b = strchr(p, ']')) || (size_t)(b - p) < 3 ||
398 (size_t)(b - p) >= sizeof(u->host))
399 return -1;
400 memcpy(u->host, p, b - p + 1);
401 u->host[b - p + 1] = '\0';
402 p = b + 1;
403 } else {
404 /* domain / host part, skip until port, path or end. */
405 if ((i = strcspn(p, ":/")) >= sizeof(u->host))
406 return -1; /* host too long */
407 memcpy(u->host, p, i);
408 u->host[i] = '\0';
409 p = &p[i];
410 }
411 /* port */
412 if (*p == ':') {
413 if ((i = strcspn(++p, "/")) >= sizeof(u->port))
414 return -1; /* port too long */
415 memcpy(u->port, p, i);
416 u->port[i] = '\0';
417 /* check for valid port: range 1 - 65535 */
418 errno = 0;
419 l = strtoul(u->port, &endptr, 10);
420 if (errno || u->port[0] == '\0' || *endptr ||
421 !l || l > 65535)
422 return -1;
423 p = &p[i];
424 }
425 readpath:
426 if (u->host[0]) {
427 p = &p[strspn(p, "/")];
428 strlcpy(u->path, "/", sizeof(u->path));
429 } else {
430 /* absolute uri must have a host specified */
431 if (!rel)
432 return -1;
433 }
434 /* treat truncation as an error */
435 if (strlcat(u->path, p, sizeof(u->path)) >= sizeof(u->path))
436 return -1;
437 return 0;
438 }
439
/* Percent-encode `s` into `buf` (of `bufsiz` bytes): spaces, control
   characters and bytes above 127 become "%XX" with upper-case hex digits, all
   other bytes are copied as-is.
   Returns 0 on success and -1 when the NUL-terminated result does not fit;
   the contents of `buf` are unspecified in that case. */
static int
encodeuri(char *buf, size_t bufsiz, const char *s)
{
	static const char hexdigits[] = "0123456789ABCDEF";
	unsigned char ch;
	size_t out = 0;

	for (; *s != '\0'; s++) {
		ch = (unsigned char)*s;
		if (ch == ' ' || ch > 127 || iscntrl(ch)) {
			/* "%XX" and the terminating NUL must still fit */
			if (out + 3 >= bufsiz)
				return -1;
			buf[out++] = '%';
			buf[out++] = hexdigits[ch >> 4];
			buf[out++] = hexdigits[ch & 15];
		} else {
			if (out >= bufsiz)
				return -1;
			buf[out++] = *s;
		}
	}
	/* room for the terminating NUL */
	if (out >= bufsiz)
		return -1;
	buf[out] = '\0';

	return 0;
}
467
468 /* Get absolute uri; if `link` is relative use `base` to make it absolute.
469 * the returned string in `buf` is uri encoded, see: encodeuri(). */
470 int
471 absuri(char *buf, size_t bufsiz, const char *link, const char *base)
472 {
473 struct uri ulink, ubase;
474 char tmp[4096], *host, *p, *port;
475 int c, r;
476 size_t i;
477
478 buf[0] = '\0';
479 if (parseuri(base, &ubase, 0) == -1 ||
480 parseuri(link, &ulink, 1) == -1 ||
481 (!ulink.host[0] && !ubase.host[0]))
482 return -1;
483
484 if (!strncmp(link, "//", 2)) {
485 host = ulink.host;
486 port = ulink.port;
487 } else {
488 host = ulink.host[0] ? ulink.host : ubase.host;
489 port = ulink.port[0] ? ulink.port : ubase.port;
490 }
491 r = snprintf(tmp, sizeof(tmp), "%s://%s%s%s",
492 ulink.proto[0] ?
493 ulink.proto :
494 (ubase.proto[0] ? ubase.proto : "http"),
495 host,
496 port[0] ? ":" : "",
497 port);
498 if (r < 0 || (size_t)r >= sizeof(tmp))
499 return -1; /* error or truncation */
500
501 /* relative to root */
502 if (!ulink.host[0] && ulink.path[0] != '/') {
503 /* relative to base url path */
504 if (ulink.path[0]) {
505 if ((p = strrchr(ubase.path, '/'))) {
506 /* temporary null-terminate */
507 c = *(++p);
508 *p = '\0';
509 i = strlcat(tmp, ubase.path, sizeof(tmp));
510 *p = c; /* restore */
511 if (i >= sizeof(tmp))
512 return -1;
513 }
514 } else if (strlcat(tmp, ubase.path, sizeof(tmp)) >=
515 sizeof(tmp)) {
516 return -1;
517 }
518 }
519 if (strlcat(tmp, ulink.path, sizeof(tmp)) >= sizeof(tmp))
520 return -1;
521
522 return encodeuri(buf, bufsiz, tmp);
523 }
524
525 static void
526 xmlcdata(XMLParser *p, const char *data, size_t datalen)
527 {
528 struct node *cur;
529
530 cur = &nodes[curnode];
531 if (cur->tag.displaytype & DisplayNone)
532 return;
533
534 printsafe(data, datalen);
535 }
536
537 static void
538 xmldataend(XMLParser *p)
539 {
540 struct node *cur;
541 char *start, *s, *e;
542
543 if (!htmldata.data || !htmldata.len)
544 return;
545
546 cur = &nodes[curnode];
547
548 if ((cur->tag.displaytype & DisplayNone)) {
549 /* nothing */
550 } else if ((cur->tag.displaytype & DisplayPre) ||
551 findparenttype(curnode, DisplayPre)) {
552 /* if <pre> or inside it */
553 printsafe(htmldata.data, htmldata.len);
554 } else {
555 start = htmldata.data;
556 e = htmldata.data + htmldata.len;
557
558 for (s = start; s < e; s++)
559 printc((unsigned char)*s);
560 }
561
562 string_clear(&htmldata);
563 }
564
565 static void
566 xmldata(XMLParser *p, const char *data, size_t datalen)
567 {
568 struct node *cur;
569
570 cur = &nodes[curnode];
571 if (cur->tag.displaytype & DisplayNone)
572 return;
573
574 string_append(&htmldata, data, datalen);
575 }
576
577 static void
578 xmldataentity(XMLParser *p, const char *data, size_t datalen)
579 {
580 struct node *cur;
581 char buf[16];
582 int n;
583
584 cur = &nodes[curnode];
585 if (cur->tag.displaytype & DisplayNone)
586 return;
587
588 n = xml_entitytostr(data, buf, sizeof(buf));
589 if (n > 0)
590 xmldata(p, buf, (size_t)n);
591 else
592 xmldata(p, data, datalen);
593 }
594
595 int
596 tagcmp(const void *v1, const void *v2)
597 {
598 struct tag *t1 = (struct tag *)v1;
599 struct tag *t2 = (struct tag *)v2;
600
601 return strcasecmp(t1->name, t2->name);
602 }
603
604 struct tag *
605 findtag(const char *t)
606 {
607 struct tag find;
608
609 find.name = t;
610 return bsearch(&find, tags, sizeof(tags) / sizeof(*tags),
611 sizeof(*tags), tagcmp);
612 }
613
614 static void
615 tagend(struct node *cur)
616 {
617 const char *t;
618 size_t i;
619
620 t = cur->tag.name;
621
622 if (cur->tag.displaytype & DisplayBold)
623 printansi("\033[22m"); /* reset bold or faint */
624 if (cur->tag.displaytype & DisplayItalic)
625 printansi("\033[23m"); /* reset italic */
626 if (cur->tag.displaytype & DisplayUnderline)
627 printansi("\033[24m"); /* reset underline */
628 if (cur->tag.displaytype & DisplayBlink)
629 printansi("\033[25m"); /* reset blink */
630 if (cur->tag.displaytype & DisplayReverse)
631 printansi("\033[27m"); /* reset reverse */
632 if (cur->tag.displaytype & DisplayStrike)
633 printansi("\033[29m"); /* reset strike */
634
635 if (cur->tag.displaytype & DisplayBlock) {
636 newline();
637 } else if (cur->tag.displaytype & DisplayPre) {
638 newline();
639 } else if (cur->tag.displaytype & DisplayTable) {
640 newline();
641 } else if (cur->tag.displaytype & DisplayList) {
642 newline();
643 } else if (cur->tag.displaytype & DisplayListItem) {
644 newline();
645 } else if (cur->tag.displaytype & DisplayHeader) {
646 newline();
647 #if 1
648 if (t[0] == 'h' && t[1] >= '1' && t[1] <= '6' && t[2] == '\0') {
649 if (t[1] >= '3')
650 for (i = 0; i < termwidth; i++)
651 putchar('-');
652 else if (t[1] >= '1')
653 for (i = 0; i < termwidth; i++)
654 putchar('=');
655 newline();
656 }
657 #endif
658 }
659
660 }
661
662 static void
663 xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
664 {
665 struct tag *found;
666 int i;
667
668 /* ignore closing of void elements, like </br>, which is not allowed */
669 if ((found = findtag(t))) {
670 if (!isshort && found->isvoid)
671 return;
672 }
673
674 /* if the current closing tag matches the current open tag */
675 if (nodes[curnode].tag.name &&
676 !strcasecmp(nodes[curnode].tag.name, t)) {
677 tagend(&nodes[curnode]);
678 if (curnode)
679 curnode--;
680 } else {
681 /* ... else lookup the first matching start tag. This is also
682 for handling optional closing tags */
683 for (i = curnode; i > 0; i--) {
684 if (nodes[curnode].tag.name &&
685 !strcasecmp(nodes[i].tag.name, t)) {
686 tagend(&nodes[i]);
687 curnode = i;
688 break;
689 }
690 }
691 if (curnode)
692 curnode--;
693 }
694 }
695
696 /* check if the specified tag is closed at some point in the current tree */
697 static int
698 istagclosed(int cur)
699 {
700 int i;
701
702 if (!cur)
703 return 0;
704 for (i = cur - 1; i > 0; i--) {
705 if (!strcasecmp(nodes[i].tag.name, nodes[cur].tag.name))
706 return 0;
707 }
708 return 1;
709 }
710
711 static void
712 xmltagstart(XMLParser *p, const char *t, size_t tl)
713 {
714 struct tag *found;
715 struct node *cur, *parent;
716 char *s;
717
718 if (curnode >= MAX_DEPTH - 2)
719 errx(1, "max tag depth reached: %d\n", curnode);
720 parent = &nodes[curnode];
721 curnode++;
722
723 cur = &nodes[curnode];
724 memset(cur, 0, sizeof(*cur));
725 /* tag defaults */
726 cur->tag.displaytype = DisplayInline;
727 cur->tag.name = cur->tagname;
728 strlcpy(cur->tagname, t, sizeof(cur->tagname));
729 /* to lowercase */
730 for (s = cur->tagname; *s; s++)
731 *s = tolower((unsigned char)*s);
732
733 /* match tag */
734 if ((found = findtag(t))) {
735 cur->nchildren = 0;
736 memcpy(&(cur->tag), found, sizeof(*found));
737
738 if (cur->tag.isoptional && curnode && !istagclosed(curnode)) {
739 /* if it's an unclosed tag and it has parent (like ol, ul)
740 then fake the end tag. */
741 tagend(&nodes[curnode]);
742 }
743
744 /* parent tag is hidden, so hide ourself too */
745 if (parent->tag.displaytype & DisplayNone)
746 cur->tag.displaytype |= DisplayNone;
747 return;
748 }
749
750 src[0] = '\0'; /* reset src, href */
751 }
752
753 static void
754 xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
755 {
756 struct node *cur, *parent = NULL;
757 char absurl[1024];
758 int i;
759
760 /* temporary replace the callback except the reader and end of tag
761 restore the context once we receive the same ignored tag in the
762 end tag handler */
763 if (!strcasecmp(t, "script")) {
764 ignorestate = endtag = "</script>";
765 getnext = p->getnext; /* for restore */
766 p->getnext = getnext_ignore;
767 return;
768 } else if (!strcasecmp(t, "style")) {
769 ignorestate = endtag = "</style>";
770 getnext = p->getnext; /* for restore */
771 p->getnext = getnext_ignore;
772 return;
773 }
774
775 cur = &nodes[curnode];
776 if (cur->tag.displaytype & DisplayNone)
777 return;
778
779 /* show links as reference at the bottom */
780 if (showlinkrefs && src[0]) {
781 absurl[0] = '\0';
782 if (!strcasecmp(t, "a")) {
783 if (!strncmp(src, "mailto:", sizeof("mailto:") - 1))
784 strlcpy(absurl, src, sizeof(absurl));
785 else if (!strncmp(src, "tel:", sizeof("tel:") - 1))
786 strlcpy(absurl, src, sizeof(absurl));
787 }
788 if (!absurl[0] && absuri(absurl, sizeof(absurl), src, basehref) == -1)
789 absurl[0] = '\0';
790 if (absurl[0]) {
791 if (!links_head)
792 links_cur = links_head = ecalloc(1, sizeof(*links_head));
793 else
794 links_cur = links_cur->next = ecalloc(1, sizeof(*links_head));
795 links_cur->url = estrdup(absurl);
796
797 printf(" [%d]", ++linkcount);
798 links_cur->type = estrdup(t);
799 }
800 src[0] = '\0';
801 }
802
803 /* find first parent node of type and increase child node count */
804 if (cur->tag.parenttype && (parent = findparentoftype(curnode)))
805 parent->nchildren++;
806
807 if (cur->tag.displaytype & DisplayBlock) {
808 newline();
809 } else if (cur->tag.displaytype & DisplayHeader) {
810 newline();
811 } else if (cur->tag.displaytype & DisplayTableRow) {
812 newline();
813 } else if (cur->tag.displaytype & DisplayList) {
814 newline();
815 } else if (cur->tag.displaytype & DisplayTableCell) {
816 if (parent && parent->nchildren > 1)
817 fputs("\t", stdout);
818 } else if (cur->tag.displaytype & DisplayListItem) {
819 /* indent nested list items */
820 for (i = curnode; i; i--) {
821 if (nodes[i].tag.displaytype & DisplayListItem)
822 continue;
823 if (nodes[i].tag.displaytype & DisplayList)
824 fputs(" ", stdout);
825 }
826 /* find first parent node and ordered numbers or unordered */
827 if (parent) {
828 if (parent->tag.displaytype & DisplayListOrdered)
829 printf("%zu. ", parent->nchildren);
830 else
831 fputs("\xe2\x80\xa2 ", stdout);
832 }
833 }
834
835 if (cur->tag.displaytype & DisplayBold)
836 printansi("\033[1m");
837 if (cur->tag.displaytype & DisplayItalic)
838 printansi("\033[3m");
839 if (cur->tag.displaytype & DisplayUnderline)
840 printansi("\033[4m");
841 if (cur->tag.displaytype & DisplayBlink)
842 printansi("\033[5m");
843 if (cur->tag.displaytype & DisplayReverse)
844 printansi("\033[7m");
845 if (cur->tag.displaytype & DisplayStrike)
846 printansi("\033[9m");
847
848 if (!strcasecmp(t, "hr")) { /* ruler */
849 for (i = 0; i < termwidth; i++)
850 putchar('-');
851 } else if (!strcasecmp(t, "br")) {
852 newline();
853 }
854
855 /* autoclose tags, such as <br>, pretend we are <br/> */
856 if (!isshort && cur->tag.isvoid)
857 xmltagend(p, t, tl, 1);
858 }
859
860 static void
861 xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name,
862 size_t namelen, const char *value, size_t valuelen)
863 {
864 struct node *cur;
865
866 cur = &nodes[curnode];
867 if (cur->tag.displaytype & DisplayNone)
868 return;
869
870 /* hide tags with attribute aria-hidden or hidden */
871 if (!strcasecmp(name, "aria-hidden") || !strcasecmp(name, "hidden"))
872 cur->tag.displaytype |= DisplayNone;
873
874 if (cur->tag.displaytype & DisplayNone)
875 return;
876
877 if (!strcasecmp(tag, "a") && !strcasecmp(name, "href") && valuelen)
878 strlcpy(src, value, sizeof(src));
879
880 if ((!strcasecmp(tag, "img") || !strcasecmp(tag, "video") ||
881 !strcasecmp(tag, "audio")) &&
882 !strcasecmp(name, "src") && valuelen)
883 strlcpy(src, value, sizeof(src));
884
885 /* show img alt attribute as text. */
886 if (!strcasecmp(tag, "img") && !strcasecmp(name, "alt"))
887 printsafe(value, strlen(value));
888 }
889
890 void
891 printlinkrefs(void)
892 {
893 size_t i;
894
895 if (!links_head)
896 return;
897
898 printf("\n\nLink references:\n");
899
900 /* TODO: add title attribute or some basic description? */
901 for (i = 1, links_cur = links_head; links_cur; links_cur = links_cur->next, i++)
902 printf("[%zu] - %s (%s)\n", i, links_cur->url, links_cur->type);
903 }
904
905 void
906 usage(void)
907 {
908 fprintf(stderr, "%s [-alr] [-b basehref] [-w termwidth]\n", argv0);
909 exit(1);
910 }
911
912 int
913 main(int argc, char **argv)
914 {
915 if (pledge("stdio", NULL) < 0)
916 err(1, "pledge");
917
918 ARGBEGIN {
919 case 'a':
920 allowansi = !allowansi;
921 break;
922 case 'b':
923 basehref = EARGF(usage());
924 break;
925 case 'l':
926 showlinkrefs = !showlinkrefs;
927 break;
928 case 'r':
929 softlinewrap = !softlinewrap;
930 break;
931 case 'w':
932 if ((termwidth = strtol(EARGF(usage()), NULL, 10)) < 1)
933 usage();
934 break;
935 default:
936 usage();
937 } ARGEND
938
939 parser.xmlattr = xmlattr;
940 parser.xmlcdata = xmlcdata;
941 parser.xmldata = xmldata;
942 parser.xmldataend = xmldataend;
943 parser.xmldataentity = xmldataentity;
944 parser.xmltagstart = xmltagstart;
945 parser.xmltagend = xmltagend;
946 parser.xmltagstartparsed = xmltagstartparsed;
947
948 parser.getnext = getchar;
949 xml_parse(&parser);
950
951 if (showlinkrefs)
952 printlinkrefs();
953
954 if (ncharsline)
955 putchar('\n');
956
957 printansi("\033[0m"); /* reset all attributes */
958
959 return 0;
960 }