various improvements - webdump - HTML to plain-text converter for webpages
(HTM) git clone git://git.codemadness.org/webdump
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit 89c9108dc27fe27e0f028f67508a1156ed242d2a
(DIR) parent 62884d7b5684e791bb0cd6466f74367d6d71618d
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Thu, 14 Sep 2023 22:31:03 +0200
various improvements
- add a unique tagid number per tag. This allows checking by tag number.
- add support for link references from the src attribute of <frame>, <iframe> and <embed>.
- improve checking for open optional <p> tags when a block element (such as
<section>) is open.
- check if the base URI using the -b option is absolute.
Diffstat:
M webdump.1 | 3 ++-
M webdump.c | 430 +++++++++++++++++--------------
2 files changed, 245 insertions(+), 188 deletions(-)
---
(DIR) diff --git a/webdump.1 b/webdump.1
@@ -1,4 +1,4 @@
-.Dd September 12, 2023
+.Dd September 14, 2023
.Dt WEBDUMP 1
.Os
.Sh NAME
@@ -18,6 +18,7 @@ It converts and writes the output as plain-text to stdout.
A
.Ar baseurl
can be specified if the links in the feed are relative URLs.
+This must be an absolute URI.
.Bl -tag -width Ds
.It Fl 8
Use UTF-8 symbols for certain items like bullet items and rulers to make the
(DIR) diff --git a/webdump.c b/webdump.c
@@ -53,19 +53,6 @@ static int termwidth = 77; /* terminal width */
static int resources = 0; /* write resources line-by-line to fd 3? */
static int uniqrefs = 0; /* number unique references */
-/* linked-list of link references */
-struct linkref {
- char *type;
- char *url;
- int ishidden;
- size_t linknr;
- struct linkref *next;
-};
-
-static struct linkref *links_head;
-static struct linkref *links_cur;
-static int linkcount; /* visible link count */
-
enum DisplayType {
DisplayUnknown = 0,
DisplayInline = 1 << 0,
@@ -106,8 +93,22 @@ typedef struct string {
size_t bufsiz; /* allocated size */
} String;
+enum TagId { TagA = 1, TagAddress, TagArea, TagArticle, TagAside, TagAudio,
+TagB, TagBase, TagBlink, TagBlockquote, TagBody, TagBr, TagButton, TagCite,
+TagCol, TagColgroup, TagDatalist, TagDd, TagDel, TagDetails, TagDfn, TagDir,
+TagDiv, TagDl, TagDt, TagEm, TagEmbed, TagFieldset, TagFigcaption, TagFigure,
+TagFooter, TagForm, TagFrame, TagH1, TagH2, TagH3, TagH4, TagH5, TagH6,
+TagHead, TagHeader, TagHr, TagHtml, TagI, TagIframe, TagImg, TagInput, TagIns,
+TagLabel, TagLegend, TagLi, TagLink, TagMain, TagMark, TagMenu, TagMeta,
+TagNav, TagObject, TagOl, TagOption, TagP, TagParam, TagPre, TagS, TagScript,
+TagSearch, TagSection, TagSelect, TagSource, TagStrike, TagStrong, TagStyle,
+TagSummary, TagTable, TagTbody, TagTd, TagTemplate, TagTextarea, TagTfoot,
+TagTh, TagThead, TagTitle, TagTr, TagTrack, TagU, TagUl, TagVar, TagVideo,
+TagWbr, TagXmp };
+
struct tag {
const char *name;
+ enum TagId id;
enum DisplayType displaytype;
enum MarkupType markuptype; /* ANSI markup */
enum DisplayType parenttype; /* display type belonging to element */
@@ -150,6 +151,20 @@ struct selectors {
size_t count;
};
+/* linked-list of link references */
+struct linkref {
+ char *type;
+ enum TagId tagid;
+ char *url;
+ int ishidden;
+ size_t linknr;
+ struct linkref *next;
+};
+
+static struct linkref *links_head;
+static struct linkref *links_cur;
+static int linkcount; /* visible link count */
+
static const char *str_bullet_item = "* ";
static const char *str_checkbox_checked = "x";
static const char *str_ruler = "-";
@@ -212,96 +227,100 @@ static enum MarkupType curmarkup;
/* selector to match */
static struct selectors *sel_hide, *sel_show;
-/* tag displaytype markup parent v o b a i */
+/* tags table: needs to be sorted like tagcmp(), alphabetically */
+
+/* tag id displaytype markup parent v o b a i */
static struct tag tags[] = {
-{ "a", DisplayInline, MarkupUnderline, 0, 0, 0, 0, 0, 0 },
-{ "address", DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
-{ "area", DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
-{ "article", DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
-{ "aside", DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
-{ "audio", DisplayInline, MarkupUnderline, 0, 0, 0, 0, 0, 0 },
-{ "b", DisplayInline, MarkupBold, 0, 0, 0, 0, 0, 0 },
-{ "base", DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
-{ "blink", DisplayInline, MarkupBlink, 0, 0, 0, 0, 0, 0 },
-{ "blockquote", DisplayBlock, 0, 0, 0, 0, 0, 0, 2 },
-{ "body", DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
-{ "br", 0, 0, 0, 1, 0, 0, 0, 0 },
-{ "button", DisplayInline | DisplayButton, 0, 0, 0, 0, 0, 0, 0 },
-{ "cite", DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 },
-{ "col", DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
-{ "colgroup", DisplayInline, 0, 0, 0, 1, 0, 0, 0 },
-{ "datalist", DisplayNone, 0, 0, 0, 0, 0, 0, 0 },
-{ "dd", DisplayBlock, 0, 0, 0, 1, 0, 0, 4 },
-{ "del", DisplayInline, MarkupStrike, 0, 0, 0, 0, 0, 0 },
-{ "details", DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
-{ "dfn", DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 },
-{ "dir", DisplayList, 0, 0, 0, 0, 1, 1, 2 },
-{ "div", DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
-{ "dl", DisplayBlock | DisplayDl, 0, 0, 0, 0, 0, 0, 0 },
-{ "dt", DisplayBlock, MarkupBold, 0, 0, 1, 0, 0, 0 },
-{ "em", DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 },
-{ "embed", DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
-{ "fieldset", DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
-{ "figcaption", DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
-{ "figure", DisplayBlock, 0, 0, 0, 0, 1, 1, 4 },
-{ "footer", DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
-{ "form", DisplayBlock, 0, 0, 0, 0, 0, 1, 0 },
-{ "h1", DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT },
-{ "h2", DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT },
-{ "h3", DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT },
-{ "h4", DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT },
-{ "h5", DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT },
-{ "h6", DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT },
-{ "head", DisplayBlock, 0, 0, 0, 1, 0, 0, 0 },
-{ "header", DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
-{ "hr", DisplayBlock, 0, 0, 1, 0, 0, 0, 0 },
-{ "html", DisplayBlock, 0, 0, 0, 1, 0, 0, 0 },
-{ "i", DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 },
-{ "img", DisplayInline, MarkupUnderline, 0, 1, 0, 0, 0, 0 },
-{ "input", DisplayInput, 0, 0, 1, 0, 0, 0, 0 },
-{ "ins", DisplayInline, MarkupUnderline, 0, 0, 0, 0, 0, 0 },
-{ "label", DisplayInline, 0, 0, 0, 0, 0, 0, 0 },
-{ "legend", DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
-{ "li", DisplayListItem, 0, DisplayList, 0, 1, 0, 0, 0 },
-{ "link", DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
-{ "main", DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
-{ "mark", DisplayInline, MarkupReverse, 0, 0, 0, 0, 0, 0 },
-{ "menu", DisplayList, 0, 0, 0, 0, 1, 1, 2 },
-{ "meta", DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
-{ "nav", DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
-{ "object", DisplayInline, 0, 0, 0, 0, 0, 0, 0 },
-{ "ol", DisplayList | DisplayListOrdered, 0, 0, 0, 0, 1, 1, 0 },
-{ "option", DisplayInline | DisplayOption, 0, 0, 0, 1, 0, 0, 0 },
-{ "p", DisplayBlock, 0, 0, 0, 1, 1, 1, 0 },
-{ "param", DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
-{ "pre", DisplayPre, 0, 0, 0, 0, 1, 1, 4 },
-{ "s", DisplayInline, MarkupStrike, 0, 0, 0, 0, 0, 0 },
-{ "search", DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
-{ "script", DisplayNone, 0, 0, 0, 0, 0, 0, 0 },
-{ "section", DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
-{ "select", DisplayInline | DisplaySelect, 0, 0, 0, 0, 0, 0, 0 },
-{ "source", DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
-{ "strike", DisplayInline, MarkupStrike, 0, 0, 0, 0, 0, 0 },
-{ "strong", DisplayInline, MarkupBold, 0, 0, 0, 0, 0, 0 },
-{ "style", DisplayNone, 0, 0, 0, 0, 0, 0, 0 },
-{ "summary", DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
-{ "table", DisplayTable, 0, 0, 0, 0, 0, 0, 0 },
-{ "tbody", DisplayInline, 0, DisplayTable, 0, 1, 0, 0, 0 },
-{ "td", DisplayTableCell, 0, DisplayTableRow, 0, 1, 0, 0, 0 },
-{ "template", DisplayNone, 0, 0, 0, 0, 0, 0, 0 },
-{ "textarea", DisplayInline, 0, 0, 0, 0, 0, 0, 0 },
-{ "tfoot", DisplayInline, 0, DisplayTable, 0, 1, 0, 0, 0 },
-{ "th", DisplayTableCell, MarkupBold, DisplayTableRow, 0, 1, 0, 0, 0 },
-{ "thead", DisplayInline, 0, DisplayTable, 0, 1, 0, 0, 0 },
-{ "title", DisplayBlock, 0, 0, 0, 0, 0, 1, -DEFAULT_INDENT },
-{ "tr", DisplayTableRow, 0, DisplayTable, 0, 1, 0, 0, 0 },
-{ "track", DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
-{ "u", DisplayInline, MarkupUnderline, 0, 0, 0, 0, 0, 0 },
-{ "ul", DisplayList, 0, 0, 0, 0, 1, 1, 2 },
-{ "var", DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 },
-{ "video", DisplayInline, MarkupUnderline, 0, 0, 0, 0, 0, 0 },
-{ "wbr", DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
-{ "xmp", DisplayPre, 0, 0, 0, 0, 1, 1, 4 }
+{ "a", TagA, DisplayInline, MarkupUnderline, 0, 0, 0, 0, 0, 0 },
+{ "address", TagAddress, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
+{ "area", TagArea, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
+{ "article", TagArticle, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
+{ "aside", TagAside, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
+{ "audio", TagAudio, DisplayInline, MarkupUnderline, 0, 0, 0, 0, 0, 0 },
+{ "b", TagB, DisplayInline, MarkupBold, 0, 0, 0, 0, 0, 0 },
+{ "base", TagBase, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
+{ "blink", TagBlink, DisplayInline, MarkupBlink, 0, 0, 0, 0, 0, 0 },
+{ "blockquote", TagBlockquote, DisplayBlock, 0, 0, 0, 0, 0, 0, 2 },
+{ "body", TagBody, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
+{ "br", TagBr, 0, 0, 0, 1, 0, 0, 0, 0 },
+{ "button", TagButton, DisplayInline | DisplayButton, 0, 0, 0, 0, 0, 0, 0 },
+{ "cite", TagCite, DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 },
+{ "col", TagCol, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
+{ "colgroup", TagColgroup, DisplayInline, 0, 0, 0, 1, 0, 0, 0 },
+{ "datalist", TagDatalist, DisplayNone, 0, 0, 0, 0, 0, 0, 0 },
+{ "dd", TagDd, DisplayBlock, 0, 0, 0, 1, 0, 0, 4 },
+{ "del", TagDel, DisplayInline, MarkupStrike, 0, 0, 0, 0, 0, 0 },
+{ "details", TagDetails, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
+{ "dfn", TagDfn, DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 },
+{ "dir", TagDir, DisplayList, 0, 0, 0, 0, 1, 1, 2 },
+{ "div", TagDiv, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
+{ "dl", TagDl, DisplayBlock | DisplayDl, 0, 0, 0, 0, 0, 0, 0 },
+{ "dt", TagDt, DisplayBlock, MarkupBold, 0, 0, 1, 0, 0, 0 },
+{ "em", TagEm, DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 },
+{ "embed", TagEmbed, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
+{ "fieldset", TagFieldset, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
+{ "figcaption", TagFigcaption, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
+{ "figure", TagFigure, DisplayBlock, 0, 0, 0, 0, 1, 1, 4 },
+{ "footer", TagFooter, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
+{ "form", TagForm, DisplayBlock, 0, 0, 0, 0, 0, 1, 0 },
+{ "frame", TagFrame, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
+{ "h1", TagH1, DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT },
+{ "h2", TagH2, DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT },
+{ "h3", TagH3, DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT },
+{ "h4", TagH4, DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT },
+{ "h5", TagH5, DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT },
+{ "h6", TagH6, DisplayHeader, MarkupBold, 0, 0, 0, 1, 1, -DEFAULT_INDENT },
+{ "head", TagHead, DisplayBlock, 0, 0, 0, 1, 0, 0, 0 },
+{ "header", TagHeader, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
+{ "hr", TagHr, DisplayBlock, 0, 0, 1, 0, 0, 0, 0 },
+{ "html", TagHtml, DisplayBlock, 0, 0, 0, 1, 0, 0, 0 },
+{ "i", TagI, DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 },
+{ "iframe", TagIframe, DisplayInline, 0, 0, 0, 0, 0, 0, 0 },
+{ "img", TagImg, DisplayInline, MarkupUnderline, 0, 1, 0, 0, 0, 0 },
+{ "input", TagInput, DisplayInput, 0, 0, 1, 0, 0, 0, 0 },
+{ "ins", TagIns, DisplayInline, MarkupUnderline, 0, 0, 0, 0, 0, 0 },
+{ "label", TagLabel, DisplayInline, 0, 0, 0, 0, 0, 0, 0 },
+{ "legend", TagLegend, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
+{ "li", TagLi, DisplayListItem, 0, DisplayList, 0, 1, 0, 0, 0 },
+{ "link", TagLink, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
+{ "main", TagMain, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
+{ "mark", TagMark, DisplayInline, MarkupReverse, 0, 0, 0, 0, 0, 0 },
+{ "menu", TagMenu, DisplayList, 0, 0, 0, 0, 1, 1, 2 },
+{ "meta", TagMeta, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
+{ "nav", TagNav, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
+{ "object", TagObject, DisplayInline, 0, 0, 0, 0, 0, 0, 0 },
+{ "ol", TagOl, DisplayList | DisplayListOrdered, 0, 0, 0, 0, 1, 1, 0 },
+{ "option", TagOption, DisplayInline | DisplayOption, 0, 0, 0, 1, 0, 0, 0 },
+{ "p", TagP, DisplayBlock, 0, 0, 0, 1, 1, 1, 0 },
+{ "param", TagParam, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
+{ "pre", TagPre, DisplayPre, 0, 0, 0, 0, 1, 1, 4 },
+{ "s", TagS, DisplayInline, MarkupStrike, 0, 0, 0, 0, 0, 0 },
+{ "script", TagScript, DisplayNone, 0, 0, 0, 0, 0, 0, 0 },
+{ "search", TagSearch, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
+{ "section", TagSection, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
+{ "select", TagSelect, DisplayInline | DisplaySelect, 0, 0, 0, 0, 0, 0, 0 },
+{ "source", TagSource, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
+{ "strike", TagStrike, DisplayInline, MarkupStrike, 0, 0, 0, 0, 0, 0 },
+{ "strong", TagStrong, DisplayInline, MarkupBold, 0, 0, 0, 0, 0, 0 },
+{ "style", TagStyle, DisplayNone, 0, 0, 0, 0, 0, 0, 0 },
+{ "summary", TagSummary, DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
+{ "table", TagTable, DisplayTable, 0, 0, 0, 0, 0, 0, 0 },
+{ "tbody", TagTbody, DisplayInline, 0, DisplayTable, 0, 1, 0, 0, 0 },
+{ "td", TagTd, DisplayTableCell, 0, DisplayTableRow, 0, 1, 0, 0, 0 },
+{ "template", TagTemplate, DisplayNone, 0, 0, 0, 0, 0, 0, 0 },
+{ "textarea", TagTextarea, DisplayInline, 0, 0, 0, 0, 0, 0, 0 },
+{ "tfoot", TagTfoot, DisplayInline, 0, DisplayTable, 0, 1, 0, 0, 0 },
+{ "th", TagTh, DisplayTableCell, MarkupBold, DisplayTableRow, 0, 1, 0, 0, 0 },
+{ "thead", TagThead, DisplayInline, 0, DisplayTable, 0, 1, 0, 0, 0 },
+{ "title", TagTitle, DisplayBlock, 0, 0, 0, 0, 0, 1, -DEFAULT_INDENT },
+{ "tr", TagTr, DisplayTableRow, 0, DisplayTable, 0, 1, 0, 0, 0 },
+{ "track", TagTrack, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
+{ "u", TagU, DisplayInline, MarkupUnderline, 0, 0, 0, 0, 0, 0 },
+{ "ul", TagUl, DisplayList, 0, 0, 0, 0, 1, 1, 2 },
+{ "var", TagVar, DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 },
+{ "video", TagVideo, DisplayInline, MarkupUnderline, 0, 0, 0, 0, 0, 0 },
+{ "wbr", TagWbr, DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
+{ "xmp", TagXmp, DisplayPre, 0, 0, 0, 0, 1, 1, 4 }
};
/* hint for compilers and static analyzers that a function exits */
@@ -1374,9 +1393,10 @@ findlinkref(const char *url)
}
static struct linkref *
-addlinkref(const char *url, const char *_type, int ishidden, int linknr)
+addlinkref(const char *url, const char *_type, enum TagId tagid, int ishidden,
+ int linknr)
{
- if (!tagcmp(_type, "a"))
+ if (tagid == TagA)
_type = "link";
/* add to linked list */
@@ -1386,6 +1406,7 @@ addlinkref(const char *url, const char *_type, int ishidden, int linknr)
links_cur = links_cur->next = ecalloc(1, sizeof(*links_head));
links_cur->url = estrdup(url);
links_cur->type = estrdup(_type);
+ links_cur->tagid = tagid;
links_cur->ishidden = ishidden;
links_cur->linknr = linknr;
@@ -1441,7 +1462,7 @@ handleinlinelink(void)
/* add hidden links directly to the reference,
the order doesn't matter */
if (cur->tag.displaytype & DisplayNone)
- addlinkref(url, cur->tag.name, 1, 0);
+ addlinkref(url, cur->tag.name, cur->tag.id, 1, 0);
}
void
@@ -1658,7 +1679,7 @@ endnode(struct node *cur)
if (!ref) {
linkcount++;
ref = addlinkref(nodes_links[curnode].data,
- cur->tag.name, ishidden, linkcount);
+ cur->tag.name, cur->tag.id, ishidden, linkcount);
}
if (showrefinline || showurlinline) {
@@ -1669,7 +1690,7 @@ endnode(struct node *cur)
if (showrefinline)
hprintf("[%zu]", ref->linknr);
if (showurlinline) {
- if (!tagcmp("link", ref->type))
+ if (ref->tagid == TagA)
hprintf("[%s]", ref->url);
else
hprintf("[%s: %s]", ref->type, ref->url);
@@ -1687,7 +1708,7 @@ static void
xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
{
struct tag *found, *tag;
- char *child, *childs[16];
+ enum TagId child, childs[16];
size_t nchilds;
int i, j, k, nchildfound, parenttype;
@@ -1701,35 +1722,39 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
in reality the optional tag rules are more complex, see:
https://html.spec.whatwg.org/multipage/syntax.html#optional-tags */
- child = NULL;
+ child = 0;
nchilds = 0;
nchildfound = 0;
- parenttype = 0;
+ parenttype = 0; /* by default, seek until the root */
if (found && found->displaytype & DisplayPre) {
skipinitialws = 0; /* do not skip white-space, for margins */
} else if (found && found->displaytype & DisplayList) {
- childs[0] = "li";
+ childs[0] = TagLi;
nchilds = 1;
parenttype = DisplayList;
} else if (found && found->displaytype & DisplayTableRow) {
- childs[0] = "td";
+ childs[0] = TagTd;
nchilds = 1;
parenttype = DisplayTableRow;
} else if (found && found->displaytype & DisplayTable) {
- childs[0] = "td";
+ childs[0] = TagTd;
nchilds = 1;
parenttype = DisplayTable;
} else if (found && found->displaytype & DisplaySelect) {
- childs[0] = "option";
+ childs[0] = TagOption;
nchilds = 1;
parenttype = DisplaySelect;
} else if (found && found->displaytype & DisplayDl) {
- childs[0] = "p";
- childs[1] = "dd";
- childs[2] = "dt";
+ childs[0] = TagP;
+ childs[1] = TagDd;
+ childs[2] = TagDt;
nchilds = 3;
parenttype = DisplayDl;
+ } else if (found && found->displaytype & DisplayBlock) {
+ childs[0] = TagP;
+ nchilds = 1;
+ parenttype = 0; /* seek until the root */
}
if (nchilds > 0) {
@@ -1740,7 +1765,7 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
break;
for (j = 0; j < nchilds; j++) {
child = childs[j];
- if (!tagcmp(nodes[i].tag.name, child)) {
+ if (nodes[i].tag.id == child) {
/* fake closing the previous tags */
for (k = curnode; k >= i; k--)
endnode(&nodes[k]);
@@ -1794,7 +1819,8 @@ xmltagstart(XMLParser *p, const char *t, size_t tl)
{
struct tag *found;
struct node *cur;
- char *child, *childs[16];
+ enum TagId tagid;
+ enum TagId child, childs[16];
size_t nchilds;
char *s;
int i, j, k, nchildfound, parenttype;
@@ -1821,55 +1847,56 @@ xmltagstart(XMLParser *p, const char *t, size_t tl)
in reality the optional tag rules are more complex, see:
https://html.spec.whatwg.org/multipage/syntax.html#optional-tags */
- child = NULL;
+ child = 0;
nchilds = 0;
nchildfound = 0;
- parenttype = 0;
+ parenttype = 0; /* by default, seek until the root */
/* if optional tag <p> is open and a list element is found, close </p>. */
if (found && found->displaytype & DisplayList) {
/* not inside a list */
- childs[0] = "p";
+ childs[0] = TagP;
nchilds = 1;
parenttype = DisplayList;
} else if (found && found->isoptional) {
- if (!tagcmp(t, "li")) {
- childs[0] = "li";
+ tagid = found->id;
+ if (tagid == TagLi) {
+ childs[0] = TagLi;
nchilds = 1;
parenttype = DisplayList;
- } else if (!tagcmp(t, "td")) {
- childs[0] = "td";
+ } else if (tagid == TagTd) {
+ childs[0] = TagTd;
nchilds = 1;
parenttype = DisplayTableRow;
- } else if (!tagcmp(t, "tr")) {
- childs[0] = "tr";
+ } else if (tagid == TagTr) {
+ childs[0] = TagTr;
nchilds = 1;
parenttype = DisplayTable;
- } else if (!tagcmp(t, "p")) {
- childs[0] = "p";
+ } else if (tagid == TagP) {
+ childs[0] = TagP;
nchilds = 1;
parenttype = 0; /* seek until the root */
- } else if (!tagcmp(t, "option")) {
- childs[0] = "option";
+ } else if (tagid == TagOption) {
+ childs[0] = TagOption;
nchilds = 1;
parenttype = DisplaySelect;
- } else if (!tagcmp(t, "dt")) {
- childs[0] = "dd";
+ } else if (tagid == TagDt) {
+ childs[0] = TagDd;
nchilds = 1;
parenttype = DisplayDl;
- } else if (!tagcmp(t, "dd")) {
- childs[0] = "dd";
- childs[1] = "dt";
+ } else if (tagid == TagDd) {
+ childs[0] = TagDd;
+ childs[1] = TagDt;
nchilds = 2;
parenttype = DisplayDl;
- } else if (!tagcmp(t, cur->tag.name)) {
+ } else if (tagid == cur->tag.id) {
/* fake closing the previous tag if it is the same and repeated */
xmltagend(p, t, tl, 0);
}
} else if (found && found->displaytype & DisplayBlock) {
/* check if we have an open "<p>" tag */
- childs[0] = "p";
- childs[1] = "dl";
+ childs[0] = TagP;
+ childs[1] = TagDl;
nchilds = 2;
parenttype = DisplayDl;
}
@@ -1882,7 +1909,7 @@ xmltagstart(XMLParser *p, const char *t, size_t tl)
break;
for (j = 0; j < nchilds; j++) {
child = childs[j];
- if (!tagcmp(nodes[i].tag.name, child)) {
+ if (nodes[i].tag.id == child) {
/* fake closing the previous tags */
for (k = curnode; k >= i; k--)
xmltagend(p, nodes[k].tag.name, strlen(nodes[k].tag.name), 0);
@@ -1917,19 +1944,26 @@ xmltagstart(XMLParser *p, const char *t, size_t tl)
static void
xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
{
+ struct tag *found;
+ enum TagId tagid;
struct node *cur, *parent;
int i, margintop;
+ /* match tag */
+ tagid = 0;
+ if ((found = findtag(t)))
+ tagid = found->id;
+
/* temporary replace the callback except the reader and end of tag
restore the context once we receive the same ignored tag in the
end tag handler */
- if (!tagcmp(t, "script")) {
+ if (tagid == TagScript) {
ignorestate = endtag = "</script>";
getnext = p->getnext; /* for restore */
p->getnext = getnext_ignore;
xmltagend(p, t, tl, 0); /* fake the call the tag was ended */
return;
- } else if (!tagcmp(t, "style")) {
+ } else if (tagid == TagStyle) {
ignorestate = endtag = "</style>";
getnext = p->getnext; /* for restore */
p->getnext = getnext_ignore;
@@ -2089,12 +2123,12 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
the node */
cur->hasdata = 0;
- if (!tagcmp(t, "hr")) { /* ruler */
+ if (tagid == TagHr) { /* ruler */
i = termwidth - indent - defaultindent;
for (; i > 0; i--)
hprint(str_ruler);
cur->hasdata = 1; /* treat <hr/> as data */
- } else if (!tagcmp(t, "br")) {
+ } else if (tagid == TagBr) {
hflush();
hadnewline = 0; /* forced newline */
hputchar('\n');
@@ -2107,65 +2141,78 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
}
static void
-xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name,
- size_t namelen, const char *value, size_t valuelen)
+xmlattr(XMLParser *p, const char *t, size_t tl, const char *n,
+ size_t nl, const char *v, size_t vl)
{
struct node *cur;
+ enum TagId tagid;
cur = &nodes[curnode];
-
- if (!attrcmp(name, "class"))
- string_append(&attr_class, value, valuelen);
- else if (!attrcmp(name, "id"))
- string_append(&attr_id, value, valuelen);
-
- /* <base href="..." /> */
- if (!basehrefset && !attrcmp(name, "href") && !tagcmp(tag, "base"))
- strlcat(basehrefdoc, value, sizeof(basehrefdoc));
+ tagid = cur->tag.id;
/* hide tags with attribute aria-hidden or hidden */
- if (!attrcmp(name, "aria-hidden") || !attrcmp(name, "hidden"))
+ if (!attrcmp(n, "aria-hidden") || !attrcmp(n, "hidden"))
cur->tag.displaytype |= DisplayNone;
- if (!tagcmp(tag, "select") && !attrcmp(name, "multiple"))
- cur->tag.displaytype |= DisplaySelectMulti;
+ if (!attrcmp(n, "class"))
+ string_append(&attr_class, v, vl);
+ else if (!attrcmp(n, "id"))
+ string_append(&attr_id, v, vl);
+ else if (!attrcmp(n, "type"))
+ string_append(&attr_type, v, vl);
+ else if (!attrcmp(n, "value"))
+ string_append(&attr_value, v, vl);
- if (!tagcmp(tag, "a") && !attrcmp(name, "href"))
- string_append(&attr_href, value, valuelen);
+ /* <base href="..." /> */
+ if (!basehrefset && tagid == TagBase && !attrcmp(n, "href"))
+ strlcat(basehrefdoc, v, sizeof(basehrefdoc));
- if (!tagcmp(tag, "object") && !attrcmp(name, "data"))
- string_append(&attr_data, value, valuelen);
+ if (tagid == TagA && !attrcmp(n, "href"))
+ string_append(&attr_href, v, vl);
- if ((!tagcmp(tag, "img") || !tagcmp(tag, "video") ||
- !tagcmp(tag, "source") || !tagcmp(tag, "track") ||
- !tagcmp(tag, "audio")) &&
- !attrcmp(name, "src") && valuelen)
- string_append(&attr_src, value, valuelen);
+ if (tagid == TagSelect && !attrcmp(n, "multiple"))
+ cur->tag.displaytype |= DisplaySelectMulti;
- /* show img alt attribute as text. */
- if (!tagcmp(tag, "img") && !attrcmp(name, "alt"))
- string_append(&attr_alt, value, valuelen);
+ if (tagid == TagObject && !attrcmp(n, "data"))
+ string_append(&attr_data, v, vl);
- if (!attrcmp(name, "checked"))
- string_append(&attr_checked, value, valuelen);
- else if (!attrcmp(name, "type"))
- string_append(&attr_type, value, valuelen);
- else if (!attrcmp(name, "value"))
- string_append(&attr_value, value, valuelen);
+ /* show img alt attribute as text. */
+ if (tagid == TagImg && !attrcmp(n, "alt"))
+ string_append(&attr_alt, v, vl);
+
+ if (cur->tag.displaytype & DisplayInput && !attrcmp(n, "checked"))
+ string_append(&attr_checked, v, vl);
+
+ /* src attribute */
+ switch (tagid) {
+ case TagAudio:
+ case TagEmbed:
+ case TagFrame:
+ case TagIframe:
+ case TagImg:
+ case TagSource:
+ case TagTrack:
+ case TagVideo:
+ if (!attrcmp(n, "src"))
+ string_append(&attr_src, v, vl);
+ break;
+ default:
+ break;
+ }
}
static void
-xmlattrentity(XMLParser *p, const char *tag, size_t taglen, const char *name,
- size_t namelen, const char *value, size_t valuelen)
+xmlattrentity(XMLParser *p, const char *t, size_t tl, const char *n,
+ size_t nl, const char *v, size_t vl)
{
char buf[16];
- int n;
+ int len;
- n = xml_entitytostr(value, buf, sizeof(buf));
- if (n > 0)
- xmlattr(p, tag, taglen, name, namelen, buf, (size_t)n);
+ len = xml_entitytostr(v, buf, sizeof(buf));
+ if (len > 0)
+ xmlattr(p, t, tl, n, nl, buf, (size_t)len);
else
- xmlattr(p, tag, taglen, name, namelen, value, valuelen);
+ xmlattr(p, t, tl, n, nl, v, vl);
}
static void
@@ -2173,12 +2220,14 @@ xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n,
size_t nl)
{
struct node *cur;
+ enum TagId tagid;
cur = &nodes[curnode];
+ tagid = cur->tag.id;
/* set base URL, if it is set it cannot be overwritten again */
if (!basehrefset && basehrefdoc[0] &&
- !attrcmp(n, "href") && !tagcmp(t, "base"))
+ tagid == TagBase && !attrcmp(n, "href"))
basehrefset = uri_parse(basehrefdoc, &base) != -1 ? 1 : 0;
/* if attribute checked is set but it has no value then set it to "checked" */
@@ -2190,6 +2239,12 @@ static void
xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n,
size_t nl)
{
+ struct node *cur;
+ enum TagId tagid;
+
+ cur = &nodes[curnode];
+ tagid = cur->tag.id;
+
if (!attrcmp(n, "alt"))
string_clear(&attr_alt);
else if (!attrcmp(n, "checked"))
@@ -2209,7 +2264,7 @@ xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n,
else if (!attrcmp(n, "value"))
string_clear(&attr_value);
- if (basehrefdoc[0] && !attrcmp(n, "href") && !tagcmp(t, "base"))
+ if (basehrefdoc[0] && tagid == TagBase && !attrcmp(n, "href"))
basehrefdoc[0] = '\0';
}
@@ -2236,7 +2291,8 @@ main(int argc, char **argv)
break;
case 'b':
basehref = EARGF(usage());
- if (uri_parse(basehref, &base) == -1)
+ if (uri_parse(basehref, &base) == -1 ||
+ !base.proto[0])
usage();
basehrefset = 1;
break;