cleanup code a bit and add some comments - webdump - HTML to plain-text converter for webpages
(HTM) git clone git://git.codemadness.org/webdump
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit 4793272ce07153284318336426796cb7e3c93af4
(DIR) parent 589d7d1ed851b5226a4782de8c9f00001f25c599
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Tue, 19 Sep 2023 20:05:02 +0200
cleanup code a bit and add some comments
Diffstat:
M webdump.c | 129 +++++++++++++++----------------
1 file changed, 62 insertions(+), 67 deletions(-)
---
(DIR) diff --git a/webdump.c b/webdump.c
@@ -45,14 +45,14 @@ struct uri {
};
/* options */
-static int allowansi = 0; /* allow ANSI escape codes */
-static int showrefbottom = 0; /* show link references at the bottom */
-static int showrefinline = 0; /* show link reference number inline */
-static int showurlinline = 0; /* show full link reference inline */
-static int linewrap = 0; /* line-wrapping */
-static int termwidth = 77; /* terminal width */
-static int resources = 0; /* write resources line-by-line to fd 3? */
-static int uniqrefs = 0; /* number unique references */
+static int allowansi = 0; /* (-a) allow ANSI escape codes */
+static int uniqrefs = 0; /* (-d) number unique references */
+static int showrefinline = 0; /* (-i) show link reference number inline */
+static int showurlinline = 0; /* (-I) show full link reference inline */
+static int showrefbottom = 0; /* (-l) show link references at the bottom */
+static int linewrap = 0; /* (-r) line-wrapping */
+static int termwidth = 77; /* (-w) terminal width */
+static int resources = 0; /* (-x) write resources line-by-line to fd 3? */
enum DisplayType {
DisplayUnknown = 0,
@@ -95,17 +95,19 @@ typedef struct string {
} String;
enum TagId { TagA = 1, TagAddress, TagArea, TagArticle, TagAside, TagAudio,
-TagB, TagBase, TagBlink, TagBlockquote, TagBody, TagBr, TagButton, TagCite,
-TagCol, TagColgroup, TagDatalist, TagDd, TagDel, TagDetails, TagDfn, TagDir,
-TagDiv, TagDl, TagDt, TagEm, TagEmbed, TagFieldset, TagFigcaption, TagFigure,
-TagFooter, TagForm, TagFrame, TagH1, TagH2, TagH3, TagH4, TagH5, TagH6,
-TagHead, TagHeader, TagHr, TagHtml, TagI, TagIframe, TagImg, TagInput, TagIns,
-TagLabel, TagLegend, TagLi, TagLink, TagMain, TagMark, TagMenu, TagMeta,
-TagNav, TagObject, TagOl, TagOption, TagP, TagParam, TagPre, TagS, TagScript,
-TagSearch, TagSection, TagSelect, TagSource, TagStrike, TagStrong, TagStyle,
-TagSummary, TagTable, TagTbody, TagTd, TagTemplate, TagTextarea, TagTfoot,
-TagTh, TagThead, TagTitle, TagTr, TagTrack, TagU, TagUl, TagVar, TagVideo,
-TagWbr, TagXmp };
+
+ TagB, TagBase, TagBlink, TagBlockquote, TagBody, TagBr, TagButton,
+ TagCite, TagCol, TagColgroup, TagDatalist, TagDd, TagDel, TagDetails,
+ TagDfn, TagDir, TagDiv, TagDl, TagDt, TagEm, TagEmbed, TagFieldset,
+ TagFigcaption, TagFigure, TagFooter, TagForm, TagFrame, TagH1, TagH2,
+ TagH3, TagH4, TagH5, TagH6, TagHead, TagHeader, TagHr, TagHtml, TagI,
+ TagIframe, TagImg, TagInput, TagIns, TagLabel, TagLegend, TagLi,
+ TagLink, TagMain, TagMark, TagMenu, TagMeta, TagNav, TagObject, TagOl,
+ TagOption, TagP, TagParam, TagPre, TagS, TagScript, TagSearch,
+ TagSection, TagSelect, TagSource, TagStrike, TagStrong, TagStyle,
+ TagSummary, TagTable, TagTbody, TagTd, TagTemplate, TagTextarea,
+ TagTfoot, TagTh, TagThead, TagTitle, TagTr, TagTrack, TagU, TagUl,
+ TagVar, TagVideo, TagWbr, TagXmp };
struct tag {
const char *name;
@@ -168,6 +170,7 @@ static size_t nvisrefs, ncapvisrefs; /* visible link count / capacity */
struct linkref **hiddenrefs;
static size_t nhiddenrefs, ncaphiddenrefs; /* hidden link count / capacity */
+/* compare link by URL for link references RB-tree */
int
linkrefcmp(struct linkref *r1, struct linkref *r2)
{
@@ -175,7 +178,6 @@ linkrefcmp(struct linkref *r1, struct linkref *r2)
}
RB_HEAD(linkreftree, linkref) linkrefhead = RB_INITIALIZER(&linkrefhead);
-RB_PROTOTYPE(linkreftree, linkref, entry, linkrefcmp)
RB_GENERATE(linkreftree, linkref, entry, linkrefcmp)
static const char *str_bullet_item = "* ";
@@ -184,10 +186,9 @@ static const char *str_ruler = "-";
static const char *str_radio_checked = "*";
/* base href, to make URLs absolute */
-static char *basehref = "";
-static char basehrefdoc[4096]; /* base href in document, if any */
-static int basehrefset = 0; /* base href set and can be used? */
-static struct uri base;
+static char basehrefdoc[4096]; /* buffer for base href in document, if any */
+static int basehrefset; /* base href set and can be used? */
+static struct uri base; /* parsed current base href */
/* buffers for some attributes of the current tag */
String attr_alt; /* alt attribute */
@@ -200,7 +201,7 @@ String attr_src; /* src attribute */
String attr_type; /* type attribute */
String attr_value; /* value attribute */
-static String htmldata;
+static String htmldata; /* buffered HTML data near the current tag */
/* for white-space output handling:
1 = whitespace emitted (suppress repeated), 2 = other characters on this line
@@ -208,15 +209,15 @@ static String htmldata;
* White-space data before non-whitespace data in tags are ignored on a line.
* Repeated white-space is ignored: a single space (' ') is emitted.
*/
-static int whitespace_mode = 0;
-static int nbytesline = 0;
-static int ncells = 0; /* current cell count */
-static int hadnewline = 0; /* count for repeated newlines */
+static int whitespace_mode;
+static int nbytesline; /* bytes on this line */
+static int ncells; /* current cell/column count */
+static int hadnewline; /* count for repeated newlines */
/* flag for skipping initial white-space in tag: for HTML white-space handling */
static int skipinitialws = 1;
#define DEFAULT_INDENT 2
-static const int defaultindent = DEFAULT_INDENT;
-static int indent;
+static const int defaultindent = DEFAULT_INDENT; /* default indent / margin */
+static int indent; /* indent for the current line, in columns */
/* previous output sequential newlines, used for calculating margins between
elements and reducing excessive newlines */
static int currentnewlines;
@@ -224,21 +225,22 @@ static int currentnewlines;
/* buffers for line-wrapping (buffer per word boundary) */
static char rbuf[1024];
static int rbuflen;
-static int rnbufcells = 0; /* pending cell count to add */
+static int rnbufcells; /* pending cell count to add */
#define MAX_NODE_DEPTH 65535 /* absolute maximum node depth */
-static struct node *nodes;
+static struct node *nodes; /* node tree (one per level is remembered) */
static String *nodes_links; /* keep track of links per node */
-static size_t ncapnodes;
+static size_t ncapnodes; /* current allocated node capacity */
static int curnode; /* current node depth */
-/* reader / selector mode */
-static int reader_mode = 0;
-static int reader_ignore = 0;
+/* reader / selector mode (-s) */
+static int reader_mode;
+/* flag if the tags and their children should be ignored in the current context */
+static int reader_ignore;
-static enum MarkupType curmarkup;
+static enum MarkupType curmarkup; /* current markup state (bold, underline, etc) */
-/* selector to match */
+/* selector to match (for -s and -u) */
static struct selectors *sel_hide, *sel_show;
/* tags table: needs to be sorted like tagcmp(), alphabetically */
@@ -483,7 +485,7 @@ ecalloc(size_t nmemb, size_t size)
}
/* check if string has a non-empty scheme / protocol part */
-int
+static int
uri_hasscheme(const char *s)
{
const char *p = s;
@@ -495,7 +497,7 @@ uri_hasscheme(const char *s)
return (*p == ':' && p != s);
}
-int
+static int
uri_parse(const char *s, struct uri *u)
{
const char *p = s;
@@ -611,7 +613,7 @@ parsepath:
/* Transform and try to make the URI `u` absolute using base URI `b` into `a`.
Follows some of the logic from "RFC 3986 - 5.2.2. Transform References".
Returns 0 on success, -1 on error or truncation. */
-int
+static int
uri_makeabs(struct uri *a, struct uri *u, struct uri *b)
{
char *p;
@@ -663,7 +665,7 @@ uri_makeabs(struct uri *a, struct uri *u, struct uri *b)
return 0;
}
-int
+static int
uri_format(char *buf, size_t bufsiz, struct uri *u)
{
return snprintf(buf, bufsiz, "%s%s%s%s%s%s%s%s%s%s%s%s",
@@ -682,14 +684,14 @@ uri_format(char *buf, size_t bufsiz, struct uri *u)
}
/* compare tag name (case-insensitive) */
-int
+static int
tagcmp(const char *s1, const char *s2)
{
return strcasecmp(s1, s2);
}
/* compare attribute name (case-insensitive) */
-int
+static int
attrcmp(const char *s1, const char *s2)
{
return strcasecmp(s1, s2);
@@ -846,7 +848,7 @@ endmarkup(int markuptype)
cell in general.
NOTE: this is of course incorrect since characters can be 2 cells wide as well,
in the future maybe replace this with wcwidth() or similar */
-int
+static int
utfwidth(int c)
{
/* not the start of a codepoint */
@@ -1002,17 +1004,6 @@ parentcontainerhasdata(int curtype, int n)
return 0;
}
-static int
-parenthasdata(int n)
-{
- int i;
-
- for (i = n; i >= 0; i--)
- return nodes[i].hasdata;
-
- return 0;
-}
-
/* start on a newline for the start of a block element or not */
static void
startblock(void)
@@ -1021,7 +1012,7 @@ startblock(void)
whitespace_mode &= ~2; /* no characters on this line yet */
if (nbytesline <= 0)
return;
- if (!hadnewline && parenthasdata(curnode - 1))
+ if (!hadnewline && curnode >= 0 && nodes[curnode - 1].hasdata)
hputchar('\n');
}
@@ -1137,7 +1128,7 @@ findparenttype(int cur, int findtype)
return NULL;
}
-int
+static int
isclassmatch(const char *haystack, const char *needle)
{
const char *p;
@@ -1165,7 +1156,7 @@ isclassmatch(const char *haystack, const char *needle)
/* very limited CSS-like selector, supports: main, main#id, main.class,
".class", "#id", "ul li a" */
-int
+static int
compileselector(const char *sel, struct selectornode *nodes, size_t maxnodes)
{
int depth = 0, len;
@@ -1263,7 +1254,7 @@ compileselector(const char *sel, struct selectornode *nodes, size_t maxnodes)
return depth;
}
-struct selector *
+static struct selector *
newselector(const char *q)
{
struct selector *sel;
@@ -1282,7 +1273,7 @@ newselector(const char *q)
return sel;
}
-struct selectors *
+static struct selectors *
compileselectors(const char *q)
{
struct selectors *sels = NULL;
@@ -1319,7 +1310,7 @@ compileselectors(const char *q)
/* very limited CSS-like matcher, supports: main, main#id, main.class,
".class", "#id", "ul li a" */
-int
+static int
iscssmatch(struct selector *sel, struct node *root, int maxdepth)
{
int d, md = 0;
@@ -1356,7 +1347,7 @@ iscssmatch(struct selector *sel, struct node *root, int maxdepth)
return 0;
}
-int
+static int
iscssmatchany(struct selectors *sels, struct node *root, int maxdepth)
{
struct selector *sel;
@@ -1499,7 +1490,7 @@ handleinlinelink(void)
addlinkref(url, cur->tag.name, cur->tag.id, 1);
}
-void
+static void
printlinkrefs(void)
{
struct linkref *ref;
@@ -1535,6 +1526,7 @@ printlinkrefs(void)
}
}
+/* size to grow node capacity (greedy) */
#define NODE_CAP_INC 256
/* increase node depth, allocate space for nodes if needed */
@@ -1759,6 +1751,7 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
size_t nchilds;
int i, j, k, nchildfound, parenttype;
+ /* match tag and lookup metadata */
/* ignore closing of void elements, like </br>, which is not allowed */
if ((found = findtag(t))) {
if (!isshort && found->isvoid)
@@ -1884,7 +1877,7 @@ xmltagstart(XMLParser *p, const char *t, size_t tl)
string_clear(&attr_type);
string_clear(&attr_value);
- /* match tag */
+ /* match tag and lookup metadata */
found = findtag(t);
/* TODO: implement more complete optional tag handling.
@@ -1993,7 +1986,7 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
struct node *cur, *parent;
int i, margintop;
- /* match tag */
+ /* match tag and lookup metadata */
tagid = 0;
if ((found = findtag(t)))
tagid = found->id;
@@ -2322,6 +2315,8 @@ usage(void)
int
main(int argc, char **argv)
{
+ char *basehref;
+
if (pledge("stdio", NULL) < 0)
err(1, "pledge");