improve forms a bit - webdump - HTML to plain-text converter for webpages
(HTM) git clone git://git.codemadness.org/webdump
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit 630f76162a192327a3eecd4fc0adcb9b31cd4504
(DIR) parent 0705fb754f00c7866b2cc8cee0739a88a584a2e1
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Fri, 8 Sep 2023 15:05:38 +0200
improve forms a bit
- Treat fieldset and legend as block elements.
- Support more input types; a missing or unsupported type is treated as "text".
- Show the default selected value for radio and checkboxes.
- Don't show hidden input types.
- Add a DisplayType DisplayInput to check the tag faster.
Diffstat:
M webdump.c | 64 +++++++++++++++++++++----------
1 file changed, 44 insertions(+), 20 deletions(-)
---
(DIR) diff --git a/webdump.c b/webdump.c
@@ -68,16 +68,17 @@ enum DisplayType {
DisplayUnknown = 0,
DisplayInline = 1 << 0,
DisplayInlineBlock = 1 << 1, /* unused for now */
- DisplayBlock = 1 << 2,
- DisplayNone = 1 << 3,
- DisplayPre = 1 << 4,
- DisplayList = 1 << 5,
- DisplayListOrdered = 1 << 6,
- DisplayListItem = 1 << 7,
- DisplayTable = 1 << 8,
- DisplayTableRow = 1 << 9,
- DisplayTableCell = 1 << 10,
- DisplayHeader = 1 << 11
+ DisplayInput = 1 << 2,
+ DisplayBlock = 1 << 3,
+ DisplayNone = 1 << 4,
+ DisplayPre = 1 << 5,
+ DisplayList = 1 << 6,
+ DisplayListOrdered = 1 << 7,
+ DisplayListItem = 1 << 8,
+ DisplayTable = 1 << 9,
+ DisplayTableRow = 1 << 10,
+ DisplayTableCell = 1 << 11,
+ DisplayHeader = 1 << 12
};
/* ANSI markup */
@@ -143,7 +144,9 @@ struct selectors {
};
static const char *str_bullet_item = "* ";
+static const char *str_checkbox_checked = "x";
static const char *str_ruler = "-";
+static const char *str_radio_checked = "*";
/* base href, to make URLs absolute */
static char *basehref = "";
@@ -153,6 +156,7 @@ static struct uri base;
/* buffers for some attributes of the current tag */
String attr_alt; /* alt attribute */
+String attr_checked; /* checked attribute */
String attr_class; /* class attribute */
String attr_href; /* href attribute */
String attr_id; /* id attribute */
@@ -221,6 +225,7 @@ static struct tag tags[] = {
{ "dt", DisplayBlock, MarkupBold, 0, 0, 1, 0, 0, 0 },
{ "em", DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 },
{ "embed", DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
+{ "fieldset", DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
{ "figcaption", DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
{ "figure", DisplayBlock, 0, 0, 0, 0, 1, 1, 4 },
{ "footer", DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
@@ -236,8 +241,9 @@ static struct tag tags[] = {
{ "html", DisplayBlock, 0, 0, 0, 1, 0, 0, 0 },
{ "i", DisplayInline, MarkupItalic, 0, 0, 0, 0, 0, 0 },
{ "img", DisplayInline, MarkupUnderline, 0, 1, 0, 0, 0, 0 },
-{ "input", DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
+{ "input", DisplayInput, 0, 0, 1, 0, 0, 0, 0 },
{ "label", DisplayInline, MarkupBold, 0, 0, 0, 0, 0, 0 },
+{ "legend", DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
{ "li", DisplayListItem, 0, DisplayList, 0, 1, 0, 0, 0 },
{ "link", DisplayInline, 0, 0, 1, 0, 0, 0, 0 },
{ "main", DisplayBlock, 0, 0, 0, 0, 0, 0, 0 },
@@ -1684,6 +1690,7 @@ xmltagstart(XMLParser *p, const char *t, size_t tl)
cur = &nodes[curnode];
string_clear(&attr_alt);
+ string_clear(&attr_checked);
string_clear(&attr_class);
string_clear(&attr_href);
string_clear(&attr_id);
@@ -1891,18 +1898,23 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
if (!tagcmp(cur->tag.name, "input")) {
if (!attr_type.len) {
hprintf("[%-15s]", attr_value.len ? attr_value.data : ""); /* default: text */
- } else if (!strcasecmp(attr_type.data, "text")) {
- hprintf("[%-15s]", attr_value.len ? attr_value.data : ""); /* text */
- } else if (!strcasecmp(attr_type.data, "search")) {
- hprintf("[%-15s]", attr_value.len ? attr_value.data : "");
- } else if (!strcasecmp(attr_type.data, "button")) {
- hprintf("[%s]", attr_value.len ? attr_value.data : "");
- } else if (!strcasecmp(attr_type.data, "submit")) {
+ } else if (!strcasecmp(attr_type.data, "button") ||
+ !strcasecmp(attr_type.data, "submit") ||
+ !strcasecmp(attr_type.data, "reset")) {
hprintf("[%s]", attr_value.len ? attr_value.data : "");
} else if (!strcasecmp(attr_type.data, "checkbox")) {
- hprint("[ ]"); /* TODO: show x or unicode checkmark when selected? */
+ hprintf("[%s]",
+ attr_checked.len &&
+ !strcasecmp(attr_checked.data, "checked") ? str_checkbox_checked : " ");
} else if (!strcasecmp(attr_type.data, "radio")) {
- hprint("( )"); /* TODO: show x or unicode checkmark when selected? */
+ hprintf("[%s]",
+ attr_checked.len &&
+ !strcasecmp(attr_checked.data, "checked") ? str_radio_checked : " ");
+ } else if (!strcasecmp(attr_type.data, "hidden")) {
+ cur->tag.displaytype |= DisplayNone;
+ } else {
+ /* unrecognized / default case is text */
+ hprintf("[%-15s]", attr_value.len ? attr_value.data : "");
}
}
@@ -1963,6 +1975,8 @@ xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name,
if (!tagcmp(tag, "img") && !attrcmp(name, "alt"))
string_append(&attr_alt, value, valuelen);
+ if (!attrcmp(name, "checked"))
+ string_append(&attr_checked, value, valuelen);
if (!attrcmp(name, "type"))
string_append(&attr_type, value, valuelen);
if (!attrcmp(name, "value"))
@@ -1987,10 +2001,18 @@ static void
xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n,
size_t nl)
{
+ struct node *cur;
+
+ cur = &nodes[curnode];
+
/* set base URL, if it is set it cannot be overwritten again */
if (!basehrefset && basehrefdoc[0] &&
!attrcmp(n, "href") && !tagcmp(t, "base"))
basehrefset = uri_parse(basehrefdoc, &base) != -1 ? 1 : 0;
+
+ /* if attribute checked is set but it has no value then set it to "checked" */
+ if (cur->tag.displaytype & DisplayInput && !attrcmp(n, "checked") && !attr_checked.len)
+ string_append(&attr_checked, "checked", sizeof("checked") - 1);
}
static void
@@ -1999,6 +2021,8 @@ xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n,
{
if (!attrcmp(n, "alt"))
string_clear(&attr_alt);
+ else if (!attrcmp(n, "checked"))
+ string_clear(&attr_checked);
else if (!attrcmp(n, "class"))
string_clear(&attr_class);
else if (!attrcmp(n, "href"))