for the class and id attribute use the first value set - webdump - HTML to plain-text converter for webpages
(HTM) git clone git://git.codemadness.org/webdump
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit ae36c548e48ddea692a87557938441bb7cd54994
(DIR) parent 4793272ce07153284318336426796cb7e3c93af4
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Wed, 20 Sep 2023 18:51:10 +0200
for the class and id attribute use the first value set
+ small code-style tweaks.
Diffstat:
M webdump.c | 35 +++++++++++++++++++------------
1 file changed, 22 insertions(+), 13 deletions(-)
---
(DIR) diff --git a/webdump.c b/webdump.c
@@ -191,15 +191,17 @@ static int basehrefset; /* base href set and can be used? */
static struct uri base; /* parsed current base href */
/* buffers for some attributes of the current tag */
-String attr_alt; /* alt attribute */
-String attr_checked; /* checked attribute */
-String attr_class; /* class attribute */
-String attr_data; /* data attribute */
-String attr_href; /* href attribute */
-String attr_id; /* id attribute */
-String attr_src; /* src attribute */
-String attr_type; /* type attribute */
-String attr_value; /* value attribute */
+static String attr_alt; /* alt attribute */
+static String attr_checked; /* checked attribute */
+static String attr_class; /* class attribute */
+static int attr_class_set; /* class attribute is set already */
+static String attr_data; /* data attribute */
+static String attr_href; /* href attribute */
+static String attr_id; /* id attribute */
+static int attr_id_set; /* id attribute is set already */
+static String attr_src; /* src attribute */
+static String attr_type; /* type attribute */
+static String attr_value; /* value attribute */
static String htmldata; /* buffered HTML data near the current tag */
@@ -1870,9 +1872,11 @@ xmltagstart(XMLParser *p, const char *t, size_t tl)
string_clear(&attr_alt);
string_clear(&attr_checked);
string_clear(&attr_class);
+ attr_class_set = 0;
string_clear(&attr_data);
string_clear(&attr_href);
string_clear(&attr_id);
+ attr_id_set = 0;
string_clear(&attr_src);
string_clear(&attr_type);
string_clear(&attr_value);
@@ -2191,9 +2195,9 @@ xmlattr(XMLParser *p, const char *t, size_t tl, const char *n,
if (!attrcmp(n, "aria-hidden") || !attrcmp(n, "hidden"))
cur->tag.displaytype |= DisplayNone;
- if (!attrcmp(n, "class"))
+ if (!attr_class_set && !attrcmp(n, "class")) /* use the first set attribute */
string_append(&attr_class, v, vl);
- else if (!attrcmp(n, "id"))
+ else if (!attr_id_set && !attrcmp(n, "id")) /* use the first set attribute */
string_append(&attr_id, v, vl);
else if (!attrcmp(n, "type"))
string_append(&attr_type, v, vl);
@@ -2262,6 +2266,11 @@ xmlattrend(XMLParser *p, const char *t, size_t tl, const char *n,
cur = &nodes[curnode];
tagid = cur->tag.id;
+ if (!attr_class_set && !attrcmp(n, "class"))
+ attr_class_set = 1;
+ else if (!attr_id_set && !attrcmp(n, "id"))
+ attr_id_set = 1;
+
/* set base URL, if it is set it cannot be overwritten again */
if (!basehrefset && basehrefdoc[0] &&
tagid == TagBase && !attrcmp(n, "href"))
@@ -2286,13 +2295,13 @@ xmlattrstart(XMLParser *p, const char *t, size_t tl, const char *n,
string_clear(&attr_alt);
else if (!attrcmp(n, "checked"))
string_clear(&attr_checked);
- else if (!attrcmp(n, "class"))
+ else if (!attr_class_set && !attrcmp(n, "class"))
string_clear(&attr_class);
else if (!attrcmp(n, "data"))
string_clear(&attr_data);
else if (!attrcmp(n, "href"))
string_clear(&attr_href);
- else if (!attrcmp(n, "id"))
+ else if (!attr_id_set && !attrcmp(n, "id"))
string_clear(&attr_id);
else if (!attrcmp(n, "src"))
string_clear(&attr_src);