ignore all within <script> or <style> (WIP) - webdump - [FORK] git://git.codemadness.org/webdump
(HTM) git clone git://git.z3bra.org/webdump.git
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit ec68d5635764887d323bc7e3e09c01fda411e865
(DIR) parent 26361ccd0ab0f19276d7727b8f589b1109cfbfd1
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Thu, 27 Jun 2019 19:37:07 +0200
ignore all within <script> or <style> (WIP)
Diffstat:
M main.c | 59 +++++++++++++++++++++----------
1 file changed, 40 insertions(+), 19 deletions(-)
---
(DIR) diff --git a/main.c b/main.c
@@ -89,6 +89,34 @@ static char *blocktags[] = {
static String htmldata;
+static const char *ignorestate, *endtag;
+static int (*getnext)(void);
+
+/* Reader installed while inside <script> or <style>: fetch the next character
+   through the saved reader `getnext` and hide it from the parser by returning
+   a space, until the closing tag in `endtag` has been seen
+   (case-insensitive). This is used to parse incorrect HTML/XML that contains
+   unescaped HTML in script or style tags. If you see some </script> tag in a
+   CDATA or comment section then e-mail W3C and tell them the web is too
+   complex.
+
+   Returns EOF when the saved reader returns EOF, the final character of the
+   end tag once it has matched completely (the saved reader is put back into
+   parser.getnext at that point), and ' ' for every other character. */
+static inline int
+getnext_ignore(void)
+{
+	int c;
+
+	if ((c = getnext()) == EOF)
+		return EOF;
+
+	if (tolower(c) != tolower((unsigned char)*ignorestate)) {
+		/* Mismatch: restart the match. The character that broke the
+		   match may itself be the first character of the end tag
+		   (e.g. the second '<' in "<</script>"), so test it against
+		   the start of the pattern instead of discarding it. This is
+		   a complete fallback for patterns whose first character
+		   does not recur later in the pattern, which holds for
+		   "</script>" and "</style>". */
+		ignorestate = endtag;
+		if (tolower(c) != tolower((unsigned char)*ignorestate))
+			return ' ';
+	}
+
+	ignorestate++;
+	if (*ignorestate == '\0') {
+		parser.getnext = getnext; /* restore */
+		return c;
+	}
+
+	return ' ';
+}
+
/* Clear string only; don't free, prevents unnecessary reallocation. */
static void
string_clear(String *s)
@@ -218,10 +246,10 @@ xmldataentity(XMLParser *p, const char *data, size_t datalen)
/* TODO: add to tscrape too */
/* TODO: support some more HTML entities */
n = xml_entitytostr(data, buf, sizeof(buf));
- if (n <= 0)
- xmldata(p, data, datalen);
+ if (n > 0)
+ xmldata(p, buf, (size_t)n);
else
- string_append(&htmldata, buf, n);
+ xmldata(p, data, datalen);
}
static void
@@ -282,16 +310,6 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
}
}
-static char ignoretag[8];
-static XMLParser xo; /* old context */
-
-static void
-xmlignoretagend(XMLParser *p, const char *t, size_t tl, int isshort)
-{
- if (!strcasecmp(t, ignoretag))
- memcpy(p, &xo, sizeof(*p)); /* restore context */
-}
-
static void
xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
{
@@ -301,12 +319,15 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
/* temporary replace the callback except the reader and end of tag
restore the context once we receive the same ignored tag in the
end tag handler */
- if (!strcasecmp(t, "script") || !strcasecmp(t, "style")) {
- strlcpy(ignoretag, t, sizeof(ignoretag));
- memcpy(&xo, p, sizeof(xo)); /* store old context */
- memset(p, 0, sizeof(*p));
- p->xmltagend = xmlignoretagend;
- p->getnext = xo.getnext;
+ if (!strcasecmp(t, "script")) {
+ ignorestate = endtag = "</script>";
+ getnext = p->getnext; /* for restore */
+ p->getnext = getnext_ignore;
+ return;
+ } else if (!strcasecmp(t, "style")) {
+ ignorestate = endtag = "</style>";
+ getnext = p->getnext; /* for restore */
+ p->getnext = getnext_ignore;
return;
}