ignore incorrect unescaped HTML in <style> or <script> in a better way - tscrape - twitter scraper
(HTM) git clone git://git.codemadness.org/tscrape
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit ed3a979265abe557e783ea22c6a09fb96241ff95
(DIR) parent 0fac9621c44b76c38d911438b1966d665e3b8134
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Mon, 17 Dec 2018 18:32:50 +0100
ignore incorrect unescaped HTML in <style> or <script> in a better way
Diffstat:
M tscrape.c | 53 +++++++++++++++++++++----------
1 file changed, 37 insertions(+), 16 deletions(-)
---
(DIR) diff --git a/tscrape.c b/tscrape.c
@@ -40,6 +40,34 @@ static char retweetid[64];
static int state;
static XMLParser p;
+static const char *ignorestate, *endtag;
+static int (*getnext)(void);
+
+/* return a space for all data until the case-insensitive string endtag has
+   been read; then the original reader is restored and the final character of
+   the match is returned. This is used to parse incorrect HTML/XML that contains
+   unescaped HTML in script or style tags (an end tag in CDATA or a comment counts). */
+static inline int
+getchar_ignore(void)
+{
+	int c;
+
+	if ((c = getnext()) == EOF)
+		return EOF;
+
+	if (tolower(c) != tolower((unsigned char)*ignorestate))
+		ignorestate = endtag; /* restart; c is retested as '<' occurs only first */
+	if (tolower(c) == tolower((unsigned char)*ignorestate)) {
+		ignorestate++;
+		if (*ignorestate == '\0') {
+			p.getnext = getnext; /* restore */
+			return c;
+		}
+	}
+
+	return ' ';
+}
+
static void
printtweet(void)
{
@@ -100,16 +128,6 @@ xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
state &= ~(Timestamp);
}
-static char ignoretag[8];
-static XMLParser xo; /* old context */
-
-static void
-xmlignoretagend(XMLParser *p, const char *t, size_t tl, int isshort)
-{
- if (!strcasecmp(t, ignoretag))
- memcpy(p, &xo, sizeof(*p)); /* restore context */
-}
-
static void
xmltagstart(XMLParser *x, const char *t, size_t tl)
{
@@ -122,12 +140,15 @@ xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
/* temporary replace the callback except the reader and end of tag
restore the context once we receive the same ignored tag in the
end tag handler */
- if (!strcasecmp(t, "script") || !strcasecmp(t, "style")) {
- strlcpy(ignoretag, t, sizeof(ignoretag));
- memcpy(&xo, x, sizeof(xo)); /* store old context */
- memset(x, 0, sizeof(*x));
- x->xmltagend = xmlignoretagend;
- x->getnext = xo.getnext;
+ if (!strcasecmp(t, "script")) {
+ ignorestate = endtag = "</script>";
+ getnext = x->getnext; /* for restore */
+ x->getnext = getchar_ignore;
+ return;
+ } else if (!strcasecmp(t, "style")) {
+ ignorestate = endtag = "</style>";
+ getnext = x->getnext; /* for restore */
+ x->getnext = getchar_ignore;
return;
}