ignore incorrect unescaped HTML in <style> or <script> in a better way - grabtitle - stupid HTML title grabber
(HTM) git clone git://git.codemadness.org/grabtitle
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit d908478d0f84bc275428fd71e934c993bb29211c
(DIR) parent 0cca681092b680c5b80da62771d47fa383be6cd1
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Mon, 10 Dec 2018 19:01:58 +0100
ignore incorrect unescaped HTML in <style> or <script> in a better way
This way we can still use a (mostly) XML parser for HTML data.
Diffstat:
M grabtitle.c | 71 +++++++++++++++++++------------
1 file changed, 44 insertions(+), 27 deletions(-)
---
(DIR) diff --git a/grabtitle.c b/grabtitle.c
@@ -16,28 +16,38 @@
#endif
static XMLParser parser;
-static int istitle, ignore;
-
-static void
-xmltagstart(XMLParser *p, const char *t, size_t tl)
+static const char *state, *endtag;
+static int (*getnext)(void);
+
+/* return a space for all data until some case-insensitive string occurs. This
+ is used to parse incorrect HTML/XML that contains unescaped HTML in script
+ or style tags. */
+static inline int
+getchar_ignore(void)
{
- if ((tl == 6 && !strcasecmp(t, "script")) ||
- (tl == 5 && !strcasecmp(t, "style")))
- ignore = 1;
- if (!ignore && tl == 5 && !strcasecmp(t, "title"))
- istitle = 1;
+ int c;
+
+ if ((c = getnext()) == EOF)
+ return EOF;
+
+ if (tolower(c) == tolower((unsigned char)*state)) {
+ state++;
+ if (*state == '\0') {
+ parser.getnext = getnext; /* restore */
+ return c;
+ }
+ } else {
+ state = endtag;
+ }
+
+ return ' ';
}
static void
xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
{
- if (ignore && ((tl == 6 && !strcasecmp(t, "script")) ||
- (tl == 5 && !strcasecmp(t, "style"))))
- ignore = 0;
- if (istitle && tl == 5 && !strcasecmp(t, "title")) {
- putchar('\n');
- exit(0);
- }
+ putchar('\n');
+ exit(0);
}
/* data and CDATA */
@@ -46,9 +56,6 @@ xmldata(XMLParser *p, const char *d, size_t dl)
{
size_t i;
- if (!istitle)
- return;
-
for (i = 0; *d && i < dl; i++, d++) {
if (iscntrl((unsigned char)*d))
putchar(' ');
@@ -63,15 +70,30 @@ xmldataentity(XMLParser *p, const char *d, size_t dl)
char buf[16];
ssize_t len;
- if (!istitle)
- return;
-
if ((len = xml_entitytostr(d, buf, sizeof(buf))))
xmldata(p, buf, (size_t)len);
else
xmldata(p, d, dl);
}
+static void
+xmltagstart(XMLParser *p, const char *t, size_t tl)
+{
+ if (tl == 6 && !strcasecmp(t, "script")) {
+ state = endtag = "</script>";
+ getnext = p->getnext; /* for restore */
+ p->getnext = getchar_ignore;
+ } else if (tl == 5 && !strcasecmp(t, "style")) {
+ state = endtag = "</style>";
+ getnext = p->getnext; /* for restore */
+ p->getnext = getchar_ignore;
+ } else if (tl == 5 && !strcasecmp(t, "title")) {
+ p->xmltagend = xmltagend;
+ p->xmlcdata = p->xmldata = xmldata;
+ p->xmldataentity = xmldataentity;
+ }
+}
+
int
main(int argc, char *argv[])
{
@@ -81,11 +103,6 @@ main(int argc, char *argv[])
}
parser.xmltagstart = xmltagstart;
- parser.xmltagend = xmltagend;
- parser.xmldata = xmldata;
- parser.xmlcdata = xmldata;
- parser.xmldataentity = xmldataentity;
-
parser.getnext = getchar;
xml_parse(&parser);