remove preprocess code, compare tags and attribute case-insensitive - webdump - [FORK] git://git.codemadness.org/webdump
(HTM) git clone git://git.z3bra.org/webdump.git
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit 54f38abd3722c07e900820343e7c5288c6b0fdce
(DIR) parent dacc8c21011cdd6f6c9dc4ebd177478b2151a2c1
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Sun, 20 Aug 2017 20:56:39 +0200
remove preprocess code, compare tags and attribute case-insensitive
idea to ignore literal tags (HTML).
Diffstat:
M main.c | 103 ++++++++++++-------------------
1 file changed, 40 insertions(+), 63 deletions(-)
---
(DIR) diff --git a/main.c b/main.c
@@ -19,7 +19,7 @@ static XMLParser parser;
struct node {
char tag[256];
-/* int isignore;*/
+ int isignore;
int ispre;
int isinline;
int isblock;
@@ -85,7 +85,6 @@ static char *blocktags[] = {
};
static String htmldata;
-static String preprocess;
/* Clear string only; don't free, prevents unnecessary reallocation. */
static void
@@ -204,8 +203,8 @@ xmldata(XMLParser *p, const char *data, size_t datalen)
cur = &nodes[curnode];
string_append(&htmldata, data, datalen);
-/* if (cur->isignore)
- return;*/
+ if (cur->isignore)
+ return;
}
static void
@@ -233,28 +232,28 @@ xmltagstart(XMLParser *p, const char *tag, size_t taglen)
src[0] = '\0'; /* src, href */
strlcpy(cur->tag, tag, sizeof(cur->tag));
-#if 0
+#if 1
for (i = 0; i < sizeof(ignoretags) / sizeof(*ignoretags); i++) {
- if (!strcmp(ignoretags[i], tag)) {
+ if (!strcasecmp(ignoretags[i], tag)) {
cur->isignore = 1;
break;
}
}
#endif
for (i = 0; i < sizeof(pretags) / sizeof(*pretags); i++) {
- if (!strcmp(pretags[i], tag)) {
+ if (!strcasecmp(pretags[i], tag)) {
cur->ispre = 1;
break;
}
}
for (i = 0; i < sizeof(blocktags) / sizeof(*blocktags); i++) {
- if (!strcmp(blocktags[i], tag)) {
+ if (!strcasecmp(blocktags[i], tag)) {
cur->isblock = 1;
break;
}
}
for (i = 0; i < sizeof(inlinetags) / sizeof(*inlinetags); i++) {
- if (!strcmp(inlinetags[i], tag)) {
+ if (!strcasecmp(inlinetags[i], tag)) {
cur->isinline = 1;
break;
}
@@ -270,8 +269,8 @@ xmltagend(XMLParser *p, const char *tag, size_t taglen, int isshort)
if (curnode)
curnode--;
cur = &nodes[curnode];
-/* if (cur->isignore)
- return;*/
+ if (cur->isignore)
+ return;
#if 0
if (src[0])
@@ -279,7 +278,7 @@ xmltagend(XMLParser *p, const char *tag, size_t taglen, int isshort)
src[0] = '\0';
#endif
- if (!strcmp(tag, "tr"))
+ if (!strcasecmp(tag, "tr"))
fputs(" | ", stdout); /* HACK */
if (cur->isblock)
@@ -300,31 +299,47 @@ static void
xmltagstartparsed(XMLParser *p, const char *tag, size_t taglen, int isshort)
{
struct node *cur;
- int i;
+ int c, i;
cur = &nodes[curnode];
-/* if (cur->isignore)
- return;*/
+ if (cur->isignore) {
+#if 0
+ /* HACK: ignored tag is parsed, hook into reader and read raw data
+ until literal end tag (without using the normal parser). */
+
+ /* TODO: process (buffered) as xml[c]data (no entity) */
+ while ((c = getchar()) != EOF) {
+ if (c == '<') {
+ /* TODO: check /endtag */
+ break;
+ }
+ }
+ if (c == EOF) {
+ }
+
+#endif
+ return;
+ }
if (cur->isblock)
fputs("\n", stdout);
- if (!strcmp(tag, "td") || !strcmp(tag, "th"))
+ if (!strcasecmp(tag, "td") || !strcasecmp(tag, "th"))
fputs(" | ", stdout); /* HACK */
- if (!strcmp(cur->tag, "li")) {
+ if (!strcasecmp(cur->tag, "li")) {
/* indent nested list items */
for (i = curnode; i; i--) {
- if (!strcmp(nodes[i].tag, "li"))
+ if (!strcasecmp(nodes[i].tag, "li"))
continue;
- if (!strcmp(nodes[i].tag, "ul") ||
- !strcmp(nodes[i].tag, "ol"))
+ if (!strcasecmp(nodes[i].tag, "ul") ||
+ !strcasecmp(nodes[i].tag, "ol"))
fputs(" ", stdout);
}
/* TODO: for <ol>, keep list counter on ol element (parent),
support ordered number type only */
fputs("* ", stdout);
- } else if (!strcmp(cur->tag, "hr")) {
+ } else if (!strcasecmp(cur->tag, "hr")) {
for (i = 0; i < 36; i++)
putchar('-');
}
@@ -338,65 +353,27 @@ static void
xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name,
size_t namelen, const char *value, size_t valuelen)
{
- if (!strcmp(tag, "a") && !strcmp(name, "href") && valuelen)
+ if (!strcasecmp(tag, "a") && !strcasecmp(name, "href") && valuelen)
strlcpy(src, value, sizeof(src));
- if ((!strcmp(tag, "img") || !strcmp(tag, "video") || !strcmp(tag, "audio")) &&
- !strcmp(name, "src") && valuelen)
+ if ((!strcasecmp(tag, "img") || !strcasecmp(tag, "video") ||
+ !strcasecmp(tag, "audio")) &&
+ !strcasecmp(name, "src") && valuelen)
strlcpy(src, value, sizeof(src));
}
-/*static size_t readoffset;*/
-
int
readchar(void)
{
return getchar();
-#if 0
- size_t i, j;
- int c;
-
- for (; readoffset < preprocess.len; ) {
- if (preprocess.data[read_offset] != '<')
- return preprocess.data[read_offset++];
-
- for (j = 0; j < sizeof(ignoretags) / sizeof(*ignoretags); j++) {
- if (!strncmp(&preprocess.data[i + 1], ignoretags[i], sizeof(ignoretags[i]) - 1)) {
- if (strchr(" \t>", preprocess.data[i + 1 + sizeof(ignoretags[i]) - 1])) {
- /* TODO: search until end of this tag */
- }
- }
- }
- /* TODO: if no match just return char */
- return preprocess.data[read_offset++];
- }
- return EOF;
-#endif
}
-/* TODO: preprocess data, strip <script>, <style> etc */
int
main(void)
{
-
- char buf[BUFSIZ];
- int n;
-
if (pledge("stdio", NULL) < 0)
err(1, "pledge");
-#if 0
- /* TODO: optimize later */
- while (1) {
- /* TODO: check read error */
- n = read(0, buf, sizeof(buf) - 1);
- if (n <= 0)
- break;
- buf[n] = '\0';
- string_append(&preprocess, buf, n);
- }
-#endif
-
parser.xmlattr = xmlattr;
parser.xmlcdata = xmlcdata;
parser.xmldata = xmldata;