improve whitespace handling (needs more work) - webdump - [FORK] git://git.codemadness.org/webdump
(HTM) git clone git://git.z3bra.org/webdump.git
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit ea14e82082be78917aaa6e380879c0e230330b47
(DIR) parent 421341e1a2b737cb269a144a1634511705161651
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Sat, 22 Jul 2017 23:49:01 +0200
improve whitespace handling (needs more work)
Diffstat:
M Makefile | 2 +-
M main.c | 108 ++++++++++++++++++++++---------
2 files changed, 78 insertions(+), 32 deletions(-)
---
(DIR) diff --git a/Makefile b/Makefile
@@ -1,5 +1,5 @@
build: clean
- cc xml.c main.c -o main
+ cc -ggdb -O0 -Wall xml.c main.c -o main
clean:
rm -f main *.o
(DIR) diff --git a/main.c b/main.c
@@ -10,10 +10,9 @@
#include "xml.h"
/* string and size */
-#define STRP(s) s,sizeof(s)-1
+/*#define STRP(s) s,sizeof(s)-1*/
static XMLParser parser;
-static int isdatastart;
struct node {
char tag[256];
@@ -23,6 +22,15 @@ struct node {
int isblock;
};
+typedef struct node Node;
+
+/* String data / memory pool */
+typedef struct string {
+ char *data; /* data */
+ size_t len; /* string length */
+ size_t bufsiz; /* allocated size */
+} String;
+
static char src[4096]; /* src or href attribute */
#define MAX_DEPTH 256
@@ -70,58 +78,96 @@ static char *blocktags[] = {
"table",
};
+static String htmldata;
+
+/* Clear string only; don't free, prevents unnecessary reallocation. */
static void
-xmlcdata(XMLParser *p, const char *data, size_t datalen)
+string_clear(String *s)
{
- fputs(data, stdout);
+ if (s->data)
+ s->data[0] = '\0';
+ s->len = 0;
}
static void
-xmldatastart(XMLParser *p)
+string_buffer_realloc(String *s, size_t newlen)
{
- isdatastart = 1;
+ size_t alloclen;
+
+ for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
+ ;
+ if (!(s->data = realloc(s->data, alloclen)))
+ err(1, "realloc");
+ s->bufsiz = alloclen;
}
static void
-xmldataend(XMLParser *p)
+string_append(String *s, const char *data, size_t len)
{
- isdatastart = 0;
+ if (!len)
+ return;
+ /* check if allocation is necessary, don't shrink buffer,
+ * should be more than bufsiz of course. */
+ if (s->len + len >= s->bufsiz)
+ string_buffer_realloc(s, s->len + len + 1);
+ memcpy(s->data + s->len, data, len);
+ s->len += len;
+ s->data[s->len] = '\0';
}
static void
-xmldata(XMLParser *p, const char *data, size_t datalen)
+xmlcdata(XMLParser *p, const char *data, size_t datalen)
+{
+ fputs(data, stdout);
+}
+
+static void
+xmldataend(XMLParser *p)
{
struct node *cur;
- const char *s = data;
+ char *start, *s, *e;
cur = &nodes[curnode];
- if (cur->isignore)
- goto end;
- /* TODO: if not <pre> or w/e, skip? */
- if (isdatastart && isspace(*s)) {
- for (s++; *s; s++) {
- if (!isspace(*s))
- break;
- }
- putchar(' ');
- }
+ start = htmldata.data;
+ for (s = start; *s; s++)
+ if (*s != '\r' && *s != '\n')
+ break;
+
+ e = s + strlen(s);
+ for (; e > s; e--)
+ if (*e != '\r' && *e != '\n')
+ break;
if (cur->ispre) {
- for (; *s; s++)
- putchar(*s);
+ fwrite(s, 1, e - s, stdout);
} else {
- for (; *s; s++) {
- if (isspace(*s))
- putchar(' ');
- else
+ for (; s < e; s++) {
+ if (!isspace(*s))
+ break;
+ }
+ for (; s < e; s++) {
+ if (!isspace(*s)) {
+ if (s != start && isspace(s[-1]))
+ putchar(' ');
putchar(*s);
+ }
}
}
-end:
- /* TODO: remove trailing space also ? */
- isdatastart = 0;
+ string_clear(&htmldata);
+}
+
+static void
+xmldata(XMLParser *p, const char *data, size_t datalen)
+{
+ struct node *cur;
+
+ cur = &nodes[curnode];
+ if (cur->isignore)
+ return;
+
+ string_append(&htmldata, data, datalen);
}
static void
@@ -136,7 +182,7 @@ xmldataentity(XMLParser *p, const char *data, size_t datalen)
if (n <= 0)
xmldata(p, data, datalen);
else
- fputs(buf, stdout);
+ string_append(&htmldata, buf, n);
}
static void
@@ -254,6 +300,7 @@ xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name,
strlcpy(src, value, sizeof(src));
}
+/* TODO: preprocess data, strip <script>, <style> etc */
int
main(void)
{
@@ -262,7 +309,6 @@ main(void)
parser.xmlattr = xmlattr;
parser.xmlcdata = xmlcdata;
- parser.xmldatastart = xmldatastart;
parser.xmldata = xmldata;
parser.xmldataend = xmldataend;
parser.xmldataentity = xmldataentity;