testing improve white-space handling - webdump - [FORK] git://git.codemadness.org/webdump
(HTM) git clone git://git.z3bra.org/webdump.git
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit 2a56590cbe1c1739171a28d4c30b5b318cb0b364
(DIR) parent e4a9e2404be2db1687430631e912f1809992a23b
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Sat, 21 Sep 2019 20:02:18 +0200
testing improve white-space handling
Diffstat:
M README | 6 +++---
M TODO | 7 +++++--
M main.c | 53 +++++++++++++++++++++----------
3 files changed, 45 insertions(+), 21 deletions(-)
---
(DIR) diff --git a/README b/README
@@ -1,15 +1,15 @@
NOTE! work-in-progress (very slowly).
-Text-based webpage viewer
+Text-based HTML dump
Goals / scope:
-The tool will render a webpage only to stdout, similarly like links -dump or
+The tool will only render HTML to stdout, similarly to links -dump or
lynx -dump but simpler and more secure.
- It will be usable and secure for rendering HTML mails.
- No remote resources will be downloaded.
- Data will be written to stdout only.
-- No support for Javascript, CSS support, frames or forms.
+- No support for Javascript, CSS support, frames or form input.
(DIR) diff --git a/TODO b/TODO
@@ -1,13 +1,16 @@
+- improve/remove duplicate white-space/newlines?
+- handle whitespace, and tag types properly: atleast: inline-block, inline, block, pre.
- base href.
specify and parse relative url, allow to specify base and also parse <base href="">
-- handle whitespace, and tag types properly: atleast: inline-block, inline, block, pre.
- detect <link /> to RSS/Atom feed, show as link.
example: <link rel="alternate" href="atom.xml" type="application/atom+xml" title="Codemadness Atom Feed" />
or
<link rel="alternate" title="Tweakers Mixed RSS feed" type="application/rss+xml" href="https://tweakers.net/feeds/mixed.xml">
- print safe (not certain control chars, except newline, TAB etc).
-- improve/remove duplicate white-space/newlines?
+- rework parsing of <script> and <style> with unescaped characters like < and >.
- <code> should not be treated as a block (<pre> does?)
+- make the code easy to embed/restructure to make a HTML-to-plain-text converter
+ for HTML in RSS/Atom feeds.
- add links as reference, for example on page: http://absmagazin.de/2018 the MP3 urls.
- add COMPATOBJ for strlcpy and strlcat.
- write a proper Makefile.
(DIR) diff --git a/main.c b/main.c
@@ -402,37 +402,49 @@ xmlcdata(XMLParser *p, const char *data, size_t datalen)
printsafe(data);
}
+#if 0
+static void
+xmldatastart(XMLParser *p)
+{
+// printf("DEBUG: %s\n", __func__);
+}
+#endif
+
static void
xmldataend(XMLParser *p)
{
struct node *cur;
char *start, *s, *e;
+// printf("DEBUG: %s\n", __func__);
+
if (!htmldata.data || !htmldata.len)
return;
cur = &nodes[curnode];
- if (cur->displaytype & DisplayNone) {
+
+// printf("DEBUG: node: %s, type: %d\n", cur->tag, cur->displaytype);
+
+ if (!cur->displaytype || (cur->displaytype & DisplayNone)) {
/* nothing */
} else if (cur->displaytype & DisplayPre) {
fwrite(htmldata.data, 1, htmldata.len, stdout);
} else {
start = htmldata.data;
- s = start;
- e = s + htmldata.len;
- /* TODO: better white-space handling */
- for (; s < e; s++) {
- if (isspace((unsigned char)*s)) {
- if (s != start && !isspace((unsigned char)s[-1]))
+ e = htmldata.data + htmldata.len;
+
+ /* TODO: better white-space handling, for example if there is only
+ white-space between 2 block elements then it can be ignored. */
+ for (s = start; s < e; s++) {
+ if (*s == '\r') {
+ continue;
+ } else if (isspace((unsigned char)*s)) {
+ if (s == start || !isspace((unsigned char)s[-1]))
putchar(' ');
- } else {
- if (!iscntrl((unsigned char)*s))
- putchar(*s);
+ } else if (!iscntrl((unsigned char)*s)) {
+ putchar(*s);
}
}
- if (s != start && e != start && !isspace((unsigned char)s[-1]) &&
- isspace((unsigned char)e[-1]))
- putchar(' ');
}
string_clear(&htmldata);
@@ -479,19 +491,25 @@ xmltagstart(XMLParser *x, const char *t, size_t tl)
struct node *cur;
int i;
+// printf("start of tag: %s\n", t);
+
if (curnode >= MAX_DEPTH - 2)
errx(1, "max tag depth reached: %d\n", curnode);
curnode++;
cur = &nodes[curnode];
memset(cur, 0, sizeof(*cur));
- src[0] = '\0'; /* src, href */
+ cur->displaytype = DisplayInline;
strlcpy(cur->tag, t, sizeof(cur->tag));
+ src[0] = '\0'; /* src, href */
+
/* set display type */
for (i = 0; i < sizeof(tags) / sizeof(*tags); i++) {
if (!strcasecmp(tags[i].tag, t)) {
- cur->displaytype |= tags[i].displaytype;
+ cur->displaytype = tags[i].displaytype;
+// printf("match on tag: %s == %s, displaytype: %d\n",
+// tags[i].tag, t, cur->displaytype);
break;
}
}
@@ -505,6 +523,8 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
cur = &nodes[curnode];
+// printf("DEBUG: end of tag: %s, %d, node tag: %s\n", t, cur->displaytype, cur->tag);
+
if (cur->displaytype & DisplayBlock) {
fputs("\n", stdout);
} else if (cur->displaytype & DisplayPre) {
@@ -609,7 +629,7 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
if (nodes[i].displaytype & DisplayListItem)
continue;
if (nodes[i].displaytype & DisplayList)
- fputs(" ", stdout);
+ fputs(" ", stdout);
}
/* TODO: for <ol>, keep list counter on ol element (parent),
support ordered number type only */
@@ -656,6 +676,7 @@ main(void)
parser.xmlattr = xmlattr;
parser.xmlcdata = xmlcdata;
parser.xmldata = xmldata;
+// parser.xmldatastart = xmldatastart;
parser.xmldataend = xmldataend;
parser.xmldataentity = xmldataentity;
parser.xmltagstart = xmltagstart;