refactor nbsp handling and named entities to codepoints - webdump - [FORK] git://git.codemadness.org/webdump
(HTM) git clone git://git.z3bra.org/webdump.git
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit 8ef0a95fe47d42ce8627c8aa7232a9eb1d4a172e
(DIR) parent fc07cb6c73b027722f67b491899d9fc7f5fff505
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Wed, 20 Nov 2019 23:29:30 +0100
refactor nbsp handling and named entities to codepoints
Diffstat:
M webdump.c | 9 +--------
M xml.c | 38 +++++++++++++++++++------------
2 files changed, 24 insertions(+), 23 deletions(-)
---
(DIR) diff --git a/webdump.c b/webdump.c
@@ -533,15 +533,8 @@ xmldataentity(XMLParser *p, const char *data, size_t datalen)
/* rsquo, hellip, ndash, lsquo */
/* TODO: support some more HTML entities */
- n = 0;
- if (!strcmp(data, "&nbsp;") || !strcmp(data, "&NBSP;")) {
- memcpy(buf, "\xc2\xa0", 3); /* UTF-8: nbsp */
- n = 2;
- }
-
/* convert basic XML entities */
- if (n <= 0)
- n = xml_entitytostr(data, buf, sizeof(buf));
+ n = xml_entitytostr(data, buf, sizeof(buf));
if (n > 0)
xmldata(p, buf, (size_t)n);
else
(DIR) diff --git a/xml.c b/xml.c
@@ -248,30 +248,38 @@ namedentitytostr(const char *e, char *buf, size_t bufsiz)
{
static const struct {
const char *entity;
- int c;
+ long cp;
} entities[] = {
- { "amp;", '&' },
- { "lt;", '<' },
- { "gt;", '>' },
- { "apos;", '\'' },
- { "quot;", '"' },
- { "AMP;", '&' },
- { "LT;", '<' },
- { "GT;", '>' },
- { "APOS;", '\'' },
- { "QUOT;", '"' },
+ { "amp;", '&' },
+ { "lt;", '<' },
+ { "gt;", '>' },
+ { "apos;", '\'' },
+ { "quot;", '\"' },
+ { "ndash;", 0x2013 },
+ { "mdash;", 0x2014 },
+ { "nbsp;", 0x00a0 },
+ { "copy;", 0x00a9 },
+ { "AMP;", '&' },
+ { "LT;", '<' },
+ { "GT;", '>' },
+ { "APOS;", '\'' },
+ { "QUOT;", '\"' },
+ { "ndash;", 0x2013 },
+ { "MDASH;", 0x2014 },
+ { "NBSP;", 0x00a0 },
+ { "COPY;", 0x00a9 },
};
size_t i;
/* buffer is too small */
- if (bufsiz < 2)
+ if (bufsiz < 5)
return -1;
for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
if (!strcmp(e, entities[i].entity)) {
- buf[0] = entities[i].c;
- buf[1] = '\0';
- return 1;
+ i = codepointtoutf8(entities[i].cp, buf);
+ buf[i] = '\0';
+ return i;
}
}
return 0;