sync XML improvements from sfeed - webdump - [FORK] git://git.codemadness.org/webdump
(HTM) git clone git://git.z3bra.org/webdump.git
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit cd440128ab251321e18dc25802936a30cf25a5e9
(DIR) parent 0ac210d169689c8e8a33351adf6a2d06b9f7322d
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Thu, 27 Jun 2019 19:41:13 +0200
sync XML improvements from sfeed
Diffstat:
M xml.c | 125 ++++++++++++++-----------------
M xml.h | 5 +++++
2 files changed, 63 insertions(+), 67 deletions(-)
---
(DIR) diff --git a/xml.c b/xml.c
t@@ -15,7 +15,7 @@ xml_parseattrs(XMLParser *x)
size_t namelen = 0, valuelen;
int c, endsep, endname = 0, valuestart = 0;
- while ((c = x->getnext()) != EOF) {
+ while ((c = GETNEXT()) != EOF) {
if (isspace(c)) {
if (namelen)
endname = 1;
t@@ -51,7 +51,7 @@ xml_parseattrs(XMLParser *x)
goto startvalue;
}
- while ((c = x->getnext()) != EOF) {
+ while ((c = GETNEXT()) != EOF) {
startvalue:
if (c == '&') { /* entities */
x->data[valuelen] = '\0';
t@@ -60,7 +60,7 @@ startvalue:
x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
x->data[0] = c;
valuelen = 1;
- while ((c = x->getnext()) != EOF) {
+ while ((c = GETNEXT()) != EOF) {
if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c))))
break;
if (valuelen < sizeof(x->data) - 1)
t@@ -124,9 +124,9 @@ xml_parsecomment(XMLParser *x)
if (x->xmlcommentstart)
x->xmlcommentstart(x);
- while ((c = x->getnext()) != EOF) {
+ while ((c = GETNEXT()) != EOF) {
if (c == '-' || c == '>') {
- if (x->xmlcomment) {
+ if (x->xmlcomment && datalen) {
x->data[datalen] = '\0';
x->xmlcomment(x, x->data, datalen);
datalen = 0;
t@@ -173,9 +173,9 @@ xml_parsecdata(XMLParser *x)
if (x->xmlcdatastart)
x->xmlcdatastart(x);
- while ((c = x->getnext()) != EOF) {
+ while ((c = GETNEXT()) != EOF) {
if (c == ']' || c == '>') {
- if (x->xmlcdata) {
+ if (x->xmlcdata && datalen) {
x->data[datalen] = '\0';
x->xmlcdata(x, x->data, datalen);
datalen = 0;
t@@ -247,19 +247,19 @@ static int
namedentitytostr(const char *e, char *buf, size_t bufsiz)
{
static const struct {
- char *entity;
+ const char *entity;
int c;
} entities[] = {
- { "&amp;", '&' },
- { "&lt;", '<' },
- { "&gt;", '>' },
- { "&apos;", '\'' },
- { "&quot;", '"' },
- { "&AMP;", '&' },
- { "&LT;", '<' },
- { "&GT;", '>' },
- { "&APOS;", '\'' },
- { "&QUOT;", '"' }
+ { "amp;", '&' },
+ { "lt;", '<' },
+ { "gt;", '>' },
+ { "apos;", '\'' },
+ { "quot;", '"' },
+ { "AMP;", '&' },
+ { "LT;", '<' },
+ { "GT;", '>' },
+ { "APOS;", '\'' },
+ { "QUOT;", '"' }
};
size_t i;
t@@ -267,10 +267,6 @@ namedentitytostr(const char *e, char *buf, size_t bufsiz)
if (bufsiz < 2)
return -1;
- /* doesn't start with &: can't match */
- if (*e != '&')
- return 0;
-
for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
if (!strcmp(e, entities[i].entity)) {
buf[0] = entities[i].c;
t@@ -292,12 +288,6 @@ numericentitytostr(const char *e, char *buf, size_t bufsiz)
if (bufsiz < 5)
return -1;
- /* not a numeric entity */
- if (e[0] != '&' || e[1] != '#')
- return 0;
-
- /* e[1] == '#', numeric / hexadecimal entity */
- e += 2; /* skip "&#" */
errno = 0;
/* hex (16) or decimal (10) */
if (*e == 'x')
t@@ -318,37 +308,32 @@ numericentitytostr(const char *e, char *buf, size_t bufsiz)
int
xml_entitytostr(const char *e, char *buf, size_t bufsiz)
{
- /* buffer is too small */
- if (bufsiz < 5)
- return -1;
/* doesn't start with & */
if (e[0] != '&')
return 0;
- /* named entity */
- if (e[1] != '#')
- return namedentitytostr(e, buf, bufsiz);
- else /* numeric entity */
- return numericentitytostr(e, buf, bufsiz);
+ /* numeric entity */
+ if (e[1] == '#')
+ return numericentitytostr(e + 2, buf, bufsiz);
+ else /* named entity */
+ return namedentitytostr(e + 1, buf, bufsiz);
}
void
xml_parse(XMLParser *x)
{
- int c, ispi;
- size_t datalen, tagdatalen, taglen;
+ size_t datalen, tagdatalen;
+ int c, isend;
- if (!x->getnext)
- return;
- while ((c = x->getnext()) != EOF && c != '<')
+ while ((c = GETNEXT()) != EOF && c != '<')
; /* skip until < */
while (c != EOF) {
if (c == '<') { /* parse tag */
- if ((c = x->getnext()) == EOF)
+ if ((c = GETNEXT()) == EOF)
return;
if (c == '!') { /* cdata and comments */
- for (tagdatalen = 0; (c = x->getnext()) != EOF;) {
+ for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
/* NOTE: sizeof(x->data) must be atleast sizeof("[CDATA[") */
if (tagdatalen <= sizeof("[CDATA[") - 1)
x->data[tagdatalen++] = c;
t@@ -367,30 +352,32 @@ xml_parse(XMLParser *x)
}
}
} else {
- x->tag[0] = '\0';
- x->taglen = 0;
-
/* normal tag (open, short open, close), processing instruction. */
- if (isspace(c))
- while ((c = x->getnext()) != EOF && isspace(c))
- ;
- if (c == EOF)
- return;
x->tag[0] = c;
- ispi = (c == '?') ? 1 : 0;
- x->isshorttag = ispi;
- taglen = 1;
- while ((c = x->getnext()) != EOF) {
+ x->taglen = 1;
+ x->isshorttag = isend = 0;
+
+ /* treat processing instruction as shorttag, don't strip "?" prefix. */
+ if (c == '?') {
+ x->isshorttag = 1;
+ } else if (c == '/') {
+ if ((c = GETNEXT()) == EOF)
+ return;
+ x->tag[0] = c;
+ isend = 1;
+ }
+
+ while ((c = GETNEXT()) != EOF) {
if (c == '/')
x->isshorttag = 1; /* short tag */
else if (c == '>' || isspace(c)) {
- x->tag[taglen] = '\0';
- if (x->tag[0] == '/') { /* end tag, starts with </ */
- x->taglen = --taglen; /* len -1 because of / */
- if (taglen && x->xmltagend)
- x->xmltagend(x, &(x->tag)[1], x->taglen, 0);
+ x->tag[x->taglen] = '\0';
+ if (isend) { /* end tag, starts with </ */
+ if (x->xmltagend)
+ x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
+ x->tag[0] = '\0';
+ x->taglen = 0;
} else {
- x->taglen = taglen;
/* start tag */
if (x->xmltagstart)
x->xmltagstart(x, x->tag, x->taglen);
t@@ -400,11 +387,15 @@ xml_parse(XMLParser *x)
x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
}
/* call tagend for shortform or processing instruction */
- if ((x->isshorttag || ispi) && x->xmltagend)
- x->xmltagend(x, x->tag, x->taglen, 1);
+ if (x->isshorttag) {
+ if (x->xmltagend)
+ x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
+ x->tag[0] = '\0';
+ x->taglen = 0;
+ }
break;
- } else if (taglen < sizeof(x->tag) - 1)
- x->tag[taglen++] = c; /* NOTE: tag name truncation */
+ } else if (x->taglen < sizeof(x->tag) - 1)
+ x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
}
}
} else {
t@@ -412,7 +403,7 @@ xml_parse(XMLParser *x)
datalen = 0;
if (x->xmldatastart)
x->xmldatastart(x);
- while ((c = x->getnext()) != EOF) {
+ while ((c = GETNEXT()) != EOF) {
if (c == '&') {
if (datalen) {
x->data[datalen] = '\0';
t@@ -421,7 +412,7 @@ xml_parse(XMLParser *x)
}
x->data[0] = c;
datalen = 1;
- while ((c = x->getnext()) != EOF) {
+ while ((c = GETNEXT()) != EOF) {
if (c == '<')
break;
if (datalen < sizeof(x->data) - 1)
(DIR) diff --git a/xml.h b/xml.h
t@@ -1,3 +1,6 @@
+#ifndef _XML_H
+#define _XML_H
+
typedef struct xmlparser {
/* handlers */
void (*xmlattr)(struct xmlparser *, const char *, size_t,
t@@ -23,6 +26,7 @@ typedef struct xmlparser {
void (*xmltagstartparsed)(struct xmlparser *, const char *,
size_t, int);
+ #define GETNEXT (x)->getnext
int (*getnext)(void);
/* current tag */
t@@ -38,3 +42,4 @@ typedef struct xmlparser {
int xml_entitytostr(const char *, char *, size_t);
void xml_parse(XMLParser *);
+#endif