sync xml.{c,h} - grabtitle - stupid HTML title grabber
(HTM) git clone git://git.codemadness.org/grabtitle
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit 8e2bee7e85c6a6fbdb2b9ef84c69f8f74ab5b77c
(DIR) parent 0ffe161701f6f9ecde66204f5784e6709d647a1e
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Sat, 30 May 2020 13:36:43 +0200
sync xml.{c,h}
Diffstat:
M xml.c | 113 ++++++++++++++-----------------
M xml.h | 5 +++++
2 files changed, 55 insertions(+), 63 deletions(-)
---
(DIR) diff --git a/xml.c b/xml.c
@@ -15,7 +15,7 @@ xml_parseattrs(XMLParser *x)
size_t namelen = 0;
int c, endsep, endname = 0, valuestart = 0;
- while ((c = x->getnext()) != EOF) {
+ while ((c = GETNEXT()) != EOF) {
if (isspace(c)) {
if (namelen)
endname = 1;
@@ -32,12 +32,12 @@ xml_parseattrs(XMLParser *x)
/* attribute with value */
if (c == '\'' || c == '"') {
endsep = c;
- while ((c = x->getnext()) != EOF) {
+ while ((c = GETNEXT()) != EOF) {
if (c == endsep)
break;
}
} else {
- while ((c = x->getnext()) != EOF) {
+ while ((c = GETNEXT()) != EOF) {
if (c == '>' || isspace(c))
break;
}
@@ -61,7 +61,7 @@ xml_parsecomment(XMLParser *x)
size_t i = 0;
int c;
- while ((c = x->getnext()) != EOF) {
+ while ((c = GETNEXT()) != EOF) {
if (c == '-') {
if (i < 2)
i++;
@@ -79,7 +79,7 @@ xml_parsecdata(XMLParser *x)
size_t datalen = 0, i = 0;
int c;
- while ((c = x->getnext()) != EOF) {
+ while ((c = GETNEXT()) != EOF) {
if (c == ']' || c == '>') {
if (x->xmlcdata) {
x->data[datalen] = '\0';
@@ -147,44 +147,42 @@ codepointtoutf8(long r, char *s)
}
}
+struct namedentity {
+ const char *entity;
+ long cp;
+};
+
+int
+namedentitycmp(const void *v1, const void *v2)
+{
+ struct namedentity *n1 = (struct namedentity *)v1;
+ struct namedentity *n2 = (struct namedentity *)v2;
+
+ return strcmp(n1->entity, n2->entity);
+}
+
static int
namedentitytostr(const char *e, char *buf, size_t bufsiz)
{
- static const struct {
- char *entity;
- int c;
- } entities[] = {
- { "&amp;", '&' },
- { "&lt;", '<' },
- { "&gt;", '>' },
- { "&apos;", '\'' },
- { "&quot;", '"' },
- { "&nbsp;", ' ' },
- { "&AMP;", '&' },
- { "&LT;", '<' },
- { "&GT;", '>' },
- { "&APOS;", '\'' },
- { "&QUOT;", '"' },
- { "&NBSP;", ' ' },
+ static const struct namedentity entities[] = {
+#include "namedentities.h"
};
+ struct namedentity find, *found;
size_t i;
/* buffer is too small */
- if (bufsiz < 2)
+ if (bufsiz < 5)
return -1;
- /* doesn't start with &: can't match */
- if (*e != '&')
- return 0;
-
- for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
- if (!strcmp(e, entities[i].entity)) {
- buf[0] = entities[i].c;
- buf[1] = '\0';
- return 1;
- }
+ find.entity = e;
+ found = bsearch(&find, entities, sizeof(entities) / sizeof(*entities),
+ sizeof(*entities), namedentitycmp);
+ if (found) {
+ i = codepointtoutf8(found->cp, buf);
+ buf[i] = '\0';
+ return i;
}
- return 0;
+ return -1;
}
static int
@@ -198,21 +196,15 @@ numericentitytostr(const char *e, char *buf, size_t bufsiz)
if (bufsiz < 5)
return -1;
- /* not a numeric entity */
- if (e[0] != '&' || e[1] != '#')
- return 0;
-
- /* e[1] == '#', numeric / hexadecimal entity */
- e += 2; /* skip "&#" */
errno = 0;
/* hex (16) or decimal (10) */
if (*e == 'x')
- l = strtoul(e + 1, &end, 16);
+ l = strtol(++e, &end, 16);
else
- l = strtoul(e, &end, 10);
- /* invalid value or not a well-formed entity or too high codepoint */
- if (errno || *end != ';' || l > 0x10FFFF)
- return 0;
+ l = strtol(e, &end, 10);
+ /* invalid value or not a well-formed entity or invalid codepoint */
+ if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff)
+ return -1;
len = codepointtoutf8(l, buf);
buf[len] = '\0';
@@ -220,21 +212,18 @@ numericentitytostr(const char *e, char *buf, size_t bufsiz)
}
/* convert named- or numeric entity string to buffer string
- * returns byte-length of string. */
+ * returns byte-length of string or -1 on failure. */
int
xml_entitytostr(const char *e, char *buf, size_t bufsiz)
{
- /* buffer is too small */
- if (bufsiz < 5)
- return -1;
/* doesn't start with & */
if (e[0] != '&')
- return 0;
- /* named entity */
- if (e[1] != '#')
- return namedentitytostr(e, buf, bufsiz);
- else /* numeric entity */
- return numericentitytostr(e, buf, bufsiz);
+ return -1;
+ /* numeric entity */
+ if (e[1] == '#')
+ return numericentitytostr(e + 2, buf, bufsiz);
+ else /* named entity */
+ return namedentitytostr(e + 1, buf, bufsiz);
}
void
@@ -243,18 +232,16 @@ xml_parse(XMLParser *x)
size_t datalen, tagdatalen;
int c, isend;
- if (!x->getnext)
- return;
- while ((c = x->getnext()) != EOF && c != '<')
+ while ((c = GETNEXT()) != EOF && c != '<')
; /* skip until < */
while (c != EOF) {
if (c == '<') { /* parse tag */
- if ((c = x->getnext()) == EOF)
+ if ((c = GETNEXT()) == EOF)
return;
if (c == '!') { /* cdata and comments */
- for (tagdatalen = 0; (c = x->getnext()) != EOF;) {
+ for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
/* NOTE: sizeof(x->data) must be atleast sizeof("[CDATA[") */
if (tagdatalen <= sizeof("[CDATA[") - 1)
x->data[tagdatalen++] = c;
@@ -282,13 +269,13 @@ xml_parse(XMLParser *x)
if (c == '?') {
x->isshorttag = 1;
} else if (c == '/') {
- if ((c = x->getnext()) == EOF)
+ if ((c = GETNEXT()) == EOF)
return;
x->tag[0] = c;
isend = 1;
}
- while ((c = x->getnext()) != EOF) {
+ while ((c = GETNEXT()) != EOF) {
if (c == '/')
x->isshorttag = 1; /* short tag */
else if (c == '>' || isspace(c)) {
@@ -320,7 +307,7 @@ xml_parse(XMLParser *x)
} else {
/* parse tag data */
datalen = 0;
- while ((c = x->getnext()) != EOF) {
+ while ((c = GETNEXT()) != EOF) {
if (c == '&') {
if (datalen) {
x->data[datalen] = '\0';
@@ -329,7 +316,7 @@ xml_parse(XMLParser *x)
}
x->data[0] = c;
datalen = 1;
- while ((c = x->getnext()) != EOF) {
+ while ((c = GETNEXT()) != EOF) {
if (c == '<')
break;
if (datalen < sizeof(x->data) - 1)
(DIR) diff --git a/xml.h b/xml.h
@@ -1,3 +1,6 @@
+#ifndef _XML_H
+#define _XML_H
+
typedef struct xmlparser {
/* handlers */
void (*xmlcdata)(struct xmlparser *, const char *, size_t);
@@ -6,6 +9,7 @@ typedef struct xmlparser {
void (*xmltagend)(struct xmlparser *, const char *, size_t, int);
void (*xmltagstart)(struct xmlparser *, const char *, size_t);
+#define GETNEXT (x)->getnext
int (*getnext)(void);
/* current tag */
@@ -19,3 +23,4 @@ typedef struct xmlparser {
int xml_entitytostr(const char *, char *, size_t);
void xml_parse(XMLParser *);
+#endif