sync XML improvements - tscrape - twitter scraper
(HTM) git clone git://git.codemadness.org/tscrape
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit 5df58d27f557292778cdc5dee306f18db8c980f7
(DIR) parent f8629e681a16fc3af086355a44c942df57291b4b
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Sat, 1 Feb 2020 15:02:27 +0100
sync XML improvements
Diffstat:
M tscrape.c | 8 ++++----
M xml.c | 24 ++++++++----------------
M xml.h | 2 ++
3 files changed, 14 insertions(+), 20 deletions(-)
---
(DIR) diff --git a/tscrape.c b/tscrape.c
@@ -107,10 +107,10 @@ isclassmatch(const char *classes, const char *clss, size_t len)
}
/* convert XML and some HTML entities */
-static ssize_t
+static int
html_entitytostr(const char *s, char *buf, size_t bufsiz)
{
- ssize_t len;
+ int len;
if ((len = xml_entitytostr(s, buf, bufsiz)) > 0)
return len;
@@ -244,7 +244,7 @@ xmlattrentity(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
const char *v, size_t vl)
{
char buf[16];
- ssize_t len;
+ int len;
if (!state)
return;
@@ -267,7 +267,7 @@ static void
xmldataentity(XMLParser *x, const char *d, size_t dl)
{
char buf[16];
- ssize_t len;
+ int len;
if (!(state & Text))
return;
(DIR) diff --git a/xml.c b/xml.c
@@ -1,8 +1,5 @@
-#include <sys/types.h>
-
#include <ctype.h>
#include <errno.h>
-#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -255,11 +252,6 @@ namedentitytostr(const char *e, char *buf, size_t bufsiz)
{ "gt;", '>' },
{ "apos;", '\'' },
{ "quot;", '"' },
- { "AMP;", '&' },
- { "LT;", '<' },
- { "GT;", '>' },
- { "APOS;", '\'' },
- { "QUOT;", '"' }
};
size_t i;
@@ -274,7 +266,7 @@ namedentitytostr(const char *e, char *buf, size_t bufsiz)
return 1;
}
}
- return 0;
+ return -1;
}
static int
@@ -291,12 +283,12 @@ numericentitytostr(const char *e, char *buf, size_t bufsiz)
errno = 0;
/* hex (16) or decimal (10) */
if (*e == 'x')
- l = strtoul(e + 1, &end, 16);
+ l = strtol(++e, &end, 16);
else
- l = strtoul(e, &end, 10);
- /* invalid value or not a well-formed entity or too high codepoint */
- if (errno || *end != ';' || l > 0x10FFFF)
- return 0;
+ l = strtol(e, &end, 10);
+ /* invalid value or not a well-formed entity or invalid codepoint */
+ if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff)
+ return -1;
len = codepointtoutf8(l, buf);
buf[len] = '\0';
@@ -304,13 +296,13 @@ numericentitytostr(const char *e, char *buf, size_t bufsiz)
}
/* convert named- or numeric entity string to buffer string
- * returns byte-length of string. */
+ * returns byte-length of string or -1 on failure. */
int
xml_entitytostr(const char *e, char *buf, size_t bufsiz)
{
/* doesn't start with & */
if (e[0] != '&')
- return 0;
+ return -1;
/* numeric entity */
if (e[1] == '#')
return numericentitytostr(e + 2, buf, bufsiz);
(DIR) diff --git a/xml.h b/xml.h
@@ -1,6 +1,8 @@
#ifndef _XML_H
#define _XML_H
+#include <stdio.h>
+
typedef struct xmlparser {
/* handlers */
void (*xmlattr)(struct xmlparser *, const char *, size_t,