sync xml.{c,h} - grabtitle - stupid HTML title grabber
 (HTM) git clone git://git.codemadness.org/grabtitle
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 8e2bee7e85c6a6fbdb2b9ef84c69f8f74ab5b77c
 (DIR) parent 0ffe161701f6f9ecde66204f5784e6709d647a1e
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Sat, 30 May 2020 13:36:43 +0200
       
       sync xml.{c,h}
       
       Diffstat:
         M xml.c                               |     113 ++++++++++++++-----------------
         M xml.h                               |       5 +++++
       
       2 files changed, 55 insertions(+), 63 deletions(-)
       ---
 (DIR) diff --git a/xml.c b/xml.c
       @@ -15,7 +15,7 @@ xml_parseattrs(XMLParser *x)
                size_t namelen = 0;
                int c, endsep, endname = 0, valuestart = 0;
        
       -        while ((c = x->getnext()) != EOF) {
       +        while ((c = GETNEXT()) != EOF) {
                        if (isspace(c)) {
                                if (namelen)
                                        endname = 1;
       @@ -32,12 +32,12 @@ xml_parseattrs(XMLParser *x)
                                /* attribute with value */
                                if (c == '\'' || c == '"') {
                                        endsep = c;
       -                                while ((c = x->getnext()) != EOF) {
       +                                while ((c = GETNEXT()) != EOF) {
                                                if (c == endsep)
                                                        break;
                                        }
                                } else {
       -                                while ((c = x->getnext()) != EOF) {
       +                                while ((c = GETNEXT()) != EOF) {
                                                if (c == '>' || isspace(c))
                                                        break;
                                        }
       @@ -61,7 +61,7 @@ xml_parsecomment(XMLParser *x)
                size_t i = 0;
                int c;
        
       -        while ((c = x->getnext()) != EOF) {
       +        while ((c = GETNEXT()) != EOF) {
                        if (c == '-') {
                                if (i < 2)
                                        i++;
       @@ -79,7 +79,7 @@ xml_parsecdata(XMLParser *x)
                size_t datalen = 0, i = 0;
                int c;
        
       -        while ((c = x->getnext()) != EOF) {
       +        while ((c = GETNEXT()) != EOF) {
                        if (c == ']' || c == '>') {
                                if (x->xmlcdata) {
                                        x->data[datalen] = '\0';
       @@ -147,44 +147,42 @@ codepointtoutf8(long r, char *s)
                }
        }
        
       +struct namedentity {
       +        const char *entity;
       +        long cp;
       +};
       +
       +int
       +namedentitycmp(const void *v1, const void *v2)
       +{
       +        struct namedentity *n1 = (struct namedentity *)v1;
       +        struct namedentity *n2 = (struct namedentity *)v2;
       +
       +        return strcmp(n1->entity, n2->entity);
       +}
       +
        static int
        namedentitytostr(const char *e, char *buf, size_t bufsiz)
        {
       -        static const struct {
       -                char *entity;
       -                int c;
       -        } entities[] = {
       -                { "&amp;",  '&'  },
       -                { "&lt;",   '<'  },
       -                { "&gt;",   '>'  },
       -                { "&apos;", '\'' },
       -                { "&quot;", '"'  },
       -                { "&nbsp;", ' '  },
       -                { "&AMP;",  '&'  },
       -                { "&LT;",   '<'  },
       -                { "&GT;",   '>'  },
       -                { "&APOS;", '\'' },
       -                { "&QUOT;", '"'  },
       -                { "&NBSP;", ' '  },
       +        static const struct namedentity entities[] = {
       +#include "namedentities.h"
                };
       +        struct namedentity find, *found;
                size_t i;
        
                /* buffer is too small */
       -        if (bufsiz < 2)
       +        if (bufsiz < 5)
                        return -1;
        
       -        /* doesn't start with &: can't match */
       -        if (*e != '&')
       -                return 0;
       -
       -        for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
       -                if (!strcmp(e, entities[i].entity)) {
       -                        buf[0] = entities[i].c;
       -                        buf[1] = '\0';
       -                        return 1;
       -                }
       +        find.entity = e;
       +        found = bsearch(&find, entities, sizeof(entities) / sizeof(*entities),
       +                sizeof(*entities), namedentitycmp);
       +        if (found) {
       +                i = codepointtoutf8(found->cp, buf);
       +                buf[i] = '\0';
       +                return i;
                }
       -        return 0;
       +        return -1;
        }
        
        static int
       @@ -198,21 +196,15 @@ numericentitytostr(const char *e, char *buf, size_t bufsiz)
                if (bufsiz < 5)
                        return -1;
        
       -        /* not a numeric entity */
       -        if (e[0] != '&' || e[1] != '#')
       -                return 0;
       -
       -        /* e[1] == '#', numeric / hexadecimal entity */
       -        e += 2; /* skip "&#" */
                errno = 0;
                /* hex (16) or decimal (10) */
                if (*e == 'x')
       -                l = strtoul(e + 1, &end, 16);
       +                l = strtol(++e, &end, 16);
                else
       -                l = strtoul(e, &end, 10);
       -        /* invalid value or not a well-formed entity or too high codepoint */
       -        if (errno || *end != ';' || l > 0x10FFFF)
       -                return 0;
       +                l = strtol(e, &end, 10);
       +        /* invalid value or not a well-formed entity or invalid codepoint */
       +        if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff)
       +                return -1;
                len = codepointtoutf8(l, buf);
                buf[len] = '\0';
        
       @@ -220,21 +212,18 @@ numericentitytostr(const char *e, char *buf, size_t bufsiz)
        }
        
        /* convert named- or numeric entity string to buffer string
       - * returns byte-length of string. */
       + * returns byte-length of string or -1 on failure. */
        int
        xml_entitytostr(const char *e, char *buf, size_t bufsiz)
        {
       -        /* buffer is too small */
       -        if (bufsiz < 5)
       -                return -1;
                /* doesn't start with & */
                if (e[0] != '&')
       -                return 0;
       -        /* named entity */
       -        if (e[1] != '#')
       -                return namedentitytostr(e, buf, bufsiz);
       -        else /* numeric entity */
       -                return numericentitytostr(e, buf, bufsiz);
       +                return -1;
       +        /* numeric entity */
       +        if (e[1] == '#')
       +                return numericentitytostr(e + 2, buf, bufsiz);
       +        else /* named entity */
       +                return namedentitytostr(e + 1, buf, bufsiz);
        }
        
        void
       @@ -243,18 +232,16 @@ xml_parse(XMLParser *x)
                size_t datalen, tagdatalen;
                int c, isend;
        
       -        if (!x->getnext)
       -                return;
       -        while ((c = x->getnext()) != EOF && c != '<')
       +        while ((c = GETNEXT()) != EOF && c != '<')
                        ; /* skip until < */
        
                while (c != EOF) {
                        if (c == '<') { /* parse tag */
       -                        if ((c = x->getnext()) == EOF)
       +                        if ((c = GETNEXT()) == EOF)
                                        return;
        
                                if (c == '!') { /* cdata and comments */
       -                                for (tagdatalen = 0; (c = x->getnext()) != EOF;) {
       +                                for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
                                                /* NOTE: sizeof(x->data) must be atleast sizeof("[CDATA[") */
                                                if (tagdatalen <= sizeof("[CDATA[") - 1)
                                                        x->data[tagdatalen++] = c;
       @@ -282,13 +269,13 @@ xml_parse(XMLParser *x)
                                        if (c == '?') {
                                                x->isshorttag = 1;
                                        } else if (c == '/') {
       -                                        if ((c = x->getnext()) == EOF)
       +                                        if ((c = GETNEXT()) == EOF)
                                                        return;
                                                x->tag[0] = c;
                                                isend = 1;
                                        }
        
       -                                while ((c = x->getnext()) != EOF) {
       +                                while ((c = GETNEXT()) != EOF) {
                                                if (c == '/')
                                                        x->isshorttag = 1; /* short tag */
                                                else if (c == '>' || isspace(c)) {
       @@ -320,7 +307,7 @@ xml_parse(XMLParser *x)
                        } else {
                                /* parse tag data */
                                datalen = 0;
       -                        while ((c = x->getnext()) != EOF) {
       +                        while ((c = GETNEXT()) != EOF) {
                                        if (c == '&') {
                                                if (datalen) {
                                                        x->data[datalen] = '\0';
       @@ -329,7 +316,7 @@ xml_parse(XMLParser *x)
                                                }
                                                x->data[0] = c;
                                                datalen = 1;
       -                                        while ((c = x->getnext()) != EOF) {
       +                                        while ((c = GETNEXT()) != EOF) {
                                                        if (c == '<')
                                                                break;
                                                        if (datalen < sizeof(x->data) - 1)
 (DIR) diff --git a/xml.h b/xml.h
       @@ -1,3 +1,6 @@
       +#ifndef _XML_H
       +#define _XML_H
       +
        typedef struct xmlparser {
                /* handlers */
                void (*xmlcdata)(struct xmlparser *, const char *, size_t);
       @@ -6,6 +9,7 @@ typedef struct xmlparser {
                void (*xmltagend)(struct xmlparser *, const char *, size_t, int);
                void (*xmltagstart)(struct xmlparser *, const char *, size_t);
        
       +#define GETNEXT (x)->getnext
                int (*getnext)(void);
        
                /* current tag */
       @@ -19,3 +23,4 @@ typedef struct xmlparser {
        
        int xml_entitytostr(const char *, char *, size_t);
        void xml_parse(XMLParser *);
       +#endif