z3bra.org

       refactor nbsp handling and named entities to codepoints - webdump - [FORK] git://git.codemadness.org/webdump
 (HTM) git clone git://git.z3bra.org/webdump.git
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 8ef0a95fe47d42ce8627c8aa7232a9eb1d4a172e
 (DIR) parent fc07cb6c73b027722f67b491899d9fc7f5fff505
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Wed, 20 Nov 2019 23:29:30 +0100
       
       refactor nbsp handling and named entities to codepoints
       
       Diffstat:
         M webdump.c                           |       9 +--------
         M xml.c                               |      38 +++++++++++++++++++------------
       
       2 files changed, 24 insertions(+), 23 deletions(-)
       ---
 (DIR) diff --git a/webdump.c b/webdump.c
       @@ -533,15 +533,8 @@ xmldataentity(XMLParser *p, const char *data, size_t datalen)
                /* rsquo, hellip, ndash, lsquo */
                /* TODO: support some more HTML entities */
        
       -        n = 0;
       -        if (!strcmp(data, "&nbsp;") || !strcmp(data, "&NBSP;")) {
       -                memcpy(buf, "\xc2\xa0", 3); /* UTF-8: nbsp */
       -                n = 2;
       -        }
       -
                /* convert basic XML entities */
       -        if (n <= 0)
       -                n = xml_entitytostr(data, buf, sizeof(buf));
       +        n = xml_entitytostr(data, buf, sizeof(buf));
                if (n > 0)
                        xmldata(p, buf, (size_t)n);
                else
 (DIR) diff --git a/xml.c b/xml.c
       @@ -248,30 +248,38 @@ namedentitytostr(const char *e, char *buf, size_t bufsiz)
        {
                static const struct {
                        const char *entity;
       -                int c;
       +                long cp;
                } entities[] = {
       -                { "amp;",  '&'  },
       -                { "lt;",   '<'  },
       -                { "gt;",   '>'  },
       -                { "apos;", '\'' },
       -                { "quot;", '"'  },
       -                { "AMP;",  '&'  },
       -                { "LT;",   '<'  },
       -                { "GT;",   '>'  },
       -                { "APOS;", '\'' },
       -                { "QUOT;", '"'  },
       +                { "amp;",   '&'  },
       +                { "lt;",    '<'  },
       +                { "gt;",    '>'  },
       +                { "apos;",  '\'' },
       +                { "quot;",  '\"'  },
       +                { "ndash;", 0x2013 },
       +                { "mdash;", 0x2014 },
       +                { "nbsp;",  0x00a0 },
       +                { "copy;",  0x00a9 },
       +                { "AMP;",   '&'  },
       +                { "LT;",    '<'  },
       +                { "GT;",    '>'  },
       +                { "APOS;",  '\'' },
       +                { "QUOT;",  '\"'  },
       +                { "ndash;", 0x2013 },
       +                { "MDASH;", 0x2014 },
       +                { "NBSP;",  0x00a0 },
       +                { "COPY;",  0x00a9 },
                };
                size_t i;
        
                /* buffer is too small */
       -        if (bufsiz < 2)
       +        if (bufsiz < 5)
                        return -1;
        
                for (i = 0; i < sizeof(entities) / sizeof(*entities); i++) {
                        if (!strcmp(e, entities[i].entity)) {
       -                        buf[0] = entities[i].c;
       -                        buf[1] = '\0';
       -                        return 1;
       +                        i = codepointtoutf8(entities[i].cp, buf);
       +                        buf[i] = '\0';
       +                        return i;
                        }
                }
                return 0;