XML tag parse improvements for PI and end tags - grabtitle - stupid HTML title grabber
 (HTM) git clone git://git.codemadness.org/grabtitle
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 3054084945aae1ceb22b87c9132dea71cf9e5108
 (DIR) parent d908478d0f84bc275428fd71e934c993bb29211c
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Mon, 10 Dec 2018 19:03:02 +0100
       
       XML tag parse improvements for PI and end tags
       
       - Stricter parsing of tags, no whitespace stripping after <.
       - For end tags the "internal" context x->tag would be "/sometag". Make sure
         this matches exactly with the parameter tag.
       - Reset tagname after parsing an end tag.
       - Make end tag handling more consistent.
       - Remove temporary variable taglen.
       
       Diffstat:
         M xml.c                               |      52 +++++++++++++++++--------------
       
       1 file changed, 29 insertions(+), 23 deletions(-)
       ---
 (DIR) diff --git a/xml.c b/xml.c
       @@ -334,8 +334,8 @@ xml_entitytostr(const char *e, char *buf, size_t bufsiz)
        void
        xml_parse(XMLParser *x)
        {
       -        int c, ispi;
       -        size_t datalen, tagdatalen, taglen;
       +        size_t datalen, tagdatalen;
       +        int c, isend;
        
                if (!x->getnext)
                        return;
       @@ -367,30 +367,32 @@ xml_parse(XMLParser *x)
                                                }
                                        }
                                } else {
       -                                x->tag[0] = '\0';
       -                                x->taglen = 0;
       -
                                        /* normal tag (open, short open, close), processing instruction. */
       -                                if (isspace(c))
       -                                        while ((c = x->getnext()) != EOF && isspace(c))
       -                                                ;
       -                                if (c == EOF)
       -                                        return;
                                        x->tag[0] = c;
       -                                ispi = (c == '?') ? 1 : 0;
       -                                x->isshorttag = ispi;
       -                                taglen = 1;
       +                                x->taglen = 1;
       +                                x->isshorttag = isend = 0;
       +
       +                                /* treat processing instruction as shorttag, don't strip "?" prefix. */
       +                                if (c == '?') {
       +                                        x->isshorttag = 1;
       +                                } else if (c == '/') {
       +                                        if ((c = x->getnext()) == EOF)
       +                                                return;
       +                                        x->tag[0] = c;
       +                                        isend = 1;
       +                                }
       +
                                        while ((c = x->getnext()) != EOF) {
                                                if (c == '/')
                                                        x->isshorttag = 1; /* short tag */
                                                else if (c == '>' || isspace(c)) {
       -                                                x->tag[taglen] = '\0';
       -                                                if (x->tag[0] == '/') { /* end tag, starts with </ */
       -                                                        x->taglen = --taglen; /* len -1 because of / */
       -                                                        if (taglen && x->xmltagend)
       -                                                                x->xmltagend(x, &(x->tag)[1], x->taglen, 0);
       +                                                x->tag[x->taglen] = '\0';
       +                                                if (isend) { /* end tag, starts with </ */
       +                                                        if (x->xmltagend)
       +                                                                x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
       +                                                        x->tag[0] = '\0';
       +                                                        x->taglen = 0;
                                                        } else {
       -                                                        x->taglen = taglen;
                                                                /* start tag */
                                                                if (x->xmltagstart)
                                                                        x->xmltagstart(x, x->tag, x->taglen);
       @@ -400,11 +402,15 @@ xml_parse(XMLParser *x)
                                                                        x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
                                                        }
                                                        /* call tagend for shortform or processing instruction */
       -                                                if ((x->isshorttag || ispi) && x->xmltagend)
       -                                                        x->xmltagend(x, x->tag, x->taglen, 1);
       +                                                if (x->isshorttag) {
       +                                                        if (x->xmltagend)
       +                                                                x->xmltagend(x, x->tag, x->taglen, x->isshorttag);
       +                                                        x->tag[0] = '\0';
       +                                                        x->taglen = 0;
       +                                                }
                                                        break;
       -                                        } else if (taglen < sizeof(x->tag) - 1)
       -                                                x->tag[taglen++] = c; /* NOTE: tag name truncation */
       +                                        } else if (x->taglen < sizeof(x->tag) - 1)
       +                                                x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
                                        }
                                }
                        } else {