More & names and numbers. - plan9port - [fork] Plan 9 from user space
 (HTM) git clone git://src.adamsgaard.dk/plan9port
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 431e32de9b41c230b0791fb9f2f293859d189e59
 (DIR) parent 01a1c31a7d99fd24ba134ddc33fb3df95a668f3a
 (HTM) Author: rsc <devnull@localhost>
       Date:   Fri, 30 Sep 2005 17:45:40 +0000
       
       More & names and numbers.
       
       Diffstat:
         M src/libhtml/lex.c                   |     214 +++++++++++++++++--------------
       
       1 file changed, 118 insertions(+), 96 deletions(-)
       ---
 (DIR) diff --git a/src/libhtml/lex.c b/src/libhtml/lex.c
       t@@ -333,7 +333,9 @@ AsciiInt _chartab[] = {
                {"kappa", 954},
                {"lambda", 955},
                {"laquo", 171},
       +        {"ldquo", 8220},
                {"ldots", 8230},
       +        {"lsquo", 8216},
                {"lt", 60},
                {"macr", 175},
                {"mdash", 8212},
       t@@ -364,8 +366,10 @@ AsciiInt _chartab[] = {
                {"quad", 8193},
                {"quot", 34},
                {"raquo", 187},
       +        {"rdquo", 8221},
                {"reg", 174},
                {"rho", 961},
       +        {"rsquo", 8217},
                {"sect", 167},
                {"shy", 173},
                {"sigma", 963},
       t@@ -492,9 +496,9 @@ _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
                ai = 0;
                if(dbglex)
                        fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
       -        if(ts->mtype == TextHtml) {
       -                for(;;) {
       -                        if(ai == alen) {
       +        if(ts->mtype == TextHtml){
       +                for(;;){
       +                        if(ai == alen){
                                        a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
                                        alen += ToksChunk;
                                }
       t@@ -502,9 +506,9 @@ _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
                                c = getchar(ts);
                                if(c < 0)
                                        break;
       -                        if(c == '<') {
       +                        if(c == '<'){
                                        tag = gettag(ts, starti, a, &ai);
       -                                if(tag == Tscript) {
       +                                if(tag == Tscript){
                                                // special rules for getting Data after....
                                                starti = ts->i;
                                                c = getchar(ts);
       t@@ -521,8 +525,8 @@ _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
                }
                else {
                        // plain text (non-html) tokens
       -                for(;;) {
       -                        if(ai == alen) {
       +                for(;;){
       +                        if(ai == alen){
                                        a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
                                        alen += ToksChunk;
                                }
       t@@ -560,14 +564,14 @@ getplaindata(TokenSource* ts, Token* a, int* pai)
                s = nil;
                j = 0;
                starti = ts->i;
       -        for(c = getchar(ts); c >= 0; c = getchar(ts)) {
       -                if(c < ' ') {
       -                        if(isspace(c)) {
       -                                if(c == '\r') {
       +        for(c = getchar(ts); c >= 0; c = getchar(ts)){
       +                if(c < ' '){
       +                        if(isspace(c)){
       +                                if(c == '\r'){
                                                // ignore it unless no following '\n',
                                                // in which case treat it like '\n'
                                                c = getchar(ts);
       -                                        if(c != '\n') {
       +                                        if(c != '\n'){
                                                        if(c >= 0)
                                                                ungetchar(ts, c);
                                                        c = '\n';
       t@@ -577,9 +581,9 @@ getplaindata(TokenSource* ts, Token* a, int* pai)
                                else
                                        c = 0;
                        }
       -                if(c != 0) {
       +                if(c != 0){
                                buf[j++] = c;
       -                        if(j == sizeof(buf)-1) {
       +                        if(j == sizeof(buf)-1){
                                        s = buftostr(s, buf, j);
                                        j = 0;
                                }
       t@@ -627,19 +631,19 @@ getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
                s = nil;
                j = 0;
                c = firstc;
       -        while(c >= 0) {
       -                if(c == '&') {
       +        while(c >= 0){
       +                if(c == '&'){
                                c = ampersand(ts);
                                if(c < 0)
                                        break;
                        }
       -                else if(c < ' ') {
       -                        if(isspace(c)) {
       -                                if(c == '\r') {
       +                else if(c < ' '){
       +                        if(isspace(c)){
       +                                if(c == '\r'){
                                                // ignore it unless no following '\n',
                                                // in which case treat it like '\n'
                                                c = getchar(ts);
       -                                        if(c != '\n') {
       +                                        if(c != '\n'){
                                                        if(c >= 0)
                                                                ungetchar(ts, c);
                                                        c = '\n';
       t@@ -652,13 +656,13 @@ getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
                                        c = 0;
                                }
                        }
       -                else if(c == '<') {
       +                else if(c == '<'){
                                ungetchar(ts, c);
                                break;
                        }
       -                if(c != 0) {
       +                if(c != 0){
                                buf[j++] = c;
       -                        if(j == BIGBUFSIZE-1) {
       +                        if(j == BIGBUFSIZE-1){
                                        s = buftostr(s, buf, j);
                                        j = 0;
                                }
       t@@ -696,12 +700,12 @@ getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
                tstarti = starti;
                c = firstc;
                done = 0;
       -        while(c >= 0) {
       -                if(c == '<') {
       +        while(c >= 0){
       +                if(c == '<'){
                                // other browsers ignore stuff to end of line after <!
                                savei = ts->i;
                                c = getchar(ts);
       -                        if(c == '!') {
       +                        if(c == '!'){
                                        while(c >= 0 && c != '\n' && c != '\r')
                                                c = getchar(ts);
                                        if(c == '\r')
       t@@ -709,7 +713,7 @@ getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
                                        if(c == '\n')
                                                c = getchar(ts);
                                }
       -                        else if(c >= 0) {
       +                        else if(c >= 0){
                                        backup(ts, savei);
                                        tag = gettag(ts, tstarti, a, pai);
                                        if(tag == -1)
       t@@ -717,7 +721,7 @@ getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
                                        if(tag != Comment)
                                                (*pai)--;
                                        backup(ts, tstarti);
       -                                if(tag == Tscript + RBRA) {
       +                                if(tag == Tscript + RBRA){
                                                done = 1;
                                                break;
                                        }
       t@@ -727,9 +731,9 @@ getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
                        }
                        if(c < 0)
                                break;
       -                if(c != 0) {
       +                if(c != 0){
                                buf[j++] = c;
       -                        if(j == BIGBUFSIZE-1) {
       +                        if(j == BIGBUFSIZE-1){
                                        s = buftostr(s, buf, j);
                                        j = 0;
                                }
       t@@ -737,7 +741,7 @@ getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
                        tstarti = ts->i;
                        c = getchar(ts);
                }
       -        if(done || ts->i == ts->edata) {
       +        if(done || ts->i == ts->edata){
                        s = buftostr(s, buf, j);
                        tok = &a[(*pai)++];
                        tok->tag = Data;
       t@@ -784,15 +788,15 @@ gettag(TokenSource* ts, int starti, Token* a, int* pai)
                tok->attr = nil;
                tok->starti = starti;
                c = getchar(ts);
       -        if(c == '/') {
       +        if(c == '/'){
                        rbra = RBRA;
                        c = getchar(ts);
                }
                if(c < 0)
                        goto eob_done;
       -        if(c >= 256 || !isalpha(c)) {
       +        if(c >= 256 || !isalpha(c)){
                        // not a tag
       -                if(c == '!') {
       +                if(c == '!'){
                                ans = comment(ts);
                                if(ans != -1)
                                        return ans;
       t@@ -809,7 +813,7 @@ gettag(TokenSource* ts, int starti, Token* a, int* pai)
                // c starts a tagname
                buf[0] = c;
                i = 1;
       -        while(1) {
       +        for(;;){
                        c = getchar(ts);
                        if(c < 0)
                                goto eob_done;
       t@@ -826,34 +830,34 @@ gettag(TokenSource* ts, int starti, Token* a, int* pai)
        
                // attribute gathering loop
                al = nil;
       -        while(1) {
       +        for(;;){
                        // look for "ws name" or "ws name ws = ws val"  (ws=whitespace)
                        // skip whitespace
        attrloop_continue:
       -                while(c < 256 && isspace(c)) {
       +                while(c < 256 && isspace(c)){
                                c = getchar(ts);
                                if(c < 0)
                                        goto eob_done;
                        }
                        if(c == '>')
                                goto attrloop_done;
       -                if(c == '<') {
       +                if(c == '<'){
                                if(warn)
                                        fprint(2, "warning: unclosed tag\n");
                                ungetchar(ts, c);
                                goto attrloop_done;
                        }
       -                if(c >= 256 || !isalpha(c)) {
       +                if(c >= 256 || !isalpha(c)){
                                if(warn)
                                        fprint(2, "warning: expected attribute name\n");
                                // skipt to next attribute name
       -                        while(1) {
       +                        for(;;){
                                        c = getchar(ts);
                                        if(c < 0)
                                                goto eob_done;
                                        if(c < 256 && isalpha(c))
                                                goto attrloop_continue;
       -                                if(c == '<') {
       +                                if(c == '<'){
                                                if(warn)
                                                        fprint(2, "warning: unclosed tag\n");
                                                ungetchar(ts, 60);
       t@@ -866,7 +870,7 @@ attrloop_continue:
                        // gather attribute name
                        buf[0] = c;
                        i = 1;
       -                while(1) {
       +                for(;;){
                                c = getchar(ts);
                                if(c < 0)
                                        goto eob_done;
       t@@ -876,23 +880,23 @@ attrloop_continue:
                                        buf[i++] = c;
                        }
                        afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
       -                if(warn && !afnd) {
       +                if(warn && !afnd){
                                buf[i] = 0;
                                fprint(2, "warning: unknown attribute name %S\n", buf);
                        }
                        // skip whitespace
       -                while(c < 256 && isspace(c)) {
       +                while(c < 256 && isspace(c)){
                                c = getchar(ts);
                                if(c < 0)
                                        goto eob_done;
                        }
       -                if(c != '=') {
       +                if(c != '='){
                                if(afnd)
                                        al = newattr(attid, nil, al);
                                goto attrloop_continue;
                        }
                        //# c is '=' here;  skip whitespace
       -                while(1) {
       +                for(;;){
                                c = getchar(ts);
                                if(c < 0)
                                        goto eob_done;
       t@@ -900,7 +904,7 @@ attrloop_continue:
                                        break;
                        }
                        quote = 0;
       -                if(c == '\'' || c == '"') {
       +                if(c == '\'' || c == '"'){
                                quote = c;
                                c = getchar(ts);
                                if(c < 0)
       t@@ -908,31 +912,31 @@ attrloop_continue:
                        }
                        val = nil;
                        nv = 0;
       -                while(1) {
       +                for(;;){
        valloop_continue:
                                if(c < 0)
                                        goto eob_done;
       -                        if(c == '>') {
       -                                if(quote) {
       +                        if(c == '>'){
       +                                if(quote){
                                                // c might be part of string (though not good style)
                                                // but if line ends before close quote, assume
                                                // there was an unmatched quote
                                                ti = ts->i;
       -                                        while(1) {
       +                                        for(;;){
                                                        c = getchar(ts);
                                                        if(c < 0)
                                                                goto eob_done;
       -                                                if(c == quote) {
       +                                                if(c == quote){
                                                                backup(ts, ti);
                                                                buf[nv++] = '>';
       -                                                        if(nv == BIGBUFSIZE-1) {
       +                                                        if(nv == BIGBUFSIZE-1){
                                                                        val = buftostr(val, buf, nv);
                                                                        nv = 0;
                                                                }
                                                                c = getchar(ts);
                                                                goto valloop_continue;
                                                        }
       -                                                if(c == '\n') {
       +                                                if(c == '\n'){
                                                                if(warn)
                                                                        fprint(2, "warning: apparent unmatched quote\n");
                                                                backup(ts, ti);
       t@@ -944,14 +948,14 @@ valloop_continue:
                                        else
                                                goto valloop_done;
                                }
       -                        if(quote) {
       -                                if(c == quote) {
       +                        if(quote){
       +                                if(c == quote){
                                                c = getchar(ts);
                                                if(c < 0)
                                                        goto eob_done;
                                                goto valloop_done;
                                        }
       -                                if(c == '\r') {
       +                                if(c == '\r'){
                                                c = getchar(ts);
                                                goto valloop_continue;
                                        }
       t@@ -962,20 +966,20 @@ valloop_continue:
                                        if(c < 256 && isspace(c))
                                                goto valloop_done;
                                }
       -                        if(c == '&') {
       +                        if(c == '&'){
                                        c = ampersand(ts);
                                        if(c == -1)
                                                goto eob_done;
                                }
                                buf[nv++] = c;
       -                        if(nv == BIGBUFSIZE-1) {
       +                        if(nv == BIGBUFSIZE-1){
                                        val = buftostr(val, buf, nv);
                                        nv = 0;
                                }
                                c = getchar(ts);
                        }
        valloop_done:
       -                if(afnd) {
       +                if(afnd){
                                val = buftostr(val, buf, nv);
                                al = newattr(attid, val, al);
                        }
       t@@ -1017,19 +1021,19 @@ comment(TokenSource* ts)
                nexti = ts->i;
                havecomment = 0;
                c = getchar(ts);
       -        if(c == '-') {
       +        if(c == '-'){
                        c = getchar(ts);
       -                if(c == '-') {
       +                if(c == '-'){
                                if(findstr(ts, L(Larrow)))
                                        havecomment = 1;
                                else
                                        backup(ts, nexti);
                        }
                }
       -        if(!havecomment) {
       +        if(!havecomment){
                        if(c == '>')
                                havecomment = 1;
       -                else if(c >= 0) {
       +                else if(c >= 0){
                                if(findstr(ts, L(Lgt)))
                                        havecomment = 1;
                        }
       t@@ -1053,15 +1057,15 @@ findstr(TokenSource* ts, Rune* s)
        
                c0 = s[0];
                n = runestrlen(s);
       -        while(1) {
       +        for(;;){
                        c = getchar(ts);
                        if(c < 0)
                                break;
       -                if(c == c0) {
       +                if(c == c0){
                                if(n == 1)
                                        return 1;
                                nexti = ts->i;
       -                        for(i = 1; i < n; i++) {
       +                        for(i = 1; i < n; i++){
                                        c = getchar(ts);
                                        if(c < 0)
                                                goto mainloop_done;
       t@@ -1077,6 +1081,18 @@ mainloop_done:
                return 0;
        }
        
       +static int
       +xdigit(int c)
       +{
       +        if('0' <= c && c <= '9')
       +                return c-'0';
       +        if('a' <= c && c <= 'f')
       +                return c-'a'+10;
       +        if('A' <= c && c <= 'F')
       +                return c-'A'+10;
       +        return -1;
       +}
       +
        // We've just read an '&'; look for an entity reference
        // name, and if found, return translated char.
        // if there is a complete entity name but it isn't known,
       t@@ -1100,36 +1116,42 @@ ampersand(TokenSource* ts)
                c = getchar(ts);
                fnd = 0;
                ans = -1;
       -        if(c == '#') {
       +        if(c == '#'){
                        c = getchar(ts);
                        v = 0;
       -                while(c >= 0) {
       -                        if(!(c < 256 && isdigit(c)))
       -                                break;
       -                        v = v*10 + c - 48;
       +                if(c == 'x'){
                                c = getchar(ts);
       +                        while((i=xdigit(c)) != -1){
       +                                v = v*16 + i;
       +                                c = getchar(ts);
       +                        }
       +                }else{
       +                        while('0' <= c && c <= '9'){
       +                                v = v*10 + c - '0';
       +                                c = getchar(ts);
       +                        }
                        }
       -                if(c >= 0) {
       +                if(c >= 0){
                                if(!(c == ';' || c == '\n' || c == '\r'))
                                        ungetchar(ts, c);
                                c = v;
                                if(c == 160)
                                        c = 160;
       -                        if(c >= Winstart && c <= Winend) {
       +                        if(c >= Winstart && c <= Winend){
                                        c = winchars[c - Winstart];
                                }
                                ans = c;
                                fnd = 1;
                        }
                }
       -        else if(c < 256 && isalpha(c)) {
       +        else if(c < 256 && isalpha(c)){
                        buf[0] = c;
                        k = 1;
       -                while(1) {
       +                for(;;){
                                c = getchar(ts);
                                if(c < 0)
                                        break;
       -                        if(ISNAMCHAR(c)) {
       +                        if(ISNAMCHAR(c)){
                                        if(k < SMALLBUFSIZE-1)
                                                buf[k++] = c;
                                }
       t@@ -1139,17 +1161,17 @@ ampersand(TokenSource* ts)
                                        break;
                                }
                        }
       -                if(c >= 0) {
       +                if(c >= 0){
                                fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
       -                        if(!fnd) {
       +                        if(!fnd){
                                        // Try prefixes of s
                                        if(c == ';' || c == '\n' || c == '\r')
                                                ungetchar(ts, c);
                                        i = k;
       -                                while(--k > 0) {
       +                                while(--k > 0){
                                                fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
       -                                        if(fnd) {
       -                                                while(i > k) {
       +                                        if(fnd){
       +                                                while(i > k){
                                                                i--;
                                                                ungetchar(ts, buf[i]);
                                                        }
       t@@ -1159,7 +1181,7 @@ ampersand(TokenSource* ts)
                                }
                        }
                }
       -        if(!fnd) {
       +        if(!fnd){
                        backup(ts, savei);
                        ans = '&';
                }
       t@@ -1181,14 +1203,14 @@ getchar(TokenSource* ts)
                        return -1;
                buf = ts->data;
                c = buf[ts->i];
       -        switch(ts->chset) {
       +        switch(ts->chset){
                case ISO_8859_1:
                        if(c >= Winstart && c <= Winend)
                                c = winchars[c - Winstart];
                        ts->i++;
                        break;
                case US_Ascii:
       -                if(c > 127) {
       +                if(c > 127){
                                if(warn)
                                        fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
                        }
       t@@ -1197,7 +1219,7 @@ getchar(TokenSource* ts)
                case UTF_8:
                        ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
                        n = chartorune(&r, (char*)(buf+ts->i));
       -                if(ok) {
       +                if(ok){
                                if(warn && c == 0x80)
                                        fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
                                ts->i += n;
       t@@ -1210,7 +1232,7 @@ getchar(TokenSource* ts)
                        }
                        break;
                case Unicode:
       -                if(ts->i < ts->edata - 1) {
       +                if(ts->i < ts->edata - 1){
                                //standards say most-significant byte first
                                c = (c << 8)|(buf[ts->i + 1]);
                                ts->i += 2;
       t@@ -1235,9 +1257,9 @@ ungetchar(TokenSource* ts, int c)
                char        a[UTFmax];
        
                n = 1;
       -        switch(ts->chset) {
       +        switch(ts->chset){
                case UTF_8:
       -                if(c >= 128) {
       +                if(c >= 128){
                                r = c;
                                n = runetochar(a, &r);
                        }
       t@@ -1273,8 +1295,8 @@ _tokaval(Token* t, int attid, Rune** pans, int xfer)
                Attr*        attr;
        
                attr = t->attr;
       -        while(attr != nil) {
       -                if(attr->attid == attid) {
       +        while(attr != nil){
       +                if(attr->attid == attid){
                                if(pans != nil)
                                        *pans = attr->value;
                                if(xfer)
       t@@ -1308,12 +1330,12 @@ Tconv(Fmt *f)
                        if(dbglex > 1)
                                i = snprint(buf, sizeof(buf), "[%d]", t->starti);
                        tag = t->tag;
       -                if(tag == Data) {
       +                if(tag == Data){
                                i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text);
                        }
                        else {
                                srbra = "";
       -                        if(tag >= RBRA) {
       +                        if(tag >= RBRA){
                                        tag -= RBRA;
                                        srbra = "/";
                                }
       t@@ -1321,7 +1343,7 @@ Tconv(Fmt *f)
                                if(tag == Notfound)
                                        tname = L(Lquestion);
                                i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname);
       -                        for(a = t->attr; a != nil; a = a->next) {
       +                        for(a = t->attr; a != nil; a = a->next){
                                        aname = attrnames[a->attid];
                                        i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname);
                                        if(a->value != nil)
       t@@ -1356,7 +1378,7 @@ freeattrs(Attr* ahead)
                Attr* nexta;
        
                a = ahead;
       -        while(a != nil) {
       +        while(a != nil){
                        nexta = a->next;
                        free(a->value);
                        free(a);
       t@@ -1377,7 +1399,7 @@ _freetokens(Token* tarray, int n)
        
                if(tarray == nil)
                        return;
       -        for(i = 0; i < n; i++) {
       +        for(i = 0; i < n; i++){
                        t = &tarray[i];
                        free(t->text);
                        freeattrs(t->attr);