Don't punt on encoding errors. - sam - An updated version of the sam text editor.
 (HTM) git clone git://vernunftzentrum.de/sam.git
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) LICENSE
       ---
 (DIR) commit ce79fc47ddd96ac43d9ab40078a80fe28a93ec12
 (DIR) parent c6c0085b825c6060c598acfb6e2831a246e4fd31
 (HTM) Author: Rob King <jking@deadpixi.com>
       Date:   Fri, 27 Jan 2017 10:08:55 -0600
       
       Don't punt on encoding errors.
       
       Originally, if files contained encoding errors (i.e. they weren't
       valid text files), we would punt with a "file is not text" error.
       This was considered sub-optimal, as there are many files that users
       might want to edit that aren't correctly encoded.
       
       We now replace invalid characters with the Unicode replacement
       character (0xfffd) and warn. The dirty flag is handled "correctly"
       as well.
       
       Diffstat:
         sam/io.c                            |     132 ++++++++++++++++++++++++++++++-
         sam/sam.c                           |       2 ++
         sam/sam.h                           |       3 +++
       
       3 files changed, 136 insertions(+), 1 deletion(-)
       ---
 (DIR) diff --git a/sam/io.c b/sam/io.c
       @@ -10,6 +10,8 @@
        #define NSYSFILE    3
        #define NOFILE      128
        
       +#define MIN(x, y) ((x) < (y)? (x) : (y))
       +
        void
        checkqid(File *f)
        {
       @@ -76,9 +78,137 @@ writef(File *f)
            }
        }
        
       +static wchar_t
       +finishpartialchar(File *f, const char *s, size_t n, size_t *p)
       +{
       +    size_t lp = *p;
       +    wchar_t w = 0;
       +
       +    while (!w && f->mblen && lp < n && f->mblen < BLOCKSIZE){
       +        mbstate_t ts = f->ps;
       +        size_t rc = 0;
       +        wchar_t c = 0;
       +
       +        switch (rc = mbrtowc(&c, f->mbbuf, f->mblen, &ts)){
       +            case (size_t)-1:
       +                memset(&f->ps, 0, sizeof(f->ps));
       +                w = UNICODE_REPLACEMENT_CHAR;
       +                lp++;
       +                break;
       +
       +            case (size_t)-2:
       +                f->mbbuf[f->mblen++] = s[lp++];
       +                break;
       +
       +            default:
       +                f->ps = ts;
       +                w = c;
       +                break;
       +        }
       +    }
       +
       +    *p = lp;
       +    f->mblen = 0;
       +    memset(f->mbbuf, 0, sizeof(f->mbbuf));
       +
       +    return w? w : UNICODE_REPLACEMENT_CHAR;
       +}
       +
       +static size_t
       +insertbuf(File *f, const char *s, size_t n, bool *nulls)
       +{
       +    wchar_t wbuf[BLOCKSIZE + 1] = {0};
       +    size_t nw = 0;
       +    size_t nt = 0;
       +    size_t p = 0;
       +    Posn pos = addr.r.p2;
       +
       +    if (f->mblen)
       +        wbuf[nw++] = finishpartialchar(f, s, n, &p);
       +
       +    while (p < n){
       +        mbstate_t ts = f->ps;
       +        wchar_t c = 0;
       +        size_t rc = mbrtowc(&c, s + p, n - p, &ts);
       +        switch (rc){
       +            case (size_t)0:
       +                if (p < n){
       +                    memset(&f->ps, 0, sizeof(f->ps));
       +                    wbuf[nw++] = UNICODE_REPLACEMENT_CHAR;
       +                    *nulls = true;
       +                    p++;
       +                }
       +                break;
       +
       +            case (size_t)-1:
       +                memset(&f->ps, 0, sizeof(f->ps));
       +                wbuf[nw++] = UNICODE_REPLACEMENT_CHAR;
       +                p++;
       +                *nulls = true;
       +                break;
       +
       +            case (size_t)-2:
       +                Finsert(f, tmprstr(wbuf, nw), pos);
       +                memcpy(f->mbbuf, s + p, MIN(n - p, BLOCKSIZE));
       +                f->mblen = MIN(n - p, BLOCKSIZE);
       +                return nt + nw;
       +
       +            default:
       +                f->ps = ts;
       +                p += rc;
       +                wbuf[nw++] = c;
       +                break;
       +        }
       +
       +        if (nw >= BLOCKSIZE){
       +            Finsert(f, tmprstr(wbuf, nw), pos);
       +            memset(wbuf, 0, sizeof(wbuf));
       +            nt += nw;
       +            nw = 0;
       +        }
       +    }
       +
       +    Finsert(f, tmprstr(wbuf, nw), pos);
       +    return nt + nw;
       +}
       +
        Posn
        readio(File *f, bool *nulls, bool setdate)
        {
       +    char buf[(BLOCKSIZE * MB_LEN_MAX) + 1] = {0};
       +    wchar_t wbuf[BLOCKSIZE + 1] = {0};
       +    size_t nw = 0;
       +    size_t p = 0;
       +    size_t n = 0;
       +    size_t nt = 0;
       +    Posn pos = addr.r.p2;
       +    uint64_t dev, qid;
       +    int64_t mtime;
       +
       +    n = read(io, buf, BLOCKSIZE);
       +    while (n > 0){
       +        if ((ssize_t)n < 0)
       +            return nt;
       +
       +        nt += insertbuf(f, buf, n, nulls);
       +        n = read(io, buf, BLOCKSIZE);
       +    }
       +
       +    if (setdate){
       +        if (statfd(io, &dev, &qid, &mtime, 0, 0) > 0){
       +            f->dev = dev;
       +            f->qid = qid;
       +            f->date = mtime;
       +            checkqid(f);
       +        }
       +    }
       +
       +    return nt;
       +}
       +
       +/* Posn
       +readio(File *f, bool *nulls, bool setdate)
       +{
            size_t n = 0;
            size_t nt = 0;
            Posn p = addr.r.p2;
       @@ -117,7 +247,7 @@ readio(File *f, bool *nulls, bool setdate)
            }
        
            return nt;
       -}
       +} */
        
        Posn
        writeio(File *f)
 (DIR) diff --git a/sam/sam.c b/sam/sam.c
       @@ -516,6 +516,8 @@ edit(File *f, int cmd)
                error_s(Eopen, genc);
            }
            p = readio(f, &nulls, empty);
       +    if (nulls)
       +        warn(Wnulls);
            closeio((cmd=='e' || cmd=='I')? -1 : p);
            if(cmd == 'r')
                f->ndot.r.p1 = addr.r.p2, f->ndot.r.p2 = addr.r.p2+p;
 (DIR) diff --git a/sam/sam.h b/sam/sam.h
       @@ -119,6 +119,9 @@ struct File
            Posn    cp1, cp2;   /* Write-behind cache positions and */
            String  cache;      /* string */
            wchar_t    getcbuf[NGETC];
       +    char mbbuf[BUFSIZ];  /* partial character during read */
       +    size_t mblen; /* number of bytes in partial character */
       +    mbstate_t ps; /* state of multibyte decoding */
            int ngetc;
            int getci;
            Posn    getcp;