Don't punt on encoding errors. - sam - An updated version of the sam text editor.
(HTM) git clone git://vernunftzentrum.de/sam.git
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) LICENSE
---
(DIR) commit ce79fc47ddd96ac43d9ab40078a80fe28a93ec12
(DIR) parent c6c0085b825c6060c598acfb6e2831a246e4fd31
(HTM) Author: Rob King <jking@deadpixi.com>
Date: Fri, 27 Jan 2017 10:08:55 -0600
Don't punt on encoding errors.
Originally, if files contained encoding errors (i.e. they weren't
valid text files), we would punt with a "file is not text" error.
This was considered sub-optimal, as there are many files that users
might want to edit that aren't correctly encoded.
We now replace invalid byte sequences with the Unicode replacement
character (U+FFFD) and warn the user. The dirty flag is handled "correctly"
as well.
Diffstat:
sam/io.c | 132 ++++++++++++++++++++++++++++++-
sam/sam.c | 2 ++
sam/sam.h | 3 +++
3 files changed, 136 insertions(+), 1 deletion(-)
---
(DIR) diff --git a/sam/io.c b/sam/io.c
@@ -10,6 +10,8 @@
#define NSYSFILE 3
#define NOFILE 128
+#define MIN(x, y) ((x) < (y)? (x) : (y)) /* smaller of x and y; may evaluate an argument twice, so pass side-effect-free expressions */
+
void
checkqid(File *f)
{
@@ -76,9 +78,137 @@ writef(File *f)
}
}
+/*
+ * Complete the multibyte character whose leading bytes were stashed in
+ * f->mbbuf, pulling further bytes from s[*p .. n) as needed.
+ *
+ *   f  file holding the pending bytes (mbbuf/mblen) and decoder state (ps)
+ *   s  newly read bytes
+ *   n  number of bytes in s
+ *   p  in: index of the first unread byte in s; out: index of the first
+ *      byte of s that was NOT consumed by this call
+ *
+ * Returns the decoded character, or UNICODE_REPLACEMENT_CHAR when the
+ * pending bytes cannot be completed into a valid character.  The pending
+ * buffer is always emptied before returning, so if s runs out before the
+ * character is complete the pending bytes are dropped and a replacement
+ * character is returned.
+ */
+static wchar_t
+finishpartialchar(File *f, const char *s, size_t n, size_t *p)
+{
+    /* never append past the real capacity of the pending buffer */
+    const size_t cap = MIN((size_t)BLOCKSIZE, sizeof(f->mbbuf));
+    size_t lp = *p;
+    wchar_t w = 0;
+
+    while (f->mblen){
+        mbstate_t ts = f->ps;
+        wchar_t c = 0;
+        size_t rc = mbrtowc(&c, f->mbbuf, f->mblen, &ts);
+
+        if (rc == (size_t)-2){
+            /* still incomplete: take one more byte, then decode again */
+            if (lp >= n || f->mblen >= cap)
+                break;
+            f->mbbuf[f->mblen++] = s[lp++];
+            continue;
+        }
+
+        if (rc == (size_t)-1){
+            /*
+             * The last byte taken from s made the sequence invalid.  Hand
+             * it back so the caller decodes it on its own instead of
+             * losing it along with the broken prefix.
+             */
+            memset(&f->ps, 0, sizeof(f->ps));
+            if (lp > *p)
+                lp--;
+        } else if (rc == 0){
+            /* decoded a NUL: report it as a replacement character */
+            memset(&f->ps, 0, sizeof(f->ps));
+        } else{
+            f->ps = ts;
+            w = c;
+        }
+        break;
+    }
+
+    *p = lp;
+    f->mblen = 0;
+    memset(f->mbbuf, 0, sizeof(f->mbbuf));
+
+    return w? w : UNICODE_REPLACEMENT_CHAR;
+}
+
+/*
+ * Decode the n bytes at s as multibyte text and insert the resulting wide
+ * characters into f at addr.r.p2, flushing in chunks of BLOCKSIZE.
+ *
+ * Bytes that do not form a valid character, and embedded NUL characters,
+ * are each replaced by UNICODE_REPLACEMENT_CHAR and *nulls is set to true.
+ * A character left incomplete at the end of s is stashed in f->mbbuf
+ * (length in f->mblen) so the next call can finish it.
+ *
+ * Returns the number of wide characters handed to Finsert.
+ */
+static size_t
+insertbuf(File *f, const char *s, size_t n, bool *nulls)
+{
+    wchar_t wbuf[BLOCKSIZE + 1] = {0};
+    size_t nw = 0;
+    size_t nt = 0;
+    size_t p = 0;
+    Posn pos = addr.r.p2;
+
+    /* finish a character that was split across the previous read */
+    if (f->mblen)
+        wbuf[nw++] = finishpartialchar(f, s, n, &p);
+
+    while (p < n){
+        mbstate_t ts = f->ps;
+        wchar_t c = 0;
+        size_t rc = mbrtowc(&c, s + p, n - p, &ts);
+
+        if (rc == (size_t)-2){
+            /*
+             * Incomplete trailing character: keep its bytes for the next
+             * call, clamped to what the pending buffer can really hold.
+             */
+            size_t keep = MIN(n - p, sizeof(f->mbbuf));
+
+            keep = MIN(keep, (size_t)BLOCKSIZE);
+            memcpy(f->mbbuf, s + p, keep);
+            f->mblen = keep;
+            break;
+        }
+
+        if (rc == 0 || rc == (size_t)-1){
+            /* NUL or invalid byte: skip one byte and restart decoding */
+            memset(&f->ps, 0, sizeof(f->ps));
+            wbuf[nw++] = UNICODE_REPLACEMENT_CHAR;
+            *nulls = true;
+            p++;
+        } else{
+            f->ps = ts;
+            p += rc;
+            wbuf[nw++] = c;
+        }
+
+        if (nw >= BLOCKSIZE){
+            Finsert(f, tmprstr(wbuf, nw), pos);
+            memset(wbuf, 0, sizeof(wbuf));
+            nt += nw;
+            nw = 0;
+        }
+    }
+
+    Finsert(f, tmprstr(wbuf, nw), pos);
+    return nt + nw;
+}
+
Posn
readio(File *f, bool *nulls, bool setdate)
{
+ char buf[(BLOCKSIZE * MB_LEN_MAX) + 1] = {0};
+ wchar_t wbuf[BLOCKSIZE + 1] = {0};
+ size_t nw = 0;
+ size_t p = 0;
+ size_t n = 0;
+ size_t nt = 0;
+ Posn pos = addr.r.p2;
+ uint64_t dev, qid;
+ int64_t mtime;
+
+ n = read(io, buf, BLOCKSIZE);
+ while (n > 0){
+ if ((ssize_t)n < 0)
+ return nt;
+
+ nt += insertbuf(f, buf, n, nulls);
+ n = read(io, buf, BLOCKSIZE);
+ }
+
+ if (setdate){
+ if (statfd(io, &dev, &qid, &mtime, 0, 0) > 0){
+ f->dev = dev;
+ f->qid = qid;
+ f->date = mtime;
+ checkqid(f);
+ }
+ }
+
+ return nt;
+}
+
+/* Posn
+readio(File *f, bool *nulls, bool setdate)
+{
size_t n = 0;
size_t nt = 0;
Posn p = addr.r.p2;
@@ -117,7 +247,7 @@ readio(File *f, bool *nulls, bool setdate)
}
return nt;
-}
+} */
Posn
writeio(File *f)
(DIR) diff --git a/sam/sam.c b/sam/sam.c
@@ -516,6 +516,8 @@ edit(File *f, int cmd)
error_s(Eopen, genc);
}
p = readio(f, &nulls, empty);
+ if (nulls)
+ warn(Wnulls);
closeio((cmd=='e' || cmd=='I')? -1 : p);
if(cmd == 'r')
f->ndot.r.p1 = addr.r.p2, f->ndot.r.p2 = addr.r.p2+p;
(DIR) diff --git a/sam/sam.h b/sam/sam.h
@@ -119,6 +119,9 @@ struct File
Posn cp1, cp2; /* Write-behind cache positions and */
String cache; /* string */
wchar_t getcbuf[NGETC];
+ char mbbuf[BUFSIZ]; /* partial character during read */
+ size_t mblen; /* number of bytes in partial character */
+ mbstate_t ps; /* state of multibyte decoding */
int ngetc;
int getci;
Posn getcp;