libc/wchar: Fix unicode handling - scc - simple c99 compiler
 (HTM) git clone git://git.simple-cc.org/scc
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) Submodules
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit dee0c6f0c90f7d64fd45cb6e3c48321f4beaf81d
 (DIR) parent 2eaef0900f5ebd9f00bebd8ce899423a0b37b4bb
 (HTM) Author: Roberto E. Vargas Caballero <k0ga@shike2.com>
       Date:   Wed, 26 Feb 2025 10:31:51 +0100
       
       libc/wchar: Fix Unicode handling
       
       * mbrtowc: validate input, handle 4-byte UTF-8 code points, set errno
       * wcrtomb: if a UTF-8 sequence has N bytes, the leading byte has its first
         N bits set (with ASCII characters as a special case), not its first N-1 bits
       * _validutf8: negate condition
       
       Diffstat:
         M src/libc/wchar/_validutf8.c         |       2 +-
         M src/libc/wchar/mbrtowc.c            |      26 +++++++++++++++++---------
         M src/libc/wchar/wcrtomb.c            |       9 +++++++--
       
       3 files changed, 25 insertions(+), 12 deletions(-)
       ---
 (DIR) diff --git a/src/libc/wchar/_validutf8.c b/src/libc/wchar/_validutf8.c
       @@ -23,7 +23,7 @@ _validutf8(wchar_t wc, int *nbytes)
                };
                struct range *bp;
        
       -        for (bp = ranges; bp->begin <= wc && bp->end > wc; ++bp)
       +        for (bp = ranges; bp->begin > wc || bp->end <= wc; ++bp)
                        ;
                *nbytes = bp->nbytes;
        
 (DIR) diff --git a/src/libc/wchar/mbrtowc.c b/src/libc/wchar/mbrtowc.c
       @@ -1,3 +1,5 @@
       +#include <errno.h>
       +#include <stdlib.h>
        #include <wchar.h>
        
        #include "../libc.h"
       @@ -8,37 +10,43 @@ size_t
        mbrtowc(wchar_t *restrict pwc, const char *restrict s, size_t n,
                mbstate_t *restrict ps)
        {
       -        unsigned char *t = (unsigned char *) s;
       +        const unsigned char *t = (const unsigned char *) s;
                unsigned long wc;
                unsigned c;
                int i, len, maxlen;
        
       -        if (s == NULL)
       +        if (t == NULL)
                        return 0;
       +        if ((wc = *t) == 0)
       +                goto return_code;
        
       -        wc = c = *t++;
       +        c = *t++;
                for (len = 0; n > 0 && c & 0x80; --n, ++len)
                        c <<= 1;
       -        if (n == 0 || len == 1 || len == 8)
       -                return -1;
       +        if (n == 0 && c & 0x80)
       +                return -2;
       +        if (len == 1 || len > MB_CUR_MAX)
       +                goto return_error;
                if (len == 0)
                        goto return_code;
        
                wc = (c & 0xFF) >> len;
                for (i = 0; i < len-1; i++) {
                        if (((c = *t++) & 0xC0) != 0x80)
       -                        return -1;
       +                        goto return_error;
                        wc <<= 6;
                        wc |= c & 0x3F;
                }
        
                if (!_validutf8(wc, &maxlen) || len != maxlen)
       -                return -1;
       +                goto return_error;
        
        return_code:
                if (pwc)
                        *pwc = wc;
       -        if (*s == '\0')
       -                return 0;
                return t - (unsigned char *) s;
       +
       +return_error:
       +        errno = EILSEQ;
       +        return -1;
        }
 (DIR) diff --git a/src/libc/wchar/wcrtomb.c b/src/libc/wchar/wcrtomb.c
       @@ -14,13 +14,18 @@ wcrtomb(char *restrict s, wchar_t wc, mbstate_t *restrict ps)
                if (!s)
                        return 1;
        
       +        if (c < 0x80) {
       +                *s = wc;
       +                return 1;
       +        }
       +
                if (!_validutf8(wc, &n)) {
                        errno = EILSEQ;
                        return -1;
                }
       -
                n--;
       -        *s = 0;
       +
       +        *s = 0x80;
                for (i = 0; i < n; i++) {
                        *s >>= 1;
                        *s |= 0x80;