libc/wchar: Rewrite mbtowc() - scc - simple c99 compiler
 (HTM) git clone git://git.simple-cc.org/scc
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) Submodules
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit b4b507633186e7f223231a60447d82a6e3ab92af
 (DIR) parent f507bae3a0e45b6f9eeadecb3065afaea1a6d6bc
 (HTM) Author: Roberto E. Vargas Caballero <k0ga@shike2.com>
       Date:   Mon, 10 Mar 2025 21:22:52 +0100
       
       libc/wchar: Rewrite mbtowc()
       
       There are many different corner cases in the implementation of the
       underlying call to mbrtowc(). When mbrtowc() returns -2, it should
       be ready to receive new bytes in subsequent calls, which requires
       updating the conversion state.
       
       Also, the standard specifies:
       
               The implementation shall behave as if no library function calls
               the mbtowc function.
       
       and for that reason, we cannot pass a NULL pointer as state to mbrtowc()
       because that would imply using the hidden state of mbrtowc().
       
       We considered keeping ABI compatibility with the definition of mbstate_t
       in the different systems, but it created too many problems. There are no
       guarantees that code compiled with different libc implementations would
       work, and for that reason we dropped the ABI compatibility.
       
       Diffstat:
         M include/bits/darwin/sys/cdefs.h     |      10 +---------
         M include/bits/dragonfly/sys/cdefs.h  |       9 +--------
         M include/bits/freebsd/sys/cdefs.h    |       9 +--------
         M include/bits/linux/sys/cdefs.h      |       9 +--------
         M include/bits/netbsd/sys/cdefs.h     |       9 +--------
         M include/bits/openbsd/sys/cdefs.h    |       9 +--------
         M include/wchar.h                     |       7 ++++++-
         M src/libc/arch/bsd/Makefile          |       4 +---
         D src/libc/arch/bsd/_mbsget.c         |       9 ---------
         D src/libc/arch/bsd/_mbsset.c         |       9 ---------
         M src/libc/arch/darwin/Makefile       |       2 --
         D src/libc/arch/darwin/_mbsget.c      |       9 ---------
         D src/libc/arch/darwin/_mbsset.c      |      10 ----------
         M src/libc/arch/linux/Makefile        |       2 --
         D src/libc/arch/linux/_mbsget.c       |       9 ---------
         D src/libc/arch/linux/_mbsset.c       |       9 ---------
         M src/libc/libc.h                     |       2 --
         M src/libc/objs/amd64-linux.mk        |       2 --
         M src/libc/objs/amd64-netbsd.mk       |       2 --
         M src/libc/objs/amd64-openbsd.mk      |       2 --
         M src/libc/stdlib/mbtowc.c            |      10 +++++++++-
         M src/libc/wchar/mbrtowc.c            |      92 +++++++++++++++++++++++--------
         M src/libc/wchar/mbsinit.c            |       4 +---
       
       23 files changed, 92 insertions(+), 147 deletions(-)
       ---
 (DIR) diff --git a/include/bits/darwin/sys/cdefs.h b/include/bits/darwin/sys/cdefs.h
       @@ -1,9 +1 @@
       -#ifdef _NEED_MBSTATE_T
       -#ifndef _MBSTATE_T
       -typedef struct {
       -    unsigned char state[4];
       -    size_t count;
       -} mbstate_t;
       -#define _MBSTATE_T
       -#endif
       -#endif
       +/* nothing for darwin */
 (DIR) diff --git a/include/bits/dragonfly/sys/cdefs.h b/include/bits/dragonfly/sys/cdefs.h
       @@ -1,8 +1 @@
       -#ifdef _NEED_MBSTATE_T
       -#ifndef _MBSTATE_T
       -typedef union {
       -        char __mbstate8[128];
       -} mbstate_t;
       -#define _MBSTATE_T
       -#endif
       -#endif
       +/* nothing for dragonfly */
 (DIR) diff --git a/include/bits/freebsd/sys/cdefs.h b/include/bits/freebsd/sys/cdefs.h
       @@ -1,8 +1 @@
       -#ifdef _NEED_MBSTATE_T
       -#ifndef _MBSTATE_T
       -typedef union {
       -        char __mbstate8[128];
       -} mbstate_t;
       -#define _MBSTATE_T
       -#endif
       -#endif
       +/* nothing for freebsd */
 (DIR) diff --git a/include/bits/linux/sys/cdefs.h b/include/bits/linux/sys/cdefs.h
       @@ -1,8 +1 @@
       -#ifdef _NEED_MBSTATE_T
       -#ifndef _MBSTATE_T
       -typedef struct __mbstate_t {
       -        unsigned __opaque1, __opaque2;
       -} mbstate_t;
       -#define _MBSTATE_T
       -#endif
       -#endif
       +/* nothing for Linux */
 (DIR) diff --git a/include/bits/netbsd/sys/cdefs.h b/include/bits/netbsd/sys/cdefs.h
       @@ -1,8 +1 @@
       -#ifdef _NEED_MBSTATE_T
       -#ifndef _MBSTATE_T
       -typedef union {
       -        char __mbstate8[128];
       -} mbstate_t;
       -#define _MBSTATE_T
       -#endif
       -#endif
       +/* nothing for netbsd */
 (DIR) diff --git a/include/bits/openbsd/sys/cdefs.h b/include/bits/openbsd/sys/cdefs.h
       @@ -1,8 +1 @@
       -#ifdef _NEED_MBSTATE_T
       -#ifndef _MBSTATE_T
       -typedef union {
       -        char __mbstate8[128];
       -} mbstate_t;
       -#define _MBSTATE_T
       -#endif
       -#endif
       +/* nothing for Openbsd */
 (DIR) diff --git a/include/wchar.h b/include/wchar.h
       @@ -8,10 +8,15 @@
        #define _NEED_WCHARLIM
        #define _NEED_WINT
        #define _NEED_VA_LIST
       -#define _NEED_MBSTATE_T
        #include <arch/cdefs.h>
        #include <sys/cdefs.h>
        
       +typedef struct {
       +        unsigned char oc;
       +        unsigned char sh;
       +        wchar_t wc;
       +} mbstate_t;
       +
        struct tm;
        struct _FILE;
        
 (DIR) diff --git a/src/libc/arch/bsd/Makefile b/src/libc/arch/bsd/Makefile
       @@ -4,8 +4,6 @@ include $(PROJECTDIR)/scripts/rules.mk
        include ../../rules.mk
        
        OBJS=\
       -        _mbsget.$O\
       -        _mbsset.$O\
       -        _waitpid.$O\
       +        _waitpid.$O\
        
        all: $(OBJS)
 (DIR) diff --git a/src/libc/arch/bsd/_mbsget.c b/src/libc/arch/bsd/_mbsget.c
       @@ -1,9 +0,0 @@
       -#include <wchar.h>
       -
       -#include "../../libc.h"
       -
       -int
       -_mbsget(mbstate_t *ps)
       -{
       -        return ps->__mbstate8[0];
       -}
 (DIR) diff --git a/src/libc/arch/bsd/_mbsset.c b/src/libc/arch/bsd/_mbsset.c
       @@ -1,9 +0,0 @@
       -#include <wchar.h>
       -
       -#include "../../libc.h"
       -
       -int
       -_mbsset(mbstate_t *ps, int ch)
       -{
       -        return ps->__mbstate8[0] = ch;
       -}
 (DIR) diff --git a/src/libc/arch/darwin/Makefile b/src/libc/arch/darwin/Makefile
       @@ -5,7 +5,5 @@ include ../../rules.mk
        
        OBJS=\
                _getheap.$O\
       -        _mbsget.$O\
       -        _mbsset.$O\
        
        all: $(OBJS)
 (DIR) diff --git a/src/libc/arch/darwin/_mbsget.c b/src/libc/arch/darwin/_mbsget.c
       @@ -1,9 +0,0 @@
       -#include <wchar.h>
       -
       -#include "../../libc.h"
       -
       -int
       -_mbsget(mbstate_t *ps)
       -{
       -        return ps->state[0];
       -}
 (DIR) diff --git a/src/libc/arch/darwin/_mbsset.c b/src/libc/arch/darwin/_mbsset.c
       @@ -1,10 +0,0 @@
       -#include <wchar.h>
       -
       -#include "../../libc.h"
       -
       -int
       -_mbsset(mbstate_t *ps, int ch)
       -{
       -        ps-count = 1;
       -        return ps->state[0] = ch;
       -}
 (DIR) diff --git a/src/libc/arch/linux/Makefile b/src/libc/arch/linux/Makefile
       @@ -6,8 +6,6 @@ include ../../rules.mk
        OBJS=\
                _brk.$O\
                _getheap.$O\
       -        _mbsget.$O\
       -        _mbsset.$O\
                _sigaction.$O\
                _waitpid.$O\
        
 (DIR) diff --git a/src/libc/arch/linux/_mbsget.c b/src/libc/arch/linux/_mbsget.c
       @@ -1,9 +0,0 @@
       -#include <wchar.h>
       -
       -#include "../../libc.h"
       -
       -int
       -_mbsget(mbstate_t *ps)
       -{
       -        return ps->__opaque1;
       -}
 (DIR) diff --git a/src/libc/arch/linux/_mbsset.c b/src/libc/arch/linux/_mbsset.c
       @@ -1,9 +0,0 @@
       -#include <wchar.h>
       -
       -#include "../../libc.h"
       -
       -int
       -_mbsset(mbstate_t *ps, int ch)
       -{
       -        return ps->__opaque1 = ch;
       -}
 (DIR) diff --git a/src/libc/libc.h b/src/libc/libc.h
       @@ -61,8 +61,6 @@ extern void (*_atexithdl)(void);
        
        #ifdef _WCHAR_H
        extern int _validutf8(wchar_t, int *);
       -extern int _mbsset(mbstate_t *, int);
       -extern int _mbsget(mbstate_t *);
        #ifdef _STDIO_H
        extern wint_t _fputwc(wchar_t, FILE *, int *);
        #endif
 (DIR) diff --git a/src/libc/objs/amd64-linux.mk b/src/libc/objs/amd64-linux.mk
       @@ -36,8 +36,6 @@ OBJS =\
                arch/amd64/strcpy.$O\
                arch/linux/_brk.$O\
                arch/linux/_getheap.$O\
       -        arch/linux/_mbsget.$O\
       -        arch/linux/_mbsset.$O\
                arch/linux/_sigaction.$O\
                arch/linux/_waitpid.$O\
                arch/posix/_open.$O\
 (DIR) diff --git a/src/libc/objs/amd64-netbsd.mk b/src/libc/objs/amd64-netbsd.mk
       @@ -28,8 +28,6 @@ OBJS =\
                arch/amd64/strcmp.$O\
                arch/amd64/strcpy.$O\
                arch/bsd/_waitpid.$O\
       -        arch/bsd/_mbsget.$O\
       -        arch/bsd/_mbsset.$O\
                arch/netbsd/_sigaction.$O\
                arch/posix/_getheap.$O\
                arch/posix/_open.$O\
 (DIR) diff --git a/src/libc/objs/amd64-openbsd.mk b/src/libc/objs/amd64-openbsd.mk
       @@ -33,8 +33,6 @@ OBJS =\
                arch/amd64/strcmp.$O\
                arch/amd64/strcpy.$O\
                arch/bsd/_waitpid.$O\
       -        arch/bsd/_mbsget.$O\
       -        arch/bsd/_mbsset.$O\
                arch/posix/_getheap.$O\
                arch/posix/_open.$O\
                arch/posix/_systime.$O\
 (DIR) diff --git a/src/libc/stdlib/mbtowc.c b/src/libc/stdlib/mbtowc.c
       @@ -1,4 +1,5 @@
        #include <stdlib.h>
       +#include <string.h>
        #include <wchar.h>
        
        #undef mbtowc
       @@ -6,5 +7,12 @@
        int
        mbtowc(wchar_t *restrict pwc, const char *restrict s, size_t n)
        {
       -        return mbrtowc(pwc, s, n, NULL);
       +        static mbstate_t st;
       +        int ret;
       +
       +        ret = mbrtowc(pwc, s, n, &st);
       +        if (ret < 0)
       +                ret = -1;
       +
       +        return ret;
        }
 (DIR) diff --git a/src/libc/wchar/mbrtowc.c b/src/libc/wchar/mbrtowc.c
       @@ -1,5 +1,6 @@
        #include <errno.h>
        #include <stdlib.h>
       +#include <string.h>
        #include <wchar.h>
        
        #include "../libc.h"
       @@ -10,43 +11,88 @@ size_t
        mbrtowc(wchar_t *restrict pwc, const char *restrict s, size_t n,
                mbstate_t *restrict ps)
        {
       +        static mbstate_t state;
                const unsigned char *t = (const unsigned char *) s;
       +        wchar_t dummy;
                unsigned long wc;
       -        unsigned c;
       -        int i, len, maxlen;
       -
       -        if (t == NULL)
       -                return 0;
       -        if ((wc = *t) == 0)
       -                goto return_code;
       -
       -        c = *t++;
       -        for (len = 0; n > 0 && c & 0x80; --n, ++len)
       -                c <<= 1;
       -        if (n == 0 && c & 0x80)
       +        unsigned c, oc;
       +        int sh, max;
       +
       +        if (!ps)
       +                ps  = &state;
       +
       +        if (t == NULL) {
       +                if (ps->sh != 0)
       +                        goto return_error;
       +                pwc = &dummy;
       +                goto return_code_set;
       +        }
       +        if (n == 0)
                        return -2;
       -        if (len == 1 || len > MB_CUR_MAX)
       -                goto return_error;
       -        if (len == 0)
       -                goto return_code;
       -
       -        wc = (c & 0xFF) >> len;
       -        for (i = 0; i < len-1; i++) {
       -                if (((c = *t++) & 0xC0) != 0x80)
       +
       +        oc = ps->oc;
       +        wc = ps->wc;
       +        sh = ps->sh;
       +
       +        /* initial state? */
       +        if (sh == 0) {
       +                /* NUL character? */
       +                if ((c = wc = *t) == 0)
       +                        goto return_code;
       +                t++;
       +                n--;
       +
       +                /* fast track for ascii? */
       +                if (c < 0x80)
       +                        goto return_code;
       +
       +                /* out of sequence multibyte? */
       +                if ((c & 0xc0) != 0xc0)
                                goto return_error;
       +
       +                /* in sequence multibyte! */
       +                oc = c << 1;
       +                wc = 0;
       +                sh = 1;
       +        }
       +
       +        for ( ; n > 0; --n) {
       +                if (sh > MB_CUR_MAX)
       +                        goto return_error;
       +
       +                c = *t++;
       +                if ((c & 0xc0) != 0x80)
       +                        goto return_error;
       +
                        wc <<= 6;
       -                wc |= c & 0x3F;
       +                wc |= c & 0x3f;
       +                oc <<= 1;
       +                sh++;
       +
       +                if ((oc & 0x80) == 0) {
       +                        oc = (oc & 0xff) >> sh;
       +                        wc |= oc << (sh-1) * 6;
       +        
       +                        if (!_validutf8(wc, &max) || sh != max)
       +                                goto return_error;
       +                        goto return_code_set;
       +                }
                }
        
       -        if (!_validutf8(wc, &maxlen) || len != maxlen)
       -                goto return_error;
       +        ps->sh = sh;
       +        ps->oc = oc;
       +        ps->wc = wc;
       +        return -2;
        
       +return_code_set:
       +        memset(ps, 0, sizeof(*ps));
        return_code:
                if (pwc)
                        *pwc = wc;
                return t - (unsigned char *) s;
        
        return_error:
       +        memset(ps, 0, sizeof(*ps));
                errno = EILSEQ;
                return -1;
        }
 (DIR) diff --git a/src/libc/wchar/mbsinit.c b/src/libc/wchar/mbsinit.c
       @@ -1,7 +1,5 @@
        #include <wchar.h>
        
       -#include "../libc.h"
       -
        #undef mbsinit
        
        int
       @@ -9,5 +7,5 @@ mbsinit(const mbstate_t *ps)
        {
                if (!ps)
                        return 1;
       -        return _mbsget(ps) == 0;
       +        return ps->oc == 0;
        }