libc/wchar: Rewrite mbtowc() - scc - simple c99 compiler
(HTM) git clone git://git.simple-cc.org/scc
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) Submodules
(DIR) README
(DIR) LICENSE
---
(DIR) commit b4b507633186e7f223231a60447d82a6e3ab92af
(DIR) parent f507bae3a0e45b6f9eeadecb3065afaea1a6d6bc
(HTM) Author: Roberto E. Vargas Caballero <k0ga@shike2.com>
Date: Mon, 10 Mar 2025 21:22:52 +0100
libc/wchar: Rewrite mbtowc()
There are many different corner cases in the implementation of the
underlying call to mbrtowc(). When mbrtowc() returns -2, it should
be ready to receive new bytes in subsequent calls, which requires updating
the conversion state.
Also, the standard specifies:
The implementation shall behave as if no library function calls
the mbtowc function.
and for that reason, we cannot pass a NULL pointer as state to mbrtowc()
because that would imply using the hidden state of mbrtowc().
We considered keeping ABI compatibility with the definition of mbstate_t
in the different systems, but it created too many problems. There are no
guarantees that code compiled with different libc implementations would
work, and for that reason we dropped the ABI compatibility.
Diffstat:
M include/bits/darwin/sys/cdefs.h | 10 +---------
M include/bits/dragonfly/sys/cdefs.h | 9 +--------
M include/bits/freebsd/sys/cdefs.h | 9 +--------
M include/bits/linux/sys/cdefs.h | 9 +--------
M include/bits/netbsd/sys/cdefs.h | 9 +--------
M include/bits/openbsd/sys/cdefs.h | 9 +--------
M include/wchar.h | 7 ++++++-
M src/libc/arch/bsd/Makefile | 4 +---
D src/libc/arch/bsd/_mbsget.c | 9 ---------
D src/libc/arch/bsd/_mbsset.c | 9 ---------
M src/libc/arch/darwin/Makefile | 2 --
D src/libc/arch/darwin/_mbsget.c | 9 ---------
D src/libc/arch/darwin/_mbsset.c | 10 ----------
M src/libc/arch/linux/Makefile | 2 --
D src/libc/arch/linux/_mbsget.c | 9 ---------
D src/libc/arch/linux/_mbsset.c | 9 ---------
M src/libc/libc.h | 2 --
M src/libc/objs/amd64-linux.mk | 2 --
M src/libc/objs/amd64-netbsd.mk | 2 --
M src/libc/objs/amd64-openbsd.mk | 2 --
M src/libc/stdlib/mbtowc.c | 10 +++++++++-
M src/libc/wchar/mbrtowc.c | 92 +++++++++++++++++++++++--------
M src/libc/wchar/mbsinit.c | 4 +---
23 files changed, 92 insertions(+), 147 deletions(-)
---
(DIR) diff --git a/include/bits/darwin/sys/cdefs.h b/include/bits/darwin/sys/cdefs.h
@@ -1,9 +1 @@
-#ifdef _NEED_MBSTATE_T
-#ifndef _MBSTATE_T
-typedef struct {
- unsigned char state[4];
- size_t count;
-} mbstate_t;
-#define _MBSTATE_T
-#endif
-#endif
+/* nothing for darwin */
(DIR) diff --git a/include/bits/dragonfly/sys/cdefs.h b/include/bits/dragonfly/sys/cdefs.h
@@ -1,8 +1 @@
-#ifdef _NEED_MBSTATE_T
-#ifndef _MBSTATE_T
-typedef union {
- char __mbstate8[128];
-} mbstate_t;
-#define _MBSTATE_T
-#endif
-#endif
+/* nothing for dragonfly */
(DIR) diff --git a/include/bits/freebsd/sys/cdefs.h b/include/bits/freebsd/sys/cdefs.h
@@ -1,8 +1 @@
-#ifdef _NEED_MBSTATE_T
-#ifndef _MBSTATE_T
-typedef union {
- char __mbstate8[128];
-} mbstate_t;
-#define _MBSTATE_T
-#endif
-#endif
+/* nothing for freebsd */
(DIR) diff --git a/include/bits/linux/sys/cdefs.h b/include/bits/linux/sys/cdefs.h
@@ -1,8 +1 @@
-#ifdef _NEED_MBSTATE_T
-#ifndef _MBSTATE_T
-typedef struct __mbstate_t {
- unsigned __opaque1, __opaque2;
-} mbstate_t;
-#define _MBSTATE_T
-#endif
-#endif
+/* nothing for Linux */
(DIR) diff --git a/include/bits/netbsd/sys/cdefs.h b/include/bits/netbsd/sys/cdefs.h
@@ -1,8 +1 @@
-#ifdef _NEED_MBSTATE_T
-#ifndef _MBSTATE_T
-typedef union {
- char __mbstate8[128];
-} mbstate_t;
-#define _MBSTATE_T
-#endif
-#endif
+/* nothing for netbsd */
(DIR) diff --git a/include/bits/openbsd/sys/cdefs.h b/include/bits/openbsd/sys/cdefs.h
@@ -1,8 +1 @@
-#ifdef _NEED_MBSTATE_T
-#ifndef _MBSTATE_T
-typedef union {
- char __mbstate8[128];
-} mbstate_t;
-#define _MBSTATE_T
-#endif
-#endif
+/* nothing for Openbsd */
(DIR) diff --git a/include/wchar.h b/include/wchar.h
@@ -8,10 +8,15 @@
#define _NEED_WCHARLIM
#define _NEED_WINT
#define _NEED_VA_LIST
-#define _NEED_MBSTATE_T
#include <arch/cdefs.h>
#include <sys/cdefs.h>
+typedef struct {
+ unsigned char oc;
+ unsigned char sh;
+ wchar_t wc;
+} mbstate_t;
+
struct tm;
struct _FILE;
(DIR) diff --git a/src/libc/arch/bsd/Makefile b/src/libc/arch/bsd/Makefile
@@ -4,8 +4,6 @@ include $(PROJECTDIR)/scripts/rules.mk
include ../../rules.mk
OBJS=\
- _mbsget.$O\
- _mbsset.$O\
- _waitpid.$O\
+ _waitpid.$O\
all: $(OBJS)
(DIR) diff --git a/src/libc/arch/bsd/_mbsget.c b/src/libc/arch/bsd/_mbsget.c
@@ -1,9 +0,0 @@
-#include <wchar.h>
-
-#include "../../libc.h"
-
-int
-_mbsget(mbstate_t *ps)
-{
- return ps->__mbstate8[0];
-}
(DIR) diff --git a/src/libc/arch/bsd/_mbsset.c b/src/libc/arch/bsd/_mbsset.c
@@ -1,9 +0,0 @@
-#include <wchar.h>
-
-#include "../../libc.h"
-
-int
-_mbsset(mbstate_t *ps, int ch)
-{
- return ps->__mbstate8[0] = ch;
-}
(DIR) diff --git a/src/libc/arch/darwin/Makefile b/src/libc/arch/darwin/Makefile
@@ -5,7 +5,5 @@ include ../../rules.mk
OBJS=\
_getheap.$O\
- _mbsget.$O\
- _mbsset.$O\
all: $(OBJS)
(DIR) diff --git a/src/libc/arch/darwin/_mbsget.c b/src/libc/arch/darwin/_mbsget.c
@@ -1,9 +0,0 @@
-#include <wchar.h>
-
-#include "../../libc.h"
-
-int
-_mbsget(mbstate_t *ps)
-{
- return ps->state[0];
-}
(DIR) diff --git a/src/libc/arch/darwin/_mbsset.c b/src/libc/arch/darwin/_mbsset.c
@@ -1,10 +0,0 @@
-#include <wchar.h>
-
-#include "../../libc.h"
-
-int
-_mbsset(mbstate_t *ps, int ch)
-{
- ps-count = 1;
- return ps->state[0] = ch;
-}
(DIR) diff --git a/src/libc/arch/linux/Makefile b/src/libc/arch/linux/Makefile
@@ -6,8 +6,6 @@ include ../../rules.mk
OBJS=\
_brk.$O\
_getheap.$O\
- _mbsget.$O\
- _mbsset.$O\
_sigaction.$O\
_waitpid.$O\
(DIR) diff --git a/src/libc/arch/linux/_mbsget.c b/src/libc/arch/linux/_mbsget.c
@@ -1,9 +0,0 @@
-#include <wchar.h>
-
-#include "../../libc.h"
-
-int
-_mbsget(mbstate_t *ps)
-{
- return ps->__opaque1;
-}
(DIR) diff --git a/src/libc/arch/linux/_mbsset.c b/src/libc/arch/linux/_mbsset.c
@@ -1,9 +0,0 @@
-#include <wchar.h>
-
-#include "../../libc.h"
-
-int
-_mbsset(mbstate_t *ps, int ch)
-{
- return ps->__opaque1 = ch;
-}
(DIR) diff --git a/src/libc/libc.h b/src/libc/libc.h
@@ -61,8 +61,6 @@ extern void (*_atexithdl)(void);
#ifdef _WCHAR_H
extern int _validutf8(wchar_t, int *);
-extern int _mbsset(mbstate_t *, int);
-extern int _mbsget(mbstate_t *);
#ifdef _STDIO_H
extern wint_t _fputwc(wchar_t, FILE *, int *);
#endif
(DIR) diff --git a/src/libc/objs/amd64-linux.mk b/src/libc/objs/amd64-linux.mk
@@ -36,8 +36,6 @@ OBJS =\
arch/amd64/strcpy.$O\
arch/linux/_brk.$O\
arch/linux/_getheap.$O\
- arch/linux/_mbsget.$O\
- arch/linux/_mbsset.$O\
arch/linux/_sigaction.$O\
arch/linux/_waitpid.$O\
arch/posix/_open.$O\
(DIR) diff --git a/src/libc/objs/amd64-netbsd.mk b/src/libc/objs/amd64-netbsd.mk
@@ -28,8 +28,6 @@ OBJS =\
arch/amd64/strcmp.$O\
arch/amd64/strcpy.$O\
arch/bsd/_waitpid.$O\
- arch/bsd/_mbsget.$O\
- arch/bsd/_mbsset.$O\
arch/netbsd/_sigaction.$O\
arch/posix/_getheap.$O\
arch/posix/_open.$O\
(DIR) diff --git a/src/libc/objs/amd64-openbsd.mk b/src/libc/objs/amd64-openbsd.mk
@@ -33,8 +33,6 @@ OBJS =\
arch/amd64/strcmp.$O\
arch/amd64/strcpy.$O\
arch/bsd/_waitpid.$O\
- arch/bsd/_mbsget.$O\
- arch/bsd/_mbsset.$O\
arch/posix/_getheap.$O\
arch/posix/_open.$O\
arch/posix/_systime.$O\
(DIR) diff --git a/src/libc/stdlib/mbtowc.c b/src/libc/stdlib/mbtowc.c
@@ -1,4 +1,5 @@
#include <stdlib.h>
+#include <string.h>
#include <wchar.h>
#undef mbtowc
@@ -6,5 +7,12 @@
int
mbtowc(wchar_t *restrict pwc, const char *restrict s, size_t n)
{
- return mbrtowc(pwc, s, n, NULL);
+ static mbstate_t st;
+ int ret;
+
+ ret = mbrtowc(pwc, s, n, &st);
+ if (ret < 0)
+ ret = -1;
+
+ return ret;
}
(DIR) diff --git a/src/libc/wchar/mbrtowc.c b/src/libc/wchar/mbrtowc.c
@@ -1,5 +1,6 @@
#include <errno.h>
#include <stdlib.h>
+#include <string.h>
#include <wchar.h>
#include "../libc.h"
@@ -10,43 +11,88 @@ size_t
mbrtowc(wchar_t *restrict pwc, const char *restrict s, size_t n,
mbstate_t *restrict ps)
{
+ static mbstate_t state;
const unsigned char *t = (const unsigned char *) s;
+ wchar_t dummy;
unsigned long wc;
- unsigned c;
- int i, len, maxlen;
-
- if (t == NULL)
- return 0;
- if ((wc = *t) == 0)
- goto return_code;
-
- c = *t++;
- for (len = 0; n > 0 && c & 0x80; --n, ++len)
- c <<= 1;
- if (n == 0 && c & 0x80)
+ unsigned c, oc;
+ int sh, max;
+
+ if (!ps)
+ ps = &state;
+
+ if (t == NULL) {
+ if (ps->sh != 0)
+ goto return_error;
+ pwc = &dummy;
+ goto return_code_set;
+ }
+ if (n == 0)
return -2;
- if (len == 1 || len > MB_CUR_MAX)
- goto return_error;
- if (len == 0)
- goto return_code;
-
- wc = (c & 0xFF) >> len;
- for (i = 0; i < len-1; i++) {
- if (((c = *t++) & 0xC0) != 0x80)
+
+ oc = ps->oc;
+ wc = ps->wc;
+ sh = ps->sh;
+
+ /* initial state? */
+ if (sh == 0) {
+ /* NUL character? */
+ if ((c = wc = *t) == 0)
+ goto return_code;
+ t++;
+ n--;
+
+ /* fast track for ascii? */
+ if (c < 0x80)
+ goto return_code;
+
+ /* out of sequence multibyte? */
+ if ((c & 0xc0) != 0xc0)
goto return_error;
+
+ /* in sequence multibyte! */
+ oc = c << 1;
+ wc = 0;
+ sh = 1;
+ }
+
+ for ( ; n > 0; --n) {
+ if (sh > MB_CUR_MAX)
+ goto return_error;
+
+ c = *t++;
+ if ((c & 0xc0) != 0x80)
+ goto return_error;
+
wc <<= 6;
- wc |= c & 0x3F;
+ wc |= c & 0x3f;
+ oc <<= 1;
+ sh++;
+
+ if ((oc & 0x80) == 0) {
+ oc = (oc & 0xff) >> sh;
+ wc |= oc << (sh-1) * 6;
+
+ if (!_validutf8(wc, &max) || sh != max)
+ goto return_error;
+ goto return_code_set;
+ }
}
- if (!_validutf8(wc, &maxlen) || len != maxlen)
- goto return_error;
+ ps->sh = sh;
+ ps->oc = oc;
+ ps->wc = wc;
+ return -2;
+return_code_set:
+ memset(ps, 0, sizeof(*ps));
return_code:
if (pwc)
*pwc = wc;
return t - (unsigned char *) s;
return_error:
+ memset(ps, 0, sizeof(*ps));
errno = EILSEQ;
return -1;
}
(DIR) diff --git a/src/libc/wchar/mbsinit.c b/src/libc/wchar/mbsinit.c
@@ -1,7 +1,5 @@
#include <wchar.h>
-#include "../libc.h"
-
#undef mbsinit
int
@@ -9,5 +7,5 @@ mbsinit(const mbstate_t *ps)
{
if (!ps)
return 1;
- return _mbsget(ps) == 0;
+ return ps->oc == 0;
}