utf8pad: improve padded printing and printing invalid unicode characters - sfeed_curses - sfeed curses UI (now part of sfeed, development is in sfeed)
 (HTM) git clone git://git.codemadness.org/sfeed_curses
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 7f13213a355aba904f12a595b322909ce630fbe1
 (DIR) parent 1c4116d1fa7db2ddf540d05df381cbf58e932981
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Sat,  9 Jan 2021 14:57:57 +0100
       
       utf8pad: improve padded printing and printing invalid unicode characters
       
       - Use unicode replacement character (codepoint 0xfffd) when a codepoint is
         invalid and proceed printing the rest of the characters.
       
       - When a codepoint is invalid reset the internal state of mbtowc(3), from the
         OpenBSD man page:
       
         "  If a call to mbtowc() resulted in an undefined internal state, mbtowc()
            must be called with s set to NULL to reset the internal state before it
            can safely be used again."
       
       - Make the function return 0 when `len` is 0 (this should not be not an error).
       
       Diffstat:
         M sfeed_curses.c                      |      76 +++++++++++++++++++++++--------
       
       1 file changed, 56 insertions(+), 20 deletions(-)
       ---
 (DIR) diff --git a/sfeed_curses.c b/sfeed_curses.c
       @@ -30,6 +30,7 @@
        #define PAD_TRUNCATE_SYMBOL    "\xe2\x80\xa6" /* symbol: "ellipsis" */
        #define SCROLLBAR_SYMBOL_BAR   "\xe2\x94\x82" /* symbol: "light vertical" */
        #define SCROLLBAR_SYMBOL_TICK  " "
       +#define UTF_INVALID_SYMBOL     "\xef\xbf\xbd" /* symbol: "replacement" */
        
        /* color-theme */
        #ifndef SFEED_THEME
       @@ -310,15 +311,28 @@ colw(const char *s)
        {
                wchar_t wc;
                size_t col = 0, i, slen;
       -        int rl, w;
       +        int inc, rl, w;
        
                slen = strlen(s);
       -        for (i = 0; i < slen; i += rl) {
       -                if ((rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4)) <= 0)
       -                        break;
       -                if ((w = wcwidth(wc)) == -1)
       +        for (i = 0; i < slen; i += inc) {
       +                inc = 1;
       +                if ((unsigned char)s[i] < 32) {
                                continue;
       -                col += w;
       +                } else if ((unsigned char)s[i] >= 127) {
       +                        rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4);
       +                        if (rl < 0) {
       +                                mbtowc(NULL, NULL, 0); /* reset state */
       +                                inc = 1; /* next byte */
       +                                w = 1; /* replacement char is one width */
       +                        } else if ((w = wcwidth(wc)) == -1) {
       +                                continue;
       +                        } else {
       +                                inc = rl;
       +                        }
       +                        col += w;
       +                } else {
       +                        col++;
       +                }
                }
                return col;
        }
       @@ -330,33 +344,55 @@ utf8pad(char *buf, size_t bufsiz, const char *s, size_t len, int pad)
        {
                wchar_t wc;
                size_t col = 0, i, slen, siz = 0;
       -        int rl, w;
       +        int inc, rl, w;
        
       -        if (!len)
       +        if (!bufsiz)
                        return -1;
       +        if (!len) {
       +                buf[0] = '\0';
       +                return 0;
       +        }
        
                slen = strlen(s);
       -        for (i = 0; i < slen; i += rl) {
       -                if ((rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4)) <= 0)
       -                        break;
       -                if ((w = wcwidth(wc)) == -1)
       +        for (i = 0; i < slen; i += inc) {
       +                inc = 1;
       +                if ((unsigned char)s[i] < 32)
       +                        continue;
       +
       +                rl = mbtowc(&wc, &s[i], slen - i < 4 ? slen - i : 4);
       +                if (rl < 0) {
       +                        mbtowc(NULL, NULL, 0); /* reset state */
       +                        inc = 1; /* next byte */
       +                        w = 1; /* replacement char is one width */
       +                } else if ((w = wcwidth(wc)) == -1) {
                                continue;
       -                if (col + w > len || (col + w == len && s[i + rl])) {
       +                } else {
       +                        inc = rl;
       +                }
       +
       +                if (col + w > len || (col + w == len && s[i + inc])) {
                                if (siz + 4 >= bufsiz)
                                        return -1;
                                memcpy(&buf[siz], PAD_TRUNCATE_SYMBOL, sizeof(PAD_TRUNCATE_SYMBOL) - 1);
                                siz += sizeof(PAD_TRUNCATE_SYMBOL) - 1;
       -                        if (col + w == len && w > 1)
       -                                buf[siz++] = pad;
                                buf[siz] = '\0';
       -                        return 0;
       +                        col++;
       +                        break;
       +                } else if (rl < 0) {
       +                        if (siz + 4 >= bufsiz)
       +                                return -1;
       +                        memcpy(&buf[siz], UTF_INVALID_SYMBOL, sizeof(UTF_INVALID_SYMBOL) - 1);
       +                        siz += sizeof(UTF_INVALID_SYMBOL) - 1;
       +                        buf[siz] = '\0';
       +                        col++;
       +                        continue;
                        }
       -                if (siz + rl + 1 >= bufsiz)
       +                if (siz + inc + 1 >= bufsiz)
                                return -1;
       -                memcpy(&buf[siz], &s[i], rl);
       -                col += w;
       -                siz += rl;
       +                memcpy(&buf[siz], &s[i], inc);
       +                siz += inc;
                        buf[siz] = '\0';
       +                col += w;
                }
        
                len -= col;