utf8.c - libgrapheme - unicode string library
 (HTM) git clone git://git.suckless.org/libgrapheme
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       utf8.c (6131B)
       ---
            1 /* See LICENSE file for copyright and license details. */
            2 #include <stddef.h>
            3 #include <stdint.h>
            4 
            5 #include "../grapheme.h"
            6 #include "util.h"
            7 
            8 #define BETWEEN(c, l, u) ((c) >= (l) && (c) <= (u))
            9 
           10 /* lookup-table for the types of sequence first bytes */
           11 static const struct {
           12         uint_least8_t lower;  /* lower bound of sequence first byte */
           13         uint_least8_t upper;  /* upper bound of sequence first byte */
           14         uint_least32_t mincp; /* smallest non-overlong encoded codepoint */
           15         uint_least32_t maxcp; /* largest encodable codepoint */
           16                               /*
           17                                * implicit: table-offset represents the number of following
           18                                * bytes of the form 10xxxxxx (6 bits capacity each)
           19                                */
           20 } lut[] = {
           21         [0] = {
           22                 /* 0xxxxxxx */
           23                 .lower = 0x00, /* 00000000 */
           24                 .upper = 0x7F, /* 01111111 */
           25                 .mincp = (uint_least32_t)0,
           26                 .maxcp = ((uint_least32_t)1 << 7) - 1, /* 7 bits capacity */
           27         },
           28         [1] = {
           29                 /* 110xxxxx */
           30                 .lower = 0xC0, /* 11000000 */
           31                 .upper = 0xDF, /* 11011111 */
           32                 .mincp = (uint_least32_t)1 << 7,
           33                 .maxcp = ((uint_least32_t)1 << 11) - 1, /* 5+6=11 bits capacity */
           34         },
           35         [2] = {
           36                 /* 1110xxxx */
           37                 .lower = 0xE0, /* 11100000 */
           38                 .upper = 0xEF, /* 11101111 */
           39                 .mincp = (uint_least32_t)1 << 11,
           40                 .maxcp = ((uint_least32_t)1 << 16) - 1, /* 4+6+6=16 bits capacity */
           41         },
           42         [3] = {
           43                 /* 11110xxx */
           44                 .lower = 0xF0, /* 11110000 */
           45                 .upper = 0xF7, /* 11110111 */
           46                 .mincp = (uint_least32_t)1 << 16,
           47                 .maxcp = ((uint_least32_t)1 << 21) - 1, /* 3+6+6+6=21 bits capacity */
           48         },
           49 };
           50 
           51 size_t
           52 grapheme_decode_utf8(const char *str, size_t len, uint_least32_t *cp)
           53 {
           54         size_t off, i;
           55         uint_least32_t tmp;
           56 
           57         if (cp == NULL) {
           58                 /*
           59                  * instead of checking every time if cp is NULL within
           60                  * the decoder, simply point it at a dummy variable here.
           61                  */
           62                 cp = &tmp;
           63         }
           64 
           65         if (str == NULL || len == 0) {
           66                 /* a sequence must be at least 1 byte long */
           67                 *cp = GRAPHEME_INVALID_CODEPOINT;
           68                 return 0;
           69         }
           70 
           71         /* identify sequence type with the first byte */
           72         for (off = 0; off < LEN(lut); off++) {
           73                 if (BETWEEN(((const unsigned char *)str)[0], lut[off].lower,
           74                             lut[off].upper)) {
           75                         /*
           76                          * first byte is within the bounds; fill
           77                          * p with the the first bits contained in
           78                          * the first byte (by subtracting the high bits)
           79                          */
           80                         *cp = ((const unsigned char *)str)[0] - lut[off].lower;
           81                         break;
           82                 }
           83         }
           84         if (off == LEN(lut)) {
           85                 /*
           86                  * first byte does not match a sequence type;
           87                  * set cp as invalid and return 1 byte processed
           88                  *
           89                  * this also includes the cases where bits higher than
           90                  * the 8th are set on systems with CHAR_BIT > 8
           91                  */
           92                 *cp = GRAPHEME_INVALID_CODEPOINT;
           93                 return 1;
           94         }
           95         if (1 + off > len) {
           96                 /*
           97                  * input is not long enough, set cp as invalid
           98                  */
           99                 *cp = GRAPHEME_INVALID_CODEPOINT;
          100 
          101                 /*
          102                  * count the following continuation bytes, but nothing
          103                  * else in case we have a "rogue" case where e.g. such a
          104                  * sequence starter occurs right before a NUL-byte.
          105                  */
          106                 for (i = 0; 1 + i < len; i++) {
          107                         if (!BETWEEN(((const unsigned char *)str)[1 + i], 0x80,
          108                                      0xBF)) {
          109                                 break;
          110                         }
          111                 }
          112 
          113                 /*
          114                  * if the continuation bytes do not continue until
          115                  * the end, return the incomplete sequence length.
          116                  * Otherwise return the number of bytes we actually
          117                  * expected, which is larger than n.
          118                  */
          119                 return ((1 + i) < len) ? (1 + i) : (1 + off);
          120         }
          121 
          122         /*
          123          * process 'off' following bytes, each of the form 10xxxxxx
          124          * (i.e. between 0x80 (10000000) and 0xBF (10111111))
          125          */
          126         for (i = 1; i <= off; i++) {
          127                 if (!BETWEEN(((const unsigned char *)str)[i], 0x80, 0xBF)) {
          128                         /*
          129                          * byte does not match format; return
          130                          * number of bytes processed excluding the
          131                          * unexpected character as recommended since
          132                          * Unicode 6 (chapter 3)
          133                          *
          134                          * this also includes the cases where bits
          135                          * higher than the 8th are set on systems
          136                          * with CHAR_BIT > 8
          137                          */
          138                         *cp = GRAPHEME_INVALID_CODEPOINT;
          139                         return 1 + (i - 1);
          140                 }
          141                 /*
          142                  * shift codepoint by 6 bits and add the 6 stored bits
          143                  * in s[i] to it using the bitmask 0x3F (00111111)
          144                  */
          145                 *cp = (*cp << 6) | (((const unsigned char *)str)[i] & 0x3F);
          146         }
          147 
          148         if (*cp < lut[off].mincp ||
          149             BETWEEN(*cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) ||
          150             *cp > UINT32_C(0x10FFFF)) {
          151                 /*
          152                  * codepoint is overlong encoded in the sequence, is a
          153                  * high or low UTF-16 surrogate half (0xD800..0xDFFF) or
          154                  * not representable in UTF-16 (>0x10FFFF) (RFC-3629
          155                  * specifies the latter two conditions)
          156                  */
          157                 *cp = GRAPHEME_INVALID_CODEPOINT;
          158         }
          159 
          160         return 1 + off;
          161 }
          162 
          163 size_t
          164 grapheme_encode_utf8(uint_least32_t cp, char *str, size_t len)
          165 {
          166         size_t off, i;
          167 
          168         if (BETWEEN(cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) ||
          169             cp > UINT32_C(0x10FFFF)) {
          170                 /*
          171                  * codepoint is a high or low UTF-16 surrogate half
          172                  * (0xD800..0xDFFF) or not representable in UTF-16
          173                  * (>0x10FFFF), which RFC-3629 deems invalid for UTF-8.
          174                  */
          175                 cp = GRAPHEME_INVALID_CODEPOINT;
          176         }
          177 
          178         /* determine necessary sequence type */
          179         for (off = 0; off < LEN(lut); off++) {
          180                 if (cp <= lut[off].maxcp) {
          181                         break;
          182                 }
          183         }
          184         if (1 + off > len || str == NULL || len == 0) {
          185                 /*
          186                  * specified buffer is too small to store sequence or
          187                  * the caller just wanted to know how many bytes the
          188                  * codepoint needs by passing a NULL-buffer.
          189                  */
          190                 return 1 + off;
          191         }
          192 
          193         /* build sequence by filling cp-bits into each byte */
          194 
          195         /*
          196          * lut[off].lower is the bit-format for the first byte and
          197          * the bits to fill into it are determined by shifting the
          198          * cp 6 times the number of following bytes, as each
          199          * following byte stores 6 bits, yielding the wanted bits.
          200          *
          201          * We do not overwrite the mask because we guaranteed earlier
          202          * that there are no bits higher than the mask allows.
          203          */
          204         ((unsigned char *)str)[0] =
          205                 lut[off].lower | (uint_least8_t)(cp >> (6 * off));
          206 
          207         for (i = 1; i <= off; i++) {
          208                 /*
          209                  * the bit-format for following bytes is 10000000 (0x80)
          210                  * and it each stores 6 bits in the 6 low bits that we
          211                  * extract from the properly-shifted value using the
          212                  * mask 00111111 (0x3F)
          213                  */
          214                 ((unsigned char *)str)[i] =
          215                         0x80 | ((cp >> (6 * (off - i))) & 0x3F);
          216         }
          217 
          218         return 1 + off;
          219 }