utf8-decode.c - libgrapheme - unicode string library
 (HTM) git clone git://git.suckless.org/libgrapheme
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       utf8-decode.c (7826B)
       ---
            1 /* See LICENSE file for copyright and license details. */
            2 #include <stddef.h>
            3 #include <stdint.h>
            4 #include <stdio.h>
            5 #include <string.h>
            6 
            7 #include "../grapheme.h"
            8 #include "util.h"
            9 
           10 static const struct {
           11         char *arr;             /* UTF-8 byte sequence */
           12         size_t len;            /* length of UTF-8 byte sequence */
           13         size_t exp_len;        /* expected length returned */
           14         uint_least32_t exp_cp; /* expected codepoint returned */
           15 } dec_test[] = {
           16         {
           17                 /* empty sequence
           18                  * [ ] ->
           19                  * INVALID
           20                  */
           21                 .arr = NULL,
           22                 .len = 0,
           23                 .exp_len = 0,
           24                 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
           25         },
           26         {
           27                 /* invalid lead byte
           28                  * [ 11111101 ] ->
           29                  * INVALID
           30                  */
           31                 .arr = (char *)(unsigned char[]) { 0xFD },
           32                 .len = 1,
           33                 .exp_len = 1,
           34                 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
           35         },
           36         {
           37                 /* valid 1-byte sequence
           38                  * [ 00000001 ] ->
           39                  * 0000001
           40                  */
           41                 .arr = (char *)(unsigned char[]) { 0x01 },
           42                 .len = 1,
           43                 .exp_len = 1,
           44                 .exp_cp = 0x1,
           45         },
           46         {
           47                 /* valid 2-byte sequence
           48                  * [ 11000011 10111111 ] ->
           49                  * 00011111111
           50                  */
           51                 .arr = (char *)(unsigned char[]) { 0xC3, 0xBF },
           52                 .len = 2,
           53                 .exp_len = 2,
           54                 .exp_cp = 0xFF,
           55         },
           56         {
           57                 /* invalid 2-byte sequence (second byte missing)
           58                  * [ 11000011 ] ->
           59                  * INVALID
           60                  */
           61                 .arr = (char *)(unsigned char[]) { 0xC3 },
           62                 .len = 1,
           63                 .exp_len = 2,
           64                 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
           65         },
           66         {
           67                 /* invalid 2-byte sequence (second byte malformed)
           68                  * [ 11000011 11111111 ] ->
           69                  * INVALID
           70                  */
           71                 .arr = (char *)(unsigned char[]) { 0xC3, 0xFF },
           72                 .len = 2,
           73                 .exp_len = 1,
           74                 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
           75         },
           76         {
           77                 /* invalid 2-byte sequence (overlong encoded)
           78                  * [ 11000001 10111111 ] ->
           79                  * INVALID
           80                  */
           81                 .arr = (char *)(unsigned char[]) { 0xC1, 0xBF },
           82                 .len = 2,
           83                 .exp_len = 2,
           84                 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
           85         },
           86         {
           87                 /* valid 3-byte sequence
           88                  * [ 11100000 10111111 10111111 ] ->
           89                  * 0000111111111111
           90                  */
           91                 .arr = (char *)(unsigned char[]) { 0xE0, 0xBF, 0xBF },
           92                 .len = 3,
           93                 .exp_len = 3,
           94                 .exp_cp = 0xFFF,
           95         },
           96         {
           97                 /* invalid 3-byte sequence (second byte missing)
           98                  * [ 11100000 ] ->
           99                  * INVALID
          100                  */
          101                 .arr = (char *)(unsigned char[]) { 0xE0 },
          102                 .len = 1,
          103                 .exp_len = 3,
          104                 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
          105         },
          106         {
          107                 /* invalid 3-byte sequence (second byte malformed)
          108                  * [ 11100000 01111111 10111111 ] ->
          109                  * INVALID
          110                  */
          111                 .arr = (char *)(unsigned char[]) { 0xE0, 0x7F, 0xBF },
          112                 .len = 3,
          113                 .exp_len = 1,
          114                 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
          115         },
          116         {
          117                 /* invalid 3-byte sequence (short string, second byte malformed)
          118                  * [ 11100000 01111111 ] ->
          119                  * INVALID
          120                  */
          121                 .arr = (char *)(unsigned char[]) { 0xE0, 0x7F },
          122                 .len = 2,
          123                 .exp_len = 1,
          124                 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
          125         },
          126         {
          127                 /* invalid 3-byte sequence (third byte missing)
          128                  * [ 11100000 10111111 ] ->
          129                  * INVALID
          130                  */
          131                 .arr = (char *)(unsigned char[]) { 0xE0, 0xBF },
          132                 .len = 2,
          133                 .exp_len = 3,
          134                 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
          135         },
          136         {
          137                 /* invalid 3-byte sequence (third byte malformed)
          138                  * [ 11100000 10111111 01111111 ] ->
          139                  * INVALID
          140                  */
          141                 .arr = (char *)(unsigned char[]) { 0xE0, 0xBF, 0x7F },
          142                 .len = 3,
          143                 .exp_len = 2,
          144                 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
          145         },
          146         {
          147                 /* invalid 3-byte sequence (overlong encoded)
          148                  * [ 11100000 10011111 10111111 ] ->
          149                  * INVALID
          150                  */
          151                 .arr = (char *)(unsigned char[]) { 0xE0, 0x9F, 0xBF },
          152                 .len = 3,
          153                 .exp_len = 3,
          154                 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
          155         },
          156         {
          157                 /* invalid 3-byte sequence (UTF-16 surrogate half)
          158                  * [ 11101101 10100000 10000000 ] ->
          159                  * INVALID
          160                  */
          161                 .arr = (char *)(unsigned char[]) { 0xED, 0xA0, 0x80 },
          162                 .len = 3,
          163                 .exp_len = 3,
          164                 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
          165         },
          166         {
          167                 /* valid 4-byte sequence
          168                  * [ 11110011 10111111 10111111 10111111 ] ->
          169                  * 011111111111111111111
          170                  */
          171                 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0xBF, 0xBF },
          172                 .len = 4,
          173                 .exp_len = 4,
          174                 .exp_cp = UINT32_C(0xFFFFF),
          175         },
          176         {
          177                 /* invalid 4-byte sequence (second byte missing)
          178                  * [ 11110011 ] ->
          179                  * INVALID
          180                  */
          181                 .arr = (char *)(unsigned char[]) { 0xF3 },
          182                 .len = 1,
          183                 .exp_len = 4,
          184                 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
          185         },
          186         {
          187                 /* invalid 4-byte sequence (second byte malformed)
          188                  * [ 11110011 01111111 10111111 10111111 ] ->
          189                  * INVALID
          190                  */
          191                 .arr = (char *)(unsigned char[]) { 0xF3, 0x7F, 0xBF, 0xBF },
          192                 .len = 4,
          193                 .exp_len = 1,
          194                 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
          195         },
          196         {
          197                 /* invalid 4-byte sequence (short string 1, second byte
          198                  * malformed) [ 11110011 011111111 ] -> INVALID
          199                  */
          200                 .arr = (char *)(unsigned char[]) { 0xF3, 0x7F },
          201                 .len = 2,
          202                 .exp_len = 1,
          203                 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
          204         },
          205         {
          206                 /* invalid 4-byte sequence (short string 2, second byte
          207                  * malformed) [ 11110011 011111111 10111111 ] -> INVALID
          208                  */
          209                 .arr = (char *)(unsigned char[]) { 0xF3, 0x7F, 0xBF },
          210                 .len = 3,
          211                 .exp_len = 1,
          212                 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
          213         },
          214 
          215         {
          216                 /* invalid 4-byte sequence (third byte missing)
          217                  * [ 11110011 10111111 ] ->
          218                  * INVALID
          219                  */
          220                 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF },
          221                 .len = 2,
          222                 .exp_len = 4,
          223                 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
          224         },
          225         {
          226                 /* invalid 4-byte sequence (third byte malformed)
          227                  * [ 11110011 10111111 01111111 10111111 ] ->
          228                  * INVALID
          229                  */
          230                 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0x7F, 0xBF },
          231                 .len = 4,
          232                 .exp_len = 2,
          233                 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
          234         },
          235         {
          236                 /* invalid 4-byte sequence (short string, third byte malformed)
          237                  * [ 11110011 10111111 01111111 ] ->
          238                  * INVALID
          239                  */
          240                 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0x7F },
          241                 .len = 3,
          242                 .exp_len = 2,
          243                 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
          244         },
          245         {
          246                 /* invalid 4-byte sequence (fourth byte missing)
          247                  * [ 11110011 10111111 10111111 ] ->
          248                  * INVALID
          249                  */
          250                 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0xBF },
          251                 .len = 3,
          252                 .exp_len = 4,
          253                 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
          254         },
          255         {
          256                 /* invalid 4-byte sequence (fourth byte malformed)
          257                  * [ 11110011 10111111 10111111 01111111 ] ->
          258                  * INVALID
          259                  */
          260                 .arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0xBF, 0x7F },
          261                 .len = 4,
          262                 .exp_len = 3,
          263                 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
          264         },
          265         {
          266                 /* invalid 4-byte sequence (overlong encoded)
          267                  * [ 11110000 10000000 10000001 10111111 ] ->
          268                  * INVALID
          269                  */
          270                 .arr = (char *)(unsigned char[]) { 0xF0, 0x80, 0x81, 0xBF },
          271                 .len = 4,
          272                 .exp_len = 4,
          273                 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
          274         },
          275         {
          276                 /* invalid 4-byte sequence (UTF-16-unrepresentable)
          277                  * [ 11110100 10010000 10000000 10000000 ] ->
          278                  * INVALID
          279                  */
          280                 .arr = (char *)(unsigned char[]) { 0xF4, 0x90, 0x80, 0x80 },
          281                 .len = 4,
          282                 .exp_len = 4,
          283                 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
          284         },
          285 };
          286 
          287 int
          288 main(int argc, char *argv[])
          289 {
          290         size_t i, failed;
          291 
          292         (void)argc;
          293 
          294         /* UTF-8 decoder test */
          295         for (i = 0, failed = 0; i < LEN(dec_test); i++) {
          296                 size_t len;
          297                 uint_least32_t cp;
          298 
          299                 len = grapheme_decode_utf8(dec_test[i].arr, dec_test[i].len,
          300                                            &cp);
          301 
          302                 if (len != dec_test[i].exp_len || cp != dec_test[i].exp_cp) {
          303                         fprintf(stderr,
          304                                 "%s: Failed test %zu: "
          305                                 "Expected (%zx,%u), but got (%zx,%u).\n",
          306                                 argv[0], i, dec_test[i].exp_len,
          307                                 dec_test[i].exp_cp, len, cp);
          308                         failed++;
          309                 }
          310         }
          311         printf("%s: %zu/%zu unit tests passed.\n", argv[0],
          312                LEN(dec_test) - failed, LEN(dec_test));
          313 
          314         return (failed > 0) ? 1 : 0;
          315 }