character.c - libgrapheme - unicode string library
 (HTM) git clone git://git.suckless.org/libgrapheme
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       character.c (18706B)
       ---
            1 #include <stdio.h>
            2 
            3 /* See LICENSE file for copyright and license details. */
            4 #include <limits.h>
            5 #include <stdbool.h>
            6 #include <stddef.h>
            7 
            8 #include "../gen/character.h"
            9 #include "../grapheme.h"
           10 #include "util.h"
           11 
           12 struct character_break_state {
           13         uint_least8_t prop;
           14         bool prop_set;
           15         bool gb11_flag;
           16         bool gb12_13_flag;
           17         uint_least8_t gb9c_level;
           18 };
           19 
           20 static const uint_least32_t dont_break[NUM_CHAR_BREAK_PROPS] = {
           21         [CHAR_BREAK_PROP_OTHER] =
           22                 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
           23                 UINT32_C(1)
           24                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
           25                 UINT32_C(1)
           26                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
           27                 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
           28                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
           29                 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
           30         [CHAR_BREAK_PROP_ICB_CONSONANT] =
           31                 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
           32                 UINT32_C(1)
           33                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
           34                 UINT32_C(1)
           35                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
           36                 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
           37                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
           38                 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
           39         [CHAR_BREAK_PROP_ICB_EXTEND] =
           40                 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
           41                 UINT32_C(1)
           42                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
           43                 UINT32_C(1)
           44                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
           45                 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
           46                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
           47                 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
           48         [CHAR_BREAK_PROP_ICB_LINKER] =
           49                 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
           50                 UINT32_C(1)
           51                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
           52                 UINT32_C(1)
           53                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
           54                 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
           55                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
           56                 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
           57         [CHAR_BREAK_PROP_CR] = UINT32_C(1) << CHAR_BREAK_PROP_LF,    /* GB3  */
           58         [CHAR_BREAK_PROP_EXTEND] =
           59                 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
           60                 UINT32_C(1)
           61                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
           62                 UINT32_C(1)
           63                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
           64                 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
           65                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
           66                 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
           67         [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND] =
           68                 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
           69                 UINT32_C(1)
           70                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
           71                 UINT32_C(1)
           72                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
           73                 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
           74                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
           75                 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
           76         [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER] =
           77                 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
           78                 UINT32_C(1)
           79                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
           80                 UINT32_C(1)
           81                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
           82                 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
           83                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
           84                 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
           85         [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
           86                 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
           87                 UINT32_C(1)
           88                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
           89                 UINT32_C(1)
           90                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
           91                 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
           92                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
           93                 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
           94         [CHAR_BREAK_PROP_HANGUL_L] =
           95                 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_L |   /* GB6  */
           96                 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V |   /* GB6  */
           97                 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_LV |  /* GB6  */
           98                 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_LVT | /* GB6  */
           99                 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
          100                 UINT32_C(1)
          101                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
          102                 UINT32_C(1)
          103                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
          104                 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
          105                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
          106                 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
          107         [CHAR_BREAK_PROP_HANGUL_V] =
          108                 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7  */
          109                 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7  */
          110                 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |   /* GB9  */
          111                 UINT32_C(1)
          112                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
          113                 UINT32_C(1)
          114                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
          115                 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
          116                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
          117                 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
          118         [CHAR_BREAK_PROP_HANGUL_T] =
          119                 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8  */
          120                 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |   /* GB9  */
          121                 UINT32_C(1)
          122                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
          123                 UINT32_C(1)
          124                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
          125                 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
          126                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
          127                 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
          128         [CHAR_BREAK_PROP_HANGUL_LV] =
          129                 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7  */
          130                 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7  */
          131                 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |   /* GB9  */
          132                 UINT32_C(1)
          133                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
          134                 UINT32_C(1)
          135                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
          136                 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
          137                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
          138                 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
          139         [CHAR_BREAK_PROP_HANGUL_LVT] =
          140                 UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8  */
          141                 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |   /* GB9  */
          142                 UINT32_C(1)
          143                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
          144                 UINT32_C(1)
          145                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
          146                 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
          147                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
          148                 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
          149         [CHAR_BREAK_PROP_PREPEND] =
          150                 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
          151                 UINT32_C(1)
          152                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
          153                 UINT32_C(1)
          154                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
          155                 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
          156                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
          157                 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK |         /* GB9a */
          158                 (UINT32_C(0xFFFFFFFF) &
          159                  ~(UINT32_C(1) << CHAR_BREAK_PROP_CR |
          160                    UINT32_C(1) << CHAR_BREAK_PROP_LF |
          161                    UINT32_C(1) << CHAR_BREAK_PROP_CONTROL)), /* GB9b */
          162         [CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
          163                 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
          164                 UINT32_C(1)
          165                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
          166                 UINT32_C(1)
          167                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
          168                 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
          169                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
          170                 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
          171         [CHAR_BREAK_PROP_SPACINGMARK] =
          172                 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
          173                 UINT32_C(1)
          174                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
          175                 UINT32_C(1)
          176                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
          177                 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
          178                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
          179                 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
          180         [CHAR_BREAK_PROP_ZWJ] =
          181                 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
          182                 UINT32_C(1)
          183                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
          184                 UINT32_C(1)
          185                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
          186                 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
          187                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
          188                 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
          189         [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND] =
          190                 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
          191                 UINT32_C(1)
          192                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
          193                 UINT32_C(1)
          194                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
          195                 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
          196                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
          197                 UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
          198 
          199 };
          200 static const uint_least32_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
          201         [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
          202                 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
          203                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
          204                 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |              /* GB9  */
          205                 UINT32_C(1)
          206                         << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
          207                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER, /* GB9 */
          208         [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
          209                 UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
          210         [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] =
          211                 UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
          212         [CHAR_BREAK_PROP_EXTEND + NUM_CHAR_BREAK_PROPS] =
          213                 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |
          214                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND |
          215                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |
          216                 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
          217                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND,
          218         [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] =
          219                 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |
          220                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND |
          221                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |
          222                 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
          223                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND,
          224         [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER + NUM_CHAR_BREAK_PROPS] =
          225                 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |
          226                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND |
          227                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |
          228                 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
          229                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND,
          230         [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_CHAR_BREAK_PROPS] =
          231                 UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
          232                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND |
          233                 UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |
          234                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND |
          235                 UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER,
          236 };
          237 static const uint_least32_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
          238         [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
          239                 UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
          240         [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] =
          241                 UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
          242 };
          243 static const uint_least32_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
          244         [CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
          245                 UINT32_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
          246 };
          247 static const uint_least32_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
          248         [CHAR_BREAK_PROP_REGIONAL_INDICATOR + NUM_CHAR_BREAK_PROPS] =
          249                 UINT32_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
          250 };
          251 
          252 static inline enum char_break_property
          253 get_break_prop(uint_least32_t cp)
          254 {
          255         if (likely(cp <= UINT32_C(0x10FFFF))) {
          256                 return (enum char_break_property)
          257                         char_break_minor[char_break_major[cp >> 8] +
          258                                          (cp & 0xFF)];
          259         } else {
          260                 return CHAR_BREAK_PROP_OTHER;
          261         }
          262 }
          263 
          264 static inline void
          265 state_serialize(const struct character_break_state *in, uint_least16_t *out)
          266 {
          267         *out = (uint_least16_t)(in->prop & UINT8_C(0xFF)) | /* first 8 bits */
          268                (uint_least16_t)(((uint_least16_t)(in->prop_set))
          269                                 << 8) | /* 9th bit */
          270                (uint_least16_t)(((uint_least16_t)(in->gb11_flag))
          271                                 << 9) | /* 10th bit */
          272                (uint_least16_t)(((uint_least16_t)(in->gb12_13_flag))
          273                                 << 10) | /* 11th bit */
          274                (uint_least16_t)(((uint_least16_t)(in->gb9c_level & 0x3))
          275                                 << 11); /* 12th and 13th bit */
          276 }
          277 
          278 static inline void
          279 state_deserialize(uint_least16_t in, struct character_break_state *out)
          280 {
          281         out->prop = in & UINT8_C(0xFF);
          282         out->prop_set = in & (UINT16_C(1) << 8);
          283         out->gb11_flag = in & (UINT16_C(1) << 9);
          284         out->gb12_13_flag = in & (UINT16_C(1) << 10);
          285         out->gb9c_level = (uint_least8_t)(in >> 11) & UINT8_C(0x3);
          286 }
          287 
          288 bool
          289 grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1,
          290                             uint_least16_t *s)
          291 {
          292         struct character_break_state state;
          293         enum char_break_property cp0_prop, cp1_prop;
          294         bool notbreak = false;
          295 
          296         if (likely(s)) {
          297                 state_deserialize(*s, &state);
          298 
          299                 if (likely(state.prop_set)) {
          300                         cp0_prop = state.prop;
          301                 } else {
          302                         cp0_prop = get_break_prop(cp0);
          303                 }
          304                 cp1_prop = get_break_prop(cp1);
          305 
          306                 /* preserve prop of right codepoint for next iteration */
          307                 state.prop = (uint_least8_t)cp1_prop;
          308                 state.prop_set = true;
          309 
          310                 /* update flags */
          311                 state.gb11_flag =
          312                         flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS *
          313                                                             state.gb11_flag] &
          314                         UINT32_C(1) << cp1_prop;
          315                 state.gb12_13_flag =
          316                         flag_update_gb12_13[cp0_prop +
          317                                             NUM_CHAR_BREAK_PROPS *
          318                                                     state.gb12_13_flag] &
          319                         UINT32_C(1) << cp1_prop;
          320 
          321                 /*
          322                  * update GB9c state, which deals with indic conjunct breaks.
          323                  * We want to detect the following prefix:
          324                  *
          325                  *   ICB_CONSONANT
          326                  *   [ICB_EXTEND ICB_LINKER]*
          327                  *   ICB_LINKER
          328                  *   [ICB_EXTEND ICB_LINKER]*
          329                  *
          330                  * This representation is not ideal: In reality, what is
          331                  * meant is that the prefix is a sequence of [ICB_EXTEND
          332                  * ICB_LINKER]*, following an ICB_CONSONANT, that contains at
          333                  * least one ICB_LINKER. We thus use the following equivalent
          334                  * representation that allows us to store the levels 0..3 in 2
          335                  * bits.
          336                  *
          337                  *   ICB_CONSONANT              -- Level 1
          338                  *   ICB_EXTEND*                -- Level 2
          339                  *   ICB_LINKER                 -- Level 3
          340                  *   [ICB_EXTEND ICB_LINKER]*   -- Level 3
          341                  *
          342                  * The following chain of if-else-blocks is a bit redundant and
          343                  * of course could be optimised, but this is kept as is for
          344                  * best readability.
          345                  */
          346                 if (state.gb9c_level == 0 &&
          347                     cp0_prop == CHAR_BREAK_PROP_ICB_CONSONANT) {
          348                         /* the sequence has begun */
          349                         state.gb9c_level = 1;
          350                 } else if ((state.gb9c_level == 1 || state.gb9c_level == 2) &&
          351                            (cp0_prop == CHAR_BREAK_PROP_ICB_EXTEND ||
          352                             cp0_prop == CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND ||
          353                             cp0_prop ==
          354                                     CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND)) {
          355                         /*
          356                          * either the level is 1 and thus the ICB consonant is
          357                          * followed by an ICB extend, where we jump
          358                          * to level 2, or we are at level 2 and just witness
          359                          * more ICB extends, staying at level 2.
          360                          */
          361                         state.gb9c_level = 2;
          362                 } else if ((state.gb9c_level == 1 || state.gb9c_level == 2) &&
          363                            (cp0_prop == CHAR_BREAK_PROP_ICB_LINKER ||
          364                             cp0_prop ==
          365                                     CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER)) {
          366                         /*
          367                          * witnessing an ICB linker directly lifts us up to
          368                          * level 3
          369                          */
          370                         state.gb9c_level = 3;
          371                 } else if (state.gb9c_level == 3 &&
          372                            (cp0_prop == CHAR_BREAK_PROP_ICB_EXTEND ||
          373                             cp0_prop == CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND ||
          374                             cp0_prop ==
          375                                     CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND ||
          376                             cp0_prop == CHAR_BREAK_PROP_ICB_LINKER ||
          377                             cp0_prop ==
          378                                     CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER)) {
          379                         /*
          380                          * we stay at level 3 when we observe either ICB
          381                          * extends or linkers
          382                          */
          383                         state.gb9c_level = 3;
          384                 } else {
          385                         /*
          386                          * the sequence has collapsed, but it could be
          387                          * that the left property is ICB consonant, which
          388                          * means that we jump right back to level 1 instead
          389                          * of 0
          390                          */
          391                         if (cp0_prop == CHAR_BREAK_PROP_ICB_CONSONANT) {
          392                                 state.gb9c_level = 1;
          393                         } else {
          394                                 state.gb9c_level = 0;
          395                         }
          396                 }
          397 
          398                 /*
          399                  * Apply grapheme cluster breaking algorithm (UAX #29), see
          400                  * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
          401                  */
          402                 notbreak = (dont_break[cp0_prop] & (UINT32_C(1) << cp1_prop)) ||
          403                            (state.gb9c_level == 3 &&
          404                             cp1_prop == CHAR_BREAK_PROP_ICB_CONSONANT) ||
          405                            (dont_break_gb11[cp0_prop +
          406                                             state.gb11_flag *
          407                                                     NUM_CHAR_BREAK_PROPS] &
          408                             (UINT32_C(1) << cp1_prop)) ||
          409                            (dont_break_gb12_13[cp0_prop +
          410                                                state.gb12_13_flag *
          411                                                        NUM_CHAR_BREAK_PROPS] &
          412                             (UINT32_C(1) << cp1_prop));
          413 
          414                 /* update or reset flags (when we have a break) */
          415                 if (likely(!notbreak)) {
          416                         state.gb11_flag = state.gb12_13_flag = false;
          417                 }
          418 
          419                 state_serialize(&state, s);
          420         } else {
          421                 cp0_prop = get_break_prop(cp0);
          422                 cp1_prop = get_break_prop(cp1);
          423 
          424                 /*
          425                  * Apply grapheme cluster breaking algorithm (UAX #29), see
          426                  * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
          427                  *
          428                  * Given we have no state, this behaves as if the state-booleans
          429                  * were all set to false
          430                  */
          431                 notbreak = (dont_break[cp0_prop] & (UINT32_C(1) << cp1_prop)) ||
          432                            (dont_break_gb11[cp0_prop] &
          433                             (UINT32_C(1) << cp1_prop)) ||
          434                            (dont_break_gb12_13[cp0_prop] &
          435                             (UINT32_C(1) << cp1_prop));
          436         }
          437 
          438         return !notbreak;
          439 }
          440 
          441 static size_t
          442 next_character_break(HERODOTUS_READER *r)
          443 {
          444         uint_least16_t state = 0;
          445         uint_least32_t cp0 = 0, cp1 = 0;
          446 
          447         for (herodotus_read_codepoint(r, true, &cp0);
          448              herodotus_read_codepoint(r, false, &cp1) ==
          449              HERODOTUS_STATUS_SUCCESS;
          450              herodotus_read_codepoint(r, true, &cp0)) {
          451                 if (grapheme_is_character_break(cp0, cp1, &state)) {
          452                         break;
          453                 }
          454         }
          455 
          456         return herodotus_reader_number_read(r);
          457 }
          458 
          459 size_t
          460 grapheme_next_character_break(const uint_least32_t *str, size_t len)
          461 {
          462         HERODOTUS_READER r;
          463 
          464         herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
          465 
          466         return next_character_break(&r);
          467 }
          468 
          469 size_t
          470 grapheme_next_character_break_utf8(const char *str, size_t len)
          471 {
          472         HERODOTUS_READER r;
          473 
          474         herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
          475 
          476         return next_character_break(&r);
          477 }