word.c - libgrapheme - unicode string library
 (HTM) git clone git://git.suckless.org/libgrapheme
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       word.c (8052B)
       ---
            1 /* See LICENSE file for copyright and license details. */
            2 #include <stdbool.h>
            3 #include <stddef.h>
            4 
            5 #include "../gen/word.h"
            6 #include "../grapheme.h"
            7 #include "util.h"
            8 
            /*
             * State of one word-break scan. A pointer to it is handed to
             * proper_init() and arrives in word_skip_shift_callback() as a
             * void pointer.
             */
            struct word_break_state {
                    /*
                     * true while the number of consecutive regional
                     * indicators directly left of the candidate breakpoint
                     * is even (zero included); consulted for WB15/WB16
                     */
                    bool ri_even;
            };
           12 
           13 static inline uint_least8_t
           14 get_word_break_prop(uint_least32_t cp)
           15 {
           16         if (likely(cp <= UINT32_C(0x10FFFF))) {
           17                 return (uint_least8_t)
           18                         word_break_minor[word_break_major[cp >> 8] +
           19                                          (cp & 0xff)];
           20         } else {
           21                 return WORD_BREAK_PROP_OTHER;
           22         }
           23 }
           24 
           25 static bool
           26 is_skippable_word_prop(uint_least8_t prop)
           27 {
           28         return prop == WORD_BREAK_PROP_EXTEND ||
           29                prop == WORD_BREAK_PROP_FORMAT || prop == WORD_BREAK_PROP_ZWJ;
           30 }
           31 
           32 static void
           33 word_skip_shift_callback(uint_least8_t prop, void *s)
           34 {
           35         struct word_break_state *state = (struct word_break_state *)s;
           36 
           37         if (prop == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
           38                 /*
           39                  * The property we just shifted in is
           40                  * a regional indicator, increasing the
           41                  * number of consecutive RIs on the left
           42                  * side of the breakpoint by one, changing
           43                  * the oddness.
           44                  *
           45                  */
           46                 state->ri_even = !(state->ri_even);
           47         } else {
           48                 /*
           49                  * We saw no regional indicator, so the
           50                  * number of consecutive RIs on the left
           51                  * side of the breakpoint is zero, which
           52                  * is an even number.
           53                  *
           54                  */
           55                 state->ri_even = true;
           56         }
           57 }
           58 
           59 static size_t
           60 next_word_break(HERODOTUS_READER *r)
           61 {
           62         struct proper p;
           63         struct word_break_state state = { .ri_even = true };
           64 
           65         /*
           66          * Apply word breaking algorithm (UAX #29), see
           67          * https://unicode.org/reports/tr29/#Word_Boundary_Rules
           68          */
           69         proper_init(r, &state, NUM_WORD_BREAK_PROPS, get_word_break_prop,
           70                     is_skippable_word_prop, word_skip_shift_callback, &p);
           71 
           72         while (!proper_advance(&p)) {
           73                 /* WB3 */
           74                 if (p.raw.prev_prop[0] == WORD_BREAK_PROP_CR &&
           75                     p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
           76                         continue;
           77                 }
           78 
           79                 /* WB3a */
           80                 if (p.raw.prev_prop[0] == WORD_BREAK_PROP_NEWLINE ||
           81                     p.raw.prev_prop[0] == WORD_BREAK_PROP_CR ||
           82                     p.raw.prev_prop[0] == WORD_BREAK_PROP_LF) {
           83                         break;
           84                 }
           85 
           86                 /* WB3b */
           87                 if (p.raw.next_prop[0] == WORD_BREAK_PROP_NEWLINE ||
           88                     p.raw.next_prop[0] == WORD_BREAK_PROP_CR ||
           89                     p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
           90                         break;
           91                 }
           92 
           93                 /* WB3c */
           94                 if (p.raw.prev_prop[0] == WORD_BREAK_PROP_ZWJ &&
           95                     (p.raw.next_prop[0] ==
           96                              WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC ||
           97                      p.raw.next_prop[0] ==
           98                              WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) {
           99                         continue;
          100                 }
          101 
          102                 /* WB3d */
          103                 if (p.raw.prev_prop[0] == WORD_BREAK_PROP_WSEGSPACE &&
          104                     p.raw.next_prop[0] == WORD_BREAK_PROP_WSEGSPACE) {
          105                         continue;
          106                 }
          107 
          108                 /* WB4 */
          109                 if (p.raw.next_prop[0] == WORD_BREAK_PROP_EXTEND ||
          110                     p.raw.next_prop[0] == WORD_BREAK_PROP_FORMAT ||
          111                     p.raw.next_prop[0] == WORD_BREAK_PROP_ZWJ) {
          112                         continue;
          113                 }
          114 
          115                 /* WB5 */
          116                 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
          117                      p.skip.prev_prop[0] ==
          118                              WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
          119                      p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
          120                     (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
          121                      p.skip.next_prop[0] ==
          122                              WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
          123                      p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
          124                         continue;
          125                 }
          126 
          127                 /* WB6 */
          128                 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
          129                      p.skip.prev_prop[0] ==
          130                              WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
          131                      p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
          132                     (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDLETTER ||
          133                      p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
          134                      p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
          135                     (p.skip.next_prop[1] == WORD_BREAK_PROP_ALETTER ||
          136                      p.skip.next_prop[1] ==
          137                              WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
          138                      p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
          139                         continue;
          140                 }
          141 
          142                 /* WB7 */
          143                 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDLETTER ||
          144                      p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
          145                      p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
          146                     (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
          147                      p.skip.next_prop[0] ==
          148                              WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
          149                      p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
          150                     (p.skip.prev_prop[1] == WORD_BREAK_PROP_ALETTER ||
          151                      p.skip.prev_prop[1] ==
          152                              WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
          153                      p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
          154                         continue;
          155                 }
          156 
          157                 /* WB7a */
          158                 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
          159                     p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) {
          160                         continue;
          161                 }
          162 
          163                 /* WB7b */
          164                 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
          165                     p.skip.next_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
          166                     p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
          167                         continue;
          168                 }
          169 
          170                 /* WB7c */
          171                 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
          172                     p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
          173                     p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
          174                         continue;
          175                 }
          176 
          177                 /* WB8 */
          178                 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
          179                     p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
          180                         continue;
          181                 }
          182 
          183                 /* WB9 */
          184                 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
          185                      p.skip.prev_prop[0] ==
          186                              WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
          187                      p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
          188                     p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
          189                         continue;
          190                 }
          191 
          192                 /* WB10 */
          193                 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
          194                     (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
          195                      p.skip.next_prop[0] ==
          196                              WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
          197                      p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
          198                         continue;
          199                 }
          200 
          201                 /* WB11 */
          202                 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUM ||
          203                      p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
          204                      p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
          205                     p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC &&
          206                     p.skip.prev_prop[1] == WORD_BREAK_PROP_NUMERIC) {
          207                         continue;
          208                 }
          209 
          210                 /* WB12 */
          211                 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
          212                     (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUM ||
          213                      p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
          214                      p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
          215                     p.skip.next_prop[1] == WORD_BREAK_PROP_NUMERIC) {
          216                         continue;
          217                 }
          218 
          219                 /* WB13 */
          220                 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA &&
          221                     p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA) {
          222                         continue;
          223                 }
          224 
          225                 /* WB13a */
          226                 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
          227                      p.skip.prev_prop[0] ==
          228                              WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
          229                      p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER ||
          230                      p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC ||
          231                      p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA ||
          232                      p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) &&
          233                     p.skip.next_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) {
          234                         continue;
          235                 }
          236 
          237                 /* WB13b */
          238                 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET &&
          239                     (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
          240                      p.skip.next_prop[0] ==
          241                              WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
          242                      p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER ||
          243                      p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC ||
          244                      p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA)) {
          245                         continue;
          246                 }
          247 
          248                 /* WB15 and WB16 */
          249                 if (!state.ri_even &&
          250                     p.skip.next_prop[0] == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
          251                         continue;
          252                 }
          253 
          254                 /* WB999 */
          255                 break;
          256         }
          257 
          258         return herodotus_reader_number_read(&(p.mid_reader));
          259 }
          260 
          261 size_t
          262 grapheme_next_word_break(const uint_least32_t *str, size_t len)
          263 {
          264         HERODOTUS_READER r;
          265 
          266         herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
          267 
          268         return next_word_break(&r);
          269 }
          270 
          271 size_t
          272 grapheme_next_word_break_utf8(const char *str, size_t len)
          273 {
          274         HERODOTUS_READER r;
          275 
          276         herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
          277 
          278         return next_word_break(&r);
          279 }