sentence.c - libgrapheme - unicode string library
 (HTM) git clone git://git.suckless.org/libgrapheme
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       sentence.c (8420B)
       ---
            1 /* See LICENSE file for copyright and license details. */
            2 #include <stdbool.h>
            3 #include <stddef.h>
            4 
            5 #include "../gen/sentence.h"
            6 #include "../grapheme.h"
            7 #include "util.h"
            8 
/*
 * Progress trackers for the left-hand sides of the sentence break
 * rules SB8 to SB11. Each field is a small "level" counter that is
 * maintained by sentence_skip_shift_callback() and read by
 * next_sentence_break():
 *
 *  aterm_close_sp_level           tracks "ATerm Close* Sp*"
 *                                 (levels 0 to 3)
 *  saterm_close_sp_parasep_level  tracks "SATerm Close* Sp* ParaSep?"
 *                                 (levels 0 to 4)
 *
 * Level 0 means that we are currently not within the sequence.
 */
struct sentence_break_state {
        uint_least8_t aterm_close_sp_level;
        uint_least8_t saterm_close_sp_parasep_level;
};
           13 
           14 static inline uint_least8_t
           15 get_sentence_break_prop(uint_least32_t cp)
           16 {
           17         if (likely(cp <= UINT32_C(0x10FFFF))) {
           18                 return (uint_least8_t)
           19                         sentence_break_minor[sentence_break_major[cp >> 8] +
           20                                              (cp & 0xff)];
           21         } else {
           22                 return SENTENCE_BREAK_PROP_OTHER;
           23         }
           24 }
           25 
           26 static bool
           27 is_skippable_sentence_prop(uint_least8_t prop)
           28 {
           29         return prop == SENTENCE_BREAK_PROP_EXTEND ||
           30                prop == SENTENCE_BREAK_PROP_FORMAT;
           31 }
           32 
           33 static void
           34 sentence_skip_shift_callback(uint_least8_t prop, void *s)
           35 {
           36         struct sentence_break_state *state = (struct sentence_break_state *)s;
           37 
           38         /*
           39          * Here comes a bit of magic. The rules
           40          * SB8, SB8a, SB9 and SB10 have very complicated
           41          * left-hand-side-rules of the form
           42          *
           43          *  ATerm Close* Sp*
           44          *  SATerm Close*
           45          *  SATerm Close* Sp*
           46          *  SATerm Close* Sp* ParaSep?
           47          *
           48          * but instead of backtracking, we keep the
           49          * state as some kind of "power level" in
           50          * two state-variables
           51          *
           52          *  aterm_close_sp_level
           53          *  saterm_close_sp_parasep_level
           54          *
           55          * that go from 0 to 3/4:
           56          *
           57          *  0: we are not in the sequence
           58          *  1: we have one ATerm/SATerm to the left of
           59          *     the middle spot
           60          *  2: we have one ATerm/SATerm and one or more
           61          *     Close to the left of the middle spot
           62          *  3: we have one ATerm/SATerm, zero or more
           63          *     Close and one or more Sp to the left of
           64          *     the middle spot.
           65          *  4: we have one SATerm, zero or more Close,
           66          *     zero or more Sp and one ParaSep to the
           67          *     left of the middle spot.
           68          *
           69          */
           70         if ((state->aterm_close_sp_level == 0 ||
           71              state->aterm_close_sp_level == 1) &&
           72             prop == SENTENCE_BREAK_PROP_ATERM) {
           73                 /* sequence has begun */
           74                 state->aterm_close_sp_level = 1;
           75         } else if ((state->aterm_close_sp_level == 1 ||
           76                     state->aterm_close_sp_level == 2) &&
           77                    prop == SENTENCE_BREAK_PROP_CLOSE) {
           78                 /* close-sequence begins or continued */
           79                 state->aterm_close_sp_level = 2;
           80         } else if ((state->aterm_close_sp_level == 1 ||
           81                     state->aterm_close_sp_level == 2 ||
           82                     state->aterm_close_sp_level == 3) &&
           83                    prop == SENTENCE_BREAK_PROP_SP) {
           84                 /* sp-sequence begins or continued */
           85                 state->aterm_close_sp_level = 3;
           86         } else {
           87                 /* sequence broke */
           88                 state->aterm_close_sp_level = 0;
           89         }
           90 
           91         if ((state->saterm_close_sp_parasep_level == 0 ||
           92              state->saterm_close_sp_parasep_level == 1) &&
           93             (prop == SENTENCE_BREAK_PROP_STERM ||
           94              prop == SENTENCE_BREAK_PROP_ATERM)) {
           95                 /* sequence has begun */
           96                 state->saterm_close_sp_parasep_level = 1;
           97         } else if ((state->saterm_close_sp_parasep_level == 1 ||
           98                     state->saterm_close_sp_parasep_level == 2) &&
           99                    prop == SENTENCE_BREAK_PROP_CLOSE) {
          100                 /* close-sequence begins or continued */
          101                 state->saterm_close_sp_parasep_level = 2;
          102         } else if ((state->saterm_close_sp_parasep_level == 1 ||
          103                     state->saterm_close_sp_parasep_level == 2 ||
          104                     state->saterm_close_sp_parasep_level == 3) &&
          105                    prop == SENTENCE_BREAK_PROP_SP) {
          106                 /* sp-sequence begins or continued */
          107                 state->saterm_close_sp_parasep_level = 3;
          108         } else if ((state->saterm_close_sp_parasep_level == 1 ||
          109                     state->saterm_close_sp_parasep_level == 2 ||
          110                     state->saterm_close_sp_parasep_level == 3) &&
          111                    (prop == SENTENCE_BREAK_PROP_SEP ||
          112                     prop == SENTENCE_BREAK_PROP_CR ||
          113                     prop == SENTENCE_BREAK_PROP_LF)) {
          114                 /* ParaSep at the end of the sequence */
          115                 state->saterm_close_sp_parasep_level = 4;
          116         } else {
          117                 /* sequence broke */
          118                 state->saterm_close_sp_parasep_level = 0;
          119         }
          120 }
          121 
          122 static size_t
          123 next_sentence_break(HERODOTUS_READER *r)
          124 {
          125         HERODOTUS_READER tmp;
          126         enum sentence_break_property prop;
          127         struct proper p;
          128         struct sentence_break_state state = { 0 };
          129         uint_least32_t cp;
          130 
          131         /*
          132          * Apply sentence breaking algorithm (UAX #29), see
          133          * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules
          134          */
          135         proper_init(r, &state, NUM_SENTENCE_BREAK_PROPS,
          136                     get_sentence_break_prop, is_skippable_sentence_prop,
          137                     sentence_skip_shift_callback, &p);
          138 
          139         while (!proper_advance(&p)) {
          140                 /* SB3 */
          141                 if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR &&
          142                     p.raw.next_prop[0] == SENTENCE_BREAK_PROP_LF) {
          143                         continue;
          144                 }
          145 
          146                 /* SB4 */
          147                 if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_SEP ||
          148                     p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR ||
          149                     p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_LF) {
          150                         break;
          151                 }
          152 
          153                 /* SB5 */
          154                 if (p.raw.next_prop[0] == SENTENCE_BREAK_PROP_EXTEND ||
          155                     p.raw.next_prop[0] == SENTENCE_BREAK_PROP_FORMAT) {
          156                         continue;
          157                 }
          158 
          159                 /* SB6 */
          160                 if (p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
          161                     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_NUMERIC) {
          162                         continue;
          163                 }
          164 
          165                 /* SB7 */
          166                 if ((p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_UPPER ||
          167                      p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_LOWER) &&
          168                     p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
          169                     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_UPPER) {
          170                         continue;
          171                 }
          172 
          173                 /* SB8 */
          174                 if (state.aterm_close_sp_level == 1 ||
          175                     state.aterm_close_sp_level == 2 ||
          176                     state.aterm_close_sp_level == 3) {
          177                         /*
          178                          * This is the most complicated rule, requiring
          179                          * the right-hand-side to satisfy the regular expression
          180                          *
          181                          *  ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )*
          182                          * Lower
          183                          *
          184                          * which we simply check "manually" given LUT-lookups
          185                          * are very cheap by starting at the mid_reader.
          186                          *
          187                          */
          188                         herodotus_reader_copy(&(p.mid_reader), &tmp);
          189 
          190                         prop = NUM_SENTENCE_BREAK_PROPS;
          191                         while (herodotus_read_codepoint(&tmp, true, &cp) ==
          192                                HERODOTUS_STATUS_SUCCESS) {
          193                                 prop = get_sentence_break_prop(cp);
          194 
          195                                 /*
          196                                  * the skippable properties are ignored
          197                                  * automatically here given they do not
          198                                  * match the following condition
          199                                  */
          200                                 if (prop == SENTENCE_BREAK_PROP_OLETTER ||
          201                                     prop == SENTENCE_BREAK_PROP_UPPER ||
          202                                     prop == SENTENCE_BREAK_PROP_LOWER ||
          203                                     prop == SENTENCE_BREAK_PROP_SEP ||
          204                                     prop == SENTENCE_BREAK_PROP_CR ||
          205                                     prop == SENTENCE_BREAK_PROP_LF ||
          206                                     prop == SENTENCE_BREAK_PROP_STERM ||
          207                                     prop == SENTENCE_BREAK_PROP_ATERM) {
          208                                         break;
          209                                 }
          210                         }
          211 
          212                         if (prop == SENTENCE_BREAK_PROP_LOWER) {
          213                                 continue;
          214                         }
          215                 }
          216 
          217                 /* SB8a */
          218                 if ((state.saterm_close_sp_parasep_level == 1 ||
          219                      state.saterm_close_sp_parasep_level == 2 ||
          220                      state.saterm_close_sp_parasep_level == 3) &&
          221                     (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SCONTINUE ||
          222                      p.skip.next_prop[0] == SENTENCE_BREAK_PROP_STERM ||
          223                      p.skip.next_prop[0] == SENTENCE_BREAK_PROP_ATERM)) {
          224                         continue;
          225                 }
          226 
          227                 /* SB9 */
          228                 if ((state.saterm_close_sp_parasep_level == 1 ||
          229                      state.saterm_close_sp_parasep_level == 2) &&
          230                     (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CLOSE ||
          231                      p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP ||
          232                      p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
          233                      p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR ||
          234                      p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
          235                         continue;
          236                 }
          237 
          238                 /* SB10 */
          239                 if ((state.saterm_close_sp_parasep_level == 1 ||
          240                      state.saterm_close_sp_parasep_level == 2 ||
          241                      state.saterm_close_sp_parasep_level == 3) &&
          242                     (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP ||
          243                      p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
          244                      p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR ||
          245                      p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
          246                         continue;
          247                 }
          248 
          249                 /* SB11 */
          250                 if (state.saterm_close_sp_parasep_level == 1 ||
          251                     state.saterm_close_sp_parasep_level == 2 ||
          252                     state.saterm_close_sp_parasep_level == 3 ||
          253                     state.saterm_close_sp_parasep_level == 4) {
          254                         break;
          255                 }
          256 
          257                 /* SB998 */
          258                 continue;
          259         }
          260 
          261         return herodotus_reader_number_read(&(p.mid_reader));
          262 }
          263 
          264 size_t
          265 grapheme_next_sentence_break(const uint_least32_t *str, size_t len)
          266 {
          267         HERODOTUS_READER r;
          268 
          269         herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
          270 
          271         return next_sentence_break(&r);
          272 }
          273 
          274 size_t
          275 grapheme_next_sentence_break_utf8(const char *str, size_t len)
          276 {
          277         HERODOTUS_READER r;
          278 
          279         herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
          280 
          281         return next_sentence_break(&r);
          282 }