line.c - libgrapheme - unicode string library
 (HTM) git clone git://git.suckless.org/libgrapheme
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       line.c (14397B)
       ---
            1 /* See LICENSE file for copyright and license details. */
            2 #include <stdbool.h>
            3 #include <stddef.h>
            4 
            5 #include "../gen/line.h"
            6 #include "../grapheme.h"
            7 #include "util.h"
            8 
            9 static inline enum line_break_property
           10 get_break_prop(uint_least32_t cp)
           11 {
           12         if (likely(cp <= UINT32_C(0x10FFFF))) {
           13                 return (enum line_break_property)
           14                         line_break_minor[line_break_major[cp >> 8] +
           15                                          (cp & 0xff)];
           16         } else {
           17                 return LINE_BREAK_PROP_AL;
           18         }
           19 }
           20 
           21 static size_t
           22 next_line_break(HERODOTUS_READER *r)
           23 {
           24         HERODOTUS_READER tmp;
           25         enum line_break_property cp0_prop, cp1_prop, last_non_cm_or_zwj_prop,
           26                 last_non_sp_prop, last_non_sp_cm_or_zwj_prop;
           27         uint_least32_t cp;
           28         uint_least8_t lb25_level = 0;
           29         bool lb21a_flag = false, ri_even = true;
           30 
           31         /*
           32          * Apply line breaking algorithm (UAX #14), see
           33          * https://unicode.org/reports/tr14/#Algorithm and tailoring
           34          * https://unicode.org/reports/tr14/#Examples (example 7),
           35          * given the automatic test-cases implement this example for
           36          * better number handling.
           37          *
           38          */
           39 
           40         /*
           41          * Initialize the different properties such that we have
           42          * a good state after the state-update in the loop
           43          */
           44         last_non_cm_or_zwj_prop = LINE_BREAK_PROP_AL; /* according to LB10 */
           45         last_non_sp_prop = last_non_sp_cm_or_zwj_prop = NUM_LINE_BREAK_PROPS;
           46 
           47         for (herodotus_read_codepoint(r, true, &cp),
           48              cp0_prop = get_break_prop(cp);
           49              herodotus_read_codepoint(r, false, &cp) ==
           50              HERODOTUS_STATUS_SUCCESS;
           51              herodotus_read_codepoint(r, true, &cp), cp0_prop = cp1_prop) {
           52                 /* get property of the right codepoint */
           53                 cp1_prop = get_break_prop(cp);
           54 
           55                 /* update retention-states */
           56 
           57                 /*
           58                  * store the last observed non-CM-or-ZWJ-property for
           59                  * LB9 and following.
           60                  */
           61                 if (cp0_prop != LINE_BREAK_PROP_CM &&
           62                     cp0_prop != LINE_BREAK_PROP_ZWJ) {
           63                         /*
           64                          * check if the property we are overwriting now is an
           65                          * HL. If so, we set the LB21a-flag which depends on
           66                          * this knowledge.
           67                          */
           68                         lb21a_flag =
           69                                 (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL);
           70 
           71                         /* check regional indicator state */
           72                         if (cp0_prop == LINE_BREAK_PROP_RI) {
           73                                 /*
           74                                  * The property we just shifted in is
           75                                  * a regional indicator, increasing the
           76                                  * number of consecutive RIs on the left
           77                                  * side of the breakpoint by one, changing
           78                                  * the oddness.
           79                                  *
           80                                  */
           81                                 ri_even = !ri_even;
           82                         } else {
           83                                 /*
           84                                  * We saw no regional indicator, so the
           85                                  * number of consecutive RIs on the left
           86                                  * side of the breakpoint is zero, which
           87                                  * is an even number.
           88                                  *
           89                                  */
           90                                 ri_even = true;
           91                         }
           92 
           93                         /*
           94                          * Here comes a bit of magic. The tailored rule
           95                          * LB25 (using example 7) has a very complicated
           96                          * left-hand-side-rule of the form
           97                          *
           98                          *  NU (NU | SY | IS)* (CL | CP)?
           99                          *
          100                          * but instead of backtracking, we keep the state
          101                          * as some kind of "power level" in the variable
          102                          *
          103                          *  lb25_level
          104                          *
          105                          * that goes from 0 to 3
          106                          *
          107                          *  0: we are not in the sequence
          108                          *  1: we have one NU to the left of the middle
          109                          *     spot
          110                          *  2: we have one NU and one or more (NU | SY | IS)
          111                          *     to the left of the middle spot
          112                          *  3: we have one NU, zero or more (NU | SY | IS)
          113                          *     and one (CL | CP) to the left of the middle
          114                          *     spot
          115                          */
          116                         if ((lb25_level == 0 || lb25_level == 1) &&
          117                             cp0_prop == LINE_BREAK_PROP_NU) {
          118                                 /* sequence has begun */
          119                                 lb25_level = 1;
          120                         } else if ((lb25_level == 1 || lb25_level == 2) &&
          121                                    (cp0_prop == LINE_BREAK_PROP_NU ||
          122                                     cp0_prop == LINE_BREAK_PROP_SY ||
          123                                     cp0_prop == LINE_BREAK_PROP_IS)) {
          124                                 /* (NU | SY | IS) sequence begins or continued
          125                                  */
          126                                 lb25_level = 2;
          127                         } else if (
          128                                 (lb25_level == 1 || lb25_level == 2) &&
          129                                 (cp0_prop == LINE_BREAK_PROP_CL ||
          130                                  cp0_prop ==
          131                                          LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
          132                                  cp0_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) {
          133                                 /* CL or CP at the end of the sequence */
          134                                 lb25_level = 3;
          135                         } else {
          136                                 /* sequence broke */
          137                                 lb25_level = 0;
          138                         }
          139 
          140                         last_non_cm_or_zwj_prop = cp0_prop;
          141                 }
          142 
          143                 /*
          144                  * store the last observed non-SP-property for LB8, LB14,
          145                  * LB15, LB16 and LB17. LB8 gets its own unskipped property,
          146                  * whereas the others build on top of the CM-ZWJ-skipped
          147                  * properties as they come after LB9
          148                  */
          149                 if (cp0_prop != LINE_BREAK_PROP_SP) {
          150                         last_non_sp_prop = cp0_prop;
          151                 }
          152                 if (last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP) {
          153                         last_non_sp_cm_or_zwj_prop = last_non_cm_or_zwj_prop;
          154                 }
          155 
          156                 /* apply the algorithm */
          157 
          158                 /* LB4 */
          159                 if (cp0_prop == LINE_BREAK_PROP_BK) {
          160                         break;
          161                 }
          162 
          163                 /* LB5 */
          164                 if (cp0_prop == LINE_BREAK_PROP_CR &&
          165                     cp1_prop == LINE_BREAK_PROP_LF) {
          166                         continue;
          167                 }
          168                 if (cp0_prop == LINE_BREAK_PROP_CR ||
          169                     cp0_prop == LINE_BREAK_PROP_LF ||
          170                     cp0_prop == LINE_BREAK_PROP_NL) {
          171                         break;
          172                 }
          173 
          174                 /* LB6 */
          175                 if (cp1_prop == LINE_BREAK_PROP_BK ||
          176                     cp1_prop == LINE_BREAK_PROP_CR ||
          177                     cp1_prop == LINE_BREAK_PROP_LF ||
          178                     cp1_prop == LINE_BREAK_PROP_NL) {
          179                         continue;
          180                 }
          181 
          182                 /* LB7 */
          183                 if (cp1_prop == LINE_BREAK_PROP_SP ||
          184                     cp1_prop == LINE_BREAK_PROP_ZW) {
          185                         continue;
          186                 }
          187 
          188                 /* LB8 */
          189                 if (last_non_sp_prop == LINE_BREAK_PROP_ZW) {
          190                         break;
          191                 }
          192 
          193                 /* LB8a */
          194                 if (cp0_prop == LINE_BREAK_PROP_ZWJ) {
          195                         continue;
          196                 }
          197 
          198                 /* LB9 */
          199                 if ((cp0_prop != LINE_BREAK_PROP_BK &&
          200                      cp0_prop != LINE_BREAK_PROP_CR &&
          201                      cp0_prop != LINE_BREAK_PROP_LF &&
          202                      cp0_prop != LINE_BREAK_PROP_NL &&
          203                      cp0_prop != LINE_BREAK_PROP_SP &&
          204                      cp0_prop != LINE_BREAK_PROP_ZW) &&
          205                     (cp1_prop == LINE_BREAK_PROP_CM ||
          206                      cp1_prop == LINE_BREAK_PROP_ZWJ)) {
          207                         /*
          208                          * given we skip them, we don't break in such
          209                          * a sequence
          210                          */
          211                         continue;
          212                 }
          213 
          214                 /* LB10 is baked into the following rules */
          215 
          216                 /* LB11 */
          217                 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_WJ ||
          218                     cp1_prop == LINE_BREAK_PROP_WJ) {
          219                         continue;
          220                 }
          221 
          222                 /* LB12 */
          223                 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_GL) {
          224                         continue;
          225                 }
          226 
          227                 /* LB12a */
          228                 if ((last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP &&
          229                      last_non_cm_or_zwj_prop != LINE_BREAK_PROP_BA &&
          230                      last_non_cm_or_zwj_prop != LINE_BREAK_PROP_HY) &&
          231                     cp1_prop == LINE_BREAK_PROP_GL) {
          232                         continue;
          233                 }
          234 
          235                 /* LB13 (affected by tailoring for LB25, see example 7) */
          236                 if (cp1_prop == LINE_BREAK_PROP_EX ||
          237                     (last_non_cm_or_zwj_prop != LINE_BREAK_PROP_NU &&
          238                      (cp1_prop == LINE_BREAK_PROP_CL ||
          239                       cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
          240                       cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF ||
          241                       cp1_prop == LINE_BREAK_PROP_IS ||
          242                       cp1_prop == LINE_BREAK_PROP_SY))) {
          243                         continue;
          244                 }
          245 
          246                 /* LB14 */
          247                 if (last_non_sp_cm_or_zwj_prop ==
          248                             LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
          249                     last_non_sp_cm_or_zwj_prop ==
          250                             LINE_BREAK_PROP_OP_WITH_EAW_HWF) {
          251                         continue;
          252                 }
          253 
          254                 /* LB15 */
          255                 if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_QU &&
          256                     (cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
          257                      cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF)) {
          258                         continue;
          259                 }
          260 
          261                 /* LB16 */
          262                 if ((last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_CL ||
          263                      last_non_sp_cm_or_zwj_prop ==
          264                              LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
          265                      last_non_sp_cm_or_zwj_prop ==
          266                              LINE_BREAK_PROP_CP_WITH_EAW_HWF) &&
          267                     cp1_prop == LINE_BREAK_PROP_NS) {
          268                         continue;
          269                 }
          270 
          271                 /* LB17 */
          272                 if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_B2 &&
          273                     cp1_prop == LINE_BREAK_PROP_B2) {
          274                         continue;
          275                 }
          276 
          277                 /* LB18 */
          278                 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SP) {
          279                         break;
          280                 }
          281 
          282                 /* LB19 */
          283                 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_QU ||
          284                     cp1_prop == LINE_BREAK_PROP_QU) {
          285                         continue;
          286                 }
          287 
          288                 /* LB20 */
          289                 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_CB ||
          290                     cp1_prop == LINE_BREAK_PROP_CB) {
          291                         break;
          292                 }
          293 
          294                 /* LB21 */
          295                 if (cp1_prop == LINE_BREAK_PROP_BA ||
          296                     cp1_prop == LINE_BREAK_PROP_HY ||
          297                     cp1_prop == LINE_BREAK_PROP_NS ||
          298                     last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BB) {
          299                         continue;
          300                 }
          301 
          302                 /* LB21a */
          303                 if (lb21a_flag &&
          304                     (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY ||
          305                      last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BA)) {
          306                         continue;
          307                 }
          308 
          309                 /* LB21b */
          310                 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SY &&
          311                     cp1_prop == LINE_BREAK_PROP_HL) {
          312                         continue;
          313                 }
          314 
          315                 /* LB22 */
          316                 if (cp1_prop == LINE_BREAK_PROP_IN) {
          317                         continue;
          318                 }
          319 
          320                 /* LB23 */
          321                 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
          322                      last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
          323                     cp1_prop == LINE_BREAK_PROP_NU) {
          324                         continue;
          325                 }
          326                 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU &&
          327                     (cp1_prop == LINE_BREAK_PROP_AL ||
          328                      cp1_prop == LINE_BREAK_PROP_HL)) {
          329                         continue;
          330                 }
          331 
          332                 /* LB23a */
          333                 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR &&
          334                     (cp1_prop == LINE_BREAK_PROP_ID ||
          335                      cp1_prop == LINE_BREAK_PROP_EB ||
          336                      cp1_prop == LINE_BREAK_PROP_EM)) {
          337                         continue;
          338                 }
          339                 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_ID ||
          340                      last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB ||
          341                      last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EM) &&
          342                     cp1_prop == LINE_BREAK_PROP_PO) {
          343                         continue;
          344                 }
          345 
          346                 /* LB24 */
          347                 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR ||
          348                      last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO) &&
          349                     (cp1_prop == LINE_BREAK_PROP_AL ||
          350                      cp1_prop == LINE_BREAK_PROP_HL)) {
          351                         continue;
          352                 }
          353                 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
          354                      last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
          355                     (cp1_prop == LINE_BREAK_PROP_PR ||
          356                      cp1_prop == LINE_BREAK_PROP_PO)) {
          357                         continue;
          358                 }
          359 
          360                 /* LB25 (tailored with example 7) */
          361                 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR ||
          362                      last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO)) {
          363                         if (cp1_prop == LINE_BREAK_PROP_NU) {
          364                                 continue;
          365                         }
          366 
          367                         /* this stupid rule is the reason why we cannot
          368                          * simply have a stateful break-detection between
          369                          * two adjacent codepoints as we have it with
          370                          * characters.
          371                          */
          372                         herodotus_reader_copy(r, &tmp);
          373                         herodotus_read_codepoint(&tmp, true, &cp);
          374                         if (herodotus_read_codepoint(&tmp, true, &cp) ==
          375                                     HERODOTUS_STATUS_SUCCESS &&
          376                             (cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
          377                              cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF ||
          378                              cp1_prop == LINE_BREAK_PROP_HY)) {
          379                                 if (get_break_prop(cp) == LINE_BREAK_PROP_NU) {
          380                                         continue;
          381                                 }
          382                         }
          383                 }
          384                 if ((last_non_cm_or_zwj_prop ==
          385                              LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
          386                      last_non_cm_or_zwj_prop ==
          387                              LINE_BREAK_PROP_OP_WITH_EAW_HWF ||
          388                      last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY) &&
          389                     cp1_prop == LINE_BREAK_PROP_NU) {
          390                         continue;
          391                 }
          392                 if (lb25_level == 1 && (cp1_prop == LINE_BREAK_PROP_NU ||
          393                                         cp1_prop == LINE_BREAK_PROP_SY ||
          394                                         cp1_prop == LINE_BREAK_PROP_IS)) {
          395                         continue;
          396                 }
          397                 if ((lb25_level == 1 || lb25_level == 2) &&
          398                     (cp1_prop == LINE_BREAK_PROP_NU ||
          399                      cp1_prop == LINE_BREAK_PROP_SY ||
          400                      cp1_prop == LINE_BREAK_PROP_IS ||
          401                      cp1_prop == LINE_BREAK_PROP_CL ||
          402                      cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
          403                      cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) {
          404                         continue;
          405                 }
          406                 if ((lb25_level == 1 || lb25_level == 2 || lb25_level == 3) &&
          407                     (cp1_prop == LINE_BREAK_PROP_PO ||
          408                      cp1_prop == LINE_BREAK_PROP_PR)) {
          409                         continue;
          410                 }
          411 
          412                 /* LB26 */
          413                 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL &&
          414                     (cp1_prop == LINE_BREAK_PROP_JL ||
          415                      cp1_prop == LINE_BREAK_PROP_JV ||
          416                      cp1_prop == LINE_BREAK_PROP_H2 ||
          417                      cp1_prop == LINE_BREAK_PROP_H3)) {
          418                         continue;
          419                 }
          420                 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV ||
          421                      last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2) &&
          422                     (cp1_prop == LINE_BREAK_PROP_JV ||
          423                      cp1_prop == LINE_BREAK_PROP_JT)) {
          424                         continue;
          425                 }
          426                 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT ||
          427                      last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) &&
          428                     cp1_prop == LINE_BREAK_PROP_JT) {
          429                         continue;
          430                 }
          431 
          432                 /* LB27 */
          433                 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL ||
          434                      last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV ||
          435                      last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT ||
          436                      last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2 ||
          437                      last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) &&
          438                     cp1_prop == LINE_BREAK_PROP_PO) {
          439                         continue;
          440                 }
          441                 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR &&
          442                     (cp1_prop == LINE_BREAK_PROP_JL ||
          443                      cp1_prop == LINE_BREAK_PROP_JV ||
          444                      cp1_prop == LINE_BREAK_PROP_JT ||
          445                      cp1_prop == LINE_BREAK_PROP_H2 ||
          446                      cp1_prop == LINE_BREAK_PROP_H3)) {
          447                         continue;
          448                 }
          449 
          450                 /* LB28 */
          451                 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
          452                      last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
          453                     (cp1_prop == LINE_BREAK_PROP_AL ||
          454                      cp1_prop == LINE_BREAK_PROP_HL)) {
          455                         continue;
          456                 }
          457 
          458                 /* LB29 */
          459                 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_IS &&
          460                     (cp1_prop == LINE_BREAK_PROP_AL ||
          461                      cp1_prop == LINE_BREAK_PROP_HL)) {
          462                         continue;
          463                 }
          464 
          465                 /* LB30 */
          466                 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
          467                      last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL ||
          468                      last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU) &&
          469                     cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF) {
          470                         continue;
          471                 }
          472                 if (last_non_cm_or_zwj_prop ==
          473                             LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF &&
          474                     (cp1_prop == LINE_BREAK_PROP_AL ||
          475                      cp1_prop == LINE_BREAK_PROP_HL ||
          476                      cp1_prop == LINE_BREAK_PROP_NU)) {
          477                         continue;
          478                 }
          479 
          480                 /* LB30a */
          481                 if (!ri_even && last_non_cm_or_zwj_prop == LINE_BREAK_PROP_RI &&
          482                     cp1_prop == LINE_BREAK_PROP_RI) {
          483                         continue;
          484                 }
          485 
          486                 /* LB30b */
          487                 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB &&
          488                     cp1_prop == LINE_BREAK_PROP_EM) {
          489                         continue;
          490                 }
          491                 if (last_non_cm_or_zwj_prop ==
          492                             LINE_BREAK_PROP_BOTH_CN_EXTPICT &&
          493                     cp1_prop == LINE_BREAK_PROP_EM) {
          494                         continue;
          495                 }
          496 
          497                 /* LB31 */
          498                 break;
          499         }
          500 
          501         return herodotus_reader_number_read(r);
          502 }
          503 
          504 size_t
          505 grapheme_next_line_break(const uint_least32_t *str, size_t len)
          506 {
          507         HERODOTUS_READER r;
          508 
          509         herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
          510 
          511         return next_line_break(&r);
          512 }
          513 
          514 size_t
          515 grapheme_next_line_break_utf8(const char *str, size_t len)
          516 {
          517         HERODOTUS_READER r;
          518 
          519         herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
          520 
          521         return next_line_break(&r);
          522 }