line.c - libgrapheme - unicode string library
 (HTM) git clone git://git.suckless.org/libgrapheme
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       line.c (11273B)
       ---
            1 /* See LICENSE file for copyright and license details. */
            2 #include <stdio.h>
            3 #include <stdlib.h>
            4 #include <string.h>
            5 
            6 #include "util.h"
            7 
            8 #define FILE_EAW   "data/EastAsianWidth.txt"
            9 #define FILE_EMOJI "data/emoji-data.txt"
           10 #define FILE_LINE  "data/LineBreak.txt"
           11 
           12 static const struct property_spec line_break_property[] = {
           13         {
           14                 .enumname = "AL",
           15                 .file = FILE_LINE,
           16                 .ucdname = "AL",
           17         },
           18         /*
           19          * Both extended pictographic and cn are large classes,
           20          * but we are only interested in their intersection for LB30b,
           21          * so we have the following two temporary classes. At first
           22          * the extpict-class is filled, then the cn-class, which leads
           23          * to conflicts (that we handle by putting them in the "proper"
           24          * class BOTH_CN_EXTPICT). We make use of the fact that there
           25          * is no intersection between AL and Cn.
           26          *
           27          * Any consecutive conflicts are permitted to overwrite
           28          * TMP_EXTENDED_PICTOGRAPHIC and TMP_CN, because we don't need
           29          * them, and in the final postprocessing we "reset" all
           30          * remaining matches (that then didn't fit any of the other
           31          * classes) to the generic class AL.
           32          */
           33         {
           34                 .enumname = "TMP_CN",
           35                 .file = FILE_LINE,
           36                 .ucdname = "Cn",
           37         },
           38         {
           39                 .enumname = "TMP_EXTENDED_PICTOGRAPHIC",
           40                 .file = FILE_EMOJI,
           41                 .ucdname = "Extended_Pictographic",
           42         },
           43         /* end of special block */
           44         {
           45                 .enumname = "B2",
           46                 .file = FILE_LINE,
           47                 .ucdname = "B2",
           48         },
           49         {
           50                 .enumname = "BA",
           51                 .file = FILE_LINE,
           52                 .ucdname = "BA",
           53         },
           54         {
           55                 .enumname = "BB",
           56                 .file = FILE_LINE,
           57                 .ucdname = "BB",
           58         },
           59         {
           60                 .enumname = "BK",
           61                 .file = FILE_LINE,
           62                 .ucdname = "BK",
           63         },
           64         {
           65                 .enumname = "BOTH_CN_EXTPICT",
           66                 .file = NULL,
           67                 .ucdname = NULL,
           68         },
           69         {
           70                 .enumname = "CB",
           71                 .file = FILE_LINE,
           72                 .ucdname = "CB",
           73         },
           74         {
           75                 .enumname = "CL",
           76                 .file = FILE_LINE,
           77                 .ucdname = "CL",
           78         },
           79         {
           80                 .enumname = "CM",
           81                 .file = FILE_LINE,
           82                 .ucdname = "CM",
           83         },
           84         {
           85                 .enumname = "CP_WITHOUT_EAW_HWF",
           86                 .file = FILE_LINE,
           87                 .ucdname = "CP",
           88         },
           89         {
           90                 .enumname = "CP_WITH_EAW_HWF",
           91                 .file = NULL,
           92                 .ucdname = NULL,
           93         },
           94         {
           95                 .enumname = "CR",
           96                 .file = FILE_LINE,
           97                 .ucdname = "CR",
           98         },
           99         {
          100                 .enumname = "EB",
          101                 .file = FILE_LINE,
          102                 .ucdname = "EB",
          103         },
          104         {
          105                 .enumname = "EM",
          106                 .file = FILE_LINE,
          107                 .ucdname = "EM",
          108         },
          109         {
          110                 .enumname = "EX",
          111                 .file = FILE_LINE,
          112                 .ucdname = "EX",
          113         },
          114         {
          115                 .enumname = "GL",
          116                 .file = FILE_LINE,
          117                 .ucdname = "GL",
          118         },
          119         {
          120                 .enumname = "H2",
          121                 .file = FILE_LINE,
          122                 .ucdname = "H2",
          123         },
          124         {
          125                 .enumname = "H3",
          126                 .file = FILE_LINE,
          127                 .ucdname = "H3",
          128         },
          129         {
          130                 .enumname = "HL",
          131                 .file = FILE_LINE,
          132                 .ucdname = "HL",
          133         },
          134         {
          135                 .enumname = "HY",
          136                 .file = FILE_LINE,
          137                 .ucdname = "HY",
          138         },
          139         {
          140                 .enumname = "ID",
          141                 .file = FILE_LINE,
          142                 .ucdname = "ID",
          143         },
          144         {
          145                 .enumname = "IN",
          146                 .file = FILE_LINE,
          147                 .ucdname = "IN",
          148         },
          149         {
          150                 .enumname = "IS",
          151                 .file = FILE_LINE,
          152                 .ucdname = "IS",
          153         },
          154         {
          155                 .enumname = "JL",
          156                 .file = FILE_LINE,
          157                 .ucdname = "JL",
          158         },
          159         {
          160                 .enumname = "JT",
          161                 .file = FILE_LINE,
          162                 .ucdname = "JT",
          163         },
          164         {
          165                 .enumname = "JV",
          166                 .file = FILE_LINE,
          167                 .ucdname = "JV",
          168         },
          169         {
          170                 .enumname = "LF",
          171                 .file = FILE_LINE,
          172                 .ucdname = "LF",
          173         },
          174         {
          175                 .enumname = "NL",
          176                 .file = FILE_LINE,
          177                 .ucdname = "NL",
          178         },
          179         {
          180                 .enumname = "NS",
          181                 .file = FILE_LINE,
          182                 .ucdname = "NS",
          183         },
          184         {
          185                 .enumname = "NU",
          186                 .file = FILE_LINE,
          187                 .ucdname = "NU",
          188         },
          189         {
          190                 .enumname = "OP_WITHOUT_EAW_HWF",
          191                 .file = FILE_LINE,
          192                 .ucdname = "OP",
          193         },
          194         {
          195                 .enumname = "OP_WITH_EAW_HWF",
          196                 .file = NULL,
          197                 .ucdname = NULL,
          198         },
          199         {
          200                 .enumname = "PO",
          201                 .file = FILE_LINE,
          202                 .ucdname = "PO",
          203         },
          204         {
          205                 .enumname = "PR",
          206                 .file = FILE_LINE,
          207                 .ucdname = "PR",
          208         },
          209         {
          210                 .enumname = "QU",
          211                 .file = FILE_LINE,
          212                 .ucdname = "QU",
          213         },
          214         {
          215                 .enumname = "RI",
          216                 .file = FILE_LINE,
          217                 .ucdname = "RI",
          218         },
          219         {
          220                 .enumname = "SP",
          221                 .file = FILE_LINE,
          222                 .ucdname = "SP",
          223         },
          224         {
          225                 .enumname = "SY",
          226                 .file = FILE_LINE,
          227                 .ucdname = "SY",
          228         },
          229         {
          230                 .enumname = "WJ",
          231                 .file = FILE_LINE,
          232                 .ucdname = "WJ",
          233         },
          234         {
          235                 .enumname = "ZW",
          236                 .file = FILE_LINE,
          237                 .ucdname = "ZW",
          238         },
          239         {
          240                 .enumname = "ZWJ",
          241                 .file = FILE_LINE,
          242                 .ucdname = "ZWJ",
          243         },
          244         {
          245                 .enumname = "TMP_AI",
          246                 .file = FILE_LINE,
          247                 .ucdname = "AI",
          248         },
          249         {
          250                 .enumname = "TMP_CJ",
          251                 .file = FILE_LINE,
          252                 .ucdname = "CJ",
          253         },
          254         {
          255                 .enumname = "TMP_XX",
          256                 .file = NULL,
          257                 .ucdname = NULL,
          258         },
          259         {
          260                 .enumname = "TMP_MN",
          261                 .file = FILE_LINE,
          262                 .ucdname = "Mn",
          263         },
          264         {
          265                 .enumname = "TMP_MC",
          266                 .file = FILE_LINE,
          267                 .ucdname = "Mc",
          268         },
          269         {
          270                 .enumname = "TMP_SA_WITHOUT_MN_OR_MC",
          271                 .file = FILE_LINE,
          272                 .ucdname = "SA",
          273         },
          274         {
          275                 .enumname = "TMP_SA_WITH_MN_OR_MC",
          276                 .file = FILE_LINE,
          277                 .ucdname = "SA",
          278         },
          279         {
          280                 .enumname = "TMP_SG",
          281                 .file = FILE_LINE,
          282                 .ucdname = "SG",
          283         },
          284         {
          285                 .enumname = "TMP_EAW_H",
          286                 .file = FILE_EAW,
          287                 .ucdname = "H",
          288         },
          289         {
          290                 .enumname = "TMP_EAW_W",
          291                 .file = FILE_EAW,
          292                 .ucdname = "W",
          293         },
          294         {
          295                 .enumname = "TMP_EAW_F",
          296                 .file = FILE_EAW,
          297                 .ucdname = "F",
          298         },
          299 };
          300 
          301 static uint_least8_t
          302 handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2)
          303 {
          304         uint_least8_t result = prop2;
          305         char *target = NULL;
          306 
          307         (void)cp;
          308 
          309         if ((!strcmp(line_break_property[prop1].enumname, "TMP_EAW_H") ||
          310              !strcmp(line_break_property[prop1].enumname, "TMP_EAW_W") ||
          311              !strcmp(line_break_property[prop1].enumname, "TMP_EAW_F")) ||
          312             (!strcmp(line_break_property[prop2].enumname, "TMP_EAW_H") ||
          313              !strcmp(line_break_property[prop2].enumname, "TMP_EAW_W") ||
          314              !strcmp(line_break_property[prop2].enumname, "TMP_EAW_F"))) {
          315                 if (!strcmp(line_break_property[prop1].enumname,
          316                             "CP_WITHOUT_EAW_HWF") ||
          317                     !strcmp(line_break_property[prop2].enumname,
          318                             "CP_WITHOUT_EAW_HWF")) {
          319                         target = "CP_WITH_EAW_HWF";
          320                 } else if (!strcmp(line_break_property[prop1].enumname,
          321                                    "OP_WITHOUT_EAW_HWF") ||
          322                            !strcmp(line_break_property[prop2].enumname,
          323                                    "OP_WITHOUT_EAW_HWF")) {
          324                         target = "OP_WITH_EAW_HWF";
          325                 } else {
          326                         /* ignore EAW for the rest */
          327                         if ((!strcmp(line_break_property[prop1].enumname,
          328                                      "TMP_EAW_H") ||
          329                              !strcmp(line_break_property[prop1].enumname,
          330                                      "TMP_EAW_W") ||
          331                              !strcmp(line_break_property[prop1].enumname,
          332                                      "TMP_EAW_F"))) {
          333                                 result = prop2;
          334                         } else {
          335                                 result = prop1;
          336                         }
          337                 }
          338         } else if ((!strcmp(line_break_property[prop1].enumname, "TMP_MN") ||
          339                     !strcmp(line_break_property[prop1].enumname, "TMP_MC")) ||
          340                    (!strcmp(line_break_property[prop2].enumname, "TMP_MN") ||
          341                     !strcmp(line_break_property[prop2].enumname, "TMP_MC"))) {
          342                 if (!strcmp(line_break_property[prop1].enumname,
          343                             "SA_WITHOUT_MN_OR_MC") ||
          344                     !strcmp(line_break_property[prop2].enumname,
          345                             "SA_WITHOUT_MN_OR_MC")) {
          346                         target = "SA_WITH_MN_OR_MC";
          347                 } else {
          348                         /* ignore Mn and Mc for the rest */
          349                         if ((!strcmp(line_break_property[prop1].enumname,
          350                                      "TMP_MN") ||
          351                              !strcmp(line_break_property[prop1].enumname,
          352                                      "TMP_MC"))) {
          353                                 result = prop2;
          354                         } else {
          355                                 result = prop1;
          356                         }
          357                 }
          358         } else if (!strcmp(line_break_property[prop1].enumname, "TMP_CN") ||
          359                    !strcmp(line_break_property[prop2].enumname, "TMP_CN")) {
          360                 if (!strcmp(line_break_property[prop1].enumname,
          361                             "TMP_EXTENDED_PICTOGRAPHIC") ||
          362                     !strcmp(line_break_property[prop2].enumname,
          363                             "TMP_EXTENDED_PICTOGRAPHIC")) {
          364                         target = "BOTH_CN_EXTPICT";
          365                 } else {
          366                         /* ignore Cn for all the other properties */
          367                         if (!strcmp(line_break_property[prop1].enumname,
          368                                     "TMP_CN")) {
          369                                 result = prop2;
          370                         } else {
          371                                 result = prop1;
          372                         }
          373                 }
          374         } else if (!strcmp(line_break_property[prop1].enumname,
          375                            "TMP_EXTENDED_PICTOGRAPHIC") ||
          376                    !strcmp(line_break_property[prop2].enumname,
          377                            "TMP_EXTENDED_PICTOGRAPHIC")) {
          378                 if (!strcmp(line_break_property[prop1].enumname, "TMP_CN") ||
          379                     !strcmp(line_break_property[prop2].enumname, "TMP_CN")) {
          380                         target = "BOTH_CN_EXTPICT";
          381                 } else {
          382                         /* ignore Extended_Pictographic for all the other
          383                          * properties */
          384                         if (!strcmp(line_break_property[prop1].enumname,
          385                                     "TMP_EXTENDED_PICTOGRAPHIC")) {
          386                                 result = prop2;
          387                         } else {
          388                                 result = prop1;
          389                         }
          390                 }
          391         } else {
          392                 fprintf(stderr,
          393                         "handle_conflict: Cannot handle conflict %s <- %s.\n",
          394                         line_break_property[prop1].enumname,
          395                         line_break_property[prop2].enumname);
          396                 exit(1);
          397         }
          398 
          399         if (target) {
          400                 for (result = 0; result < LEN(line_break_property); result++) {
          401                         if (!strcmp(line_break_property[result].enumname,
          402                                     target)) {
          403                                 break;
          404                         }
          405                 }
          406                 if (result == LEN(line_break_property)) {
          407                         fprintf(stderr, "handle_conflict: Internal error.\n");
          408                         exit(1);
          409                 }
          410         }
          411 
          412         return result;
          413 }
          414 
          415 static void
          416 post_process(struct properties *prop)
          417 {
          418         const char *target;
          419         uint_least8_t result;
          420         size_t i;
          421 
          422         /* post-mapping according to the line breaking algorithm */
          423         for (i = 0; i < UINT32_C(0x110000); i++) {
          424                 /* LB1 */
          425                 if (!strcmp(line_break_property[prop[i].property].enumname,
          426                             "TMP_AI") ||
          427                     !strcmp(line_break_property[prop[i].property].enumname,
          428                             "TMP_SG") ||
          429                     !strcmp(line_break_property[prop[i].property].enumname,
          430                             "TMP_XX")) {
          431                         /* map AI, SG and XX to AL */
          432                         target = "AL";
          433                 } else if (!strcmp(line_break_property[prop[i].property]
          434                                            .enumname,
          435                                    "TMP_SA_WITH_MN_OR_MC")) {
          436                         /* map SA (with General_Category Mn or Mc) to CM */
          437                         target = "CM";
          438                 } else if (!strcmp(line_break_property[prop[i].property]
          439                                            .enumname,
          440                                    "TMP_SA_WITHOUT_MN_OR_MC")) {
          441                         /* map SA (without General_Category Mn or Mc) to AL */
          442                         target = "AL";
          443                 } else if (!strcmp(line_break_property[prop[i].property]
          444                                            .enumname,
          445                                    "TMP_CJ")) {
          446                         /* map CJ to NS */
          447                         target = "NS";
          448                 } else if (
          449                         !strcmp(line_break_property[prop[i].property].enumname,
          450                                 "TMP_CN") ||
          451                         !strcmp(line_break_property[prop[i].property].enumname,
          452                                 "TMP_EXTENDED_PICTOGRAPHIC") ||
          453                         !strcmp(line_break_property[prop[i].property].enumname,
          454                                 "TMP_MN") ||
          455                         !strcmp(line_break_property[prop[i].property].enumname,
          456                                 "TMP_MC") ||
          457                         !strcmp(line_break_property[prop[i].property].enumname,
          458                                 "TMP_EAW_H") ||
          459                         !strcmp(line_break_property[prop[i].property].enumname,
          460                                 "TMP_EAW_W") ||
          461                         !strcmp(line_break_property[prop[i].property].enumname,
          462                                 "TMP_EAW_F")) {
          463                         /* map all the temporary classes "residue" to AL */
          464                         target = "AL";
          465                 } else {
          466                         target = NULL;
          467                 }
          468 
          469                 if (target) {
          470                         for (result = 0; result < LEN(line_break_property);
          471                              result++) {
          472                                 if (!strcmp(line_break_property[result]
          473                                                     .enumname,
          474                                             target)) {
          475                                         break;
          476                                 }
          477                         }
          478                         if (result == LEN(line_break_property)) {
          479                                 fprintf(stderr,
          480                                         "handle_conflict: Internal error.\n");
          481                                 exit(1);
          482                         }
          483 
          484                         prop[i].property = result;
          485                 }
          486         }
          487 }
          488 
          489 int
          490 main(int argc, char *argv[])
          491 {
          492         (void)argc;
          493 
          494         properties_generate_break_property(
          495                 line_break_property, LEN(line_break_property), NULL,
          496                 handle_conflict, post_process, "line_break", argv[0]);
          497 
          498         return 0;
          499 }