bidirectional.c - libgrapheme - unicode string library
 (HTM) git clone git://git.suckless.org/libgrapheme
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       bidirectional.c (12515B)
       ---
            1 /* See LICENSE file for copyright and license details. */
            2 #include <errno.h>
            3 #include <inttypes.h>
            4 #include <stddef.h>
            5 #include <stdio.h>
            6 #include <stdlib.h>
            7 #include <string.h>
            8 
            9 #include "util.h"
           10 
           11 #define FILE_BIDI_BRACKETS  "data/BidiBrackets.txt"
           12 #define FILE_BIDI_CLASS     "data/DerivedBidiClass.txt"
           13 #define FILE_BIDI_MIRRORING "data/BidiMirroring.txt"
           14 #define FILE_UNICODE_DATA   "data/UnicodeData.txt"
           15 
           16 #define NUM_BRACKET_ALIASES 20
           17 
           18 static const struct property_spec bidi_property[] = {
           19         {
           20                 /* default */
           21                 .enumname = "L",
           22                 .file = FILE_BIDI_CLASS,
           23                 .ucdname = "L",
           24         },
           25         {
           26                 .enumname = "AL",
           27                 .file = FILE_BIDI_CLASS,
           28                 .ucdname = "AL",
           29         },
           30         {
           31                 .enumname = "AN",
           32                 .file = FILE_BIDI_CLASS,
           33                 .ucdname = "AN",
           34         },
           35         {
           36                 .enumname = "B",
           37                 .file = FILE_BIDI_CLASS,
           38                 .ucdname = "B",
           39         },
           40         {
           41                 .enumname = "BN",
           42                 .file = FILE_BIDI_CLASS,
           43                 .ucdname = "BN",
           44         },
           45         {
           46                 .enumname = "CS",
           47                 .file = FILE_BIDI_CLASS,
           48                 .ucdname = "CS",
           49         },
           50         {
           51                 .enumname = "EN",
           52                 .file = FILE_BIDI_CLASS,
           53                 .ucdname = "EN",
           54         },
           55         {
           56                 .enumname = "ES",
           57                 .file = FILE_BIDI_CLASS,
           58                 .ucdname = "ES",
           59         },
           60         {
           61                 .enumname = "ET",
           62                 .file = FILE_BIDI_CLASS,
           63                 .ucdname = "ET",
           64         },
           65         {
           66                 .enumname = "FSI",
           67                 .file = FILE_BIDI_CLASS,
           68                 .ucdname = "FSI",
           69         },
           70         {
           71                 .enumname = "LRE",
           72                 .file = FILE_BIDI_CLASS,
           73                 .ucdname = "LRE",
           74         },
           75         {
           76                 .enumname = "LRI",
           77                 .file = FILE_BIDI_CLASS,
           78                 .ucdname = "LRI",
           79         },
           80         {
           81                 .enumname = "LRO",
           82                 .file = FILE_BIDI_CLASS,
           83                 .ucdname = "LRO",
           84         },
           85         {
           86                 .enumname = "NSM",
           87                 .file = FILE_BIDI_CLASS,
           88                 .ucdname = "NSM",
           89         },
           90         {
           91                 .enumname = "ON",
           92                 .file = FILE_BIDI_CLASS,
           93                 .ucdname = "ON",
           94         },
           95         {
           96                 .enumname = "PDF",
           97                 .file = FILE_BIDI_CLASS,
           98                 .ucdname = "PDF",
           99         },
          100         {
          101                 .enumname = "PDI",
          102                 .file = FILE_BIDI_CLASS,
          103                 .ucdname = "PDI",
          104         },
          105         {
          106                 .enumname = "R",
          107                 .file = FILE_BIDI_CLASS,
          108                 .ucdname = "R",
          109         },
          110         {
          111                 .enumname = "RLE",
          112                 .file = FILE_BIDI_CLASS,
          113                 .ucdname = "RLE",
          114         },
          115         {
          116                 .enumname = "RLI",
          117                 .file = FILE_BIDI_CLASS,
          118                 .ucdname = "RLI",
          119         },
          120         {
          121                 .enumname = "RLO",
          122                 .file = FILE_BIDI_CLASS,
          123                 .ucdname = "RLO",
          124         },
          125         {
          126                 .enumname = "S",
          127                 .file = FILE_BIDI_CLASS,
          128                 .ucdname = "S",
          129         },
          130         {
          131                 .enumname = "WS",
          132                 .file = FILE_BIDI_CLASS,
          133                 .ucdname = "WS",
          134         },
          135 };
          136 
          /*
           * Payload for decomposition_callback(): the caller sets cp to the
           * code point it is interested in, the callback fills in decomposition
           * once it reaches the line describing that code point.
           */
          struct decomposition_payload {
                  uint_least32_t cp;            /* in: code point to look up */
                  uint_least32_t decomposition; /* out: its decomposition */
          };

          /*
           * Line callback used on FILE_UNICODE_DATA. Lines with fewer than six
           * fields and lines describing a code point other than the requested
           * one are skipped. For the requested code point, field[5] is taken as
           * the decomposition: an optional leading "<...>" tag and the spaces
           * following it are skipped and the remainder is parsed as a code
           * point. An empty field[5] means the code point decomposes to itself.
           *
           * Returns 0 on success (including skipped lines) and 1 if hextocp()
           * reports a failure, so that an unparsable code point is no longer
           * silently compared against or stored in the payload.
           */
          static int
          decomposition_callback(const char *file, char **field, size_t nfields,
                                 char *comment, void *payload)
          {
                  struct decomposition_payload *decomp = payload;
                  uint_least32_t cp;
                  char *p, *close;

                  (void)file;
                  (void)comment;

                  if (nfields < 6) {
                          /* we have fewer than 6 fields, discard the line */
                          return 0;
                  }

                  if (hextocp(field[0], strlen(field[0]), &cp)) {
                          /* cp would be indeterminate, refuse to compare it */
                          return 1;
                  }

                  if (cp != decomp->cp) {
                          /* not the line we are looking for */
                          return 0;
                  }

                  p = field[5];

                  if (*p == '\0') {
                          /* no decomposition given, the code point is canonical */
                          decomp->decomposition = decomp->cp;
                          return 0;
                  }

                  if (*p == '<') {
                          /* skip the <...> metadata tag and trailing spaces */
                          close = strchr(p, '>');
                          if (close != NULL) {
                                  p = close + 1;
                                  p += strspn(p, " ");
                          } else {
                                  /* unterminated tag, nothing is left to parse */
                                  p += strlen(p);
                          }
                  }

                  if (hextocp(p, strlen(p), &(decomp->decomposition))) {
                          return 1;
                  }

                  return 0;
          }
          188 
          189 static struct {
          190         uint_least32_t base[NUM_BRACKET_ALIASES];
          191         size_t baselen;
          192         uint_least32_t pair[NUM_BRACKET_ALIASES];
          193         size_t pairlen;
          194         uint_least8_t class;
          195         char type;
          196 } *b = NULL;
          197 
          198 static size_t blen;
          199 static uint_least8_t bracket_class_count = 1;
          200 
          201 static int
          202 bracket_callback(const char *file, char **field, size_t nfields, char *comment,
          203                  void *payload)
          204 {
          205         size_t i, j;
          206         struct decomposition_payload decomp_base, decomp_pair;
          207         uint_least32_t cp_base, cp_pair;
          208 
          209         (void)file;
          210         (void)comment;
          211         (void)payload;
          212 
          213         if (nfields < 3) {
          214                 /* we have fewer than 3 fields, discard the line */
          215                 return 0;
          216         }
          217 
          218         /* parse field data */
          219         hextocp(field[0], strlen(field[0]), &cp_base);
          220         hextocp(field[1], strlen(field[1]), &cp_pair);
          221 
          222         /* determine decomposition of the base and pair codepoints */
          223         decomp_base.cp = cp_base;
          224         decomp_pair.cp = cp_pair;
          225         parse_file_with_callback(FILE_UNICODE_DATA, decomposition_callback,
          226                                  &decomp_base);
          227         parse_file_with_callback(FILE_UNICODE_DATA, decomposition_callback,
          228                                  &decomp_pair);
          229 
          230         /*
          231          * check if we already have the canonical form in the bracket array,
          232          * per convention the canonical form is the first element of the alias
          233          * array
          234          */
          235         for (i = 0; i < blen; i++) {
          236                 if (decomp_base.decomposition == b[i].base[0]) {
          237                         /* we have a match, check type */
          238                         if (strlen(field[2]) != 1 ||
          239                             (field[2][0] != 'o' && field[2][0] != 'c')) {
          240                                 /* malformed line */
          241                                 return 1;
          242                         } else if (b[i].type != field[2][0]) {
          243                                 /* mismatching types */
          244                                 return 1;
          245                         }
          246 
          247                         /*
          248                          * add our base alias to the base array unless it isn't
          249                          * already in it
          250                          */
          251                         for (j = 0; j < b[i].baselen; j++) {
          252                                 if (cp_base == b[i].base[j]) {
          253                                         /* already in array, do nothing */
          254                                         break;
          255                                 }
          256                         }
          257                         if (j == b[i].baselen) {
          258                                 /*
          259                                  * the base alias is not already in the array,
          260                                  * add it
          261                                  */
          262                                 if (b[i].baselen == NUM_BRACKET_ALIASES) {
          263                                         fprintf(stderr, "too many aliases\n");
          264                                         return 1;
          265                                 }
          266                                 b[i].baselen++;
          267                                 b[i].base[b[i].baselen - 1] = cp_base;
          268                         }
          269 
          270                         /*
          271                          * also add our pair alias to the pair array unless
          272                          * it isn't already in it
          273                          */
          274                         for (j = 0; j < b[i].pairlen; j++) {
          275                                 if (cp_pair == b[i].pair[j]) {
          276                                         /* already in array, do nothing */
          277                                         break;
          278                                 }
          279                         }
          280                         if (j == b[i].pairlen) {
          281                                 /*
          282                                  * the pair alias is not already in the array,
          283                                  * add it
          284                                  */
          285                                 if (b[i].pairlen == NUM_BRACKET_ALIASES) {
          286                                         fprintf(stderr, "too many aliases\n");
          287                                         return 1;
          288                                 }
          289                                 b[i].pairlen++;
          290                                 b[i].pair[b[i].pairlen - 1] = cp_pair;
          291                         }
          292 
          293                         return 0;
          294                 }
          295         }
          296 
          297         /* extend bracket pair array, as this is a new bracket type */
          298         if (!(b = realloc(b, (++blen) * sizeof(*b)))) {
          299                 fprintf(stderr, "realloc: %s\n", strerror(errno));
          300                 exit(1);
          301         }
          302 
          303         /* fill field data by adding the canonical form first */
          304         b[blen - 1].base[0] = decomp_base.decomposition;
          305         b[blen - 1].baselen = 1;
          306         b[blen - 1].pair[0] = decomp_pair.decomposition;
          307         b[blen - 1].pairlen = 1;
          308 
          309         /* add alias if it differs from the canonical form */
          310         if (cp_base != decomp_base.decomposition) {
          311                 b[blen - 1].base[1] = cp_base;
          312                 b[blen - 1].baselen = 2;
          313         }
          314         if (cp_pair != decomp_pair.decomposition) {
          315                 b[blen - 1].pair[1] = cp_pair;
          316                 b[blen - 1].pairlen = 2;
          317         }
          318 
          319         /* add bracket type */
          320         if (strlen(field[2]) != 1 ||
          321             (field[2][0] != 'o' && field[2][0] != 'c')) {
          322                 /* malformed line */
          323                 return 1;
          324         } else {
          325                 b[blen - 1].type = field[2][0];
          326         }
          327 
          328         /*
          329          * determine bracket class by iterating over the bracket-array
          330          * and seeing if our current canonical cp already has a matching pair.
          331          * We only need to check the first entry in each bracket alias
          332          * list, as this is, per convention, the canonical form.
          333          * If not, add a new class.
          334          */
          335         for (i = 0; i + 1 < blen; i++) {
          336                 if (b[i].pair[0] == b[blen - 1].base[0]) {
          337                         /* matched class */
          338                         b[blen - 1].class = b[i].class;
          339                         break;
          340                 }
          341         }
          342         if (i + 1 == blen) {
          343                 /* no match, assign a new class */
          344                 b[blen - 1].class = bracket_class_count++;
          345         }
          346 
          347         return 0;
          348 }
          349 
          350 static void
          351 post_process(struct properties *prop)
          352 {
          353         size_t i, j;
          354 
          355         for (i = 0; i < blen; i++) {
          356                 /*
          357                  * given the base property fits in 5 bits, we simply
          358                  * store the bracket-offset in the bits above that.
          359                  *
          360                  * All those properties that are not set here implicitly
          361                  * have offset 0, which we prepared to contain a stub
          362                  * for a character that is not a bracket.
          363                  */
          364                 for (j = 0; j < b[i].baselen; j++) {
          365                         prop[b[i].base[j]].property |= (i << 5);
          366                 }
          367         }
          368 }
          369 
          /*
           * Return the bidi class (as an index into bidi_property) that a code
           * point has when it is not explicitly listed, based on the
           * @missing-properties in data/DerivedBidiClass.txt. Everything outside
           * the ranges below defaults to class L (0).
           */
          static uint_least8_t
          fill_missing(uint_least32_t cp)
          {
                  static const struct {
                          uint_least32_t first;
                          uint_least32_t last;
                          uint_least8_t bidi_class;
                  } range[] = {
                          /* class R */
                          { UINT32_C(0x0590), UINT32_C(0x05FF), 17 },
                          { UINT32_C(0x07C0), UINT32_C(0x085F), 17 },
                          { UINT32_C(0xFB1D), UINT32_C(0xFB4F), 17 },
                          { UINT32_C(0x10800), UINT32_C(0x10CFF), 17 },
                          { UINT32_C(0x10D40), UINT32_C(0x10EBF), 17 },
                          { UINT32_C(0x10F00), UINT32_C(0x10F2F), 17 },
                          { UINT32_C(0x10F70), UINT32_C(0x10FFF), 17 },
                          { UINT32_C(0x1E800), UINT32_C(0x1EC6F), 17 },
                          { UINT32_C(0x1ECC0), UINT32_C(0x1ECFF), 17 },
                          { UINT32_C(0x1ED50), UINT32_C(0x1EDFF), 17 },
                          { UINT32_C(0x1EF00), UINT32_C(0x1EFFF), 17 },
                          /* class AL */
                          { UINT32_C(0x0600), UINT32_C(0x07BF), 1 },
                          { UINT32_C(0x0860), UINT32_C(0x08FF), 1 },
                          { UINT32_C(0xFB50), UINT32_C(0xFDCF), 1 },
                          { UINT32_C(0xFDF0), UINT32_C(0xFDFF), 1 },
                          { UINT32_C(0xFE70), UINT32_C(0xFEFF), 1 },
                          { UINT32_C(0x10D00), UINT32_C(0x10D3F), 1 },
                          { UINT32_C(0x10EC0), UINT32_C(0x10EFF), 1 },
                          { UINT32_C(0x10F30), UINT32_C(0x10F6F), 1 },
                          { UINT32_C(0x1EC70), UINT32_C(0x1ECBF), 1 },
                          { UINT32_C(0x1ED00), UINT32_C(0x1ED4F), 1 },
                          { UINT32_C(0x1EE00), UINT32_C(0x1EEFF), 1 },
                          /* class ET */
                          { UINT32_C(0x20A0), UINT32_C(0x20CF), 8 },
                  };
                  size_t k;

                  for (k = 0; k < sizeof(range) / sizeof(*range); k++) {
                          if (cp >= range[k].first && cp <= range[k].last) {
                                  return range[k].bidi_class;
                          }
                  }

                  return 0; /* class L */
          }
          404 
          405 static struct properties *prop_mirror = NULL;
          406 
          407 static int
          408 mirror_callback(const char *file, char **field, size_t nfields, char *comment,
          409                 void *payload)
          410 {
          411         uint_least32_t cp, cp_mirror;
          412 
          413         (void)file;
          414         (void)comment;
          415         (void)payload;
          416 
          417         hextocp(field[0], strlen(field[0]), &cp);
          418 
          419         cp_mirror = cp;
          420 
          421         if (nfields >= 2 && strlen(field[1]) > 0 &&
          422             hextocp(field[1], strlen(field[1]), &cp_mirror)) {
          423                 return 1;
          424         }
          425 
          426         prop_mirror[cp].property = (int_least32_t)cp_mirror - (int_least32_t)cp;
          427 
          428         return 0;
          429 }
          430 
          431 static int_least64_t
          432 get_value(const struct properties *prop, size_t offset)
          433 {
          434         return prop[offset].property;
          435 }
          436 
          437 int
          438 main(int argc, char *argv[])
          439 {
          440         struct properties_compressed comp_mirror;
          441         struct properties_major_minor mm_mirror;
          442         size_t i;
          443 
          444         (void)argc;
          445 
          446         /*
          447          * the first element in the bracket array is initialized to
          448          * all-zeros, as we use the implicit 0-offset for all those
          449          * codepoints that are not a bracket
          450          */
          451         if (!(b = calloc((blen = 1), sizeof(*b)))) {
          452                 fprintf(stderr, "calloc: %s\n", strerror(errno));
          453                 exit(1);
          454         }
          455         parse_file_with_callback(FILE_BIDI_BRACKETS, bracket_callback, NULL);
          456 
          457         properties_generate_break_property(bidi_property, LEN(bidi_property),
          458                                            fill_missing, NULL, post_process,
          459                                            "bidi", argv[0]);
          460 
          461         printf("\nenum bracket_type {\n\tBIDI_BRACKET_NONE,\n\t"
          462                "BIDI_BRACKET_OPEN,\n\tBIDI_BRACKET_CLOSE,\n};\n\n"
          463                "static const struct bracket {\n\tenum bracket_type type;\n"
          464                "\tuint_least8_t class;\n} bidi_bracket[] = {\n");
          465         for (i = 0; i < blen; i++) {
          466                 printf("\t{\n\t\t.type = %s,\n\t\t.class = "
          467                        "%" PRIuLEAST8 ",\n\t},\n",
          468                        (b[i].type == 'o') ? "BIDI_BRACKET_OPEN" :
          469                        (b[i].type == 'c') ? "BIDI_BRACKET_CLOSE" :
          470                                             "BIDI_BRACKET_NONE",
          471                        b[i].class);
          472         }
          473         printf("};\n");
          474 
          475         /*
          476          * allocate property buffer for all 0x110000 codepoints
          477          *
          478          * the buffers contain the offset from the "base" character
          479          * to the respective mirrored character. By callocing we set all
          480          * fields to zero, which is also the Unicode "default" in the sense
          481          * that the coe point is its mirror (unless we fill it in)
          482          */
          483         if (!(prop_mirror = calloc(UINT32_C(0x110000), sizeof(*prop_mirror)))) {
          484                 fprintf(stderr, "calloc: %s\n", strerror(errno));
          485                 exit(1);
          486         }
          487         parse_file_with_callback(FILE_BIDI_MIRRORING, mirror_callback, NULL);
          488 
          489         /* compress properties */
          490         properties_compress(prop_mirror, &comp_mirror);
          491 
          492         fprintf(stderr, "%s: mirror-LUT compression-ratio: %.2f%%\n", argv[0],
          493                 properties_get_major_minor(&comp_mirror, &mm_mirror));
          494 
          495         /* print tables */
          496         properties_print_lookup_table("mirror_major", mm_mirror.major, 0x1100);
          497         printf("\n");
          498         properties_print_derived_lookup_table("mirror_minor", mm_mirror.minor,
          499                                               mm_mirror.minorlen, get_value,
          500                                               comp_mirror.data);
          501 
          502         free(comp_mirror.data);
          503         free(comp_mirror.offset);
          504         free(mm_mirror.major);
          505         free(mm_mirror.minor);
          506 
          507         return 0;
          508 }