case.c - libgrapheme - unicode string library
 (HTM) git clone git://git.suckless.org/libgrapheme
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       case.c (8442B)
       ---
            1 /* See LICENSE file for copyright and license details. */
            2 #include <errno.h>
            3 #include <stdint.h>
            4 #include <stdio.h>
            5 #include <stdlib.h>
            6 #include <string.h>
            7 
            8 #include "util.h"
            9 
           10 #define FILE_DCP "data/DerivedCoreProperties.txt"
           11 
           12 static const struct property_spec case_property[] = {
           13         {
           14                 .enumname = "OTHER",
           15                 .file = NULL,
           16                 .ucdname = NULL,
           17         },
           18         {
           19                 .enumname = "BOTH_CASED_CASE_IGNORABLE",
           20                 .file = NULL,
           21                 .ucdname = NULL,
           22         },
           23         {
           24                 .enumname = "CASED",
           25                 .file = FILE_DCP,
           26                 .ucdname = "Cased",
           27         },
           28         {
           29                 .enumname = "CASE_IGNORABLE",
           30                 .file = FILE_DCP,
           31                 .ucdname = "Case_Ignorable",
           32         },
           33         {
           34                 .enumname = "UNCASED",
           35                 .file = FILE_DCP,
           36                 .ucdname = "Uncased",
           37         },
           38 };
           39 
           40 static uint_least8_t
           41 handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2)
           42 {
           43         uint_least8_t result;
           44 
           45         (void)cp;
           46 
           47         if ((!strcmp(case_property[prop1].enumname, "CASED") &&
           48              !strcmp(case_property[prop2].enumname, "CASE_IGNORABLE")) ||
           49             (!strcmp(case_property[prop1].enumname, "CASE_IGNORABLE") &&
           50              !strcmp(case_property[prop2].enumname, "CASED"))) {
           51                 for (result = 0; result < LEN(case_property); result++) {
           52                         if (!strcmp(case_property[result].enumname,
           53                                     "BOTH_CASED_CASE_IGNORABLE")) {
           54                                 break;
           55                         }
           56                 }
           57                 if (result == LEN(case_property)) {
           58                         fprintf(stderr, "handle_conflict: Internal error.\n");
           59                         exit(1);
           60                 }
           61         } else {
           62                 fprintf(stderr, "handle_conflict: Cannot handle conflict.\n");
           63                 exit(1);
           64         }
           65 
           66         return result;
           67 }
           68 
           /*
            * Per-codepoint tables for the upper-, lower- and titlecase mappings.
            * They are allocated in main() and filled in by the parser callbacks.
            */
           static struct properties *prop_upper = NULL;
           static struct properties *prop_lower = NULL;
           static struct properties *prop_title = NULL;

           /*
            * One parsed special-casing rule: a codepoint list (and its length)
            * for each of the three case forms.
            */
           struct special_case {
                   struct {
                           uint_least32_t *cp;
                           size_t cplen;
                   } upper, lower, title;
           };

           /* growing array of special-casing rules and its current length */
           static struct special_case *sc = NULL;
           static size_t sclen = 0;
           79 
           80 static int
           81 unicodedata_callback(const char *file, char **field, size_t nfields,
           82                      char *comment, void *payload)
           83 {
           84         uint_least32_t cp, upper, lower, title;
           85 
           86         (void)file;
           87         (void)comment;
           88         (void)payload;
           89 
           90         hextocp(field[0], strlen(field[0]), &cp);
           91 
           92         upper = lower = title = cp;
           93 
           94         if ((strlen(field[12]) > 0 &&
           95              hextocp(field[12], strlen(field[12]), &upper)) ||
           96             (strlen(field[13]) > 0 &&
           97              hextocp(field[13], strlen(field[13]), &lower)) ||
           98             (nfields >= 15 && strlen(field[14]) > 0 &&
           99              hextocp(field[14], strlen(field[14]), &title))) {
          100                 return 1;
          101         }
          102 
          103         prop_upper[cp].property = (int_least32_t)upper - (int_least32_t)cp;
          104         prop_lower[cp].property = (int_least32_t)lower - (int_least32_t)cp;
          105         prop_title[cp].property = (int_least32_t)title - (int_least32_t)cp;
          106 
          107         return 0;
          108 }
          109 
          110 static int
          111 specialcasing_callback(const char *file, char **field, size_t nfields,
          112                        char *comment, void *payload)
          113 {
          114         uint_least32_t cp;
          115 
          116         (void)file;
          117         (void)comment;
          118         (void)payload;
          119 
          120         if (nfields > 4 && strlen(field[4]) > 0) {
          121                 /*
          122                  * we have more than 4 fields, i.e. the rule has a
          123                  * condition (language-sensitive, etc.) and is discarded
          124                  */
          125                 return 0;
          126         }
          127 
          128         /* parse affected codepoint */
          129         hextocp(field[0], strlen(field[0]), &cp);
          130 
          131         /* extend special case array */
          132         if (!(sc = realloc(sc, (++sclen) * sizeof(*sc)))) {
          133                 fprintf(stderr, "realloc: %s\n", strerror(errno));
          134                 exit(1);
          135         }
          136 
          137         /* parse field data */
          138         parse_cp_list(field[3], &(sc[sclen - 1].upper.cp),
          139                       &(sc[sclen - 1].upper.cplen));
          140         parse_cp_list(field[1], &(sc[sclen - 1].lower.cp),
          141                       &(sc[sclen - 1].lower.cplen));
          142         parse_cp_list(field[2], &(sc[sclen - 1].title.cp),
          143                       &(sc[sclen - 1].title.cplen));
          144 
          145         /*
          146          * overwrite value in "single mapping" property table by the
          147          * special value 0x110000 + (offset in special case array),
          148          * even if the special case has length 1
          149          */
          150         prop_upper[cp].property =
          151                 (int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
          152         prop_lower[cp].property =
          153                 (int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
          154         prop_title[cp].property =
          155                 (int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
          156 
          157         return 0;
          158 }
          159 
          160 static int_least64_t
          161 get_value(const struct properties *prop, size_t offset)
          162 {
          163         return prop[offset].property;
          164 }
          165 
          166 int
          167 main(int argc, char *argv[])
          168 {
          169         struct properties_compressed comp_upper, comp_lower, comp_title;
          170         struct properties_major_minor mm_upper, mm_lower, mm_title;
          171         size_t i, j;
          172 
          173         (void)argc;
          174 
          175         /* generate case property table from the specification */
          176         properties_generate_break_property(case_property, LEN(case_property),
          177                                            NULL, handle_conflict, NULL, "case",
          178                                            argv[0]);
          179 
          180         /*
          181          * allocate property buffers for all 0x110000 codepoints
          182          *
          183          * the buffers contain the offset from the "base" character
          184          * to the respective case mapping. By callocing we set all fields
          185          * to zero, which is also the Unicode "default" in the sense that
          186          * there is no case mapping by default (unless we fill it in)
          187          */
          188         if (!(prop_upper = calloc(UINT32_C(0x110000), sizeof(*prop_upper))) ||
          189             !(prop_lower = calloc(UINT32_C(0x110000), sizeof(*prop_lower))) ||
          190             !(prop_title = calloc(UINT32_C(0x110000), sizeof(*prop_title)))) {
          191                 fprintf(stderr, "calloc: %s\n", strerror(errno));
          192                 exit(1);
          193         }
          194         parse_file_with_callback("data/UnicodeData.txt", unicodedata_callback,
          195                                  NULL);
          196         parse_file_with_callback("data/SpecialCasing.txt",
          197                                  specialcasing_callback, NULL);
          198 
          199         /* compress properties */
          200         properties_compress(prop_upper, &comp_upper);
          201         properties_compress(prop_lower, &comp_lower);
          202         properties_compress(prop_title, &comp_title);
          203 
          204         fprintf(stderr,
          205                 "%s: LUT compression-ratios: upper=%.2f%%, lower=%.2f%%, "
          206                 "title=%.2f%%\n",
          207                 argv[0], properties_get_major_minor(&comp_upper, &mm_upper),
          208                 properties_get_major_minor(&comp_lower, &mm_lower),
          209                 properties_get_major_minor(&comp_title, &mm_title));
          210 
          211         /* print tables */
          212         printf("/* Automatically generated by %s */\n#include "
          213                "<stdint.h>\n#include <stddef.h>\n\n",
          214                argv[0]);
          215 
          216         printf("struct special_case {\n\tuint_least32_t *cp;\n\tsize_t "
          217                "cplen;\n};\n\n");
          218 
          219         properties_print_lookup_table("upper_major", mm_upper.major, 0x1100);
          220         printf("\n");
          221         properties_print_derived_lookup_table("upper_minor", mm_upper.minor,
          222                                               mm_upper.minorlen, get_value,
          223                                               comp_upper.data);
          224         printf("\n");
          225         properties_print_lookup_table("lower_major", mm_lower.major, 0x1100);
          226         printf("\n");
          227         properties_print_derived_lookup_table("lower_minor", mm_lower.minor,
          228                                               mm_lower.minorlen, get_value,
          229                                               comp_lower.data);
          230         printf("\n");
          231         properties_print_lookup_table("title_major", mm_title.major, 0x1100);
          232         printf("\n");
          233         properties_print_derived_lookup_table("title_minor", mm_title.minor,
          234                                               mm_title.minorlen, get_value,
          235                                               comp_title.data);
          236         printf("\n");
          237 
          238         printf("static const struct special_case upper_special[] = {\n");
          239         for (i = 0; i < sclen; i++) {
          240                 printf("\t{\n");
          241 
          242                 printf("\t\t.cp     = (uint_least32_t[]){");
          243                 for (j = 0; j < sc[i].upper.cplen; j++) {
          244                         printf(" UINT32_C(0x%06X)", sc[i].upper.cp[j]);
          245                         if (j + 1 < sc[i].upper.cplen) {
          246                                 putchar(',');
          247                         }
          248                 }
          249                 printf(" },\n");
          250                 printf("\t\t.cplen  = %zu,\n", sc[i].upper.cplen);
          251                 printf("\t},\n");
          252         }
          253         printf("};\n\n");
          254 
          255         printf("static const struct special_case lower_special[] = {\n");
          256         for (i = 0; i < sclen; i++) {
          257                 printf("\t{\n");
          258 
          259                 printf("\t\t.cp     = (uint_least32_t[]){");
          260                 for (j = 0; j < sc[i].lower.cplen; j++) {
          261                         printf(" UINT32_C(0x%06X)", sc[i].lower.cp[j]);
          262                         if (j + 1 < sc[i].lower.cplen) {
          263                                 putchar(',');
          264                         }
          265                 }
          266                 printf(" },\n");
          267                 printf("\t\t.cplen  = %zu,\n", sc[i].lower.cplen);
          268                 printf("\t},\n");
          269         }
          270         printf("};\n\n");
          271 
          272         printf("static const struct special_case title_special[] = {\n");
          273         for (i = 0; i < sclen; i++) {
          274                 printf("\t{\n");
          275 
          276                 printf("\t\t.cp     = (uint_least32_t[]){");
          277                 for (j = 0; j < sc[i].title.cplen; j++) {
          278                         printf(" UINT32_C(0x%06X)", sc[i].title.cp[j]);
          279                         if (j + 1 < sc[i].title.cplen) {
          280                                 putchar(',');
          281                         }
          282                 }
          283                 printf(" },\n");
          284                 printf("\t\t.cplen  = %zu,\n", sc[i].title.cplen);
          285                 printf("\t},\n");
          286         }
          287         printf("};\n\n");
          288 
          289         free(comp_lower.data);
          290         free(comp_lower.offset);
          291         free(comp_title.data);
          292         free(comp_title.offset);
          293         free(comp_upper.data);
          294         free(comp_upper.offset);
          295         free(mm_lower.major);
          296         free(mm_lower.minor);
          297         free(mm_title.major);
          298         free(mm_title.minor);
          299         free(mm_upper.major);
          300         free(mm_upper.minor);
          301 
          302         return 0;
          303 }