case.c - libgrapheme - unicode string library
 (HTM) git clone git://git.suckless.org/libgrapheme
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       case.c (12993B)
       ---
            1 /* See LICENSE file for copyright and license details. */
            2 #include <stddef.h>
            3 #include <stdint.h>
            4 
            5 #include "../gen/case.h"
            6 #include "../grapheme.h"
            7 #include "util.h"
            8 
            9 static inline enum case_property
           10 get_case_property(uint_least32_t cp)
           11 {
           12         if (likely(cp <= UINT32_C(0x10FFFF))) {
           13                 return (enum case_property)
           14                         case_minor[case_major[cp >> 8] + (cp & 0xFF)];
           15         } else {
           16                 return CASE_PROP_OTHER;
           17         }
           18 }
           19 
           20 static inline int_least32_t
           21 get_case_offset(uint_least32_t cp, const uint_least16_t *major,
           22                 const int_least32_t *minor)
           23 {
           24         if (likely(cp <= UINT32_C(0x10FFFF))) {
           25                 /*
           26                  * this value might be larger than or equal to 0x110000
           27                  * for the special-case-mapping. This needs to be handled
           28                  * separately
           29                  */
           30                 return minor[major[cp >> 8] + (cp & 0xFF)];
           31         } else {
           32                 return 0;
           33         }
           34 }
           35 
           36 static inline size_t
           37 to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w,
           38         uint_least8_t final_sigma_level, const uint_least16_t *major,
           39         const int_least32_t *minor, const struct special_case *sc)
           40 {
           41         HERODOTUS_READER tmp;
           42         enum case_property prop;
           43         enum herodotus_status s;
           44         size_t off, i;
           45         uint_least32_t cp, tmp_cp;
           46         int_least32_t map;
           47 
           48         for (; herodotus_read_codepoint(r, true, &cp) ==
           49                HERODOTUS_STATUS_SUCCESS;) {
           50                 if (sc == lower_special) {
           51                         /*
           52                          * For the special Final_Sigma-rule (see
           53                          * SpecialCasing.txt), which is the only non-localized
           54                          * case-dependent rule, we apply a different mapping
           55                          * when a sigma is at the end of a word.
           56                          *
           57                          * Before: cased case-ignorable*
           58                          * After: not(case-ignorable* cased)
           59                          *
           60                          * We check the after-condition on demand, but the
           61                          * before- condition is best checked using the
           62                          * "level"-heuristic also used in the sentence and line
           63                          * breaking-implementations.
           64                          */
           65                         if (cp == UINT32_C(0x03A3) && /* GREEK CAPITAL LETTER
           66                                                          SIGMA */
           67                             (final_sigma_level == 1 ||
           68                              final_sigma_level == 2)) {
           69                                 /*
           70                                  * check succeeding characters by first skipping
           71                                  * all case-ignorable characters and then
           72                                  * checking if the succeeding character is
           73                                  * cased, invalidating the after-condition
           74                                  */
           75                                 herodotus_reader_copy(r, &tmp);
           76                                 for (prop = NUM_CASE_PROPS;
           77                                      (s = herodotus_read_codepoint(&tmp, true,
           78                                                                    &tmp_cp)) ==
           79                                      HERODOTUS_STATUS_SUCCESS;) {
           80                                         prop = get_case_property(tmp_cp);
           81 
           82                                         if (prop != CASE_PROP_CASE_IGNORABLE &&
           83                                             prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
           84                                                 break;
           85                                         }
           86                                 }
           87 
           88                                 /*
           89                                  * Now prop is something other than
           90                                  * case-ignorable or the source-string ended. If
           91                                  * it is something other than cased, we know
           92                                  * that the after-condition holds
           93                                  */
           94                                 if (s != HERODOTUS_STATUS_SUCCESS ||
           95                                     (prop != CASE_PROP_CASED &&
           96                                      prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
           97                                         /*
           98                                          * write GREEK SMALL LETTER FINAL SIGMA
           99                                          * to destination
          100                                          */
          101                                         herodotus_write_codepoint(
          102                                                 w, UINT32_C(0x03C2));
          103 
          104                                         /* reset Final_Sigma-state and continue
          105                                          */
          106                                         final_sigma_level = 0;
          107                                         continue;
          108                                 }
          109                         }
          110 
          111                         /* update state */
          112                         prop = get_case_property(cp);
          113                         if ((final_sigma_level == 0 ||
          114                              final_sigma_level == 1) &&
          115                             (prop == CASE_PROP_CASED ||
          116                              prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
          117                                 /* sequence has begun */
          118                                 final_sigma_level = 1;
          119                         } else if (
          120                                 (final_sigma_level == 1 ||
          121                                  final_sigma_level == 2) &&
          122                                 (prop == CASE_PROP_CASE_IGNORABLE ||
          123                                  prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
          124                                 /* case-ignorable sequence begins or continued
          125                                  */
          126                                 final_sigma_level = 2;
          127                         } else {
          128                                 /* sequence broke */
          129                                 final_sigma_level = 0;
          130                         }
          131                 }
          132 
          133                 /* get and handle case mapping */
          134                 if (unlikely((map = get_case_offset(cp, major, minor)) >=
          135                              INT32_C(0x110000))) {
          136                         /* we have a special case and the offset in the sc-array
          137                          * is the difference to 0x110000*/
          138                         off = (uint_least32_t)map - UINT32_C(0x110000);
          139 
          140                         for (i = 0; i < sc[off].cplen; i++) {
          141                                 herodotus_write_codepoint(w, sc[off].cp[i]);
          142                         }
          143                 } else {
          144                         /* we have a simple mapping */
          145                         herodotus_write_codepoint(
          146                                 w, (uint_least32_t)((int_least32_t)cp + map));
          147                 }
          148         }
          149 
          150         herodotus_writer_nul_terminate(w);
          151 
          152         return herodotus_writer_number_written(w);
          153 }
          154 
          155 static size_t
          156 herodotus_next_word_break(const HERODOTUS_READER *r)
          157 {
          158         HERODOTUS_READER tmp;
          159 
          160         herodotus_reader_copy(r, &tmp);
          161 
          162         if (r->type == HERODOTUS_TYPE_CODEPOINT) {
          163                 return grapheme_next_word_break(tmp.src, tmp.srclen);
          164         } else { /* r->type == HERODOTUS_TYPE_UTF8 */
          165                 return grapheme_next_word_break_utf8(tmp.src, tmp.srclen);
          166         }
          167 }
          168 
          169 static inline size_t
          170 to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w)
          171 {
          172         enum case_property prop;
          173         enum herodotus_status s;
          174         uint_least32_t cp;
          175         size_t nwb;
          176 
          177         for (; (nwb = herodotus_next_word_break(r)) > 0;) {
          178                 herodotus_reader_push_advance_limit(r, nwb);
          179                 for (; (s = herodotus_read_codepoint(r, false, &cp)) ==
          180                        HERODOTUS_STATUS_SUCCESS;) {
          181                         /* check if we have a cased character */
          182                         prop = get_case_property(cp);
          183                         if (prop == CASE_PROP_CASED ||
          184                             prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
          185                                 break;
          186                         } else {
          187                                 /* write the data to the output verbatim, it if
          188                                  * permits */
          189                                 herodotus_write_codepoint(w, cp);
          190 
          191                                 /* increment reader */
          192                                 herodotus_read_codepoint(r, true, &cp);
          193                         }
          194                 }
          195 
          196                 if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
          197                         /* we are done */
          198                         herodotus_reader_pop_limit(r);
          199                         break;
          200                 } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
          201                         /*
          202                          * we did not encounter any cased character
          203                          * up to the word break
          204                          */
          205                         herodotus_reader_pop_limit(r);
          206                         continue;
          207                 } else {
          208                         /*
          209                          * we encountered a cased character before the word
          210                          * break, convert it to titlecase
          211                          */
          212                         herodotus_reader_push_advance_limit(
          213                                 r, herodotus_reader_next_codepoint_break(r));
          214                         to_case(r, w, 0, title_major, title_minor,
          215                                 title_special);
          216                         herodotus_reader_pop_limit(r);
          217                 }
          218 
          219                 /* cast the rest of the codepoints in the word to lowercase */
          220                 to_case(r, w, 1, lower_major, lower_minor, lower_special);
          221 
          222                 /* remove the limit on the word before the next iteration */
          223                 herodotus_reader_pop_limit(r);
          224         }
          225 
          226         herodotus_writer_nul_terminate(w);
          227 
          228         return herodotus_writer_number_written(w);
          229 }
          230 
          231 size_t
          232 grapheme_to_uppercase(const uint_least32_t *src, size_t srclen,
          233                       uint_least32_t *dest, size_t destlen)
          234 {
          235         HERODOTUS_READER r;
          236         HERODOTUS_WRITER w;
          237 
          238         herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
          239         herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
          240 
          241         return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
          242 }
          243 
          244 size_t
          245 grapheme_to_lowercase(const uint_least32_t *src, size_t srclen,
          246                       uint_least32_t *dest, size_t destlen)
          247 {
          248         HERODOTUS_READER r;
          249         HERODOTUS_WRITER w;
          250 
          251         herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
          252         herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
          253 
          254         return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
          255 }
          256 
          257 size_t
          258 grapheme_to_titlecase(const uint_least32_t *src, size_t srclen,
          259                       uint_least32_t *dest, size_t destlen)
          260 {
          261         HERODOTUS_READER r;
          262         HERODOTUS_WRITER w;
          263 
          264         herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
          265         herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
          266 
          267         return to_titlecase(&r, &w);
          268 }
          269 
          270 size_t
          271 grapheme_to_uppercase_utf8(const char *src, size_t srclen, char *dest,
          272                            size_t destlen)
          273 {
          274         HERODOTUS_READER r;
          275         HERODOTUS_WRITER w;
          276 
          277         herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
          278         herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
          279 
          280         return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
          281 }
          282 
          283 size_t
          284 grapheme_to_lowercase_utf8(const char *src, size_t srclen, char *dest,
          285                            size_t destlen)
          286 {
          287         HERODOTUS_READER r;
          288         HERODOTUS_WRITER w;
          289 
          290         herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
          291         herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
          292 
          293         return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
          294 }
          295 
          296 size_t
          297 grapheme_to_titlecase_utf8(const char *src, size_t srclen, char *dest,
          298                            size_t destlen)
          299 {
          300         HERODOTUS_READER r;
          301         HERODOTUS_WRITER w;
          302 
          303         herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
          304         herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
          305 
          306         return to_titlecase(&r, &w);
          307 }
          308 
          309 static inline bool
          310 is_case(HERODOTUS_READER *r, const uint_least16_t *major,
          311         const int_least32_t *minor, const struct special_case *sc,
          312         size_t *output)
          313 {
          314         size_t off, i;
          315         bool ret = true;
          316         uint_least32_t cp;
          317         int_least32_t map;
          318 
          319         for (; herodotus_read_codepoint(r, false, &cp) ==
          320                HERODOTUS_STATUS_SUCCESS;) {
          321                 /* get and handle case mapping */
          322                 if (unlikely((map = get_case_offset(cp, major, minor)) >=
          323                              INT32_C(0x110000))) {
          324                         /* we have a special case and the offset in the sc-array
          325                          * is the difference to 0x110000*/
          326                         off = (uint_least32_t)map - UINT32_C(0x110000);
          327 
          328                         for (i = 0; i < sc[off].cplen; i++) {
          329                                 if (herodotus_read_codepoint(r, false, &cp) ==
          330                                     HERODOTUS_STATUS_SUCCESS) {
          331                                         if (cp != sc[off].cp[i]) {
          332                                                 ret = false;
          333                                                 goto done;
          334                                         } else {
          335                                                 /* move forward */
          336                                                 herodotus_read_codepoint(
          337                                                         r, true, &cp);
          338                                         }
          339                                 } else {
          340                                         /*
          341                                          * input ended and we didn't see
          342                                          * any difference so far, so this
          343                                          * string is in fact okay
          344                                          */
          345                                         ret = true;
          346                                         goto done;
          347                                 }
          348                         }
          349                 } else {
          350                         /* we have a simple mapping */
          351                         if (cp != (uint_least32_t)((int_least32_t)cp + map)) {
          352                                 /* we have a difference */
          353                                 ret = false;
          354                                 goto done;
          355                         } else {
          356                                 /* move forward */
          357                                 herodotus_read_codepoint(r, true, &cp);
          358                         }
          359                 }
          360         }
          361 done:
          362         if (output) {
          363                 *output = herodotus_reader_number_read(r);
          364         }
          365         return ret;
          366 }
          367 
          368 static inline bool
          369 is_titlecase(HERODOTUS_READER *r, size_t *output)
          370 {
          371         enum case_property prop;
          372         enum herodotus_status s;
          373         bool ret = true;
          374         uint_least32_t cp;
          375         size_t nwb;
          376 
          377         for (; (nwb = herodotus_next_word_break(r)) > 0;) {
          378                 herodotus_reader_push_advance_limit(r, nwb);
          379                 for (; (s = herodotus_read_codepoint(r, false, &cp)) ==
          380                        HERODOTUS_STATUS_SUCCESS;) {
          381                         /* check if we have a cased character */
          382                         prop = get_case_property(cp);
          383                         if (prop == CASE_PROP_CASED ||
          384                             prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
          385                                 break;
          386                         } else {
          387                                 /* increment reader */
          388                                 herodotus_read_codepoint(r, true, &cp);
          389                         }
          390                 }
          391 
          392                 if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
          393                         /* we are done */
          394                         break;
          395                 } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
          396                         /*
          397                          * we did not encounter any cased character
          398                          * up to the word break
          399                          */
          400                         herodotus_reader_pop_limit(r);
          401                         continue;
          402                 } else {
          403                         /*
          404                          * we encountered a cased character before the word
          405                          * break, check if it's titlecase
          406                          */
          407                         herodotus_reader_push_advance_limit(
          408                                 r, herodotus_reader_next_codepoint_break(r));
          409                         if (!is_case(r, title_major, title_minor, title_special,
          410                                      NULL)) {
          411                                 ret = false;
          412                                 goto done;
          413                         }
          414                         herodotus_reader_pop_limit(r);
          415                 }
          416 
          417                 /* check if the rest of the codepoints in the word are lowercase
          418                  */
          419                 if (!is_case(r, lower_major, lower_minor, lower_special,
          420                              NULL)) {
          421                         ret = false;
          422                         goto done;
          423                 }
          424 
          425                 /* remove the limit on the word before the next iteration */
          426                 herodotus_reader_pop_limit(r);
          427         }
          428 done:
          429         if (output) {
          430                 *output = herodotus_reader_number_read(r);
          431         }
          432         return ret;
          433 }
          434 
          435 bool
          436 grapheme_is_uppercase(const uint_least32_t *src, size_t srclen, size_t *caselen)
          437 {
          438         HERODOTUS_READER r;
          439 
          440         herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
          441 
          442         return is_case(&r, upper_major, upper_minor, upper_special, caselen);
          443 }
          444 
          445 bool
          446 grapheme_is_lowercase(const uint_least32_t *src, size_t srclen, size_t *caselen)
          447 {
          448         HERODOTUS_READER r;
          449 
          450         herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
          451 
          452         return is_case(&r, lower_major, lower_minor, lower_special, caselen);
          453 }
          454 
          455 bool
          456 grapheme_is_titlecase(const uint_least32_t *src, size_t srclen, size_t *caselen)
          457 {
          458         HERODOTUS_READER r;
          459 
          460         herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
          461 
          462         return is_titlecase(&r, caselen);
          463 }
          464 
          465 bool
          466 grapheme_is_uppercase_utf8(const char *src, size_t srclen, size_t *caselen)
          467 {
          468         HERODOTUS_READER r;
          469 
          470         herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
          471 
          472         return is_case(&r, upper_major, upper_minor, upper_special, caselen);
          473 }
          474 
          475 bool
          476 grapheme_is_lowercase_utf8(const char *src, size_t srclen, size_t *caselen)
          477 {
          478         HERODOTUS_READER r;
          479 
          480         herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
          481 
          482         return is_case(&r, lower_major, lower_minor, lower_special, caselen);
          483 }
          484 
          485 bool
          486 grapheme_is_titlecase_utf8(const char *src, size_t srclen, size_t *caselen)
          487 {
          488         HERODOTUS_READER r;
          489 
          490         herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
          491 
          492         return is_titlecase(&r, caselen);
          493 }