util.c - libgrapheme - unicode string library
 (HTM) git clone git://git.suckless.org/libgrapheme
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       util.c (11480B)
       ---
            1 /* See LICENSE file for copyright and license details. */
            2 #include <limits.h>
            3 #include <stdbool.h>
            4 #include <stddef.h>
            5 #include <stdint.h>
            6 
            7 #include "../gen/types.h"
            8 #include "../grapheme.h"
            9 #include "util.h"
           10 
           11 void
           12 herodotus_reader_init(HERODOTUS_READER *r, enum herodotus_type type,
           13                       const void *src, size_t srclen)
           14 {
           15         size_t i;
           16 
           17         r->type = type;
           18         r->src = src;
           19         r->srclen = srclen;
           20         r->off = 0;
           21         r->terminated_by_null = false;
           22 
           23         for (i = 0; i < LEN(r->soft_limit); i++) {
           24                 r->soft_limit[i] = SIZE_MAX;
           25         }
           26 }
           27 
           28 void
           29 herodotus_reader_copy(const HERODOTUS_READER *src, HERODOTUS_READER *dest)
           30 {
           31         size_t i;
           32 
           33         /*
           34          * we copy such that we have a "fresh" start and build on the
           35          * fact that src->soft_limit[i] for any i and src->srclen are
           36          * always larger or equal to src->off
           37          */
           38         dest->type = src->type;
           39         if (src->type == HERODOTUS_TYPE_CODEPOINT) {
           40                 dest->src =
           41                         (src->src == NULL) ?
           42                                 NULL :
           43                                 ((const uint_least32_t *)(src->src)) + src->off;
           44         } else { /* src->type == HERODOTUS_TYPE_UTF8 */
           45                 dest->src = (src->src == NULL) ?
           46                                     NULL :
           47                                     ((const char *)(src->src)) + src->off;
           48         }
           49         if (src->srclen == SIZE_MAX) {
           50                 dest->srclen = SIZE_MAX;
           51         } else {
           52                 dest->srclen =
           53                         (src->off < src->srclen) ? src->srclen - src->off : 0;
           54         }
           55         dest->off = 0;
           56         dest->terminated_by_null = src->terminated_by_null;
           57 
           58         for (i = 0; i < LEN(src->soft_limit); i++) {
           59                 if (src->soft_limit[i] == SIZE_MAX) {
           60                         dest->soft_limit[i] = SIZE_MAX;
           61                 } else {
           62                         /*
           63                          * if we have a degenerate case where the offset is
           64                          * higher than the soft-limit, we simply clamp the
           65                          * soft-limit to zero given we can't decide here
           66                          * to release the limit and, instead, we just
           67                          * prevent any more reads
           68                          */
           69                         dest->soft_limit[i] =
           70                                 (src->off < src->soft_limit[i]) ?
           71                                         src->soft_limit[i] - src->off :
           72                                         0;
           73                 }
           74         }
           75 }
           76 
           77 void
           78 herodotus_reader_push_advance_limit(HERODOTUS_READER *r, size_t count)
           79 {
           80         size_t i;
           81 
           82         for (i = LEN(r->soft_limit) - 1; i >= 1; i--) {
           83                 r->soft_limit[i] = r->soft_limit[i - 1];
           84         }
           85         r->soft_limit[0] = r->off + count;
           86 }
           87 
           88 void
           89 herodotus_reader_pop_limit(HERODOTUS_READER *r)
           90 {
           91         size_t i;
           92 
           93         for (i = 0; i < LEN(r->soft_limit) - 1; i++) {
           94                 r->soft_limit[i] = r->soft_limit[i + 1];
           95         }
           96         r->soft_limit[LEN(r->soft_limit) - 1] = SIZE_MAX;
           97 }
           98 
           99 size_t
          100 herodotus_reader_next_word_break(const HERODOTUS_READER *r)
          101 {
          102         if (r->type == HERODOTUS_TYPE_CODEPOINT) {
          103                 return grapheme_next_word_break(
          104                         (const uint_least32_t *)(r->src) + r->off,
          105                         MIN(r->srclen, r->soft_limit[0]) - r->off);
          106         } else { /* r->type == HERODOTUS_TYPE_UTF8 */
          107                 return grapheme_next_word_break_utf8(
          108                         (const char *)(r->src) + r->off,
          109                         MIN(r->srclen, r->soft_limit[0]) - r->off);
          110         }
          111 }
          112 
          113 size_t
          114 herodotus_reader_next_codepoint_break(const HERODOTUS_READER *r)
          115 {
          116         if (r->type == HERODOTUS_TYPE_CODEPOINT) {
          117                 return (r->off < MIN(r->srclen, r->soft_limit[0])) ? 1 : 0;
          118         } else { /* r->type == HERODOTUS_TYPE_UTF8 */
          119                 return grapheme_decode_utf8(
          120                         (const char *)(r->src) + r->off,
          121                         MIN(r->srclen, r->soft_limit[0]) - r->off, NULL);
          122         }
          123 }
          124 
          125 size_t
          126 herodotus_reader_number_read(const HERODOTUS_READER *r)
          127 {
          128         return r->off;
          129 }
          130 
          131 enum herodotus_status
          132 herodotus_read_codepoint(HERODOTUS_READER *r, bool advance, uint_least32_t *cp)
          133 {
          134         size_t ret;
          135 
          136         if (r->terminated_by_null || r->off >= r->srclen || r->src == NULL) {
          137                 *cp = GRAPHEME_INVALID_CODEPOINT;
          138                 return HERODOTUS_STATUS_END_OF_BUFFER;
          139         }
          140 
          141         if (r->off >= r->soft_limit[0]) {
          142                 *cp = GRAPHEME_INVALID_CODEPOINT;
          143                 return HERODOTUS_STATUS_SOFT_LIMIT_REACHED;
          144         }
          145 
          146         if (r->type == HERODOTUS_TYPE_CODEPOINT) {
          147                 *cp = ((const uint_least32_t *)(r->src))[r->off];
          148                 ret = 1;
          149         } else { /* r->type == HERODOTUS_TYPE_UTF8 */
          150                 ret = grapheme_decode_utf8(
          151                         (const char *)r->src + r->off,
          152                         MIN(r->srclen, r->soft_limit[0]) - r->off, cp);
          153         }
          154 
          155         if (unlikely(r->srclen == SIZE_MAX && *cp == 0)) {
          156                 /*
          157                  * We encountered a null-codepoint. Don't increment
          158                  * offset and return as if the buffer had ended here all
          159                  * along
          160                  */
          161                 r->terminated_by_null = true;
          162                 return HERODOTUS_STATUS_END_OF_BUFFER;
          163         }
          164 
          165         if (r->off + ret > MIN(r->srclen, r->soft_limit[0])) {
          166                 /*
          167                  * we want more than we have; instead of returning
          168                  * garbage we terminate here.
          169                  */
          170                 return HERODOTUS_STATUS_END_OF_BUFFER;
          171         }
          172 
          173         /*
          174          * Increase offset which we now know won't surpass the limits,
          175          * unless we got told otherwise
          176          */
          177         if (advance) {
          178                 r->off += ret;
          179         }
          180 
          181         return HERODOTUS_STATUS_SUCCESS;
          182 }
          183 
          184 void
          185 herodotus_writer_init(HERODOTUS_WRITER *w, enum herodotus_type type, void *dest,
          186                       size_t destlen)
          187 {
          188         w->type = type;
          189         w->dest = dest;
          190         w->destlen = destlen;
          191         w->off = 0;
          192         w->first_unwritable_offset = SIZE_MAX;
          193 }
          194 
          195 void
          196 herodotus_writer_nul_terminate(HERODOTUS_WRITER *w)
          197 {
          198         if (w->dest == NULL) {
          199                 return;
          200         }
          201 
          202         if (w->off < w->destlen) {
          203                 /* We still have space in the buffer. Simply use it */
          204                 if (w->type == HERODOTUS_TYPE_CODEPOINT) {
          205                         ((uint_least32_t *)(w->dest))[w->off] = 0;
          206                 } else { /* w->type == HERODOTUS_TYPE_UTF8 */
          207                         ((char *)(w->dest))[w->off] = '\0';
          208                 }
          209         } else if (w->first_unwritable_offset < w->destlen) {
          210                 /*
          211                  * There is no more space in the buffer. However,
          212                  * we have noted down the first offset we couldn't
          213                  * use to write into the buffer and it's smaller than
          214                  * destlen. Thus we bailed writing into the
          215                  * destination when a multibyte-codepoint couldn't be
          216                  * written. So the last "real" byte might be at
          217                  * destlen-4, destlen-3, destlen-2 or destlen-1
          218                  * (the last case meaning truncation).
          219                  */
          220                 if (w->type == HERODOTUS_TYPE_CODEPOINT) {
          221                         ((uint_least32_t
          222                                   *)(w->dest))[w->first_unwritable_offset] = 0;
          223                 } else { /* w->type == HERODOTUS_TYPE_UTF8 */
          224                         ((char *)(w->dest))[w->first_unwritable_offset] = '\0';
          225                 }
          226         } else if (w->destlen > 0) {
          227                 /*
          228                  * In this case, there is no more space in the buffer and
          229                  * the last unwritable offset is larger than
          230                  * or equal to the destination buffer length. This means
          231                  * that we are forced to simply write into the last
          232                  * byte.
          233                  */
          234                 if (w->type == HERODOTUS_TYPE_CODEPOINT) {
          235                         ((uint_least32_t *)(w->dest))[w->destlen - 1] = 0;
          236                 } else { /* w->type == HERODOTUS_TYPE_UTF8 */
          237                         ((char *)(w->dest))[w->destlen - 1] = '\0';
          238                 }
          239         }
          240 
          241         /* w->off is not incremented in any case */
          242 }
          243 
          244 size_t
          245 herodotus_writer_number_written(const HERODOTUS_WRITER *w)
          246 {
          247         return w->off;
          248 }
          249 
          250 void
          251 herodotus_write_codepoint(HERODOTUS_WRITER *w, uint_least32_t cp)
          252 {
          253         size_t ret;
          254 
          255         /*
          256          * This function will always faithfully say how many codepoints
          257          * were written, even if the buffer ends. This is used to enable
          258          * truncation detection.
          259          */
          260         if (w->type == HERODOTUS_TYPE_CODEPOINT) {
          261                 if (w->dest != NULL && w->off < w->destlen) {
          262                         ((uint_least32_t *)(w->dest))[w->off] = cp;
          263                 }
          264 
          265                 w->off += 1;
          266         } else { /* w->type == HERODOTUS_TYPE_UTF8 */
          267                 /*
          268                  * First determine how many bytes we need to encode the
          269                  * codepoint
          270                  */
          271                 ret = grapheme_encode_utf8(cp, NULL, 0);
          272 
          273                 if (w->dest != NULL && w->off + ret < w->destlen) {
          274                         /* we still have enough room in the buffer */
          275                         grapheme_encode_utf8(cp, (char *)(w->dest) + w->off,
          276                                              w->destlen - w->off);
          277                 } else if (w->first_unwritable_offset == SIZE_MAX) {
          278                         /*
          279                          * the first unwritable offset has not been
          280                          * noted down, so this is the first time we can't
          281                          * write (completely) to an offset
          282                          */
          283                         w->first_unwritable_offset = w->off;
          284                 }
          285 
          286                 w->off += ret;
          287         }
          288 }
          289 
          290 void
          291 proper_init(const HERODOTUS_READER *r, void *state, uint_least8_t no_prop,
          292             uint_least8_t (*get_break_prop)(uint_least32_t),
          293             bool (*is_skippable_prop)(uint_least8_t),
          294             void (*skip_shift_callback)(uint_least8_t, void *),
          295             struct proper *p)
          296 {
          297         uint_least8_t prop;
          298         uint_least32_t cp;
          299         size_t i;
          300 
          301         /* set internal variables */
          302         p->state = state;
          303         p->no_prop = no_prop;
          304         p->get_break_prop = get_break_prop;
          305         p->is_skippable_prop = is_skippable_prop;
          306         p->skip_shift_callback = skip_shift_callback;
          307 
          308         /*
          309          * Initialize mid-reader, which is basically just there
          310          * to reflect the current position of the viewing-line
          311          */
          312         herodotus_reader_copy(r, &(p->mid_reader));
          313 
          314         /*
          315          * In the initialization, we simply (try to) fill in next_prop.
          316          * If we cannot read in more (due to the buffer ending), we
          317          * fill in the prop as invalid
          318          */
          319 
          320         /*
          321          * initialize the previous properties to have no property
          322          * (given we are at the start of the buffer)
          323          */
          324         p->raw.prev_prop[1] = p->raw.prev_prop[0] = p->no_prop;
          325         p->skip.prev_prop[1] = p->skip.prev_prop[0] = p->no_prop;
          326 
          327         /*
          328          * initialize the next properties
          329          */
          330 
          331         /* initialize the raw reader */
          332         herodotus_reader_copy(r, &(p->raw_reader));
          333 
          334         /* fill in the two next raw properties (after no-initialization) */
          335         p->raw.next_prop[0] = p->raw.next_prop[1] = p->no_prop;
          336         for (i = 0;
          337              i < 2 && herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
          338                               HERODOTUS_STATUS_SUCCESS;) {
          339                 p->raw.next_prop[i++] = p->get_break_prop(cp);
          340         }
          341 
          342         /* initialize the skip reader */
          343         herodotus_reader_copy(r, &(p->skip_reader));
          344 
          345         /* fill in the two next skip properties (after no-initialization) */
          346         p->skip.next_prop[0] = p->skip.next_prop[1] = p->no_prop;
          347         for (i = 0;
          348              i < 2 && herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
          349                               HERODOTUS_STATUS_SUCCESS;) {
          350                 prop = p->get_break_prop(cp);
          351                 if (!p->is_skippable_prop(prop)) {
          352                         p->skip.next_prop[i++] = prop;
          353                 }
          354         }
          355 }
          356 
          357 int
          358 proper_advance(struct proper *p)
          359 {
          360         uint_least8_t prop;
          361         uint_least32_t cp;
          362 
          363         /* read in next "raw" property */
          364         if (herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
          365             HERODOTUS_STATUS_SUCCESS) {
          366                 prop = p->get_break_prop(cp);
          367         } else {
          368                 prop = p->no_prop;
          369         }
          370 
          371         /*
          372          * do a shift-in, unless we find that the property that is to
          373          * be moved past the "raw-viewing-line" (this property is stored
          374          * in p->raw.next_prop[0]) is a no_prop, indicating that
          375          * we are at the end of the buffer.
          376          */
          377         if (p->raw.next_prop[0] == p->no_prop) {
          378                 return 1;
          379         }
          380 
          381         /* shift in the properties */
          382         p->raw.prev_prop[1] = p->raw.prev_prop[0];
          383         p->raw.prev_prop[0] = p->raw.next_prop[0];
          384         p->raw.next_prop[0] = p->raw.next_prop[1];
          385         p->raw.next_prop[1] = prop;
          386 
          387         /* advance the middle reader viewing-line */
          388         (void)herodotus_read_codepoint(&(p->mid_reader), true, &cp);
          389 
          390         /* check skippability-property */
          391         if (!p->is_skippable_prop(p->raw.prev_prop[0])) {
          392                 /*
          393                  * the property that has moved past the "raw-viewing-line"
          394                  * (this property is now (after the raw-shift) stored in
          395                  * p->raw.prev_prop[0] and guaranteed not to be a no-prop,
          396                  * guaranteeing that we won't shift a no-prop past the
          397                  * "viewing-line" in the skip-properties) is not a skippable
          398                  * property, thus we need to shift the skip property as well.
          399                  */
          400                 p->skip.prev_prop[1] = p->skip.prev_prop[0];
          401                 p->skip.prev_prop[0] = p->skip.next_prop[0];
          402                 p->skip.next_prop[0] = p->skip.next_prop[1];
          403 
          404                 /*
          405                  * call the skip-shift-callback on the property that
          406                  * passed the skip-viewing-line (this property is now
          407                  * stored in p->skip.prev_prop[0]).
          408                  */
          409                 p->skip_shift_callback(p->skip.prev_prop[0], p->state);
          410 
          411                 /* determine the next shift property */
          412                 p->skip.next_prop[1] = p->no_prop;
          413                 while (herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
          414                        HERODOTUS_STATUS_SUCCESS) {
          415                         prop = p->get_break_prop(cp);
          416                         if (!p->is_skippable_prop(prop)) {
          417                                 p->skip.next_prop[1] = prop;
          418                                 break;
          419                         }
          420                 }
          421         }
          422 
          423         return 0;
          424 }