Update grapheme break algorithm to Unicode version 15.1.0 - libgrapheme - unicode string library
 (HTM) git clone git://git.suckless.org/libgrapheme
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 65b354f0fcb1d925f4340dbb4415ea06e8af2bec
 (DIR) parent 3ee106e4ab1d5fe4696ab9089f052706d7cb9a48
 (HTM) Author: Laslo Hunhold <dev@frign.de>
       Date:   Sun,  1 Sep 2024 22:42:18 +0200
       
       Update grapheme break algorithm to Unicode version 15.1.0
       
       While the change to the algorithm looks harmless in the specification,
       it comes at the price of more complexity because we have to keep track
       of a relatively complex state for a sequence of Indic conjunct breaks.
       
       Fortunately, adding so many additional classes only decreases the
       compression ratio for the grapheme cluster LUTs by ~0.5%.
       
       We now pass all 1187 character tests.
       
       Signed-off-by: Laslo Hunhold <dev@frign.de>
       
       Diffstat:
         M Makefile                            |       4 ++--
         M gen/character.c                     |     104 ++++++++++++++++++++++++++++++-
         M gen/util.c                          |       5 ++++-
         M gen/util.h                          |       1 +
         M src/character.c                     |     376 +++++++++++++++++++++++++------
       
       5 files changed, 409 insertions(+), 81 deletions(-)
       ---
 (DIR) diff --git a/Makefile b/Makefile
       @@ -196,7 +196,7 @@ src/sentence.o: src/sentence.c Makefile config.mk gen/sentence.h grapheme.h src/
        src/utf8.o: src/utf8.c Makefile config.mk grapheme.h
        src/util.o: src/util.c Makefile config.mk gen/types.h grapheme.h src/util.h
        src/word.o: src/word.c Makefile config.mk gen/word.h grapheme.h src/util.h
       -test/bidirectional.o: test/bidirectional.c Makefile config.mk gen/bidirectional-test.h grapheme.h test/util.h
       +test/bidirectional.o: test/bidirectional.c Makefile config.mk gen/bidirectional.h gen/bidirectional-test.h grapheme.h test/util.h
        test/case.o: test/case.c Makefile config.mk grapheme.h test/util.h
        test/character.o: test/character.c Makefile config.mk gen/character-test.h grapheme.h test/util.h
        test/line.o: test/line.c Makefile config.mk gen/line-test.h grapheme.h test/util.h
       @@ -236,7 +236,7 @@ test/word$(BINSUFFIX): test/word.o test/util.o $(ANAME)
        gen/bidirectional.h: data/BidiBrackets.txt data/BidiMirroring.txt data/DerivedBidiClass.txt data/UnicodeData.txt gen/bidirectional$(BINSUFFIX)
        gen/bidirectional-test.h: data/BidiCharacterTest.txt data/BidiTest.txt gen/bidirectional-test$(BINSUFFIX)
        gen/case.h: data/DerivedCoreProperties.txt data/UnicodeData.txt data/SpecialCasing.txt gen/case$(BINSUFFIX)
       -gen/character.h: data/emoji-data.txt data/GraphemeBreakProperty.txt gen/character$(BINSUFFIX)
       +gen/character.h: data/DerivedCoreProperties.txt data/emoji-data.txt data/GraphemeBreakProperty.txt gen/character$(BINSUFFIX)
        gen/character-test.h: data/GraphemeBreakTest.txt gen/character-test$(BINSUFFIX)
        gen/line.h: data/emoji-data.txt data/EastAsianWidth.txt data/LineBreak.txt gen/line$(BINSUFFIX)
        gen/line-test.h: data/LineBreakTest.txt gen/line-test$(BINSUFFIX)
 (DIR) diff --git a/gen/character.c b/gen/character.c
       @@ -1,8 +1,12 @@
        /* See LICENSE file for copyright and license details. */
        #include <stddef.h>
       +#include <stdio.h>
       +#include <stdlib.h>
       +#include <string.h>
        
        #include "util.h"
        
       +#define FILE_DCP      "data/DerivedCoreProperties.txt"
        #define FILE_EMOJI    "data/emoji-data.txt"
        #define FILE_GRAPHEME "data/GraphemeBreakProperty.txt"
        
       @@ -13,6 +17,21 @@ static const struct property_spec char_break_property[] = {
                        .ucdname = NULL,
                },
                {
       +                .enumname = "BOTH_EXTEND_ICB_EXTEND",
       +                .file = NULL,
       +                .ucdname = NULL,
       +        },
       +        {
       +                .enumname = "BOTH_EXTEND_ICB_LINKER",
       +                .file = NULL,
       +                .ucdname = NULL,
       +        },
       +        {
       +                .enumname = "BOTH_ZWJ_ICB_EXTEND",
       +                .file = NULL,
       +                .ucdname = NULL,
       +        },
       +        {
                        .enumname = "CONTROL",
                        .file = FILE_GRAPHEME,
                        .ucdname = "Control",
       @@ -58,6 +77,24 @@ static const struct property_spec char_break_property[] = {
                        .ucdname = "LVT",
                },
                {
       +                .enumname = "ICB_CONSONANT",
       +                .file = FILE_DCP,
       +                .ucdname = "InCB",
       +                .ucdsubname = "Consonant",
       +        },
       +        {
       +                .enumname = "ICB_EXTEND",
       +                .file = FILE_DCP,
       +                .ucdname = "InCB",
       +                .ucdsubname = "Extend",
       +        },
       +        {
       +                .enumname = "ICB_LINKER",
       +                .file = FILE_DCP,
       +                .ucdname = "InCB",
       +                .ucdsubname = "Linker",
       +        },
       +        {
                        .enumname = "LF",
                        .file = FILE_GRAPHEME,
                        .ucdname = "LF",
       @@ -84,14 +121,75 @@ static const struct property_spec char_break_property[] = {
                },
        };
        
       +static uint_least8_t
       +handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2)
       +{
       +        uint_least8_t result;
       +
       +        (void)cp;
       +
       +        if ((!strcmp(char_break_property[prop1].enumname, "EXTEND") &&
       +             !strcmp(char_break_property[prop2].enumname, "ICB_EXTEND")) ||
       +            (!strcmp(char_break_property[prop1].enumname, "ICB_EXTEND") &&
       +             !strcmp(char_break_property[prop2].enumname, "EXTEND"))) {
       +                for (result = 0; result < LEN(char_break_property); result++) {
       +                        if (!strcmp(char_break_property[result].enumname,
       +                                    "BOTH_EXTEND_ICB_EXTEND")) {
       +                                break;
       +                        }
       +                }
       +                if (result == LEN(char_break_property)) {
       +                        fprintf(stderr, "handle_conflict: Internal error.\n");
       +                        exit(1);
       +                }
       +        } else if ((!strcmp(char_break_property[prop1].enumname, "EXTEND") &&
       +                    !strcmp(char_break_property[prop2].enumname,
       +                            "ICB_LINKER")) ||
       +                   (!strcmp(char_break_property[prop1].enumname,
       +                            "ICB_LINKER") &&
       +                    !strcmp(char_break_property[prop2].enumname, "EXTEND"))) {
       +                for (result = 0; result < LEN(char_break_property); result++) {
       +                        if (!strcmp(char_break_property[result].enumname,
       +                                    "BOTH_EXTEND_ICB_LINKER")) {
       +                                break;
       +                        }
       +                }
       +                if (result == LEN(char_break_property)) {
       +                        fprintf(stderr, "handle_conflict: Internal error.\n");
       +                        exit(1);
       +                }
       +        } else if ((!strcmp(char_break_property[prop1].enumname, "ZWJ") &&
       +                    !strcmp(char_break_property[prop2].enumname,
       +                            "ICB_EXTEND")) ||
       +                   (!strcmp(char_break_property[prop1].enumname,
       +                            "ICB_EXTEND") &&
       +                    !strcmp(char_break_property[prop2].enumname, "ZWJ"))) {
       +                for (result = 0; result < LEN(char_break_property); result++) {
       +                        if (!strcmp(char_break_property[result].enumname,
       +                                    "BOTH_ZWJ_ICB_EXTEND")) {
       +                                break;
       +                        }
       +                }
       +                if (result == LEN(char_break_property)) {
       +                        fprintf(stderr, "handle_conflict: Internal error.\n");
       +                        exit(1);
       +                }
       +        } else {
       +                fprintf(stderr, "handle_conflict: Cannot handle conflict.\n");
       +                exit(1);
       +        }
       +
       +        return result;
       +}
       +
        int
        main(int argc, char *argv[])
        {
                (void)argc;
        
       -        properties_generate_break_property(char_break_property,
       -                                           LEN(char_break_property), NULL, NULL,
       -                                           NULL, "char_break", argv[0]);
       +        properties_generate_break_property(
       +                char_break_property, LEN(char_break_property), NULL,
       +                handle_conflict, NULL, "char_break", argv[0]);
        
                return 0;
        }
 (DIR) diff --git a/gen/util.c b/gen/util.c
       @@ -317,7 +317,10 @@ properties_callback(const char *file, char **field, size_t nfields,
                             (comment != NULL &&
                              !strncmp(p->spec[i].ucdname, comment,
                                       strlen(p->spec[i].ucdname)) &&
       -                      comment[strlen(p->spec[i].ucdname)] == ' '))) {
       +                      comment[strlen(p->spec[i].ucdname)] == ' ')) &&
       +                    (p->spec[i].ucdsubname == NULL ||
       +                     (nfields >= 3 &&
       +                      !strcmp(p->spec[i].ucdsubname, field[2])))) {
                                /* parse range in first field */
                                if (range_parse(field[0], &r)) {
                                        return 1;
 (DIR) diff --git a/gen/util.h b/gen/util.h
       @@ -13,6 +13,7 @@ struct property_spec {
                const char *enumname;
                const char *file;
                const char *ucdname;
       +        const char *ucdsubname;
        };
        
        struct properties {
 (DIR) diff --git a/src/character.c b/src/character.c
       @@ -1,3 +1,5 @@
       +#include <stdio.h>
       +
        /* See LICENSE file for copyright and license details. */
        #include <limits.h>
        #include <stdbool.h>
       @@ -12,97 +14,239 @@ struct character_break_state {
                bool prop_set;
                bool gb11_flag;
                bool gb12_13_flag;
       +        uint_least8_t gb9c_level;
        };
        
       -static const uint_least16_t dont_break[NUM_CHAR_BREAK_PROPS] = {
       +static const uint_least32_t dont_break[NUM_CHAR_BREAK_PROPS] = {
                [CHAR_BREAK_PROP_OTHER] =
       -                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |           /* GB9  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |              /* GB9  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK,       /* GB9a */
       -        [CHAR_BREAK_PROP_CR] = UINT16_C(1) << CHAR_BREAK_PROP_LF, /* GB3  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
       +        [CHAR_BREAK_PROP_ICB_CONSONANT] =
       +                UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
       +        [CHAR_BREAK_PROP_ICB_EXTEND] =
       +                UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
       +        [CHAR_BREAK_PROP_ICB_LINKER] =
       +                UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
       +        [CHAR_BREAK_PROP_CR] = UINT32_C(1) << CHAR_BREAK_PROP_LF,    /* GB3  */
                [CHAR_BREAK_PROP_EXTEND] =
       -                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
       +                UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
       +        [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND] =
       +                UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
       +        [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER] =
       +                UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
                [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
       -                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
       +                UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
                [CHAR_BREAK_PROP_HANGUL_L] =
       -                UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_L |   /* GB6  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V |   /* GB6  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LV |  /* GB6  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LVT | /* GB6  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
       +                UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_L |   /* GB6  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V |   /* GB6  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_LV |  /* GB6  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_LVT | /* GB6  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
                [CHAR_BREAK_PROP_HANGUL_V] =
       -                UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V |   /* GB7  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T |   /* GB7  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
       +                UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |   /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
                [CHAR_BREAK_PROP_HANGUL_T] =
       -                UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T |   /* GB8  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
       +                UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |   /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
                [CHAR_BREAK_PROP_HANGUL_LV] =
       -                UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V |   /* GB7  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T |   /* GB7  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
       +                UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |   /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
                [CHAR_BREAK_PROP_HANGUL_LVT] =
       -                UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T |   /* GB8  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
       +                UINT32_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |   /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
                [CHAR_BREAK_PROP_PREPEND] =
       -                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |      /* GB9  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |         /* GB9  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK | /* GB9a */
       -                (UINT16_C(0xFFFF) &
       -                 ~(UINT16_C(1) << CHAR_BREAK_PROP_CR |
       -                   UINT16_C(1) << CHAR_BREAK_PROP_LF |
       -                   UINT16_C(1) << CHAR_BREAK_PROP_CONTROL)), /* GB9b */
       +                UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK |         /* GB9a */
       +                (UINT32_C(0xFFFFFFFF) &
       +                 ~(UINT32_C(1) << CHAR_BREAK_PROP_CR |
       +                   UINT32_C(1) << CHAR_BREAK_PROP_LF |
       +                   UINT32_C(1) << CHAR_BREAK_PROP_CONTROL)), /* GB9b */
                [CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
       -                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
       +                UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
                [CHAR_BREAK_PROP_SPACINGMARK] =
       -                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
       +                UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
                [CHAR_BREAK_PROP_ZWJ] =
       -                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |     /* GB9  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |        /* GB9  */
       -                UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
       +                UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
       +        [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND] =
       +                UINT32_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |  /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |                 /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_SPACINGMARK,          /* GB9a */
       +
        };
       -static const uint_least16_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
       +static const uint_least32_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
                [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
       -                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |
       -                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND,
       +                UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |              /* GB9  */
       +                UINT32_C(1)
       +                        << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND | /* GB9  */
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER, /* GB9 */
                [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
       -                UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
       +                UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
       +        [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] =
       +                UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
                [CHAR_BREAK_PROP_EXTEND + NUM_CHAR_BREAK_PROPS] =
       -                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |
       -                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ,
       +                UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND |
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |
       +                UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND,
       +        [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] =
       +                UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND |
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |
       +                UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND,
       +        [CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER + NUM_CHAR_BREAK_PROPS] =
       +                UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND |
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER |
       +                UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND,
                [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_CHAR_BREAK_PROPS] =
       -                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |
       -                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND,
       +                UINT32_C(1) << CHAR_BREAK_PROP_ZWJ |
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND |
       +                UINT32_C(1) << CHAR_BREAK_PROP_EXTEND |
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND |
       +                UINT32_C(1) << CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER,
        };
       -static const uint_least16_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
       +static const uint_least32_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
                [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
       -                UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
       +                UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
       +        [CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND + NUM_CHAR_BREAK_PROPS] =
       +                UINT32_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
        };
       -static const uint_least16_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
       +static const uint_least32_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
                [CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
       -                UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
       +                UINT32_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
        };
       -static const uint_least16_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
       +static const uint_least32_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
                [CHAR_BREAK_PROP_REGIONAL_INDICATOR + NUM_CHAR_BREAK_PROPS] =
       -                UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
       +                UINT32_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
        };
        
        static inline enum char_break_property
       @@ -126,7 +270,9 @@ state_serialize(const struct character_break_state *in, uint_least16_t *out)
                       (uint_least16_t)(((uint_least16_t)(in->gb11_flag))
                                        << 9) | /* 10th bit */
                       (uint_least16_t)(((uint_least16_t)(in->gb12_13_flag))
       -                                << 10); /* 11th bit */
       +                                << 10) | /* 11th bit */
       +               (uint_least16_t)(((uint_least16_t)(in->gb9c_level & 0x3))
       +                                << 11); /* 12th and 13th bit */
        }
        
        static inline void
       @@ -136,6 +282,7 @@ state_deserialize(uint_least16_t in, struct character_break_state *out)
                out->prop_set = in & (UINT16_C(1) << 8);
                out->gb11_flag = in & (UINT16_C(1) << 9);
                out->gb12_13_flag = in & (UINT16_C(1) << 10);
       +        out->gb9c_level = (uint_least8_t)(in >> 11) & UINT8_C(0x3);
        }
        
        bool
       @@ -164,26 +311,105 @@ grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1,
                        state.gb11_flag =
                                flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS *
                                                                    state.gb11_flag] &
       -                        UINT16_C(1) << cp1_prop;
       +                        UINT32_C(1) << cp1_prop;
                        state.gb12_13_flag =
                                flag_update_gb12_13[cp0_prop +
                                                    NUM_CHAR_BREAK_PROPS *
                                                            state.gb12_13_flag] &
       -                        UINT16_C(1) << cp1_prop;
       +                        UINT32_C(1) << cp1_prop;
       +
       +                /*
       +                 * update GB9c state, which deals with Indic conjunct breaks.
       +                 * We want to detect the following prefix:
       +                 *
       +                 *   ICB_CONSONANT
       +                 *   [ICB_EXTEND ICB_LINKER]*
       +                 *   ICB_LINKER
       +                 *   [ICB_EXTEND ICB_LINKER]*
       +                 *
       +                 * This representation is not ideal: In reality, what is
       +                 * meant is that the prefix is a sequence of [ICB_EXTEND
       +                 * ICB_LINKER]*, following an ICB_CONSONANT, that contains at
       +                 * least one ICB_LINKER. We thus use the following equivalent
       +                 * representation that allows us to store the levels 0..3 in 2
       +                 * bits.
       +                 *
       +                 *   ICB_CONSONANT              -- Level 1
       +                 *   ICB_EXTEND*                -- Level 2
       +                 *   ICB_LINKER                 -- Level 3
       +                 *   [ICB_EXTEND ICB_LINKER]*   -- Level 3
       +                 *
       +                 * The following chain of if-else-blocks is a bit redundant and
       +                 * of course could be optimised, but this is kept as is for
       +                 * best readability.
       +                 */
       +                if (state.gb9c_level == 0 &&
       +                    cp0_prop == CHAR_BREAK_PROP_ICB_CONSONANT) {
       +                        /* the sequence has begun */
       +                        state.gb9c_level = 1;
       +                } else if ((state.gb9c_level == 1 || state.gb9c_level == 2) &&
       +                           (cp0_prop == CHAR_BREAK_PROP_ICB_EXTEND ||
       +                            cp0_prop == CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND ||
       +                            cp0_prop ==
       +                                    CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND)) {
       +                        /*
       +                         * either the level is 1 and thus the ICB consonant is
       +                         * followed by an ICB extend, where we jump
       +                         * to level 2, or we are at level 2 and just witness
       +                         * more ICB extends, staying at level 2.
       +                         */
       +                        state.gb9c_level = 2;
       +                } else if ((state.gb9c_level == 1 || state.gb9c_level == 2) &&
       +                           (cp0_prop == CHAR_BREAK_PROP_ICB_LINKER ||
       +                            cp0_prop ==
       +                                    CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER)) {
       +                        /*
       +                         * witnessing an ICB linker directly lifts us up to
       +                         * level 3
       +                         */
       +                        state.gb9c_level = 3;
       +                } else if (state.gb9c_level == 3 &&
       +                           (cp0_prop == CHAR_BREAK_PROP_ICB_EXTEND ||
       +                            cp0_prop == CHAR_BREAK_PROP_BOTH_ZWJ_ICB_EXTEND ||
       +                            cp0_prop ==
       +                                    CHAR_BREAK_PROP_BOTH_EXTEND_ICB_EXTEND ||
       +                            cp0_prop == CHAR_BREAK_PROP_ICB_LINKER ||
       +                            cp0_prop ==
       +                                    CHAR_BREAK_PROP_BOTH_EXTEND_ICB_LINKER)) {
       +                        /*
       +                         * we stay at level 3 when we observe either ICB
       +                         * extends or linkers
       +                         */
       +                        state.gb9c_level = 3;
       +                } else {
       +                        /*
       +                         * the sequence has collapsed, but it could be
       +                         * that the left property is ICB consonant, which
       +                         * means that we jump right back to level 1 instead
       +                         * of 0
       +                         */
       +                        if (cp0_prop == CHAR_BREAK_PROP_ICB_CONSONANT) {
       +                                state.gb9c_level = 1;
       +                        } else {
       +                                state.gb9c_level = 0;
       +                        }
       +                }
        
                        /*
                         * Apply grapheme cluster breaking algorithm (UAX #29), see
                         * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
                         */
       -                notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
       +                notbreak = (dont_break[cp0_prop] & (UINT32_C(1) << cp1_prop)) ||
       +                           (state.gb9c_level == 3 &&
       +                            cp1_prop == CHAR_BREAK_PROP_ICB_CONSONANT) ||
                                   (dont_break_gb11[cp0_prop +
                                                    state.gb11_flag *
                                                            NUM_CHAR_BREAK_PROPS] &
       -                            (UINT16_C(1) << cp1_prop)) ||
       +                            (UINT32_C(1) << cp1_prop)) ||
                                   (dont_break_gb12_13[cp0_prop +
                                                       state.gb12_13_flag *
                                                               NUM_CHAR_BREAK_PROPS] &
       -                            (UINT16_C(1) << cp1_prop));
       +                            (UINT32_C(1) << cp1_prop));
        
                        /* update or reset flags (when we have a break) */
                        if (likely(!notbreak)) {
       @@ -202,11 +428,11 @@ grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1,
                         * Given we have no state, this behaves as if the state-booleans
                         * were all set to false
                         */
       -                notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
       +                notbreak = (dont_break[cp0_prop] & (UINT32_C(1) << cp1_prop)) ||
                                   (dont_break_gb11[cp0_prop] &
       -                            (UINT16_C(1) << cp1_prop)) ||
       +                            (UINT32_C(1) << cp1_prop)) ||
                                   (dont_break_gb12_13[cp0_prop] &
       -                            (UINT16_C(1) << cp1_prop));
       +                            (UINT32_C(1) << cp1_prop));
                }
        
                return !notbreak;