Merge branch 'master' into bidirectional - libgrapheme - unicode string library
 (HTM) git clone git://git.suckless.org/libgrapheme
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit efb2f452b6d1327ba091ac8a69556a060401afed
 (DIR) parent f2783665bc71b9b1f1b72830629c3724bd8e1ae4
 (HTM) Author: Laslo Hunhold <dev@frign.de>
       Date:   Thu, 13 Oct 2022 23:54:28 +0200
       
       Merge branch 'master' into bidirectional
       
       This brings this branch up to speed with the previous work.
       
       Signed-off-by: Laslo Hunhold <dev@frign.de>
       
       Diffstat:
         M Makefile                            |     297 ++++++++++++++++++++-----------
         M README                              |      62 +++++++++++++++++--------------
         M benchmark/character.c               |       3 ++-
         M config.mk                           |      25 ++++++++++++++++++-------
         A configure                           |      39 +++++++++++++++++++++++++++++++
         M data/DerivedCoreProperties.txt      |     253 +++++++++++++++++++++++++------
         M data/EastAsianWidth.txt             |      74 ++++++++++++++++++++++---------
         M data/GraphemeBreakProperty.txt      |      38 ++++++++++++++++++++++---------
         M data/GraphemeBreakTest.txt          |      10 +++++-----
         M data/LineBreak.txt                  |     105 +++++++++++++++++++++----------
         M data/LineBreakTest.txt              |      10 +++++-----
         M data/SentenceBreakProperty.txt      |      66 ++++++++++++++++++++++---------
         M data/SentenceBreakTest.txt          |      10 +++++-----
         M data/SpecialCasing.txt              |      10 +++++-----
         M data/UnicodeData.txt                |     300 ++++++++++++++++++++++++++++++-
         M data/WordBreakProperty.txt          |      53 +++++++++++++++++++++++--------
         M data/WordBreakTest.txt              |      10 +++++-----
         M data/emoji-data.txt                 |      85 ++++++++++++++++++++-----------
         M gen/case.c                          |      21 +++++++++++++++++++--
         M gen/util.c                          |      16 +++++++++++-----
         M grapheme.h                          |      49 +++++++++++++------------------
         D man/grapheme_decode_utf8.3          |     101 -------------------------------
         A man/grapheme_decode_utf8.sh         |     102 +++++++++++++++++++++++++++++++
         D man/grapheme_encode_utf8.3          |      98 -------------------------------
         A man/grapheme_encode_utf8.sh         |     103 +++++++++++++++++++++++++++++++
         D man/grapheme_is_character_break.3   |      80 -------------------------------
         A man/grapheme_is_character_break.sh  |      83 +++++++++++++++++++++++++++++++
         A man/grapheme_is_lowercase.sh        |       3 +++
         A man/grapheme_is_lowercase_utf8.sh   |       3 +++
         A man/grapheme_is_titlecase.sh        |       3 +++
         A man/grapheme_is_titlecase_utf8.sh   |       3 +++
         A man/grapheme_is_uppercase.sh        |       3 +++
         A man/grapheme_is_uppercase_utf8.sh   |       3 +++
         A man/grapheme_next_character_break.… |       4 ++++
         D man/grapheme_next_character_break_… |      92 -------------------------------
         A man/grapheme_next_character_break_… |       4 ++++
         A man/grapheme_next_line_break.sh     |       4 ++++
         A man/grapheme_next_line_break_utf8.… |       4 ++++
         A man/grapheme_next_sentence_break.sh |       4 ++++
         A man/grapheme_next_sentence_break_u… |       4 ++++
         A man/grapheme_next_word_break.sh     |       4 ++++
         A man/grapheme_next_word_break_utf8.… |       4 ++++
         A man/grapheme_to_lowercase.sh        |       3 +++
         A man/grapheme_to_lowercase_utf8.sh   |       3 +++
         A man/grapheme_to_titlecase.sh        |       3 +++
         A man/grapheme_to_titlecase_utf8.sh   |       3 +++
         A man/grapheme_to_uppercase.sh        |       3 +++
         A man/grapheme_to_uppercase_utf8.sh   |       3 +++
         D man/libgrapheme.7                   |     140 -------------------------------
         A man/libgrapheme.sh                  |     167 +++++++++++++++++++++++++++++++
         A man/template/is_case.sh             |      67 +++++++++++++++++++++++++++++++
         A man/template/next_break.sh          |     112 +++++++++++++++++++++++++++++++
         A man/template/to_case.sh             |      72 +++++++++++++++++++++++++++++++
         M src/case.c                          |     477 ++++++++++++++-----------------
         M src/character.c                     |     261 ++++++++++++++++---------------
         M src/line.c                          |      59 +++++++++++++------------------
         M src/sentence.c                      |     426 +++++++++++++------------------
         M src/utf8.c                          |       3 ++-
         M src/util.c                          |     421 ++++++++++++++++++++++++++++---
         M src/util.h                          |      90 +++++++++++++++++++++++++++++--
         M src/word.c                          |     365 ++++++++++++-------------------
         A test/case.c                         |     580 +++++++++++++++++++++++++++++++
         M test/character.c                    |     113 ++++++++++++++++++++++++++++++-
         M test/line.c                         |     112 ++++++++++++++++++++++++++++++-
         M test/sentence.c                     |     112 ++++++++++++++++++++++++++++++-
         M test/utf8-decode.c                  |       2 +-
         M test/utf8-encode.c                  |       2 +-
         M test/util.c                         |      64 ++++++++++++++++++++++++++++++-
         M test/util.h                         |      35 +++++++++++++++++++++++++++++++
         M test/word.c                         |     112 ++++++++++++++++++++++++++++++-
       
       70 files changed, 4228 insertions(+), 1827 deletions(-)
       ---
 (DIR) diff --git a/Makefile b/Makefile
       @@ -1,9 +1,18 @@
        # See LICENSE file for copyright and license details
        # libgrapheme - unicode string library
        .POSIX:
       +.SUFFIXES:
       +
       +VERSION_MAJOR = 2
       +VERSION_MINOR = 0
       +VERSION_PATCH = 1
       +UNICODE_VERSION = 15.0.0
       +MAN_DATE = 2022-10-06
        
        include config.mk
        
       +VERSION = $(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH)
       +
        BENCHMARK =\
                benchmark/case\
                benchmark/character\
       @@ -52,6 +61,7 @@ SRC =\
                src/word\
        
        TEST =\
       +        test/case\
                test/character\
                test/line\
                test/sentence\
       @@ -59,48 +69,120 @@ TEST =\
                test/utf8-encode\
                test/word\
        
       +MAN_TEMPLATE =\
       +        man/template/is_case.sh\
       +        man/template/next_break.sh\
       +        man/template/to_case.sh\
       +
        MAN3 =\
       -        man/grapheme_decode_utf8.3\
       -        man/grapheme_encode_utf8.3\
       -        man/grapheme_is_character_break.3\
       -
       -MAN7 = man/libgrapheme.7
       -
       -all: data/LICENSE libgrapheme.a libgrapheme.so
       -
       -benchmark/case.o: benchmark/case.c config.mk gen/word-test.h grapheme.h benchmark/util.h
       -benchmark/character.o: benchmark/character.c config.mk gen/character-test.h grapheme.h benchmark/util.h
       -benchmark/line.o: benchmark/line.c config.mk gen/line-test.h grapheme.h benchmark/util.h
       -benchmark/utf8-decode.o: benchmark/utf8-decode.c config.mk gen/character-test.h grapheme.h benchmark/util.h
       -benchmark/sentence.o: benchmark/sentence.c config.mk gen/sentence-test.h grapheme.h benchmark/util.h
       -benchmark/util.o: benchmark/util.c config.mk benchmark/util.h
       -benchmark/word.o: benchmark/word.c config.mk gen/word-test.h grapheme.h benchmark/util.h
       -gen/bidirectional.o: gen/bidirectional.c config.mk gen/util.h
       -gen/case.o: gen/case.c config.mk gen/util.h
       -gen/character.o: gen/character.c config.mk gen/util.h
       -gen/character-test.o: gen/character-test.c config.mk gen/util.h
       -gen/line.o: gen/line.c config.mk gen/util.h
       -gen/line-test.o: gen/line-test.c config.mk gen/util.h
       -gen/sentence.o: gen/sentence.c config.mk gen/util.h
       -gen/sentence-test.o: gen/sentence-test.c config.mk gen/util.h
       -gen/word.o: gen/word.c config.mk gen/util.h
       -gen/word-test.o: gen/word-test.c config.mk gen/util.h
       -gen/util.o: gen/util.c config.mk gen/util.h
       -src/bidirectional.o: src/bidirectional.c config.mk gen/bidirectional.h grapheme.h src/util.h
       -src/case.o: src/case.c config.mk gen/case.h grapheme.h src/util.h
       -src/character.o: src/character.c config.mk gen/character.h grapheme.h src/util.h
       -src/line.o: src/line.c config.mk gen/line.h grapheme.h src/util.h
       -src/sentence.o: src/sentence.c config.mk gen/sentence.h grapheme.h src/util.h
       -src/utf8.o: src/utf8.c config.mk grapheme.h
       -src/util.o: src/util.c config.mk gen/types.h grapheme.h src/util.h
       -src/word.o: src/word.c config.mk gen/word.h grapheme.h src/util.h
       -test/character.o: test/character.c config.mk gen/character-test.h grapheme.h test/util.h
       -test/line.o: test/line.c config.mk gen/line-test.h grapheme.h test/util.h
       -test/sentence.o: test/sentence.c config.mk gen/sentence-test.h grapheme.h test/util.h
       -test/utf8-encode.o: test/utf8-encode.c config.mk grapheme.h test/util.h
       -test/utf8-decode.o: test/utf8-decode.c config.mk grapheme.h test/util.h
       -test/util.o: test/util.c config.mk test/util.h
       -test/word.o: test/word.c config.mk gen/word-test.h grapheme.h test/util.h
       +        man/grapheme_decode_utf8\
       +        man/grapheme_encode_utf8\
       +        man/grapheme_is_character_break\
       +        man/grapheme_is_uppercase\
       +        man/grapheme_is_uppercase_utf8\
       +        man/grapheme_is_lowercase\
       +        man/grapheme_is_lowercase_utf8\
       +        man/grapheme_is_titlecase\
       +        man/grapheme_is_titlecase_utf8\
       +        man/grapheme_next_character_break\
       +        man/grapheme_next_line_break\
       +        man/grapheme_next_sentence_break\
       +        man/grapheme_next_word_break\
       +        man/grapheme_next_character_break_utf8\
       +        man/grapheme_next_line_break_utf8\
       +        man/grapheme_next_sentence_break_utf8\
       +        man/grapheme_next_word_break_utf8\
       +        man/grapheme_to_uppercase\
       +        man/grapheme_to_uppercase_utf8\
       +        man/grapheme_to_lowercase\
       +        man/grapheme_to_lowercase_utf8\
       +        man/grapheme_to_titlecase\
       +        man/grapheme_to_titlecase_utf8\
       +
       +MAN7 =\
       +        man/libgrapheme\
       +
       +all: data/LICENSE $(MAN3:=.3) $(MAN7:=.7) libgrapheme.a $(SONAME)
       +
       +data/DerivedBidiClass.txt:
       +        wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/extracted/DerivedBidiClass.txt
       +
       +data/DerivedCoreProperties.txt:
       +        wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/DerivedCoreProperties.txt
       +
       +data/EastAsianWidth.txt:
       +        wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/EastAsianWidth.txt
       +
       +data/emoji-data.txt:
       +        wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/emoji/emoji-data.txt
       +
       +data/GraphemeBreakProperty.txt:
       +        wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/GraphemeBreakProperty.txt
       +
       +data/GraphemeBreakTest.txt:
       +        wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/GraphemeBreakTest.txt
       +
       +data/LICENSE:
       +        wget -O $@ https://www.unicode.org/license.txt
       +
       +data/LineBreak.txt:
       +        wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/LineBreak.txt
       +
       +data/LineBreakTest.txt:
       +        wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/LineBreakTest.txt
       +
       +data/SentenceBreakProperty.txt:
       +        wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/SentenceBreakProperty.txt
       +
       +data/SentenceBreakTest.txt:
       +        wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/SentenceBreakTest.txt
       +
       +data/SpecialCasing.txt:
       +        wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/SpecialCasing.txt
       +
       +data/UnicodeData.txt:
       +        wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt
       +
       +data/WordBreakProperty.txt:
       +        wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/WordBreakProperty.txt
       +
       +data/WordBreakTest.txt:
       +        wget -O $@ https://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/WordBreakTest.txt
       +
       +benchmark/case.o: benchmark/case.c Makefile config.mk gen/word-test.h grapheme.h benchmark/util.h
       +benchmark/character.o: benchmark/character.c Makefile config.mk gen/character-test.h grapheme.h benchmark/util.h
       +benchmark/line.o: benchmark/line.c Makefile config.mk gen/line-test.h grapheme.h benchmark/util.h
       +benchmark/utf8-decode.o: benchmark/utf8-decode.c Makefile config.mk gen/character-test.h grapheme.h benchmark/util.h
       +benchmark/sentence.o: benchmark/sentence.c Makefile config.mk gen/sentence-test.h grapheme.h benchmark/util.h
       +benchmark/util.o: benchmark/util.c Makefile config.mk benchmark/util.h
       +benchmark/word.o: benchmark/word.c Makefile config.mk gen/word-test.h grapheme.h benchmark/util.h
       +gen/bidirectional.o: gen/bidirectional.c Makefile config.mk gen/util.h
       +gen/case.o: gen/case.c Makefile config.mk gen/util.h
       +gen/character.o: gen/character.c Makefile config.mk gen/util.h
       +gen/character-test.o: gen/character-test.c Makefile config.mk gen/util.h
       +gen/line.o: gen/line.c Makefile config.mk gen/util.h
       +gen/line-test.o: gen/line-test.c Makefile config.mk gen/util.h
       +gen/sentence.o: gen/sentence.c Makefile config.mk gen/util.h
       +gen/sentence-test.o: gen/sentence-test.c Makefile config.mk gen/util.h
       +gen/word.o: gen/word.c Makefile config.mk gen/util.h
       +gen/word-test.o: gen/word-test.c Makefile config.mk gen/util.h
       +gen/util.o: gen/util.c Makefile config.mk gen/util.h
       +src/bidirectional.o: src/bidirectional.c Makefile config.mk gen/bidirectional.h grapheme.h src/util.h
       +src/case.o: src/case.c Makefile config.mk gen/case.h grapheme.h src/util.h
       +src/character.o: src/character.c Makefile config.mk gen/character.h grapheme.h src/util.h
       +src/line.o: src/line.c Makefile config.mk gen/line.h grapheme.h src/util.h
       +src/sentence.o: src/sentence.c Makefile config.mk gen/sentence.h grapheme.h src/util.h
       +src/utf8.o: src/utf8.c Makefile config.mk grapheme.h
       +src/util.o: src/util.c Makefile config.mk gen/types.h grapheme.h src/util.h
       +src/word.o: src/word.c Makefile config.mk gen/word.h grapheme.h src/util.h
       +test/case.o: test/case.c Makefile config.mk grapheme.h test/util.h
       +test/character.o: test/character.c Makefile config.mk gen/character-test.h grapheme.h test/util.h
       +test/line.o: test/line.c Makefile config.mk gen/line-test.h grapheme.h test/util.h
       +test/sentence.o: test/sentence.c Makefile config.mk gen/sentence-test.h grapheme.h test/util.h
       +test/utf8-encode.o: test/utf8-encode.c Makefile config.mk grapheme.h test/util.h
       +test/utf8-decode.o: test/utf8-decode.c Makefile config.mk grapheme.h test/util.h
       +test/util.o: test/util.c Makefile config.mk test/util.h
       +test/word.o: test/word.c Makefile config.mk gen/word-test.h grapheme.h test/util.h
        
        benchmark/case: benchmark/case.o benchmark/util.o libgrapheme.a
        benchmark/character: benchmark/character.o benchmark/util.o libgrapheme.a
       @@ -118,6 +200,7 @@ gen/sentence: gen/sentence.o gen/util.o
        gen/sentence-test: gen/sentence-test.o gen/util.o
        gen/word: gen/word.o gen/util.o
        gen/word-test: gen/word-test.o gen/util.o
       +test/case: test/case.o test/util.o libgrapheme.a
        test/character: test/character.o test/util.o libgrapheme.a
        test/line: test/line.o test/util.o libgrapheme.a
        test/sentence: test/sentence.o test/util.o libgrapheme.a
       @@ -136,71 +219,65 @@ gen/sentence-test.h: data/SentenceBreakTest.txt gen/sentence-test
        gen/word.h: data/WordBreakProperty.txt gen/word
        gen/word-test.h: data/WordBreakTest.txt gen/word-test
        
       -data/DerivedBidiClass.txt:
       -        wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/extracted/DerivedBidiClass.txt
       -
       -data/DerivedCoreProperties.txt:
       -        wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/DerivedCoreProperties.txt
       -
       -data/EastAsianWidth.txt:
       -        wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/EastAsianWidth.txt
       -
       -data/emoji-data.txt:
       -        wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt
       -
       -data/GraphemeBreakProperty.txt:
       -        wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/auxiliary/GraphemeBreakProperty.txt
       -
       -data/GraphemeBreakTest.txt:
       -        wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/auxiliary/GraphemeBreakTest.txt
       -
       -data/LICENSE:
       -        wget -O $@ https://www.unicode.org/license.txt
       -
       -data/LineBreak.txt:
       -        wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/LineBreak.txt
       -
       -data/LineBreakTest.txt:
       -        wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/auxiliary/LineBreakTest.txt
       -
       -data/SentenceBreakProperty.txt:
       -        wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/auxiliary/SentenceBreakProperty.txt
       -
       -data/SentenceBreakTest.txt:
       -        wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/auxiliary/SentenceBreakTest.txt
       -
       -data/SpecialCasing.txt:
       -        wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/SpecialCasing.txt
       -
       -data/UnicodeData.txt:
       -        wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/UnicodeData.txt
       -
       -data/WordBreakProperty.txt:
       -        wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/auxiliary/WordBreakProperty.txt
       -
       -data/WordBreakTest.txt:
       -        wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/auxiliary/WordBreakTest.txt
       +man/grapheme_is_character_break.3: man/grapheme_is_character_break.sh Makefile config.mk
       +man/grapheme_is_uppercase.3: man/grapheme_is_uppercase.sh man/template/is_case.sh Makefile config.mk
       +man/grapheme_is_uppercase_utf8.3: man/grapheme_is_uppercase_utf8.sh man/template/is_case.sh Makefile config.mk
       +man/grapheme_is_lowercase.3: man/grapheme_is_lowercase.sh man/template/is_case.sh Makefile config.mk
       +man/grapheme_is_lowercase_utf8.3: man/grapheme_is_lowercase_utf8.sh man/template/is_case.sh Makefile config.mk
       +man/grapheme_is_titlecase.3: man/grapheme_is_titlecase.sh man/template/is_case.sh Makefile config.mk
       +man/grapheme_is_titlecase_utf8.3: man/grapheme_is_titlecase_utf8.sh man/template/is_case.sh Makefile config.mk
       +man/grapheme_next_character_break.3: man/grapheme_next_character_break.sh man/template/next_break.sh Makefile config.mk
       +man/grapheme_next_line_break.3: man/grapheme_next_line_break.sh man/template/next_break.sh Makefile config.mk
       +man/grapheme_next_sentence_break.3: man/grapheme_next_sentence_break.sh man/template/next_break.sh Makefile config.mk
       +man/grapheme_next_word_break.3: man/grapheme_next_word_break.sh man/template/next_break.sh Makefile config.mk
       +man/grapheme_next_character_break_utf8.3: man/grapheme_next_character_break_utf8.sh man/template/next_break.sh Makefile config.mk
       +man/grapheme_next_line_break_utf8.3: man/grapheme_next_line_break_utf8.sh man/template/next_break.sh Makefile config.mk
       +man/grapheme_next_sentence_break_utf8.3: man/grapheme_next_sentence_break_utf8.sh man/template/next_break.sh Makefile config.mk
       +man/grapheme_next_word_break_utf8.3: man/grapheme_next_word_break_utf8.sh man/template/next_break.sh Makefile config.mk
       +man/grapheme_to_uppercase.3: man/grapheme_to_uppercase.sh man/template/to_case.sh Makefile config.mk
       +man/grapheme_to_lowercase.3: man/grapheme_to_lowercase.sh man/template/to_case.sh Makefile config.mk
       +man/grapheme_to_titlecase.3: man/grapheme_to_titlecase.sh man/template/to_case.sh Makefile config.mk
       +man/grapheme_to_uppercase_utf8.3: man/grapheme_to_uppercase_utf8.sh man/template/to_case.sh Makefile config.mk
       +man/grapheme_to_lowercase_utf8.3: man/grapheme_to_lowercase_utf8.sh man/template/to_case.sh Makefile config.mk
       +man/grapheme_to_titlecase_utf8.3: man/grapheme_to_titlecase_utf8.sh man/template/to_case.sh Makefile config.mk
       +man/grapheme_decode_utf8.3: man/grapheme_decode_utf8.sh Makefile config.mk
       +man/grapheme_encode_utf8.3: man/grapheme_encode_utf8.sh Makefile config.mk
       +
       +man/libgrapheme.7: man/libgrapheme.sh Makefile config.mk
       +
       +$(GEN:=.o) gen/util.o:
       +        $(BUILD_CC) -c -o $@ $(BUILD_CPPFLAGS) $(BUILD_CFLAGS) $(@:.o=.c)
       +
       +$(BENCHMARK:=.o) benchmark/util.o $(TEST:=.o) test/util.o:
       +        $(CC) -c -o $@ $(CPPFLAGS) $(CFLAGS) $(@:.o=.c)
       +
       +$(SRC:=.o):
       +        $(CC) -c -o $@ $(CPPFLAGS) $(CFLAGS) $(SHFLAGS) $(@:.o=.c)
        
        $(BENCHMARK):
                $(CC) -o $@ $(LDFLAGS) $@.o benchmark/util.o libgrapheme.a -lutf8proc
        
        $(GEN):
       -        $(CC) -o $@ $(LDFLAGS) $@.o gen/util.o
       -
       -$(GEN:=.h):
       -        $(@:.h=) > $@
       +        $(BUILD_CC) -o $@ $(BUILD_LDFLAGS) $@.o gen/util.o
        
        $(TEST):
                $(CC) -o $@ $(LDFLAGS) $@.o test/util.o libgrapheme.a
        
       -.c.o:
       -        $(CC) -c -o $@ $(CPPFLAGS) $(CFLAGS) $<
       +$(GEN:=.h):
       +        $(@:.h=) > $@
        
        libgrapheme.a: $(SRC:=.o)
       -        $(AR) -rcs $@ $?
       +        $(AR) -rc $@ $?
       +        $(RANLIB) $@
       +
       +$(SONAME): $(SRC:=.o)
       +        $(CC) -o $@ $(SOFLAGS) $(LDFLAGS) $(SRC:=.o)
        
       -libgrapheme.so: $(SRC:=.o)
       -        $(CC) -o $@ -shared $(SRC:=.o)
       +$(MAN3:=.3):
       +        SH="$(SH)" MAN_DATE="$(MAN_DATE)" UNICODE_VERSION="$(UNICODE_VERSION)" $(SH) $(@:.3=.sh) > $@
       +
       +$(MAN7:=.7):
       +        SH="$(SH)" MAN_DATE="$(MAN_DATE)" UNICODE_VERSION="$(UNICODE_VERSION)" $(SH) $(@:.7=.sh) > $@
        
        benchmark: $(BENCHMARK)
                for m in $(BENCHMARK); do ./$$m; done
       @@ -213,39 +290,43 @@ install: all
                mkdir -p "$(DESTDIR)$(INCPREFIX)"
                mkdir -p "$(DESTDIR)$(MANPREFIX)/man3"
                mkdir -p "$(DESTDIR)$(MANPREFIX)/man7"
       -        cp -f $(MAN3) "$(DESTDIR)$(MANPREFIX)/man3"
       -        cp -f $(MAN7) "$(DESTDIR)$(MANPREFIX)/man7"
       +        cp -f $(MAN3:=.3) "$(DESTDIR)$(MANPREFIX)/man3"
       +        cp -f $(MAN7:=.7) "$(DESTDIR)$(MANPREFIX)/man7"
                cp -f libgrapheme.a "$(DESTDIR)$(LIBPREFIX)"
       -        cp -f libgrapheme.so "$(DESTDIR)$(LIBPREFIX)"
       +        cp -f $(SONAME) "$(DESTDIR)$(LIBPREFIX)/$(SONAME)"
       +        if [ "$(SOSYMLINK)" = "true" ]; then i=0; while [ "$$i" -le $(VERSION_MINOR) ]; do ln -sf "$(SONAME)" "$(DESTDIR)$(LIBPREFIX)/libgrapheme.so.$(VERSION_MAJOR).$$i"; i=$$((i+1)); done; fi
       +        if [ "$(SOSYMLINK)" = "true" ]; then ln -sf "$(SONAME)" "$(DESTDIR)$(LIBPREFIX)/libgrapheme.so.$(VERSION_MAJOR)"; fi
       +        if [ "$(SOSYMLINK)" = "true" ]; then ln -sf "$(SONAME)" "$(DESTDIR)$(LIBPREFIX)/libgrapheme.so"; fi
                cp -f grapheme.h "$(DESTDIR)$(INCPREFIX)"
       -        ldconfig || true
       +        if ! [ -z "$(LDCONFIG)" ]; then $(SHELL) -c "$(LDCONFIG)"; fi
        
        uninstall:
       -        for m in $(MAN3); do rm -f "$(DESTDIR)$(MANPREFIX)/man3/`basename $$m`"; done
       -        for m in $(MAN7); do rm -f "$(DESTDIR)$(MANPREFIX)/man7/`basename $$m`"; done
       +        for m in $(MAN3:=.3); do rm -f "$(DESTDIR)$(MANPREFIX)/man3/`basename $$m`"; done
       +        for m in $(MAN7:=.7); do rm -f "$(DESTDIR)$(MANPREFIX)/man7/`basename $$m`"; done
                rm -f "$(DESTDIR)$(LIBPREFIX)/libgrapheme.a"
       -        rm -f "$(DESTDIR)$(LIBPREFIX)/libgrapheme.so"
       +        rm -f "$(DESTDIR)$(LIBPREFIX)/$(SONAME)"
       +        if [ "$(SOSYMLINK)" = "true" ]; then i=0; while [ "$$i" -le $(VERSION_MINOR) ]; do rm -f "$(DESTDIR)$(LIBPREFIX)/libgrapheme.so.$(VERSION_MAJOR).$$i"; i=$$((i+1)); done; fi
       +        if [ "$(SOSYMLINK)" = "true" ]; then rm -f "$(DESTDIR)$(LIBPREFIX)/libgrapheme.so.$(VERSION_MAJOR)"; fi
       +        if [ "$(SOSYMLINK)" = "true" ]; then rm -f "$(DESTDIR)$(LIBPREFIX)/libgrapheme.so"; fi
                rm -f "$(DESTDIR)$(INCPREFIX)/grapheme.h"
       -        ldconfig || true
       +        if ! [ -z "$(LDCONFIG)" ]; then $(SHELL) -c "$(LDCONFIG)"; fi
        
        clean:
       -        rm -f $(BENCHMARK:=.o) benchmark/util.o $(BENCHMARK) $(GEN:=.h) $(GEN:=.o) gen/util.o $(GEN) $(SRC:=.o) src/util.o $(TEST:=.o) test/util.o $(TEST) libgrapheme.a libgrapheme.so
       +        rm -f $(BENCHMARK:=.o) benchmark/util.o $(BENCHMARK) $(GEN:=.h) $(GEN:=.o) gen/util.o $(GEN) $(SRC:=.o) src/util.o $(TEST:=.o) test/util.o $(TEST) libgrapheme.a $(SONAME) $(MAN3:=.3) $(MAN7:=.7)
        
        clean-data:
                rm -f $(DATA)
        
       -print:
       -        @echo $(PREFIX)
       -
        dist:
                rm -rf "libgrapheme-$(VERSION)"
                mkdir "libgrapheme-$(VERSION)"
       -        for m in benchmark data gen man src test; do mkdir "libgrapheme-$(VERSION)/$$m"; done
       +        for m in benchmark data gen man man/template src test; do mkdir "libgrapheme-$(VERSION)/$$m"; done
                cp config.mk grapheme.h LICENSE Makefile README "libgrapheme-$(VERSION)"
                cp $(BENCHMARK:=.c) benchmark/util.c benchmark/util.h "libgrapheme-$(VERSION)/benchmark"
                cp $(DATA) "libgrapheme-$(VERSION)/data"
                cp $(GEN:=.c) gen/util.c gen/types.h gen/util.h "libgrapheme-$(VERSION)/gen"
       -        cp $(MAN3) $(MAN7) "libgrapheme-$(VERSION)/man"
       +        cp $(MAN3:=.sh) $(MAN7:=.sh) "libgrapheme-$(VERSION)/man"
       +        cp $(MAN_TEMPLATE) "libgrapheme-$(VERSION)/man/template"
                cp $(SRC:=.c) src/util.h "libgrapheme-$(VERSION)/src"
                cp $(TEST:=.c) test/util.c test/util.h "libgrapheme-$(VERSION)/test"
                tar -cf - "libgrapheme-$(VERSION)" | gzip -c > "libgrapheme-$(VERSION).tar.gz"
 (DIR) diff --git a/README b/README
       @@ -1,25 +1,34 @@
        libgrapheme
        ===========
        
       -The libgrapheme library provides functions to properly handle Unicode
       -strings according to the Unicode specification. Unicode strings are made
       -up of user-perceived characters (so-called "grapheme clusters") that are
       -made up of one or more Unicode codepoints, which in turn are encoded in
       -one or more bytes in an encoding like UTF-8.
       -
       -There is a widespread misconception that it was enough to simply
       -determine codepoints in a string and treat them as user-perceived
       -characters to be Unicode compliant. While this may work in some cases,
       -this assumption quickly breaks, especially for non-Western languages and
       -decomposed Unicode strings where user-perceived characters are usually
       -represented using multiple codepoints.
       -
       -Despite the complicated multilevel structure of Unicode strings,
       -libgrapheme provides methods to work with them at the byte-level (i.e.
       -UTF-8 ‘char’ arrays) while also providing codepoint-level methods.
       -
       -See libgrapheme(7) to get started and try out the self-contained examples
       -given on the manual pages for each function.
       +libgrapheme is an extremely simple freestanding C99 library providing
       +utilities for properly handling strings according to the latest Unicode
       +standard 15.0.0. It offers fully Unicode compliant
       +
       + - grapheme cluster (i.e. user-perceived character) segmentation
       + - word segmentation
       + - sentence segmentation
       + - detection of permissible line break opportunities
       + - case detection (lower-, upper- and title-case)
       + - case conversion (to lower-, upper- and title-case)
       +
       +on UTF-8 strings and codepoint arrays, which both can also be
       +null-terminated.
       +
       +The necessary lookup-tables are automatically generated from the Unicode
       +standard data (contained in the tarball) and heavily compressed. Over
       +10,000 automatically generated conformance tests and over 150 unit tests
       +ensure conformance and correctness.
       +
       +There is no complicated build-system involved and it's all done using one
       +POSIX-compliant Makefile. All you need is a C99 compiler, given the
       +lookup-table-generators and compressors are also written in C99. The
       +resulting library is freestanding and thus not even dependent on a
       +standard library to be present at runtime, making it a suitable choice
       +for bare metal applications.
       +
       +It is also way smaller and much faster than the other established
       +Unicode string libraries (ICU, GNU's libunistring, libutf8proc).
        
        Requirements
        ------------
       @@ -27,8 +36,9 @@ A C99-compiler and POSIX make.
        
        Installation
        ------------
       -Edit config.mk to match your local setup (usually not necessary, the
       -default prefix is /usr/local).
       +Run ./configure, which automatically edits config.mk to match your local
       +setup. Edit config.mk by hand if necessary or desired for further
       +customization.
        
        Afterwards enter the following command to build and install libgrapheme
        (if necessary as root):
       @@ -37,16 +47,12 @@ Afterwards enter the following command to build and install libgrapheme
        
        Conformance
        -----------
       -The libgrapheme library is compliant with the Unicode 14.0.0
       -specification (September 2021).
       -
       -To ensure conformance, libgrapheme includes hundreds of tests including
       -all provided with the standard-provided test-data that is parsed
       -automatically. The tests can be run with
       +The libgrapheme library is compliant with the Unicode 15.0.0
       +specification (September 2022). The tests can be run with
        
                make test
        
       -to check standard conformance.
       +to check standard conformance and correctness.
        
        Usage
        -----
 (DIR) diff --git a/benchmark/character.c b/benchmark/character.c
       @@ -23,7 +23,7 @@ struct break_benchmark_payload {
        void
        libgrapheme(const void *payload)
        {
       -        GRAPHEME_STATE state = { 0 };
       +        uint_least16_t state = 0;
                const struct break_benchmark_payload *p = payload;
                size_t i;
        
       @@ -80,6 +80,7 @@ main(int argc, char *argv[])
                              &baseline, NUM_ITERATIONS, p.buflen - 1);
        
                free(p.buf);
       +        free(p.buf_utf8proc);
        
                return 0;
        }
 (DIR) diff --git a/config.mk b/config.mk
       @@ -1,7 +1,4 @@
       -# libgrapheme version
       -VERSION = 1
       -
       -# Customize below to fit your system
       +# Customize below to fit your system (run ./configure for automatic presets)
        
        # paths
        PREFIX = /usr/local
       @@ -11,9 +8,23 @@ MANPREFIX = $(PREFIX)/share/man
        
        # flags
        CPPFLAGS = -D_DEFAULT_SOURCE
       -CFLAGS   = -std=c99 -Os -fPIC -Wall -Wextra -Wpedantic
       +CFLAGS   = -std=c99 -Os -Wall -Wextra -Wpedantic
        LDFLAGS  = -s
        
       +BUILD_CPPFLAGS = $(CPPFLAGS)
       +BUILD_CFLAGS   = $(CFLAGS)
       +BUILD_LDFLAGS  = $(LDFLAGS)
       +
       +SHFLAGS   = -fPIC -ffreestanding
       +
       +SOFLAGS   = -shared -nostdlib -Wl,--soname=libgrapheme.so.$(VERSION_MAJOR).$(VERSION_MINOR)
       +SONAME    = libgrapheme.so.$(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH)
       +SOSYMLINK = true
       +
        # tools
       -CC = cc
       -AR = ar
       +CC       = cc
       +BUILD_CC = $(CC)
       +AR       = ar
       +RANLIB   = ranlib
       +LDCONFIG = ldconfig # unset to not call ldconfig(1) after install/uninstall
       +SH       = sh
 (DIR) diff --git a/configure b/configure
       @@ -0,0 +1,39 @@
       +#!/bin/sh
       +# See LICENSE file for copyright and license details.
       +
       +replace_line()
       +{
       +        VAR=$1
       +        ALIGNMENT=$2
       +        VALUE=$3
       +        awk "/^${VAR}[ ]*=/ { print \"${VAR}${ALIGNMENT} = ${VALUE}\"; next }; { print; }" config.mk > config.mk.tmp
       +        mv config.mk.tmp config.mk
       +}
       +
       +case $(uname) in
       +        DragonFly|FreeBSD|Linux|NetBSD)
       +                # the default
       +                replace_line 'SOFLAGS'   '  '  '-shared -nostdlib -Wl,--soname=libgrapheme.so.$(VERSION_MAJOR).$(VERSION_MINOR)'
       +                replace_line 'SONAME'    '   ' 'libgrapheme.so.$(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH)'
       +                replace_line 'SOSYMLINK' ''    'true'
       +                replace_line 'LDCONFIG'  ''    'ldconfig \# unset to not call ldconfig(1) after install/uninstall'
       +                ;;
       +        OpenBSD)
       +                replace_line 'SOFLAGS'   '  '  '-shared -nostdlib'
       +                replace_line 'SONAME'    '   ' 'libgrapheme.so.$(VERSION_MAJOR).$(VERSION_MINOR)'
       +                replace_line 'SOSYMLINK' ''    'false'
       +                replace_line 'LDCONFIG'  ''    ''
       +                ;;
       +        Darwin)
       +                replace_line 'SOFLAGS'   '  '  '-dynamiclib -install_name libgrapheme.$(VERSION_MAJOR).dylib -current_version $(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH) -compatibility_version $(VERSION_MAJOR).$(VERSION_MINOR).0'
       +                replace_line 'SONAME'    '   ' 'libgrapheme.$(VERSION_MAJOR).dylib'
       +                replace_line 'SOSYMLINK' ''    'false'
       +                replace_line 'LDCONFIG'  ''    ''
       +                ;;
       +        *)
       +                echo "Your system does not have a preset. Edit config.mk and send a patch please! :)"
       +                exit 1
       +                ;;
       +esac
       +
       +exit 0
 (DIR) diff --git a/data/DerivedCoreProperties.txt b/data/DerivedCoreProperties.txt
       @@ -1,11 +1,11 @@
       -# DerivedCoreProperties-14.0.0.txt
       -# Date: 2021-08-12, 23:12:53 GMT
       -# © 2021 Unicode®, Inc.
       +# DerivedCoreProperties-15.0.0.txt
       +# Date: 2022-08-05, 22:17:05 GMT
       +# © 2022 Unicode®, Inc.
        # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
       -# For terms of use, see http://www.unicode.org/terms_of_use.html
       +# For terms of use, see https://www.unicode.org/terms_of_use.html
        #
        # Unicode Character Database
       -#   For documentation, see http://www.unicode.org/reports/tr44/
       +#   For documentation, see https://www.unicode.org/reports/tr44/
        
        # ================================================
        
       @@ -462,6 +462,7 @@ FFE9..FFEC    ; Math # Sm   [4] HALFWIDTH LEFTWARDS ARROW..HALFWIDTH DOWNWARDS A
        0BD7          ; Alphabetic # Mc       TAMIL AU LENGTH MARK
        0C00          ; Alphabetic # Mn       TELUGU SIGN COMBINING CANDRABINDU ABOVE
        0C01..0C03    ; Alphabetic # Mc   [3] TELUGU SIGN CANDRABINDU..TELUGU SIGN VISARGA
       +0C04          ; Alphabetic # Mn       TELUGU SIGN COMBINING ANUSVARA ABOVE
        0C05..0C0C    ; Alphabetic # Lo   [8] TELUGU LETTER A..TELUGU LETTER VOCALIC L
        0C0E..0C10    ; Alphabetic # Lo   [3] TELUGU LETTER E..TELUGU LETTER AI
        0C12..0C28    ; Alphabetic # Lo  [23] TELUGU LETTER O..TELUGU LETTER NA
       @@ -497,6 +498,7 @@ FFE9..FFEC    ; Math # Sm   [4] HALFWIDTH LEFTWARDS ARROW..HALFWIDTH DOWNWARDS A
        0CE0..0CE1    ; Alphabetic # Lo   [2] KANNADA LETTER VOCALIC RR..KANNADA LETTER VOCALIC LL
        0CE2..0CE3    ; Alphabetic # Mn   [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
        0CF1..0CF2    ; Alphabetic # Lo   [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
       +0CF3          ; Alphabetic # Mc       KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
        0D00..0D01    ; Alphabetic # Mn   [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
        0D02..0D03    ; Alphabetic # Mc   [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
        0D04..0D0C    ; Alphabetic # Lo   [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L
       @@ -552,7 +554,7 @@ FFE9..FFEC    ; Math # Sm   [4] HALFWIDTH LEFTWARDS ARROW..HALFWIDTH DOWNWARDS A
        0F49..0F6C    ; Alphabetic # Lo  [36] TIBETAN LETTER NYA..TIBETAN LETTER RRA
        0F71..0F7E    ; Alphabetic # Mn  [14] TIBETAN VOWEL SIGN AA..TIBETAN SIGN RJES SU NGA RO
        0F7F          ; Alphabetic # Mc       TIBETAN SIGN RNAM BCAD
       -0F80..0F81    ; Alphabetic # Mn   [2] TIBETAN VOWEL SIGN REVERSED I..TIBETAN VOWEL SIGN REVERSED II
       +0F80..0F83    ; Alphabetic # Mn   [4] TIBETAN VOWEL SIGN REVERSED I..TIBETAN SIGN SNA LDAN
        0F88..0F8C    ; Alphabetic # Lo   [5] TIBETAN SIGN LCE TSA CAN..TIBETAN SIGN INVERTED MCHU CAN
        0F8D..0F97    ; Alphabetic # Mn  [11] TIBETAN SUBJOINED SIGN LCE TSA CAN..TIBETAN SUBJOINED LETTER JA
        0F99..0FBC    ; Alphabetic # Mn  [36] TIBETAN SUBJOINED LETTER NYA..TIBETAN SUBJOINED LETTER FIXED-FORM RA
       @@ -1053,6 +1055,7 @@ FFDA..FFDC    ; Alphabetic # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANG
        11071..11072  ; Alphabetic # Lo   [2] BRAHMI LETTER OLD TAMIL SHORT E..BRAHMI LETTER OLD TAMIL SHORT O
        11073..11074  ; Alphabetic # Mn   [2] BRAHMI VOWEL SIGN OLD TAMIL SHORT E..BRAHMI VOWEL SIGN OLD TAMIL SHORT O
        11075         ; Alphabetic # Lo       BRAHMI LETTER OLD TAMIL LLA
       +11080..11081  ; Alphabetic # Mn   [2] KAITHI SIGN CANDRABINDU..KAITHI SIGN ANUSVARA
        11082         ; Alphabetic # Mc       KAITHI SIGN VISARGA
        11083..110AF  ; Alphabetic # Lo  [45] KAITHI LETTER A..KAITHI LETTER HA
        110B0..110B2  ; Alphabetic # Mc   [3] KAITHI VOWEL SIGN AA..KAITHI VOWEL SIGN II
       @@ -1089,6 +1092,8 @@ FFDA..FFDC    ; Alphabetic # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANG
        11234         ; Alphabetic # Mn       KHOJKI SIGN ANUSVARA
        11237         ; Alphabetic # Mn       KHOJKI SIGN SHADDA
        1123E         ; Alphabetic # Mn       KHOJKI SIGN SUKUN
       +1123F..11240  ; Alphabetic # Lo   [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I
       +11241         ; Alphabetic # Mn       KHOJKI VOWEL SIGN VOCALIC R
        11280..11286  ; Alphabetic # Lo   [7] MULTANI LETTER A..MULTANI LETTER GA
        11288         ; Alphabetic # Lo       MULTANI LETTER GHA
        1128A..1128D  ; Alphabetic # Lo   [4] MULTANI LETTER CA..MULTANI LETTER JJA
       @@ -1243,12 +1248,22 @@ FFDA..FFDC    ; Alphabetic # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANG
        11EE0..11EF2  ; Alphabetic # Lo  [19] MAKASAR LETTER KA..MAKASAR ANGKA
        11EF3..11EF4  ; Alphabetic # Mn   [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
        11EF5..11EF6  ; Alphabetic # Mc   [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O
       +11F00..11F01  ; Alphabetic # Mn   [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
       +11F02         ; Alphabetic # Lo       KAWI SIGN REPHA
       +11F03         ; Alphabetic # Mc       KAWI SIGN VISARGA
       +11F04..11F10  ; Alphabetic # Lo  [13] KAWI LETTER A..KAWI LETTER O
       +11F12..11F33  ; Alphabetic # Lo  [34] KAWI LETTER KA..KAWI LETTER JNYA
       +11F34..11F35  ; Alphabetic # Mc   [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
       +11F36..11F3A  ; Alphabetic # Mn   [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
       +11F3E..11F3F  ; Alphabetic # Mc   [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI
       +11F40         ; Alphabetic # Mn       KAWI VOWEL SIGN EU
        11FB0         ; Alphabetic # Lo       LISU LETTER YHA
        12000..12399  ; Alphabetic # Lo [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U
        12400..1246E  ; Alphabetic # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM
        12480..12543  ; Alphabetic # Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU
        12F90..12FF0  ; Alphabetic # Lo  [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
       -13000..1342E  ; Alphabetic # Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032
       +13000..1342F  ; Alphabetic # Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D
       +13441..13446  ; Alphabetic # Lo   [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN
        14400..14646  ; Alphabetic # Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530
        16800..16A38  ; Alphabetic # Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
        16A40..16A5E  ; Alphabetic # Lo  [31] MRO LETTER TA..MRO LETTER TEK
       @@ -1275,7 +1290,9 @@ FFDA..FFDC    ; Alphabetic # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANG
        1AFF5..1AFFB  ; Alphabetic # Lm   [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5
        1AFFD..1AFFE  ; Alphabetic # Lm   [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
        1B000..1B122  ; Alphabetic # Lo [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU
       +1B132         ; Alphabetic # Lo       HIRAGANA LETTER SMALL KO
        1B150..1B152  ; Alphabetic # Lo   [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
       +1B155         ; Alphabetic # Lo       KATAKANA LETTER SMALL KO
        1B164..1B167  ; Alphabetic # Lo   [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
        1B170..1B2FB  ; Alphabetic # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
        1BC00..1BC6A  ; Alphabetic # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
       @@ -1316,16 +1333,21 @@ FFDA..FFDC    ; Alphabetic # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANG
        1DF00..1DF09  ; Alphabetic # L&  [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
        1DF0A         ; Alphabetic # Lo       LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
        1DF0B..1DF1E  ; Alphabetic # L&  [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
       +1DF25..1DF2A  ; Alphabetic # L&   [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
        1E000..1E006  ; Alphabetic # Mn   [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE
        1E008..1E018  ; Alphabetic # Mn  [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU
        1E01B..1E021  ; Alphabetic # Mn   [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
        1E023..1E024  ; Alphabetic # Mn   [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
        1E026..1E02A  ; Alphabetic # Mn   [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
       +1E030..1E06D  ; Alphabetic # Lm  [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
       +1E08F         ; Alphabetic # Mn       COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
        1E100..1E12C  ; Alphabetic # Lo  [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
        1E137..1E13D  ; Alphabetic # Lm   [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
        1E14E         ; Alphabetic # Lo       NYIAKENG PUACHUE HMONG LOGOGRAM NYAJ
        1E290..1E2AD  ; Alphabetic # Lo  [30] TOTO LETTER PA..TOTO LETTER A
        1E2C0..1E2EB  ; Alphabetic # Lo  [44] WANCHO LETTER AA..WANCHO LETTER YIH
       +1E4D0..1E4EA  ; Alphabetic # Lo  [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL
       +1E4EB         ; Alphabetic # Lm       NAG MUNDARI SIGN OJOD
        1E7E0..1E7E6  ; Alphabetic # Lo   [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
        1E7E8..1E7EB  ; Alphabetic # Lo   [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
        1E7ED..1E7EE  ; Alphabetic # Lo   [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
       @@ -1371,14 +1393,15 @@ FFDA..FFDC    ; Alphabetic # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANG
        1F150..1F169  ; Alphabetic # So  [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z
        1F170..1F189  ; Alphabetic # So  [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z
        20000..2A6DF  ; Alphabetic # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
       -2A700..2B738  ; Alphabetic # Lo [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
       +2A700..2B739  ; Alphabetic # Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739
        2B740..2B81D  ; Alphabetic # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
        2B820..2CEA1  ; Alphabetic # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
        2CEB0..2EBE0  ; Alphabetic # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
        2F800..2FA1D  ; Alphabetic # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
        30000..3134A  ; Alphabetic # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
       +31350..323AF  ; Alphabetic # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
        
       -# Total code points: 133396
       +# Total code points: 137765
        
        # ================================================
        
       @@ -1663,6 +1686,7 @@ FFDA..FFDC    ; Alphabetic # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANG
        052F          ; Lowercase # L&       CYRILLIC SMALL LETTER EL WITH DESCENDER
        0560..0588    ; Lowercase # L&  [41] ARMENIAN SMALL LETTER TURNED AYB..ARMENIAN SMALL LETTER YI WITH STROKE
        10D0..10FA    ; Lowercase # L&  [43] GEORGIAN LETTER AN..GEORGIAN LETTER AIN
       +10FC          ; Lowercase # Lm       MODIFIER LETTER GEORGIAN NAR
        10FD..10FF    ; Lowercase # L&   [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN
        13F8..13FD    ; Lowercase # L&   [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV
        1C80..1C88    ; Lowercase # L&   [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK
       @@ -2012,12 +2036,14 @@ A7D3          ; Lowercase # L&       LATIN SMALL LETTER DOUBLE THORN
        A7D5          ; Lowercase # L&       LATIN SMALL LETTER DOUBLE WYNN
        A7D7          ; Lowercase # L&       LATIN SMALL LETTER MIDDLE SCOTS S
        A7D9          ; Lowercase # L&       LATIN SMALL LETTER SIGMOID S
       +A7F2..A7F4    ; Lowercase # Lm   [3] MODIFIER LETTER CAPITAL C..MODIFIER LETTER CAPITAL Q
        A7F6          ; Lowercase # L&       LATIN SMALL LETTER REVERSED HALF H
        A7F8..A7F9    ; Lowercase # Lm   [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE
        A7FA          ; Lowercase # L&       LATIN LETTER SMALL CAPITAL TURNED M
        AB30..AB5A    ; Lowercase # L&  [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG
        AB5C..AB5F    ; Lowercase # Lm   [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK
        AB60..AB68    ; Lowercase # L&   [9] LATIN SMALL LETTER SAKHA YAT..LATIN SMALL LETTER TURNED R WITH MIDDLE TILDE
       +AB69          ; Lowercase # Lm       MODIFIER LETTER SMALL TURNED W
        AB70..ABBF    ; Lowercase # L&  [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA
        FB00..FB06    ; Lowercase # L&   [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST
        FB13..FB17    ; Lowercase # L&   [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH
       @@ -2065,9 +2091,11 @@ FF41..FF5A    ; Lowercase # L&  [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH L
        1D7CB         ; Lowercase # L&       MATHEMATICAL BOLD SMALL DIGAMMA
        1DF00..1DF09  ; Lowercase # L&  [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
        1DF0B..1DF1E  ; Lowercase # L&  [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
       +1DF25..1DF2A  ; Lowercase # L&   [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
       +1E030..1E06D  ; Lowercase # Lm  [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
        1E922..1E943  ; Lowercase # L&  [34] ADLAM SMALL LETTER ALIF..ADLAM SMALL LETTER SHA
        
       -# Total code points: 2471
       +# Total code points: 2544
        
        # ================================================
        
       @@ -2767,6 +2795,7 @@ FF21..FF3A    ; Uppercase # L&  [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH
        10C7          ; Cased # L&       GEORGIAN CAPITAL LETTER YN
        10CD          ; Cased # L&       GEORGIAN CAPITAL LETTER AEN
        10D0..10FA    ; Cased # L&  [43] GEORGIAN LETTER AN..GEORGIAN LETTER AIN
       +10FC          ; Cased # Lm       MODIFIER LETTER GEORGIAN NAR
        10FD..10FF    ; Cased # L&   [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN
        13A0..13F5    ; Cased # L&  [86] CHEROKEE LETTER A..CHEROKEE LETTER MV
        13F8..13FD    ; Cased # L&   [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV
       @@ -2837,12 +2866,14 @@ A790..A7CA    ; Cased # L&  [59] LATIN CAPITAL LETTER N WITH DESCENDER..LATIN SM
        A7D0..A7D1    ; Cased # L&   [2] LATIN CAPITAL LETTER CLOSED INSULAR G..LATIN SMALL LETTER CLOSED INSULAR G
        A7D3          ; Cased # L&       LATIN SMALL LETTER DOUBLE THORN
        A7D5..A7D9    ; Cased # L&   [5] LATIN SMALL LETTER DOUBLE WYNN..LATIN SMALL LETTER SIGMOID S
       +A7F2..A7F4    ; Cased # Lm   [3] MODIFIER LETTER CAPITAL C..MODIFIER LETTER CAPITAL Q
        A7F5..A7F6    ; Cased # L&   [2] LATIN CAPITAL LETTER REVERSED HALF H..LATIN SMALL LETTER REVERSED HALF H
        A7F8..A7F9    ; Cased # Lm   [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE
        A7FA          ; Cased # L&       LATIN LETTER SMALL CAPITAL TURNED M
        AB30..AB5A    ; Cased # L&  [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG
        AB5C..AB5F    ; Cased # Lm   [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK
        AB60..AB68    ; Cased # L&   [9] LATIN SMALL LETTER SAKHA YAT..LATIN SMALL LETTER TURNED R WITH MIDDLE TILDE
       +AB69          ; Cased # Lm       MODIFIER LETTER SMALL TURNED W
        AB70..ABBF    ; Cased # L&  [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA
        FB00..FB06    ; Cased # L&   [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST
        FB13..FB17    ; Cased # L&   [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH
       @@ -2899,12 +2930,14 @@ FF41..FF5A    ; Cased # L&  [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN
        1D7C4..1D7CB  ; Cased # L&   [8] MATHEMATICAL SANS-SERIF BOLD ITALIC EPSILON SYMBOL..MATHEMATICAL BOLD SMALL DIGAMMA
        1DF00..1DF09  ; Cased # L&  [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
        1DF0B..1DF1E  ; Cased # L&  [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
       +1DF25..1DF2A  ; Cased # L&   [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
       +1E030..1E06D  ; Cased # Lm  [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
        1E900..1E943  ; Cased # L&  [68] ADLAM CAPITAL LETTER ALIF..ADLAM SMALL LETTER SHA
        1F130..1F149  ; Cased # So  [26] SQUARED LATIN CAPITAL LETTER A..SQUARED LATIN CAPITAL LETTER Z
        1F150..1F169  ; Cased # So  [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z
        1F170..1F189  ; Cased # So  [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z
        
       -# Total code points: 4453
       +# Total code points: 4526
        
        # ================================================
        
       @@ -3054,7 +3087,7 @@ FF41..FF5A    ; Cased # L&  [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN
        0EB1          ; Case_Ignorable # Mn       LAO VOWEL SIGN MAI KAN
        0EB4..0EBC    ; Case_Ignorable # Mn   [9] LAO VOWEL SIGN I..LAO SEMIVOWEL SIGN LO
        0EC6          ; Case_Ignorable # Lm       LAO KO LA
       -0EC8..0ECD    ; Case_Ignorable # Mn   [6] LAO TONE MAI EK..LAO NIGGAHITA
       +0EC8..0ECE    ; Case_Ignorable # Mn   [7] LAO TONE MAI EK..LAO YAMAKKAN
        0F18..0F19    ; Case_Ignorable # Mn   [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS
        0F35          ; Case_Ignorable # Mn       TIBETAN MARK NGAS BZUNG NYI ZLA
        0F37          ; Case_Ignorable # Mn       TIBETAN MARK NGAS BZUNG SGOR RTAGS
       @@ -3263,6 +3296,7 @@ FFF9..FFFB    ; Case_Ignorable # Cf   [3] INTERLINEAR ANNOTATION ANCHOR..INTERLI
        10AE5..10AE6  ; Case_Ignorable # Mn   [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW
        10D24..10D27  ; Case_Ignorable # Mn   [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI
        10EAB..10EAC  ; Case_Ignorable # Mn   [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
       +10EFD..10EFF  ; Case_Ignorable # Mn   [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
        10F46..10F50  ; Case_Ignorable # Mn  [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW
        10F82..10F85  ; Case_Ignorable # Mn   [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW
        11001         ; Case_Ignorable # Mn       BRAHMI SIGN ANUSVARA
       @@ -3287,6 +3321,7 @@ FFF9..FFFB    ; Case_Ignorable # Cf   [3] INTERLINEAR ANNOTATION ANCHOR..INTERLI
        11234         ; Case_Ignorable # Mn       KHOJKI SIGN ANUSVARA
        11236..11237  ; Case_Ignorable # Mn   [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA
        1123E         ; Case_Ignorable # Mn       KHOJKI SIGN SUKUN
       +11241         ; Case_Ignorable # Mn       KHOJKI VOWEL SIGN VOCALIC R
        112DF         ; Case_Ignorable # Mn       KHUDAWADI SIGN ANUSVARA
        112E3..112EA  ; Case_Ignorable # Mn   [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA
        11300..11301  ; Case_Ignorable # Mn   [2] GRANTHA SIGN COMBINING ANUSVARA ABOVE..GRANTHA SIGN CANDRABINDU
       @@ -3348,7 +3383,13 @@ FFF9..FFFB    ; Case_Ignorable # Cf   [3] INTERLINEAR ANNOTATION ANCHOR..INTERLI
        11D95         ; Case_Ignorable # Mn       GUNJALA GONDI SIGN ANUSVARA
        11D97         ; Case_Ignorable # Mn       GUNJALA GONDI VIRAMA
        11EF3..11EF4  ; Case_Ignorable # Mn   [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
       -13430..13438  ; Case_Ignorable # Cf   [9] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END SEGMENT
       +11F00..11F01  ; Case_Ignorable # Mn   [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
       +11F36..11F3A  ; Case_Ignorable # Mn   [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
       +11F40         ; Case_Ignorable # Mn       KAWI VOWEL SIGN EU
       +11F42         ; Case_Ignorable # Mn       KAWI CONJOINER
       +13430..1343F  ; Case_Ignorable # Cf  [16] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE
       +13440         ; Case_Ignorable # Mn       EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY
       +13447..13455  ; Case_Ignorable # Mn  [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED
        16AF0..16AF4  ; Case_Ignorable # Mn   [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE
        16B30..16B36  ; Case_Ignorable # Mn   [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM
        16B40..16B43  ; Case_Ignorable # Lm   [4] PAHAWH HMONG SIGN VOS SEEV..PAHAWH HMONG SIGN IB YAM
       @@ -3382,10 +3423,14 @@ FFF9..FFFB    ; Case_Ignorable # Cf   [3] INTERLINEAR ANNOTATION ANCHOR..INTERLI
        1E01B..1E021  ; Case_Ignorable # Mn   [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
        1E023..1E024  ; Case_Ignorable # Mn   [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
        1E026..1E02A  ; Case_Ignorable # Mn   [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
       +1E030..1E06D  ; Case_Ignorable # Lm  [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
       +1E08F         ; Case_Ignorable # Mn       COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
        1E130..1E136  ; Case_Ignorable # Mn   [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
        1E137..1E13D  ; Case_Ignorable # Lm   [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
        1E2AE         ; Case_Ignorable # Mn       TOTO SIGN RISING TONE
        1E2EC..1E2EF  ; Case_Ignorable # Mn   [4] WANCHO TONE TUP..WANCHO TONE KOINI
       +1E4EB         ; Case_Ignorable # Lm       NAG MUNDARI SIGN OJOD
       +1E4EC..1E4EF  ; Case_Ignorable # Mn   [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH
        1E8D0..1E8D6  ; Case_Ignorable # Mn   [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS
        1E944..1E94A  ; Case_Ignorable # Mn   [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA
        1E94B         ; Case_Ignorable # Lm       ADLAM NASALIZATION MARK
       @@ -3394,7 +3439,7 @@ E0001         ; Case_Ignorable # Cf       LANGUAGE TAG
        E0020..E007F  ; Case_Ignorable # Cf  [96] TAG SPACE..CANCEL TAG
        E0100..E01EF  ; Case_Ignorable # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
        
       -# Total code points: 2602
       +# Total code points: 2707
        
        # ================================================
        
       @@ -6617,6 +6662,7 @@ FFDA..FFDC    ; ID_Start # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
        111DC         ; ID_Start # Lo       SHARADA HEADSTROKE
        11200..11211  ; ID_Start # Lo  [18] KHOJKI LETTER A..KHOJKI LETTER JJA
        11213..1122B  ; ID_Start # Lo  [25] KHOJKI LETTER NYA..KHOJKI LETTER LLA
       +1123F..11240  ; ID_Start # Lo   [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I
        11280..11286  ; ID_Start # Lo   [7] MULTANI LETTER A..MULTANI LETTER GA
        11288         ; ID_Start # Lo       MULTANI LETTER GHA
        1128A..1128D  ; ID_Start # Lo   [4] MULTANI LETTER CA..MULTANI LETTER JJA
       @@ -6679,12 +6725,16 @@ FFDA..FFDC    ; ID_Start # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
        11D6A..11D89  ; ID_Start # Lo  [32] GUNJALA GONDI LETTER OO..GUNJALA GONDI LETTER SA
        11D98         ; ID_Start # Lo       GUNJALA GONDI OM
        11EE0..11EF2  ; ID_Start # Lo  [19] MAKASAR LETTER KA..MAKASAR ANGKA
       +11F02         ; ID_Start # Lo       KAWI SIGN REPHA
       +11F04..11F10  ; ID_Start # Lo  [13] KAWI LETTER A..KAWI LETTER O
       +11F12..11F33  ; ID_Start # Lo  [34] KAWI LETTER KA..KAWI LETTER JNYA
        11FB0         ; ID_Start # Lo       LISU LETTER YHA
        12000..12399  ; ID_Start # Lo [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U
        12400..1246E  ; ID_Start # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM
        12480..12543  ; ID_Start # Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU
        12F90..12FF0  ; ID_Start # Lo  [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
       -13000..1342E  ; ID_Start # Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032
       +13000..1342F  ; ID_Start # Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D
       +13441..13446  ; ID_Start # Lo   [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN
        14400..14646  ; ID_Start # Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530
        16800..16A38  ; ID_Start # Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
        16A40..16A5E  ; ID_Start # Lo  [31] MRO LETTER TA..MRO LETTER TEK
       @@ -6707,7 +6757,9 @@ FFDA..FFDC    ; ID_Start # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
        1AFF5..1AFFB  ; ID_Start # Lm   [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5
        1AFFD..1AFFE  ; ID_Start # Lm   [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
        1B000..1B122  ; ID_Start # Lo [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU
       +1B132         ; ID_Start # Lo       HIRAGANA LETTER SMALL KO
        1B150..1B152  ; ID_Start # Lo   [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
       +1B155         ; ID_Start # Lo       KATAKANA LETTER SMALL KO
        1B164..1B167  ; ID_Start # Lo   [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
        1B170..1B2FB  ; ID_Start # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
        1BC00..1BC6A  ; ID_Start # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
       @@ -6747,11 +6799,15 @@ FFDA..FFDC    ; ID_Start # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
        1DF00..1DF09  ; ID_Start # L&  [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
        1DF0A         ; ID_Start # Lo       LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
        1DF0B..1DF1E  ; ID_Start # L&  [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
       +1DF25..1DF2A  ; ID_Start # L&   [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
       +1E030..1E06D  ; ID_Start # Lm  [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
        1E100..1E12C  ; ID_Start # Lo  [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
        1E137..1E13D  ; ID_Start # Lm   [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
        1E14E         ; ID_Start # Lo       NYIAKENG PUACHUE HMONG LOGOGRAM NYAJ
        1E290..1E2AD  ; ID_Start # Lo  [30] TOTO LETTER PA..TOTO LETTER A
        1E2C0..1E2EB  ; ID_Start # Lo  [44] WANCHO LETTER AA..WANCHO LETTER YIH
       +1E4D0..1E4EA  ; ID_Start # Lo  [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL
       +1E4EB         ; ID_Start # Lm       NAG MUNDARI SIGN OJOD
        1E7E0..1E7E6  ; ID_Start # Lo   [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
        1E7E8..1E7EB  ; ID_Start # Lo   [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
        1E7ED..1E7EE  ; ID_Start # Lo   [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
       @@ -6793,14 +6849,15 @@ FFDA..FFDC    ; ID_Start # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
        1EEA5..1EEA9  ; ID_Start # Lo   [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH
        1EEAB..1EEBB  ; ID_Start # Lo  [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN
        20000..2A6DF  ; ID_Start # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
       -2A700..2B738  ; ID_Start # Lo [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
       +2A700..2B739  ; ID_Start # Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739
        2B740..2B81D  ; ID_Start # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
        2B820..2CEA1  ; ID_Start # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
        2CEB0..2EBE0  ; ID_Start # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
        2F800..2FA1D  ; ID_Start # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
        30000..3134A  ; ID_Start # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
       +31350..323AF  ; ID_Start # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
        
       -# Total code points: 131997
       +# Total code points: 136345
        
        # ================================================
        
       @@ -7083,6 +7140,7 @@ FFDA..FFDC    ; ID_Start # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
        0CE2..0CE3    ; ID_Continue # Mn   [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
        0CE6..0CEF    ; ID_Continue # Nd  [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
        0CF1..0CF2    ; ID_Continue # Lo   [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
       +0CF3          ; ID_Continue # Mc       KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
        0D00..0D01    ; ID_Continue # Mn   [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
        0D02..0D03    ; ID_Continue # Mc   [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
        0D04..0D0C    ; ID_Continue # Lo   [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L
       @@ -7136,7 +7194,7 @@ FFDA..FFDC    ; ID_Start # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL
        0EBD          ; ID_Continue # Lo       LAO SEMIVOWEL SIGN NYO
        0EC0..0EC4    ; ID_Continue # Lo   [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI
        0EC6          ; ID_Continue # Lm       LAO KO LA
       -0EC8..0ECD    ; ID_Continue # Mn   [6] LAO TONE MAI EK..LAO NIGGAHITA
       +0EC8..0ECE    ; ID_Continue # Mn   [7] LAO TONE MAI EK..LAO YAMAKKAN
        0ED0..0ED9    ; ID_Continue # Nd  [10] LAO DIGIT ZERO..LAO DIGIT NINE
        0EDC..0EDF    ; ID_Continue # Lo   [4] LAO HO NO..LAO LETTER KHMU NYO
        0F00          ; ID_Continue # Lo       TIBETAN SYLLABLE OM
       @@ -7719,6 +7777,7 @@ FFDA..FFDC    ; ID_Continue # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN
        10E80..10EA9  ; ID_Continue # Lo  [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET
        10EAB..10EAC  ; ID_Continue # Mn   [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
        10EB0..10EB1  ; ID_Continue # Lo   [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
       +10EFD..10EFF  ; ID_Continue # Mn   [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
        10F00..10F1C  ; ID_Continue # Lo  [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
        10F27         ; ID_Continue # Lo       OLD SOGDIAN LIGATURE AYIN-DALETH
        10F30..10F45  ; ID_Continue # Lo  [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN
       @@ -7781,6 +7840,8 @@ FFDA..FFDC    ; ID_Continue # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN
        11235         ; ID_Continue # Mc       KHOJKI SIGN VIRAMA
        11236..11237  ; ID_Continue # Mn   [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA
        1123E         ; ID_Continue # Mn       KHOJKI SIGN SUKUN
       +1123F..11240  ; ID_Continue # Lo   [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I
       +11241         ; ID_Continue # Mn       KHOJKI VOWEL SIGN VOCALIC R
        11280..11286  ; ID_Continue # Lo   [7] MULTANI LETTER A..MULTANI LETTER GA
        11288         ; ID_Continue # Lo       MULTANI LETTER GHA
        1128A..1128D  ; ID_Continue # Lo   [4] MULTANI LETTER CA..MULTANI LETTER JJA
       @@ -7963,12 +8024,27 @@ FFDA..FFDC    ; ID_Continue # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN
        11EE0..11EF2  ; ID_Continue # Lo  [19] MAKASAR LETTER KA..MAKASAR ANGKA
        11EF3..11EF4  ; ID_Continue # Mn   [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
        11EF5..11EF6  ; ID_Continue # Mc   [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O
       +11F00..11F01  ; ID_Continue # Mn   [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
       +11F02         ; ID_Continue # Lo       KAWI SIGN REPHA
       +11F03         ; ID_Continue # Mc       KAWI SIGN VISARGA
       +11F04..11F10  ; ID_Continue # Lo  [13] KAWI LETTER A..KAWI LETTER O
       +11F12..11F33  ; ID_Continue # Lo  [34] KAWI LETTER KA..KAWI LETTER JNYA
       +11F34..11F35  ; ID_Continue # Mc   [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
       +11F36..11F3A  ; ID_Continue # Mn   [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
       +11F3E..11F3F  ; ID_Continue # Mc   [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI
       +11F40         ; ID_Continue # Mn       KAWI VOWEL SIGN EU
       +11F41         ; ID_Continue # Mc       KAWI SIGN KILLER
       +11F42         ; ID_Continue # Mn       KAWI CONJOINER
       +11F50..11F59  ; ID_Continue # Nd  [10] KAWI DIGIT ZERO..KAWI DIGIT NINE
        11FB0         ; ID_Continue # Lo       LISU LETTER YHA
        12000..12399  ; ID_Continue # Lo [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U
        12400..1246E  ; ID_Continue # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM
        12480..12543  ; ID_Continue # Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU
        12F90..12FF0  ; ID_Continue # Lo  [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
       -13000..1342E  ; ID_Continue # Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032
       +13000..1342F  ; ID_Continue # Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D
       +13440         ; ID_Continue # Mn       EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY
       +13441..13446  ; ID_Continue # Lo   [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN
       +13447..13455  ; ID_Continue # Mn  [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED
        14400..14646  ; ID_Continue # Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530
        16800..16A38  ; ID_Continue # Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
        16A40..16A5E  ; ID_Continue # Lo  [31] MRO LETTER TA..MRO LETTER TEK
       @@ -8001,7 +8077,9 @@ FFDA..FFDC    ; ID_Continue # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN
        1AFF5..1AFFB  ; ID_Continue # Lm   [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5
        1AFFD..1AFFE  ; ID_Continue # Lm   [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
        1B000..1B122  ; ID_Continue # Lo [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU
       +1B132         ; ID_Continue # Lo       HIRAGANA LETTER SMALL KO
        1B150..1B152  ; ID_Continue # Lo   [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
       +1B155         ; ID_Continue # Lo       KATAKANA LETTER SMALL KO
        1B164..1B167  ; ID_Continue # Lo   [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
        1B170..1B2FB  ; ID_Continue # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
        1BC00..1BC6A  ; ID_Continue # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
       @@ -8058,11 +8136,14 @@ FFDA..FFDC    ; ID_Continue # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN
        1DF00..1DF09  ; ID_Continue # L&  [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
        1DF0A         ; ID_Continue # Lo       LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
        1DF0B..1DF1E  ; ID_Continue # L&  [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
       +1DF25..1DF2A  ; ID_Continue # L&   [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
        1E000..1E006  ; ID_Continue # Mn   [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE
        1E008..1E018  ; ID_Continue # Mn  [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU
        1E01B..1E021  ; ID_Continue # Mn   [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
        1E023..1E024  ; ID_Continue # Mn   [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
        1E026..1E02A  ; ID_Continue # Mn   [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
       +1E030..1E06D  ; ID_Continue # Lm  [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
       +1E08F         ; ID_Continue # Mn       COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
        1E100..1E12C  ; ID_Continue # Lo  [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
        1E130..1E136  ; ID_Continue # Mn   [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
        1E137..1E13D  ; ID_Continue # Lm   [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
       @@ -8073,6 +8154,10 @@ FFDA..FFDC    ; ID_Continue # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN
        1E2C0..1E2EB  ; ID_Continue # Lo  [44] WANCHO LETTER AA..WANCHO LETTER YIH
        1E2EC..1E2EF  ; ID_Continue # Mn   [4] WANCHO TONE TUP..WANCHO TONE KOINI
        1E2F0..1E2F9  ; ID_Continue # Nd  [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE
       +1E4D0..1E4EA  ; ID_Continue # Lo  [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL
       +1E4EB         ; ID_Continue # Lm       NAG MUNDARI SIGN OJOD
       +1E4EC..1E4EF  ; ID_Continue # Mn   [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH
       +1E4F0..1E4F9  ; ID_Continue # Nd  [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE
        1E7E0..1E7E6  ; ID_Continue # Lo   [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
        1E7E8..1E7EB  ; ID_Continue # Lo   [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
        1E7ED..1E7EE  ; ID_Continue # Lo   [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
       @@ -8118,15 +8203,16 @@ FFDA..FFDC    ; ID_Continue # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HAN
        1EEAB..1EEBB  ; ID_Continue # Lo  [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN
        1FBF0..1FBF9  ; ID_Continue # Nd  [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE
        20000..2A6DF  ; ID_Continue # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
       -2A700..2B738  ; ID_Continue # Lo [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
       +2A700..2B739  ; ID_Continue # Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739
        2B740..2B81D  ; ID_Continue # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
        2B820..2CEA1  ; ID_Continue # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
        2CEB0..2EBE0  ; ID_Continue # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
        2F800..2FA1D  ; ID_Continue # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
        30000..3134A  ; ID_Continue # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
       +31350..323AF  ; ID_Continue # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
        E0100..E01EF  ; ID_Continue # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
        
       -# Total code points: 135072
       +# Total code points: 139482
        
        # ================================================
        
       @@ -8685,6 +8771,7 @@ FFDA..FFDC    ; XID_Start # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGU
        111DC         ; XID_Start # Lo       SHARADA HEADSTROKE
        11200..11211  ; XID_Start # Lo  [18] KHOJKI LETTER A..KHOJKI LETTER JJA
        11213..1122B  ; XID_Start # Lo  [25] KHOJKI LETTER NYA..KHOJKI LETTER LLA
       +1123F..11240  ; XID_Start # Lo   [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I
        11280..11286  ; XID_Start # Lo   [7] MULTANI LETTER A..MULTANI LETTER GA
        11288         ; XID_Start # Lo       MULTANI LETTER GHA
        1128A..1128D  ; XID_Start # Lo   [4] MULTANI LETTER CA..MULTANI LETTER JJA
       @@ -8747,12 +8834,16 @@ FFDA..FFDC    ; XID_Start # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGU
        11D6A..11D89  ; XID_Start # Lo  [32] GUNJALA GONDI LETTER OO..GUNJALA GONDI LETTER SA
        11D98         ; XID_Start # Lo       GUNJALA GONDI OM
        11EE0..11EF2  ; XID_Start # Lo  [19] MAKASAR LETTER KA..MAKASAR ANGKA
       +11F02         ; XID_Start # Lo       KAWI SIGN REPHA
       +11F04..11F10  ; XID_Start # Lo  [13] KAWI LETTER A..KAWI LETTER O
       +11F12..11F33  ; XID_Start # Lo  [34] KAWI LETTER KA..KAWI LETTER JNYA
        11FB0         ; XID_Start # Lo       LISU LETTER YHA
        12000..12399  ; XID_Start # Lo [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U
        12400..1246E  ; XID_Start # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM
        12480..12543  ; XID_Start # Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU
        12F90..12FF0  ; XID_Start # Lo  [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
       -13000..1342E  ; XID_Start # Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032
       +13000..1342F  ; XID_Start # Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D
       +13441..13446  ; XID_Start # Lo   [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN
        14400..14646  ; XID_Start # Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530
        16800..16A38  ; XID_Start # Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
        16A40..16A5E  ; XID_Start # Lo  [31] MRO LETTER TA..MRO LETTER TEK
       @@ -8775,7 +8866,9 @@ FFDA..FFDC    ; XID_Start # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGU
        1AFF5..1AFFB  ; XID_Start # Lm   [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5
        1AFFD..1AFFE  ; XID_Start # Lm   [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
        1B000..1B122  ; XID_Start # Lo [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU
       +1B132         ; XID_Start # Lo       HIRAGANA LETTER SMALL KO
        1B150..1B152  ; XID_Start # Lo   [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
       +1B155         ; XID_Start # Lo       KATAKANA LETTER SMALL KO
        1B164..1B167  ; XID_Start # Lo   [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
        1B170..1B2FB  ; XID_Start # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
        1BC00..1BC6A  ; XID_Start # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
       @@ -8815,11 +8908,15 @@ FFDA..FFDC    ; XID_Start # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGU
        1DF00..1DF09  ; XID_Start # L&  [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
        1DF0A         ; XID_Start # Lo       LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
        1DF0B..1DF1E  ; XID_Start # L&  [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
       +1DF25..1DF2A  ; XID_Start # L&   [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
       +1E030..1E06D  ; XID_Start # Lm  [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
        1E100..1E12C  ; XID_Start # Lo  [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
        1E137..1E13D  ; XID_Start # Lm   [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
        1E14E         ; XID_Start # Lo       NYIAKENG PUACHUE HMONG LOGOGRAM NYAJ
        1E290..1E2AD  ; XID_Start # Lo  [30] TOTO LETTER PA..TOTO LETTER A
        1E2C0..1E2EB  ; XID_Start # Lo  [44] WANCHO LETTER AA..WANCHO LETTER YIH
       +1E4D0..1E4EA  ; XID_Start # Lo  [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL
       +1E4EB         ; XID_Start # Lm       NAG MUNDARI SIGN OJOD
        1E7E0..1E7E6  ; XID_Start # Lo   [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
        1E7E8..1E7EB  ; XID_Start # Lo   [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
        1E7ED..1E7EE  ; XID_Start # Lo   [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
       @@ -8861,14 +8958,15 @@ FFDA..FFDC    ; XID_Start # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGU
        1EEA5..1EEA9  ; XID_Start # Lo   [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH
        1EEAB..1EEBB  ; XID_Start # Lo  [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN
        20000..2A6DF  ; XID_Start # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
       -2A700..2B738  ; XID_Start # Lo [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
       +2A700..2B739  ; XID_Start # Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739
        2B740..2B81D  ; XID_Start # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
        2B820..2CEA1  ; XID_Start # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
        2CEB0..2EBE0  ; XID_Start # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
        2F800..2FA1D  ; XID_Start # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
        30000..3134A  ; XID_Start # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
       +31350..323AF  ; XID_Start # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
        
       -# Total code points: 131974
       +# Total code points: 136322
        
        # ================================================
        
       @@ -9147,6 +9245,7 @@ FFDA..FFDC    ; XID_Start # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGU
        0CE2..0CE3    ; XID_Continue # Mn   [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
        0CE6..0CEF    ; XID_Continue # Nd  [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
        0CF1..0CF2    ; XID_Continue # Lo   [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
       +0CF3          ; XID_Continue # Mc       KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
        0D00..0D01    ; XID_Continue # Mn   [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
        0D02..0D03    ; XID_Continue # Mc   [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
        0D04..0D0C    ; XID_Continue # Lo   [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L
       @@ -9200,7 +9299,7 @@ FFDA..FFDC    ; XID_Start # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGU
        0EBD          ; XID_Continue # Lo       LAO SEMIVOWEL SIGN NYO
        0EC0..0EC4    ; XID_Continue # Lo   [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI
        0EC6          ; XID_Continue # Lm       LAO KO LA
       -0EC8..0ECD    ; XID_Continue # Mn   [6] LAO TONE MAI EK..LAO NIGGAHITA
       +0EC8..0ECE    ; XID_Continue # Mn   [7] LAO TONE MAI EK..LAO YAMAKKAN
        0ED0..0ED9    ; XID_Continue # Nd  [10] LAO DIGIT ZERO..LAO DIGIT NINE
        0EDC..0EDF    ; XID_Continue # Lo   [4] LAO HO NO..LAO LETTER KHMU NYO
        0F00          ; XID_Continue # Lo       TIBETAN SYLLABLE OM
       @@ -9788,6 +9887,7 @@ FFDA..FFDC    ; XID_Continue # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HA
        10E80..10EA9  ; XID_Continue # Lo  [42] YEZIDI LETTER ELIF..YEZIDI LETTER ET
        10EAB..10EAC  ; XID_Continue # Mn   [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
        10EB0..10EB1  ; XID_Continue # Lo   [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
       +10EFD..10EFF  ; XID_Continue # Mn   [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
        10F00..10F1C  ; XID_Continue # Lo  [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
        10F27         ; XID_Continue # Lo       OLD SOGDIAN LIGATURE AYIN-DALETH
        10F30..10F45  ; XID_Continue # Lo  [22] SOGDIAN LETTER ALEPH..SOGDIAN INDEPENDENT SHIN
       @@ -9850,6 +9950,8 @@ FFDA..FFDC    ; XID_Continue # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HA
        11235         ; XID_Continue # Mc       KHOJKI SIGN VIRAMA
        11236..11237  ; XID_Continue # Mn   [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA
        1123E         ; XID_Continue # Mn       KHOJKI SIGN SUKUN
       +1123F..11240  ; XID_Continue # Lo   [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I
       +11241         ; XID_Continue # Mn       KHOJKI VOWEL SIGN VOCALIC R
        11280..11286  ; XID_Continue # Lo   [7] MULTANI LETTER A..MULTANI LETTER GA
        11288         ; XID_Continue # Lo       MULTANI LETTER GHA
        1128A..1128D  ; XID_Continue # Lo   [4] MULTANI LETTER CA..MULTANI LETTER JJA
       @@ -10032,12 +10134,27 @@ FFDA..FFDC    ; XID_Continue # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HA
        11EE0..11EF2  ; XID_Continue # Lo  [19] MAKASAR LETTER KA..MAKASAR ANGKA
        11EF3..11EF4  ; XID_Continue # Mn   [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
        11EF5..11EF6  ; XID_Continue # Mc   [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O
       +11F00..11F01  ; XID_Continue # Mn   [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
       +11F02         ; XID_Continue # Lo       KAWI SIGN REPHA
       +11F03         ; XID_Continue # Mc       KAWI SIGN VISARGA
       +11F04..11F10  ; XID_Continue # Lo  [13] KAWI LETTER A..KAWI LETTER O
       +11F12..11F33  ; XID_Continue # Lo  [34] KAWI LETTER KA..KAWI LETTER JNYA
       +11F34..11F35  ; XID_Continue # Mc   [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
       +11F36..11F3A  ; XID_Continue # Mn   [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
       +11F3E..11F3F  ; XID_Continue # Mc   [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI
       +11F40         ; XID_Continue # Mn       KAWI VOWEL SIGN EU
       +11F41         ; XID_Continue # Mc       KAWI SIGN KILLER
       +11F42         ; XID_Continue # Mn       KAWI CONJOINER
       +11F50..11F59  ; XID_Continue # Nd  [10] KAWI DIGIT ZERO..KAWI DIGIT NINE
        11FB0         ; XID_Continue # Lo       LISU LETTER YHA
        12000..12399  ; XID_Continue # Lo [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U
        12400..1246E  ; XID_Continue # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM
        12480..12543  ; XID_Continue # Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU
        12F90..12FF0  ; XID_Continue # Lo  [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
       -13000..1342E  ; XID_Continue # Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032
       +13000..1342F  ; XID_Continue # Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D
       +13440         ; XID_Continue # Mn       EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY
       +13441..13446  ; XID_Continue # Lo   [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN
       +13447..13455  ; XID_Continue # Mn  [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED
        14400..14646  ; XID_Continue # Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530
        16800..16A38  ; XID_Continue # Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
        16A40..16A5E  ; XID_Continue # Lo  [31] MRO LETTER TA..MRO LETTER TEK
       @@ -10070,7 +10187,9 @@ FFDA..FFDC    ; XID_Continue # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HA
        1AFF5..1AFFB  ; XID_Continue # Lm   [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5
        1AFFD..1AFFE  ; XID_Continue # Lm   [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
        1B000..1B122  ; XID_Continue # Lo [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU
       +1B132         ; XID_Continue # Lo       HIRAGANA LETTER SMALL KO
        1B150..1B152  ; XID_Continue # Lo   [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
       +1B155         ; XID_Continue # Lo       KATAKANA LETTER SMALL KO
        1B164..1B167  ; XID_Continue # Lo   [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
        1B170..1B2FB  ; XID_Continue # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
        1BC00..1BC6A  ; XID_Continue # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
       @@ -10127,11 +10246,14 @@ FFDA..FFDC    ; XID_Continue # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HA
        1DF00..1DF09  ; XID_Continue # L&  [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
        1DF0A         ; XID_Continue # Lo       LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
        1DF0B..1DF1E  ; XID_Continue # L&  [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
       +1DF25..1DF2A  ; XID_Continue # L&   [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
        1E000..1E006  ; XID_Continue # Mn   [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE
        1E008..1E018  ; XID_Continue # Mn  [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU
        1E01B..1E021  ; XID_Continue # Mn   [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
        1E023..1E024  ; XID_Continue # Mn   [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
        1E026..1E02A  ; XID_Continue # Mn   [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
       +1E030..1E06D  ; XID_Continue # Lm  [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
       +1E08F         ; XID_Continue # Mn       COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
        1E100..1E12C  ; XID_Continue # Lo  [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
        1E130..1E136  ; XID_Continue # Mn   [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
        1E137..1E13D  ; XID_Continue # Lm   [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
       @@ -10142,6 +10264,10 @@ FFDA..FFDC    ; XID_Continue # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HA
        1E2C0..1E2EB  ; XID_Continue # Lo  [44] WANCHO LETTER AA..WANCHO LETTER YIH
        1E2EC..1E2EF  ; XID_Continue # Mn   [4] WANCHO TONE TUP..WANCHO TONE KOINI
        1E2F0..1E2F9  ; XID_Continue # Nd  [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE
       +1E4D0..1E4EA  ; XID_Continue # Lo  [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL
       +1E4EB         ; XID_Continue # Lm       NAG MUNDARI SIGN OJOD
       +1E4EC..1E4EF  ; XID_Continue # Mn   [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH
       +1E4F0..1E4F9  ; XID_Continue # Nd  [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE
        1E7E0..1E7E6  ; XID_Continue # Lo   [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
        1E7E8..1E7EB  ; XID_Continue # Lo   [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
        1E7ED..1E7EE  ; XID_Continue # Lo   [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
       @@ -10187,15 +10313,16 @@ FFDA..FFDC    ; XID_Continue # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HA
        1EEAB..1EEBB  ; XID_Continue # Lo  [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN
        1FBF0..1FBF9  ; XID_Continue # Nd  [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE
        20000..2A6DF  ; XID_Continue # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
       -2A700..2B738  ; XID_Continue # Lo [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
       +2A700..2B739  ; XID_Continue # Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739
        2B740..2B81D  ; XID_Continue # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
        2B820..2CEA1  ; XID_Continue # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
        2CEB0..2EBE0  ; XID_Continue # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
        2F800..2FA1D  ; XID_Continue # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
        30000..3134A  ; XID_Continue # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
       +31350..323AF  ; XID_Continue # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
        E0100..E01EF  ; XID_Continue # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
        
       -# Total code points: 135053
       +# Total code points: 139463
        
        # ================================================
        
       @@ -10206,7 +10333,7 @@ E0100..E01EF  ; XID_Continue # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTO
        #  + Variation_Selector
        #  - White_Space
        #  - FFF9..FFFB (Interlinear annotation format characters)
       -#  - 13430..13438 (Egyptian hieroglyph format characters)
       +#  - 13430..13440 (Egyptian hieroglyph format characters)
        #  - Prepended_Concatenation_Mark (Exceptional format characters that should be visible)
        
        00AD          ; Default_Ignorable_Code_Point # Cf       SOFT HYPHEN
       @@ -10351,7 +10478,7 @@ E01F0..E0FFF  ; Default_Ignorable_Code_Point # Cn [3600] <reserved-E01F0>..<rese
        0E47..0E4E    ; Grapheme_Extend # Mn   [8] THAI CHARACTER MAITAIKHU..THAI CHARACTER YAMAKKAN
        0EB1          ; Grapheme_Extend # Mn       LAO VOWEL SIGN MAI KAN
        0EB4..0EBC    ; Grapheme_Extend # Mn   [9] LAO VOWEL SIGN I..LAO SEMIVOWEL SIGN LO
       -0EC8..0ECD    ; Grapheme_Extend # Mn   [6] LAO TONE MAI EK..LAO NIGGAHITA
       +0EC8..0ECE    ; Grapheme_Extend # Mn   [7] LAO TONE MAI EK..LAO YAMAKKAN
        0F18..0F19    ; Grapheme_Extend # Mn   [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS
        0F35          ; Grapheme_Extend # Mn       TIBETAN MARK NGAS BZUNG NYI ZLA
        0F37          ; Grapheme_Extend # Mn       TIBETAN MARK NGAS BZUNG SGOR RTAGS
       @@ -10490,6 +10617,7 @@ FF9E..FF9F    ; Grapheme_Extend # Lm   [2] HALFWIDTH KATAKANA VOICED SOUND MARK.
        10AE5..10AE6  ; Grapheme_Extend # Mn   [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW
        10D24..10D27  ; Grapheme_Extend # Mn   [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI
        10EAB..10EAC  ; Grapheme_Extend # Mn   [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
       +10EFD..10EFF  ; Grapheme_Extend # Mn   [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
        10F46..10F50  ; Grapheme_Extend # Mn  [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW
        10F82..10F85  ; Grapheme_Extend # Mn   [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW
        11001         ; Grapheme_Extend # Mn       BRAHMI SIGN ANUSVARA
       @@ -10512,6 +10640,7 @@ FF9E..FF9F    ; Grapheme_Extend # Lm   [2] HALFWIDTH KATAKANA VOICED SOUND MARK.
        11234         ; Grapheme_Extend # Mn       KHOJKI SIGN ANUSVARA
        11236..11237  ; Grapheme_Extend # Mn   [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA
        1123E         ; Grapheme_Extend # Mn       KHOJKI SIGN SUKUN
       +11241         ; Grapheme_Extend # Mn       KHOJKI VOWEL SIGN VOCALIC R
        112DF         ; Grapheme_Extend # Mn       KHUDAWADI SIGN ANUSVARA
        112E3..112EA  ; Grapheme_Extend # Mn   [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA
        11300..11301  ; Grapheme_Extend # Mn   [2] GRANTHA SIGN COMBINING ANUSVARA ABOVE..GRANTHA SIGN CANDRABINDU
       @@ -10579,6 +10708,12 @@ FF9E..FF9F    ; Grapheme_Extend # Lm   [2] HALFWIDTH KATAKANA VOICED SOUND MARK.
        11D95         ; Grapheme_Extend # Mn       GUNJALA GONDI SIGN ANUSVARA
        11D97         ; Grapheme_Extend # Mn       GUNJALA GONDI VIRAMA
        11EF3..11EF4  ; Grapheme_Extend # Mn   [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
       +11F00..11F01  ; Grapheme_Extend # Mn   [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
       +11F36..11F3A  ; Grapheme_Extend # Mn   [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
       +11F40         ; Grapheme_Extend # Mn       KAWI VOWEL SIGN EU
       +11F42         ; Grapheme_Extend # Mn       KAWI CONJOINER
       +13440         ; Grapheme_Extend # Mn       EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY
       +13447..13455  ; Grapheme_Extend # Mn  [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED
        16AF0..16AF4  ; Grapheme_Extend # Mn   [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE
        16B30..16B36  ; Grapheme_Extend # Mn   [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM
        16F4F         ; Grapheme_Extend # Mn       MIAO SIGN CONSONANT MODIFIER BAR
       @@ -10605,15 +10740,17 @@ FF9E..FF9F    ; Grapheme_Extend # Lm   [2] HALFWIDTH KATAKANA VOICED SOUND MARK.
        1E01B..1E021  ; Grapheme_Extend # Mn   [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
        1E023..1E024  ; Grapheme_Extend # Mn   [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
        1E026..1E02A  ; Grapheme_Extend # Mn   [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
       +1E08F         ; Grapheme_Extend # Mn       COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
        1E130..1E136  ; Grapheme_Extend # Mn   [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
        1E2AE         ; Grapheme_Extend # Mn       TOTO SIGN RISING TONE
        1E2EC..1E2EF  ; Grapheme_Extend # Mn   [4] WANCHO TONE TUP..WANCHO TONE KOINI
       +1E4EC..1E4EF  ; Grapheme_Extend # Mn   [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH
        1E8D0..1E8D6  ; Grapheme_Extend # Mn   [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS
        1E944..1E94A  ; Grapheme_Extend # Mn   [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA
        E0020..E007F  ; Grapheme_Extend # Cf  [96] TAG SPACE..CANCEL TAG
        E0100..E01EF  ; Grapheme_Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
        
       -# Total code points: 2090
       +# Total code points: 2125
        
        # ================================================
        
       @@ -10913,6 +11050,7 @@ E0100..E01EF  ; Grapheme_Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELE
        0CE0..0CE1    ; Grapheme_Base # Lo   [2] KANNADA LETTER VOCALIC RR..KANNADA LETTER VOCALIC LL
        0CE6..0CEF    ; Grapheme_Base # Nd  [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
        0CF1..0CF2    ; Grapheme_Base # Lo   [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
       +0CF3          ; Grapheme_Base # Mc       KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
        0D02..0D03    ; Grapheme_Base # Mc   [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
        0D04..0D0C    ; Grapheme_Base # Lo   [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L
        0D0E..0D10    ; Grapheme_Base # Lo   [3] MALAYALAM LETTER E..MALAYALAM LETTER AI
       @@ -11965,6 +12103,7 @@ FFFC..FFFD    ; Grapheme_Base # So   [2] OBJECT REPLACEMENT CHARACTER..REPLACEME
        11232..11233  ; Grapheme_Base # Mc   [2] KHOJKI VOWEL SIGN O..KHOJKI VOWEL SIGN AU
        11235         ; Grapheme_Base # Mc       KHOJKI SIGN VIRAMA
        11238..1123D  ; Grapheme_Base # Po   [6] KHOJKI DANDA..KHOJKI ABBREVIATION SIGN
       +1123F..11240  ; Grapheme_Base # Lo   [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I
        11280..11286  ; Grapheme_Base # Lo   [7] MULTANI LETTER A..MULTANI LETTER GA
        11288         ; Grapheme_Base # Lo       MULTANI LETTER GHA
        1128A..1128D  ; Grapheme_Base # Lo   [4] MULTANI LETTER CA..MULTANI LETTER JJA
       @@ -12080,6 +12219,7 @@ FFFC..FFFD    ; Grapheme_Base # So   [2] OBJECT REPLACEMENT CHARACTER..REPLACEME
        11A9D         ; Grapheme_Base # Lo       SOYOMBO MARK PLUTA
        11A9E..11AA2  ; Grapheme_Base # Po   [5] SOYOMBO HEAD MARK WITH MOON AND SUN AND TRIPLE FLAME..SOYOMBO TERMINAL MARK-2
        11AB0..11AF8  ; Grapheme_Base # Lo  [73] CANADIAN SYLLABICS NATTILIK HI..PAU CIN HAU GLOTTAL STOP FINAL
       +11B00..11B09  ; Grapheme_Base # Po  [10] DEVANAGARI HEAD MARK..DEVANAGARI SIGN MINDU
        11C00..11C08  ; Grapheme_Base # Lo   [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L
        11C0A..11C2E  ; Grapheme_Base # Lo  [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA
        11C2F         ; Grapheme_Base # Mc       BHAIKSUKI VOWEL SIGN AA
       @@ -12109,6 +12249,15 @@ FFFC..FFFD    ; Grapheme_Base # So   [2] OBJECT REPLACEMENT CHARACTER..REPLACEME
        11EE0..11EF2  ; Grapheme_Base # Lo  [19] MAKASAR LETTER KA..MAKASAR ANGKA
        11EF5..11EF6  ; Grapheme_Base # Mc   [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O
        11EF7..11EF8  ; Grapheme_Base # Po   [2] MAKASAR PASSIMBANG..MAKASAR END OF SECTION
       +11F02         ; Grapheme_Base # Lo       KAWI SIGN REPHA
       +11F03         ; Grapheme_Base # Mc       KAWI SIGN VISARGA
       +11F04..11F10  ; Grapheme_Base # Lo  [13] KAWI LETTER A..KAWI LETTER O
       +11F12..11F33  ; Grapheme_Base # Lo  [34] KAWI LETTER KA..KAWI LETTER JNYA
       +11F34..11F35  ; Grapheme_Base # Mc   [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
       +11F3E..11F3F  ; Grapheme_Base # Mc   [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI
       +11F41         ; Grapheme_Base # Mc       KAWI SIGN KILLER
       +11F43..11F4F  ; Grapheme_Base # Po  [13] KAWI DANDA..KAWI PUNCTUATION CLOSING SPIRAL
       +11F50..11F59  ; Grapheme_Base # Nd  [10] KAWI DIGIT ZERO..KAWI DIGIT NINE
        11FB0         ; Grapheme_Base # Lo       LISU LETTER YHA
        11FC0..11FD4  ; Grapheme_Base # No  [21] TAMIL FRACTION ONE THREE-HUNDRED-AND-TWENTIETH..TAMIL FRACTION DOWNSCALING FACTOR KIIZH
        11FD5..11FDC  ; Grapheme_Base # So   [8] TAMIL SIGN NEL..TAMIL SIGN MUKKURUNI
       @@ -12121,7 +12270,8 @@ FFFC..FFFD    ; Grapheme_Base # So   [2] OBJECT REPLACEMENT CHARACTER..REPLACEME
        12480..12543  ; Grapheme_Base # Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU
        12F90..12FF0  ; Grapheme_Base # Lo  [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
        12FF1..12FF2  ; Grapheme_Base # Po   [2] CYPRO-MINOAN SIGN CM301..CYPRO-MINOAN SIGN CM302
       -13000..1342E  ; Grapheme_Base # Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032
       +13000..1342F  ; Grapheme_Base # Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D
       +13441..13446  ; Grapheme_Base # Lo   [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN
        14400..14646  ; Grapheme_Base # Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530
        16800..16A38  ; Grapheme_Base # Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
        16A40..16A5E  ; Grapheme_Base # Lo  [31] MRO LETTER TA..MRO LETTER TEK
       @@ -12159,7 +12309,9 @@ FFFC..FFFD    ; Grapheme_Base # So   [2] OBJECT REPLACEMENT CHARACTER..REPLACEME
        1AFF5..1AFFB  ; Grapheme_Base # Lm   [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5
        1AFFD..1AFFE  ; Grapheme_Base # Lm   [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
        1B000..1B122  ; Grapheme_Base # Lo [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU
       +1B132         ; Grapheme_Base # Lo       HIRAGANA LETTER SMALL KO
        1B150..1B152  ; Grapheme_Base # Lo   [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
       +1B155         ; Grapheme_Base # Lo       KATAKANA LETTER SMALL KO
        1B164..1B167  ; Grapheme_Base # Lo   [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
        1B170..1B2FB  ; Grapheme_Base # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
        1BC00..1BC6A  ; Grapheme_Base # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
       @@ -12180,6 +12332,7 @@ FFFC..FFFD    ; Grapheme_Base # So   [2] OBJECT REPLACEMENT CHARACTER..REPLACEME
        1D1AE..1D1EA  ; Grapheme_Base # So  [61] MUSICAL SYMBOL PEDAL MARK..MUSICAL SYMBOL KORON
        1D200..1D241  ; Grapheme_Base # So  [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION SYMBOL-54
        1D245         ; Grapheme_Base # So       GREEK MUSICAL LEIMMA
       +1D2C0..1D2D3  ; Grapheme_Base # No  [20] KAKTOVIK NUMERAL ZERO..KAKTOVIK NUMERAL NINETEEN
        1D2E0..1D2F3  ; Grapheme_Base # No  [20] MAYAN NUMERAL ZERO..MAYAN NUMERAL NINETEEN
        1D300..1D356  ; Grapheme_Base # So  [87] MONOGRAM FOR EARTH..TETRAGRAM FOR FOSTERING
        1D360..1D378  ; Grapheme_Base # No  [25] COUNTING ROD UNIT DIGIT ONE..TALLY MARK FIVE
       @@ -12233,6 +12386,8 @@ FFFC..FFFD    ; Grapheme_Base # So   [2] OBJECT REPLACEMENT CHARACTER..REPLACEME
        1DF00..1DF09  ; Grapheme_Base # L&  [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
        1DF0A         ; Grapheme_Base # Lo       LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
        1DF0B..1DF1E  ; Grapheme_Base # L&  [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
       +1DF25..1DF2A  ; Grapheme_Base # L&   [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
       +1E030..1E06D  ; Grapheme_Base # Lm  [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
        1E100..1E12C  ; Grapheme_Base # Lo  [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
        1E137..1E13D  ; Grapheme_Base # Lm   [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
        1E140..1E149  ; Grapheme_Base # Nd  [10] NYIAKENG PUACHUE HMONG DIGIT ZERO..NYIAKENG PUACHUE HMONG DIGIT NINE
       @@ -12242,6 +12397,9 @@ FFFC..FFFD    ; Grapheme_Base # So   [2] OBJECT REPLACEMENT CHARACTER..REPLACEME
        1E2C0..1E2EB  ; Grapheme_Base # Lo  [44] WANCHO LETTER AA..WANCHO LETTER YIH
        1E2F0..1E2F9  ; Grapheme_Base # Nd  [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE
        1E2FF         ; Grapheme_Base # Sc       WANCHO NGUN SIGN
       +1E4D0..1E4EA  ; Grapheme_Base # Lo  [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL
       +1E4EB         ; Grapheme_Base # Lm       NAG MUNDARI SIGN OJOD
       +1E4F0..1E4F9  ; Grapheme_Base # Nd  [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE
        1E7E0..1E7E6  ; Grapheme_Base # Lo   [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
        1E7E8..1E7EB  ; Grapheme_Base # Lo   [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
        1E7ED..1E7EE  ; Grapheme_Base # Lo   [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
       @@ -12310,10 +12468,10 @@ FFFC..FFFD    ; Grapheme_Base # So   [2] OBJECT REPLACEMENT CHARACTER..REPLACEME
        1F300..1F3FA  ; Grapheme_Base # So [251] CYCLONE..AMPHORA
        1F3FB..1F3FF  ; Grapheme_Base # Sk   [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6
        1F400..1F6D7  ; Grapheme_Base # So [728] RAT..ELEVATOR
       -1F6DD..1F6EC  ; Grapheme_Base # So  [16] PLAYGROUND SLIDE..AIRPLANE ARRIVING
       +1F6DC..1F6EC  ; Grapheme_Base # So  [17] WIRELESS..AIRPLANE ARRIVING
        1F6F0..1F6FC  ; Grapheme_Base # So  [13] SATELLITE..ROLLER SKATE
       -1F700..1F773  ; Grapheme_Base # So [116] ALCHEMICAL SYMBOL FOR QUINTESSENCE..ALCHEMICAL SYMBOL FOR HALF OUNCE
       -1F780..1F7D8  ; Grapheme_Base # So  [89] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..NEGATIVE CIRCLED SQUARE
       +1F700..1F776  ; Grapheme_Base # So [119] ALCHEMICAL SYMBOL FOR QUINTESSENCE..LUNAR ECLIPSE
       +1F77B..1F7D9  ; Grapheme_Base # So  [95] HAUMEA..NINE POINTED WHITE STAR
        1F7E0..1F7EB  ; Grapheme_Base # So  [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE
        1F7F0         ; Grapheme_Base # So       HEAVY EQUALS SIGN
        1F800..1F80B  ; Grapheme_Base # So  [12] LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD..DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD
       @@ -12324,27 +12482,26 @@ FFFC..FFFD    ; Grapheme_Base # So   [2] OBJECT REPLACEMENT CHARACTER..REPLACEME
        1F8B0..1F8B1  ; Grapheme_Base # So   [2] ARROW POINTING UPWARDS THEN NORTH WEST..ARROW POINTING RIGHTWARDS THEN CURVING SOUTH WEST
        1F900..1FA53  ; Grapheme_Base # So [340] CIRCLED CROSS FORMEE WITH FOUR DOTS..BLACK CHESS KNIGHT-BISHOP
        1FA60..1FA6D  ; Grapheme_Base # So  [14] XIANGQI RED GENERAL..XIANGQI BLACK SOLDIER
       -1FA70..1FA74  ; Grapheme_Base # So   [5] BALLET SHOES..THONG SANDAL
       -1FA78..1FA7C  ; Grapheme_Base # So   [5] DROP OF BLOOD..CRUTCH
       -1FA80..1FA86  ; Grapheme_Base # So   [7] YO-YO..NESTING DOLLS
       -1FA90..1FAAC  ; Grapheme_Base # So  [29] RINGED PLANET..HAMSA
       -1FAB0..1FABA  ; Grapheme_Base # So  [11] FLY..NEST WITH EGGS
       -1FAC0..1FAC5  ; Grapheme_Base # So   [6] ANATOMICAL HEART..PERSON WITH CROWN
       -1FAD0..1FAD9  ; Grapheme_Base # So  [10] BLUEBERRIES..JAR
       -1FAE0..1FAE7  ; Grapheme_Base # So   [8] MELTING FACE..BUBBLES
       -1FAF0..1FAF6  ; Grapheme_Base # So   [7] HAND WITH INDEX FINGER AND THUMB CROSSED..HEART HANDS
       +1FA70..1FA7C  ; Grapheme_Base # So  [13] BALLET SHOES..CRUTCH
       +1FA80..1FA88  ; Grapheme_Base # So   [9] YO-YO..FLUTE
       +1FA90..1FABD  ; Grapheme_Base # So  [46] RINGED PLANET..WING
       +1FABF..1FAC5  ; Grapheme_Base # So   [7] GOOSE..PERSON WITH CROWN
       +1FACE..1FADB  ; Grapheme_Base # So  [14] MOOSE..PEA POD
       +1FAE0..1FAE8  ; Grapheme_Base # So   [9] MELTING FACE..SHAKING FACE
       +1FAF0..1FAF8  ; Grapheme_Base # So   [9] HAND WITH INDEX FINGER AND THUMB CROSSED..RIGHTWARDS PUSHING HAND
        1FB00..1FB92  ; Grapheme_Base # So [147] BLOCK SEXTANT-1..UPPER HALF INVERSE MEDIUM SHADE AND LOWER HALF BLOCK
        1FB94..1FBCA  ; Grapheme_Base # So  [55] LEFT HALF INVERSE MEDIUM SHADE AND RIGHT HALF BLOCK..WHITE UP-POINTING CHEVRON
        1FBF0..1FBF9  ; Grapheme_Base # Nd  [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE
        20000..2A6DF  ; Grapheme_Base # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
       -2A700..2B738  ; Grapheme_Base # Lo [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
       +2A700..2B739  ; Grapheme_Base # Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739
        2B740..2B81D  ; Grapheme_Base # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
        2B820..2CEA1  ; Grapheme_Base # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
        2CEB0..2EBE0  ; Grapheme_Base # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
        2F800..2FA1D  ; Grapheme_Base # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
        30000..3134A  ; Grapheme_Base # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
       +31350..323AF  ; Grapheme_Base # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
        
       -# Total code points: 142539
       +# Total code points: 146986
        
        # ================================================
        
       @@ -12410,7 +12567,9 @@ ABED          ; Grapheme_Link # Mn       MEETEI MAYEK APUN IYEK
        11C3F         ; Grapheme_Link # Mn       BHAIKSUKI SIGN VIRAMA
        11D44..11D45  ; Grapheme_Link # Mn   [2] MASARAM GONDI SIGN HALANTA..MASARAM GONDI VIRAMA
        11D97         ; Grapheme_Link # Mn       GUNJALA GONDI VIRAMA
       +11F41         ; Grapheme_Link # Mc       KAWI SIGN KILLER
       +11F42         ; Grapheme_Link # Mn       KAWI CONJOINER
        
       -# Total code points: 63
       +# Total code points: 65
        
        # EOF
 (DIR) diff --git a/data/EastAsianWidth.txt b/data/EastAsianWidth.txt
       @@ -1,6 +1,6 @@
       -# EastAsianWidth-14.0.0.txt
       -# Date: 2021-07-06, 09:58:53 GMT [KW, LI]
       -# © 2021 Unicode®, Inc.
       +# EastAsianWidth-15.0.0.txt
       +# Date: 2022-05-24, 17:40:20 GMT [KW, LI]
       +# © 2022 Unicode®, Inc.
        # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
        # For terms of use, see https://www.unicode.org/terms_of_use.html
        #
       @@ -534,6 +534,7 @@
        0CE2..0CE3;N     # Mn     [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
        0CE6..0CEF;N     # Nd    [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
        0CF1..0CF2;N     # Lo     [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
       +0CF3;N           # Mc         KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
        0D00..0D01;N     # Mn     [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
        0D02..0D03;N     # Mc     [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
        0D04..0D0C;N     # Lo     [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L
       @@ -595,7 +596,7 @@
        0EBD;N           # Lo         LAO SEMIVOWEL SIGN NYO
        0EC0..0EC4;N     # Lo     [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI
        0EC6;N           # Lm         LAO KO LA
       -0EC8..0ECD;N     # Mn     [6] LAO TONE MAI EK..LAO NIGGAHITA
       +0EC8..0ECE;N     # Mn     [7] LAO TONE MAI EK..LAO YAMAKKAN
        0ED0..0ED9;N     # Nd    [10] LAO DIGIT ZERO..LAO DIGIT NINE
        0EDC..0EDF;N     # Lo     [4] LAO HO NO..LAO LETTER KHMU NYO
        0F00;N           # Lo         TIBETAN SYLLABLE OM
       @@ -1946,6 +1947,7 @@ FFFD;A           # So         REPLACEMENT CHARACTER
        10EAB..10EAC;N   # Mn     [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
        10EAD;N          # Pd         YEZIDI HYPHENATION MARK
        10EB0..10EB1;N   # Lo     [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
       +10EFD..10EFF;N   # Mn     [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
        10F00..10F1C;N   # Lo    [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
        10F1D..10F26;N   # No    [10] OLD SOGDIAN NUMBER ONE..OLD SOGDIAN FRACTION ONE HALF
        10F27;N          # Lo         OLD SOGDIAN LIGATURE AYIN-DALETH
       @@ -2028,6 +2030,8 @@ FFFD;A           # So         REPLACEMENT CHARACTER
        11236..11237;N   # Mn     [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA
        11238..1123D;N   # Po     [6] KHOJKI DANDA..KHOJKI ABBREVIATION SIGN
        1123E;N          # Mn         KHOJKI SIGN SUKUN
       +1123F..11240;N   # Lo     [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I
       +11241;N          # Mn         KHOJKI VOWEL SIGN VOCALIC R
        11280..11286;N   # Lo     [7] MULTANI LETTER A..MULTANI LETTER GA
        11288;N          # Lo         MULTANI LETTER GHA
        1128A..1128D;N   # Lo     [4] MULTANI LETTER CA..MULTANI LETTER JJA
       @@ -2190,6 +2194,7 @@ FFFD;A           # So         REPLACEMENT CHARACTER
        11A9E..11AA2;N   # Po     [5] SOYOMBO HEAD MARK WITH MOON AND SUN AND TRIPLE FLAME..SOYOMBO TERMINAL MARK-2
        11AB0..11ABF;N   # Lo    [16] CANADIAN SYLLABICS NATTILIK HI..CANADIAN SYLLABICS SPA
        11AC0..11AF8;N   # Lo    [57] PAU CIN HAU LETTER PA..PAU CIN HAU GLOTTAL STOP FINAL
       +11B00..11B09;N   # Po    [10] DEVANAGARI HEAD MARK..DEVANAGARI SIGN MINDU
        11C00..11C08;N   # Lo     [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L
        11C0A..11C2E;N   # Lo    [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA
        11C2F;N          # Mc         BHAIKSUKI VOWEL SIGN AA
       @@ -2235,6 +2240,19 @@ FFFD;A           # So         REPLACEMENT CHARACTER
        11EF3..11EF4;N   # Mn     [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
        11EF5..11EF6;N   # Mc     [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O
        11EF7..11EF8;N   # Po     [2] MAKASAR PASSIMBANG..MAKASAR END OF SECTION
       +11F00..11F01;N   # Mn     [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
       +11F02;N          # Lo         KAWI SIGN REPHA
       +11F03;N          # Mc         KAWI SIGN VISARGA
       +11F04..11F10;N   # Lo    [13] KAWI LETTER A..KAWI LETTER O
       +11F12..11F33;N   # Lo    [34] KAWI LETTER KA..KAWI LETTER JNYA
       +11F34..11F35;N   # Mc     [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
       +11F36..11F3A;N   # Mn     [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
       +11F3E..11F3F;N   # Mc     [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI
       +11F40;N          # Mn         KAWI VOWEL SIGN EU
       +11F41;N          # Mc         KAWI SIGN KILLER
       +11F42;N          # Mn         KAWI CONJOINER
       +11F43..11F4F;N   # Po    [13] KAWI DANDA..KAWI PUNCTUATION CLOSING SPIRAL
       +11F50..11F59;N   # Nd    [10] KAWI DIGIT ZERO..KAWI DIGIT NINE
        11FB0;N          # Lo         LISU LETTER YHA
        11FC0..11FD4;N   # No    [21] TAMIL FRACTION ONE THREE-HUNDRED-AND-TWENTIETH..TAMIL FRACTION DOWNSCALING FACTOR KIIZH
        11FD5..11FDC;N   # So     [8] TAMIL SIGN NEL..TAMIL SIGN MUKKURUNI
       @@ -2247,8 +2265,11 @@ FFFD;A           # So         REPLACEMENT CHARACTER
        12480..12543;N   # Lo   [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU
        12F90..12FF0;N   # Lo    [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
        12FF1..12FF2;N   # Po     [2] CYPRO-MINOAN SIGN CM301..CYPRO-MINOAN SIGN CM302
       -13000..1342E;N   # Lo  [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032
       -13430..13438;N   # Cf     [9] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END SEGMENT
       +13000..1342F;N   # Lo  [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D
       +13430..1343F;N   # Cf    [16] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE
       +13440;N          # Mn         EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY
       +13441..13446;N   # Lo     [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN
       +13447..13455;N   # Mn    [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED
        14400..14646;N   # Lo   [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530
        16800..16A38;N   # Lo   [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
        16A40..16A5E;N   # Lo    [31] MRO LETTER TA..MRO LETTER TEK
       @@ -2293,7 +2314,9 @@ FFFD;A           # So         REPLACEMENT CHARACTER
        1AFFD..1AFFE;W   # Lm     [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
        1B000..1B0FF;W   # Lo   [256] KATAKANA LETTER ARCHAIC E..HENTAIGANA LETTER RE-2
        1B100..1B122;W   # Lo    [35] HENTAIGANA LETTER RE-3..KATAKANA LETTER ARCHAIC WU
       +1B132;W          # Lo         HIRAGANA LETTER SMALL KO
        1B150..1B152;W   # Lo     [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
       +1B155;W          # Lo         KATAKANA LETTER SMALL KO
        1B164..1B167;W   # Lo     [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
        1B170..1B2FB;W   # Lo   [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
        1BC00..1BC6A;N   # Lo   [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
       @@ -2324,6 +2347,7 @@ FFFD;A           # So         REPLACEMENT CHARACTER
        1D200..1D241;N   # So    [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION SYMBOL-54
        1D242..1D244;N   # Mn     [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME
        1D245;N          # So         GREEK MUSICAL LEIMMA
       +1D2C0..1D2D3;N   # No    [20] KAKTOVIK NUMERAL ZERO..KAKTOVIK NUMERAL NINETEEN
        1D2E0..1D2F3;N   # No    [20] MAYAN NUMERAL ZERO..MAYAN NUMERAL NINETEEN
        1D300..1D356;N   # So    [87] MONOGRAM FOR EARTH..TETRAGRAM FOR FOSTERING
        1D360..1D378;N   # No    [25] COUNTING ROD UNIT DIGIT ONE..TALLY MARK FIVE
       @@ -2383,11 +2407,14 @@ FFFD;A           # So         REPLACEMENT CHARACTER
        1DF00..1DF09;N   # Ll    [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
        1DF0A;N          # Lo         LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
        1DF0B..1DF1E;N   # Ll    [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
       +1DF25..1DF2A;N   # Ll     [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
        1E000..1E006;N   # Mn     [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE
        1E008..1E018;N   # Mn    [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU
        1E01B..1E021;N   # Mn     [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
        1E023..1E024;N   # Mn     [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
        1E026..1E02A;N   # Mn     [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
       +1E030..1E06D;N   # Lm    [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
       +1E08F;N          # Mn         COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
        1E100..1E12C;N   # Lo    [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
        1E130..1E136;N   # Mn     [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
        1E137..1E13D;N   # Lm     [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
       @@ -2400,6 +2427,10 @@ FFFD;A           # So         REPLACEMENT CHARACTER
        1E2EC..1E2EF;N   # Mn     [4] WANCHO TONE TUP..WANCHO TONE KOINI
        1E2F0..1E2F9;N   # Nd    [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE
        1E2FF;N          # Sc         WANCHO NGUN SIGN
       +1E4D0..1E4EA;N   # Lo    [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL
       +1E4EB;N          # Lm         NAG MUNDARI SIGN OJOD
       +1E4EC..1E4EF;N   # Mn     [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH
       +1E4F0..1E4F9;N   # Nd    [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE
        1E7E0..1E7E6;N   # Lo     [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
        1E7E8..1E7EB;N   # Lo     [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
        1E7ED..1E7EE;N   # Lo     [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
       @@ -2528,13 +2559,14 @@ FFFD;A           # So         REPLACEMENT CHARACTER
        1F6D0..1F6D2;W   # So     [3] PLACE OF WORSHIP..SHOPPING TROLLEY
        1F6D3..1F6D4;N   # So     [2] STUPA..PAGODA
        1F6D5..1F6D7;W   # So     [3] HINDU TEMPLE..ELEVATOR
       -1F6DD..1F6DF;W   # So     [3] PLAYGROUND SLIDE..RING BUOY
       +1F6DC..1F6DF;W   # So     [4] WIRELESS..RING BUOY
        1F6E0..1F6EA;N   # So    [11] HAMMER AND WRENCH..NORTHEAST-POINTING AIRPLANE
        1F6EB..1F6EC;W   # So     [2] AIRPLANE DEPARTURE..AIRPLANE ARRIVING
        1F6F0..1F6F3;N   # So     [4] SATELLITE..PASSENGER SHIP
        1F6F4..1F6FC;W   # So     [9] SCOOTER..ROLLER SKATE
       -1F700..1F773;N   # So   [116] ALCHEMICAL SYMBOL FOR QUINTESSENCE..ALCHEMICAL SYMBOL FOR HALF OUNCE
       -1F780..1F7D8;N   # So    [89] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..NEGATIVE CIRCLED SQUARE
       +1F700..1F776;N   # So   [119] ALCHEMICAL SYMBOL FOR QUINTESSENCE..LUNAR ECLIPSE
       +1F77B..1F77F;N   # So     [5] HAUMEA..ORCUS
       +1F780..1F7D9;N   # So    [90] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..NINE POINTED WHITE STAR
        1F7E0..1F7EB;W   # So    [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE
        1F7F0;W          # So         HEAVY EQUALS SIGN
        1F800..1F80B;N   # So    [12] LEFTWARDS ARROW WITH SMALL TRIANGLE ARROWHEAD..DOWNWARDS ARROW WITH LARGE TRIANGLE ARROWHEAD
       @@ -2551,22 +2583,20 @@ FFFD;A           # So         REPLACEMENT CHARACTER
        1F947..1F9FF;W   # So   [185] FIRST PLACE MEDAL..NAZAR AMULET
        1FA00..1FA53;N   # So    [84] NEUTRAL CHESS KING..BLACK CHESS KNIGHT-BISHOP
        1FA60..1FA6D;N   # So    [14] XIANGQI RED GENERAL..XIANGQI BLACK SOLDIER
       -1FA70..1FA74;W   # So     [5] BALLET SHOES..THONG SANDAL
       -1FA78..1FA7C;W   # So     [5] DROP OF BLOOD..CRUTCH
       -1FA80..1FA86;W   # So     [7] YO-YO..NESTING DOLLS
       -1FA90..1FAAC;W   # So    [29] RINGED PLANET..HAMSA
       -1FAB0..1FABA;W   # So    [11] FLY..NEST WITH EGGS
       -1FAC0..1FAC5;W   # So     [6] ANATOMICAL HEART..PERSON WITH CROWN
       -1FAD0..1FAD9;W   # So    [10] BLUEBERRIES..JAR
       -1FAE0..1FAE7;W   # So     [8] MELTING FACE..BUBBLES
       -1FAF0..1FAF6;W   # So     [7] HAND WITH INDEX FINGER AND THUMB CROSSED..HEART HANDS
       +1FA70..1FA7C;W   # So    [13] BALLET SHOES..CRUTCH
       +1FA80..1FA88;W   # So     [9] YO-YO..FLUTE
       +1FA90..1FABD;W   # So    [46] RINGED PLANET..WING
       +1FABF..1FAC5;W   # So     [7] GOOSE..PERSON WITH CROWN
       +1FACE..1FADB;W   # So    [14] MOOSE..PEA POD
       +1FAE0..1FAE8;W   # So     [9] MELTING FACE..SHAKING FACE
       +1FAF0..1FAF8;W   # So     [9] HAND WITH INDEX FINGER AND THUMB CROSSED..RIGHTWARDS PUSHING HAND
        1FB00..1FB92;N   # So   [147] BLOCK SEXTANT-1..UPPER HALF INVERSE MEDIUM SHADE AND LOWER HALF BLOCK
        1FB94..1FBCA;N   # So    [55] LEFT HALF INVERSE MEDIUM SHADE AND RIGHT HALF BLOCK..WHITE UP-POINTING CHEVRON
        1FBF0..1FBF9;N   # Nd    [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE
        20000..2A6DF;W   # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
        2A6E0..2A6FF;W   # Cn    [32] <reserved-2A6E0>..<reserved-2A6FF>
       -2A700..2B738;W   # Lo  [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
       -2B739..2B73F;W   # Cn     [7] <reserved-2B739>..<reserved-2B73F>
       +2A700..2B739;W   # Lo  [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739
       +2B73A..2B73F;W   # Cn     [6] <reserved-2B73A>..<reserved-2B73F>
        2B740..2B81D;W   # Lo   [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
        2B81E..2B81F;W   # Cn     [2] <reserved-2B81E>..<reserved-2B81F>
        2B820..2CEA1;W   # Lo  [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
       @@ -2577,7 +2607,9 @@ FFFD;A           # So         REPLACEMENT CHARACTER
        2FA1E..2FA1F;W   # Cn     [2] <reserved-2FA1E>..<reserved-2FA1F>
        2FA20..2FFFD;W   # Cn  [1502] <reserved-2FA20>..<reserved-2FFFD>
        30000..3134A;W   # Lo  [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
       -3134B..3FFFD;W   # Cn [60595] <reserved-3134B>..<reserved-3FFFD>
       +3134B..3134F;W   # Cn     [5] <reserved-3134B>..<reserved-3134F>
       +31350..323AF;W   # Lo  [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
       +323B0..3FFFD;W   # Cn [56398] <reserved-323B0>..<reserved-3FFFD>
        E0001;N          # Cf         LANGUAGE TAG
        E0020..E007F;N   # Cf    [96] TAG SPACE..CANCEL TAG
        E0100..E01EF;A   # Mn   [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
 (DIR) diff --git a/data/GraphemeBreakProperty.txt b/data/GraphemeBreakProperty.txt
       @@ -1,11 +1,11 @@
       -# GraphemeBreakProperty-14.0.0.txt
       -# Date: 2021-08-12, 23:13:02 GMT
       -# © 2021 Unicode®, Inc.
       +# GraphemeBreakProperty-15.0.0.txt
       +# Date: 2022-04-27, 17:07:38 GMT
       +# © 2022 Unicode®, Inc.
        # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
       -# For terms of use, see http://www.unicode.org/terms_of_use.html
       +# For terms of use, see https://www.unicode.org/terms_of_use.html
        #
        # Unicode Character Database
       -#   For documentation, see http://www.unicode.org/reports/tr44/
       +#   For documentation, see https://www.unicode.org/reports/tr44/
        
        # ================================================
        
       @@ -32,8 +32,9 @@
        11A3A         ; Prepend # Lo       ZANABAZAR SQUARE CLUSTER-INITIAL LETTER RA
        11A84..11A89  ; Prepend # Lo   [6] SOYOMBO SIGN JIHVAMULIYA..SOYOMBO CLUSTER-INITIAL LETTER SA
        11D46         ; Prepend # Lo       MASARAM GONDI REPHA
       +11F02         ; Prepend # Lo       KAWI SIGN REPHA
        
       -# Total code points: 26
       +# Total code points: 27
        
        # ================================================
        
       @@ -67,7 +68,7 @@
        FEFF          ; Control # Cf       ZERO WIDTH NO-BREAK SPACE
        FFF0..FFF8    ; Control # Cn   [9] <reserved-FFF0>..<reserved-FFF8>
        FFF9..FFFB    ; Control # Cf   [3] INTERLINEAR ANNOTATION ANCHOR..INTERLINEAR ANNOTATION TERMINATOR
       -13430..13438  ; Control # Cf   [9] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END SEGMENT
       +13430..1343F  ; Control # Cf  [16] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE
        1BCA0..1BCA3  ; Control # Cf   [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP
        1D173..1D17A  ; Control # Cf   [8] MUSICAL SYMBOL BEGIN BEAM..MUSICAL SYMBOL END PHRASE
        E0000         ; Control # Cn       <reserved-E0000>
       @@ -76,7 +77,7 @@ E0002..E001F  ; Control # Cn  [30] <reserved-E0002>..<reserved-E001F>
        E0080..E00FF  ; Control # Cn [128] <reserved-E0080>..<reserved-E00FF>
        E01F0..E0FFF  ; Control # Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
        
       -# Total code points: 3886
       +# Total code points: 3893
        
        # ================================================
        
       @@ -185,7 +186,7 @@ E01F0..E0FFF  ; Control # Cn [3600] <reserved-E01F0>..<reserved-E0FFF>
        0E47..0E4E    ; Extend # Mn   [8] THAI CHARACTER MAITAIKHU..THAI CHARACTER YAMAKKAN
        0EB1          ; Extend # Mn       LAO VOWEL SIGN MAI KAN
        0EB4..0EBC    ; Extend # Mn   [9] LAO VOWEL SIGN I..LAO SEMIVOWEL SIGN LO
       -0EC8..0ECD    ; Extend # Mn   [6] LAO TONE MAI EK..LAO NIGGAHITA
       +0EC8..0ECE    ; Extend # Mn   [7] LAO TONE MAI EK..LAO YAMAKKAN
        0F18..0F19    ; Extend # Mn   [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS
        0F35          ; Extend # Mn       TIBETAN MARK NGAS BZUNG NYI ZLA
        0F37          ; Extend # Mn       TIBETAN MARK NGAS BZUNG SGOR RTAGS
       @@ -324,6 +325,7 @@ FF9E..FF9F    ; Extend # Lm   [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
        10AE5..10AE6  ; Extend # Mn   [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW
        10D24..10D27  ; Extend # Mn   [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI
        10EAB..10EAC  ; Extend # Mn   [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
       +10EFD..10EFF  ; Extend # Mn   [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
        10F46..10F50  ; Extend # Mn  [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW
        10F82..10F85  ; Extend # Mn   [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW
        11001         ; Extend # Mn       BRAHMI SIGN ANUSVARA
       @@ -346,6 +348,7 @@ FF9E..FF9F    ; Extend # Lm   [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
        11234         ; Extend # Mn       KHOJKI SIGN ANUSVARA
        11236..11237  ; Extend # Mn   [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA
        1123E         ; Extend # Mn       KHOJKI SIGN SUKUN
       +11241         ; Extend # Mn       KHOJKI VOWEL SIGN VOCALIC R
        112DF         ; Extend # Mn       KHUDAWADI SIGN ANUSVARA
        112E3..112EA  ; Extend # Mn   [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA
        11300..11301  ; Extend # Mn   [2] GRANTHA SIGN COMBINING ANUSVARA ABOVE..GRANTHA SIGN CANDRABINDU
       @@ -413,6 +416,12 @@ FF9E..FF9F    ; Extend # Lm   [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
        11D95         ; Extend # Mn       GUNJALA GONDI SIGN ANUSVARA
        11D97         ; Extend # Mn       GUNJALA GONDI VIRAMA
        11EF3..11EF4  ; Extend # Mn   [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
       +11F00..11F01  ; Extend # Mn   [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
       +11F36..11F3A  ; Extend # Mn   [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
       +11F40         ; Extend # Mn       KAWI VOWEL SIGN EU
       +11F42         ; Extend # Mn       KAWI CONJOINER
       +13440         ; Extend # Mn       EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY
       +13447..13455  ; Extend # Mn  [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED
        16AF0..16AF4  ; Extend # Mn   [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE
        16B30..16B36  ; Extend # Mn   [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM
        16F4F         ; Extend # Mn       MIAO SIGN CONSONANT MODIFIER BAR
       @@ -439,16 +448,18 @@ FF9E..FF9F    ; Extend # Lm   [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
        1E01B..1E021  ; Extend # Mn   [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
        1E023..1E024  ; Extend # Mn   [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
        1E026..1E02A  ; Extend # Mn   [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
       +1E08F         ; Extend # Mn       COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
        1E130..1E136  ; Extend # Mn   [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
        1E2AE         ; Extend # Mn       TOTO SIGN RISING TONE
        1E2EC..1E2EF  ; Extend # Mn   [4] WANCHO TONE TUP..WANCHO TONE KOINI
       +1E4EC..1E4EF  ; Extend # Mn   [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH
        1E8D0..1E8D6  ; Extend # Mn   [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS
        1E944..1E94A  ; Extend # Mn   [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA
        1F3FB..1F3FF  ; Extend # Sk   [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6
        E0020..E007F  ; Extend # Cf  [96] TAG SPACE..CANCEL TAG
        E0100..E01EF  ; Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
        
       -# Total code points: 2095
       +# Total code points: 2130
        
        # ================================================
        
       @@ -489,6 +500,7 @@ E0100..E01EF  ; Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
        0CC3..0CC4    ; SpacingMark # Mc   [2] KANNADA VOWEL SIGN VOCALIC R..KANNADA VOWEL SIGN VOCALIC RR
        0CC7..0CC8    ; SpacingMark # Mc   [2] KANNADA VOWEL SIGN EE..KANNADA VOWEL SIGN AI
        0CCA..0CCB    ; SpacingMark # Mc   [2] KANNADA VOWEL SIGN O..KANNADA VOWEL SIGN OO
       +0CF3          ; SpacingMark # Mc       KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
        0D02..0D03    ; SpacingMark # Mc   [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
        0D3F..0D40    ; SpacingMark # Mc   [2] MALAYALAM VOWEL SIGN I..MALAYALAM VOWEL SIGN II
        0D46..0D48    ; SpacingMark # Mc   [3] MALAYALAM VOWEL SIGN E..MALAYALAM VOWEL SIGN AI
       @@ -614,12 +626,16 @@ ABEC          ; SpacingMark # Mc       MEETEI MAYEK LUM IYEK
        11D93..11D94  ; SpacingMark # Mc   [2] GUNJALA GONDI VOWEL SIGN OO..GUNJALA GONDI VOWEL SIGN AU
        11D96         ; SpacingMark # Mc       GUNJALA GONDI SIGN VISARGA
        11EF5..11EF6  ; SpacingMark # Mc   [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O
       +11F03         ; SpacingMark # Mc       KAWI SIGN VISARGA
       +11F34..11F35  ; SpacingMark # Mc   [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
       +11F3E..11F3F  ; SpacingMark # Mc   [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI
       +11F41         ; SpacingMark # Mc       KAWI SIGN KILLER
        16F51..16F87  ; SpacingMark # Mc  [55] MIAO SIGN ASPIRATION..MIAO VOWEL SIGN UI
        16FF0..16FF1  ; SpacingMark # Mc   [2] VIETNAMESE ALTERNATE READING MARK CA..VIETNAMESE ALTERNATE READING MARK NHAY
        1D166         ; SpacingMark # Mc       MUSICAL SYMBOL COMBINING SPRECHGESANG STEM
        1D16D         ; SpacingMark # Mc       MUSICAL SYMBOL COMBINING AUGMENTATION DOT
        
       -# Total code points: 388
       +# Total code points: 395
        
        # ================================================
        
 (DIR) diff --git a/data/GraphemeBreakTest.txt b/data/GraphemeBreakTest.txt
       @@ -1,11 +1,11 @@
       -# GraphemeBreakTest-14.0.0.txt
       -# Date: 2021-03-08, 06:22:32 GMT
       -# © 2021 Unicode®, Inc.
       +# GraphemeBreakTest-15.0.0.txt
       +# Date: 2022-02-26, 00:38:37 GMT
       +# © 2022 Unicode®, Inc.
        # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
       -# For terms of use, see http://www.unicode.org/terms_of_use.html
       +# For terms of use, see https://www.unicode.org/terms_of_use.html
        #
        # Unicode Character Database
       -#   For documentation, see http://www.unicode.org/reports/tr44/
       +#   For documentation, see https://www.unicode.org/reports/tr44/
        #
        # Default Grapheme_Cluster_Break Test
        #
 (DIR) diff --git a/data/LineBreak.txt b/data/LineBreak.txt
       @@ -1,6 +1,6 @@
       -# LineBreak-14.0.0.txt
       -# Date: 2021-07-06, 09:58:55 GMT [KW, LI]
       -# © 2021 Unicode®, Inc.
       +# LineBreak-15.0.0.txt
       +# Date: 2022-07-28, 09:20:42 GMT [KW, LI]
       +# © 2022 Unicode®, Inc.
        # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
        # For terms of use, see https://www.unicode.org/terms_of_use.html
        #
       @@ -481,6 +481,7 @@
        0CE2..0CE3;CM     # Mn     [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
        0CE6..0CEF;NU     # Nd    [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
        0CF1..0CF2;AL     # Lo     [2] KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADHMANIYA
       +0CF3;CM           # Mc         KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
        0D00..0D01;CM     # Mn     [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
        0D02..0D03;CM     # Mc     [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
        0D04..0D0C;AL     # Lo     [9] MALAYALAM LETTER VEDIC ANUSVARA..MALAYALAM LETTER VOCALIC L
       @@ -542,7 +543,7 @@
        0EBD;SA           # Lo         LAO SEMIVOWEL SIGN NYO
        0EC0..0EC4;SA     # Lo     [5] LAO VOWEL SIGN E..LAO VOWEL SIGN AI
        0EC6;SA           # Lm         LAO KO LA
       -0EC8..0ECD;SA     # Mn     [6] LAO TONE MAI EK..LAO NIGGAHITA
       +0EC8..0ECE;SA     # Mn     [7] LAO TONE MAI EK..LAO YAMAKKAN
        0ED0..0ED9;NU     # Nd    [10] LAO DIGIT ZERO..LAO DIGIT NINE
        0EDC..0EDF;SA     # Lo     [4] LAO HO NO..LAO LETTER KHMU NYO
        0F00;AL           # Lo         TIBETAN SYLLABLE OM
       @@ -855,7 +856,11 @@
        1D79..1D7F;AL     # Ll     [7] LATIN SMALL LETTER INSULAR G..LATIN SMALL LETTER UPSILON WITH STROKE
        1D80..1D9A;AL     # Ll    [27] LATIN SMALL LETTER B WITH PALATAL HOOK..LATIN SMALL LETTER EZH WITH RETROFLEX HOOK
        1D9B..1DBF;AL     # Lm    [37] MODIFIER LETTER SMALL TURNED ALPHA..MODIFIER LETTER SMALL THETA
       -1DC0..1DFF;CM     # Mn    [64] COMBINING DOTTED GRAVE ACCENT..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
       +1DC0..1DCC;CM     # Mn    [13] COMBINING DOTTED GRAVE ACCENT..COMBINING MACRON-BREVE
       +1DCD;GL           # Mn         COMBINING DOUBLE CIRCUMFLEX ABOVE
       +1DCE..1DFB;CM     # Mn    [46] COMBINING OGONEK ABOVE..COMBINING DELETION MARK
       +1DFC;GL           # Mn         COMBINING DOUBLE INVERTED BREVE BELOW
       +1DFD..1DFF;CM     # Mn     [3] COMBINING ALMOST EQUAL TO BELOW..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
        1E00..1EFF;AL     # L&   [256] LATIN CAPITAL LETTER A WITH RING BELOW..LATIN SMALL LETTER Y WITH LOOP
        1F00..1F15;AL     # L&    [22] GREEK SMALL LETTER ALPHA WITH PSILI..GREEK SMALL LETTER EPSILON WITH DASIA AND OXIA
        1F18..1F1D;AL     # Lu     [6] GREEK CAPITAL LETTER EPSILON WITH PSILI..GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA
       @@ -931,7 +936,7 @@
        2054;AL           # Pc         INVERTED UNDERTIE
        2055;AL           # Po         FLOWER PUNCTUATION MARK
        2056;BA           # Po         THREE DOT PUNCTUATION
       -2057;AL           # Po         QUADRUPLE PRIME
       +2057;PO           # Po         QUADRUPLE PRIME
        2058..205B;BA     # Po     [4] FOUR DOT PUNCTUATION..FOUR DOT MARK
        205C;AL           # Po         DOTTED CROSS
        205D..205E;BA     # Po     [2] TRICOLON..VERTICAL FOUR DOTS
       @@ -2793,6 +2798,7 @@ FFFD;AI           # So         REPLACEMENT CHARACTER
        10EAB..10EAC;CM   # Mn     [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
        10EAD;BA          # Pd         YEZIDI HYPHENATION MARK
        10EB0..10EB1;AL   # Lo     [2] YEZIDI LETTER LAM WITH DOT ABOVE..YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE
       +10EFD..10EFF;CM   # Mn     [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
        10F00..10F1C;AL   # Lo    [29] OLD SOGDIAN LETTER ALEPH..OLD SOGDIAN LETTER FINAL TAW WITH VERTICAL TAIL
        10F1D..10F26;AL   # No    [10] OLD SOGDIAN NUMBER ONE..OLD SOGDIAN FRACTION ONE HALF
        10F27;AL          # Lo         OLD SOGDIAN LIGATURE AYIN-DALETH
       @@ -2882,6 +2888,8 @@ FFFD;AI           # So         REPLACEMENT CHARACTER
        1123B..1123C;BA   # Po     [2] KHOJKI SECTION MARK..KHOJKI DOUBLE SECTION MARK
        1123D;AL          # Po         KHOJKI ABBREVIATION SIGN
        1123E;CM          # Mn         KHOJKI SIGN SUKUN
       +1123F..11240;AL   # Lo     [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I
       +11241;CM          # Mn         KHOJKI VOWEL SIGN VOCALIC R
        11280..11286;AL   # Lo     [7] MULTANI LETTER A..MULTANI LETTER GA
        11288;AL          # Lo         MULTANI LETTER GHA
        1128A..1128D;AL   # Lo     [4] MULTANI LETTER CA..MULTANI LETTER JJA
       @@ -3055,6 +3063,7 @@ FFFD;AI           # So         REPLACEMENT CHARACTER
        11AA1..11AA2;BA   # Po     [2] SOYOMBO TERMINAL MARK-1..SOYOMBO TERMINAL MARK-2
        11AB0..11ABF;AL   # Lo    [16] CANADIAN SYLLABICS NATTILIK HI..CANADIAN SYLLABICS SPA
        11AC0..11AF8;AL   # Lo    [57] PAU CIN HAU LETTER PA..PAU CIN HAU GLOTTAL STOP FINAL
       +11B00..11B09;BB   # Po    [10] DEVANAGARI HEAD MARK..DEVANAGARI SIGN MINDU
        11C00..11C08;AL   # Lo     [9] BHAIKSUKI LETTER A..BHAIKSUKI LETTER VOCALIC L
        11C0A..11C2E;AL   # Lo    [37] BHAIKSUKI LETTER E..BHAIKSUKI LETTER HA
        11C2F;CM          # Mc         BHAIKSUKI VOWEL SIGN AA
       @@ -3101,6 +3110,20 @@ FFFD;AI           # So         REPLACEMENT CHARACTER
        11EF3..11EF4;CM   # Mn     [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
        11EF5..11EF6;CM   # Mc     [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O
        11EF7..11EF8;AL   # Po     [2] MAKASAR PASSIMBANG..MAKASAR END OF SECTION
       +11F00..11F01;CM   # Mn     [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
       +11F02;AL          # Lo         KAWI SIGN REPHA
       +11F03;CM          # Mc         KAWI SIGN VISARGA
       +11F04..11F10;AL   # Lo    [13] KAWI LETTER A..KAWI LETTER O
       +11F12..11F33;AL   # Lo    [34] KAWI LETTER KA..KAWI LETTER JNYA
       +11F34..11F35;CM   # Mc     [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
       +11F36..11F3A;CM   # Mn     [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
       +11F3E..11F3F;CM   # Mc     [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI
       +11F40;CM          # Mn         KAWI VOWEL SIGN EU
       +11F41;CM          # Mc         KAWI SIGN KILLER
       +11F42;CM          # Mn         KAWI CONJOINER
       +11F43..11F44;BA   # Po     [2] KAWI DANDA..KAWI DOUBLE DANDA
       +11F45..11F4F;ID   # Po    [11] KAWI PUNCTUATION SECTION MARKER..KAWI PUNCTUATION CLOSING SPIRAL
       +11F50..11F59;NU   # Nd    [10] KAWI DIGIT ZERO..KAWI DIGIT NINE
        11FB0;AL          # Lo         LISU LETTER YHA
        11FC0..11FD4;AL   # No    [21] TAMIL FRACTION ONE THREE-HUNDRED-AND-TWENTIETH..TAMIL FRACTION DOWNSCALING FACTOR KIIZH
        11FD5..11FDC;AL   # So     [8] TAMIL SIGN NEL..TAMIL SIGN MUKKURUNI
       @@ -3126,10 +3149,18 @@ FFFD;AI           # So         REPLACEMENT CHARACTER
        1328A..13378;AL   # Lo   [239] EGYPTIAN HIEROGLYPH O037..EGYPTIAN HIEROGLYPH V011
        13379;OP          # Lo         EGYPTIAN HIEROGLYPH V011A
        1337A..1337B;CL   # Lo     [2] EGYPTIAN HIEROGLYPH V011B..EGYPTIAN HIEROGLYPH V011C
       -1337C..1342E;AL   # Lo   [179] EGYPTIAN HIEROGLYPH V012..EGYPTIAN HIEROGLYPH AA032
       +1337C..1342F;AL   # Lo   [180] EGYPTIAN HIEROGLYPH V012..EGYPTIAN HIEROGLYPH V011D
        13430..13436;GL   # Cf     [7] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH OVERLAY MIDDLE
        13437;OP          # Cf         EGYPTIAN HIEROGLYPH BEGIN SEGMENT
        13438;CL          # Cf         EGYPTIAN HIEROGLYPH END SEGMENT
       +13439..1343B;GL   # Cf     [3] EGYPTIAN HIEROGLYPH INSERT AT MIDDLE..EGYPTIAN HIEROGLYPH INSERT AT BOTTOM
       +1343C;OP          # Cf         EGYPTIAN HIEROGLYPH BEGIN ENCLOSURE
       +1343D;CL          # Cf         EGYPTIAN HIEROGLYPH END ENCLOSURE
       +1343E;OP          # Cf         EGYPTIAN HIEROGLYPH BEGIN WALLED ENCLOSURE
       +1343F;CL          # Cf         EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE
       +13440;CM          # Mn         EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY
       +13441..13446;AL   # Lo     [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN
       +13447..13455;CM   # Mn    [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED
        14400..145CD;AL   # Lo   [462] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A409
        145CE;OP          # Lo         ANATOLIAN HIEROGLYPH A410 BEGIN LOGOGRAM MARK
        145CF;CL          # Lo         ANATOLIAN HIEROGLYPH A410A END LOGOGRAM MARK
       @@ -3179,7 +3210,9 @@ FFFD;AI           # So         REPLACEMENT CHARACTER
        1AFFD..1AFFE;AL   # Lm     [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
        1B000..1B0FF;ID   # Lo   [256] KATAKANA LETTER ARCHAIC E..HENTAIGANA LETTER RE-2
        1B100..1B122;ID   # Lo    [35] HENTAIGANA LETTER RE-3..KATAKANA LETTER ARCHAIC WU
       +1B132;CJ          # Lo         HIRAGANA LETTER SMALL KO
        1B150..1B152;CJ   # Lo     [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
       +1B155;CJ          # Lo         KATAKANA LETTER SMALL KO
        1B164..1B167;CJ   # Lo     [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
        1B170..1B2FB;ID   # Lo   [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
        1BC00..1BC6A;AL   # Lo   [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
       @@ -3210,6 +3243,7 @@ FFFD;AI           # So         REPLACEMENT CHARACTER
        1D200..1D241;AL   # So    [66] GREEK VOCAL NOTATION SYMBOL-1..GREEK INSTRUMENTAL NOTATION SYMBOL-54
        1D242..1D244;CM   # Mn     [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME
        1D245;AL          # So         GREEK MUSICAL LEIMMA
       +1D2C0..1D2D3;AL   # No    [20] KAKTOVIK NUMERAL ZERO..KAKTOVIK NUMERAL NINETEEN
        1D2E0..1D2F3;AL   # No    [20] MAYAN NUMERAL ZERO..MAYAN NUMERAL NINETEEN
        1D300..1D356;AL   # So    [87] MONOGRAM FOR EARTH..TETRAGRAM FOR FOSTERING
        1D360..1D378;AL   # No    [25] COUNTING ROD UNIT DIGIT ONE..TALLY MARK FIVE
       @@ -3270,11 +3304,14 @@ FFFD;AI           # So         REPLACEMENT CHARACTER
        1DF00..1DF09;AL   # Ll    [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
        1DF0A;AL          # Lo         LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
        1DF0B..1DF1E;AL   # Ll    [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
       +1DF25..1DF2A;AL   # Ll     [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
        1E000..1E006;CM   # Mn     [7] COMBINING GLAGOLITIC LETTER AZU..COMBINING GLAGOLITIC LETTER ZHIVETE
        1E008..1E018;CM   # Mn    [17] COMBINING GLAGOLITIC LETTER ZEMLJA..COMBINING GLAGOLITIC LETTER HERU
        1E01B..1E021;CM   # Mn     [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
        1E023..1E024;CM   # Mn     [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
        1E026..1E02A;CM   # Mn     [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
       +1E030..1E06D;AL   # Lm    [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
       +1E08F;CM          # Mn         COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
        1E100..1E12C;AL   # Lo    [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
        1E130..1E136;CM   # Mn     [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
        1E137..1E13D;AL   # Lm     [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
       @@ -3287,6 +3324,10 @@ FFFD;AI           # So         REPLACEMENT CHARACTER
        1E2EC..1E2EF;CM   # Mn     [4] WANCHO TONE TUP..WANCHO TONE KOINI
        1E2F0..1E2F9;NU   # Nd    [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE
        1E2FF;PR          # Sc         WANCHO NGUN SIGN
       +1E4D0..1E4EA;AL   # Lo    [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL
       +1E4EB;AL          # Lm         NAG MUNDARI SIGN OJOD
       +1E4EC..1E4EF;CM   # Mn     [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH
       +1E4F0..1E4F9;NU   # Nd    [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE
        1E7E0..1E7E6;AL   # Lo     [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
        1E7E8..1E7EB;AL   # Lo     [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
        1E7ED..1E7EE;AL   # Lo     [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
       @@ -3454,16 +3495,18 @@ FFFD;AI           # So         REPLACEMENT CHARACTER
        1F6C1..1F6CB;ID   # So    [11] BATHTUB..COUCH AND LAMP
        1F6CC;EB          # So         SLEEPING ACCOMMODATION
        1F6CD..1F6D7;ID   # So    [11] SHOPPING BAGS..ELEVATOR
       -1F6D8..1F6DC;ID   # Cn     [5] <reserved-1F6D8>..<reserved-1F6DC>
       -1F6DD..1F6EC;ID   # So    [16] PLAYGROUND SLIDE..AIRPLANE ARRIVING
       +1F6D8..1F6DB;ID   # Cn     [4] <reserved-1F6D8>..<reserved-1F6DB>
       +1F6DC..1F6EC;ID   # So    [17] WIRELESS..AIRPLANE ARRIVING
        1F6ED..1F6EF;ID   # Cn     [3] <reserved-1F6ED>..<reserved-1F6EF>
        1F6F0..1F6FC;ID   # So    [13] SATELLITE..ROLLER SKATE
        1F6FD..1F6FF;ID   # Cn     [3] <reserved-1F6FD>..<reserved-1F6FF>
        1F700..1F773;AL   # So   [116] ALCHEMICAL SYMBOL FOR QUINTESSENCE..ALCHEMICAL SYMBOL FOR HALF OUNCE
       -1F774..1F77F;ID   # Cn    [12] <reserved-1F774>..<reserved-1F77F>
       +1F774..1F776;ID   # So     [3] LOT OF FORTUNE..LUNAR ECLIPSE
       +1F777..1F77A;ID   # Cn     [4] <reserved-1F777>..<reserved-1F77A>
       +1F77B..1F77F;ID   # So     [5] HAUMEA..ORCUS
        1F780..1F7D4;AL   # So    [85] BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE..HEAVY TWELVE POINTED PINWHEEL STAR
       -1F7D5..1F7D8;ID   # So     [4] CIRCLED TRIANGLE..NEGATIVE CIRCLED SQUARE
       -1F7D9..1F7DF;ID   # Cn     [7] <reserved-1F7D9>..<reserved-1F7DF>
       +1F7D5..1F7D9;ID   # So     [5] CIRCLED TRIANGLE..NINE POINTED WHITE STAR
       +1F7DA..1F7DF;ID   # Cn     [6] <reserved-1F7DA>..<reserved-1F7DF>
        1F7E0..1F7EB;ID   # So    [12] LARGE ORANGE CIRCLE..LARGE BROWN SQUARE
        1F7EC..1F7EF;ID   # Cn     [4] <reserved-1F7EC>..<reserved-1F7EF>
        1F7F0;ID          # So         HEAVY EQUALS SIGN
       @@ -3509,33 +3552,29 @@ FFFD;AI           # So         REPLACEMENT CHARACTER
        1FA54..1FA5F;ID   # Cn    [12] <reserved-1FA54>..<reserved-1FA5F>
        1FA60..1FA6D;ID   # So    [14] XIANGQI RED GENERAL..XIANGQI BLACK SOLDIER
        1FA6E..1FA6F;ID   # Cn     [2] <reserved-1FA6E>..<reserved-1FA6F>
       -1FA70..1FA74;ID   # So     [5] BALLET SHOES..THONG SANDAL
       -1FA75..1FA77;ID   # Cn     [3] <reserved-1FA75>..<reserved-1FA77>
       -1FA78..1FA7C;ID   # So     [5] DROP OF BLOOD..CRUTCH
       +1FA70..1FA7C;ID   # So    [13] BALLET SHOES..CRUTCH
        1FA7D..1FA7F;ID   # Cn     [3] <reserved-1FA7D>..<reserved-1FA7F>
       -1FA80..1FA86;ID   # So     [7] YO-YO..NESTING DOLLS
       -1FA87..1FA8F;ID   # Cn     [9] <reserved-1FA87>..<reserved-1FA8F>
       -1FA90..1FAAC;ID   # So    [29] RINGED PLANET..HAMSA
       -1FAAD..1FAAF;ID   # Cn     [3] <reserved-1FAAD>..<reserved-1FAAF>
       -1FAB0..1FABA;ID   # So    [11] FLY..NEST WITH EGGS
       -1FABB..1FABF;ID   # Cn     [5] <reserved-1FABB>..<reserved-1FABF>
       -1FAC0..1FAC2;ID   # So     [3] ANATOMICAL HEART..PEOPLE HUGGING
       +1FA80..1FA88;ID   # So     [9] YO-YO..FLUTE
       +1FA89..1FA8F;ID   # Cn     [7] <reserved-1FA89>..<reserved-1FA8F>
       +1FA90..1FABD;ID   # So    [46] RINGED PLANET..WING
       +1FABE;ID          # Cn         <reserved-1FABE>
       +1FABF..1FAC2;ID   # So     [4] GOOSE..PEOPLE HUGGING
        1FAC3..1FAC5;EB   # So     [3] PREGNANT MAN..PERSON WITH CROWN
       -1FAC6..1FACF;ID   # Cn    [10] <reserved-1FAC6>..<reserved-1FACF>
       -1FAD0..1FAD9;ID   # So    [10] BLUEBERRIES..JAR
       -1FADA..1FADF;ID   # Cn     [6] <reserved-1FADA>..<reserved-1FADF>
       -1FAE0..1FAE7;ID   # So     [8] MELTING FACE..BUBBLES
       -1FAE8..1FAEF;ID   # Cn     [8] <reserved-1FAE8>..<reserved-1FAEF>
       -1FAF0..1FAF6;EB   # So     [7] HAND WITH INDEX FINGER AND THUMB CROSSED..HEART HANDS
       -1FAF7..1FAFF;ID   # Cn     [9] <reserved-1FAF7>..<reserved-1FAFF>
       +1FAC6..1FACD;ID   # Cn     [8] <reserved-1FAC6>..<reserved-1FACD>
       +1FACE..1FADB;ID   # So    [14] MOOSE..PEA POD
       +1FADC..1FADF;ID   # Cn     [4] <reserved-1FADC>..<reserved-1FADF>
       +1FAE0..1FAE8;ID   # So     [9] MELTING FACE..SHAKING FACE
       +1FAE9..1FAEF;ID   # Cn     [7] <reserved-1FAE9>..<reserved-1FAEF>
       +1FAF0..1FAF8;EB   # So     [9] HAND WITH INDEX FINGER AND THUMB CROSSED..RIGHTWARDS PUSHING HAND
       +1FAF9..1FAFF;ID   # Cn     [7] <reserved-1FAF9>..<reserved-1FAFF>
        1FB00..1FB92;AL   # So   [147] BLOCK SEXTANT-1..UPPER HALF INVERSE MEDIUM SHADE AND LOWER HALF BLOCK
        1FB94..1FBCA;AL   # So    [55] LEFT HALF INVERSE MEDIUM SHADE AND RIGHT HALF BLOCK..WHITE UP-POINTING CHEVRON
        1FBF0..1FBF9;NU   # Nd    [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE
        1FC00..1FFFD;ID   # Cn  [1022] <reserved-1FC00>..<reserved-1FFFD>
        20000..2A6DF;ID   # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
        2A6E0..2A6FF;ID   # Cn    [32] <reserved-2A6E0>..<reserved-2A6FF>
       -2A700..2B738;ID   # Lo  [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
       -2B739..2B73F;ID   # Cn     [7] <reserved-2B739>..<reserved-2B73F>
       +2A700..2B739;ID   # Lo  [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739
       +2B73A..2B73F;ID   # Cn     [6] <reserved-2B73A>..<reserved-2B73F>
        2B740..2B81D;ID   # Lo   [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
        2B81E..2B81F;ID   # Cn     [2] <reserved-2B81E>..<reserved-2B81F>
        2B820..2CEA1;ID   # Lo  [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
       @@ -3546,7 +3585,9 @@ FFFD;AI           # So         REPLACEMENT CHARACTER
        2FA1E..2FA1F;ID   # Cn     [2] <reserved-2FA1E>..<reserved-2FA1F>
        2FA20..2FFFD;ID   # Cn  [1502] <reserved-2FA20>..<reserved-2FFFD>
        30000..3134A;ID   # Lo  [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
       -3134B..3FFFD;ID   # Cn [60595] <reserved-3134B>..<reserved-3FFFD>
       +3134B..3134F;ID   # Cn     [5] <reserved-3134B>..<reserved-3134F>
       +31350..323AF;ID   # Lo  [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
       +323B0..3FFFD;ID   # Cn [56398] <reserved-323B0>..<reserved-3FFFD>
        E0001;CM          # Cf         LANGUAGE TAG
        E0020..E007F;CM   # Cf    [96] TAG SPACE..CANCEL TAG
        E0100..E01EF;CM   # Mn   [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
 (DIR) diff --git a/data/LineBreakTest.txt b/data/LineBreakTest.txt
       @@ -1,11 +1,11 @@
       -# LineBreakTest-14.0.0.txt
       -# Date: 2021-08-20, 21:08:45 GMT
       -# © 2021 Unicode®, Inc.
       +# LineBreakTest-15.0.0.txt
       +# Date: 2022-02-26, 00:38:39 GMT
       +# © 2022 Unicode®, Inc.
        # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
       -# For terms of use, see http://www.unicode.org/terms_of_use.html
       +# For terms of use, see https://www.unicode.org/terms_of_use.html
        #
        # Unicode Character Database
       -#   For documentation, see http://www.unicode.org/reports/tr44/
       +#   For documentation, see https://www.unicode.org/reports/tr44/
        #
        # Default Line_Break Test
        #
 (DIR) diff --git a/data/SentenceBreakProperty.txt b/data/SentenceBreakProperty.txt
       @@ -1,11 +1,11 @@
       -# SentenceBreakProperty-14.0.0.txt
       -# Date: 2021-08-12, 23:13:21 GMT
       -# © 2021 Unicode®, Inc.
       +# SentenceBreakProperty-15.0.0.txt
       +# Date: 2022-08-05, 22:17:35 GMT
       +# © 2022 Unicode®, Inc.
        # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
       -# For terms of use, see http://www.unicode.org/terms_of_use.html
       +# For terms of use, see https://www.unicode.org/terms_of_use.html
        #
        # Unicode Character Database
       -#   For documentation, see http://www.unicode.org/reports/tr44/
       +#   For documentation, see https://www.unicode.org/reports/tr44/
        
        # ================================================
        
       @@ -144,6 +144,7 @@
        0CCC..0CCD    ; Extend # Mn   [2] KANNADA VOWEL SIGN AU..KANNADA SIGN VIRAMA
        0CD5..0CD6    ; Extend # Mc   [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK
        0CE2..0CE3    ; Extend # Mn   [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
       +0CF3          ; Extend # Mc       KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
        0D00..0D01    ; Extend # Mn   [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
        0D02..0D03    ; Extend # Mc   [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
        0D3B..0D3C    ; Extend # Mn   [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA
       @@ -167,7 +168,7 @@
        0E47..0E4E    ; Extend # Mn   [8] THAI CHARACTER MAITAIKHU..THAI CHARACTER YAMAKKAN
        0EB1          ; Extend # Mn       LAO VOWEL SIGN MAI KAN
        0EB4..0EBC    ; Extend # Mn   [9] LAO VOWEL SIGN I..LAO SEMIVOWEL SIGN LO
       -0EC8..0ECD    ; Extend # Mn   [6] LAO TONE MAI EK..LAO NIGGAHITA
       +0EC8..0ECE    ; Extend # Mn   [7] LAO TONE MAI EK..LAO YAMAKKAN
        0F18..0F19    ; Extend # Mn   [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS
        0F35          ; Extend # Mn       TIBETAN MARK NGAS BZUNG NYI ZLA
        0F37          ; Extend # Mn       TIBETAN MARK NGAS BZUNG SGOR RTAGS
       @@ -371,6 +372,7 @@ FF9E..FF9F    ; Extend # Lm   [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
        10AE5..10AE6  ; Extend # Mn   [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW
        10D24..10D27  ; Extend # Mn   [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI
        10EAB..10EAC  ; Extend # Mn   [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
       +10EFD..10EFF  ; Extend # Mn   [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
        10F46..10F50  ; Extend # Mn  [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW
        10F82..10F85  ; Extend # Mn   [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW
        11000         ; Extend # Mc       BRAHMI SIGN CANDRABINDU
       @@ -407,6 +409,7 @@ FF9E..FF9F    ; Extend # Lm   [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
        11235         ; Extend # Mc       KHOJKI SIGN VIRAMA
        11236..11237  ; Extend # Mn   [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA
        1123E         ; Extend # Mn       KHOJKI SIGN SUKUN
       +11241         ; Extend # Mn       KHOJKI VOWEL SIGN VOCALIC R
        112DF         ; Extend # Mn       KHUDAWADI SIGN ANUSVARA
        112E0..112E2  ; Extend # Mc   [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II
        112E3..112EA  ; Extend # Mn   [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA
       @@ -516,6 +519,16 @@ FF9E..FF9F    ; Extend # Lm   [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
        11D97         ; Extend # Mn       GUNJALA GONDI VIRAMA
        11EF3..11EF4  ; Extend # Mn   [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
        11EF5..11EF6  ; Extend # Mc   [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O
       +11F00..11F01  ; Extend # Mn   [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
       +11F03         ; Extend # Mc       KAWI SIGN VISARGA
       +11F34..11F35  ; Extend # Mc   [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
       +11F36..11F3A  ; Extend # Mn   [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
       +11F3E..11F3F  ; Extend # Mc   [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI
       +11F40         ; Extend # Mn       KAWI VOWEL SIGN EU
       +11F41         ; Extend # Mc       KAWI SIGN KILLER
       +11F42         ; Extend # Mn       KAWI CONJOINER
       +13440         ; Extend # Mn       EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY
       +13447..13455  ; Extend # Mn  [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED
        16AF0..16AF4  ; Extend # Mn   [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE
        16B30..16B36  ; Extend # Mn   [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM
        16F4F         ; Extend # Mn       MIAO SIGN CONSONANT MODIFIER BAR
       @@ -544,15 +557,17 @@ FF9E..FF9F    ; Extend # Lm   [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
        1E01B..1E021  ; Extend # Mn   [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
        1E023..1E024  ; Extend # Mn   [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
        1E026..1E02A  ; Extend # Mn   [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
       +1E08F         ; Extend # Mn       COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
        1E130..1E136  ; Extend # Mn   [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
        1E2AE         ; Extend # Mn       TOTO SIGN RISING TONE
        1E2EC..1E2EF  ; Extend # Mn   [4] WANCHO TONE TUP..WANCHO TONE KOINI
       +1E4EC..1E4EF  ; Extend # Mn   [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH
        1E8D0..1E8D6  ; Extend # Mn   [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS
        1E944..1E94A  ; Extend # Mn   [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA
        E0020..E007F  ; Extend # Cf  [96] TAG SPACE..CANCEL TAG
        E0100..E01EF  ; Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
        
       -# Total code points: 2508
       +# Total code points: 2550
        
        # ================================================
        
       @@ -581,12 +596,12 @@ FEFF          ; Format # Cf       ZERO WIDTH NO-BREAK SPACE
        FFF9..FFFB    ; Format # Cf   [3] INTERLINEAR ANNOTATION ANCHOR..INTERLINEAR ANNOTATION TERMINATOR
        110BD         ; Format # Cf       KAITHI NUMBER SIGN
        110CD         ; Format # Cf       KAITHI NUMBER SIGN ABOVE
       -13430..13438  ; Format # Cf   [9] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END SEGMENT
       +13430..1343F  ; Format # Cf  [16] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE
        1BCA0..1BCA3  ; Format # Cf   [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP
        1D173..1D17A  ; Format # Cf   [8] MUSICAL SYMBOL BEGIN BEAM..MUSICAL SYMBOL END PHRASE
        E0001         ; Format # Cf       LANGUAGE TAG
        
       -# Total code points: 65
       +# Total code points: 72
        
        # ================================================
        
       @@ -880,6 +895,7 @@ E0001         ; Format # Cf       LANGUAGE TAG
        052D          ; Lower # L&       CYRILLIC SMALL LETTER DCHE
        052F          ; Lower # L&       CYRILLIC SMALL LETTER EL WITH DESCENDER
        0560..0588    ; Lower # L&  [41] ARMENIAN SMALL LETTER TURNED AYB..ARMENIAN SMALL LETTER YI WITH STROKE
       +10FC          ; Lower # Lm       MODIFIER LETTER GEORGIAN NAR
        13F8..13FD    ; Lower # L&   [6] CHEROKEE SMALL LETTER YE..CHEROKEE SMALL LETTER MV
        1C80..1C88    ; Lower # L&   [9] CYRILLIC SMALL LETTER ROUNDED VE..CYRILLIC SMALL LETTER UNBLENDED UK
        1D00..1D2B    ; Lower # L&  [44] LATIN LETTER SMALL CAPITAL A..CYRILLIC LETTER SMALL CAPITAL EL
       @@ -1228,12 +1244,14 @@ A7D3          ; Lower # L&       LATIN SMALL LETTER DOUBLE THORN
        A7D5          ; Lower # L&       LATIN SMALL LETTER DOUBLE WYNN
        A7D7          ; Lower # L&       LATIN SMALL LETTER MIDDLE SCOTS S
        A7D9          ; Lower # L&       LATIN SMALL LETTER SIGMOID S
       +A7F2..A7F4    ; Lower # Lm   [3] MODIFIER LETTER CAPITAL C..MODIFIER LETTER CAPITAL Q
        A7F6          ; Lower # L&       LATIN SMALL LETTER REVERSED HALF H
        A7F8..A7F9    ; Lower # Lm   [2] MODIFIER LETTER CAPITAL H WITH STROKE..MODIFIER LETTER SMALL LIGATURE OE
        A7FA          ; Lower # L&       LATIN LETTER SMALL CAPITAL TURNED M
        AB30..AB5A    ; Lower # L&  [43] LATIN SMALL LETTER BARRED ALPHA..LATIN SMALL LETTER Y WITH SHORT RIGHT LEG
        AB5C..AB5F    ; Lower # Lm   [4] MODIFIER LETTER SMALL HENG..MODIFIER LETTER SMALL U WITH LEFT HOOK
        AB60..AB68    ; Lower # L&   [9] LATIN SMALL LETTER SAKHA YAT..LATIN SMALL LETTER TURNED R WITH MIDDLE TILDE
       +AB69          ; Lower # Lm       MODIFIER LETTER SMALL TURNED W
        AB70..ABBF    ; Lower # L&  [80] CHEROKEE SMALL LETTER A..CHEROKEE SMALL LETTER YA
        FB00..FB06    ; Lower # L&   [7] LATIN SMALL LIGATURE FF..LATIN SMALL LIGATURE ST
        FB13..FB17    ; Lower # L&   [5] ARMENIAN SMALL LIGATURE MEN NOW..ARMENIAN SMALL LIGATURE MEN XEH
       @@ -1281,9 +1299,11 @@ FF41..FF5A    ; Lower # L&  [26] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN
        1D7CB         ; Lower # L&       MATHEMATICAL BOLD SMALL DIGAMMA
        1DF00..1DF09  ; Lower # L&  [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
        1DF0B..1DF1E  ; Lower # L&  [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
       +1DF25..1DF2A  ; Lower # L&   [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
       +1E030..1E06D  ; Lower # Lm  [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
        1E922..1E943  ; Lower # L&  [34] ADLAM SMALL LETTER ALIF..ADLAM SMALL LETTER SHA
        
       -# Total code points: 2424
       +# Total code points: 2497
        
        # ================================================
        
       @@ -2102,7 +2122,6 @@ FF21..FF3A    ; Upper # L&  [26] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LAT
        1075..1081    ; OLetter # Lo  [13] MYANMAR LETTER SHAN KA..MYANMAR LETTER SHAN HA
        108E          ; OLetter # Lo       MYANMAR LETTER RUMAI PALAUNG FA
        10D0..10FA    ; OLetter # L&  [43] GEORGIAN LETTER AN..GEORGIAN LETTER AIN
       -10FC          ; OLetter # Lm       MODIFIER LETTER GEORGIAN NAR
        10FD..10FF    ; OLetter # L&   [3] GEORGIAN LETTER AEN..GEORGIAN LETTER LABIAL SIGN
        1100..1248    ; OLetter # Lo [329] HANGUL CHOSEONG KIYEOK..ETHIOPIC SYLLABLE QWA
        124A..124D    ; OLetter # Lo   [4] ETHIOPIC SYLLABLE QWI..ETHIOPIC SYLLABLE QWE
       @@ -2215,7 +2234,6 @@ A6E6..A6EF    ; OLetter # Nl  [10] BAMUM LETTER MO..BAMUM LETTER KOGHOM
        A717..A71F    ; OLetter # Lm   [9] MODIFIER LETTER DOT VERTICAL BAR..MODIFIER LETTER LOW INVERTED EXCLAMATION MARK
        A788          ; OLetter # Lm       MODIFIER LETTER LOW CIRCUMFLEX ACCENT
        A78F          ; OLetter # Lo       LATIN LETTER SINOLOGICAL DOT
       -A7F2..A7F4    ; OLetter # Lm   [3] MODIFIER LETTER CAPITAL C..MODIFIER LETTER CAPITAL Q
        A7F7          ; OLetter # Lo       LATIN EPIGRAPHIC LETTER SIDEWAYS I
        A7FB..A801    ; OLetter # Lo   [7] LATIN EPIGRAPHIC LETTER REVERSED F..SYLOTI NAGRI LETTER I
        A803..A805    ; OLetter # Lo   [3] SYLOTI NAGRI LETTER U..SYLOTI NAGRI LETTER O
       @@ -2258,7 +2276,6 @@ AB09..AB0E    ; OLetter # Lo   [6] ETHIOPIC SYLLABLE DDHU..ETHIOPIC SYLLABLE DDH
        AB11..AB16    ; OLetter # Lo   [6] ETHIOPIC SYLLABLE DZU..ETHIOPIC SYLLABLE DZO
        AB20..AB26    ; OLetter # Lo   [7] ETHIOPIC SYLLABLE CCHHA..ETHIOPIC SYLLABLE CCHHO
        AB28..AB2E    ; OLetter # Lo   [7] ETHIOPIC SYLLABLE BBA..ETHIOPIC SYLLABLE BBO
       -AB69          ; OLetter # Lm       MODIFIER LETTER SMALL TURNED W
        ABC0..ABE2    ; OLetter # Lo  [35] MEETEI MAYEK LETTER KOK..MEETEI MAYEK LETTER I LONSUM
        AC00..D7A3    ; OLetter # Lo [11172] HANGUL SYLLABLE GA..HANGUL SYLLABLE HIH
        D7B0..D7C6    ; OLetter # Lo  [23] HANGUL JUNGSEONG O-YEO..HANGUL JUNGSEONG ARAEA-E
       @@ -2366,6 +2383,7 @@ FFDA..FFDC    ; OLetter # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL 
        111DC         ; OLetter # Lo       SHARADA HEADSTROKE
        11200..11211  ; OLetter # Lo  [18] KHOJKI LETTER A..KHOJKI LETTER JJA
        11213..1122B  ; OLetter # Lo  [25] KHOJKI LETTER NYA..KHOJKI LETTER LLA
       +1123F..11240  ; OLetter # Lo   [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I
        11280..11286  ; OLetter # Lo   [7] MULTANI LETTER A..MULTANI LETTER GA
        11288         ; OLetter # Lo       MULTANI LETTER GHA
        1128A..1128D  ; OLetter # Lo   [4] MULTANI LETTER CA..MULTANI LETTER JJA
       @@ -2427,12 +2445,16 @@ FFDA..FFDC    ; OLetter # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL 
        11D6A..11D89  ; OLetter # Lo  [32] GUNJALA GONDI LETTER OO..GUNJALA GONDI LETTER SA
        11D98         ; OLetter # Lo       GUNJALA GONDI OM
        11EE0..11EF2  ; OLetter # Lo  [19] MAKASAR LETTER KA..MAKASAR ANGKA
       +11F02         ; OLetter # Lo       KAWI SIGN REPHA
       +11F04..11F10  ; OLetter # Lo  [13] KAWI LETTER A..KAWI LETTER O
       +11F12..11F33  ; OLetter # Lo  [34] KAWI LETTER KA..KAWI LETTER JNYA
        11FB0         ; OLetter # Lo       LISU LETTER YHA
        12000..12399  ; OLetter # Lo [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U
        12400..1246E  ; OLetter # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM
        12480..12543  ; OLetter # Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU
        12F90..12FF0  ; OLetter # Lo  [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
       -13000..1342E  ; OLetter # Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032
       +13000..1342F  ; OLetter # Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D
       +13441..13446  ; OLetter # Lo   [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN
        14400..14646  ; OLetter # Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530
        16800..16A38  ; OLetter # Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
        16A40..16A5E  ; OLetter # Lo  [31] MRO LETTER TA..MRO LETTER TEK
       @@ -2454,7 +2476,9 @@ FFDA..FFDC    ; OLetter # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL 
        1AFF5..1AFFB  ; OLetter # Lm   [7] KATAKANA LETTER MINNAN TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-5
        1AFFD..1AFFE  ; OLetter # Lm   [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
        1B000..1B122  ; OLetter # Lo [291] KATAKANA LETTER ARCHAIC E..KATAKANA LETTER ARCHAIC WU
       +1B132         ; OLetter # Lo       HIRAGANA LETTER SMALL KO
        1B150..1B152  ; OLetter # Lo   [3] HIRAGANA LETTER SMALL WI..HIRAGANA LETTER SMALL WO
       +1B155         ; OLetter # Lo       KATAKANA LETTER SMALL KO
        1B164..1B167  ; OLetter # Lo   [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
        1B170..1B2FB  ; OLetter # Lo [396] NUSHU CHARACTER-1B170..NUSHU CHARACTER-1B2FB
        1BC00..1BC6A  ; OLetter # Lo [107] DUPLOYAN LETTER H..DUPLOYAN LETTER VOCALIC M
       @@ -2467,6 +2491,8 @@ FFDA..FFDC    ; OLetter # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL 
        1E14E         ; OLetter # Lo       NYIAKENG PUACHUE HMONG LOGOGRAM NYAJ
        1E290..1E2AD  ; OLetter # Lo  [30] TOTO LETTER PA..TOTO LETTER A
        1E2C0..1E2EB  ; OLetter # Lo  [44] WANCHO LETTER AA..WANCHO LETTER YIH
       +1E4D0..1E4EA  ; OLetter # Lo  [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL
       +1E4EB         ; OLetter # Lm       NAG MUNDARI SIGN OJOD
        1E7E0..1E7E6  ; OLetter # Lo   [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
        1E7E8..1E7EB  ; OLetter # Lo   [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
        1E7ED..1E7EE  ; OLetter # Lo   [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
       @@ -2507,14 +2533,15 @@ FFDA..FFDC    ; OLetter # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL 
        1EEA5..1EEA9  ; OLetter # Lo   [5] ARABIC MATHEMATICAL DOUBLE-STRUCK WAW..ARABIC MATHEMATICAL DOUBLE-STRUCK YEH
        1EEAB..1EEBB  ; OLetter # Lo  [17] ARABIC MATHEMATICAL DOUBLE-STRUCK LAM..ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN
        20000..2A6DF  ; OLetter # Lo [42720] CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF
       -2A700..2B738  ; OLetter # Lo [4153] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B738
       +2A700..2B739  ; OLetter # Lo [4154] CJK UNIFIED IDEOGRAPH-2A700..CJK UNIFIED IDEOGRAPH-2B739
        2B740..2B81D  ; OLetter # Lo [222] CJK UNIFIED IDEOGRAPH-2B740..CJK UNIFIED IDEOGRAPH-2B81D
        2B820..2CEA1  ; OLetter # Lo [5762] CJK UNIFIED IDEOGRAPH-2B820..CJK UNIFIED IDEOGRAPH-2CEA1
        2CEB0..2EBE0  ; OLetter # Lo [7473] CJK UNIFIED IDEOGRAPH-2CEB0..CJK UNIFIED IDEOGRAPH-2EBE0
        2F800..2FA1D  ; OLetter # Lo [542] CJK COMPATIBILITY IDEOGRAPH-2F800..CJK COMPATIBILITY IDEOGRAPH-2FA1D
        30000..3134A  ; OLetter # Lo [4939] CJK UNIFIED IDEOGRAPH-30000..CJK UNIFIED IDEOGRAPH-3134A
       +31350..323AF  ; OLetter # Lo [4192] CJK UNIFIED IDEOGRAPH-31350..CJK UNIFIED IDEOGRAPH-323AF
        
       -# Total code points: 127761
       +# Total code points: 132036
        
        # ================================================
        
       @@ -2573,16 +2600,18 @@ FF10..FF19    ; Numeric # Nd  [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE
        11C50..11C59  ; Numeric # Nd  [10] BHAIKSUKI DIGIT ZERO..BHAIKSUKI DIGIT NINE
        11D50..11D59  ; Numeric # Nd  [10] MASARAM GONDI DIGIT ZERO..MASARAM GONDI DIGIT NINE
        11DA0..11DA9  ; Numeric # Nd  [10] GUNJALA GONDI DIGIT ZERO..GUNJALA GONDI DIGIT NINE
       +11F50..11F59  ; Numeric # Nd  [10] KAWI DIGIT ZERO..KAWI DIGIT NINE
        16A60..16A69  ; Numeric # Nd  [10] MRO DIGIT ZERO..MRO DIGIT NINE
        16AC0..16AC9  ; Numeric # Nd  [10] TANGSA DIGIT ZERO..TANGSA DIGIT NINE
        16B50..16B59  ; Numeric # Nd  [10] PAHAWH HMONG DIGIT ZERO..PAHAWH HMONG DIGIT NINE
        1D7CE..1D7FF  ; Numeric # Nd  [50] MATHEMATICAL BOLD DIGIT ZERO..MATHEMATICAL MONOSPACE DIGIT NINE
        1E140..1E149  ; Numeric # Nd  [10] NYIAKENG PUACHUE HMONG DIGIT ZERO..NYIAKENG PUACHUE HMONG DIGIT NINE
        1E2F0..1E2F9  ; Numeric # Nd  [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE
       +1E4F0..1E4F9  ; Numeric # Nd  [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE
        1E950..1E959  ; Numeric # Nd  [10] ADLAM DIGIT ZERO..ADLAM DIGIT NINE
        1FBF0..1FBF9  ; Numeric # Nd  [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE
        
       -# Total code points: 662
       +# Total code points: 682
        
        # ================================================
        
       @@ -2664,6 +2693,7 @@ FF61          ; STerm # Po       HALFWIDTH IDEOGRAPHIC FULL STOP
        11A9B..11A9C  ; STerm # Po   [2] SOYOMBO MARK SHAD..SOYOMBO MARK DOUBLE SHAD
        11C41..11C42  ; STerm # Po   [2] BHAIKSUKI DANDA..BHAIKSUKI DOUBLE DANDA
        11EF7..11EF8  ; STerm # Po   [2] MAKASAR PASSIMBANG..MAKASAR END OF SECTION
       +11F43..11F44  ; STerm # Po   [2] KAWI DANDA..KAWI DOUBLE DANDA
        16A6E..16A6F  ; STerm # Po   [2] MRO DANDA..MRO DOUBLE DANDA
        16AF5         ; STerm # Po       BASSA VAH FULL STOP
        16B37..16B38  ; STerm # Po   [2] PAHAWH HMONG SIGN VOS THOM..PAHAWH HMONG SIGN VOS TSHAB CEEB
       @@ -2672,7 +2702,7 @@ FF61          ; STerm # Po       HALFWIDTH IDEOGRAPHIC FULL STOP
        1BC9F         ; STerm # Po       DUPLOYAN PUNCTUATION CHINOOK FULL STOP
        1DA88         ; STerm # Po       SIGNWRITING FULL STOP
        
       -# Total code points: 149
       +# Total code points: 151
        
        # ================================================
        
 (DIR) diff --git a/data/SentenceBreakTest.txt b/data/SentenceBreakTest.txt
       @@ -1,11 +1,11 @@
       -# SentenceBreakTest-14.0.0.txt
       -# Date: 2021-03-08, 06:22:40 GMT
       -# © 2021 Unicode®, Inc.
       +# SentenceBreakTest-15.0.0.txt
       +# Date: 2022-02-26, 00:39:00 GMT
       +# © 2022 Unicode®, Inc.
        # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
       -# For terms of use, see http://www.unicode.org/terms_of_use.html
       +# For terms of use, see https://www.unicode.org/terms_of_use.html
        #
        # Unicode Character Database
       -#   For documentation, see http://www.unicode.org/reports/tr44/
       +#   For documentation, see https://www.unicode.org/reports/tr44/
        #
        # Default Sentence_Break Test
        #
 (DIR) diff --git a/data/SpecialCasing.txt b/data/SpecialCasing.txt
       @@ -1,11 +1,11 @@
       -# SpecialCasing-14.0.0.txt
       -# Date: 2021-03-08, 19:35:55 GMT
       -# © 2021 Unicode®, Inc.
       +# SpecialCasing-15.0.0.txt
       +# Date: 2022-02-02, 23:35:52 GMT
       +# © 2022 Unicode®, Inc.
        # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
       -# For terms of use, see http://www.unicode.org/terms_of_use.html
       +# For terms of use, see https://www.unicode.org/terms_of_use.html
        #
        # Unicode Character Database
       -#   For documentation, see http://www.unicode.org/reports/tr44/
       +#   For documentation, see https://www.unicode.org/reports/tr44/
        #
        # Special Casing
        #
 (DIR) diff --git a/data/UnicodeData.txt b/data/UnicodeData.txt
       @@ -2975,6 +2975,7 @@
        0CEF;KANNADA DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
        0CF1;KANNADA SIGN JIHVAMULIYA;Lo;0;L;;;;;N;;;;;
        0CF2;KANNADA SIGN UPADHMANIYA;Lo;0;L;;;;;N;;;;;
       +0CF3;KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT;Mc;0;L;;;;;N;;;;;
        0D00;MALAYALAM SIGN COMBINING ANUSVARA ABOVE;Mn;0;NSM;;;;;N;;;;;
        0D01;MALAYALAM SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;
        0D02;MALAYALAM SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;
       @@ -3339,6 +3340,7 @@
        0ECB;LAO TONE MAI CATAWA;Mn;122;NSM;;;;;N;;;;;
        0ECC;LAO CANCELLATION MARK;Mn;0;NSM;;;;;N;;;;;
        0ECD;LAO NIGGAHITA;Mn;0;NSM;;;;;N;;;;;
       +0ECE;LAO YAMAKKAN;Mn;0;NSM;;;;;N;;;;;
        0ED0;LAO DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;;
        0ED1;LAO DIGIT ONE;Nd;0;L;;1;1;1;N;;;;;
        0ED2;LAO DIGIT TWO;Nd;0;L;;2;2;2;N;;;;;
       @@ -19393,6 +19395,9 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
        10EAD;YEZIDI HYPHENATION MARK;Pd;0;R;;;;;N;;;;;
        10EB0;YEZIDI LETTER LAM WITH DOT ABOVE;Lo;0;R;;;;;N;;;;;
        10EB1;YEZIDI LETTER YOT WITH CIRCUMFLEX ABOVE;Lo;0;R;;;;;N;;;;;
       +10EFD;ARABIC SMALL LOW WORD SAKTA;Mn;220;NSM;;;;;N;;;;;
       +10EFE;ARABIC SMALL LOW WORD QASR;Mn;220;NSM;;;;;N;;;;;
       +10EFF;ARABIC SMALL LOW WORD MADDA;Mn;220;NSM;;;;;N;;;;;
        10F00;OLD SOGDIAN LETTER ALEPH;Lo;0;R;;;;;N;;;;;
        10F01;OLD SOGDIAN LETTER FINAL ALEPH;Lo;0;R;;;;;N;;;;;
        10F02;OLD SOGDIAN LETTER BETH;Lo;0;R;;;;;N;;;;;
       @@ -20058,6 +20063,9 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
        1123C;KHOJKI DOUBLE SECTION MARK;Po;0;L;;;;;N;;;;;
        1123D;KHOJKI ABBREVIATION SIGN;Po;0;L;;;;;N;;;;;
        1123E;KHOJKI SIGN SUKUN;Mn;0;NSM;;;;;N;;;;;
       +1123F;KHOJKI LETTER QA;Lo;0;L;;;;;N;;;;;
       +11240;KHOJKI LETTER SHORT I;Lo;0;L;;;;;N;;;;;
       +11241;KHOJKI VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;
        11280;MULTANI LETTER A;Lo;0;L;;;;;N;;;;;
        11281;MULTANI LETTER I;Lo;0;L;;;;;N;;;;;
        11282;MULTANI LETTER U;Lo;0;L;;;;;N;;;;;
       @@ -21256,6 +21264,16 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
        11AF6;PAU CIN HAU LOW-FALLING TONE LONG FINAL;Lo;0;L;;;;;N;;;;;
        11AF7;PAU CIN HAU LOW-FALLING TONE FINAL;Lo;0;L;;;;;N;;;;;
        11AF8;PAU CIN HAU GLOTTAL STOP FINAL;Lo;0;L;;;;;N;;;;;
       +11B00;DEVANAGARI HEAD MARK;Po;0;L;;;;;N;;;;;
       +11B01;DEVANAGARI HEAD MARK WITH HEADSTROKE;Po;0;L;;;;;N;;;;;
       +11B02;DEVANAGARI SIGN BHALE;Po;0;L;;;;;N;;;;;
       +11B03;DEVANAGARI SIGN BHALE WITH HOOK;Po;0;L;;;;;N;;;;;
       +11B04;DEVANAGARI SIGN EXTENDED BHALE;Po;0;L;;;;;N;;;;;
       +11B05;DEVANAGARI SIGN EXTENDED BHALE WITH HOOK;Po;0;L;;;;;N;;;;;
       +11B06;DEVANAGARI SIGN WESTERN FIVE-LIKE BHALE;Po;0;L;;;;;N;;;;;
       +11B07;DEVANAGARI SIGN WESTERN NINE-LIKE BHALE;Po;0;L;;;;;N;;;;;
       +11B08;DEVANAGARI SIGN REVERSED NINE-LIKE BHALE;Po;0;L;;;;;N;;;;;
       +11B09;DEVANAGARI SIGN MINDU;Po;0;L;;;;;N;;;;;
        11C00;BHAIKSUKI LETTER A;Lo;0;L;;;;;N;;;;;
        11C01;BHAIKSUKI LETTER AA;Lo;0;L;;;;;N;;;;;
        11C02;BHAIKSUKI LETTER I;Lo;0;L;;;;;N;;;;;
       @@ -21584,6 +21602,92 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
        11EF6;MAKASAR VOWEL SIGN O;Mc;0;L;;;;;N;;;;;
        11EF7;MAKASAR PASSIMBANG;Po;0;L;;;;;N;;;;;
        11EF8;MAKASAR END OF SECTION;Po;0;L;;;;;N;;;;;
       +11F00;KAWI SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;
       +11F01;KAWI SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;;
       +11F02;KAWI SIGN REPHA;Lo;0;L;;;;;N;;;;;
       +11F03;KAWI SIGN VISARGA;Mc;0;L;;;;;N;;;;;
       +11F04;KAWI LETTER A;Lo;0;L;;;;;N;;;;;
       +11F05;KAWI LETTER AA;Lo;0;L;;;;;N;;;;;
       +11F06;KAWI LETTER I;Lo;0;L;;;;;N;;;;;
       +11F07;KAWI LETTER II;Lo;0;L;;;;;N;;;;;
       +11F08;KAWI LETTER U;Lo;0;L;;;;;N;;;;;
       +11F09;KAWI LETTER UU;Lo;0;L;;;;;N;;;;;
       +11F0A;KAWI LETTER VOCALIC R;Lo;0;L;;;;;N;;;;;
       +11F0B;KAWI LETTER VOCALIC RR;Lo;0;L;;;;;N;;;;;
       +11F0C;KAWI LETTER VOCALIC L;Lo;0;L;;;;;N;;;;;
       +11F0D;KAWI LETTER VOCALIC LL;Lo;0;L;;;;;N;;;;;
       +11F0E;KAWI LETTER E;Lo;0;L;;;;;N;;;;;
       +11F0F;KAWI LETTER AI;Lo;0;L;;;;;N;;;;;
       +11F10;KAWI LETTER O;Lo;0;L;;;;;N;;;;;
       +11F12;KAWI LETTER KA;Lo;0;L;;;;;N;;;;;
       +11F13;KAWI LETTER KHA;Lo;0;L;;;;;N;;;;;
       +11F14;KAWI LETTER GA;Lo;0;L;;;;;N;;;;;
       +11F15;KAWI LETTER GHA;Lo;0;L;;;;;N;;;;;
       +11F16;KAWI LETTER NGA;Lo;0;L;;;;;N;;;;;
       +11F17;KAWI LETTER CA;Lo;0;L;;;;;N;;;;;
       +11F18;KAWI LETTER CHA;Lo;0;L;;;;;N;;;;;
       +11F19;KAWI LETTER JA;Lo;0;L;;;;;N;;;;;
       +11F1A;KAWI LETTER JHA;Lo;0;L;;;;;N;;;;;
       +11F1B;KAWI LETTER NYA;Lo;0;L;;;;;N;;;;;
       +11F1C;KAWI LETTER TTA;Lo;0;L;;;;;N;;;;;
       +11F1D;KAWI LETTER TTHA;Lo;0;L;;;;;N;;;;;
       +11F1E;KAWI LETTER DDA;Lo;0;L;;;;;N;;;;;
       +11F1F;KAWI LETTER DDHA;Lo;0;L;;;;;N;;;;;
       +11F20;KAWI LETTER NNA;Lo;0;L;;;;;N;;;;;
       +11F21;KAWI LETTER TA;Lo;0;L;;;;;N;;;;;
       +11F22;KAWI LETTER THA;Lo;0;L;;;;;N;;;;;
       +11F23;KAWI LETTER DA;Lo;0;L;;;;;N;;;;;
       +11F24;KAWI LETTER DHA;Lo;0;L;;;;;N;;;;;
       +11F25;KAWI LETTER NA;Lo;0;L;;;;;N;;;;;
       +11F26;KAWI LETTER PA;Lo;0;L;;;;;N;;;;;
       +11F27;KAWI LETTER PHA;Lo;0;L;;;;;N;;;;;
       +11F28;KAWI LETTER BA;Lo;0;L;;;;;N;;;;;
       +11F29;KAWI LETTER BHA;Lo;0;L;;;;;N;;;;;
       +11F2A;KAWI LETTER MA;Lo;0;L;;;;;N;;;;;
       +11F2B;KAWI LETTER YA;Lo;0;L;;;;;N;;;;;
       +11F2C;KAWI LETTER RA;Lo;0;L;;;;;N;;;;;
       +11F2D;KAWI LETTER LA;Lo;0;L;;;;;N;;;;;
       +11F2E;KAWI LETTER WA;Lo;0;L;;;;;N;;;;;
       +11F2F;KAWI LETTER SHA;Lo;0;L;;;;;N;;;;;
       +11F30;KAWI LETTER SSA;Lo;0;L;;;;;N;;;;;
       +11F31;KAWI LETTER SA;Lo;0;L;;;;;N;;;;;
       +11F32;KAWI LETTER HA;Lo;0;L;;;;;N;;;;;
       +11F33;KAWI LETTER JNYA;Lo;0;L;;;;;N;;;;;
       +11F34;KAWI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;
       +11F35;KAWI VOWEL SIGN ALTERNATE AA;Mc;0;L;;;;;N;;;;;
       +11F36;KAWI VOWEL SIGN I;Mn;0;NSM;;;;;N;;;;;
       +11F37;KAWI VOWEL SIGN II;Mn;0;NSM;;;;;N;;;;;
       +11F38;KAWI VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;
       +11F39;KAWI VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;
       +11F3A;KAWI VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;
       +11F3E;KAWI VOWEL SIGN E;Mc;0;L;;;;;N;;;;;
       +11F3F;KAWI VOWEL SIGN AI;Mc;0;L;;;;;N;;;;;
       +11F40;KAWI VOWEL SIGN EU;Mn;0;NSM;;;;;N;;;;;
       +11F41;KAWI SIGN KILLER;Mc;9;L;;;;;N;;;;;
       +11F42;KAWI CONJOINER;Mn;9;NSM;;;;;N;;;;;
       +11F43;KAWI DANDA;Po;0;L;;;;;N;;;;;
       +11F44;KAWI DOUBLE DANDA;Po;0;L;;;;;N;;;;;
       +11F45;KAWI PUNCTUATION SECTION MARKER;Po;0;L;;;;;N;;;;;
       +11F46;KAWI PUNCTUATION ALTERNATE SECTION MARKER;Po;0;L;;;;;N;;;;;
       +11F47;KAWI PUNCTUATION FLOWER;Po;0;L;;;;;N;;;;;
       +11F48;KAWI PUNCTUATION SPACE FILLER;Po;0;L;;;;;N;;;;;
       +11F49;KAWI PUNCTUATION DOT;Po;0;L;;;;;N;;;;;
       +11F4A;KAWI PUNCTUATION DOUBLE DOT;Po;0;L;;;;;N;;;;;
       +11F4B;KAWI PUNCTUATION TRIPLE DOT;Po;0;L;;;;;N;;;;;
       +11F4C;KAWI PUNCTUATION CIRCLE;Po;0;L;;;;;N;;;;;
       +11F4D;KAWI PUNCTUATION FILLED CIRCLE;Po;0;L;;;;;N;;;;;
       +11F4E;KAWI PUNCTUATION SPIRAL;Po;0;L;;;;;N;;;;;
       +11F4F;KAWI PUNCTUATION CLOSING SPIRAL;Po;0;L;;;;;N;;;;;
       +11F50;KAWI DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;;
       +11F51;KAWI DIGIT ONE;Nd;0;L;;1;1;1;N;;;;;
       +11F52;KAWI DIGIT TWO;Nd;0;L;;2;2;2;N;;;;;
       +11F53;KAWI DIGIT THREE;Nd;0;L;;3;3;3;N;;;;;
       +11F54;KAWI DIGIT FOUR;Nd;0;L;;4;4;4;N;;;;;
       +11F55;KAWI DIGIT FIVE;Nd;0;L;;5;5;5;N;;;;;
       +11F56;KAWI DIGIT SIX;Nd;0;L;;6;6;6;N;;;;;
       +11F57;KAWI DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;;
       +11F58;KAWI DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;;
       +11F59;KAWI DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
        11FB0;LISU LETTER YHA;Lo;0;L;;;;;N;;;;;
        11FC0;TAMIL FRACTION ONE THREE-HUNDRED-AND-TWENTIETH;No;0;L;;;;1/320;N;;;;;
        11FC1;TAMIL FRACTION ONE ONE-HUNDRED-AND-SIXTIETH;No;0;L;;;;1/160;N;;;;;
       @@ -24040,6 +24144,7 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
        1342C;EGYPTIAN HIEROGLYPH AA030;Lo;0;L;;;;;N;;;;;
        1342D;EGYPTIAN HIEROGLYPH AA031;Lo;0;L;;;;;N;;;;;
        1342E;EGYPTIAN HIEROGLYPH AA032;Lo;0;L;;;;;N;;;;;
       +1342F;EGYPTIAN HIEROGLYPH V011D;Lo;0;L;;;;;N;;;;;
        13430;EGYPTIAN HIEROGLYPH VERTICAL JOINER;Cf;0;L;;;;;N;;;;;
        13431;EGYPTIAN HIEROGLYPH HORIZONTAL JOINER;Cf;0;L;;;;;N;;;;;
        13432;EGYPTIAN HIEROGLYPH INSERT AT TOP START;Cf;0;L;;;;;N;;;;;
       @@ -24049,6 +24154,35 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
        13436;EGYPTIAN HIEROGLYPH OVERLAY MIDDLE;Cf;0;L;;;;;N;;;;;
        13437;EGYPTIAN HIEROGLYPH BEGIN SEGMENT;Cf;0;L;;;;;N;;;;;
        13438;EGYPTIAN HIEROGLYPH END SEGMENT;Cf;0;L;;;;;N;;;;;
       +13439;EGYPTIAN HIEROGLYPH INSERT AT MIDDLE;Cf;0;L;;;;;N;;;;;
       +1343A;EGYPTIAN HIEROGLYPH INSERT AT TOP;Cf;0;L;;;;;N;;;;;
       +1343B;EGYPTIAN HIEROGLYPH INSERT AT BOTTOM;Cf;0;L;;;;;N;;;;;
       +1343C;EGYPTIAN HIEROGLYPH BEGIN ENCLOSURE;Cf;0;L;;;;;N;;;;;
       +1343D;EGYPTIAN HIEROGLYPH END ENCLOSURE;Cf;0;L;;;;;N;;;;;
       +1343E;EGYPTIAN HIEROGLYPH BEGIN WALLED ENCLOSURE;Cf;0;L;;;;;N;;;;;
       +1343F;EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE;Cf;0;L;;;;;N;;;;;
       +13440;EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY;Mn;0;NSM;;;;;N;;;;;
       +13441;EGYPTIAN HIEROGLYPH FULL BLANK;Lo;0;L;;;;;N;;;;;
       +13442;EGYPTIAN HIEROGLYPH HALF BLANK;Lo;0;L;;;;;N;;;;;
       +13443;EGYPTIAN HIEROGLYPH LOST SIGN;Lo;0;L;;;;;N;;;;;
       +13444;EGYPTIAN HIEROGLYPH HALF LOST SIGN;Lo;0;L;;;;;N;;;;;
       +13445;EGYPTIAN HIEROGLYPH TALL LOST SIGN;Lo;0;L;;;;;N;;;;;
       +13446;EGYPTIAN HIEROGLYPH WIDE LOST SIGN;Lo;0;L;;;;;N;;;;;
       +13447;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START;Mn;0;NSM;;;;;N;;;;;
       +13448;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT BOTTOM START;Mn;0;NSM;;;;;N;;;;;
       +13449;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT START;Mn;0;NSM;;;;;N;;;;;
       +1344A;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP END;Mn;0;NSM;;;;;N;;;;;
       +1344B;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP;Mn;0;NSM;;;;;N;;;;;
       +1344C;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT BOTTOM START AND TOP END;Mn;0;NSM;;;;;N;;;;;
       +1344D;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT START AND TOP;Mn;0;NSM;;;;;N;;;;;
       +1344E;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT BOTTOM END;Mn;0;NSM;;;;;N;;;;;
       +1344F;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START AND BOTTOM END;Mn;0;NSM;;;;;N;;;;;
       +13450;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT BOTTOM;Mn;0;NSM;;;;;N;;;;;
       +13451;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT START AND BOTTOM;Mn;0;NSM;;;;;N;;;;;
       +13452;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT END;Mn;0;NSM;;;;;N;;;;;
       +13453;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP AND END;Mn;0;NSM;;;;;N;;;;;
       +13454;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT BOTTOM AND END;Mn;0;NSM;;;;;N;;;;;
       +13455;EGYPTIAN HIEROGLYPH MODIFIER DAMAGED;Mn;0;NSM;;;;;N;;;;;
        14400;ANATOLIAN HIEROGLYPH A001;Lo;0;L;;;;;N;;;;;
        14401;ANATOLIAN HIEROGLYPH A002;Lo;0;L;;;;;N;;;;;
        14402;ANATOLIAN HIEROGLYPH A003;Lo;0;L;;;;;N;;;;;
       @@ -27289,9 +27423,11 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
        1B120;KATAKANA LETTER ARCHAIC YI;Lo;0;L;;;;;N;;;;;
        1B121;KATAKANA LETTER ARCHAIC YE;Lo;0;L;;;;;N;;;;;
        1B122;KATAKANA LETTER ARCHAIC WU;Lo;0;L;;;;;N;;;;;
       +1B132;HIRAGANA LETTER SMALL KO;Lo;0;L;;;;;N;;;;;
        1B150;HIRAGANA LETTER SMALL WI;Lo;0;L;;;;;N;;;;;
        1B151;HIRAGANA LETTER SMALL WE;Lo;0;L;;;;;N;;;;;
        1B152;HIRAGANA LETTER SMALL WO;Lo;0;L;;;;;N;;;;;
       +1B155;KATAKANA LETTER SMALL KO;Lo;0;L;;;;;N;;;;;
        1B164;KATAKANA LETTER SMALL WI;Lo;0;L;;;;;N;;;;;
        1B165;KATAKANA LETTER SMALL WE;Lo;0;L;;;;;N;;;;;
        1B166;KATAKANA LETTER SMALL WO;Lo;0;L;;;;;N;;;;;
       @@ -28573,6 +28709,26 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
        1D243;COMBINING GREEK MUSICAL TETRASEME;Mn;230;NSM;;;;;N;;;;;
        1D244;COMBINING GREEK MUSICAL PENTASEME;Mn;230;NSM;;;;;N;;;;;
        1D245;GREEK MUSICAL LEIMMA;So;0;ON;;;;;N;;;;;
       +1D2C0;KAKTOVIK NUMERAL ZERO;No;0;L;;;;0;N;;;;;
       +1D2C1;KAKTOVIK NUMERAL ONE;No;0;L;;;;1;N;;;;;
       +1D2C2;KAKTOVIK NUMERAL TWO;No;0;L;;;;2;N;;;;;
       +1D2C3;KAKTOVIK NUMERAL THREE;No;0;L;;;;3;N;;;;;
       +1D2C4;KAKTOVIK NUMERAL FOUR;No;0;L;;;;4;N;;;;;
       +1D2C5;KAKTOVIK NUMERAL FIVE;No;0;L;;;;5;N;;;;;
       +1D2C6;KAKTOVIK NUMERAL SIX;No;0;L;;;;6;N;;;;;
       +1D2C7;KAKTOVIK NUMERAL SEVEN;No;0;L;;;;7;N;;;;;
       +1D2C8;KAKTOVIK NUMERAL EIGHT;No;0;L;;;;8;N;;;;;
       +1D2C9;KAKTOVIK NUMERAL NINE;No;0;L;;;;9;N;;;;;
       +1D2CA;KAKTOVIK NUMERAL TEN;No;0;L;;;;10;N;;;;;
       +1D2CB;KAKTOVIK NUMERAL ELEVEN;No;0;L;;;;11;N;;;;;
       +1D2CC;KAKTOVIK NUMERAL TWELVE;No;0;L;;;;12;N;;;;;
       +1D2CD;KAKTOVIK NUMERAL THIRTEEN;No;0;L;;;;13;N;;;;;
       +1D2CE;KAKTOVIK NUMERAL FOURTEEN;No;0;L;;;;14;N;;;;;
       +1D2CF;KAKTOVIK NUMERAL FIFTEEN;No;0;L;;;;15;N;;;;;
       +1D2D0;KAKTOVIK NUMERAL SIXTEEN;No;0;L;;;;16;N;;;;;
       +1D2D1;KAKTOVIK NUMERAL SEVENTEEN;No;0;L;;;;17;N;;;;;
       +1D2D2;KAKTOVIK NUMERAL EIGHTEEN;No;0;L;;;;18;N;;;;;
       +1D2D3;KAKTOVIK NUMERAL NINETEEN;No;0;L;;;;19;N;;;;;
        1D2E0;MAYAN NUMERAL ZERO;No;0;L;;;;0;N;;;;;
        1D2E1;MAYAN NUMERAL ONE;No;0;L;;;;1;N;;;;;
        1D2E2;MAYAN NUMERAL TWO;No;0;L;;;;2;N;;;;;
       @@ -30404,6 +30560,12 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
        1DF1C;LATIN SMALL LETTER TESH DIGRAPH WITH RETROFLEX HOOK;Ll;0;L;;;;;N;;;;;
        1DF1D;LATIN SMALL LETTER C WITH RETROFLEX HOOK;Ll;0;L;;;;;N;;;;;
        1DF1E;LATIN SMALL LETTER S WITH CURL;Ll;0;L;;;;;N;;;;;
       +1DF25;LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK;Ll;0;L;;;;;N;;;;;
       +1DF26;LATIN SMALL LETTER L WITH MID-HEIGHT LEFT HOOK;Ll;0;L;;;;;N;;;;;
       +1DF27;LATIN SMALL LETTER N WITH MID-HEIGHT LEFT HOOK;Ll;0;L;;;;;N;;;;;
       +1DF28;LATIN SMALL LETTER R WITH MID-HEIGHT LEFT HOOK;Ll;0;L;;;;;N;;;;;
       +1DF29;LATIN SMALL LETTER S WITH MID-HEIGHT LEFT HOOK;Ll;0;L;;;;;N;;;;;
       +1DF2A;LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK;Ll;0;L;;;;;N;;;;;
        1E000;COMBINING GLAGOLITIC LETTER AZU;Mn;230;NSM;;;;;N;;;;;
        1E001;COMBINING GLAGOLITIC LETTER BUKY;Mn;230;NSM;;;;;N;;;;;
        1E002;COMBINING GLAGOLITIC LETTER VEDE;Mn;230;NSM;;;;;N;;;;;
       @@ -30442,6 +30604,69 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
        1E028;COMBINING GLAGOLITIC LETTER BIG YUS;Mn;230;NSM;;;;;N;;;;;
        1E029;COMBINING GLAGOLITIC LETTER IOTATED BIG YUS;Mn;230;NSM;;;;;N;;;;;
        1E02A;COMBINING GLAGOLITIC LETTER FITA;Mn;230;NSM;;;;;N;;;;;
       +1E030;MODIFIER LETTER CYRILLIC SMALL A;Lm;0;L;<super> 0430;;;;N;;;;;
       +1E031;MODIFIER LETTER CYRILLIC SMALL BE;Lm;0;L;<super> 0431;;;;N;;;;;
       +1E032;MODIFIER LETTER CYRILLIC SMALL VE;Lm;0;L;<super> 0432;;;;N;;;;;
       +1E033;MODIFIER LETTER CYRILLIC SMALL GHE;Lm;0;L;<super> 0433;;;;N;;;;;
       +1E034;MODIFIER LETTER CYRILLIC SMALL DE;Lm;0;L;<super> 0434;;;;N;;;;;
       +1E035;MODIFIER LETTER CYRILLIC SMALL IE;Lm;0;L;<super> 0435;;;;N;;;;;
       +1E036;MODIFIER LETTER CYRILLIC SMALL ZHE;Lm;0;L;<super> 0436;;;;N;;;;;
       +1E037;MODIFIER LETTER CYRILLIC SMALL ZE;Lm;0;L;<super> 0437;;;;N;;;;;
       +1E038;MODIFIER LETTER CYRILLIC SMALL I;Lm;0;L;<super> 0438;;;;N;;;;;
       +1E039;MODIFIER LETTER CYRILLIC SMALL KA;Lm;0;L;<super> 043A;;;;N;;;;;
       +1E03A;MODIFIER LETTER CYRILLIC SMALL EL;Lm;0;L;<super> 043B;;;;N;;;;;
       +1E03B;MODIFIER LETTER CYRILLIC SMALL EM;Lm;0;L;<super> 043C;;;;N;;;;;
       +1E03C;MODIFIER LETTER CYRILLIC SMALL O;Lm;0;L;<super> 043E;;;;N;;;;;
       +1E03D;MODIFIER LETTER CYRILLIC SMALL PE;Lm;0;L;<super> 043F;;;;N;;;;;
       +1E03E;MODIFIER LETTER CYRILLIC SMALL ER;Lm;0;L;<super> 0440;;;;N;;;;;
       +1E03F;MODIFIER LETTER CYRILLIC SMALL ES;Lm;0;L;<super> 0441;;;;N;;;;;
       +1E040;MODIFIER LETTER CYRILLIC SMALL TE;Lm;0;L;<super> 0442;;;;N;;;;;
       +1E041;MODIFIER LETTER CYRILLIC SMALL U;Lm;0;L;<super> 0443;;;;N;;;;;
       +1E042;MODIFIER LETTER CYRILLIC SMALL EF;Lm;0;L;<super> 0444;;;;N;;;;;
       +1E043;MODIFIER LETTER CYRILLIC SMALL HA;Lm;0;L;<super> 0445;;;;N;;;;;
       +1E044;MODIFIER LETTER CYRILLIC SMALL TSE;Lm;0;L;<super> 0446;;;;N;;;;;
       +1E045;MODIFIER LETTER CYRILLIC SMALL CHE;Lm;0;L;<super> 0447;;;;N;;;;;
       +1E046;MODIFIER LETTER CYRILLIC SMALL SHA;Lm;0;L;<super> 0448;;;;N;;;;;
       +1E047;MODIFIER LETTER CYRILLIC SMALL YERU;Lm;0;L;<super> 044B;;;;N;;;;;
       +1E048;MODIFIER LETTER CYRILLIC SMALL E;Lm;0;L;<super> 044D;;;;N;;;;;
       +1E049;MODIFIER LETTER CYRILLIC SMALL YU;Lm;0;L;<super> 044E;;;;N;;;;;
       +1E04A;MODIFIER LETTER CYRILLIC SMALL DZZE;Lm;0;L;<super> A689;;;;N;;;;;
       +1E04B;MODIFIER LETTER CYRILLIC SMALL SCHWA;Lm;0;L;<super> 04D9;;;;N;;;;;
       +1E04C;MODIFIER LETTER CYRILLIC SMALL BYELORUSSIAN-UKRAINIAN I;Lm;0;L;<super> 0456;;;;N;;;;;
       +1E04D;MODIFIER LETTER CYRILLIC SMALL JE;Lm;0;L;<super> 0458;;;;N;;;;;
       +1E04E;MODIFIER LETTER CYRILLIC SMALL BARRED O;Lm;0;L;<super> 04E9;;;;N;;;;;
       +1E04F;MODIFIER LETTER CYRILLIC SMALL STRAIGHT U;Lm;0;L;<super> 04AF;;;;N;;;;;
       +1E050;MODIFIER LETTER CYRILLIC SMALL PALOCHKA;Lm;0;L;<super> 04CF;;;;N;;;;;
       +1E051;CYRILLIC SUBSCRIPT SMALL LETTER A;Lm;0;L;<sub> 0430;;;;N;;;;;
       +1E052;CYRILLIC SUBSCRIPT SMALL LETTER BE;Lm;0;L;<sub> 0431;;;;N;;;;;
       +1E053;CYRILLIC SUBSCRIPT SMALL LETTER VE;Lm;0;L;<sub> 0432;;;;N;;;;;
       +1E054;CYRILLIC SUBSCRIPT SMALL LETTER GHE;Lm;0;L;<sub> 0433;;;;N;;;;;
       +1E055;CYRILLIC SUBSCRIPT SMALL LETTER DE;Lm;0;L;<sub> 0434;;;;N;;;;;
       +1E056;CYRILLIC SUBSCRIPT SMALL LETTER IE;Lm;0;L;<sub> 0435;;;;N;;;;;
       +1E057;CYRILLIC SUBSCRIPT SMALL LETTER ZHE;Lm;0;L;<sub> 0436;;;;N;;;;;
       +1E058;CYRILLIC SUBSCRIPT SMALL LETTER ZE;Lm;0;L;<sub> 0437;;;;N;;;;;
       +1E059;CYRILLIC SUBSCRIPT SMALL LETTER I;Lm;0;L;<sub> 0438;;;;N;;;;;
       +1E05A;CYRILLIC SUBSCRIPT SMALL LETTER KA;Lm;0;L;<sub> 043A;;;;N;;;;;
       +1E05B;CYRILLIC SUBSCRIPT SMALL LETTER EL;Lm;0;L;<sub> 043B;;;;N;;;;;
       +1E05C;CYRILLIC SUBSCRIPT SMALL LETTER O;Lm;0;L;<sub> 043E;;;;N;;;;;
       +1E05D;CYRILLIC SUBSCRIPT SMALL LETTER PE;Lm;0;L;<sub> 043F;;;;N;;;;;
       +1E05E;CYRILLIC SUBSCRIPT SMALL LETTER ES;Lm;0;L;<sub> 0441;;;;N;;;;;
       +1E05F;CYRILLIC SUBSCRIPT SMALL LETTER U;Lm;0;L;<sub> 0443;;;;N;;;;;
       +1E060;CYRILLIC SUBSCRIPT SMALL LETTER EF;Lm;0;L;<sub> 0444;;;;N;;;;;
       +1E061;CYRILLIC SUBSCRIPT SMALL LETTER HA;Lm;0;L;<sub> 0445;;;;N;;;;;
       +1E062;CYRILLIC SUBSCRIPT SMALL LETTER TSE;Lm;0;L;<sub> 0446;;;;N;;;;;
       +1E063;CYRILLIC SUBSCRIPT SMALL LETTER CHE;Lm;0;L;<sub> 0447;;;;N;;;;;
       +1E064;CYRILLIC SUBSCRIPT SMALL LETTER SHA;Lm;0;L;<sub> 0448;;;;N;;;;;
       +1E065;CYRILLIC SUBSCRIPT SMALL LETTER HARD SIGN;Lm;0;L;<sub> 044A;;;;N;;;;;
       +1E066;CYRILLIC SUBSCRIPT SMALL LETTER YERU;Lm;0;L;<sub> 044B;;;;N;;;;;
       +1E067;CYRILLIC SUBSCRIPT SMALL LETTER GHE WITH UPTURN;Lm;0;L;<sub> 0491;;;;N;;;;;
       +1E068;CYRILLIC SUBSCRIPT SMALL LETTER BYELORUSSIAN-UKRAINIAN I;Lm;0;L;<sub> 0456;;;;N;;;;;
       +1E069;CYRILLIC SUBSCRIPT SMALL LETTER DZE;Lm;0;L;<sub> 0455;;;;N;;;;;
       +1E06A;CYRILLIC SUBSCRIPT SMALL LETTER DZHE;Lm;0;L;<sub> 045F;;;;N;;;;;
       +1E06B;MODIFIER LETTER CYRILLIC SMALL ES WITH DESCENDER;Lm;0;L;<super> 04AB;;;;N;;;;;
       +1E06C;MODIFIER LETTER CYRILLIC SMALL YERU WITH BACK YER;Lm;0;L;<super> A651;;;;N;;;;;
       +1E06D;MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE;Lm;0;L;<super> 04B1;;;;N;;;;;
       +1E08F;COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I;Mn;230;NSM;;;;;N;;;;;
        1E100;NYIAKENG PUACHUE HMONG LETTER MA;Lo;0;L;;;;;N;;;;;
        1E101;NYIAKENG PUACHUE HMONG LETTER TSA;Lo;0;L;;;;;N;;;;;
        1E102;NYIAKENG PUACHUE HMONG LETTER NTA;Lo;0;L;;;;;N;;;;;
       @@ -30603,6 +30828,48 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
        1E2F8;WANCHO DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;;
        1E2F9;WANCHO DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
        1E2FF;WANCHO NGUN SIGN;Sc;0;ET;;;;;N;;;;;
       +1E4D0;NAG MUNDARI LETTER O;Lo;0;L;;;;;N;;;;;
       +1E4D1;NAG MUNDARI LETTER OP;Lo;0;L;;;;;N;;;;;
       +1E4D2;NAG MUNDARI LETTER OL;Lo;0;L;;;;;N;;;;;
       +1E4D3;NAG MUNDARI LETTER OY;Lo;0;L;;;;;N;;;;;
       +1E4D4;NAG MUNDARI LETTER ONG;Lo;0;L;;;;;N;;;;;
       +1E4D5;NAG MUNDARI LETTER A;Lo;0;L;;;;;N;;;;;
       +1E4D6;NAG MUNDARI LETTER AJ;Lo;0;L;;;;;N;;;;;
       +1E4D7;NAG MUNDARI LETTER AB;Lo;0;L;;;;;N;;;;;
       +1E4D8;NAG MUNDARI LETTER ANY;Lo;0;L;;;;;N;;;;;
       +1E4D9;NAG MUNDARI LETTER AH;Lo;0;L;;;;;N;;;;;
       +1E4DA;NAG MUNDARI LETTER I;Lo;0;L;;;;;N;;;;;
       +1E4DB;NAG MUNDARI LETTER IS;Lo;0;L;;;;;N;;;;;
       +1E4DC;NAG MUNDARI LETTER IDD;Lo;0;L;;;;;N;;;;;
       +1E4DD;NAG MUNDARI LETTER IT;Lo;0;L;;;;;N;;;;;
       +1E4DE;NAG MUNDARI LETTER IH;Lo;0;L;;;;;N;;;;;
       +1E4DF;NAG MUNDARI LETTER U;Lo;0;L;;;;;N;;;;;
       +1E4E0;NAG MUNDARI LETTER UC;Lo;0;L;;;;;N;;;;;
       +1E4E1;NAG MUNDARI LETTER UD;Lo;0;L;;;;;N;;;;;
       +1E4E2;NAG MUNDARI LETTER UK;Lo;0;L;;;;;N;;;;;
       +1E4E3;NAG MUNDARI LETTER UR;Lo;0;L;;;;;N;;;;;
       +1E4E4;NAG MUNDARI LETTER E;Lo;0;L;;;;;N;;;;;
       +1E4E5;NAG MUNDARI LETTER ENN;Lo;0;L;;;;;N;;;;;
       +1E4E6;NAG MUNDARI LETTER EG;Lo;0;L;;;;;N;;;;;
       +1E4E7;NAG MUNDARI LETTER EM;Lo;0;L;;;;;N;;;;;
       +1E4E8;NAG MUNDARI LETTER EN;Lo;0;L;;;;;N;;;;;
       +1E4E9;NAG MUNDARI LETTER ETT;Lo;0;L;;;;;N;;;;;
       +1E4EA;NAG MUNDARI LETTER ELL;Lo;0;L;;;;;N;;;;;
       +1E4EB;NAG MUNDARI SIGN OJOD;Lm;0;L;;;;;N;;;;;
       +1E4EC;NAG MUNDARI SIGN MUHOR;Mn;232;NSM;;;;;N;;;;;
       +1E4ED;NAG MUNDARI SIGN TOYOR;Mn;232;NSM;;;;;N;;;;;
       +1E4EE;NAG MUNDARI SIGN IKIR;Mn;220;NSM;;;;;N;;;;;
       +1E4EF;NAG MUNDARI SIGN SUTUH;Mn;230;NSM;;;;;N;;;;;
       +1E4F0;NAG MUNDARI DIGIT ZERO;Nd;0;L;;0;0;0;N;;;;;
       +1E4F1;NAG MUNDARI DIGIT ONE;Nd;0;L;;1;1;1;N;;;;;
       +1E4F2;NAG MUNDARI DIGIT TWO;Nd;0;L;;2;2;2;N;;;;;
       +1E4F3;NAG MUNDARI DIGIT THREE;Nd;0;L;;3;3;3;N;;;;;
       +1E4F4;NAG MUNDARI DIGIT FOUR;Nd;0;L;;4;4;4;N;;;;;
       +1E4F5;NAG MUNDARI DIGIT FIVE;Nd;0;L;;5;5;5;N;;;;;
       +1E4F6;NAG MUNDARI DIGIT SIX;Nd;0;L;;6;6;6;N;;;;;
       +1E4F7;NAG MUNDARI DIGIT SEVEN;Nd;0;L;;7;7;7;N;;;;;
       +1E4F8;NAG MUNDARI DIGIT EIGHT;Nd;0;L;;8;8;8;N;;;;;
       +1E4F9;NAG MUNDARI DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
        1E7E0;ETHIOPIC SYLLABLE HHYA;Lo;0;L;;;;;N;;;;;
        1E7E1;ETHIOPIC SYLLABLE HHYU;Lo;0;L;;;;;N;;;;;
        1E7E2;ETHIOPIC SYLLABLE HHYI;Lo;0;L;;;;;N;;;;;
       @@ -32678,6 +32945,7 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
        1F6D5;HINDU TEMPLE;So;0;ON;;;;;N;;;;;
        1F6D6;HUT;So;0;ON;;;;;N;;;;;
        1F6D7;ELEVATOR;So;0;ON;;;;;N;;;;;
       +1F6DC;WIRELESS;So;0;ON;;;;;N;;;;;
        1F6DD;PLAYGROUND SLIDE;So;0;ON;;;;;N;;;;;
        1F6DE;WHEEL;So;0;ON;;;;;N;;;;;
        1F6DF;RING BUOY;So;0;ON;;;;;N;;;;;
       @@ -32823,6 +33091,14 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
        1F771;ALCHEMICAL SYMBOL FOR MONTH;So;0;ON;;;;;N;;;;;
        1F772;ALCHEMICAL SYMBOL FOR HALF DRAM;So;0;ON;;;;;N;;;;;
        1F773;ALCHEMICAL SYMBOL FOR HALF OUNCE;So;0;ON;;;;;N;;;;;
       +1F774;LOT OF FORTUNE;So;0;ON;;;;;N;;;;;
       +1F775;OCCULTATION;So;0;ON;;;;;N;;;;;
       +1F776;LUNAR ECLIPSE;So;0;ON;;;;;N;;;;;
       +1F77B;HAUMEA;So;0;ON;;;;;N;;;;;
       +1F77C;MAKEMAKE;So;0;ON;;;;;N;;;;;
       +1F77D;GONGGONG;So;0;ON;;;;;N;;;;;
       +1F77E;QUAOAR;So;0;ON;;;;;N;;;;;
       +1F77F;ORCUS;So;0;ON;;;;;N;;;;;
        1F780;BLACK LEFT-POINTING ISOSCELES RIGHT TRIANGLE;So;0;ON;;;;;N;;;;;
        1F781;BLACK UP-POINTING ISOSCELES RIGHT TRIANGLE;So;0;ON;;;;;N;;;;;
        1F782;BLACK RIGHT-POINTING ISOSCELES RIGHT TRIANGLE;So;0;ON;;;;;N;;;;;
       @@ -32912,6 +33188,7 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
        1F7D6;NEGATIVE CIRCLED TRIANGLE;So;0;ON;;;;;N;;;;;
        1F7D7;CIRCLED SQUARE;So;0;ON;;;;;N;;;;;
        1F7D8;NEGATIVE CIRCLED SQUARE;So;0;ON;;;;;N;;;;;
       +1F7D9;NINE POINTED WHITE STAR;So;0;ON;;;;;N;;;;;
        1F7E0;LARGE ORANGE CIRCLE;So;0;ON;;;;;N;;;;;
        1F7E1;LARGE YELLOW CIRCLE;So;0;ON;;;;;N;;;;;
        1F7E2;LARGE GREEN CIRCLE;So;0;ON;;;;;N;;;;;
       @@ -33434,6 +33711,9 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
        1FA72;BRIEFS;So;0;ON;;;;;N;;;;;
        1FA73;SHORTS;So;0;ON;;;;;N;;;;;
        1FA74;THONG SANDAL;So;0;ON;;;;;N;;;;;
       +1FA75;LIGHT BLUE HEART;So;0;ON;;;;;N;;;;;
       +1FA76;GREY HEART;So;0;ON;;;;;N;;;;;
       +1FA77;PINK HEART;So;0;ON;;;;;N;;;;;
        1FA78;DROP OF BLOOD;So;0;ON;;;;;N;;;;;
        1FA79;ADHESIVE BANDAGE;So;0;ON;;;;;N;;;;;
        1FA7A;STETHOSCOPE;So;0;ON;;;;;N;;;;;
       @@ -33446,6 +33726,8 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
        1FA84;MAGIC WAND;So;0;ON;;;;;N;;;;;
        1FA85;PINATA;So;0;ON;;;;;N;;;;;
        1FA86;NESTING DOLLS;So;0;ON;;;;;N;;;;;
       +1FA87;MARACAS;So;0;ON;;;;;N;;;;;
       +1FA88;FLUTE;So;0;ON;;;;;N;;;;;
        1FA90;RINGED PLANET;So;0;ON;;;;;N;;;;;
        1FA91;CHAIR;So;0;ON;;;;;N;;;;;
        1FA92;RAZOR;So;0;ON;;;;;N;;;;;
       @@ -33475,6 +33757,9 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
        1FAAA;IDENTIFICATION CARD;So;0;ON;;;;;N;;;;;
        1FAAB;LOW BATTERY;So;0;ON;;;;;N;;;;;
        1FAAC;HAMSA;So;0;ON;;;;;N;;;;;
       +1FAAD;FOLDING HAND FAN;So;0;ON;;;;;N;;;;;
       +1FAAE;HAIR PICK;So;0;ON;;;;;N;;;;;
       +1FAAF;KHANDA;So;0;ON;;;;;N;;;;;
        1FAB0;FLY;So;0;ON;;;;;N;;;;;
        1FAB1;WORM;So;0;ON;;;;;N;;;;;
        1FAB2;BEETLE;So;0;ON;;;;;N;;;;;
       @@ -33486,12 +33771,18 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
        1FAB8;CORAL;So;0;ON;;;;;N;;;;;
        1FAB9;EMPTY NEST;So;0;ON;;;;;N;;;;;
        1FABA;NEST WITH EGGS;So;0;ON;;;;;N;;;;;
       +1FABB;HYACINTH;So;0;ON;;;;;N;;;;;
       +1FABC;JELLYFISH;So;0;ON;;;;;N;;;;;
       +1FABD;WING;So;0;ON;;;;;N;;;;;
       +1FABF;GOOSE;So;0;ON;;;;;N;;;;;
        1FAC0;ANATOMICAL HEART;So;0;ON;;;;;N;;;;;
        1FAC1;LUNGS;So;0;ON;;;;;N;;;;;
        1FAC2;PEOPLE HUGGING;So;0;ON;;;;;N;;;;;
        1FAC3;PREGNANT MAN;So;0;ON;;;;;N;;;;;
        1FAC4;PREGNANT PERSON;So;0;ON;;;;;N;;;;;
        1FAC5;PERSON WITH CROWN;So;0;ON;;;;;N;;;;;
       +1FACE;MOOSE;So;0;ON;;;;;N;;;;;
       +1FACF;DONKEY;So;0;ON;;;;;N;;;;;
        1FAD0;BLUEBERRIES;So;0;ON;;;;;N;;;;;
        1FAD1;BELL PEPPER;So;0;ON;;;;;N;;;;;
        1FAD2;OLIVE;So;0;ON;;;;;N;;;;;
       @@ -33502,6 +33793,8 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
        1FAD7;POURING LIQUID;So;0;ON;;;;;N;;;;;
        1FAD8;BEANS;So;0;ON;;;;;N;;;;;
        1FAD9;JAR;So;0;ON;;;;;N;;;;;
       +1FADA;GINGER ROOT;So;0;ON;;;;;N;;;;;
       +1FADB;PEA POD;So;0;ON;;;;;N;;;;;
        1FAE0;MELTING FACE;So;0;ON;;;;;N;;;;;
        1FAE1;SALUTING FACE;So;0;ON;;;;;N;;;;;
        1FAE2;FACE WITH OPEN EYES AND HAND OVER MOUTH;So;0;ON;;;;;N;;;;;
       @@ -33510,6 +33803,7 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
        1FAE5;DOTTED LINE FACE;So;0;ON;;;;;N;;;;;
        1FAE6;BITING LIP;So;0;ON;;;;;N;;;;;
        1FAE7;BUBBLES;So;0;ON;;;;;N;;;;;
       +1FAE8;SHAKING FACE;So;0;ON;;;;;N;;;;;
        1FAF0;HAND WITH INDEX FINGER AND THUMB CROSSED;So;0;ON;;;;;N;;;;;
        1FAF1;RIGHTWARDS HAND;So;0;ON;;;;;N;;;;;
        1FAF2;LEFTWARDS HAND;So;0;ON;;;;;N;;;;;
       @@ -33517,6 +33811,8 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
        1FAF4;PALM UP HAND;So;0;ON;;;;;N;;;;;
        1FAF5;INDEX POINTING AT THE VIEWER;So;0;ON;;;;;N;;;;;
        1FAF6;HEART HANDS;So;0;ON;;;;;N;;;;;
       +1FAF7;LEFTWARDS PUSHING HAND;So;0;ON;;;;;N;;;;;
       +1FAF8;RIGHTWARDS PUSHING HAND;So;0;ON;;;;;N;;;;;
        1FB00;BLOCK SEXTANT-1;So;0;ON;;;;;N;;;;;
        1FB01;BLOCK SEXTANT-2;So;0;ON;;;;;N;;;;;
        1FB02;BLOCK SEXTANT-12;So;0;ON;;;;;N;;;;;
       @@ -33732,7 +34028,7 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
        20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
        2A6DF;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
        2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
       -2B738;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
       +2B739;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
        2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
        2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;
        2B820;<CJK Ideograph Extension E, First>;Lo;0;L;;;;;N;;;;;
       @@ -34283,6 +34579,8 @@ FFFD;REPLACEMENT CHARACTER;So;0;ON;;;;;N;;;;;
        2FA1D;CJK COMPATIBILITY IDEOGRAPH-2FA1D;Lo;0;L;2A600;;;;N;;;;;
        30000;<CJK Ideograph Extension G, First>;Lo;0;L;;;;;N;;;;;
        3134A;<CJK Ideograph Extension G, Last>;Lo;0;L;;;;;N;;;;;
       +31350;<CJK Ideograph Extension H, First>;Lo;0;L;;;;;N;;;;;
       +323AF;<CJK Ideograph Extension H, Last>;Lo;0;L;;;;;N;;;;;
        E0001;LANGUAGE TAG;Cf;0;BN;;;;;N;;;;;
        E0020;TAG SPACE;Cf;0;BN;;;;;N;;;;;
        E0021;TAG EXCLAMATION MARK;Cf;0;BN;;;;;N;;;;;
 (DIR) diff --git a/data/WordBreakProperty.txt b/data/WordBreakProperty.txt
       @@ -1,11 +1,11 @@
       -# WordBreakProperty-14.0.0.txt
       -# Date: 2021-07-10, 00:35:32 GMT
       -# © 2021 Unicode®, Inc.
       +# WordBreakProperty-15.0.0.txt
       +# Date: 2022-04-27, 02:41:26 GMT
       +# © 2022 Unicode®, Inc.
        # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
       -# For terms of use, see http://www.unicode.org/terms_of_use.html
       +# For terms of use, see https://www.unicode.org/terms_of_use.html
        #
        # Unicode Character Database
       -#   For documentation, see http://www.unicode.org/reports/tr44/
       +#   For documentation, see https://www.unicode.org/reports/tr44/
        
        # ================================================
        
       @@ -180,6 +180,7 @@ FB46..FB4F    ; Hebrew_Letter # Lo  [10] HEBREW LETTER TSADI WITH DAGESH..HEBREW
        0CCC..0CCD    ; Extend # Mn   [2] KANNADA VOWEL SIGN AU..KANNADA SIGN VIRAMA
        0CD5..0CD6    ; Extend # Mc   [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK
        0CE2..0CE3    ; Extend # Mn   [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
       +0CF3          ; Extend # Mc       KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
        0D00..0D01    ; Extend # Mn   [2] MALAYALAM SIGN COMBINING ANUSVARA ABOVE..MALAYALAM SIGN CANDRABINDU
        0D02..0D03    ; Extend # Mc   [2] MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISARGA
        0D3B..0D3C    ; Extend # Mn   [2] MALAYALAM SIGN VERTICAL BAR VIRAMA..MALAYALAM SIGN CIRCULAR VIRAMA
       @@ -203,7 +204,7 @@ FB46..FB4F    ; Hebrew_Letter # Lo  [10] HEBREW LETTER TSADI WITH DAGESH..HEBREW
        0E47..0E4E    ; Extend # Mn   [8] THAI CHARACTER MAITAIKHU..THAI CHARACTER YAMAKKAN
        0EB1          ; Extend # Mn       LAO VOWEL SIGN MAI KAN
        0EB4..0EBC    ; Extend # Mn   [9] LAO VOWEL SIGN I..LAO SEMIVOWEL SIGN LO
       -0EC8..0ECD    ; Extend # Mn   [6] LAO TONE MAI EK..LAO NIGGAHITA
       +0EC8..0ECE    ; Extend # Mn   [7] LAO TONE MAI EK..LAO YAMAKKAN
        0F18..0F19    ; Extend # Mn   [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS
        0F35          ; Extend # Mn       TIBETAN MARK NGAS BZUNG NYI ZLA
        0F37          ; Extend # Mn       TIBETAN MARK NGAS BZUNG SGOR RTAGS
       @@ -407,6 +408,7 @@ FF9E..FF9F    ; Extend # Lm   [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
        10AE5..10AE6  ; Extend # Mn   [2] MANICHAEAN ABBREVIATION MARK ABOVE..MANICHAEAN ABBREVIATION MARK BELOW
        10D24..10D27  ; Extend # Mn   [4] HANIFI ROHINGYA SIGN HARBAHAY..HANIFI ROHINGYA SIGN TASSI
        10EAB..10EAC  ; Extend # Mn   [2] YEZIDI COMBINING HAMZA MARK..YEZIDI COMBINING MADDA MARK
       +10EFD..10EFF  ; Extend # Mn   [3] ARABIC SMALL LOW WORD SAKTA..ARABIC SMALL LOW WORD MADDA
        10F46..10F50  ; Extend # Mn  [11] SOGDIAN COMBINING DOT BELOW..SOGDIAN COMBINING STROKE BELOW
        10F82..10F85  ; Extend # Mn   [4] OLD UYGHUR COMBINING DOT ABOVE..OLD UYGHUR COMBINING TWO DOTS BELOW
        11000         ; Extend # Mc       BRAHMI SIGN CANDRABINDU
       @@ -443,6 +445,7 @@ FF9E..FF9F    ; Extend # Lm   [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
        11235         ; Extend # Mc       KHOJKI SIGN VIRAMA
        11236..11237  ; Extend # Mn   [2] KHOJKI SIGN NUKTA..KHOJKI SIGN SHADDA
        1123E         ; Extend # Mn       KHOJKI SIGN SUKUN
       +11241         ; Extend # Mn       KHOJKI VOWEL SIGN VOCALIC R
        112DF         ; Extend # Mn       KHUDAWADI SIGN ANUSVARA
        112E0..112E2  ; Extend # Mc   [3] KHUDAWADI VOWEL SIGN AA..KHUDAWADI VOWEL SIGN II
        112E3..112EA  ; Extend # Mn   [8] KHUDAWADI VOWEL SIGN U..KHUDAWADI SIGN VIRAMA
       @@ -552,6 +555,16 @@ FF9E..FF9F    ; Extend # Lm   [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
        11D97         ; Extend # Mn       GUNJALA GONDI VIRAMA
        11EF3..11EF4  ; Extend # Mn   [2] MAKASAR VOWEL SIGN I..MAKASAR VOWEL SIGN U
        11EF5..11EF6  ; Extend # Mc   [2] MAKASAR VOWEL SIGN E..MAKASAR VOWEL SIGN O
       +11F00..11F01  ; Extend # Mn   [2] KAWI SIGN CANDRABINDU..KAWI SIGN ANUSVARA
       +11F03         ; Extend # Mc       KAWI SIGN VISARGA
       +11F34..11F35  ; Extend # Mc   [2] KAWI VOWEL SIGN AA..KAWI VOWEL SIGN ALTERNATE AA
       +11F36..11F3A  ; Extend # Mn   [5] KAWI VOWEL SIGN I..KAWI VOWEL SIGN VOCALIC R
       +11F3E..11F3F  ; Extend # Mc   [2] KAWI VOWEL SIGN E..KAWI VOWEL SIGN AI
       +11F40         ; Extend # Mn       KAWI VOWEL SIGN EU
       +11F41         ; Extend # Mc       KAWI SIGN KILLER
       +11F42         ; Extend # Mn       KAWI CONJOINER
       +13440         ; Extend # Mn       EGYPTIAN HIEROGLYPH MIRROR HORIZONTALLY
       +13447..13455  ; Extend # Mn  [15] EGYPTIAN HIEROGLYPH MODIFIER DAMAGED AT TOP START..EGYPTIAN HIEROGLYPH MODIFIER DAMAGED
        16AF0..16AF4  ; Extend # Mn   [5] BASSA VAH COMBINING HIGH TONE..BASSA VAH COMBINING HIGH-LOW TONE
        16B30..16B36  ; Extend # Mn   [7] PAHAWH HMONG MARK CIM TUB..PAHAWH HMONG MARK CIM TAUM
        16F4F         ; Extend # Mn       MIAO SIGN CONSONANT MODIFIER BAR
       @@ -580,16 +593,18 @@ FF9E..FF9F    ; Extend # Lm   [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDT
        1E01B..1E021  ; Extend # Mn   [7] COMBINING GLAGOLITIC LETTER SHTA..COMBINING GLAGOLITIC LETTER YATI
        1E023..1E024  ; Extend # Mn   [2] COMBINING GLAGOLITIC LETTER YU..COMBINING GLAGOLITIC LETTER SMALL YUS
        1E026..1E02A  ; Extend # Mn   [5] COMBINING GLAGOLITIC LETTER YO..COMBINING GLAGOLITIC LETTER FITA
       +1E08F         ; Extend # Mn       COMBINING CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
        1E130..1E136  ; Extend # Mn   [7] NYIAKENG PUACHUE HMONG TONE-B..NYIAKENG PUACHUE HMONG TONE-D
        1E2AE         ; Extend # Mn       TOTO SIGN RISING TONE
        1E2EC..1E2EF  ; Extend # Mn   [4] WANCHO TONE TUP..WANCHO TONE KOINI
       +1E4EC..1E4EF  ; Extend # Mn   [4] NAG MUNDARI SIGN MUHOR..NAG MUNDARI SIGN SUTUH
        1E8D0..1E8D6  ; Extend # Mn   [7] MENDE KIKAKUI COMBINING NUMBER TEENS..MENDE KIKAKUI COMBINING NUMBER MILLIONS
        1E944..1E94A  ; Extend # Mn   [7] ADLAM ALIF LENGTHENER..ADLAM NUKTA
        1F3FB..1F3FF  ; Extend # Sk   [5] EMOJI MODIFIER FITZPATRICK TYPE-1-2..EMOJI MODIFIER FITZPATRICK TYPE-6
        E0020..E007F  ; Extend # Cf  [96] TAG SPACE..CANCEL TAG
        E0100..E01EF  ; Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
        
       -# Total code points: 2512
       +# Total code points: 2554
        
        # ================================================
        
       @@ -615,12 +630,12 @@ FEFF          ; Format # Cf       ZERO WIDTH NO-BREAK SPACE
        FFF9..FFFB    ; Format # Cf   [3] INTERLINEAR ANNOTATION ANCHOR..INTERLINEAR ANNOTATION TERMINATOR
        110BD         ; Format # Cf       KAITHI NUMBER SIGN
        110CD         ; Format # Cf       KAITHI NUMBER SIGN ABOVE
       -13430..13438  ; Format # Cf   [9] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END SEGMENT
       +13430..1343F  ; Format # Cf  [16] EGYPTIAN HIEROGLYPH VERTICAL JOINER..EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE
        1BCA0..1BCA3  ; Format # Cf   [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP
        1D173..1D17A  ; Format # Cf   [8] MUSICAL SYMBOL BEGIN BEAM..MUSICAL SYMBOL END PHRASE
        E0001         ; Format # Cf       LANGUAGE TAG
        
       -# Total code points: 64
       +# Total code points: 71
        
        # ================================================
        
       @@ -641,9 +656,10 @@ FF71..FF9D    ; Katakana # Lo  [45] HALFWIDTH KATAKANA LETTER A..HALFWIDTH KATAK
        1AFFD..1AFFE  ; Katakana # Lm   [2] KATAKANA LETTER MINNAN NASALIZED TONE-7..KATAKANA LETTER MINNAN NASALIZED TONE-8
        1B000         ; Katakana # Lo       KATAKANA LETTER ARCHAIC E
        1B120..1B122  ; Katakana # Lo   [3] KATAKANA LETTER ARCHAIC YI..KATAKANA LETTER ARCHAIC WU
       +1B155         ; Katakana # Lo       KATAKANA LETTER SMALL KO
        1B164..1B167  ; Katakana # Lo   [4] KATAKANA LETTER SMALL WI..KATAKANA LETTER SMALL N
        
       -# Total code points: 330
       +# Total code points: 331
        
        # ================================================
        
       @@ -1127,6 +1143,7 @@ FFDA..FFDC    ; ALetter # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL 
        111DC         ; ALetter # Lo       SHARADA HEADSTROKE
        11200..11211  ; ALetter # Lo  [18] KHOJKI LETTER A..KHOJKI LETTER JJA
        11213..1122B  ; ALetter # Lo  [25] KHOJKI LETTER NYA..KHOJKI LETTER LLA
       +1123F..11240  ; ALetter # Lo   [2] KHOJKI LETTER QA..KHOJKI LETTER SHORT I
        11280..11286  ; ALetter # Lo   [7] MULTANI LETTER A..MULTANI LETTER GA
        11288         ; ALetter # Lo       MULTANI LETTER GHA
        1128A..1128D  ; ALetter # Lo   [4] MULTANI LETTER CA..MULTANI LETTER JJA
       @@ -1187,12 +1204,16 @@ FFDA..FFDC    ; ALetter # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL 
        11D6A..11D89  ; ALetter # Lo  [32] GUNJALA GONDI LETTER OO..GUNJALA GONDI LETTER SA
        11D98         ; ALetter # Lo       GUNJALA GONDI OM
        11EE0..11EF2  ; ALetter # Lo  [19] MAKASAR LETTER KA..MAKASAR ANGKA
       +11F02         ; ALetter # Lo       KAWI SIGN REPHA
       +11F04..11F10  ; ALetter # Lo  [13] KAWI LETTER A..KAWI LETTER O
       +11F12..11F33  ; ALetter # Lo  [34] KAWI LETTER KA..KAWI LETTER JNYA
        11FB0         ; ALetter # Lo       LISU LETTER YHA
        12000..12399  ; ALetter # Lo [922] CUNEIFORM SIGN A..CUNEIFORM SIGN U U
        12400..1246E  ; ALetter # Nl [111] CUNEIFORM NUMERIC SIGN TWO ASH..CUNEIFORM NUMERIC SIGN NINE U VARIANT FORM
        12480..12543  ; ALetter # Lo [196] CUNEIFORM SIGN AB TIMES NUN TENU..CUNEIFORM SIGN ZU5 TIMES THREE DISH TENU
        12F90..12FF0  ; ALetter # Lo  [97] CYPRO-MINOAN SIGN CM001..CYPRO-MINOAN SIGN CM114
       -13000..1342E  ; ALetter # Lo [1071] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH AA032
       +13000..1342F  ; ALetter # Lo [1072] EGYPTIAN HIEROGLYPH A001..EGYPTIAN HIEROGLYPH V011D
       +13441..13446  ; ALetter # Lo   [6] EGYPTIAN HIEROGLYPH FULL BLANK..EGYPTIAN HIEROGLYPH WIDE LOST SIGN
        14400..14646  ; ALetter # Lo [583] ANATOLIAN HIEROGLYPH A001..ANATOLIAN HIEROGLYPH A530
        16800..16A38  ; ALetter # Lo [569] BAMUM LETTER PHASE-A NGKUE MFON..BAMUM LETTER PHASE-F VUEQ
        16A40..16A5E  ; ALetter # Lo  [31] MRO LETTER TA..MRO LETTER TEK
       @@ -1245,11 +1266,15 @@ FFDA..FFDC    ; ALetter # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL 
        1DF00..1DF09  ; ALetter # L&  [10] LATIN SMALL LETTER FENG DIGRAPH WITH TRILL..LATIN SMALL LETTER T WITH HOOK AND RETROFLEX HOOK
        1DF0A         ; ALetter # Lo       LATIN LETTER RETROFLEX CLICK WITH RETROFLEX HOOK
        1DF0B..1DF1E  ; ALetter # L&  [20] LATIN SMALL LETTER ESH WITH DOUBLE BAR..LATIN SMALL LETTER S WITH CURL
       +1DF25..1DF2A  ; ALetter # L&   [6] LATIN SMALL LETTER D WITH MID-HEIGHT LEFT HOOK..LATIN SMALL LETTER T WITH MID-HEIGHT LEFT HOOK
       +1E030..1E06D  ; ALetter # Lm  [62] MODIFIER LETTER CYRILLIC SMALL A..MODIFIER LETTER CYRILLIC SMALL STRAIGHT U WITH STROKE
        1E100..1E12C  ; ALetter # Lo  [45] NYIAKENG PUACHUE HMONG LETTER MA..NYIAKENG PUACHUE HMONG LETTER W
        1E137..1E13D  ; ALetter # Lm   [7] NYIAKENG PUACHUE HMONG SIGN FOR PERSON..NYIAKENG PUACHUE HMONG SYLLABLE LENGTHENER
        1E14E         ; ALetter # Lo       NYIAKENG PUACHUE HMONG LOGOGRAM NYAJ
        1E290..1E2AD  ; ALetter # Lo  [30] TOTO LETTER PA..TOTO LETTER A
        1E2C0..1E2EB  ; ALetter # Lo  [44] WANCHO LETTER AA..WANCHO LETTER YIH
       +1E4D0..1E4EA  ; ALetter # Lo  [27] NAG MUNDARI LETTER O..NAG MUNDARI LETTER ELL
       +1E4EB         ; ALetter # Lm       NAG MUNDARI SIGN OJOD
        1E7E0..1E7E6  ; ALetter # Lo   [7] ETHIOPIC SYLLABLE HHYA..ETHIOPIC SYLLABLE HHYO
        1E7E8..1E7EB  ; ALetter # Lo   [4] ETHIOPIC SYLLABLE GURAGE HHWA..ETHIOPIC SYLLABLE HHWE
        1E7ED..1E7EE  ; ALetter # Lo   [2] ETHIOPIC SYLLABLE GURAGE MWI..ETHIOPIC SYLLABLE GURAGE MWEE
       @@ -1294,7 +1319,7 @@ FFDA..FFDC    ; ALetter # Lo   [3] HALFWIDTH HANGUL LETTER EU..HALFWIDTH HANGUL 
        1F150..1F169  ; ALetter # So  [26] NEGATIVE CIRCLED LATIN CAPITAL LETTER A..NEGATIVE CIRCLED LATIN CAPITAL LETTER Z
        1F170..1F189  ; ALetter # So  [26] NEGATIVE SQUARED LATIN CAPITAL LETTER A..NEGATIVE SQUARED LATIN CAPITAL LETTER Z
        
       -# Total code points: 29336
       +# Total code points: 29489
        
        # ================================================
        
       @@ -1398,16 +1423,18 @@ FF10..FF19    ; Numeric # Nd  [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE
        11C50..11C59  ; Numeric # Nd  [10] BHAIKSUKI DIGIT ZERO..BHAIKSUKI DIGIT NINE
        11D50..11D59  ; Numeric # Nd  [10] MASARAM GONDI DIGIT ZERO..MASARAM GONDI DIGIT NINE
        11DA0..11DA9  ; Numeric # Nd  [10] GUNJALA GONDI DIGIT ZERO..GUNJALA GONDI DIGIT NINE
       +11F50..11F59  ; Numeric # Nd  [10] KAWI DIGIT ZERO..KAWI DIGIT NINE
        16A60..16A69  ; Numeric # Nd  [10] MRO DIGIT ZERO..MRO DIGIT NINE
        16AC0..16AC9  ; Numeric # Nd  [10] TANGSA DIGIT ZERO..TANGSA DIGIT NINE
        16B50..16B59  ; Numeric # Nd  [10] PAHAWH HMONG DIGIT ZERO..PAHAWH HMONG DIGIT NINE
        1D7CE..1D7FF  ; Numeric # Nd  [50] MATHEMATICAL BOLD DIGIT ZERO..MATHEMATICAL MONOSPACE DIGIT NINE
        1E140..1E149  ; Numeric # Nd  [10] NYIAKENG PUACHUE HMONG DIGIT ZERO..NYIAKENG PUACHUE HMONG DIGIT NINE
        1E2F0..1E2F9  ; Numeric # Nd  [10] WANCHO DIGIT ZERO..WANCHO DIGIT NINE
       +1E4F0..1E4F9  ; Numeric # Nd  [10] NAG MUNDARI DIGIT ZERO..NAG MUNDARI DIGIT NINE
        1E950..1E959  ; Numeric # Nd  [10] ADLAM DIGIT ZERO..ADLAM DIGIT NINE
        1FBF0..1FBF9  ; Numeric # Nd  [10] SEGMENTED DIGIT ZERO..SEGMENTED DIGIT NINE
        
       -# Total code points: 661
       +# Total code points: 681
        
        # ================================================
        
 (DIR) diff --git a/data/WordBreakTest.txt b/data/WordBreakTest.txt
       @@ -1,11 +1,11 @@
       -# WordBreakTest-14.0.0.txt
       -# Date: 2021-03-08, 06:22:40 GMT
       -# © 2021 Unicode®, Inc.
       +# WordBreakTest-15.0.0.txt
       +# Date: 2022-02-26, 00:39:00 GMT
       +# © 2022 Unicode®, Inc.
        # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
       -# For terms of use, see http://www.unicode.org/terms_of_use.html
       +# For terms of use, see https://www.unicode.org/terms_of_use.html
        #
        # Unicode Character Database
       -#   For documentation, see http://www.unicode.org/reports/tr44/
       +#   For documentation, see https://www.unicode.org/reports/tr44/
        #
        # Default Word_Break Test
        #
 (DIR) diff --git a/data/emoji-data.txt b/data/emoji-data.txt
       @@ -1,13 +1,13 @@
       -# emoji-data-14.0.0.txt
       -# Date: 2021-08-26, 17:22:22 GMT
       -# © 2021 Unicode®, Inc.
       +# emoji-data.txt
       +# Date: 2022-08-02, 00:26:10 GMT
       +# © 2022 Unicode®, Inc.
        # Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
       -# For terms of use, see http://www.unicode.org/terms_of_use.html
       +# For terms of use, see https://www.unicode.org/terms_of_use.html
        #
        # Emoji Data for UTS #51
       -# Used with Emoji Version 14.0 and subsequent minor revisions (if any)
       +# Used with Emoji Version 15.0 and subsequent minor revisions (if any)
        #
       -# For documentation and usage, see http://www.unicode.org/reports/tr51
       +# For documentation and usage, see https://www.unicode.org/reports/tr51
        #
        # Format: 
        # <codepoint(s)> ; <property> # <comments> 
       @@ -19,8 +19,7 @@
        
        # ================================================
        
       -# All omitted code points have Emoji=No 
       -# @missing: 0000..10FFFF  ; Emoji ; No
       +# All omitted code points have Emoji=No
        
        0023          ; Emoji                # E0.0   [1] (#️)       hash sign
        002A          ; Emoji                # E0.0   [1] (*️)       asterisk
       @@ -341,6 +340,7 @@
        1F6D1..1F6D2  ; Emoji                # E3.0   [2] (🛑..🛒)    stop sign..shopping cart
        1F6D5         ; Emoji                # E12.0  [1] (🛕)       hindu temple
        1F6D6..1F6D7  ; Emoji                # E13.0  [2] (🛖..🛗)    hut..elevator
       +1F6DC         ; Emoji                # E15.0  [1] (🛜)       wireless
        1F6DD..1F6DF  ; Emoji                # E14.0  [3] (🛝..🛟)    playground slide..ring buoy
        1F6E0..1F6E5  ; Emoji                # E0.7   [6] (🛠️..🛥️)    hammer and wrench..motor boat
        1F6E9         ; Emoji                # E0.7   [1] (🛩️)       small airplane
       @@ -401,28 +401,36 @@
        1F9E7..1F9FF  ; Emoji                # E11.0 [25] (🧧..🧿)    red envelope..nazar amulet
        1FA70..1FA73  ; Emoji                # E12.0  [4] (🩰..🩳)    ballet shoes..shorts
        1FA74         ; Emoji                # E13.0  [1] (🩴)       thong sandal
       +1FA75..1FA77  ; Emoji                # E15.0  [3] (🩵..🩷)    light blue heart..pink heart
        1FA78..1FA7A  ; Emoji                # E12.0  [3] (🩸..🩺)    drop of blood..stethoscope
        1FA7B..1FA7C  ; Emoji                # E14.0  [2] (🩻..🩼)    x-ray..crutch
        1FA80..1FA82  ; Emoji                # E12.0  [3] (🪀..🪂)    yo-yo..parachute
        1FA83..1FA86  ; Emoji                # E13.0  [4] (🪃..🪆)    boomerang..nesting dolls
       +1FA87..1FA88  ; Emoji                # E15.0  [2] (🪇..🪈)    maracas..flute
        1FA90..1FA95  ; Emoji                # E12.0  [6] (🪐..🪕)    ringed planet..banjo
        1FA96..1FAA8  ; Emoji                # E13.0 [19] (🪖..🪨)    military helmet..rock
        1FAA9..1FAAC  ; Emoji                # E14.0  [4] (🪩..🪬)    mirror ball..hamsa
       +1FAAD..1FAAF  ; Emoji                # E15.0  [3] (🪭..🪯)    folding hand fan..khanda
        1FAB0..1FAB6  ; Emoji                # E13.0  [7] (🪰..🪶)    fly..feather
        1FAB7..1FABA  ; Emoji                # E14.0  [4] (🪷..🪺)    lotus..nest with eggs
       +1FABB..1FABD  ; Emoji                # E15.0  [3] (🪻..🪽)    hyacinth..wing
       +1FABF         ; Emoji                # E15.0  [1] (🪿)       goose
        1FAC0..1FAC2  ; Emoji                # E13.0  [3] (🫀..🫂)    anatomical heart..people hugging
        1FAC3..1FAC5  ; Emoji                # E14.0  [3] (🫃..🫅)    pregnant man..person with crown
       +1FACE..1FACF  ; Emoji                # E15.0  [2] (🫎..🫏)    moose..donkey
        1FAD0..1FAD6  ; Emoji                # E13.0  [7] (🫐..🫖)    blueberries..teapot
        1FAD7..1FAD9  ; Emoji                # E14.0  [3] (🫗..🫙)    pouring liquid..jar
       +1FADA..1FADB  ; Emoji                # E15.0  [2] (🫚..🫛)    ginger root..pea pod
        1FAE0..1FAE7  ; Emoji                # E14.0  [8] (🫠..🫧)    melting face..bubbles
       +1FAE8         ; Emoji                # E15.0  [1] (🫨)       shaking face
        1FAF0..1FAF6  ; Emoji                # E14.0  [7] (🫰..🫶)    hand with index finger and thumb crossed..heart hands
       +1FAF7..1FAF8  ; Emoji                # E15.0  [2] (🫷..🫸)    leftwards pushing hand..rightwards pushing hand
        
       -# Total elements: 1404
       +# Total elements: 1424
        
        # ================================================
        
       -# All omitted code points have Emoji_Presentation=No 
       -# @missing: 0000..10FFFF  ; Emoji_Presentation ; No
       +# All omitted code points have Emoji_Presentation=No
        
        231A..231B    ; Emoji_Presentation   # E0.6   [2] (⌚..⌛)    watch..hourglass done
        23E9..23EC    ; Emoji_Presentation   # E0.6   [4] (⏩..⏬)    fast-forward button..fast down button
       @@ -625,6 +633,7 @@
        1F6D1..1F6D2  ; Emoji_Presentation   # E3.0   [2] (🛑..🛒)    stop sign..shopping cart
        1F6D5         ; Emoji_Presentation   # E12.0  [1] (🛕)       hindu temple
        1F6D6..1F6D7  ; Emoji_Presentation   # E13.0  [2] (🛖..🛗)    hut..elevator
       +1F6DC         ; Emoji_Presentation   # E15.0  [1] (🛜)       wireless
        1F6DD..1F6DF  ; Emoji_Presentation   # E14.0  [3] (🛝..🛟)    playground slide..ring buoy
        1F6EB..1F6EC  ; Emoji_Presentation   # E1.0   [2] (🛫..🛬)    airplane departure..airplane arrival
        1F6F4..1F6F6  ; Emoji_Presentation   # E3.0   [3] (🛴..🛶)    kick scooter..canoe
       @@ -681,28 +690,36 @@
        1F9E7..1F9FF  ; Emoji_Presentation   # E11.0 [25] (🧧..🧿)    red envelope..nazar amulet
        1FA70..1FA73  ; Emoji_Presentation   # E12.0  [4] (🩰..🩳)    ballet shoes..shorts
        1FA74         ; Emoji_Presentation   # E13.0  [1] (🩴)       thong sandal
       +1FA75..1FA77  ; Emoji_Presentation   # E15.0  [3] (🩵..🩷)    light blue heart..pink heart
        1FA78..1FA7A  ; Emoji_Presentation   # E12.0  [3] (🩸..🩺)    drop of blood..stethoscope
        1FA7B..1FA7C  ; Emoji_Presentation   # E14.0  [2] (🩻..🩼)    x-ray..crutch
        1FA80..1FA82  ; Emoji_Presentation   # E12.0  [3] (🪀..🪂)    yo-yo..parachute
        1FA83..1FA86  ; Emoji_Presentation   # E13.0  [4] (🪃..🪆)    boomerang..nesting dolls
       +1FA87..1FA88  ; Emoji_Presentation   # E15.0  [2] (🪇..🪈)    maracas..flute
        1FA90..1FA95  ; Emoji_Presentation   # E12.0  [6] (🪐..🪕)    ringed planet..banjo
        1FA96..1FAA8  ; Emoji_Presentation   # E13.0 [19] (🪖..🪨)    military helmet..rock
        1FAA9..1FAAC  ; Emoji_Presentation   # E14.0  [4] (🪩..🪬)    mirror ball..hamsa
       +1FAAD..1FAAF  ; Emoji_Presentation   # E15.0  [3] (🪭..🪯)    folding hand fan..khanda
        1FAB0..1FAB6  ; Emoji_Presentation   # E13.0  [7] (🪰..🪶)    fly..feather
        1FAB7..1FABA  ; Emoji_Presentation   # E14.0  [4] (🪷..🪺)    lotus..nest with eggs
       +1FABB..1FABD  ; Emoji_Presentation   # E15.0  [3] (🪻..🪽)    hyacinth..wing
       +1FABF         ; Emoji_Presentation   # E15.0  [1] (🪿)       goose
        1FAC0..1FAC2  ; Emoji_Presentation   # E13.0  [3] (🫀..🫂)    anatomical heart..people hugging
        1FAC3..1FAC5  ; Emoji_Presentation   # E14.0  [3] (🫃..🫅)    pregnant man..person with crown
       +1FACE..1FACF  ; Emoji_Presentation   # E15.0  [2] (🫎..🫏)    moose..donkey
        1FAD0..1FAD6  ; Emoji_Presentation   # E13.0  [7] (🫐..🫖)    blueberries..teapot
        1FAD7..1FAD9  ; Emoji_Presentation   # E14.0  [3] (🫗..🫙)    pouring liquid..jar
       +1FADA..1FADB  ; Emoji_Presentation   # E15.0  [2] (🫚..🫛)    ginger root..pea pod
        1FAE0..1FAE7  ; Emoji_Presentation   # E14.0  [8] (🫠..🫧)    melting face..bubbles
       +1FAE8         ; Emoji_Presentation   # E15.0  [1] (🫨)       shaking face
        1FAF0..1FAF6  ; Emoji_Presentation   # E14.0  [7] (🫰..🫶)    hand with index finger and thumb crossed..heart hands
       +1FAF7..1FAF8  ; Emoji_Presentation   # E15.0  [2] (🫷..🫸)    leftwards pushing hand..rightwards pushing hand
        
       -# Total elements: 1185
       +# Total elements: 1205
        
        # ================================================
        
       -# All omitted code points have Emoji_Modifier=No 
       -# @missing: 0000..10FFFF  ; Emoji_Modifier ; No
       +# All omitted code points have Emoji_Modifier=No
        
        1F3FB..1F3FF  ; Emoji_Modifier       # E1.0   [5] (🏻..🏿)    light skin tone..dark skin tone
        
       @@ -710,8 +727,7 @@
        
        # ================================================
        
       -# All omitted code points have Emoji_Modifier_Base=No 
       -# @missing: 0000..10FFFF  ; Emoji_Modifier_Base ; No
       +# All omitted code points have Emoji_Modifier_Base=No
        
        261D          ; Emoji_Modifier_Base  # E0.6   [1] (☝️)       index pointing up
        26F9          ; Emoji_Modifier_Base  # E0.7   [1] (⛹️)       person bouncing ball
       @@ -762,13 +778,13 @@
        1F9D1..1F9DD  ; Emoji_Modifier_Base  # E5.0  [13] (🧑..🧝)    person..elf
        1FAC3..1FAC5  ; Emoji_Modifier_Base  # E14.0  [3] (🫃..🫅)    pregnant man..person with crown
        1FAF0..1FAF6  ; Emoji_Modifier_Base  # E14.0  [7] (🫰..🫶)    hand with index finger and thumb crossed..heart hands
       +1FAF7..1FAF8  ; Emoji_Modifier_Base  # E15.0  [2] (🫷..🫸)    leftwards pushing hand..rightwards pushing hand
        
       -# Total elements: 132
       +# Total elements: 134
        
        # ================================================
        
       -# All omitted code points have Emoji_Component=No 
       -# @missing: 0000..10FFFF  ; Emoji_Component ; No
       +# All omitted code points have Emoji_Component=No
        
        0023          ; Emoji_Component      # E0.0   [1] (#️)       hash sign
        002A          ; Emoji_Component      # E0.0   [1] (*️)       asterisk
       @@ -785,8 +801,7 @@ E0020..E007F  ; Emoji_Component      # E0.0  [96] (󠀠..󠁿)      tag space..c
        
        # ================================================
        
       -# All omitted code points have Extended_Pictographic=No 
       -# @missing: 0000..10FFFF  ; Extended_Pictographic ; No
       +# All omitted code points have Extended_Pictographic=No
        
        00A9          ; Extended_Pictographic# E0.6   [1] (©️)       copyright
        00AE          ; Extended_Pictographic# E0.6   [1] (®️)       registered
       @@ -1190,7 +1205,8 @@ E0020..E007F  ; Emoji_Component      # E0.0  [96] (󠀠..󠁿)      tag space..c
        1F6D3..1F6D4  ; Extended_Pictographic# E0.0   [2] (🛓..🛔)    STUPA..PAGODA
        1F6D5         ; Extended_Pictographic# E12.0  [1] (🛕)       hindu temple
        1F6D6..1F6D7  ; Extended_Pictographic# E13.0  [2] (🛖..🛗)    hut..elevator
       -1F6D8..1F6DC  ; Extended_Pictographic# E0.0   [5] (🛘..🛜)    <reserved-1F6D8>..<reserved-1F6DC>
       +1F6D8..1F6DB  ; Extended_Pictographic# E0.0   [4] (🛘..🛛)    <reserved-1F6D8>..<reserved-1F6DB>
       +1F6DC         ; Extended_Pictographic# E15.0  [1] (🛜)       wireless
        1F6DD..1F6DF  ; Extended_Pictographic# E14.0  [3] (🛝..🛟)    playground slide..ring buoy
        1F6E0..1F6E5  ; Extended_Pictographic# E0.7   [6] (🛠️..🛥️)    hammer and wrench..motor boat
        1F6E6..1F6E8  ; Extended_Pictographic# E0.0   [3] (🛦..🛨)    UP-POINTING MILITARY AIRPLANE..UP-POINTING SMALL AIRPLANE
       @@ -1207,7 +1223,7 @@ E0020..E007F  ; Emoji_Component      # E0.0  [96] (󠀠..󠁿)      tag space..c
        1F6FA         ; Extended_Pictographic# E12.0  [1] (🛺)       auto rickshaw
        1F6FB..1F6FC  ; Extended_Pictographic# E13.0  [2] (🛻..🛼)    pickup truck..roller skate
        1F6FD..1F6FF  ; Extended_Pictographic# E0.0   [3] (🛽..🛿)    <reserved-1F6FD>..<reserved-1F6FF>
       -1F774..1F77F  ; Extended_Pictographic# E0.0  [12] (🝴..🝿)    <reserved-1F774>..<reserved-1F77F>
       +1F774..1F77F  ; Extended_Pictographic# E0.0  [12] (🝴..🝿)    LOT OF FORTUNE..ORCUS
        1F7D5..1F7DF  ; Extended_Pictographic# E0.0  [11] (🟕..🟟)    CIRCLED TRIANGLE..<reserved-1F7DF>
        1F7E0..1F7EB  ; Extended_Pictographic# E12.0 [12] (🟠..🟫)    orange circle..brown square
        1F7EC..1F7EF  ; Extended_Pictographic# E0.0   [4] (🟬..🟯)    <reserved-1F7EC>..<reserved-1F7EF>
       @@ -1266,30 +1282,37 @@ E0020..E007F  ; Emoji_Component      # E0.0  [96] (󠀠..󠁿)      tag space..c
        1FA00..1FA6F  ; Extended_Pictographic# E0.0 [112] (🨀..🩯)    NEUTRAL CHESS KING..<reserved-1FA6F>
        1FA70..1FA73  ; Extended_Pictographic# E12.0  [4] (🩰..🩳)    ballet shoes..shorts
        1FA74         ; Extended_Pictographic# E13.0  [1] (🩴)       thong sandal
       -1FA75..1FA77  ; Extended_Pictographic# E0.0   [3] (🩵..🩷)    <reserved-1FA75>..<reserved-1FA77>
       +1FA75..1FA77  ; Extended_Pictographic# E15.0  [3] (🩵..🩷)    light blue heart..pink heart
        1FA78..1FA7A  ; Extended_Pictographic# E12.0  [3] (🩸..🩺)    drop of blood..stethoscope
        1FA7B..1FA7C  ; Extended_Pictographic# E14.0  [2] (🩻..🩼)    x-ray..crutch
        1FA7D..1FA7F  ; Extended_Pictographic# E0.0   [3] (🩽..🩿)    <reserved-1FA7D>..<reserved-1FA7F>
        1FA80..1FA82  ; Extended_Pictographic# E12.0  [3] (🪀..🪂)    yo-yo..parachute
        1FA83..1FA86  ; Extended_Pictographic# E13.0  [4] (🪃..🪆)    boomerang..nesting dolls
       -1FA87..1FA8F  ; Extended_Pictographic# E0.0   [9] (🪇..🪏)    <reserved-1FA87>..<reserved-1FA8F>
       +1FA87..1FA88  ; Extended_Pictographic# E15.0  [2] (🪇..🪈)    maracas..flute
       +1FA89..1FA8F  ; Extended_Pictographic# E0.0   [7] (🪉..🪏)    <reserved-1FA89>..<reserved-1FA8F>
        1FA90..1FA95  ; Extended_Pictographic# E12.0  [6] (🪐..🪕)    ringed planet..banjo
        1FA96..1FAA8  ; Extended_Pictographic# E13.0 [19] (🪖..🪨)    military helmet..rock
        1FAA9..1FAAC  ; Extended_Pictographic# E14.0  [4] (🪩..🪬)    mirror ball..hamsa
       -1FAAD..1FAAF  ; Extended_Pictographic# E0.0   [3] (🪭..🪯)    <reserved-1FAAD>..<reserved-1FAAF>
       +1FAAD..1FAAF  ; Extended_Pictographic# E15.0  [3] (🪭..🪯)    folding hand fan..khanda
        1FAB0..1FAB6  ; Extended_Pictographic# E13.0  [7] (🪰..🪶)    fly..feather
        1FAB7..1FABA  ; Extended_Pictographic# E14.0  [4] (🪷..🪺)    lotus..nest with eggs
       -1FABB..1FABF  ; Extended_Pictographic# E0.0   [5] (🪻..🪿)    <reserved-1FABB>..<reserved-1FABF>
       +1FABB..1FABD  ; Extended_Pictographic# E15.0  [3] (🪻..🪽)    hyacinth..wing
       +1FABE         ; Extended_Pictographic# E0.0   [1] (🪾)       <reserved-1FABE>
       +1FABF         ; Extended_Pictographic# E15.0  [1] (🪿)       goose
        1FAC0..1FAC2  ; Extended_Pictographic# E13.0  [3] (🫀..🫂)    anatomical heart..people hugging
        1FAC3..1FAC5  ; Extended_Pictographic# E14.0  [3] (🫃..🫅)    pregnant man..person with crown
       -1FAC6..1FACF  ; Extended_Pictographic# E0.0  [10] (🫆..🫏)    <reserved-1FAC6>..<reserved-1FACF>
       +1FAC6..1FACD  ; Extended_Pictographic# E0.0   [8] (🫆..🫍)    <reserved-1FAC6>..<reserved-1FACD>
       +1FACE..1FACF  ; Extended_Pictographic# E15.0  [2] (🫎..🫏)    moose..donkey
        1FAD0..1FAD6  ; Extended_Pictographic# E13.0  [7] (🫐..🫖)    blueberries..teapot
        1FAD7..1FAD9  ; Extended_Pictographic# E14.0  [3] (🫗..🫙)    pouring liquid..jar
       -1FADA..1FADF  ; Extended_Pictographic# E0.0   [6] (🫚..🫟)    <reserved-1FADA>..<reserved-1FADF>
       +1FADA..1FADB  ; Extended_Pictographic# E15.0  [2] (🫚..🫛)    ginger root..pea pod
       +1FADC..1FADF  ; Extended_Pictographic# E0.0   [4] (🫜..🫟)    <reserved-1FADC>..<reserved-1FADF>
        1FAE0..1FAE7  ; Extended_Pictographic# E14.0  [8] (🫠..🫧)    melting face..bubbles
       -1FAE8..1FAEF  ; Extended_Pictographic# E0.0   [8] (🫨..🫯)    <reserved-1FAE8>..<reserved-1FAEF>
       +1FAE8         ; Extended_Pictographic# E15.0  [1] (🫨)       shaking face
       +1FAE9..1FAEF  ; Extended_Pictographic# E0.0   [7] (🫩..🫯)    <reserved-1FAE9>..<reserved-1FAEF>
        1FAF0..1FAF6  ; Extended_Pictographic# E14.0  [7] (🫰..🫶)    hand with index finger and thumb crossed..heart hands
       -1FAF7..1FAFF  ; Extended_Pictographic# E0.0   [9] (🫷..🫿)    <reserved-1FAF7>..<reserved-1FAFF>
       +1FAF7..1FAF8  ; Extended_Pictographic# E15.0  [2] (🫷..🫸)    leftwards pushing hand..rightwards pushing hand
       +1FAF9..1FAFF  ; Extended_Pictographic# E0.0   [7] (🫹..🫿)    <reserved-1FAF9>..<reserved-1FAFF>
        1FC00..1FFFD  ; Extended_Pictographic# E0.0[1022] (🰀..🿽)    <reserved-1FC00>..<reserved-1FFFD>
        
        # Total elements: 3537
 (DIR) diff --git a/gen/case.c b/gen/case.c
       @@ -119,11 +119,14 @@ parse_cp_list(const char *str, uint_least32_t **cp, size_t *cplen)
                }
        
                /* go through the string again, parsing the numbers */
       -        for (i = 0, tmp1 = tmp2 = str; tmp2 != NULL; i++, tmp1 = tmp2 + 1) {
       +        for (i = 0, tmp1 = tmp2 = str; tmp2 != NULL; i++) {
                        tmp2 = strchr(tmp1, ' ');
                        if (hextocp(tmp1, tmp2 ? (size_t)(tmp2 - tmp1) : strlen(tmp1), &((*cp)[i]))) {
                                return 1;
                        }
       +                if (tmp2 != NULL) {
       +                        tmp1 = tmp2 + 1;
       +                }
                }
        
                return 0;
       @@ -166,7 +169,8 @@ specialcasing_callback(const char *file, char **field, size_t nfields,
        
                /*
                 * overwrite value in "single mapping" property table by the
       -         * special value 0x110000 + (offset in special case array)
       +         * special value 0x110000 + (offset in special case array),
       +         * even if the special case has length 1
                 */
                prop_upper[cp].property = (int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
                prop_lower[cp].property = (int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
       @@ -297,5 +301,18 @@ main(int argc, char *argv[])
                }
                printf("};\n\n");
        
       +        free(comp_lower.data);
       +        free(comp_lower.offset);
       +        free(comp_title.data);
       +        free(comp_title.offset);
       +        free(comp_upper.data);
       +        free(comp_upper.offset);
       +        free(mm_lower.major);
       +        free(mm_lower.minor);
       +        free(mm_title.major);
       +        free(mm_title.minor);
       +        free(mm_upper.major);
       +        free(mm_upper.minor);
       +
                return 0;
        }
 (DIR) diff --git a/gen/util.c b/gen/util.c
       @@ -34,7 +34,7 @@ struct break_test_payload
        static void *
        reallocate_array(void *p, size_t len, size_t size)
        {
       -        if (len > 0 && size > (size_t)(-1) / len) {
       +        if (len > 0 && size > SIZE_MAX / len) {
                        errno = ENOMEM;
                        return NULL;
                }
       @@ -76,7 +76,7 @@ hextocp(const char *str, size_t len, uint_least32_t *cp)
                               (uint_least32_t)(str[i] - relative + off);
                }
        
       -        if (*cp > 0x10ffff) {
       +        if (*cp > UINT32_C(0x10FFFF)) {
                        fprintf(stderr, "hextocp: '%.*s' is too large.\n",
                                (int)len, str);
                        return 1;
       @@ -251,14 +251,14 @@ properties_compress(const struct properties *prop,
                uint_least32_t cp, i;
        
                /* initialization */
       -        if (!(comp->offset = malloc((size_t)0x110000 * sizeof(*(comp->offset))))) {
       +        if (!(comp->offset = malloc((size_t)UINT32_C(0x110000) * sizeof(*(comp->offset))))) {
                        fprintf(stderr, "malloc: %s\n", strerror(errno));
                        exit(1);
                }
                comp->data = NULL;
                comp->datalen = 0;
        
       -        for (cp = 0; cp < 0x110000; cp++) {
       +        for (cp = 0; cp < UINT32_C(0x110000); cp++) {
                        for (i = 0; i < comp->datalen; i++) {
                                if (!memcmp(&(prop[cp]), &(comp->data[i]), sizeof(*prop))) {
                                        /* found a match! */
       @@ -692,7 +692,13 @@ break_test_list_print(const struct break_test *test, size_t testlen,
        void
        break_test_list_free(struct break_test *test, size_t testlen)
        {
       -        (void)testlen;
       +        size_t i;
       +
       +        for (i = 0; i < testlen; i++) {
       +                free(test[i].cp);
       +                free(test[i].len);
       +                free(test[i].descr);
       +        }
        
                free(test);
        }
 (DIR) diff --git a/grapheme.h b/grapheme.h
       @@ -6,12 +6,7 @@
        #include <stddef.h>
        #include <stdint.h>
        
       -typedef struct grapheme_internal_segmentation_state {
       -        uint_least8_t prop;
       -        bool prop_set;
       -        bool gb11_flag;
       -        bool gb12_13_flag;
       -} GRAPHEME_STATE;
       +#define GRAPHEME_INVALID_CODEPOINT UINT32_C(0xFFFD)
        
        enum grapheme_bidirectional_override {
                GRAPHEME_BIDIRECTIONAL_OVERRIDE_NONE,
       @@ -19,9 +14,25 @@ enum grapheme_bidirectional_override {
                GRAPHEME_BIDIRECTIONAL_OVERRIDE_RTL,
        };
        
       -#define GRAPHEME_INVALID_CODEPOINT UINT32_C(0xFFFD)
       +size_t grapheme_bidirectional_logical_to_visual(const uint_least32_t *, size_t,
       +                                                enum grapheme_bidirectional_override,
       +                                                uint_least32_t *, size_t);
       +size_t grapheme_bidirectional_logical_to_visual_utf8(const char *, size_t,
       +                                                     enum grapheme_bidirectional_override,
       +                                                     char *, size_t);
       +
       +size_t grapheme_decode_utf8(const char *, size_t, uint_least32_t *);
       +size_t grapheme_encode_utf8(uint_least32_t, char *, size_t);
        
       -bool grapheme_is_character_break(uint_least32_t, uint_least32_t, GRAPHEME_STATE *);
       +bool grapheme_is_character_break(uint_least32_t, uint_least32_t, uint_least16_t *);
       +
       +bool grapheme_is_lowercase(const uint_least32_t *, size_t, size_t *);
       +bool grapheme_is_titlecase(const uint_least32_t *, size_t, size_t *);
       +bool grapheme_is_uppercase(const uint_least32_t *, size_t, size_t *);
       +
       +bool grapheme_is_lowercase_utf8(const char *, size_t, size_t *);
       +bool grapheme_is_titlecase_utf8(const char *, size_t, size_t *);
       +bool grapheme_is_uppercase_utf8(const char *, size_t, size_t *);
        
        size_t grapheme_next_character_break(const uint_least32_t *, size_t);
        size_t grapheme_next_line_break(const uint_least32_t *, size_t);
       @@ -33,30 +44,12 @@ size_t grapheme_next_line_break_utf8(const char *, size_t);
        size_t grapheme_next_sentence_break_utf8(const char *, size_t);
        size_t grapheme_next_word_break_utf8(const char *, size_t);
        
       -size_t grapheme_to_uppercase(const uint_least32_t *, size_t, uint_least32_t *, size_t);
        size_t grapheme_to_lowercase(const uint_least32_t *, size_t, uint_least32_t *, size_t);
        size_t grapheme_to_titlecase(const uint_least32_t *, size_t, uint_least32_t *, size_t);
       +size_t grapheme_to_uppercase(const uint_least32_t *, size_t, uint_least32_t *, size_t);
        
       -size_t grapheme_to_uppercase_utf8(const char *, size_t, char *, size_t);
        size_t grapheme_to_lowercase_utf8(const char *, size_t, char *, size_t);
        size_t grapheme_to_titlecase_utf8(const char *, size_t, char *, size_t);
       -
       -bool grapheme_is_uppercase(const uint_least32_t *, size_t, size_t *);
       -bool grapheme_is_lowercase(const uint_least32_t *, size_t, size_t *);
       -bool grapheme_is_titlecase(const uint_least32_t *, size_t, size_t *);
       -
       -bool grapheme_is_uppercase_utf8(const char *, size_t, size_t *);
       -bool grapheme_is_lowercase_utf8(const char *, size_t, size_t *);
       -bool grapheme_is_titlecase_utf8(const char *, size_t, size_t *);
       -
       -size_t grapheme_bidirectional_logical_to_visual(const uint_least32_t *, size_t,
       -                                                enum grapheme_bidirectional_override,
       -                                                uint_least32_t *, size_t);
       -size_t grapheme_bidirectional_logical_to_visual_utf8(const char *, size_t,
       -                                                     enum grapheme_bidirectional_override,
       -                                                     char *, size_t);
       -
       -size_t grapheme_decode_utf8(const char *, size_t, uint_least32_t *);
       -size_t grapheme_encode_utf8(uint_least32_t, char *, size_t);
       +size_t grapheme_to_uppercase_utf8(const char *, size_t, char *, size_t);
        
        #endif /* GRAPHEME_H */
 (DIR) diff --git a/man/grapheme_decode_utf8.3 b/man/grapheme_decode_utf8.3
       @@ -1,101 +0,0 @@
       -.Dd 2021-12-22
       -.Dt GRAPHEME_DECODE_UTF8 3
       -.Os suckless.org
       -.Sh NAME
       -.Nm grapheme_decode_utf8
       -.Nd decode first codepoint in UTF-8-encoded string
       -.Sh SYNOPSIS
       -.In grapheme.h
       -.Ft size_t
       -.Fn grapheme_decode_utf8 "const char *str" "size_t len" "uint_least32_t *cp"
       -.Sh DESCRIPTION
       -The
       -.Fn grapheme_decode_utf8
       -function decodes the next codepoint in the UTF-8-encoded string
       -.Va str
       -of length
       -.Va len .
       -If the UTF-8-sequence is invalid (overlong encoding, unexpected byte,
       -string ends unexpectedly, empty string, etc.) the decoding is stopped
       -at the last processed byte and the decoded codepoint set to
       -.Dv GRAPHEME_INVALID_CODEPOINT .
       -.Pp
       -If
       -.Va cp
       -is not
       -.Dv NULL
       -the decoded codepoint is stored in the memory pointed to by
       -.Va cp .
       -.Pp
       -Given NUL has a unique 1 byte representation, it is safe to operate on
       -NUL-terminated strings by setting
       -.Va len
       -to
       -.Dv SIZE_MAX
       -(stdint.h is already included by grapheme.h) and terminating when
       -.Va cp
       -is 0 (see
       -.Sx EXAMPLES
       -for an example).
       -.Sh RETURN VALUES
       -The
       -.Fn grapheme_decode_utf8
       -function returns the number of processed bytes and 0 if
       -.Va str
       -is
       -.Dv NULL
       -or
       -.Va len
       -is 0.
       -If the string ends unexpectedly in a multibyte sequence, the desired
       -length (that is larger than
       -.Va len )
       -is returned.
       -.Sh EXAMPLES
       -.Bd -literal
       -/* cc (-static) -o example example.c -lgrapheme */
       -#include <grapheme.h>
       -#include <inttypes.h>
       -#include <stdio.h>
       -
       -void
       -print_cps(const char *str, size_t len)
       -{
       -        size_t ret, off;
       -        uint_least32_t cp;
       -
       -        for (off = 0; off < len; off += ret) {
       -                if ((ret = grapheme_decode_utf8(str + off,
       -                                                len - off, &cp)) > (len - off)) {
       -                        /*
       -                         * string ended unexpectedly in the middle of a
       -                         * multibyte sequence and we have the choice
       -                         * here to possibly expand str by ret - len + off
       -                         * bytes to get a full sequence, but we just
       -                         * bail out in this case.
       -                         */
       -                        break;
       -                }
       -                printf("%"PRIxLEAST32"\\n", cp);
       -        }
       -}
       -
       -void
       -print_cps_nul_terminated(const char *str)
       -{
       -        size_t ret, off;
       -        uint_least32_t cp;
       -
       -        for (off = 0; (ret = grapheme_decode_utf8(str + off,
       -                                                  SIZE_MAX, &cp)) > 0 &&
       -             cp != 0; off += ret) {
       -                printf("%"PRIxLEAST32"\\n", cp);
       -        }
       -}
       -.Ed
       -.Sh SEE ALSO
       -.Xr grapheme_encode_utf8 3 ,
       -.Xr grapheme_is_character_break 3 ,
       -.Xr libgrapheme 7
       -.Sh AUTHORS
       -.An Laslo Hunhold Aq Mt dev@frign.de
 (DIR) diff --git a/man/grapheme_decode_utf8.sh b/man/grapheme_decode_utf8.sh
       @@ -0,0 +1,102 @@
       +cat << EOF
       +.Dd ${MAN_DATE}
       +.Dt GRAPHEME_DECODE_UTF8 3
       +.Os suckless.org
       +.Sh NAME
       +.Nm grapheme_decode_utf8
       +.Nd decode first codepoint in UTF-8-encoded string
       +.Sh SYNOPSIS
       +.In grapheme.h
       +.Ft size_t
       +.Fn grapheme_decode_utf8 "const char *str" "size_t len" "uint_least32_t *cp"
       +.Sh DESCRIPTION
       +The
       +.Fn grapheme_decode_utf8
       +function decodes the first codepoint in the UTF-8-encoded string
       +.Va str
       +of length
       +.Va len .
       +If the UTF-8-sequence is invalid (overlong encoding, unexpected byte,
       +string ends unexpectedly, empty string, etc.) the decoding is stopped
       +at the last processed byte and the decoded codepoint set to
       +.Dv GRAPHEME_INVALID_CODEPOINT .
       +.Pp
       +If
       +.Va cp
       +is not
       +.Dv NULL
       +the decoded codepoint is stored in the memory pointed to by
       +.Va cp .
       +.Pp
       +Given NUL has a unique 1 byte representation, it is safe to operate on
       +NUL-terminated strings by setting
       +.Va len
       +to
       +.Dv SIZE_MAX
       +(stdint.h is already included by grapheme.h) and terminating when
       +.Va cp
       +is 0 (see
       +.Sx EXAMPLES
       +for an example).
       +.Sh RETURN VALUES
       +The
       +.Fn grapheme_decode_utf8
       +function returns the number of processed bytes and 0 if
       +.Va str
       +is
       +.Dv NULL
       +or
       +.Va len
       +is 0.
       +If the string ends unexpectedly in a multibyte sequence, the desired
       +length (that is larger than
       +.Va len )
       +is returned.
       +.Sh EXAMPLES
       +.Bd -literal
       +/* cc (-static) -o example example.c -lgrapheme */
       +#include <grapheme.h>
       +#include <inttypes.h>
       +#include <stdio.h>
       +
       +void
       +print_cps(const char *str, size_t len)
       +{
       +        size_t ret, off;
       +        uint_least32_t cp;
       +
       +        for (off = 0; off < len; off += ret) {
       +                if ((ret = grapheme_decode_utf8(str + off,
       +                                                len - off, &cp)) > (len - off)) {
       +                        /*
       +                         * string ended unexpectedly in the middle of a
       +                         * multibyte sequence and we have the choice
       +                         * here to possibly expand str by ret - len + off
       +                         * bytes to get a full sequence, but we just
       +                         * bail out in this case.
       +                         */
       +                        break;
       +                }
       +                printf("%"PRIxLEAST32"\\\\n", cp);
       +        }
       +}
       +
       +void
       +print_cps_nul_terminated(const char *str)
       +{
       +        size_t ret, off;
       +        uint_least32_t cp;
       +
       +        for (off = 0; (ret = grapheme_decode_utf8(str + off,
       +                                                  SIZE_MAX, &cp)) > 0 &&
       +             cp != 0; off += ret) {
       +                printf("%"PRIxLEAST32"\\\\n", cp);
       +        }
       +}
       +.Ed
       +.Sh SEE ALSO
       +.Xr grapheme_encode_utf8 3 ,
       +.Xr libgrapheme 7
       +.Sh AUTHORS
       +.An Laslo Hunhold Aq Mt dev@frign.de
       +EOF
 (DIR) diff --git a/man/grapheme_encode_utf8.3 b/man/grapheme_encode_utf8.3
       @@ -1,98 +0,0 @@
       -.Dd 2021-12-22
       -.Dt GRAPHEME_ENCODE_UTF8 3
       -.Os suckless.org
       -.Sh NAME
       -.Nm grapheme_encode_utf8
       -.Nd encode codepoint into UTF-8 string
       -.Sh SYNOPSIS
       -.In grapheme.h
       -.Ft size_t
       -.Fn grapheme_encode_utf8 "uint_least32_t cp" "char *str" "size_t len"
       -.Sh DESCRIPTION
       -The
       -.Fn grapheme_encode_utf8
       -function encodes the codepoint
       -.Va cp
       -into a UTF-8-string.
       -If
       -.Va str
       -is not
       -.Dv NULL
       -and
       -.Va len
       -is large enough it writes the UTF-8-string to the memory pointed to by
       -.Va str .
       -.Sh RETURN VALUES
       -The
       -.Fn grapheme_encode_utf8
       -function returns the length (in bytes) of the UTF-8-string resulting
       -from encoding
       -.Va cp .
       -When the returned value is larger than
       -.Va len
       -it is indicated that the output string is too small and no data has been
       -written.
       -.Sh EXAMPLES
       -.Bd -literal
       -/* cc (-static) -o example example.c -lgrapheme */
       -#include <grapheme.h>
       -#include <stddef.h>
       -#include <stdlib.h>
       -
       -size_t
       -cps_to_utf8(const uint_least32_t *cp, size_t cplen, char *str, size_t len)
       -{
       -        size_t i, off, ret;
       -
       -        for (i = 0, off = 0; i < cplen; i++, off += ret) {
       -                if ((ret = grapheme_encode_utf8(cp[i], str + off,
       -                                                len - off)) > (len - off)) {
       -                        /* buffer too small */
       -                        break;
       -                }
       -        }
       -
       -        return off;
       -}
       -
       -size_t
       -cps_bytelen(const uint_least32_t *cp, size_t cplen)
       -{
       -        size_t i, len;
       -
       -        for (i = 0, len = 0; i < cplen; i++) {
       -                len += grapheme_encode_utf8(cp[i], NULL, 0);
       -        }
       -
       -        return len;
       -}
       -
       -char *
       -cps_to_utf8_alloc(const uint_least32_t *cp, size_t cplen)
       -{
       -        char *str;
       -        size_t len, i, ret, off;
       -
       -        len = cps_bytelen(cp, cplen);
       -
       -        if (!(str = malloc(len))) {
       -                return NULL;
       -        }
       -
       -        for (i = 0, off = 0; i < cplen; i++, off += ret) {
       -                if ((ret = grapheme_encode_utf8(cp[i], str + off,
       -                                                len - off)) > (len - off)) {
       -                        /* buffer too small */
       -                        break;
       -                }
       -        }
       -        str[off] = '\\0';
       -
       -        return str;
       -}
       -.Ed
       -.Sh SEE ALSO
       -.Xr grapheme_decode_utf8 3 ,
       -.Xr libgrapheme 7
       -.Sh AUTHORS
       -.An Laslo Hunhold Aq Mt dev@frign.de
 (DIR) diff --git a/man/grapheme_encode_utf8.sh b/man/grapheme_encode_utf8.sh
       @@ -0,0 +1,103 @@
       +cat << EOF
       +.Dd ${MAN_DATE}
       +.Dt GRAPHEME_ENCODE_UTF8 3
       +.Os suckless.org
       +.Sh NAME
       +.Nm grapheme_encode_utf8
       +.Nd encode codepoint into UTF-8 string
       +.Sh SYNOPSIS
       +.In grapheme.h
       +.Ft size_t
       +.Fn grapheme_encode_utf8 "uint_least32_t cp" "char *str" "size_t len"
       +.Sh DESCRIPTION
       +The
       +.Fn grapheme_encode_utf8
       +function encodes the codepoint
       +.Va cp
       +into a UTF-8-string.
       +If
       +.Va str
       +is not
       +.Dv NULL
       +and
       +.Va len
       +is large enough it writes the UTF-8-string to the memory pointed to by
       +.Va str .
       +Otherwise no data is written.
       +.Sh RETURN VALUES
       +The
       +.Fn grapheme_encode_utf8
       +function returns the length (in bytes) of the UTF-8-string resulting
       +from encoding
       +.Va cp ,
       +even if
       +.Va len
       +is not large enough or
       +.Va str
       +is
       +.Dv NULL .
       +.Sh EXAMPLES
       +.Bd -literal
       +/* cc (-static) -o example example.c -lgrapheme */
       +#include <grapheme.h>
       +#include <stddef.h>
       +#include <stdlib.h>
       +
       +size_t
       +cps_to_utf8(const uint_least32_t *cp, size_t cplen, char *str, size_t len)
       +{
       +        size_t i, off, ret;
       +
       +        for (i = 0, off = 0; i < cplen; i++, off += ret) {
       +                if ((ret = grapheme_encode_utf8(cp[i], str + off,
       +                                                len - off)) > (len - off)) {
       +                        /* buffer too small */
       +                        break;
       +                }
       +        }
       +
       +        return off;
       +}
       +
       +size_t
       +cps_bytelen(const uint_least32_t *cp, size_t cplen)
       +{
       +        size_t i, len;
       +
       +        for (i = 0, len = 0; i < cplen; i++) {
       +                len += grapheme_encode_utf8(cp[i], NULL, 0);
       +        }
       +
       +        return len;
       +}
       +
       +char *
       +cps_to_utf8_alloc(const uint_least32_t *cp, size_t cplen)
       +{
       +        char *str;
       +        size_t len, i, ret, off;
       +
       +        len = cps_bytelen(cp, cplen);
       +
       +        if (!(str = malloc(len + 1))) {
       +                return NULL;
       +        }
       +
       +        for (i = 0, off = 0; i < cplen; i++, off += ret) {
       +                if ((ret = grapheme_encode_utf8(cp[i], str + off,
       +                                                len - off)) > (len - off)) {
       +                        /* buffer too small */
       +                        break;
       +                }
       +        }
       +        str[off] = '\\\\0';
       +
       +        return str;
       +}
       +.Ed
       +.Sh SEE ALSO
       +.Xr grapheme_decode_utf8 3 ,
       +.Xr libgrapheme 7
       +.Sh AUTHORS
       +.An Laslo Hunhold Aq Mt dev@frign.de
       +EOF
 (DIR) diff --git a/man/grapheme_is_character_break.3 b/man/grapheme_is_character_break.3
       @@ -1,80 +0,0 @@
       -.Dd 2021-12-22
       -.Dt GRAPHEME_IS_CHARACTER_BREAK 3
       -.Os suckless.org
       -.Sh NAME
       -.Nm grapheme_is_character_break
       -.Nd test for a grapheme cluster break between two codepoints
       -.Sh SYNOPSIS
       -.In grapheme.h
       -.Ft size_t
       -.Fn grapheme_is_character_break "uint_least32_t cp1" "uint_least32_t cp2" "GRAPHEME_STATE *state"
       -.Sh DESCRIPTION
       -The
       -.Fn grapheme_is_character_break
       -function determines if there is a grapheme cluster break (see
       -.Xr libgrapheme 7 )
       -between the two codepoints
       -.Va cp1
       -and
       -.Va cp2 .
       -By specification this decision depends on a
       -.Va state
       -that can at most be completely reset after detecting a break and must
       -be reset every time one deviates from sequential processing.
       -.Pp
       -If
       -.Va state
       -is
       -.Dv NULL
       -.Fn grapheme_is_character_break
       -behaves as if it was called with a fully reset state.
       -.Sh RETURN VALUES
       -The
       -.Fn grapheme_is_character_break
       -function returns
       -.Va true
       -if there is a grapheme cluster break between the codepoints
       -.Va cp1
       -and
       -.Va cp2
       -and
       -.Va false
       -if there is not.
       -.Sh EXAMPLES
       -.Bd -literal
       -/* cc (-static) -o example example.c -lgrapheme */
       -#include <grapheme.h>
       -#include <stdint.h>
       -#include <stdio.h>
       -#include <stdlib.h>
       -
       -int
       -main(void)
       -{
       -        GRAPHEME_STATE state = { 0 };
       -        uint_least32_t s1[] = ..., s2[] = ...; /* two input arrays */
       -        size_t i;
       -
       -        for (i = 0; i + 1 < sizeof(s1) / sizeof(*s1); i++) {
       -                if (grapheme_is_character_break(s[i], s[i + 1], &state)) {
       -                        printf("break in s1 at offset %zu\n", i);
       -                }
       -        }
       -        memset(&state, 0, sizeof(state)); /* reset state */
       -        for (i = 0; i + 1 < sizeof(s2) / sizeof(*s2); i++) {
       -                if (grapheme_is_character_break(s[i], s[i + 1], &state)) {
       -                        printf("break in s2 at offset %zu\n", i);
       -                }
       -        }
       -
       -        return 0;
       -}
       -.Ed
       -.Sh SEE ALSO
       -.Xr grapheme_next_character_break 3 ,
       -.Xr libgrapheme 7
       -.Sh STANDARDS
       -.Fn grapheme_is_character_break
       -is compliant with the Unicode 14.0.0 specification.
       -.Sh AUTHORS
       -.An Laslo Hunhold Aq Mt dev@frign.de
 (DIR) diff --git a/man/grapheme_is_character_break.sh b/man/grapheme_is_character_break.sh
       @@ -0,0 +1,83 @@
       +cat << EOF
       +.Dd ${MAN_DATE}
       +.Dt GRAPHEME_IS_CHARACTER_BREAK 3
       +.Os suckless.org
       +.Sh NAME
       +.Nm grapheme_is_character_break
       +.Nd test for a grapheme cluster break between two codepoints
       +.Sh SYNOPSIS
       +.In grapheme.h
       +.Ft bool
       +.Fn grapheme_is_character_break "uint_least32_t cp1" "uint_least32_t cp2" "uint_least16_t *state"
       +.Sh DESCRIPTION
       +The
       +.Fn grapheme_is_character_break
       +function determines if there is a grapheme cluster break (see
       +.Xr libgrapheme 7 )
       +between the two codepoints
       +.Va cp1
       +and
       +.Va cp2 .
       +By specification this decision depends on a
       +.Va state
       +that can at most be completely reset after detecting a break and must
       +be reset every time one deviates from sequential processing.
       +.Pp
       +If
       +.Va state
       +is
       +.Dv NULL
       +.Fn grapheme_is_character_break
       +behaves as if it was called with a fully reset state.
       +.Sh RETURN VALUES
       +The
       +.Fn grapheme_is_character_break
       +function returns
       +.Va true
       +if there is a grapheme cluster break between the codepoints
       +.Va cp1
       +and
       +.Va cp2
       +and
       +.Va false
       +if there is not.
       +.Sh EXAMPLES
       +.Bd -literal
       +/* cc (-static) -o example example.c -lgrapheme */
       +#include <grapheme.h>
       +#include <stdint.h>
       +#include <stdio.h>
       +#include <stdlib.h>
       +
       +int
       +main(void)
       +{
       +        uint_least16_t state = 0;
       +        uint_least32_t s1[] = ..., s2[] = ...; /* two input arrays */
       +        size_t i;
       +
       +        for (i = 0; i + 1 < sizeof(s1) / sizeof(*s1); i++) {
       +                if (grapheme_is_character_break(s1[i], s1[i + 1], &state)) {
       +                        printf("break in s1 at offset %zu\\\\n", i);
       +                }
       +        }
       +        state = 0; /* reset state */
       +        for (i = 0; i + 1 < sizeof(s2) / sizeof(*s2); i++) {
       +                if (grapheme_is_character_break(s2[i], s2[i + 1], &state)) {
       +                        printf("break in s2 at offset %zu\\\\n", i);
       +                }
       +        }
       +
       +        return 0;
       +}
       +.Ed
       +.Sh SEE ALSO
       +.Xr grapheme_next_character_break 3 ,
       +.Xr grapheme_next_character_break_utf8 3 ,
       +.Xr libgrapheme 7
       +.Sh STANDARDS
       +.Fn grapheme_is_character_break
       +is compliant with the Unicode ${UNICODE_VERSION} specification.
       +.Sh AUTHORS
       +.An Laslo Hunhold Aq Mt dev@frign.de
       +EOF
 (DIR) diff --git a/man/grapheme_is_lowercase.sh b/man/grapheme_is_lowercase.sh
       @@ -0,0 +1,3 @@
       +ENCODING="codepoint" \
       +CASE="lowercase" \
       +        $SH man/template/is_case.sh
 (DIR) diff --git a/man/grapheme_is_lowercase_utf8.sh b/man/grapheme_is_lowercase_utf8.sh
       @@ -0,0 +1,3 @@
       +ENCODING="utf8" \
       +CASE="lowercase" \
       +        $SH man/template/is_case.sh
 (DIR) diff --git a/man/grapheme_is_titlecase.sh b/man/grapheme_is_titlecase.sh
       @@ -0,0 +1,3 @@
       +ENCODING="codepoint" \
       +CASE="titlecase" \
       +        $SH man/template/is_case.sh
 (DIR) diff --git a/man/grapheme_is_titlecase_utf8.sh b/man/grapheme_is_titlecase_utf8.sh
       @@ -0,0 +1,3 @@
       +ENCODING="utf8" \
       +CASE="titlecase" \
       +        $SH man/template/is_case.sh
 (DIR) diff --git a/man/grapheme_is_uppercase.sh b/man/grapheme_is_uppercase.sh
       @@ -0,0 +1,3 @@
       +ENCODING="codepoint" \
       +CASE="uppercase" \
       +        $SH man/template/is_case.sh
 (DIR) diff --git a/man/grapheme_is_uppercase_utf8.sh b/man/grapheme_is_uppercase_utf8.sh
       @@ -0,0 +1,3 @@
       +ENCODING="utf8" \
       +CASE="uppercase" \
       +        $SH man/template/is_case.sh
 (DIR) diff --git a/man/grapheme_next_character_break.sh b/man/grapheme_next_character_break.sh
       @@ -0,0 +1,4 @@
       +ENCODING="codepoint" \
       +TYPE="character" \
       +REALTYPE="grapheme cluster" \
       +        $SH man/template/next_break.sh
 (DIR) diff --git a/man/grapheme_next_character_break_utf8.3 b/man/grapheme_next_character_break_utf8.3
       @@ -1,92 +0,0 @@
       -.Dd 2021-12-22
       -.Dt GRAPHEME_NEXT_CHARACTER_BREAK_UTF8 3
       -.Os suckless.org
       -.Sh NAME
       -.Nm grapheme_next_character_break_utf8
       -.Nd determine byte-offset to next grapheme cluster break
       -.Sh SYNOPSIS
       -.In grapheme.h
       -.Ft size_t
       -.Fn grapheme_next_character_break_utf8 "const char *str" "size_t len"
       -.Sh DESCRIPTION
       -The
       -.Fn grapheme_next_character_break_utf8
       -function computes the offset (in bytes) to the next grapheme
       -cluster break (see
       -.Xr libgrapheme 7 )
       -in the UTF-8-encoded string
       -.Va str
       -of length
       -.Va len .
       -If a grapheme cluster begins at
       -.Va str
       -this offset is equal to the length of said grapheme cluster.
       -.Pp
       -If
       -.Va len
       -is set to
       -.Dv SIZE_MAX
       -(stdint.h is already included by grapheme.h) the string
       -.Va str
       -is interpreted to be NUL-terminated and processing stops when a
       -NUL-byte is encountered.
       -.Pp
       -For non-UTF-8 input data
       -.Xr grapheme_is_character_break 3
       -can be used instead.
       -.Sh RETURN VALUES
       -The
       -.Fn grapheme_next_character_break_utf8
       -function returns the offset (in bytes) to the next grapheme cluster
       -break in
       -.Va str
       -or 0 if
       -.Va str
       -is
       -.Dv NULL .
       -.Sh EXAMPLES
       -.Bd -literal
       -/* cc (-static) -o example example.c -lgrapheme */
       -#include <grapheme.h>
       -#include <stdint.h>
       -#include <stdio.h>
       -
       -int
       -main(void)
       -{
       -        /* UTF-8 encoded input */
       -        char *s = "T\\xC3\\xABst \\xF0\\x9F\\x91\\xA8\\xE2\\x80\\x8D\\xF0"
       -                  "\\x9F\\x91\\xA9\\xE2\\x80\\x8D\\xF0\\x9F\\x91\\xA6 \\xF0"
       -                  "\\x9F\\x87\\xBA\\xF0\\x9F\\x87\\xB8 \\xE0\\xA4\\xA8\\xE0"
       -                  "\\xA5\\x80 \\xE0\\xAE\\xA8\\xE0\\xAE\\xBF!";
       -        size_t ret, len, off;
       -
       -        printf("Input: \\"%s\\"\\n", s);
       -
       -        /* print each grapheme cluster with byte-length */
       -        printf("Grapheme clusters in NUL-delimited input:\\n");
       -        for (off = 0; s[off] != '\\0'; off += ret) {
       -                ret = grapheme_next_character_break_utf8(s + off, SIZE_MAX);
       -                printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret);
       -        }
       -        printf("\\n");
       -
       -        /* do the same, but this time string is length-delimited */
       -        len = 17;
       -        printf("Grapheme clusters in input delimited to %zu bytes:\\n", len);
       -        for (off = 0; off < len; off += ret) {
       -                ret = grapheme_next_character_break_utf8(s + off, len - off);
       -                printf("%2zu bytes | %.*s\\n", ret, (int)ret, s + off, ret);
       -        }
       -
       -        return 0;
       -}
       -.Ed
       -.Sh SEE ALSO
       -.Xr grapheme_is_character_break 3 ,
       -.Xr libgrapheme 7
       -.Sh STANDARDS
       -.Fn grapheme_next_character_break_utf8
       -is compliant with the Unicode 14.0.0 specification.
       -.Sh AUTHORS
       -.An Laslo Hunhold Aq Mt dev@frign.de
 (DIR) diff --git a/man/grapheme_next_character_break_utf8.sh b/man/grapheme_next_character_break_utf8.sh
       @@ -0,0 +1,4 @@
       +ENCODING="utf8" \
       +TYPE="character" \
       +REALTYPE="grapheme cluster" \
       +        $SH man/template/next_break.sh
 (DIR) diff --git a/man/grapheme_next_line_break.sh b/man/grapheme_next_line_break.sh
       @@ -0,0 +1,4 @@
       +ENCODING="codepoint" \
       +TYPE="line" \
       +REALTYPE="possible line" \
       +        $SH man/template/next_break.sh
 (DIR) diff --git a/man/grapheme_next_line_break_utf8.sh b/man/grapheme_next_line_break_utf8.sh
       @@ -0,0 +1,4 @@
       +ENCODING="utf8" \
       +TYPE="line" \
       +REALTYPE="possible line" \
       +        $SH man/template/next_break.sh
 (DIR) diff --git a/man/grapheme_next_sentence_break.sh b/man/grapheme_next_sentence_break.sh
       @@ -0,0 +1,4 @@
       +ENCODING="codepoint" \
       +TYPE="sentence" \
       +REALTYPE="sentence" \
       +        $SH man/template/next_break.sh
 (DIR) diff --git a/man/grapheme_next_sentence_break_utf8.sh b/man/grapheme_next_sentence_break_utf8.sh
       @@ -0,0 +1,4 @@
       +ENCODING="utf8" \
       +TYPE="sentence" \
       +REALTYPE="sentence" \
       +        $SH man/template/next_break.sh
 (DIR) diff --git a/man/grapheme_next_word_break.sh b/man/grapheme_next_word_break.sh
       @@ -0,0 +1,4 @@
       +ENCODING="codepoint" \
       +TYPE="word" \
       +REALTYPE="word" \
       +        $SH man/template/next_break.sh
 (DIR) diff --git a/man/grapheme_next_word_break_utf8.sh b/man/grapheme_next_word_break_utf8.sh
       @@ -0,0 +1,4 @@
       +ENCODING="utf8" \
       +TYPE="word" \
       +REALTYPE="word" \
       +        $SH man/template/next_break.sh
 (DIR) diff --git a/man/grapheme_to_lowercase.sh b/man/grapheme_to_lowercase.sh
       @@ -0,0 +1,3 @@
       +ENCODING="codepoint" \
       +CASE="lowercase" \
       +        $SH man/template/to_case.sh
 (DIR) diff --git a/man/grapheme_to_lowercase_utf8.sh b/man/grapheme_to_lowercase_utf8.sh
       @@ -0,0 +1,3 @@
       +ENCODING="utf8" \
       +CASE="lowercase" \
       +        $SH man/template/to_case.sh
 (DIR) diff --git a/man/grapheme_to_titlecase.sh b/man/grapheme_to_titlecase.sh
       @@ -0,0 +1,3 @@
       +ENCODING="codepoint" \
       +CASE="titlecase" \
       +        $SH man/template/to_case.sh
 (DIR) diff --git a/man/grapheme_to_titlecase_utf8.sh b/man/grapheme_to_titlecase_utf8.sh
       @@ -0,0 +1,3 @@
       +ENCODING="utf8" \
       +CASE="titlecase" \
       +        $SH man/template/to_case.sh
 (DIR) diff --git a/man/grapheme_to_uppercase.sh b/man/grapheme_to_uppercase.sh
       @@ -0,0 +1,3 @@
       +ENCODING="codepoint" \
       +CASE="uppercase" \
       +        $SH man/template/to_case.sh
 (DIR) diff --git a/man/grapheme_to_uppercase_utf8.sh b/man/grapheme_to_uppercase_utf8.sh
       @@ -0,0 +1,3 @@
       +ENCODING="utf8" \
       +CASE="uppercase" \
       +        $SH man/template/to_case.sh
 (DIR) diff --git a/man/libgrapheme.7 b/man/libgrapheme.7
       @@ -1,140 +0,0 @@
       -.Dd 2021-12-22
       -.Dt LIBGRAPHEME 7
       -.Os suckless.org
       -.Sh NAME
       -.Nm libgrapheme
       -.Nd unicode string library
       -.Sh SYNOPSIS
       -.In grapheme.h
       -.Sh DESCRIPTION
       -The
       -.Nm
       -library provides functions to properly handle Unicode strings according
       -to the Unicode specification.
       -Unicode strings are made up of user-perceived characters (so-called
       -.Dq grapheme clusters ,
       -see
       -.Sx MOTIVATION )
       -that are made up of one or more Unicode codepoints, which in turn
       -are encoded in one or more bytes in an encoding like UTF-8.
       -.Pp
       -There is a widespread misconception that it was enough to simply
       -determine codepoints in a string and treat them as user-perceived
       -characters to be Unicode compliant.
       -While this may work in some cases, this assumption quickly breaks,
       -especially for non-Western languages and decomposed Unicode strings
       -where user-perceived characters are usually represented using multiple
       -codepoints.
       -.Pp
       -Despite this complicated multilevel structure of Unicode strings,
       -.Nm
       -provides methods to work with them at the byte-level (i.e. UTF-8
       -.Sq char
       -arrays) while also offering codepoint-level methods.
       -.Pp
       -Every documented function's manual page provides a self-contained
       -example illustrating the possible usage.
       -.Sh SEE ALSO
       -.Xr grapheme_decode_utf8 3 ,
       -.Xr grapheme_encode_utf8 3 ,
       -.Xr grapheme_is_character_break 3 ,
       -.Xr grapheme_next_character_break 3
       -.Sh STANDARDS
       -.Nm
       -is compliant with the Unicode 14.0.0 specification.
       -.Sh MOTIVATION
       -The idea behind every character encoding scheme like ASCII or Unicode
       -is to express abstract characters (which can be thought of as shapes
       -making up a written language). ASCII for instance, which comprises the
       -range 0 to 127, assigns the number 65 (0x41) to the abstract character
       -.Sq A .
       -This number is called a
       -.Dq codepoint ,
       -and all codepoints of an encoding make up its so-called
       -.Dq code space .
       -.Pp
       -Unicode's code space is much larger, ranging from 0 to 0x10FFFF, but its
       -first 128 codepoints are identical to ASCII's. The additional code
       -points are needed as Unicode's goal is to express all writing systems
       -of the world.
       -To give an example, the abstract character
       -.Sq \[u00C4]
       -is not expressable in ASCII, given no ASCII codepoint has been assigned
       -to it.
       -It can be expressed in Unicode, though, with the codepoint 196 (0xC4).
       -.Pp
       -One may assume that this process is straightfoward, but as more and
       -more codepoints were assigned to abstract characters, the Unicode
       -Consortium (that defines the Unicode standard) was facing a problem:
       -Many (mostly non-European) languages have such a large amount of
       -abstract characters that it would exhaust the available Unicode code
       -space if one tried to assign a codepoint to each abstract character.
       -The solution to that problem is best introduced with an example: Consider
       -the abstract character
       -.Sq \[u01DE] ,
       -which is
       -.Sq A
       -with an umlaut and a macron added to it.
       -In this sense, one can consider
       -.Sq \[u01DE]
       -as a two-fold modification (namely
       -.Dq add umlaut
       -and
       -.Dq add macron )
       -of the
       -.Dq base character
       -.Sq A .
       -.Pp
       -The Unicode Consortium adapted this idea by assigning codepoints to
       -modifications.
       -For example, the codepoint 0x308 represents adding an umlaut and 0x304
       -represents adding a macron, and thus, the codepoint sequence
       -.Dq 0x41 0x308 0x304 ,
       -namely the base character
       -.Sq A
       -followed by the umlaut and macron modifiers, represents the abstract
       -character
       -.Sq \[u01DE] .
       -As a side-note, the single codepoint 0x1DE was also assigned to
       -.Sq \[u01DE] ,
       -which is a good example for the fact that there can be multiple
       -representations of a single abstract character in Unicode.
       -.Pp
       -Expressing a single abstract character with multiple codepoints solved
       -the code space exhaustion-problem, and the concept has been greatly
       -expanded since its first introduction (emojis, joiners, etc.). A sequence
       -(which can also have the length 1) of codepoints that belong together
       -this way and represents an abstract character is called a
       -.Dq grapheme cluster .
       -.Pp
       -In many applications it is necessary to count the number of
       -user-perceived characters, i.e. grapheme clusters, in a string.
       -A good example for this is a terminal text editor, which needs to
       -properly align characters on a grid.
       -This is pretty simple with ASCII-strings, where you just count the number
       -of bytes (as each byte is a codepoint and each codepoint is a grapheme
       -cluster).
       -With Unicode-strings, it is a common mistake to simply adapt the
       -ASCII-approach and count the number of code points.
       -This is wrong, as, for example, the sequence
       -.Dq 0x41 0x308 0x304 ,
       -while made up of 3 codepoints, is a single grapheme cluster and
       -represents the user-perceived character
       -.Sq \[u01DE] .
       -.Pp
       -The proper way to segment a string into user-perceived characters
       -is to segment it into its grapheme clusters by applying the Unicode
       -grapheme cluster breaking algorithm (UAX #29).
       -It is based on a complex ruleset and lookup-tables and determines if a
       -grapheme cluster ends or is continued between two codepoints.
       -Libraries like ICU and libunistring, which also offer this functionality,
       -are often bloated, not correct, difficult to use or not reasonably
       -statically linkable.
       -.Pp
       -Analogously, the standard provides algorithms to separate strings by
       -words, sentences and lines, convert cases and compare strings.
       -The motivation behind
       -.Nm
       -is to make unicode handling suck less and abide by the UNIX philosophy.
       -.Sh AUTHORS
       -.An Laslo Hunhold Aq Mt dev@frign.de
 (DIR) diff --git a/man/libgrapheme.sh b/man/libgrapheme.sh
       @@ -0,0 +1,167 @@
       +cat << EOF
       +.Dd ${MAN_DATE}
       +.Dt LIBGRAPHEME 7
       +.Os suckless.org
       +.Sh NAME
       +.Nm libgrapheme
       +.Nd unicode string library
       +.Sh SYNOPSIS
       +.In grapheme.h
       +.Sh DESCRIPTION
       +The
       +.Nm
       +library provides functions to properly handle Unicode strings according
       +to the Unicode specification in regard to character, word, sentence and
       +line segmentation and case detection and conversion.
       +.Pp
       +Unicode strings are made up of user-perceived characters (so-called
       +.Dq grapheme clusters ,
       +see
       +.Sx MOTIVATION )
       +that are composed of one or more Unicode codepoints, which in turn
       +are encoded in one or more bytes in an encoding like UTF-8.
       +.Pp
       +There is a widespread misconception that it was enough to simply
       +determine codepoints in a string and treat them as user-perceived
       +characters to be Unicode compliant.
       +While this may work in some cases, this assumption quickly breaks,
       +especially for non-Western languages and decomposed Unicode strings
       +where user-perceived characters are usually represented using multiple
       +codepoints.
       +.Pp
       +Despite this complicated multilevel structure of Unicode strings,
       +.Nm
       +provides methods to work with them at the byte-level (i.e. UTF-8
       +.Sq char
       +arrays) while also offering codepoint-level methods.
       +Additionally, it is a
       +.Dq freestanding
       +library (see ISO/IEC 9899:1999 section 4.6) and thus does not depend on
       +a standard library. This makes it easy to use in bare metal environments.
       +.Pp
       +Every documented function's manual page provides a self-contained
       +example illustrating the possible usage.
       +.Sh SEE ALSO
       +.Xr grapheme_decode_utf8 3 ,
       +.Xr grapheme_encode_utf8 3 ,
       +.Xr grapheme_is_character_break 3 ,
       +.Xr grapheme_is_lowercase 3 ,
       +.Xr grapheme_is_lowercase_utf8 3 ,
       +.Xr grapheme_is_titlecase 3 ,
       +.Xr grapheme_is_titlecase_utf8 3 ,
       +.Xr grapheme_is_uppercase 3 ,
       +.Xr grapheme_is_uppercase_utf8 3 ,
       +.Xr grapheme_next_character_break 3 ,
       +.Xr grapheme_next_character_break_utf8 3 ,
       +.Xr grapheme_next_line_break 3 ,
       +.Xr grapheme_next_line_break_utf8 3 ,
       +.Xr grapheme_next_sentence_break 3 ,
       +.Xr grapheme_next_sentence_break_utf8 3 ,
       +.Xr grapheme_next_word_break 3 ,
       +.Xr grapheme_next_word_break_utf8 3 ,
       +.Xr grapheme_to_lowercase 3 ,
       +.Xr grapheme_to_lowercase_utf8 3 ,
       +.Xr grapheme_to_titlecase 3 ,
       +.Xr grapheme_to_titlecase_utf8 3 ,
       +.Xr grapheme_to_uppercase 3 ,
       +.Xr grapheme_to_uppercase_utf8 3
       +.Sh STANDARDS
       +.Nm
       +is compliant with the Unicode ${UNICODE_VERSION} specification.
       +.Sh MOTIVATION
       +The idea behind every character encoding scheme like ASCII or Unicode
       +is to express abstract characters (which can be thought of as shapes
       +making up a written language). ASCII for instance, which comprises the
       +range 0 to 127, assigns the number 65 (0x41) to the abstract character
       +.Sq A .
       +This number is called a
       +.Dq codepoint ,
       +and all codepoints of an encoding make up its so-called
       +.Dq code space .
       +.Pp
       +Unicode's code space is much larger, ranging from 0 to 0x10FFFF, but its
       +first 128 codepoints are identical to ASCII's. The additional code
       +points are needed as Unicode's goal is to express all writing systems
       +of the world.
       +To give an example, the abstract character
       +.Sq \[u00C4]
       +is not expressible in ASCII, given no ASCII codepoint has been assigned
       +to it.
       +It can be expressed in Unicode, though, with the codepoint 196 (0xC4).
       +.Pp
       +One may assume that this process is straightforward, but as more and
       +more codepoints were assigned to abstract characters, the Unicode
       +Consortium (that defines the Unicode standard) was facing a problem:
       +Many (mostly non-European) languages have such a large amount of
       +abstract characters that it would exhaust the available Unicode code
       +space if one tried to assign a codepoint to each abstract character.
       +The solution to that problem is best introduced with an example: Consider
       +the abstract character
       +.Sq \[u01DE] ,
       +which is
       +.Sq A
       +with an umlaut and a macron added to it.
       +In this sense, one can consider
       +.Sq \[u01DE]
       +as a two-fold modification (namely
       +.Dq add umlaut
       +and
       +.Dq add macron )
       +of the
       +.Dq base character
       +.Sq A .
       +.Pp
       +The Unicode Consortium adopted this idea by assigning codepoints to
       +modifications.
       +For example, the codepoint 0x308 represents adding an umlaut and 0x304
       +represents adding a macron, and thus, the codepoint sequence
       +.Dq 0x41 0x308 0x304 ,
       +namely the base character
       +.Sq A
       +followed by the umlaut and macron modifiers, represents the abstract
       +character
       +.Sq \[u01DE] .
       +As a side-note, the single codepoint 0x1DE was also assigned to
       +.Sq \[u01DE] ,
       +which is a good example for the fact that there can be multiple
       +representations of a single abstract character in Unicode.
       +.Pp
       +Expressing a single abstract character with multiple codepoints solved
       +the code space exhaustion-problem, and the concept has been greatly
       +expanded since its first introduction (emojis, joiners, etc.). A sequence
       +(which can also have the length 1) of codepoints that belong together
       +this way and represents an abstract character is called a
       +.Dq grapheme cluster .
       +.Pp
       +In many applications it is necessary to count the number of
       +user-perceived characters, i.e. grapheme clusters, in a string.
       +A good example for this is a terminal text editor, which needs to
       +properly align characters on a grid.
       +This is pretty simple with ASCII-strings, where you just count the number
       +of bytes (as each byte is a codepoint and each codepoint is a grapheme
       +cluster).
       +With Unicode-strings, it is a common mistake to simply adapt the
       +ASCII-approach and count the number of code points.
       +This is wrong, as, for example, the sequence
       +.Dq 0x41 0x308 0x304 ,
       +while made up of 3 codepoints, is a single grapheme cluster and
       +represents the user-perceived character
       +.Sq \[u01DE] .
       +.Pp
       +The proper way to segment a string into user-perceived characters
       +is to segment it into its grapheme clusters by applying the Unicode
       +grapheme cluster breaking algorithm (UAX #29).
       +It is based on a complex ruleset and lookup-tables and determines if a
       +grapheme cluster ends or is continued between two codepoints.
       +Libraries like ICU and libunistring, which also offer this functionality,
       +are often bloated, not correct, difficult to use or not reasonably
       +statically linkable.
       +.Pp
       +Analogously, the standard provides algorithms to separate strings by
       +words, sentences and lines, convert cases and compare strings.
       +The motivation behind
       +.Nm
       +is to make Unicode handling suck less and abide by the UNIX philosophy.
       +.Sh AUTHORS
       +.An Laslo Hunhold Aq Mt dev@frign.de
       +EOF
 (DIR) diff --git a/man/template/is_case.sh b/man/template/is_case.sh
       @@ -0,0 +1,67 @@
       +if [ "$ENCODING" = "utf8" ]; then
       +        UNIT="byte"
       +        ARRAYTYPE="UTF-8-encoded string"
       +        SUFFIX="_utf8"
       +        ANTISUFFIX=""
       +        DATATYPE="char"
       +else
       +        UNIT="codepoint"
       +        ARRAYTYPE="codepoint array"
       +        SUFFIX=""
       +        ANTISUFFIX="_utf8"
       +        DATATYPE="uint_least32_t"
       +fi
       +
       +cat << EOF
       +.Dd ${MAN_DATE}
       +.Dt GRAPHEME_IS_$(printf "%s%s" "$CASE" "$SUFFIX" | tr [:lower:] [:upper:]) 3
       +.Os suckless.org
       +.Sh NAME
       +.Nm grapheme_is_${CASE}${SUFFIX}
       +.Nd check if ${ARRAYTYPE} is ${CASE}
       +.Sh SYNOPSIS
       +.In grapheme.h
       +.Ft bool
       +.Fn grapheme_is_${CASE}${SUFFIX} "const ${DATATYPE} *str" "size_t len" "size_t *caselen"
       +.Sh DESCRIPTION
       +The
       +.Fn grapheme_is_${CASE}${SUFFIX}
       +function checks if the ${ARRAYTYPE}
       +.Va str
       +is ${CASE} and writes the length of the matching ${CASE}-sequence to the integer pointed to by
       +.Va caselen ,
       +unless
       +.Va caselen
       +is set to
       +.Dv NULL .
       +.Pp
       +If
       +.Va len
       +is set to
       +.Dv SIZE_MAX
       +(stdint.h is already included by grapheme.h) the ${ARRAYTYPE}
       +.Va str
       +is interpreted to be NUL-terminated and processing stops when a
       +NUL-byte is encountered.
       +.Pp
       +For $(if [ "$ENCODING" != "utf8" ]; then printf "UTF-8-encoded"; else printf "non-UTF-8"; fi) input data
       +.Xr grapheme_is_${CASE}${ANTISUFFIX} 3
       +can be used instead.
       +.Sh RETURN VALUES
       +The
       +.Fn grapheme_is_${CASE}${SUFFIX}
       +function returns
       +.Dv true
       +if the ${ARRAYTYPE}
       +.Va str
       +is ${CASE}, otherwise
       +.Dv false .
       +.Sh SEE ALSO
       +.Xr grapheme_is_${CASE}${ANTISUFFIX} 3 ,
       +.Xr libgrapheme 7
       +.Sh STANDARDS
       +.Fn grapheme_is_${CASE}${SUFFIX}
       +is compliant with the Unicode ${UNICODE_VERSION} specification.
       +.Sh AUTHORS
       +.An Laslo Hunhold Aq Mt dev@frign.de
       +EOF
 (DIR) diff --git a/man/template/next_break.sh b/man/template/next_break.sh
       @@ -0,0 +1,112 @@
       +if [ "$ENCODING" = "utf8" ]; then
       +        UNIT="byte"
       +        SUFFIX="_utf8"
       +        ANTISUFFIX=""
       +else
       +        UNIT="codepoint"
       +        SUFFIX=""
       +        ANTISUFFIX="_utf8"
       +fi
       +
       +cat << EOF
       +.Dd ${MAN_DATE}
       +.Dt GRAPHEME_NEXT_$(printf "%s_break%s" "$TYPE" "$SUFFIX" | tr [:lower:] [:upper:]) 3
       +.Os suckless.org
       +.Sh NAME
       +.Nm grapheme_next_${TYPE}_break${SUFFIX}
       +.Nd determine ${UNIT}-offset to next ${REALTYPE} break
       +.Sh SYNOPSIS
       +.In grapheme.h
       +.Ft size_t
       +.Fn grapheme_next_${TYPE}_break${SUFFIX} "const $(if [ "$ENCODING" = "utf8" ]; then printf "char"; else printf "uint_least32_t"; fi) *str" "size_t len"
       +.Sh DESCRIPTION
       +The
       +.Fn grapheme_next_${TYPE}_break${SUFFIX}
       +function computes the offset (in ${UNIT}s) to the next ${REALTYPE}
       +break (see
       +.Xr libgrapheme 7 )
       +in the $(if [ "$ENCODING" = "utf8" ]; then printf "UTF-8-encoded string"; else printf "codepoint array"; fi)
       +.Va str
       +of length
       +.Va len .$(if [ "$TYPE" != "line" ]; then printf "\nIf a ${REALTYPE} begins at
       +.Va str
       +this offset is equal to the length of said ${REALTYPE}."; fi)
       +.Pp
       +If
       +.Va len
       +is set to
       +.Dv SIZE_MAX
       +(stdint.h is already included by grapheme.h) the string
       +.Va str
       +is interpreted to be NUL-terminated and processing stops when
       +a $(if [ "$ENCODING" = "utf8" ]; then printf "NUL-byte"; else printf "codepoint with the value 0"; fi) is encountered.
       +.Pp
       +For $(if [ "$ENCODING" != "utf8" ]; then printf "UTF-8-encoded"; else printf "non-UTF-8"; fi) input
       +data$(if [ "$TYPE" = "character" ] && [ "$ENCODING" = "utf8" ]; then printf "\n.Xr grapheme_is_character_break 3 and"; fi)
       +.Xr grapheme_next_${TYPE}_break${ANTISUFFIX} 3
       +can be used instead.
       +.Sh RETURN VALUES
       +The
       +.Fn grapheme_next_${TYPE}_break${SUFFIX}
       +function returns the offset (in ${UNIT}s) to the next ${REALTYPE}
       +break in
       +.Va str
       +or 0 if
       +.Va str
       +is
       +.Dv NULL .
       +EOF
       +
       +if [ "$ENCODING" = "utf8" ]; then
       +cat << EOF
       +.Sh EXAMPLES
       +.Bd -literal
       +/* cc (-static) -o example example.c -lgrapheme */
       +#include <grapheme.h>
       +#include <stdint.h>
       +#include <stdio.h>
       +
       +int
       +main(void)
       +{
       +        /* UTF-8 encoded input */
       +        char *s = "T\\\\xC3\\\\xABst \\\\xF0\\\\x9F\\\\x91\\\\xA8\\\\xE2\\\\x80\\\\x8D\\\\xF0"
       +                  "\\\\x9F\\\\x91\\\\xA9\\\\xE2\\\\x80\\\\x8D\\\\xF0\\\\x9F\\\\x91\\\\xA6 \\\\xF0"
       +                  "\\\\x9F\\\\x87\\\\xBA\\\\xF0\\\\x9F\\\\x87\\\\xB8 \\\\xE0\\\\xA4\\\\xA8\\\\xE0"
       +                  "\\\\xA5\\\\x80 \\\\xE0\\\\xAE\\\\xA8\\\\xE0\\\\xAE\\\\xBF!";
       +        size_t ret, len, off;
       +
       +        printf("Input: \\\\"%s\\\\"\\\\n", s);
       +
       +        /* print each ${REALTYPE} with byte-length */
       +        printf("${REALTYPE}s in NUL-delimited input:\\\\n");
       +        for (off = 0; s[off] != '\\\\0'; off += ret) {
       +                ret = grapheme_next_${TYPE}_break_utf8(s + off, SIZE_MAX);
       +                printf("%2zu bytes | %.*s\\\\n", ret, (int)ret, s + off);
       +        }
       +        printf("\\\\n");
       +
       +        /* do the same, but this time string is length-delimited */
       +        len = 17;
       +        printf("${REALTYPE}s in input delimited to %zu bytes:\\\\n", len);
       +        for (off = 0; off < len; off += ret) {
       +                ret = grapheme_next_${TYPE}_break_utf8(s + off, len - off);
       +                printf("%2zu bytes | %.*s\\\\n", ret, (int)ret, s + off);
       +        }
       +
       +        return 0;
       +}
       +.Ed
       +EOF
       +fi
       +
       +cat << EOF
       +.Sh SEE ALSO$(if [ "$TYPE" = "character" ] && [ "$ENCODING" != "utf8" ]; then printf "\n.Xr grapheme_is_character_break 3 ,"; fi)
       +.Xr grapheme_next_${TYPE}_break${ANTISUFFIX} 3 ,
       +.Xr libgrapheme 7
       +.Sh STANDARDS
       +.Fn grapheme_next_${TYPE}_break${SUFFIX}
       +is compliant with the Unicode ${UNICODE_VERSION} specification.
       +.Sh AUTHORS
       +.An Laslo Hunhold Aq Mt dev@frign.de
       +EOF
 (DIR) diff --git a/man/template/to_case.sh b/man/template/to_case.sh
       @@ -0,0 +1,72 @@
       +if [ "$ENCODING" = "utf8" ]; then
       +        UNIT="byte"
       +        ARRAYTYPE="UTF-8-encoded string"
       +        SUFFIX="_utf8"
       +        ANTISUFFIX=""
       +        DATATYPE="char"
       +else
       +        UNIT="codepoint"
       +        ARRAYTYPE="codepoint array"
       +        SUFFIX=""
       +        ANTISUFFIX="_utf8"
       +        DATATYPE="uint_least32_t"
       +fi
       +
       +cat << EOF
       +.Dd ${MAN_DATE}
       +.Dt GRAPHEME_TO_$(printf "%s%s" "$CASE" "$SUFFIX" | tr [:lower:] [:upper:]) 3
       +.Os suckless.org
       +.Sh NAME
       +.Nm grapheme_to_${CASE}${SUFFIX}
       +.Nd convert ${ARRAYTYPE} to ${CASE}
       +.Sh SYNOPSIS
       +.In grapheme.h
       +.Ft size_t
       +.Fn grapheme_to_${CASE}${SUFFIX} "const ${DATATYPE} *src" "size_t srclen" "${DATATYPE} *dest" "size_t destlen"
       +.Sh DESCRIPTION
       +The
       +.Fn grapheme_to_${CASE}${SUFFIX}
       +function converts the ${ARRAYTYPE}
       +.Va src
       +to ${CASE} and writes the result to
       +.Va dest
       +up to
       +.Va destlen ,
       +unless
       +.Va dest
       +is set to
       +.Dv NULL .
       +.Pp
       +If
       +.Va srclen
       +is set to
       +.Dv SIZE_MAX
       +(stdint.h is already included by grapheme.h) the ${ARRAYTYPE}
       +.Va src
       +is interpreted to be NUL-terminated and processing stops when a
       +NUL-byte is encountered.
       +.Pp
       +For $(if [ "$ENCODING" != "utf8" ]; then printf "UTF-8-encoded"; else printf "non-UTF-8"; fi) input data
       +.Xr grapheme_to_${CASE}${ANTISUFFIX} 3
       +can be used instead.
       +.Sh RETURN VALUES
       +The
       +.Fn grapheme_to_${CASE}${SUFFIX}
       +function returns the number of ${UNIT}s in the array resulting
       +from converting
       +.Va src
       +to ${CASE}, even if
       +.Va destlen
       +is not large enough or
       +.Va dest
       +is
       +.Dv NULL .
       +.Sh SEE ALSO
       +.Xr grapheme_to_${CASE}${ANTISUFFIX} 3 ,
       +.Xr libgrapheme 7
       +.Sh STANDARDS
       +.Fn grapheme_to_${CASE}${SUFFIX}
       +is compliant with the Unicode ${UNICODE_VERSION} specification.
       +.Sh AUTHORS
       +.An Laslo Hunhold Aq Mt dev@frign.de
       +EOF
 (DIR) diff --git a/src/case.c b/src/case.c
       @@ -1,4 +1,5 @@
        /* See LICENSE file for copyright and license details. */
       +#include <stddef.h>
        #include <stdint.h>
        
        #include "../grapheme.h"
       @@ -8,9 +9,9 @@
        static inline enum case_property
        get_case_property(uint_least32_t cp)
        {
       -        if (likely(cp <= 0x10FFFF)) {
       +        if (likely(cp <= UINT32_C(0x10FFFF))) {
                        return (enum case_property)
       -                       case_minor[case_major[cp >> 8] + (cp & 0xff)];
       +                       case_minor[case_major[cp >> 8] + (cp & 0xFF)];
                } else {
                        return CASE_PROP_OTHER;
                }
       @@ -20,35 +21,31 @@ static inline int_least32_t
        get_case_offset(uint_least32_t cp, const uint_least16_t *major,
                        const int_least32_t *minor)
        {
       -        if (likely(cp <= 0x10FFFF)) {
       +        if (likely(cp <= UINT32_C(0x10FFFF))) {
                        /*
                         * this value might be larger than or equal to 0x110000
                         * for the special-case-mapping. This needs to be handled
                         * separately
                         */
       -                return minor[major[cp >> 8] + (cp & 0xff)];
       +                return minor[major[cp >> 8] + (cp & 0xFF)];
                } else {
                        return 0;
                }
        }
        
        static inline size_t
       -to_case(const void *src, size_t srclen, void *dest, size_t destlen,
       -        size_t srcnumprocess, uint_least8_t final_sigma_level,
       -        size_t (*get_codepoint)(const void *, size_t, size_t, uint_least32_t *),
       -        size_t (*set_codepoint)(uint_least32_t, void *, size_t, size_t),
       -        const uint_least16_t *major, const int_least32_t *minor,
       -        const struct special_case *sc)
       +to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w,
       +        uint_least8_t final_sigma_level, const uint_least16_t *major,
       +        const int_least32_t *minor, const struct special_case *sc)
        {
       +        HERODOTUS_READER tmp;
                enum case_property prop;
       -        size_t srcoff, destoff, res, tmp, off, i;
       +        enum herodotus_status s;
       +        size_t off, i;
                uint_least32_t cp, tmp_cp;
                int_least32_t map;
        
       -        for (srcoff = 0, destoff = 0; srcoff < srcnumprocess; srcoff += res) {
       -                /* read in next source codepoint */
       -                res = get_codepoint((const char *)src, srclen, srcoff, &cp);
       -
       +        for (; herodotus_read_codepoint(r, true, &cp) == HERODOTUS_STATUS_SUCCESS;) {
                        if (sc == lower_special) {
                                /*
                                 * For the special Final_Sigma-rule (see SpecialCasing.txt),
       @@ -72,8 +69,10 @@ to_case(const void *src, size_t srclen, void *dest, size_t destlen,
                                         * if the succeeding character is cased, invalidating
                                         * the after-condition
                                         */
       -                                for (tmp = srcoff + res, prop = NUM_CASE_PROPS; tmp < srclen; ) {
       -                                        tmp += get_codepoint(src, srclen, tmp, &tmp_cp);
       +                                herodotus_reader_copy(r, &tmp);
       +                                for (prop = NUM_CASE_PROPS;
       +                                     (s = herodotus_read_codepoint(&tmp, true, &tmp_cp)) ==
       +                                     HERODOTUS_STATUS_SUCCESS; ) {
                                                prop = get_case_property(tmp_cp);
        
                                                if (prop != CASE_PROP_CASE_IGNORABLE &&
       @@ -83,20 +82,19 @@ to_case(const void *src, size_t srclen, void *dest, size_t destlen,
                                        }
        
                                        /*
       -                                 * Now prop is something other than case-ignorable.
       +                                 * Now prop is something other than case-ignorable or
       +                                 * the source-string ended.
                                         * If it is something other than cased, we know
                                         * that the after-condition holds
                                         */
       -                                if (prop != CASE_PROP_CASED &&
       -                                    prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
       +                                if (s != HERODOTUS_STATUS_SUCCESS ||
       +                                    (prop != CASE_PROP_CASED &&
       +                                     prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
                                                /*
                                                 * write GREEK SMALL LETTER FINAL SIGMA to
                                                 * destination
                                                 */
       -                                        destoff += set_codepoint(UINT32_C(0x03C2),
       -                                                                 dest,
       -                                                                 destlen,
       -                                                                 destoff);
       +                                        herodotus_write_codepoint(w, UINT32_C(0x03C2));
                                                
                                                /* reset Final_Sigma-state and continue */
                                                final_sigma_level = 0;
       @@ -132,208 +130,176 @@ to_case(const void *src, size_t srclen, void *dest, size_t destlen,
                                off = (uint_least32_t)map - UINT32_C(0x110000);
        
                                for (i = 0; i < sc[off].cplen; i++) {
       -                                if (likely(destoff < destlen)) {
       -                                        /*
       -                                         * write special mapping to destination
       -                                         */
       -                                        destoff += set_codepoint(sc[off].cp[i],
       -                                                                 dest,
       -                                                                 destlen,
       -                                                                 destoff);
       -                                } else {
       -                                        /*
       -                                         * further increase destoff to indicate
       -                                         * how much buffer space we need
       -                                         */
       -                                        destoff += set_codepoint(sc[off].cp[i],
       -                                                                 NULL, 0, 0);
       -                                }
       +                                herodotus_write_codepoint(w, sc[off].cp[i]);
                                }
                        } else {
                                /* we have a simple mapping */
       -                        if (likely(destoff < destlen)) {
       -                                destoff += set_codepoint((uint_least32_t)((int_least32_t)cp + map),
       -                                                         dest, destlen, destoff);
       -                        } else {
       -                                destoff += set_codepoint((uint_least32_t)((int_least32_t)cp + map),
       -                                                         NULL, 0, 0);
       -                        }
       +                        herodotus_write_codepoint(w, (uint_least32_t)
       +                                                  ((int_least32_t)cp + map));
                        }
                }
        
       -        if (set_codepoint == set_codepoint_utf8 && destlen > 0) {
       -                /*
       -                 * NUL-terminate destination to always ensure NUL-termination,
       -                 * unless in check mode.
       -                 * Just like with snprintf() a return value >= destlen indicates
       -                 * truncation.
       -                 */
       -                ((char *)dest)[(destoff < destlen) ? destoff : (destlen - 1)] = '\0';
       -        }
       +        herodotus_writer_nul_terminate(w);
        
       -        return destoff;
       +        return herodotus_writer_number_written(w);
       +}
       +
       +static size_t
       +herodotus_next_word_break(const HERODOTUS_READER *r)
       +{
       +        HERODOTUS_READER tmp;
       +
       +        herodotus_reader_copy(r, &tmp);
       +
       +        if (r->type == HERODOTUS_TYPE_CODEPOINT) {
       +                return grapheme_next_word_break(tmp.src, tmp.srclen);
       +        } else { /* r->type == HERODOTUS_TYPE_UTF8 */
       +                return grapheme_next_word_break_utf8(tmp.src, tmp.srclen);
       +        }
        }
        
        static inline size_t
       -to_titlecase(const void *src, size_t srclen, void *dest, size_t destlen,
       -             size_t (*get_codepoint)(const void *, size_t, size_t, uint_least32_t *),
       -             size_t (*set_codepoint)(uint_least32_t, void *, size_t, size_t))
       +to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w)
        {
                enum case_property prop;
       -        size_t next_wb, srcoff, destoff, res;
       +        enum herodotus_status s;
                uint_least32_t cp;
       +        size_t nwb;
        
       -        for (srcoff = destoff = 0; ; ) {
       -                if (get_codepoint == get_codepoint_utf8) {
       -                        if ((next_wb = grapheme_next_word_break_utf8((const char *)src + srcoff,
       -                                                                     srclen - srcoff)) == 0) {
       -                                /* we consumed all of the string */
       -                                break;
       -                        }
       -                } else {
       -                        if ((next_wb = grapheme_next_word_break((const uint_least32_t *)src + srcoff,
       -                                                                srclen - srcoff)) == 0) {
       -                                /* we consumed all of the string */
       -                                break;
       -                        }
       -                }
       -
       -                for (; next_wb > 0 && srcoff < srclen; next_wb -= res, srcoff += res) {
       +        for (; (nwb = herodotus_next_word_break(r)) > 0;) {
       +                herodotus_reader_push_advance_limit(r, nwb);
       +                for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODOTUS_STATUS_SUCCESS;) {
                                /* check if we have a cased character */
       -                        res = get_codepoint(src, srclen, srcoff, &cp);
                                prop = get_case_property(cp);
                                if (prop == CASE_PROP_CASED ||
                                    prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
                                        break;
                                } else {
                                        /* write the data to the output verbatim, it if permits */
       -                                destoff += set_codepoint_utf8(cp, dest, destlen, destoff);
       -                        }
       -                }
       +                                herodotus_write_codepoint(w, cp);
        
       -                if (next_wb > 0) {
       -                        /* get character length */
       -                        res = get_codepoint(src, srclen, srcoff, &cp);
       -
       -                        /* we have a cased character at srcoff, map it to titlecase */
       -                        if (get_codepoint == get_codepoint_utf8) {
       -                                destoff += to_case((const char *)src + srcoff,
       -                                                   srclen - srcoff,
       -                                                   (char *)dest + destoff,
       -                                                   (destoff < destlen) ? (destlen - destoff) : 0,
       -                                                   res, 0,
       -                                                   get_codepoint_utf8,
       -                                                   set_codepoint_utf8, title_major,
       -                                                   title_minor, title_special);
       -                        } else {
       -                                destoff += to_case((const uint_least32_t *)src + srcoff,
       -                                                   srclen - srcoff,
       -                                                   (uint_least32_t *)dest + destoff,
       -                                                   (destoff < destlen) ? (destlen - destoff) : 0,
       -                                                   res, 0,
       -                                                   get_codepoint,
       -                                                   set_codepoint, title_major,
       -                                                   title_minor, title_special);
       +                                /* increment reader */
       +                                herodotus_read_codepoint(r, true, &cp);
                                }
       -
       -                        /* we consumed a character */
       -                        srcoff += res;
       -                        next_wb -= res;
                        }
        
       -                /* cast the rest of the codepoints in the word to lowercase */
       -                if (get_codepoint == get_codepoint_utf8) {
       -                        destoff += to_case((const char *)src + srcoff,
       -                                           srclen - srcoff,
       -                                           (char *)dest + destoff,
       -                                           (destoff < destlen) ? (destlen - destoff) : 0,
       -                                           next_wb, 1,
       -                                           get_codepoint_utf8,
       -                                           set_codepoint_utf8, lower_major,
       -                                           lower_minor, lower_special);
       +                if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
       +                        /* we are done */
       +                        herodotus_reader_pop_limit(r);
       +                        break;
       +                } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
       +                        /*
       +                         * we did not encounter any cased character
       +                         * up to the word break
       +                         */
       +                        herodotus_reader_pop_limit(r);
       +                        continue;
                        } else {
       -                        destoff += to_case((const uint_least32_t *)src + srcoff,
       -                                           srclen - srcoff,
       -                                           (uint_least32_t *)dest + destoff,
       -                                           (destoff < destlen) ? (destlen - destoff) : 0,
       -                                           next_wb, 1,
       -                                           get_codepoint,
       -                                           set_codepoint, lower_major,
       -                                           lower_minor, lower_special);
       +                        /*
       +                         * we encountered a cased character before the word
       +                         * break, convert it to titlecase
       +                         */
       +                        herodotus_reader_push_advance_limit(r,
       +                                herodotus_reader_next_codepoint_break(r));
       +                        to_case(r, w, 0, title_major, title_minor, title_special);
       +                        herodotus_reader_pop_limit(r);
                        }
       -                srcoff += next_wb;
       -        }
        
       -        if (set_codepoint == set_codepoint_utf8) {
       -                /*
       -                 * NUL-terminate destination to always ensure NUL-termination.
       -                 * Just like with snprintf() a return value >= destlen indicates
       -                 * truncation.
       -                 */
       -                ((char *)dest)[(destoff < destlen) ? destoff : (destlen - 1)] = '\0';
       +                /* cast the rest of the codepoints in the word to lowercase */
       +                to_case(r, w, 1, lower_major, lower_minor, lower_special);
       +
       +                /* remove the limit on the word before the next iteration */
       +                herodotus_reader_pop_limit(r);
                }
        
       -        return destoff;
       +        herodotus_writer_nul_terminate(w);
       +
       +        return herodotus_writer_number_written(w);
        }
        
        size_t
        grapheme_to_uppercase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
        {
       -        return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint, set_codepoint,
       -                       upper_major, upper_minor, upper_special);
       +        HERODOTUS_READER r;
       +        HERODOTUS_WRITER w;
       +
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
       +        herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
       +
       +        return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
        }
        
        size_t
        grapheme_to_lowercase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
        {
       -        return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint, set_codepoint,
       -                       lower_major, lower_minor, lower_special);
       +        HERODOTUS_READER r;
       +        HERODOTUS_WRITER w;
       +
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
       +        herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
       +
       +        return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
        }
        
        size_t
        grapheme_to_titlecase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
        {
       -        return to_titlecase(src, srclen, dest, destlen, get_codepoint,
       -                            set_codepoint);
       +        HERODOTUS_READER r;
       +        HERODOTUS_WRITER w;
       +
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
       +        herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
       +
       +        return to_titlecase(&r, &w);
        }
        
        size_t
        grapheme_to_uppercase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
        {
       -        return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint_utf8, set_codepoint_utf8,
       -                       upper_major, upper_minor, upper_special);
       +        HERODOTUS_READER r;
       +        HERODOTUS_WRITER w;
       +
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
       +        herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
       +
       +        return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
        }
        
        size_t
        grapheme_to_lowercase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
        {
       -        return to_case(src, srclen, dest, destlen, srclen, 0, get_codepoint_utf8, set_codepoint_utf8,
       -                       lower_major, lower_minor, lower_special);
       +        HERODOTUS_READER r;
       +        HERODOTUS_WRITER w;
       +
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
       +        herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
        
       +        return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
        }
        
        size_t
        grapheme_to_titlecase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
        {
       -        return to_titlecase(src, srclen, dest, destlen, get_codepoint_utf8,
       -                            set_codepoint_utf8);
       +        HERODOTUS_READER r;
       +        HERODOTUS_WRITER w;
       +
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
       +        herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
       +
       +        return to_titlecase(&r, &w);
        }
        
        static inline bool
       -is_case(const void *src, size_t srclen,
       -        size_t srcnumprocess,
       -        size_t (*get_codepoint)(const void *, size_t, size_t, uint_least32_t *),
       -        const uint_least16_t *major, const int_least32_t *minor,
       -        const struct special_case *sc, size_t *output)
       +is_case(HERODOTUS_READER *r, const uint_least16_t *major,
       +        const int_least32_t *minor, const struct special_case *sc,
       +        size_t *output)
        {
       -        size_t srcoff, new_srcoff, tmp, res, off, i;
       -        uint_least32_t cp, tmp_cp;
       +        size_t off, i;
       +        bool ret = true;
       +        uint_least32_t cp;
                int_least32_t map;
        
       -        for (srcoff = 0; srcoff < srcnumprocess; srcoff = new_srcoff) {
       -                /* read in next source codepoint */
       -                new_srcoff = srcoff + get_codepoint(src, srclen, srcoff, &cp);
       -
       +        for (; herodotus_read_codepoint(r, false, &cp) == HERODOTUS_STATUS_SUCCESS;) {
                        /* get and handle case mapping */
                        if (unlikely((map = get_case_offset(cp, major, minor)) >=
                                     INT32_C(0x110000))) {
       @@ -341,169 +307,164 @@ is_case(const void *src, size_t srclen,
                                 * is the difference to 0x110000*/
                                off = (uint_least32_t)map - UINT32_C(0x110000);
        
       -                        for (i = 0, tmp = srcoff; i < sc[off].cplen; i++, tmp += res) {
       -                                res = get_codepoint(src, srclen, srcoff, &tmp_cp);
       -                                if (tmp_cp != sc[off].cp[i]) {
       -                                        /* we have a difference */
       -                                        if (output) {
       -                                                *output = tmp;
       +                        for (i = 0; i < sc[off].cplen; i++) {
       +                                if (herodotus_read_codepoint(r, false, &cp) ==
       +                                    HERODOTUS_STATUS_SUCCESS) {
       +                                        if (cp != sc[off].cp[i]) {
       +                                                ret = false;
       +                                                goto done;
       +                                        } else {
       +                                                /* move forward */
       +                                                herodotus_read_codepoint(r, true, &cp);
                                                }
       -                                        return false;
       +                                } else {
       +                                        /*
       +                                         * input ended and we didn't see
       +                                         * any difference so far, so this
       +                                         * string is in fact okay
       +                                         */
       +                                        ret = true;
       +                                        goto done;
                                        }
                                }
       -                        new_srcoff = tmp;
                        } else {
                                /* we have a simple mapping */
                                if (cp != (uint_least32_t)((int_least32_t)cp + map)) {
                                        /* we have a difference */
       -                                if (output) {
       -                                        *output = srcoff;
       -                                }
       -                                return false;
       +                                ret = false;
       +                                goto done;
       +                        } else {
       +                                /* move forward */
       +                                herodotus_read_codepoint(r, true, &cp);
                                }
                        }
                }
       -
       +done:
                if (output) {
       -                *output = srcoff;
       +                *output = herodotus_reader_number_read(r);
                }
       -        return true;
       +        return ret;
        }
        
        static inline bool
       -is_titlecase(const void *src, size_t srclen,
       -             size_t (*get_codepoint)(const void *, size_t, size_t, uint_least32_t *),
       -             size_t *output)
       +is_titlecase(HERODOTUS_READER *r, size_t *output)
        {
                enum case_property prop;
       -        size_t next_wb, srcoff, res, tmp_output;
       +        enum herodotus_status s;
       +        bool ret = true;
                uint_least32_t cp;
       +        size_t nwb;
        
       -        for (srcoff = 0; ; ) {
       -                if (get_codepoint == get_codepoint_utf8) {
       -                        if ((next_wb = grapheme_next_word_break_utf8((const char *)src + srcoff,
       -                                                                     srclen - srcoff)) == 0) {
       -                                /* we consumed all of the string */
       -                                break;
       -                        }
       -                } else {
       -                        if ((next_wb = grapheme_next_word_break((const uint_least32_t *)src + srcoff,
       -                                                                srclen - srcoff)) == 0) {
       -                                /* we consumed all of the string */
       -                                break;
       -                        }
       -                }
       -
       -                for (; next_wb > 0 && srcoff < srclen; next_wb -= res, srcoff += res) {
       +        for (; (nwb = herodotus_next_word_break(r)) > 0;) {
       +                herodotus_reader_push_advance_limit(r, nwb);
       +                for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODOTUS_STATUS_SUCCESS;) {
                                /* check if we have a cased character */
       -                        res = get_codepoint(src, srclen, srcoff, &cp);
                                prop = get_case_property(cp);
                                if (prop == CASE_PROP_CASED ||
                                    prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
                                        break;
       +                        } else {
       +                                /* increment reader */
       +                                herodotus_read_codepoint(r, true, &cp);
                                }
                        }
        
       -                if (next_wb > 0) {
       -                        /* get character length */
       -                        res = get_codepoint(src, srclen, srcoff, &cp);
       -
       -                        /* we have a cased character at srcoff, check if it's titlecase */
       -                        if (get_codepoint == get_codepoint_utf8) {
       -                                if (!is_case((const char *)src + srcoff,
       -                                              srclen - srcoff, res,
       -                                              get_codepoint_utf8, title_major,
       -                                              title_minor, title_special, &tmp_output)) {
       -                                        if (output) {
       -                                                *output = srcoff + tmp_output;
       -                                        }
       -                                        return false;
       -                                }
       -                        } else {
       -                                if (!is_case((const uint_least32_t *)src + srcoff,
       -                                              srclen - srcoff, res,
       -                                              get_codepoint, title_major,
       -                                              title_minor, title_special, &tmp_output)) {
       -                                        if (output) {
       -                                                *output = srcoff + tmp_output;
       -                                        }
       -                                        return false;
       -                                }
       +                if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
       +                        /* we are done */
       +                        break;
       +                } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
       +                        /*
       +                         * we did not encounter any cased character
       +                         * up to the word break
       +                         */
       +                        herodotus_reader_pop_limit(r);
       +                        continue;
       +                } else {
       +                        /*
       +                         * we encountered a cased character before the word
       +                         * break, check if it's titlecase
       +                         */
       +                        herodotus_reader_push_advance_limit(r,
       +                                herodotus_reader_next_codepoint_break(r));
       +                        if (!is_case(r, title_major, title_minor, title_special, NULL)) {
       +                                ret = false;
       +                                goto done;
                                }
       -
       -                        /* we consumed a character */
       -                        srcoff += res;
       -                        next_wb -= res;
       +                        herodotus_reader_pop_limit(r);
                        }
        
                        /* check if the rest of the codepoints in the word are lowercase */
       -                if (get_codepoint == get_codepoint_utf8) {
       -                        if (!is_case((const char *)src + srcoff,
       -                                      srclen - srcoff, next_wb,
       -                                      get_codepoint_utf8, lower_major,
       -                                      lower_minor, lower_special, &tmp_output)) {
       -                                if (output) {
       -                                        *output = srcoff + tmp_output;
       -                                }
       -                                return false;
       -                        }
       -                } else {
       -                        if (!is_case((const uint_least32_t *)src + srcoff,
       -                                      srclen - srcoff, next_wb,
       -                                      get_codepoint, lower_major,
       -                                      lower_minor, lower_special, &tmp_output)) {
       -                                if (output) {
       -                                        *output = srcoff + tmp_output;
       -                                }
       -                                return false;
       -                        }
       +                if (!is_case(r, lower_major, lower_minor, lower_special, NULL)) {
       +                        ret = false;
       +                        goto done;
                        }
       -                srcoff += next_wb;
       -        }
        
       +                /* remove the limit on the word before the next iteration */
       +                herodotus_reader_pop_limit(r);
       +        }
       +done:
                if (output) {
       -                *output = srcoff;
       +                *output = herodotus_reader_number_read(r);
                }
       -        return true;
       +        return ret;
        }
        
        bool
        grapheme_is_uppercase(const uint_least32_t *src, size_t srclen, size_t *caselen)
        {
       -        return is_case(src, srclen, srclen, get_codepoint,
       -                       upper_major, upper_minor, upper_special, caselen);
       +        HERODOTUS_READER r;
       +
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
       +
       +        return is_case(&r, upper_major, upper_minor, upper_special, caselen);
        }
        
        bool
        grapheme_is_lowercase(const uint_least32_t *src, size_t srclen, size_t *caselen)
        {
       -        return is_case(src, srclen, srclen, get_codepoint,
       -                       lower_major, lower_minor, lower_special, caselen);
       +        HERODOTUS_READER r;
       +
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
       +
       +        return is_case(&r, lower_major, lower_minor, lower_special, caselen);
        }
        
        bool
        grapheme_is_titlecase(const uint_least32_t *src, size_t srclen, size_t *caselen)
        {
       -        return is_titlecase(src, srclen, get_codepoint, caselen);
       +        HERODOTUS_READER r;
       +
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
       +
       +        return is_titlecase(&r, caselen);
        }
        
        bool
        grapheme_is_uppercase_utf8(const char *src, size_t srclen, size_t *caselen)
        {
       -        return is_case(src, srclen, srclen, get_codepoint_utf8,
       -                       upper_major, upper_minor, upper_special, caselen);
       +        HERODOTUS_READER r;
       +
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
       +
       +        return is_case(&r, upper_major, upper_minor, upper_special, caselen);
        }
        
        bool
        grapheme_is_lowercase_utf8(const char *src, size_t srclen, size_t *caselen)
        {
       -        return is_case(src, srclen, srclen, get_codepoint_utf8,
       -                       lower_major, lower_minor, lower_special, caselen);
       +        HERODOTUS_READER r;
        
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
       +
       +        return is_case(&r, lower_major, lower_minor, lower_special, caselen);
        }
        
        bool
        grapheme_is_titlecase_utf8(const char *src, size_t srclen, size_t *caselen)
        {
       -        return is_titlecase(src, srclen, get_codepoint_utf8, caselen);
       +        HERODOTUS_READER r;
       +
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
       +
       +        return is_titlecase(&r, caselen);
        }
 (DIR) diff --git a/src/character.c b/src/character.c
       @@ -1,162 +1,191 @@
        /* See LICENSE file for copyright and license details. */
       +#include <limits.h>
        #include <stdbool.h>
        #include <stddef.h>
       -#include <stdlib.h>
       -#include <string.h>
        
        #include "../gen/character.h"
        #include "../grapheme.h"
        #include "util.h"
        
       +struct character_break_state {
       +        uint_least8_t prop;
       +        bool prop_set;
       +        bool gb11_flag;
       +        bool gb12_13_flag;
       +};
       +
        static const uint_least16_t dont_break[NUM_CHAR_BREAK_PROPS] = {
                [CHAR_BREAK_PROP_OTHER] =
       -                UINT16_C(1 << CHAR_BREAK_PROP_EXTEND)       | /* GB9  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)          | /* GB9  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK),   /* GB9a */
       +                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND       | /* GB9  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ          | /* GB9  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK,   /* GB9a */
                [CHAR_BREAK_PROP_CR] =
       -                UINT16_C(1 << CHAR_BREAK_PROP_LF),            /* GB3  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_LF,            /* GB3  */
                [CHAR_BREAK_PROP_EXTEND] =
       -                UINT16_C(1 << CHAR_BREAK_PROP_EXTEND)       | /* GB9  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)          | /* GB9  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK),   /* GB9a */
       +                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND       | /* GB9  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ          | /* GB9  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK,   /* GB9a */
                [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
       -                UINT16_C(1 << CHAR_BREAK_PROP_EXTEND)       | /* GB9  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)          | /* GB9  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK),   /* GB9a */
       +                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND       | /* GB9  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ          | /* GB9  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK,   /* GB9a */
                [CHAR_BREAK_PROP_HANGUL_L] =
       -                UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_L)     | /* GB6  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_V)     | /* GB6  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_LV)    | /* GB6  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_LVT)   | /* GB6  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_EXTEND)       | /* GB9  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)          | /* GB9  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK),   /* GB9a */
       +                UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_L     | /* GB6  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V     | /* GB6  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LV    | /* GB6  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LVT   | /* GB6  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND       | /* GB9  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ          | /* GB9  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK,   /* GB9a */
                [CHAR_BREAK_PROP_HANGUL_V] =
       -                UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_V)     | /* GB7  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T)     | /* GB7  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_EXTEND)       | /* GB9  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)          | /* GB9  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK),   /* GB9a */
       +                UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V     | /* GB7  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T     | /* GB7  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND       | /* GB9  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ          | /* GB9  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK,   /* GB9a */
                [CHAR_BREAK_PROP_HANGUL_T] =
       -                UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T)     | /* GB8  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_EXTEND)       | /* GB9  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)          | /* GB9  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK),   /* GB9a */
       +                UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T     | /* GB8  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND       | /* GB9  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ          | /* GB9  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK,   /* GB9a */
                [CHAR_BREAK_PROP_HANGUL_LV] =
       -                UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_V)     | /* GB7  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T)     | /* GB7  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_EXTEND)       | /* GB9  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)          | /* GB9  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK),   /* GB9a */
       +                UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V     | /* GB7  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T     | /* GB7  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND       | /* GB9  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ          | /* GB9  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK,   /* GB9a */
                [CHAR_BREAK_PROP_HANGUL_LVT] =
       -                UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T)     | /* GB8  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_EXTEND)       | /* GB9  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)          | /* GB9  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK),   /* GB9a */
       +                UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T     | /* GB8  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND       | /* GB9  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ          | /* GB9  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK,   /* GB9a */
                [CHAR_BREAK_PROP_PREPEND] =
       -                UINT16_C(1 << CHAR_BREAK_PROP_EXTEND)       | /* GB9  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)          | /* GB9  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK)  | /* GB9a */
       +                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND       | /* GB9  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ          | /* GB9  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK  | /* GB9a */
                        (UINT16_C(0xFFFF) &
       -                 ~(UINT16_C(1 << CHAR_BREAK_PROP_CR)      |
       -                   UINT16_C(1 << CHAR_BREAK_PROP_LF)      |
       -                   UINT16_C(1 << CHAR_BREAK_PROP_CONTROL)
       +                 ~(UINT16_C(1) << CHAR_BREAK_PROP_CR      |
       +                   UINT16_C(1) << CHAR_BREAK_PROP_LF      |
       +                   UINT16_C(1) << CHAR_BREAK_PROP_CONTROL
                          )
                        ),                                           /* GB9b */
                [CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
       -                UINT16_C(1 << CHAR_BREAK_PROP_EXTEND)       | /* GB9  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)          | /* GB9  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK),   /* GB9a */
       +                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND       | /* GB9  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ          | /* GB9  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK,   /* GB9a */
                [CHAR_BREAK_PROP_SPACINGMARK] =
       -                UINT16_C(1 << CHAR_BREAK_PROP_EXTEND)       | /* GB9  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)          | /* GB9  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK),   /* GB9a */
       +                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND       | /* GB9  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ          | /* GB9  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK,   /* GB9a */
                [CHAR_BREAK_PROP_ZWJ] =
       -                UINT16_C(1 << CHAR_BREAK_PROP_EXTEND)       | /* GB9  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)          | /* GB9  */
       -                UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK),   /* GB9a */
       +                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND       | /* GB9  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ          | /* GB9  */
       +                UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK,   /* GB9a */
        };
        static const uint_least16_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
                [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
       -                UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)                   |
       -                UINT16_C(1 << CHAR_BREAK_PROP_EXTEND),
       +                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ                   |
       +                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND,
                [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
       -                UINT16_C(1 << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC),
       +                UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
                [CHAR_BREAK_PROP_EXTEND + NUM_CHAR_BREAK_PROPS] =
       -                UINT16_C(1 << CHAR_BREAK_PROP_EXTEND)                |
       -                UINT16_C(1 << CHAR_BREAK_PROP_ZWJ),
       +                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND                |
       +                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ,
                [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_CHAR_BREAK_PROPS] =
       -                UINT16_C(1 << CHAR_BREAK_PROP_ZWJ)                   |
       -                UINT16_C(1 << CHAR_BREAK_PROP_EXTEND),
       +                UINT16_C(1) << CHAR_BREAK_PROP_ZWJ                   |
       +                UINT16_C(1) << CHAR_BREAK_PROP_EXTEND,
        };
        static const uint_least16_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
                [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
       -                UINT16_C(1 << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC),
       +                UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
        };
        static const uint_least16_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
                [CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
       -                UINT16_C(1 << CHAR_BREAK_PROP_REGIONAL_INDICATOR),
       +                UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
        };
        static const uint_least16_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
                [CHAR_BREAK_PROP_REGIONAL_INDICATOR + NUM_CHAR_BREAK_PROPS] =
       -                UINT16_C(1 << CHAR_BREAK_PROP_REGIONAL_INDICATOR),
       +                UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
        };
        
        static inline enum char_break_property
        get_break_prop(uint_least32_t cp)
        {
       -        if (likely(cp <= 0x10FFFF)) {
       +        if (likely(cp <= UINT32_C(0x10FFFF))) {
                        return (enum char_break_property)
       -                       char_break_minor[char_break_major[cp >> 8] + (cp & 0xff)];
       +                       char_break_minor[char_break_major[cp >> 8] + (cp & 0xFF)];
                } else {
                        return CHAR_BREAK_PROP_OTHER;
                }
        }
        
       +static inline void
       +state_serialize(const struct character_break_state *in, uint_least16_t *out)
       +{
       +        *out = (uint_least16_t)(in->prop & UINT8_C(0xFF))                   | /* first 8 bits */
       +               (uint_least16_t)(((uint_least16_t)(in->prop_set))     <<  8) | /* 9th bit */
       +               (uint_least16_t)(((uint_least16_t)(in->gb11_flag))    <<  9) | /* 10th bit */
       +               (uint_least16_t)(((uint_least16_t)(in->gb12_13_flag)) << 10);  /* 11th bit */
       +}
       +
       +static inline void
       +state_deserialize(uint_least16_t in, struct character_break_state *out)
       +{
       +        out->prop         = in & UINT8_C(0xFF);
       +        out->prop_set     = in & (UINT16_C(1) <<  8);
       +        out->gb11_flag    = in & (UINT16_C(1) <<  9);
       +        out->gb12_13_flag = in & (UINT16_C(1) << 10);
       +}
       +
        bool
       -grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STATE *state)
       +grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, uint_least16_t *s)
        {
       +        struct character_break_state state;
                enum char_break_property cp0_prop, cp1_prop;
                bool notbreak = false;
        
       -        if (likely(state)) {
       -                if (likely(state->prop_set)) {
       -                        cp0_prop = state->prop;
       +        if (likely(s)) {
       +                state_deserialize(*s, &state);
       +
       +                if (likely(state.prop_set)) {
       +                        cp0_prop = state.prop;
                        } else {
                                cp0_prop = get_break_prop(cp0);
                        }
                        cp1_prop = get_break_prop(cp1);
        
                        /* preserve prop of right codepoint for next iteration */
       -                state->prop = (uint_least8_t)cp1_prop;
       -                state->prop_set = true;
       +                state.prop = (uint_least8_t)cp1_prop;
       +                state.prop_set = true;
        
                        /* update flags */
       -                state->gb11_flag =
       +                state.gb11_flag =
                                flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS *
       -                                         state->gb11_flag] &
       -                        UINT16_C(1 << cp1_prop);
       -                state->gb12_13_flag =
       +                                         state.gb11_flag] &
       +                        UINT16_C(1) << cp1_prop;
       +                state.gb12_13_flag =
                                flag_update_gb12_13[cp0_prop + NUM_CHAR_BREAK_PROPS *
       -                                            state->gb12_13_flag] &
       -                        UINT16_C(1 << cp1_prop);
       +                                            state.gb12_13_flag] &
       +                        UINT16_C(1) << cp1_prop;
        
                        /*
                         * Apply grapheme cluster breaking algorithm (UAX #29), see
                         * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
                         */
       -                notbreak = (dont_break[cp0_prop] & UINT16_C(1 << cp1_prop)) ||
       -                           (dont_break_gb11[cp0_prop + state->gb11_flag *
       +                notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
       +                           (dont_break_gb11[cp0_prop + state.gb11_flag *
                                                    NUM_CHAR_BREAK_PROPS] &
       -                            UINT16_C(1 << cp1_prop)) ||
       -                           (dont_break_gb12_13[cp0_prop + state->gb12_13_flag *
       +                            (UINT16_C(1) << cp1_prop)) ||
       +                           (dont_break_gb12_13[cp0_prop + state.gb12_13_flag *
                                                       NUM_CHAR_BREAK_PROPS] &
       -                            UINT16_C(1 << cp1_prop));
       +                            (UINT16_C(1) << cp1_prop));
        
                        /* update or reset flags (when we have a break) */
                        if (likely(!notbreak)) {
       -                        state->gb11_flag = state->gb12_13_flag = false;
       +                        state.gb11_flag = state.gb12_13_flag = false;
                        }
       +
       +                state_serialize(&state, s);
                } else {
                        cp0_prop = get_break_prop(cp0);
                        cp1_prop = get_break_prop(cp1);
       @@ -168,69 +197,47 @@ grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STA
                         * Given we have no state, this behaves as if the state-booleans
                         * were all set to false
                         */
       -                notbreak = (dont_break[cp0_prop] & UINT16_C(1 << cp1_prop)) ||
       -                           (dont_break_gb11[cp0_prop] & UINT16_C(1 << cp1_prop)) ||
       -                           (dont_break_gb12_13[cp0_prop] & UINT16_C(1 << cp1_prop));
       +                notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
       +                           (dont_break_gb11[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
       +                           (dont_break_gb12_13[cp0_prop] & (UINT16_C(1) << cp1_prop));
                }
        
                return !notbreak;
        }
        
       -size_t
       -grapheme_next_character_break(const uint_least32_t *str, size_t len)
       +static size_t
       +next_character_break(HERODOTUS_READER *r)
        {
       -        GRAPHEME_STATE state = { 0 };
       -        size_t off;
       -
       -        if (str == NULL || len == 0) {
       -                return 0;
       -        }
       +        uint_least16_t state = 0;
       +        uint_least32_t cp0 = 0, cp1 = 0;
        
       -        for (off = 1; off < len; off++) {
       -                if (grapheme_is_character_break(str[off - 1], str[off], &state)) {
       +        for (herodotus_read_codepoint(r, true, &cp0);
       +             herodotus_read_codepoint(r, false, &cp1) == HERODOTUS_STATUS_SUCCESS;
       +             herodotus_read_codepoint(r, true, &cp0)) {
       +                if (grapheme_is_character_break(cp0, cp1, &state)) {
                                break;
                        }
                }
        
       -        return off;
       +        return herodotus_reader_number_read(r);
        }
        
        size_t
       -grapheme_next_character_break_utf8(const char *str, size_t len)
       +grapheme_next_character_break(const uint_least32_t *str, size_t len)
        {
       -        GRAPHEME_STATE state = { 0 };
       -        uint_least32_t cp0 = 0, cp1 = 0;
       -        size_t off, ret;
       -
       -        if (str == NULL || len == 0) {
       -                return 0;
       -        }
       +        HERODOTUS_READER r;
        
       -        for (off = 0; (len == SIZE_MAX) || off < len; off += ret) {
       -                cp0 = cp1;
       -                ret = grapheme_decode_utf8(str + off, (len == SIZE_MAX) ?
       -                                           SIZE_MAX : len - off, &cp1);
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
        
       -                if (len != SIZE_MAX && ret > (len - off)) {
       -                        /* string ended abruptly, simply accept cropping */
       -                        ret = len - off;
       -                }
       +        return next_character_break(&r);
       +}
        
       -                if (len == SIZE_MAX && cp1 == 0) {
       -                        /* we hit a NUL-byte and are done */
       -                        break;
       -                }
       +size_t
       +grapheme_next_character_break_utf8(const char *str, size_t len)
       +{
       +        HERODOTUS_READER r;
        
       -                if (off == 0) {
       -                        /*
       -                         * we skip the first round, as we need both
       -                         * cp0 and cp1 to be initialized
       -                         */
       -                        continue;
       -                } else if (grapheme_is_character_break(cp0, cp1, &state)) {
       -                        break;
       -                }
       -        }
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
        
       -        return off;
       +        return next_character_break(&r);
        }
 (DIR) diff --git a/src/line.c b/src/line.c
       @@ -1,8 +1,6 @@
        /* See LICENSE file for copyright and license details. */
        #include <stdbool.h>
        #include <stddef.h>
       -#include <stdlib.h>
       -#include <string.h>
        
        #include "../gen/line.h"
        #include "../grapheme.h"
       @@ -11,7 +9,7 @@
        static inline enum line_break_property
        get_break_prop(uint_least32_t cp)
        {
       -        if (likely(cp <= 0x10FFFF)) {
       +        if (likely(cp <= UINT32_C(0x10FFFF))) {
                        return (enum line_break_property)
                               line_break_minor[line_break_major[cp >> 8] + (cp & 0xff)];
                } else {
       @@ -20,22 +18,15 @@ get_break_prop(uint_least32_t cp)
        }
        
        static size_t
       -next_line_break(const void *str, size_t len, size_t (*get_codepoint)
       -                (const void *, size_t, size_t, uint_least32_t *))
       +next_line_break(HERODOTUS_READER *r)
        {
       +        HERODOTUS_READER tmp;
                enum line_break_property cp0_prop, cp1_prop, last_non_cm_or_zwj_prop,
                                         last_non_sp_prop, last_non_sp_cm_or_zwj_prop;
       -        enum line_break_property res;
                uint_least32_t cp;
                uint_least8_t lb25_level = 0;
       -        size_t off, new_off;
                bool lb21a_flag = false, ri_even = true;
        
       -        /* check degenerate cases */
       -        if (str == NULL || len == 0) {
       -                return 0;
       -        }
       -
                /*
                 * Apply line breaking algorithm (UAX #14), see
                 * https://unicode.org/reports/tr14/#Algorithm and tailoring
       @@ -49,24 +40,14 @@ next_line_break(const void *str, size_t len, size_t (*get_codepoint)
                 * Initialize the different properties such that we have
                 * a good state after the state-update in the loop
                 */
       -        cp0_prop = NUM_LINE_BREAK_PROPS;
       -        if ((off = get_codepoint(str, len, 0, &cp)) >= len) {
       -                return 1;
       -        }
       -        cp1_prop = get_break_prop(cp);
                last_non_cm_or_zwj_prop = LINE_BREAK_PROP_AL; /* according to LB10 */
                last_non_sp_prop = last_non_sp_cm_or_zwj_prop = NUM_LINE_BREAK_PROPS;
        
       -        for (; off < len; off = new_off) {
       -                /* update state */
       -                cp0_prop = cp1_prop;
       -                if ((new_off = off + get_codepoint(str, len, off, &cp)) <= len) {
       -                        get_codepoint(str, len, off, &cp);
       -                        cp1_prop = get_break_prop(cp);
       -                } else {
       -                        /* LB3 */
       -                        break;
       -                }
       +        for (herodotus_read_codepoint(r, true, &cp), cp0_prop = get_break_prop(cp);
       +             herodotus_read_codepoint(r, false, &cp) == HERODOTUS_STATUS_SUCCESS;
       +             herodotus_read_codepoint(r, true, &cp), cp0_prop = cp1_prop) {
       +                /* get property of the right codepoint */
       +                cp1_prop = get_break_prop(cp);
        
                        /* update retention-states */
        
       @@ -378,14 +359,14 @@ next_line_break(const void *str, size_t len, size_t (*get_codepoint)
                                 * two adjacent codepoints as we have it with
                                 * characters.
                                 */
       -                        if (new_off < len &&
       +                        herodotus_reader_copy(r, &tmp);
       +                        herodotus_read_codepoint(&tmp, true, &cp);
       +                        if (herodotus_read_codepoint(&tmp, true, &cp) ==
       +                            HERODOTUS_STATUS_SUCCESS &&
                                    (cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
                                     cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF    ||
                                     cp1_prop == LINE_BREAK_PROP_HY)) {
       -                                get_codepoint(str, len, new_off, &cp);
       -                                res = get_break_prop(cp);
       -
       -                                if (res == LINE_BREAK_PROP_NU) {
       +                                if (get_break_prop(cp) == LINE_BREAK_PROP_NU) {
                                                continue;
                                        }
                                }
       @@ -505,17 +486,25 @@ next_line_break(const void *str, size_t len, size_t (*get_codepoint)
                        break;
                }
        
       -        return off;
       +        return herodotus_reader_number_read(r);
        }
        
        size_t
        grapheme_next_line_break(const uint_least32_t *str, size_t len)
        {
       -        return next_line_break(str, len, get_codepoint);
       +        HERODOTUS_READER r;
       +
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
       +
       +        return next_line_break(&r);
        }
        
        size_t
        grapheme_next_line_break_utf8(const char *str, size_t len)
        {
       -        return next_line_break(str, len, get_codepoint_utf8);
       +        HERODOTUS_READER r;
       +
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
       +
       +        return next_line_break(&r);
        }
 (DIR) diff --git a/src/sentence.c b/src/sentence.c
       @@ -1,18 +1,22 @@
        /* See LICENSE file for copyright and license details. */
        #include <stdbool.h>
        #include <stddef.h>
       -#include <stdlib.h>
       -#include <string.h>
        
        #include "../gen/sentence.h"
        #include "../grapheme.h"
        #include "util.h"
        
       -static inline enum sentence_break_property
       -get_break_prop(uint_least32_t cp)
       +struct sentence_break_state
        {
       -        if (likely(cp <= 0x10FFFF)) {
       -                return (enum sentence_break_property)
       +        uint_least8_t aterm_close_sp_level;
       +        uint_least8_t saterm_close_sp_parasep_level;
       +};
       +
       +static inline uint_least8_t
       +get_sentence_break_prop(uint_least32_t cp)
       +{
       +        if (likely(cp <= UINT32_C(0x10FFFF))) {
       +                return (uint_least8_t)
                               sentence_break_minor[sentence_break_major[cp >> 8] +
                               (cp & 0xff)];
                } else {
       @@ -20,239 +24,157 @@ get_break_prop(uint_least32_t cp)
                }
        }
        
       -static size_t
       -next_sentence_break(const void *str, size_t len, size_t (*get_codepoint)
       -                    (const void *, size_t, size_t, uint_least32_t *))
       +static bool
       +is_skippable_sentence_prop(uint_least8_t prop)
        {
       -        struct {
       -                enum sentence_break_property a, b, c, d;
       -        } raw, skip;
       -        enum sentence_break_property res;
       -        uint_least32_t cp;
       -        uint_least8_t aterm_close_sp_level = 0,
       -                      saterm_close_sp_parasep_level = 0;
       -        size_t off, tmp, new_off;
       +        return prop == SENTENCE_BREAK_PROP_EXTEND ||
       +               prop == SENTENCE_BREAK_PROP_FORMAT;
       +}
        
       -        /* check degenerate cases */
       -        if (str == NULL || len == 0) {
       -                return 0;
       -        }
       +static void
       +sentence_skip_shift_callback(uint_least8_t prop, void *s)
       +{
       +        struct sentence_break_state *state = (struct sentence_break_state *)s;
        
                /*
       -         * Apply sentence breaking algorithm (UAX #29), see
       -         * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules
       +         * Here comes a bit of magic. The rules
       +         * SB8, SB8a, SB9 and SB10 have very complicated
       +         * left-hand-side-rules of the form
                 *
       -         * There are 4 slots (a, b, c, d) of "break" properties and
       -         * we check if there is a break in the middle between b and c.
       +         *  ATerm Close* Sp*
       +         *  SATerm Close*
       +         *  SATerm Close* Sp*
       +         *  SATerm Close* Sp* ParaSep?
                 *
       -         * The position of this middle spot is determined by off,
       -         * which gives the offset of the first element on the right
       -         * hand side of said spot, or, in other words, gives the number
       -         * of elements on the left hand side.
       +         * but instead of backtracking, we keep the
       +         * state as some kind of "power level" in
       +         * two state-variables
                 *
       -         * It is further complicated by the fact that the algorithm
       -         * expects you to skip certain characters for the second
       -         * half of the rules (after SB5). Thus, we do not only have
       -         * the "raw" properties as described above, but also the "skip"
       -         * properties, where the skip.a and skip.b, for instance,
       -         * give the two preceding character properties behind the
       -         * currently investigated breakpoint.
       +         *  aterm_close_sp_level
       +         *  saterm_close_sp_parasep_level
       +         *
       +         * that go from 0 to 3/4:
       +         *
       +         *  0: we are not in the sequence
       +         *  1: we have one ATerm/SATerm to the left of
       +         *     the middle spot
       +         *  2: we have one ATerm/SATerm and one or more
       +         *     Close to the left of the middle spot
       +         *  3: we have one ATerm/SATerm, zero or more
       +         *     Close and one or more Sp to the left of
       +         *     the middle spot.
       +         *  4: we have one SATerm, zero or more Close,
       +         *     zero or more Sp and one ParaSep to the
       +         *     left of the middle spot.
                 *
                 */
       -
       -        /*
       -         * Initialize the different properties such that we have
       -         * a good state after the state-update in the loop
       -         */
       -        raw.b = NUM_SENTENCE_BREAK_PROPS;
       -        if ((off = get_codepoint(str, len, 0, &cp)) >= len) {
       -                return 1;
       +        if ((state->aterm_close_sp_level == 0 ||
       +             state->aterm_close_sp_level == 1) &&
       +            prop == SENTENCE_BREAK_PROP_ATERM) {
       +                /* sequence has begun */
       +                state->aterm_close_sp_level = 1;
       +        } else if ((state->aterm_close_sp_level == 1 ||
       +                    state->aterm_close_sp_level == 2) &&
       +                   prop == SENTENCE_BREAK_PROP_CLOSE) {
       +                /* close-sequence begins or continues */
       +                state->aterm_close_sp_level = 2;
       +        } else if ((state->aterm_close_sp_level == 1 ||
       +                    state->aterm_close_sp_level == 2 ||
       +                    state->aterm_close_sp_level == 3) &&
       +                   prop == SENTENCE_BREAK_PROP_SP) {
       +                /* sp-sequence begins or continues */
       +                state->aterm_close_sp_level = 3;
       +        } else {
       +                /* sequence broke */
       +                state->aterm_close_sp_level = 0;
                }
       -        raw.c = get_break_prop(cp);
       -        (void)get_codepoint(str, len, off, &cp);
       -        raw.d = get_break_prop(cp);
       -        skip.a = skip.b = NUM_SENTENCE_BREAK_PROPS;
       -
       -        for (; off < len; off = new_off) {
       -                /*
       -                 * Update left side (a and b) of the skip state by
       -                 * "shifting in" the raw.c property as long as it is
       -                 * not one of the "ignored" character properties.
       -                 * While at it, update the RI-counter.
       -                 *
       -                 */
       -                if (raw.c != SENTENCE_BREAK_PROP_EXTEND &&
       -                    raw.c != SENTENCE_BREAK_PROP_FORMAT) {
       -                            skip.a = skip.b;
       -                        skip.b = raw.c;
       -
       -                        /*
       -                         * Here comes a bit of magic. The rules
       -                         * SB8, SB8a, SB9 and SB10 have very complicated
       -                         * left-hand-side-rules of the form
       -                         *
       -                         *  ATerm Close* Sp*
       -                         *  SATerm Close*
       -                         *  SATerm Close* Sp*
       -                         *  SATerm Close* Sp* ParaSep?
       -                         * 
       -                         * but instead of backtracking, we keep the
       -                         * state as some kind of "power level" in
       -                         * two variables
       -                         *
       -                         *  aterm_close_sp_level
       -                         *  saterm_close_sp_parasep_level
       -                         * 
       -                         * that go from 0 to 3/4:
       -                         *
       -                         *  0: we are not in the sequence
       -                         *  1: we have one ATerm/SATerm to the left of
       -                         *     the middle spot
       -                         *  2: we have one ATerm/SATerm and one or more
       -                         *     Close to the left of the middle spot
       -                         *  3: we have one ATerm/SATerm, zero or more
       -                         *     Close and one or more Sp to the left of
       -                         *     the middle spot.
       -                         *  4: we have one SATerm, zero or more Close,
       -                         *     zero or more Sp and one ParaSep to the
       -                         *     left of the middle spot.
       -                         *
       -                         */
       -                        if ((aterm_close_sp_level == 0 ||
       -                             aterm_close_sp_level == 1) &&
       -                            skip.b == SENTENCE_BREAK_PROP_ATERM) {
       -                                    /* sequence has begun */
       -                                aterm_close_sp_level = 1;
       -                        } else if ((aterm_close_sp_level == 1 ||
       -                                    aterm_close_sp_level == 2) &&
       -                                   skip.b == SENTENCE_BREAK_PROP_CLOSE) {
       -                                /* close-sequence begins or continued */
       -                                aterm_close_sp_level = 2;
       -                        } else if ((aterm_close_sp_level == 1 ||
       -                                    aterm_close_sp_level == 2 ||
       -                                    aterm_close_sp_level == 3) &&
       -                                   skip.b == SENTENCE_BREAK_PROP_SP) {
       -                                /* sp-sequence begins or continued */
       -                                aterm_close_sp_level = 3;
       -                        } else {
       -                                /* sequence broke */
       -                                aterm_close_sp_level = 0;
       -                        }
        
       -                        if ((saterm_close_sp_parasep_level == 0 ||
       -                             saterm_close_sp_parasep_level == 1) &&
       -                            (skip.b == SENTENCE_BREAK_PROP_STERM ||
       -                             skip.b == SENTENCE_BREAK_PROP_ATERM)) {
       -                                    /* sequence has begun */
       -                                saterm_close_sp_parasep_level = 1;
       -                        } else if ((saterm_close_sp_parasep_level == 1 ||
       -                                    saterm_close_sp_parasep_level == 2) &&
       -                                   skip.b == SENTENCE_BREAK_PROP_CLOSE) {
       -                                /* close-sequence begins or continued */
       -                                saterm_close_sp_parasep_level = 2;
       -                        } else if ((saterm_close_sp_parasep_level == 1 ||
       -                                    saterm_close_sp_parasep_level == 2 ||
       -                                    saterm_close_sp_parasep_level == 3) &&
       -                                   skip.b == SENTENCE_BREAK_PROP_SP) {
       -                                /* sp-sequence begins or continued */
       -                                saterm_close_sp_parasep_level = 3;
       -                        } else if ((saterm_close_sp_parasep_level == 1 ||
       -                                    saterm_close_sp_parasep_level == 2 ||
       -                                    saterm_close_sp_parasep_level == 3) &&
       -                                   (skip.b == SENTENCE_BREAK_PROP_SEP ||
       -                                    skip.b == SENTENCE_BREAK_PROP_CR  ||
       -                                    skip.b == SENTENCE_BREAK_PROP_LF)) {
       -                                /* ParaSep at the end of the sequence */
       -                                saterm_close_sp_parasep_level = 4;
       -                        } else {
       -                                /* sequence broke */
       -                                saterm_close_sp_parasep_level = 0;
       -                        }
       -                }
       -
       -                /*
       -                 * Update right side (b and c) of the skip state by
       -                 * starting at the breakpoint and detecting the two
       -                 * following non-ignored character classes
       -                 *
       -                 */
       -                skip.c = NUM_SENTENCE_BREAK_PROPS;
       -                for (tmp = off; tmp < len; ) {
       -                        tmp += get_codepoint(str, len, tmp, &cp);
       -                        res = get_break_prop(cp);
       -
       -                        if (res != SENTENCE_BREAK_PROP_EXTEND &&
       -                            res != SENTENCE_BREAK_PROP_FORMAT) {
       -                                skip.c = res;
       -                                break;
       -                        }
       -                }
       -                skip.d = NUM_SENTENCE_BREAK_PROPS;
       -                for (; tmp < len; ) {
       -                        tmp += get_codepoint(str, len, tmp, &cp);
       -                        res = get_break_prop(cp);
       +        if ((state->saterm_close_sp_parasep_level == 0 ||
       +             state->saterm_close_sp_parasep_level == 1) &&
       +            (prop == SENTENCE_BREAK_PROP_STERM ||
       +             prop == SENTENCE_BREAK_PROP_ATERM)) {
       +                /* sequence has begun */
       +                state->saterm_close_sp_parasep_level = 1;
       +        } else if ((state->saterm_close_sp_parasep_level == 1 ||
       +                    state->saterm_close_sp_parasep_level == 2) &&
       +                   prop == SENTENCE_BREAK_PROP_CLOSE) {
       +                /* close-sequence begins or continues */
       +                state->saterm_close_sp_parasep_level = 2;
       +        } else if ((state->saterm_close_sp_parasep_level == 1 ||
       +                    state->saterm_close_sp_parasep_level == 2 ||
       +                    state->saterm_close_sp_parasep_level == 3) &&
       +                   prop == SENTENCE_BREAK_PROP_SP) {
       +                /* sp-sequence begins or continues */
       +                state->saterm_close_sp_parasep_level = 3;
       +        } else if ((state->saterm_close_sp_parasep_level == 1 ||
       +                    state->saterm_close_sp_parasep_level == 2 ||
       +                    state->saterm_close_sp_parasep_level == 3) &&
       +                   (prop == SENTENCE_BREAK_PROP_SEP ||
       +                    prop == SENTENCE_BREAK_PROP_CR  ||
       +                    prop == SENTENCE_BREAK_PROP_LF)) {
       +                /* ParaSep at the end of the sequence */
       +                state->saterm_close_sp_parasep_level = 4;
       +        } else {
       +                /* sequence broke */
       +                state->saterm_close_sp_parasep_level = 0;
       +        }
       +}
        
       -                        if (res != SENTENCE_BREAK_PROP_EXTEND &&
       -                            res != SENTENCE_BREAK_PROP_FORMAT) {
       -                                skip.d = res;
       -                                break;
       -                        }
       -                }
       +static size_t
       +next_sentence_break(HERODOTUS_READER *r)
       +{
       +        HERODOTUS_READER tmp;
       +        enum sentence_break_property prop;
       +        struct proper p;
       +        struct sentence_break_state state = { 0 };
       +        uint_least32_t cp;
        
       -                /*
       -                 * Update the raw state by simply shifting everything
       -                 * in and, if we still have data left, determining
       -                 * the character class of the next codepoint.
       -                 *
       -                 */
       -                raw.a = raw.b;
       -                raw.b = raw.c;
       -                raw.c = raw.d;
       -                if ((new_off = off + get_codepoint(str, len, off, &cp)) < len) {
       -                        get_codepoint(str, len, new_off, &cp);
       -                        raw.d = get_break_prop(cp);
       -                } else {
       -                        raw.d = NUM_SENTENCE_BREAK_PROPS;
       -                }
       +        /*
       +         * Apply sentence breaking algorithm (UAX #29), see
       +         * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules
       +         */
       +        proper_init(r, &state, NUM_SENTENCE_BREAK_PROPS,
       +                    get_sentence_break_prop, is_skippable_sentence_prop,
       +                    sentence_skip_shift_callback, &p);
        
       +        while (!proper_advance(&p)) {
                        /* SB3 */
       -                if (raw.b == SENTENCE_BREAK_PROP_CR &&
       -                    raw.c == SENTENCE_BREAK_PROP_LF) {
       +                if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR &&
       +                    p.raw.next_prop[0] == SENTENCE_BREAK_PROP_LF) {
                                continue;
                        }
        
                        /* SB4 */
       -                if (raw.b == SENTENCE_BREAK_PROP_SEP ||
       -                    raw.b == SENTENCE_BREAK_PROP_CR  ||
       -                    raw.b == SENTENCE_BREAK_PROP_LF) {
       +                if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_SEP ||
       +                    p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR  ||
       +                    p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_LF) {
                                break;
                        }
        
                        /* SB5 */
       -                if (raw.c == SENTENCE_BREAK_PROP_EXTEND ||
       -                    raw.c == SENTENCE_BREAK_PROP_FORMAT) {
       +                if (p.raw.next_prop[0] == SENTENCE_BREAK_PROP_EXTEND ||
       +                    p.raw.next_prop[0] == SENTENCE_BREAK_PROP_FORMAT) {
                                continue;
                        }
        
                        /* SB6 */
       -                if (skip.b == SENTENCE_BREAK_PROP_ATERM &&
       -                    skip.c == SENTENCE_BREAK_PROP_NUMERIC) {
       +                if (p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
       +                    p.skip.next_prop[0] == SENTENCE_BREAK_PROP_NUMERIC) {
                                continue;
                        }
        
                        /* SB7 */
       -                if (off > 1 &&
       -                    (skip.a == SENTENCE_BREAK_PROP_UPPER ||
       -                     skip.a == SENTENCE_BREAK_PROP_LOWER) &&
       -                    skip.b == SENTENCE_BREAK_PROP_ATERM &&
       -                    skip.c == SENTENCE_BREAK_PROP_UPPER) {
       +                if ((p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_UPPER ||
       +                     p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_LOWER) &&
       +                    p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
       +                    p.skip.next_prop[0] == SENTENCE_BREAK_PROP_UPPER) {
                                continue;
                        }
        
                        /* SB8 */
       -                if (aterm_close_sp_level == 1 ||
       -                    aterm_close_sp_level == 2 ||
       -                    aterm_close_sp_level == 3) {
       +                if (state.aterm_close_sp_level == 1 ||
       +                    state.aterm_close_sp_level == 2 ||
       +                    state.aterm_close_sp_level == 3) {
                                /*
                                 * This is the most complicated rule, requiring
                                 * the right-hand-side to satisfy the regular expression
       @@ -260,67 +182,75 @@ next_sentence_break(const void *str, size_t len, size_t (*get_codepoint)
                                 *  ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
                                 *
                                 * which we simply check "manually" given LUT-lookups
       -                         * are very cheap.
       +                         * are very cheap by starting at the mid_reader.
                                 *
                                 */
       -                        for (tmp = off, res = NUM_SENTENCE_BREAK_PROPS; tmp < len; ) {
       -                                tmp += get_codepoint(str, len, tmp, &cp);
       -                                res = get_break_prop(cp);
       +                        herodotus_reader_copy(&(p.mid_reader), &tmp);
       +
       +                        prop = NUM_SENTENCE_BREAK_PROPS;
       +                        while (herodotus_read_codepoint(&tmp, true, &cp) ==
       +                               HERODOTUS_STATUS_SUCCESS) {
       +                                prop = get_sentence_break_prop(cp);
        
       -                                if (res == SENTENCE_BREAK_PROP_OLETTER ||
       -                                    res == SENTENCE_BREAK_PROP_UPPER   ||
       -                                    res == SENTENCE_BREAK_PROP_LOWER   ||
       -                                    res == SENTENCE_BREAK_PROP_SEP     ||
       -                                    res == SENTENCE_BREAK_PROP_CR      ||
       -                                    res == SENTENCE_BREAK_PROP_LF      ||
       -                                    res == SENTENCE_BREAK_PROP_STERM   ||
       -                                    res == SENTENCE_BREAK_PROP_ATERM) {
       +                                /*
       +                                 * the skippable properties are ignored
       +                                 * automatically here given they do not
       +                                 * match the following condition
       +                                 */
       +                                if (prop == SENTENCE_BREAK_PROP_OLETTER ||
       +                                    prop == SENTENCE_BREAK_PROP_UPPER   ||
       +                                    prop == SENTENCE_BREAK_PROP_LOWER   ||
       +                                    prop == SENTENCE_BREAK_PROP_SEP     ||
       +                                    prop == SENTENCE_BREAK_PROP_CR      ||
       +                                    prop == SENTENCE_BREAK_PROP_LF      ||
       +                                    prop == SENTENCE_BREAK_PROP_STERM   ||
       +                                    prop == SENTENCE_BREAK_PROP_ATERM) {
                                                break;
                                        }
                                }
        
       -                        if (res == SENTENCE_BREAK_PROP_LOWER) {
       +                        if (prop == SENTENCE_BREAK_PROP_LOWER) {
                                        continue;
                                }
                        }
        
                        /* SB8a */
       -                if ((saterm_close_sp_parasep_level == 1 ||
       -                     saterm_close_sp_parasep_level == 2 ||
       -                     saterm_close_sp_parasep_level == 3) &&
       -                    (skip.c == SENTENCE_BREAK_PROP_SCONTINUE ||
       -                     skip.c == SENTENCE_BREAK_PROP_STERM     ||
       -                     skip.c == SENTENCE_BREAK_PROP_ATERM)) {
       +                if ((state.saterm_close_sp_parasep_level == 1 ||
       +                     state.saterm_close_sp_parasep_level == 2 ||
       +                     state.saterm_close_sp_parasep_level == 3) &&
       +                    (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SCONTINUE ||
       +                     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_STERM     ||
       +                     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_ATERM)) {
                                continue;
                        }
        
                        /* SB9 */
       -                if ((saterm_close_sp_parasep_level == 1 ||
       -                     saterm_close_sp_parasep_level == 2) &&
       -                    (skip.c == SENTENCE_BREAK_PROP_CLOSE ||
       -                     skip.c == SENTENCE_BREAK_PROP_SP    ||
       -                     skip.c == SENTENCE_BREAK_PROP_SEP   ||
       -                     skip.c == SENTENCE_BREAK_PROP_CR    ||
       -                     skip.c == SENTENCE_BREAK_PROP_LF)) {
       +                if ((state.saterm_close_sp_parasep_level == 1 ||
       +                     state.saterm_close_sp_parasep_level == 2) &&
       +                    (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CLOSE ||
       +                     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP    ||
       +                     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP   ||
       +                     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR    ||
       +                     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
                                continue;
                        }
        
                        /* SB10 */
       -                if ((saterm_close_sp_parasep_level == 1 ||
       -                     saterm_close_sp_parasep_level == 2 ||
       -                     saterm_close_sp_parasep_level == 3) &&
       -                    (skip.c == SENTENCE_BREAK_PROP_SP  ||
       -                     skip.c == SENTENCE_BREAK_PROP_SEP ||
       -                     skip.c == SENTENCE_BREAK_PROP_CR  ||
       -                     skip.c == SENTENCE_BREAK_PROP_LF)) {
       +                if ((state.saterm_close_sp_parasep_level == 1 ||
       +                     state.saterm_close_sp_parasep_level == 2 ||
       +                     state.saterm_close_sp_parasep_level == 3) &&
       +                    (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP  ||
       +                     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
       +                     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR  ||
       +                     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
                                continue;
                        }
        
                        /* SB11 */
       -                if (saterm_close_sp_parasep_level == 1 ||
       -                    saterm_close_sp_parasep_level == 2 ||
       -                    saterm_close_sp_parasep_level == 3 ||
       -                    saterm_close_sp_parasep_level == 4) {
       +                if (state.saterm_close_sp_parasep_level == 1 ||
       +                    state.saterm_close_sp_parasep_level == 2 ||
       +                    state.saterm_close_sp_parasep_level == 3 ||
       +                    state.saterm_close_sp_parasep_level == 4) {
                                break;
                        }
        
       @@ -328,17 +258,25 @@ next_sentence_break(const void *str, size_t len, size_t (*get_codepoint)
                        continue;
                }
        
       -        return off;
       +        return herodotus_reader_number_read(&(p.mid_reader));
        }
        
        size_t
        grapheme_next_sentence_break(const uint_least32_t *str, size_t len)
        {
       -        return next_sentence_break(str, len, get_codepoint);
       +        HERODOTUS_READER r;
       +
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
       +
       +        return next_sentence_break(&r);
        }
        
        size_t
        grapheme_next_sentence_break_utf8(const char *str, size_t len)
        {
       -        return next_sentence_break(str, len, get_codepoint_utf8);
       +        HERODOTUS_READER r;
       +
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
       +
       +        return next_sentence_break(&r);
        }
 (DIR) diff --git a/src/utf8.c b/src/utf8.c
       @@ -1,5 +1,6 @@
        /* See LICENSE file for copyright and license details. */
       -#include <stdio.h>
       +#include <stddef.h>
       +#include <stdint.h>
        
        #include "../grapheme.h"
        #include "util.h"
 (DIR) diff --git a/src/util.c b/src/util.c
       @@ -1,70 +1,417 @@
        /* See LICENSE file for copyright and license details. */
       +#include <limits.h>
        #include <stdbool.h>
       +#include <stddef.h>
        #include <stdint.h>
       -#include <stdlib.h>
        
        #include "../gen/types.h"
        #include "../grapheme.h"
        #include "util.h"
        
       -inline size_t
       -get_codepoint(const void *str, size_t len, size_t offset, uint_least32_t *cp)
       +void
       +herodotus_reader_init(HERODOTUS_READER *r, enum herodotus_type type,
       +                      const void *src, size_t srclen)
        {
       -        if (offset < len) {
       -                *cp = ((const uint_least32_t *)str)[offset];
       -                return 1;
       -        } else {
       -                *cp = GRAPHEME_INVALID_CODEPOINT;
       -                return 0;
       +        size_t i;
       +
       +        r->type = type;
       +        r->src = src;
       +        r->srclen = srclen;
       +        r->off = 0;
       +        r->terminated_by_null = false;
       +
       +        for (i = 0; i < LEN(r->soft_limit); i++) {
       +                r->soft_limit[i] = SIZE_MAX;
                }
        }
        
       -inline size_t
       -get_codepoint_utf8(const void *str, size_t len, size_t offset, uint_least32_t *cp)
       +void
       +herodotus_reader_copy(const HERODOTUS_READER *src, HERODOTUS_READER *dest)
        {
       -        size_t ret;
       +        size_t i;
        
       -        if (offset < len) {
       -                ret = grapheme_decode_utf8((const char *)str + offset,
       -                                           len - offset, cp);
       +        /*
       +         * we copy such that we have a "fresh" start and build on the
       +         * fact that src->soft_limit[i] for any i and src->srclen are
       +         * always larger or equal to src->off
       +         */
       +        dest->type = src->type;
       +        if (src->type == HERODOTUS_TYPE_CODEPOINT) {
       +                dest->src = (src->src == NULL) ? NULL :
       +                            ((const uint_least32_t *)(src->src)) + src->off;
       +        } else { /* src->type == HERODOTUS_TYPE_UTF8 */
       +                dest->src = (src->src == NULL) ? NULL :
       +                            ((const char *)(src->src)) + src->off;
       +        }
       +        if (src->srclen == SIZE_MAX) {
       +                dest->srclen = SIZE_MAX;
       +        } else {
       +                dest->srclen = (src->off < src->srclen) ? src->srclen - src->off : 0;
       +        }
       +        dest->off = 0;
       +        dest->terminated_by_null = src->terminated_by_null;
        
       -                if (unlikely(len == SIZE_MAX && cp == 0)) {
       -                        return 0;
       +        for (i = 0; i < LEN(src->soft_limit); i++) {
       +                if (src->soft_limit[i] == SIZE_MAX) {
       +                        dest->soft_limit[i] = SIZE_MAX;
                        } else {
       -                        return ret;
       +                        /*
       +                         * if we have a degenerate case where the offset is
       +                         * higher than the soft-limit, we simply clamp the
       +                         * soft-limit to zero given we can't decide here
       +                         * to release the limit and, instead, we just
       +                         * prevent any more reads
       +                         */
       +                        dest->soft_limit[i] = (src->off < src->soft_limit[i]) ?
       +                                src->soft_limit[i] - src->off : 0;
                        }
       -        } else {
       +        }
       +}
       +
       +void
       +herodotus_reader_push_advance_limit(HERODOTUS_READER *r, size_t count)
       +{
       +        size_t i;
       +
       +        for (i = LEN(r->soft_limit) - 1; i >= 1; i--) {
       +                r->soft_limit[i] = r->soft_limit[i - 1];
       +        }
       +        r->soft_limit[0] = r->off + count;
       +}
       +
       +void
       +herodotus_reader_pop_limit(HERODOTUS_READER *r)
       +{
       +        size_t i;
       +
       +        for (i = 0; i < LEN(r->soft_limit) - 1; i++) {
       +                r->soft_limit[i] = r->soft_limit[i + 1];
       +        }
       +        r->soft_limit[LEN(r->soft_limit) - 1] = SIZE_MAX;
       +}
       +
       +size_t
       +herodotus_reader_next_word_break(const HERODOTUS_READER *r)
       +{
       +        if (r->type == HERODOTUS_TYPE_CODEPOINT) {
       +                return grapheme_next_word_break(
       +                        (const uint_least32_t *)(r->src) + r->off,
       +                        MIN(r->srclen, r->soft_limit[0]) - r->off);
       +        } else { /* r->type == HERODOTUS_TYPE_UTF8 */
       +                return grapheme_next_word_break_utf8(
       +                        (const char *)(r->src) + r->off,
       +                        MIN(r->srclen, r->soft_limit[0]) - r->off);
       +        }
       +}
       +
       +size_t
       +herodotus_reader_next_codepoint_break(const HERODOTUS_READER *r)
       +{
       +        if (r->type == HERODOTUS_TYPE_CODEPOINT) {
       +                return (r->off < MIN(r->srclen, r->soft_limit[0])) ? 1 : 0;
       +        } else { /* r->type == HERODOTUS_TYPE_UTF8 */
       +                return grapheme_decode_utf8(
       +                        (const char *)(r->src) + r->off,
       +                        MIN(r->srclen, r->soft_limit[0]) - r->off, NULL);
       +        }
       +}
       +
       +size_t
       +herodotus_reader_number_read(const HERODOTUS_READER *r)
       +{
       +        return r->off;
       +}
       +
       +enum herodotus_status
       +herodotus_read_codepoint(HERODOTUS_READER *r, bool advance, uint_least32_t *cp)
       +{
       +        size_t ret;
       +
       +        if (r->terminated_by_null || r->off >= r->srclen || r->src == NULL) {
                        *cp = GRAPHEME_INVALID_CODEPOINT;
       -                return 0;
       +                return HERODOTUS_STATUS_END_OF_BUFFER;
                }
       +
       +        if (r->off >= r->soft_limit[0]) {
       +                *cp = GRAPHEME_INVALID_CODEPOINT;
       +                return HERODOTUS_STATUS_SOFT_LIMIT_REACHED;
       +        }
       +
       +        if (r->type == HERODOTUS_TYPE_CODEPOINT) {
       +                *cp = ((const uint_least32_t *)(r->src))[r->off];
       +                ret = 1;
       +        } else { /* r->type == HERODOTUS_TYPE_UTF8 */
       +                ret = grapheme_decode_utf8((const char *)r->src + r->off,
       +                                           MIN(r->srclen, r->soft_limit[0]) -
       +                                           r->off, cp);
       +        }
       +
       +        if (unlikely(r->srclen == SIZE_MAX && *cp == 0)) {
       +                /*
       +                 * We encountered a null-codepoint. Don't increment
       +                 * offset and return as if the buffer had ended here all
       +                 * along
       +                 */
       +                r->terminated_by_null = true;
       +                return HERODOTUS_STATUS_END_OF_BUFFER;
       +        }
       +
       +        if (r->off + ret > MIN(r->srclen, r->soft_limit[0])) {
       +                /*
       +                 * we want more than we have; instead of returning
       +                 * garbage we terminate here.
       +                 */
       +                return HERODOTUS_STATUS_END_OF_BUFFER;
       +        }
       +
       +        /*
       +         * Increase offset which we now know won't surpass the limits,
       +         * unless we got told otherwise
       +         */
       +        if (advance) {
       +                r->off += ret;
       +        }
       +
       +        return HERODOTUS_STATUS_SUCCESS;
        }
        
       -inline size_t
       -set_codepoint(uint_least32_t cp, void *str, size_t len, size_t offset)
       +void
       +herodotus_writer_init(HERODOTUS_WRITER *w, enum herodotus_type type,
       +                      void *dest, size_t destlen)
        {
       -        if (str == NULL || len == 0) {
       -                return 1;
       +        w->type = type;
       +        w->dest = dest;
       +        w->destlen = destlen;
       +        w->off = 0;
       +        w->first_unwritable_offset = SIZE_MAX;
       +}
       +
       +void
       +herodotus_writer_nul_terminate(HERODOTUS_WRITER *w)
       +{
       +        if (w->dest == NULL) {
       +                return;
                }
        
       -        if (offset < len) {
       -                ((uint_least32_t *)str)[offset] = cp;
       -                return 1;
       -        } else {
       -                return 0;
       +        if (w->off < w->destlen) {
       +                /* We still have space in the buffer. Simply use it */
       +                if (w->type == HERODOTUS_TYPE_CODEPOINT) {
       +                        ((uint_least32_t *)(w->dest))[w->off] = 0;
       +                } else { /* w->type == HERODOTUS_TYPE_UTF8 */
       +                        ((char *)(w->dest))[w->off] = '\0';
       +                }
       +        } else if (w->first_unwritable_offset < w->destlen) {
       +                /*
       +                 * There is no more space in the buffer. However,
       +                 * we have noted down the first offset we couldn't
       +                 * use to write into the buffer and it's smaller than
       +                 * destlen. Thus we bailed writing into the
       +                 * destination when a multibyte-codepoint couldn't be
       +                 * written. So the last "real" byte might be at
       +                 * destlen-4, destlen-3, destlen-2 or destlen-1
       +                 * (the last case meaning truncation).
       +                 */
       +                if (w->type == HERODOTUS_TYPE_CODEPOINT) {
       +                        ((uint_least32_t *)(w->dest))
       +                                [w->first_unwritable_offset] = 0;
       +                } else { /* w->type == HERODOTUS_TYPE_UTF8 */
       +                        ((char *)(w->dest))[w->first_unwritable_offset] = '\0';
       +                }
       +        } else if (w->destlen > 0) {
       +                /*
       +                 * In this case, there is no more space in the buffer and
       +                 * the last unwritable offset is larger than
       +                 * or equal to the destination buffer length. This means
       +                 * that we are forced to simply write into the last
       +                 * byte.
       +                 */
       +                if (w->type == HERODOTUS_TYPE_CODEPOINT) {
       +                        ((uint_least32_t *)(w->dest))
       +                                [w->destlen - 1] = 0;
       +                } else { /* w->type == HERODOTUS_TYPE_UTF8 */
       +                        ((char *)(w->dest))[w->destlen - 1] = '\0';
       +                }
                }
       +
       +        /* w->off is not incremented in any case */
       +}
       +
       +size_t
       +herodotus_writer_number_written(const HERODOTUS_WRITER *w)
       +{
       +        return w->off;
        }
        
       -inline size_t
       -set_codepoint_utf8(uint_least32_t cp, void *str, size_t len, size_t offset)
       +void
       +herodotus_write_codepoint(HERODOTUS_WRITER *w, uint_least32_t cp)
        {
       -        if (str == NULL || len == 0) {
       -                return grapheme_encode_utf8(cp, NULL, 0);
       +        size_t ret;
       +
       +        /*
       +         * This function will always faithfully say how many codepoints
       +         * were written, even if the buffer ends. This is used to enable
       +         * truncation detection.
       +         */
       +        if (w->type == HERODOTUS_TYPE_CODEPOINT) {
       +                if (w->dest != NULL && w->off < w->destlen) {
       +                        ((uint_least32_t *)(w->dest))[w->off] = cp;
       +                }
       +
       +                w->off += 1;
       +        } else { /* w->type == HERODOTUS_TYPE_UTF8 */
       +                /*
       +                 * First determine how many bytes we need to encode the
       +                 * codepoint
       +                 */
       +                ret = grapheme_encode_utf8(cp, NULL, 0);
       +
       +                if (w->dest != NULL && w->off + ret < w->destlen) {
       +                        /* we still have enough room in the buffer */
       +                        grapheme_encode_utf8(cp, (char *)(w->dest) +
       +                                             w->off, w->destlen - w->off);
       +                } else if (w->first_unwritable_offset == SIZE_MAX) {
       +                        /*
       +                         * the first unwritable offset has not been
       +                         * noted down, so this is the first time we can't
       +                         * write (completely) to an offset
       +                         */
       +                        w->first_unwritable_offset = w->off;
       +                }
       +
       +                w->off += ret;
                }
       +}
       +
       +void
       +proper_init(const HERODOTUS_READER *r, void *state, uint_least8_t no_prop,
       +            uint_least8_t (*get_break_prop)(uint_least32_t),
       +            bool (*is_skippable_prop)(uint_least8_t),
       +            void (*skip_shift_callback)(uint_least8_t, void *),
       +            struct proper *p)
       +{
       +        uint_least8_t prop;
       +        uint_least32_t cp;
       +        size_t i;
       +
       +        /* set internal variables */
       +        p->state = state;
       +        p->no_prop = no_prop;
       +        p->get_break_prop = get_break_prop;
       +        p->is_skippable_prop = is_skippable_prop;
       +        p->skip_shift_callback = skip_shift_callback;
       +
       +        /*
       +         * Initialize mid-reader, which is basically just there
       +         * to reflect the current position of the viewing-line
       +         */
       +        herodotus_reader_copy(r, &(p->mid_reader));
        
       -        if (offset < len) {
       -                return grapheme_encode_utf8(cp, (char *)str + offset,
       -                                            len - offset);
       +        /*
       +         * In the initialization, we simply (try to) fill in next_prop.
       +         * If we cannot read in more (due to the buffer ending), we
       +         * fill in the prop as invalid
       +         */
       +
       +        /*
       +         * initialize the previous properties to have no property
       +         * (given we are at the start of the buffer)
       +         */
       +        p->raw.prev_prop[1] = p->raw.prev_prop[0] = p->no_prop;
       +        p->skip.prev_prop[1] = p->skip.prev_prop[0] = p->no_prop;
       +
       +        /*
       +         * initialize the next properties
       +         */
       +
       +        /* initialize the raw reader */
       +        herodotus_reader_copy(r, &(p->raw_reader));
       +
       +        /* fill in the two next raw properties (after no-initialization) */
       +        p->raw.next_prop[0] = p->raw.next_prop[1] = p->no_prop;
       +        for (i = 0; i < 2 && herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
       +             HERODOTUS_STATUS_SUCCESS; ) {
       +                p->raw.next_prop[i++] = p->get_break_prop(cp);
       +        }
       +
       +        /* initialize the skip reader */
       +        herodotus_reader_copy(r, &(p->skip_reader));
       +
       +        /* fill in the two next skip properties (after no-initialization) */
       +        p->skip.next_prop[0] = p->skip.next_prop[1] = p->no_prop;
       +        for (i = 0; i < 2 && herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
       +             HERODOTUS_STATUS_SUCCESS; ) {
       +                prop = p->get_break_prop(cp);
       +                if (!p->is_skippable_prop(prop)) {
       +                        p->skip.next_prop[i++] = prop;
       +                }
       +        }
       +}
       +
       +int
       +proper_advance(struct proper *p)
       +{
       +        uint_least8_t prop;
       +        uint_least32_t cp;
       +
       +        /* read in next "raw" property */
       +        if (herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
       +            HERODOTUS_STATUS_SUCCESS) {
       +                prop = p->get_break_prop(cp);
                } else {
       -                return grapheme_encode_utf8(cp, NULL, 0);
       +                prop = p->no_prop;
       +        }
       +
       +        /*
       +         * do a shift-in, unless we find that the property that is to
       +         * be moved past the "raw-viewing-line" (this property is stored
       +         * in p->raw.next_prop[0]) is a no_prop, indicating that
       +         * we are at the end of the buffer.
       +         */
       +        if (p->raw.next_prop[0] == p->no_prop) {
       +                return 1;
       +        }
       +
       +        /* shift in the properties */
       +        p->raw.prev_prop[1] = p->raw.prev_prop[0];
       +        p->raw.prev_prop[0] = p->raw.next_prop[0];
       +        p->raw.next_prop[0] = p->raw.next_prop[1];
       +        p->raw.next_prop[1] = prop;
       +
       +        /* advance the middle reader viewing-line */
       +        (void)herodotus_read_codepoint(&(p->mid_reader), true, &cp);
       +
       +        /* check skippability-property */
       +        if (!p->is_skippable_prop(p->raw.prev_prop[0])) {
       +                /*
       +                 * the property that has moved past the "raw-viewing-line"
       +                 * (this property is now (after the raw-shift) stored in
       +                 * p->raw.prev_prop[0] and guaranteed not to be a no-prop,
       +                 * guaranteeing that we won't shift a no-prop past the
       +                 * "viewing-line" in the skip-properties) is not a skippable
       +                 * property, thus we need to shift the skip property as well.
       +                 */
       +                p->skip.prev_prop[1] = p->skip.prev_prop[0];
       +                p->skip.prev_prop[0] = p->skip.next_prop[0];
       +                p->skip.next_prop[0] = p->skip.next_prop[1];
       +
       +                /*
       +                 * call the skip-shift-callback on the property that
       +                 * passed the skip-viewing-line (this property is now
       +                 * stored in p->skip.prev_prop[0]).
       +                 */
       +                p->skip_shift_callback(p->skip.prev_prop[0], p->state);
       +
       +                /* determine the next shift property */
       +                p->skip.next_prop[1] = p->no_prop;
       +                while (herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
       +                       HERODOTUS_STATUS_SUCCESS) {
       +                        prop = p->get_break_prop(cp);
       +                        if (!p->is_skippable_prop(prop)) {
       +                                p->skip.next_prop[1] = prop;
       +                                break;
       +                        }
       +                }
                }
       +
       +        return 0;
        }
 (DIR) diff --git a/src/util.h b/src/util.h
       @@ -2,12 +2,16 @@
        #ifndef UTIL_H
        #define UTIL_H
        
       +#include <stdbool.h>
        #include <stddef.h>
        #include <stdint.h>
        
        #include "../gen/types.h"
        #include "../grapheme.h"
        
       +#undef MIN
       +#define MIN(x,y)  ((x) < (y) ? (x) : (y))
       +#undef LEN
        #define LEN(x) (sizeof(x) / sizeof(*(x)))
        
        #undef likely
       @@ -25,10 +29,88 @@
                #define unlikely(expr) (expr)
        #endif
        
       -size_t get_codepoint(const void *, size_t, size_t, uint_least32_t *);
       -size_t get_codepoint_utf8(const void *, size_t, size_t, uint_least32_t *);
       +/*
       + * Herodotus, the ancient greek historian and geographer,
       + * was criticized for including legends and other fantastic
       + * accounts into his works, among others by his contemporary
       + * Thucydides.
       + *
       + * The Herodotus readers and writers are tailored towards the needs
       + * of the library interface, doing all the dirty work behind the
       + * scenes. While the reader is relatively faithful in his accounts,
       + * the Herodotus writer will never fail and always claim to write the
       + * data. Internally, it only writes as much as it can, and will simply
       + * keep account of the rest. This way, we can properly signal truncation.
       + *
       + * In this sense, explaining the naming, the writer is always a bit
       + * inaccurate in his accounts.
       + *
       + */
       +enum herodotus_status {
       +        HERODOTUS_STATUS_SUCCESS,
       +        HERODOTUS_STATUS_END_OF_BUFFER,
       +        HERODOTUS_STATUS_SOFT_LIMIT_REACHED,
       +};
        
       -size_t set_codepoint(uint_least32_t, void *, size_t, size_t);
       -size_t set_codepoint_utf8(uint_least32_t, void *, size_t, size_t);
       +enum herodotus_type {
       +        HERODOTUS_TYPE_CODEPOINT,
       +        HERODOTUS_TYPE_UTF8,
       +};
       +
       +typedef struct herodotus_reader {
       +        enum herodotus_type type;
       +        const void *src;
       +        size_t srclen;
       +        size_t off;
       +        bool terminated_by_null;
       +        size_t soft_limit[10];
       +} HERODOTUS_READER;
       +
       +typedef struct herodotus_writer {
       +        enum herodotus_type type;
       +        void *dest;
       +        size_t destlen;
       +        size_t off;
       +        size_t first_unwritable_offset;
       +} HERODOTUS_WRITER;
       +
       +struct proper { /* break-property window around a candidate breakpoint ('|' below) */
       +        /*
       +         * prev_prop[1] prev_prop[0] | next_prop[0] next_prop[1]
       +         */
       +        struct {
       +                uint_least8_t prev_prop[2]; /* [0] is the slot adjacent to the breakpoint */
       +                uint_least8_t next_prop[2]; /* [0] is the slot adjacent to the breakpoint */
       +        } raw, skip; /* word.c reads raw for WB3-WB4 and skip for the later rules */
       +        HERODOTUS_READER mid_reader, raw_reader, skip_reader; /* word.c returns mid_reader's count */
       +        void *state; /* presumably the caller state handed to skip_shift_callback; confirm */
       +        uint_least8_t no_prop; /* word.c passes NUM_WORD_BREAK_PROPS; presumably marks empty slots */
       +        uint_least8_t (*get_break_prop)(uint_least32_t); /* code point -> break property */
       +        bool (*is_skippable_prop)(uint_least8_t); /* property -> skippable or not */
       +        void (*skip_shift_callback)(uint_least8_t, void *); /* word.c tracks RI parity here */
       +};
       +
       +void herodotus_reader_init(HERODOTUS_READER *, enum herodotus_type,
       +                           const void *, size_t);
       +void herodotus_reader_copy(const HERODOTUS_READER *, HERODOTUS_READER *);
       +void herodotus_reader_push_advance_limit(HERODOTUS_READER *, size_t);
       +void herodotus_reader_pop_limit(HERODOTUS_READER *);
       +size_t herodotus_reader_number_read(const HERODOTUS_READER *);
       +size_t herodotus_reader_next_word_break(const HERODOTUS_READER *);
       +size_t herodotus_reader_next_codepoint_break(const HERODOTUS_READER *);
       +enum herodotus_status herodotus_read_codepoint(HERODOTUS_READER *, bool, uint_least32_t *);
       +
       +void herodotus_writer_init(HERODOTUS_WRITER *, enum herodotus_type, void *,
       +                           size_t);
       +void herodotus_writer_nul_terminate(HERODOTUS_WRITER *);
       +size_t herodotus_writer_number_written(const HERODOTUS_WRITER *);
       +void herodotus_write_codepoint(HERODOTUS_WRITER *, uint_least32_t);
       +
       +void proper_init(const HERODOTUS_READER *, void *, uint_least8_t,
       +                 uint_least8_t (*get_break_prop)(uint_least32_t),
       +                 bool (*is_skippable_prop)(uint_least8_t),
       +                 void (*skip_shift_callback)(uint_least8_t, void *),
       +                 struct proper *);
       +int proper_advance(struct proper *);
        
        #endif /* UTIL_H */
 (DIR) diff --git a/src/word.c b/src/word.c
       @@ -1,331 +1,242 @@
        /* See LICENSE file for copyright and license details. */
        #include <stdbool.h>
        #include <stddef.h>
       -#include <stdlib.h>
       -#include <string.h>
        
        #include "../gen/word.h"
        #include "../grapheme.h"
        #include "util.h"
        
       -static inline enum word_break_property
       -get_break_prop(uint_least32_t cp)
       +struct word_break_state /* passed to proper_init(); updated in word_skip_shift_callback() */
       +{
       +        bool ri_even; /* true iff the count of consecutive RIs left of the breakpoint is even */
       +};
       +
       +static inline uint_least8_t
       +get_word_break_prop(uint_least32_t cp)
        {
                if (likely(cp <= 0x10FFFF)) {
       -                return (enum word_break_property)
       +                return (uint_least8_t)
                               word_break_minor[word_break_major[cp >> 8] + (cp & 0xff)];
                } else {
                        return WORD_BREAK_PROP_OTHER;
                }
        }
        
       -static size_t
       -next_word_break(const void *str, size_t len, size_t (*get_codepoint)
       -                (const void *, size_t, size_t, uint_least32_t *))
       +static bool
       +is_skippable_word_prop(uint_least8_t prop)
        {
       -        struct {
       -                enum word_break_property a, b, c, d;
       -        } raw, skip;
       -        enum word_break_property res;
       -        uint_least32_t cp;
       -        size_t off, tmp, new_off;
       -        bool ri_even = true;
       -
       -        /* check degenerate cases */
       -        if (str == NULL || len == 0) {
       -                return 0;
       -        }
       -
       -        /*
       -         * Apply word breaking algorithm (UAX #29), see
       -         * https://unicode.org/reports/tr29/#Word_Boundary_Rules
       -         *
       -         * There are 4 slots (a, b, c, d) of "break" properties and
       -         * we check if there is a break in the middle between b and c.
       -         *
       -         * The position of this middle spot is determined by off,
       -         * which gives the offset of the first element on the right
       -         * hand side of said spot, or, in other words, gives the number
       -         * of elements on the left hand side.
       -         *
       -         * It is further complicated by the fact that the algorithm
       -         * expects you to skip certain characters for the second
       -         * half of the rules (after WB4). Thus, we do not only have
       -         * the "raw" properties as described above, but also the "skip"
       -         * properties, where the skip.a and skip.b, for instance,
       -         * give the two preceding character properties behind the
       -         * currently investigated breakpoint.
       -         *
       -         */
       +        return prop == WORD_BREAK_PROP_EXTEND ||
       +               prop == WORD_BREAK_PROP_FORMAT ||
       +               prop == WORD_BREAK_PROP_ZWJ;
       +}
        
       -        /*
       -         * Initialize the different properties such that we have
       -         * a good state after the state-update in the loop
       -         */
       -        raw.b = NUM_WORD_BREAK_PROPS;
       -        if ((off = get_codepoint(str, len, 0, &cp)) >= len) {
       -                return 1;
       -        }
       -        raw.c = get_break_prop(cp);
       -        (void)get_codepoint(str, len, off, &cp);
       -        raw.d = get_break_prop(cp);
       -        skip.a = skip.b = NUM_WORD_BREAK_PROPS;
       +static void
       +word_skip_shift_callback(uint_least8_t prop, void *s)
       +{
       +        struct word_break_state *state = (struct word_break_state *)s;
        
       -        for (; off < len; off = new_off) {
       +        if (prop == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
                        /*
       -                 * Update left side (a and b) of the skip state by
       -                 * "shifting in" the raw.c property as long as it is
       -                 * not one of the "ignored" character properties.
       -                 * While at it, update the RI-counter.
       +                 * The property we just shifted in is
       +                 * a regional indicator, increasing the
       +                 * number of consecutive RIs on the left
       +                 * side of the breakpoint by one, changing
       +                 * the oddness.
                         *
                         */
       -                if (raw.c != WORD_BREAK_PROP_EXTEND &&
       -                    raw.c != WORD_BREAK_PROP_FORMAT &&
       -                    raw.c != WORD_BREAK_PROP_ZWJ) {
       -                            skip.a = skip.b;
       -                        skip.b = raw.c;
       -
       -                        if (skip.b == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
       -                                /*
       -                                 * The property we just shifted in is
       -                                 * a regional indicator, increasing the
       -                                 * number of consecutive RIs on the left
       -                                 * side of the breakpoint by one, changing
       -                                 * the oddness.
       -                                 *
       -                                 */
       -                                ri_even = !ri_even;
       -                        } else {
       -                                /*
       -                                 * We saw no regional indicator, so the
       -                                 * number of consecutive RIs on the left
       -                                 * side of the breakpoint is zero, which
       -                                 * is an even number.
       -                                 *
       -                                 */
       -                                ri_even = true;
       -                        }
       -                }
       -
       +                state->ri_even = !(state->ri_even);
       +        } else {
                        /*
       -                 * Update right side (b and c) of the skip state by
       -                 * starting at the breakpoint and detecting the two
       -                 * following non-ignored character classes
       +                 * We saw no regional indicator, so the
       +                 * number of consecutive RIs on the left
       +                 * side of the breakpoint is zero, which
       +                 * is an even number.
                         *
                         */
       -                skip.c = NUM_WORD_BREAK_PROPS;
       -                for (tmp = off; tmp < len; ) {
       -                        tmp += get_codepoint(str, len, tmp, &cp);
       -                        res = get_break_prop(cp);
       -
       -                        if (res != WORD_BREAK_PROP_EXTEND &&
       -                            res != WORD_BREAK_PROP_FORMAT &&
       -                            res != WORD_BREAK_PROP_ZWJ) {
       -                                skip.c = res;
       -                                break;
       -                        }
       -                }
       -                skip.d = NUM_WORD_BREAK_PROPS;
       -                for (; tmp < len; ) {
       -                        tmp += get_codepoint(str, len, tmp, &cp);
       -                        res = get_break_prop(cp);
       -
       -                        if (res != WORD_BREAK_PROP_EXTEND &&
       -                            res != WORD_BREAK_PROP_FORMAT &&
       -                            res != WORD_BREAK_PROP_ZWJ) {
       -                                skip.d = res;
       -                                break;
       -                        }
       -                }
       +                state->ri_even = true;
       +        }
       +}
        
       -                /*
       -                 * Update the raw state by simply shifting everything
       -                 * in and, if we still have data left, determining
       -                 * the character class of the next codepoint.
       -                 *
       -                 */
       -                raw.a = raw.b;
       -                raw.b = raw.c;
       -                raw.c = raw.d;
       -                if ((new_off = off + get_codepoint(str, len, off, &cp)) < len) {
       -                        get_codepoint(str, len, new_off, &cp);
       -                        raw.d = get_break_prop(cp);
       -                } else {
       -                        raw.d = NUM_WORD_BREAK_PROPS;
       -                }
       +static size_t
       +next_word_break(HERODOTUS_READER *r)
       +{
       +        struct proper p;
       +        struct word_break_state state = { .ri_even = true };
        
       +        /*
       +         * Apply word breaking algorithm (UAX #29), see
       +         * https://unicode.org/reports/tr29/#Word_Boundary_Rules
       +         */
       +        proper_init(r, &state, NUM_WORD_BREAK_PROPS, get_word_break_prop,
       +                    is_skippable_word_prop, word_skip_shift_callback, &p);
       +
       +        while (!proper_advance(&p)) {
                        /* WB3 */
       -                if (raw.b == WORD_BREAK_PROP_CR &&
       -                    raw.c == WORD_BREAK_PROP_LF) {
       +                if (p.raw.prev_prop[0] == WORD_BREAK_PROP_CR &&
       +                    p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
                                continue;
                        }
        
                        /* WB3a */
       -                if (raw.b == WORD_BREAK_PROP_NEWLINE ||
       -                    raw.b == WORD_BREAK_PROP_CR      ||
       -                    raw.b == WORD_BREAK_PROP_LF) {
       +                if (p.raw.prev_prop[0] == WORD_BREAK_PROP_NEWLINE ||
       +                    p.raw.prev_prop[0] == WORD_BREAK_PROP_CR      ||
       +                    p.raw.prev_prop[0] == WORD_BREAK_PROP_LF) {
                                break;
                        }
        
                        /* WB3b */
       -                if (raw.c == WORD_BREAK_PROP_NEWLINE ||
       -                    raw.c == WORD_BREAK_PROP_CR      ||
       -                    raw.c == WORD_BREAK_PROP_LF) {
       +                if (p.raw.next_prop[0] == WORD_BREAK_PROP_NEWLINE ||
       +                    p.raw.next_prop[0] == WORD_BREAK_PROP_CR      ||
       +                    p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
                                break;
                        }
        
                        /* WB3c */
       -                if (raw.b == WORD_BREAK_PROP_ZWJ &&
       -                    (raw.c == WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC ||
       -                     raw.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) {
       +                if (p.raw.prev_prop[0] == WORD_BREAK_PROP_ZWJ &&
       +                    (p.raw.next_prop[0] == WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC ||
       +                     p.raw.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) {
                                continue;
                        }
        
                        /* WB3d */
       -                if (raw.b == WORD_BREAK_PROP_WSEGSPACE &&
       -                    raw.c == WORD_BREAK_PROP_WSEGSPACE) {
       +                if (p.raw.prev_prop[0] == WORD_BREAK_PROP_WSEGSPACE &&
       +                    p.raw.next_prop[0] == WORD_BREAK_PROP_WSEGSPACE) {
                                continue;
                        }
        
                        /* WB4 */
       -                if (raw.c == WORD_BREAK_PROP_EXTEND ||
       -                    raw.c == WORD_BREAK_PROP_FORMAT ||
       -                    raw.c == WORD_BREAK_PROP_ZWJ) {
       +                if (p.raw.next_prop[0] == WORD_BREAK_PROP_EXTEND ||
       +                    p.raw.next_prop[0] == WORD_BREAK_PROP_FORMAT ||
       +                    p.raw.next_prop[0] == WORD_BREAK_PROP_ZWJ) {
                                continue;
                        }
        
                        /* WB5 */
       -                if ((skip.b == WORD_BREAK_PROP_ALETTER              ||
       -                     skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       -                     skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
       -                    (skip.c == WORD_BREAK_PROP_ALETTER              ||
       -                     skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       -                     skip.c == WORD_BREAK_PROP_HEBREW_LETTER)) {
       +                if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER              ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
       +                    (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER              ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
                                continue;
                        }
        
                        /* WB6 */
       -                if ((skip.b == WORD_BREAK_PROP_ALETTER              ||
       -                     skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       -                     skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
       -                    (skip.c == WORD_BREAK_PROP_MIDLETTER    ||
       -                     skip.c == WORD_BREAK_PROP_MIDNUMLET    ||
       -                     skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) &&
       -                    len > 2 &&
       -                    (skip.d == WORD_BREAK_PROP_ALETTER              ||
       -                     skip.d == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       -                     skip.d == WORD_BREAK_PROP_HEBREW_LETTER)) {
       +                if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER              ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
       +                    (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDLETTER    ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET    ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
       +                    (p.skip.next_prop[1] == WORD_BREAK_PROP_ALETTER              ||
       +                     p.skip.next_prop[1] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       +                     p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
                                continue;
                        }
        
                        /* WB7 */
       -                if ((skip.b == WORD_BREAK_PROP_MIDLETTER    ||
       -                     skip.b == WORD_BREAK_PROP_MIDNUMLET    ||
       -                     skip.b == WORD_BREAK_PROP_SINGLE_QUOTE) &&
       -                    (skip.c == WORD_BREAK_PROP_ALETTER              ||
       -                     skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       -                     skip.c == WORD_BREAK_PROP_HEBREW_LETTER) &&
       -                    len > 2 &&
       -                    (skip.a == WORD_BREAK_PROP_ALETTER              ||
       -                     skip.a == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       -                     skip.a == WORD_BREAK_PROP_HEBREW_LETTER)) {
       +                if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDLETTER    ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET    ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
       +                    (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER              ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
       +                    (p.skip.prev_prop[1] == WORD_BREAK_PROP_ALETTER              ||
       +                     p.skip.prev_prop[1] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       +                     p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
                                continue;
                        }
        
                        /* WB7a */
       -                if (skip.b == WORD_BREAK_PROP_HEBREW_LETTER &&
       -                    skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) {
       +                if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
       +                    p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) {
                                continue;
                        }
        
                        /* WB7b */
       -                if (skip.b == WORD_BREAK_PROP_HEBREW_LETTER &&
       -                    skip.c == WORD_BREAK_PROP_DOUBLE_QUOTE &&
       -                    len > 2 &&
       -                    skip.d == WORD_BREAK_PROP_HEBREW_LETTER) {
       +                if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
       +                    p.skip.next_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
       +                    p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
                                continue;
                        }
        
                        /* WB7c */
       -                if (skip.b == WORD_BREAK_PROP_DOUBLE_QUOTE &&
       -                    skip.c == WORD_BREAK_PROP_HEBREW_LETTER &&
       -                    off > 1 &&
       -                    skip.a == WORD_BREAK_PROP_HEBREW_LETTER) {
       +                if (p.skip.prev_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
       +                    p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
       +                    p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
                                continue;
                        }
        
                        /* WB8 */
       -                if (skip.b == WORD_BREAK_PROP_NUMERIC &&
       -                    skip.c == WORD_BREAK_PROP_NUMERIC) {
       +                if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
       +                    p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
                                continue;
                        }
        
                        /* WB9 */
       -                if ((skip.b == WORD_BREAK_PROP_ALETTER              ||
       -                     skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       -                     skip.b == WORD_BREAK_PROP_HEBREW_LETTER) &&
       -                    skip.c == WORD_BREAK_PROP_NUMERIC) {
       +                if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER              ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
       +                    p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
                                continue;
                        }
        
                        /* WB10 */
       -                if (skip.b == WORD_BREAK_PROP_NUMERIC &&
       -                    (skip.c == WORD_BREAK_PROP_ALETTER              ||
       -                     skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       -                     skip.c == WORD_BREAK_PROP_HEBREW_LETTER)) {
       +                if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
       +                    (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER              ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
                                continue;
                        }
        
                        /* WB11 */
       -                if ((skip.b == WORD_BREAK_PROP_MIDNUM       ||
       -                     skip.b == WORD_BREAK_PROP_MIDNUMLET    ||
       -                     skip.b == WORD_BREAK_PROP_SINGLE_QUOTE) &&
       -                    skip.c == WORD_BREAK_PROP_NUMERIC &&
       -                    off > 1 &&
       -                    skip.a == WORD_BREAK_PROP_NUMERIC) {
       +                if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUM       ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET    ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
       +                    p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC &&
       +                    p.skip.prev_prop[1] == WORD_BREAK_PROP_NUMERIC) {
                                continue;
                        }
        
                        /* WB12 */
       -                if (skip.b == WORD_BREAK_PROP_NUMERIC &&
       -                    (skip.c == WORD_BREAK_PROP_MIDNUM       ||
       -                     skip.c == WORD_BREAK_PROP_MIDNUMLET    ||
       -                     skip.c == WORD_BREAK_PROP_SINGLE_QUOTE) &&
       -                    len > 2 &&
       -                    skip.d == WORD_BREAK_PROP_NUMERIC) {
       +                if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
       +                    (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUM       ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET    ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
       +                    p.skip.next_prop[1] == WORD_BREAK_PROP_NUMERIC) {
                                continue;
                        }
        
                        /* WB13 */
       -                if (skip.b == WORD_BREAK_PROP_KATAKANA &&
       -                    skip.c == WORD_BREAK_PROP_KATAKANA) {
       +                if (p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA &&
       +                    p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA) {
                                continue;
                        }
        
                        /* WB13a */
       -                if ((skip.b == WORD_BREAK_PROP_ALETTER              ||
       -                     skip.b == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       -                     skip.b == WORD_BREAK_PROP_HEBREW_LETTER        ||
       -                     skip.b == WORD_BREAK_PROP_NUMERIC              ||
       -                     skip.b == WORD_BREAK_PROP_KATAKANA             ||
       -                     skip.b == WORD_BREAK_PROP_EXTENDNUMLET) &&
       -                    skip.c == WORD_BREAK_PROP_EXTENDNUMLET) {
       +                if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER              ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER        ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC              ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA             ||
       +                     p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) &&
       +                    p.skip.next_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) {
                                continue;
                        }
        
                        /* WB13b */
       -                if (skip.b == WORD_BREAK_PROP_EXTENDNUMLET &&
       -                    (skip.c == WORD_BREAK_PROP_ALETTER              ||
       -                     skip.c == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       -                     skip.c == WORD_BREAK_PROP_HEBREW_LETTER        ||
       -                     skip.c == WORD_BREAK_PROP_NUMERIC              ||
       -                     skip.c == WORD_BREAK_PROP_KATAKANA)) {
       +                if (p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET &&
       +                    (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER              ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER        ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC              ||
       +                     p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA)) {
                                continue;
                        }
        
                        /* WB15 and WB16 */
       -                if (!ri_even &&
       -                    skip.c == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
       +                if (!state.ri_even &&
       +                    p.skip.next_prop[0] == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
                                continue;
                        }
        
       @@ -333,17 +244,25 @@ next_word_break(const void *str, size_t len, size_t (*get_codepoint)
                        break;
                }
        
       -        return off;
       +        return herodotus_reader_number_read(&(p.mid_reader));
        }
        
        size_t
        grapheme_next_word_break(const uint_least32_t *str, size_t len)
        {
       -        return next_word_break(str, len, get_codepoint);
       +        HERODOTUS_READER r;
       +
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
       +
       +        return next_word_break(&r);
        }
        
        size_t
        grapheme_next_word_break_utf8(const char *str, size_t len)
        {
       -        return next_word_break(str, len, get_codepoint_utf8);
       +        HERODOTUS_READER r;
       +
       +        herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
       +
       +        return next_word_break(&r);
        }
 (DIR) diff --git a/test/case.c b/test/case.c
       @@ -0,0 +1,580 @@
       +/* See LICENSE file for copyright and license details. */
       +#include <stdbool.h>
       +#include <stdint.h>
       +#include <stdio.h>
       +#include <string.h>
       +
       +#include "../grapheme.h"
       +#include "util.h"
       +
       +struct unit_test_is_case_utf8 { /* one entry of the is_*case_utf8 test tables */
       +        const char *description;
       +        struct {
       +                const char *src;
       +                size_t srclen; /* byte count; SIZE_MAX in the entries described as NUL-terminated */
       +        } input;
       +        struct {
       +                bool ret; /* true in the "confirmation" entries, false otherwise */
       +                size_t caselen; /* byte length of the leading part that conforms (see table rows) */
       +        } output;
       +};
       +
       +struct unit_test_to_case_utf8 { /* presumably one entry of the to_*case_utf8 test tables */
       +        const char *description;
       +        struct {
       +                const char *src;
       +                size_t srclen;
       +                size_t destlen; /* presumably the destination buffer size used; confirm */
       +        } input;
       +        struct {
       +                const char *dest; /* presumably the expected destination content; confirm */
       +                size_t ret; /* presumably the expected return value; confirm */
       +        } output;
       +};
       +
       +static const struct unit_test_is_case_utf8 is_lowercase_utf8[] = { /* .input = { src, srclen }, .output = { ret, caselen } */
       +        {
       +                .description = "empty input",
       +                .input =  { "", 0 },
       +                .output = { true, 0 },
       +        },
       +        {
       +                .description = "one character, violation",
       +                .input =  { "A", 1 },
       +                .output = { false, 0 },
       +        },
       +        {
       +                .description = "one character, confirmation",
       +                .input =  { "\xC3\x9F", 2 },
       +                .output = { true, 2 },
       +        },
       +        {
       +                .description = "one character, violation, NUL-terminated",
       +                .input =  { "A", SIZE_MAX },
       +                .output = { false, 0 },
       +        },
       +        {
       +                .description = "one character, confirmation, NUL-terminated",
       +                .input =  { "\xC3\x9F", SIZE_MAX },
       +                .output = { true, 2 },
       +        },
       +        {
       +                .description = "one word, violation",
       +                .input =  { "Hello", 5 },
       +                .output = { false, 0 },
       +        },
       +        {
       +                .description = "one word, partial confirmation",
       +                .input =  { "gru" "\xC3\x9F" "fOrmel", 11 },
       +                .output = { false, 6 }, /* 3 bytes "gru" + 2 bytes "\xC3\x9F" + "f" precede the 'O' */
       +        },
       +        {
       +                .description = "one word, full confirmation",
       +                .input =  { "gru" "\xC3\x9F" "formel", 11 },
       +                .output = { true, 11 },
       +        },
       +        {
       +                .description = "one word, violation, NUL-terminated",
       +                .input =  { "Hello", SIZE_MAX },
       +                .output = { false, 0 },
       +        },
       +        {
       +                .description = "one word, partial confirmation, NUL-terminated",
       +                .input =  { "gru" "\xC3\x9F" "fOrmel", SIZE_MAX },
       +                .output = { false, 6 }, /* same 6-byte prefix as the length-given entry */
       +        },
       +        {
       +                .description = "one word, full confirmation, NUL-terminated",
       +                .input =  { "gru" "\xC3\x9F" "formel", SIZE_MAX },
       +                .output = { true, 11 },
       +        },
       +};
       +
       +static const struct unit_test_is_case_utf8 is_uppercase_utf8[] = { /* .input = { src, srclen }, .output = { ret, caselen } */
       +        {
       +                .description = "empty input",
       +                .input =  { "", 0 },
       +                .output = { true, 0 },
       +        },
       +        {
       +                .description = "one character, violation",
       +                .input =  { "\xC3\x9F", 2 },
       +                .output = { false, 0 },
       +        },
       +        {
       +                .description = "one character, confirmation",
       +                .input =  { "A", 1 },
       +                .output = { true, 1 },
       +        },
       +        {
       +                .description = "one character, violation, NUL-terminated",
       +                .input =  { "\xC3\x9F", SIZE_MAX },
       +                .output = { false, 0 },
       +        },
       +        {
       +                .description = "one character, confirmation, NUL-terminated",
       +                .input =  { "A", SIZE_MAX },
       +                .output = { true, 1 },
       +        },
       +        {
       +                .description = "one word, violation",
       +                .input =  { "hello", 5 },
       +                .output = { false, 0 },
       +        },
       +        {
       +                .description = "one word, partial confirmation",
       +                .input =  { "GRU" "\xC3\x9F" "formel", 11 },
       +                .output = { false, 3 }, /* the 3 bytes "GRU" precede the "\xC3\x9F" */
       +        },
       +        {
       +                .description = "one word, full confirmation",
       +                .input =  { "HELLO", 5 },
       +                .output = { true, 5 },
       +        },
       +        {
       +                .description = "one word, violation, NUL-terminated",
       +                .input =  { "hello", SIZE_MAX },
       +                .output = { false, 0 },
       +        },
       +        {
       +                .description = "one word, partial confirmation, NUL-terminated",
       +                .input =  { "GRU" "\xC3\x9F" "formel", SIZE_MAX },
       +                .output = { false, 3 }, /* same 3-byte prefix as the length-given entry */
       +        },
       +        {
       +                .description = "one word, full confirmation, NUL-terminated",
       +                .input =  { "HELLO", SIZE_MAX },
       +                .output = { true, 5 },
       +        },
       +};
       +/*
       + * Test vectors for grapheme_is_titlecase_utf8(), consumed by
       + * unit_test_callback_is_case_utf8(). Each entry presumably lists
       + * { src, srclen } as input and { ret, caselen } as expected output
       + * (field order is not visible here -- confirm against the struct).
       + * In the "partial confirmation" entries the expected length is the
       + * byte offset of the first capital 'O' inside the word; the sharp s
       + * "\xC3\x9F" counts as two bytes.
       + */
       +static const struct unit_test_is_case_utf8 is_titlecase_utf8[] = {
       +        {
       +                .description = "empty input",
       +                .input =  { "", 0 },
       +                .output = { true, 0 },
       +        },
       +        {
       +                .description = "one character, violation",
       +                .input =  { "\xC3\x9F", 2 },
       +                .output = { false, 0 },
       +        },
       +        {
       +                .description = "one character, confirmation",
       +                .input =  { "A", 1 },
       +                .output = { true, 1 },
       +        },
       +        {
       +                .description = "one character, violation, NUL-terminated",
       +                .input =  { "\xC3\x9F", SIZE_MAX },
       +                .output = { false, 0 },
       +        },
       +        {
       +                .description = "one character, confirmation, NUL-terminated",
       +                .input =  { "A", SIZE_MAX },
       +                .output = { true, 1 },
       +        },
       +        {
       +                .description = "one word, violation",
       +                .input =  { "hello", 5 },
       +                .output = { false, 0 },
       +        },
       +        {
       +                .description = "one word, partial confirmation",
       +                .input =  { "Gru" "\xC3\x9F" "fOrmel", 11 },
       +                .output = { false, 6 }, /* 3 ("Gru") + 2 (sharp s) + 1 ("f") */
       +        },
       +        {
       +                .description = "one word, full confirmation",
       +                .input =  { "Gru" "\xC3\x9F" "formel", 11 },
       +                .output = { true, 11 },
       +        },
       +        {
       +                .description = "one word, violation, NUL-terminated",
       +                .input =  { "hello", SIZE_MAX },
       +                .output = { false, 0 },
       +        },
       +        {
       +                .description = "one word, partial confirmation, NUL-terminated",
       +                .input =  { "Gru" "\xC3\x9F" "fOrmel", SIZE_MAX },
       +                .output = { false, 6 },
       +        },
       +        {
       +                .description = "one word, full confirmation, NUL-terminated",
       +                .input =  { "Gru" "\xC3\x9F" "formel", SIZE_MAX },
       +                .output = { true, 11 },
       +        },
       +        {
       +                .description = "multiple words, partial confirmation",
       +                .input =  { "Hello Gru" "\xC3\x9F" "fOrmel!", 18 },
       +                .output = { false, 12 }, /* 6 ("Hello ") + 6 (offset of 'O' in the second word) */
       +        },
       +        {
       +                .description = "multiple words, full confirmation",
       +                .input =  { "Hello Gru" "\xC3\x9F" "formel!", 18 },
       +                .output = { true, 18 },
       +        },
       +        {
       +                .description = "multiple words, partial confirmation, NUL-terminated",
       +                .input =  { "Hello Gru" "\xC3\x9F" "fOrmel!", SIZE_MAX },
       +                .output = { false, 12 },
       +        },
       +        {
       +                .description = "multiple words, full confirmation, NUL-terminated",
       +                .input =  { "Hello Gru" "\xC3\x9F" "formel!", SIZE_MAX },
       +                .output = { true, 18 },
       +        },
       +};
       +/*
       + * Test vectors for grapheme_to_lowercase_utf8(), consumed by
       + * unit_test_callback_to_case_utf8(). The callback reads input.src,
       + * input.srclen, input.destlen, output.dest and output.ret; the
       + * positional initializers presumably follow that order (confirm
       + * against the struct definition). In every truncation entry the
       + * expected return value is the same as in the matching untruncated
       + * entry, i.e. it does not shrink with destlen.
       + */
       +static const struct unit_test_to_case_utf8 to_lowercase_utf8[] = {
       +        {
       +                .description = "empty input",
       +                .input =  { "", 0, 10 },
       +                .output = { "", 0 },
       +        },
       +        {
       +                .description = "empty output",
       +                .input =  { "hello", 5, 0 },
       +                .output = { "", 5 }, /* destlen is 0, the expected return value is still 5 */
       +        },
       +        {
       +                .description = "one character, conversion",
       +                .input =  { "A", 1, 10 },
       +                .output = { "a", 1 },
       +        },
       +        {
       +                .description = "one character, no conversion",
       +                .input =  { "\xC3\x9F", 2, 10 },
       +                .output = { "\xC3\x9F", 2 },
       +        },
       +        {
       +                .description = "one character, conversion, truncation",
       +                .input =  { "A", 1, 0 },
       +                .output = { "", 1 },
       +        },
       +        {
       +                .description = "one character, conversion, NUL-terminated",
       +                .input =  { "A", SIZE_MAX, 10 },
       +                .output = { "a", 1 },
       +        },
       +        {
       +                .description = "one character, no conversion, NUL-terminated",
       +                .input =  { "\xC3\x9F", SIZE_MAX, 10 },
       +                .output = { "\xC3\x9F", 2 },
       +        },
       +        {
       +                .description = "one character, conversion, NUL-terminated, truncation",
       +                .input =  { "A", SIZE_MAX, 0 },
       +                .output = { "", 1 },
       +        },
       +        {
       +                .description = "one word, conversion",
       +                .input =  { "wOrD", 4, 10 },
       +                .output = { "word", 4 },
       +        },
       +        {
       +                .description = "one word, no conversion",
       +                .input =  { "word", 4, 10 },
       +                .output = { "word", 4 },
       +        },
       +        {
       +                .description = "one word, conversion, truncation",
       +                .input =  { "wOrD", 4, 3 },
       +                .output = { "wo", 4 }, /* destlen 3: two letters plus the terminator of the literal */
       +        },
       +        {
       +                .description = "one word, conversion, NUL-terminated",
       +                .input =  { "wOrD", SIZE_MAX, 10 },
       +                .output = { "word", 4 },
       +        },
       +        {
       +                .description = "one word, no conversion, NUL-terminated",
       +                .input =  { "word", SIZE_MAX, 10 },
       +                .output = { "word", 4 },
       +        },
       +        {
       +                .description = "one word, conversion, NUL-terminated, truncation",
       +                .input =  { "wOrD", SIZE_MAX, 3 },
       +                .output = { "wo", 4 },
       +        },
       +};
       +/*
       + * Test vectors for grapheme_to_uppercase_utf8(), consumed by
       + * unit_test_callback_to_case_utf8(). Entries presumably list
       + * { src, srclen, destlen } as input and { dest, ret } as expected
       + * output (confirm against the struct definition). The sharp s
       + * "\xC3\x9F" (U+00DF) is used as the converting character: its
       + * expected uppercase form below is the two-letter string "SS".
       + */
       +static const struct unit_test_to_case_utf8 to_uppercase_utf8[] = {
       +        {
       +                .description = "empty input",
       +                .input =  { "", 0, 10 },
       +                .output = { "", 0 },
       +        },
       +        {
       +                .description = "empty output",
       +                .input =  { "hello", 5, 0 },
       +                .output = { "", 5 },
       +        },
       +        {
       +                .description = "one character, conversion",
       +                .input =  { "\xC3\x9F", 2, 10 },
       +                .output = { "SS", 2 }, /* two input bytes become two ASCII letters */
       +        },
       +        {
       +                .description = "one character, no conversion",
       +                .input =  { "A", 1, 10 },
       +                .output = { "A", 1 },
       +        },
       +        {
       +                .description = "one character, conversion, truncation",
       +                .input =  { "\xC3\x9F", 2, 0 },
       +                .output = { "", 2 },
       +        },
       +        {
       +                .description = "one character, conversion, NUL-terminated",
       +                .input =  { "\xC3\x9F", SIZE_MAX, 10 },
       +                .output = { "SS", 2 },
       +        },
       +        {
       +                .description = "one character, no conversion, NUL-terminated",
       +                .input =  { "A", SIZE_MAX, 10 },
       +                .output = { "A", 1 },
       +        },
       +        {
       +                .description = "one character, conversion, NUL-terminated, truncation",
       +                .input =  { "\xC3\x9F", SIZE_MAX, 0 },
       +                .output = { "", 2 },
       +        },
       +        {
       +                .description = "one word, conversion",
       +                .input =  { "gRu" "\xC3\x9F" "fOrMel", 11, 15 },
       +                .output = { "GRUSSFORMEL", 11 },
       +        },
       +        {
       +                .description = "one word, no conversion",
       +                .input =  { "WORD", 4, 10 },
       +                .output = { "WORD", 4 },
       +        },
       +        {
       +                .description = "one word, conversion, truncation",
       +                .input =  { "gRu" "\xC3\x9F" "formel", 11, 5 },
       +                .output = { "GRUS", 11 }, /* destlen 5: four letters fit, expected return value stays 11 */
       +        },
       +        {
       +                .description = "one word, conversion, NUL-terminated",
       +                .input =  { "gRu" "\xC3\x9F" "formel", SIZE_MAX, 15 },
       +                .output = { "GRUSSFORMEL", 11 },
       +        },
       +        {
       +                .description = "one word, no conversion, NUL-terminated",
       +                .input =  { "WORD", SIZE_MAX, 10 },
       +                .output = { "WORD", 4 },
       +        },
       +        {
       +                .description = "one word, conversion, NUL-terminated, truncation",
       +                .input =  { "gRu" "\xC3\x9F" "formel", SIZE_MAX, 5 },
       +                .output = { "GRUS", 11 },
       +        },
       +};
       +/*
       + * Test vectors for grapheme_to_titlecase_utf8(), consumed by
       + * unit_test_callback_to_case_utf8(). Entries presumably list
       + * { src, srclen, destlen } as input and { dest, ret } as expected
       + * output (confirm against the struct definition). The table covers
       + * single characters, one word and two words, each with explicit
       + * length, with SIZE_MAX as length ("NUL-terminated") and with a
       + * destination that is too small ("truncation").
       + */
       +static const struct unit_test_to_case_utf8 to_titlecase_utf8[] = {
       +        {
       +                .description = "empty input",
       +                .input =  { "", 0, 10 },
       +                .output = { "", 0 },
       +        },
       +        {
       +                .description = "empty output",
       +                .input =  { "hello", 5, 0 },
       +                .output = { "", 5 },
       +        },
       +        {
       +                .description = "one character, conversion",
       +                .input =  { "a", 1, 10 },
       +                .output = { "A", 1 },
       +        },
       +        {
       +                .description = "one character, no conversion",
       +                .input =  { "A", 1, 10 },
       +                .output = { "A", 1 },
       +        },
       +        {
       +                .description = "one character, conversion, truncation",
       +                .input =  { "a", 1, 0 },
       +                .output = { "", 1 },
       +        },
       +        {
       +                .description = "one character, conversion, NUL-terminated",
       +                .input =  { "a", SIZE_MAX, 10 },
       +                .output = { "A", 1 },
       +        },
       +        {
       +                .description = "one character, no conversion, NUL-terminated",
       +                .input =  { "A", SIZE_MAX, 10 },
       +                .output = { "A", 1 },
       +        },
       +        {
       +                .description = "one character, conversion, NUL-terminated, truncation",
       +                .input =  { "a", SIZE_MAX, 0 },
       +                .output = { "", 1 },
       +        },
       +        {
       +                .description = "one word, conversion",
       +                .input =  { "heLlo", 5, 10 },
       +                .output = { "Hello", 5 },
       +        },
       +        {
       +                .description = "one word, no conversion",
       +                .input =  { "Hello", 5, 10 },
       +                .output = { "Hello", 5 },
       +        },
       +        {
       +                .description = "one word, conversion, truncation",
       +                .input =  { "heLlo", 5, 2 },
       +                .output = { "H", 5 }, /* destlen 2: one letter fits, expected return value stays 5 */
       +        },
       +        {
       +                .description = "one word, conversion, NUL-terminated",
       +                .input =  { "heLlo", SIZE_MAX, 10 },
       +                .output = { "Hello", 5 },
       +        },
       +        {
       +                .description = "one word, no conversion, NUL-terminated",
       +                .input =  { "Hello", SIZE_MAX, 10 },
       +                .output = { "Hello", 5 },
       +        },
       +        {
       +                .description = "one word, conversion, NUL-terminated, truncation",
       +                .input =  { "heLlo", SIZE_MAX, 3 },
       +                .output = { "He", 5 }, /* note: destlen 3 here, 2 in the explicit-length variant */
       +        },
       +        {
       +                .description = "two words, conversion",
       +                .input =  { "heLlo wORLd!", 12, 20 },
       +                .output = { "Hello World!", 12 },
       +        },
       +        {
       +                .description = "two words, no conversion",
       +                .input =  { "Hello World!", 12, 20 },
       +                .output = { "Hello World!", 12 },
       +        },
       +        {
       +                .description = "two words, conversion, truncation",
       +                .input =  { "heLlo wORLd!", 12, 8 },
       +                .output = { "Hello W", 12 }, /* the cut falls inside the second word */
       +        },
       +        {
       +                .description = "two words, conversion, NUL-terminated",
       +                .input =  { "heLlo wORLd!", SIZE_MAX, 20 },
       +                .output = { "Hello World!", 12 },
       +        },
       +        {
       +                .description = "two words, no conversion, NUL-terminated",
       +                .input =  { "Hello World!", SIZE_MAX, 20 },
       +                .output = { "Hello World!", 12 },
       +        },
       +        {
       +                .description = "two words, conversion, NUL-terminated, truncation",
       +                .input =  { "heLlo wORLd!", SIZE_MAX, 4 },
       +                .output = { "Hel", 12 }, /* the cut falls inside the first word */
       +        },
       +};
       +/*
       + * Callback passed to run_unit_tests() by main() for the three is-case
       + * tables. t is the base address of the table under test and doubles
       + * as the selector for the grapheme_is_..._utf8() function to call;
       + * off is the index of the entry to run. name and argv0 are only used
       + * in the failure message. Returns 0 when both the boolean result and
       + * the reported length match the entry, otherwise prints a diagnostic
       + * to stderr and returns 1.
       + */
       +static int
       +unit_test_callback_is_case_utf8(const void *t, size_t off, const char *name,
       +                                const char *argv0)
       +{
       +        const struct unit_test_is_case_utf8 *test =
       +                (const struct unit_test_is_case_utf8 *)t + off;
       +        bool ret = false;
       +        size_t caselen = 0x7f; /* presumably a sentinel, so a call that never sets it is noticed */
       +
       +        if (t == is_lowercase_utf8) {
       +                ret = grapheme_is_lowercase_utf8(test->input.src, test->input.srclen,
       +                                                 &caselen);
       +        } else if (t == is_uppercase_utf8) {
       +                ret = grapheme_is_uppercase_utf8(test->input.src, test->input.srclen,
       +                                                 &caselen);
       +        } else if (t == is_titlecase_utf8) {
       +                ret = grapheme_is_titlecase_utf8(test->input.src, test->input.srclen,
       +                                                 &caselen);
       +
       +        } else {
       +                goto err; /* t is none of the three known tables */
       +        }
       +
       +        /* check results: both the verdict and the length must match */
       +        if (ret != test->output.ret || caselen != test->output.caselen) {
       +                goto err;
       +        }
       +
       +        return 0;
       +err:
       +        fprintf(stderr, "%s: %s: Failed unit test %zu \"%s\" "
       +                "(returned (%s, %zu) instead of (%s, %zu)).\n", argv0,
       +                name, off, test->description, ret ? "true" : "false",
       +                caselen, test->output.ret ? "true" : "false",
       +                test->output.caselen);
       +        return 1;
       +}
       +/*
       + * Callback passed to run_unit_tests() by main() for the three to-case
       + * tables. t is the base address of the table under test and doubles
       + * as the selector for the grapheme_to_..._utf8() function to call;
       + * off is the index of the entry to run. name and argv0 are only used
       + * in the failure message. The conversion writes into a local 512-byte
       + * buffer, but is told the buffer is only input.destlen bytes long, so
       + * writes past that limit can be detected afterwards. Returns 0 on
       + * success, otherwise prints a diagnostic to stderr and returns 1.
       + */
       +static int
       +unit_test_callback_to_case_utf8(const void *t, size_t off, const char *name,
       +                                const char *argv0)
       +{
       +        const struct unit_test_to_case_utf8 *test =
       +                (const struct unit_test_to_case_utf8 *)t + off;
       +        size_t ret = 0, i;
       +        char buf[512];
       +
       +        /* fill the array with canary values */
       +        memset(buf, 0x7f, LEN(buf));
       +
       +        if (t == to_lowercase_utf8) {
       +                ret = grapheme_to_lowercase_utf8(test->input.src, test->input.srclen,
       +                                                 buf, test->input.destlen);
       +        } else if (t == to_uppercase_utf8) {
       +                ret = grapheme_to_uppercase_utf8(test->input.src, test->input.srclen,
       +                                                 buf, test->input.destlen);
       +        } else if (t == to_titlecase_utf8) {
       +                ret = grapheme_to_titlecase_utf8(test->input.src, test->input.srclen,
       +                                                 buf, test->input.destlen);
       +        } else {
       +                goto err; /* t is none of the three known tables */
       +        }
       +
       +        /*
       +         * check results: the return value must match, and so must the
       +         * first MIN(destlen, expected return value) bytes of the buffer
       +         */
       +        if (ret != test->output.ret ||
       +            memcmp(buf, test->output.dest, MIN(test->input.destlen, test->output.ret))) {
       +                goto err;
       +        }
       +
       +        /*
       +         * check that none of the canary values have been overwritten,
       +         * i.e. nothing at index destlen or beyond was touched
       +         */
       +        for (i = test->input.destlen; i < LEN(buf); i++) {
       +                if (buf[i] != 0x7f) {
       +                        goto err;
       +                }
       +        }
       +
       +        return 0;
       +err:
       +        fprintf(stderr, "%s: %s: Failed unit test %zu \"%s\" "
       +                "(returned (\"%.*s\", %zu) instead of (\"%.*s\", %zu)).\n", argv0,
       +                name, off, test->description, (int)ret, buf, ret,
       +                (int)test->output.ret, test->output.dest, test->output.ret);
       +        return 1;
       +}
       +/*
       + * Runs the six case-related unit-test tables through run_unit_tests(),
       + * three with the is-case callback and three with the to-case callback.
       + * The exit status is the sum of the six return values; argv[0] is
       + * passed along for use in failure messages and argc is unused.
       + */
       +int
       +main(int argc, char *argv[])
       +{
       +        (void)argc;
       +
       +        return run_unit_tests(unit_test_callback_is_case_utf8, is_lowercase_utf8,
       +                              LEN(is_lowercase_utf8), "grapheme_is_lowercase_utf8", argv[0]) +
       +               run_unit_tests(unit_test_callback_is_case_utf8, is_uppercase_utf8,
       +                              LEN(is_uppercase_utf8), "grapheme_is_uppercase_utf8", argv[0]) +
       +               run_unit_tests(unit_test_callback_is_case_utf8, is_titlecase_utf8,
       +                              LEN(is_titlecase_utf8), "grapheme_is_titlecase_utf8", argv[0]) +
       +               run_unit_tests(unit_test_callback_to_case_utf8, to_lowercase_utf8,
       +                              LEN(to_lowercase_utf8), "grapheme_to_lowercase_utf8", argv[0]) +
       +               run_unit_tests(unit_test_callback_to_case_utf8, to_uppercase_utf8,
       +                              LEN(to_uppercase_utf8), "grapheme_to_uppercase_utf8", argv[0]) +
       +               run_unit_tests(unit_test_callback_to_case_utf8, to_titlecase_utf8,
       +                              LEN(to_titlecase_utf8), "grapheme_to_titlecase_utf8", argv[0]);
       +}
 (DIR) diff --git a/test/character.c b/test/character.c
       @@ -6,12 +6,121 @@
        #include "../grapheme.h"
        #include "util.h"
        
       +static const struct unit_test_next_break next_character_break[] = { /* codepoint-array cases for grapheme_next_character_break() */
       +        {
       +                .description = "NULL input",
       +                .input = {
       +                        .src    = NULL,
       +                        .srclen = 0,
       +                },
       +                .output = { 0 },
       +        },
       +        {
       +                .description = "empty input",
       +                .input = {
       +                        .src    = (uint_least32_t *)(uint_least32_t[]){ 0x0 },
       +                        .srclen = 0,
       +                },
       +                .output = { 0 },
       +        },
       +        {
       +                .description = "empty input, null-terminated",
       +                .input = {
       +                        .src    = (uint_least32_t *)(uint_least32_t[]){ 0x0 },
       +                        .srclen = SIZE_MAX, /* SIZE_MAX is used for the inputs described as null-terminated */
       +                },
       +                .output = { 0 },
       +        },
       +        {
       +                .description = "one character",
       +                .input = {
       +                        .src    = (uint_least32_t *)(uint_least32_t[]){ 0x1F1E9, 0x1F1EA, 0x2A }, /* two regional indicator symbols, then '*' */
       +                        .srclen = 3,
       +                },
       +                .output = { 2 }, /* break expected after the regional-indicator pair */
       +        },
       +        {
       +                .description = "one character, null-terminated",
       +                .input = {
       +                        .src    = (uint_least32_t *)(uint_least32_t[]){ 0x1F1E9, 0x1F1EA, 0x0 },
       +                        .srclen = SIZE_MAX,
       +                },
       +                .output = { 2 },
       +        },
       +};
       +/*
       + * Test vectors for grapheme_next_character_break_utf8(), run through
       + * unit_test_callback_next_character_break_utf8(). The positional
       + * initializers presumably give { src, srclen } (matching the
       + * designated fields of the first entry) and the expected break
       + * offset in bytes. "\xF0\x9F\x87\xA9" and "\xF0\x9F\x87\xAA" are
       + * the UTF-8 encodings of U+1F1E9 and U+1F1EA, the same regional
       + * indicator pair used in the codepoint table.
       + */
       +static const struct unit_test_next_break_utf8 next_character_break_utf8[] = {
       +        {
       +                .description = "NULL input",
       +                .input = {
       +                        .src    = NULL,
       +                        .srclen = 0,
       +                },
       +                .output = { 0 },
       +        },
       +        {
       +                .description = "empty input",
       +                .input = { "", 0 },
       +                .output = { 0 },
       +        },
       +        {
       +                .description = "empty input, NUL-terminated",
       +                .input = { "", SIZE_MAX },
       +                .output = { 0 },
       +        },
       +        {
       +                .description = "one character",
       +                .input = { "\xF0\x9F\x87\xA9\xF0\x9F\x87\xAA*", 9 },
       +                .output = { 8 }, /* two 4-byte sequences, the '*' is left over */
       +        },
       +        {
       +                .description = "one character, fragment",
       +                .input = { "\xF0\x9F\x87\xA9\xF0", 5 }, /* the trailing lone 0xF0 is an incomplete sequence */
       +                .output = { 4 },
       +        },
       +        {
       +                .description = "one character, NUL-terminated",
       +                .input = { "\xF0\x9F\x87\xA9\xF0\x9F\x87\xAA", SIZE_MAX },
       +                .output = { 8 },
       +        },
       +        {
       +                .description = "one character, fragment, NUL-terminated",
       +                .input = { "\xF0\x9F\x87\xA9\xF0\x9F", SIZE_MAX }, /* second sequence is cut off after two bytes */
       +                .output = { 4 },
       +        },
       +};
       +/*
       + * Adapter with the callback signature used by run_unit_tests(): it
       + * forwards its arguments to the shared unit_test_callback_next_break()
       + * helper, adding grapheme_next_character_break as the function under
       + * test, and returns the helper's result unchanged.
       + */
       +static int
       +unit_test_callback_next_character_break(const void *t, size_t off,
       +                                             const char *name,
       +                                             const char *argv0)
       +{
       +        return unit_test_callback_next_break(t, off,
       +                                             grapheme_next_character_break,
       +                                             name, argv0);
       +}
       +/*
       + * Adapter with the callback signature used by run_unit_tests(): it
       + * forwards its arguments to the shared
       + * unit_test_callback_next_break_utf8() helper, adding
       + * grapheme_next_character_break_utf8 as the function under test, and
       + * returns the helper's result unchanged.
       + */
       +static int
       +unit_test_callback_next_character_break_utf8(const void *t, size_t off,
       +                                             const char *name,
       +                                             const char *argv0)
       +{
       +        return unit_test_callback_next_break_utf8(t, off,
       +                                                  grapheme_next_character_break_utf8,
       +                                                  name, argv0);
       +}
       +/*
       + * Runs the character_break_test table through run_break_tests() and
       + * then the two unit-test tables defined in this file through
       + * run_unit_tests(); the three return values are added up to form the
       + * exit status. argc is unused.
       + */
        int
        main(int argc, char *argv[])
        {
                (void)argc;
        
                return run_break_tests(grapheme_next_character_break,
       -                               character_break_test,
       -                               LEN(character_break_test), argv[0]);
       +                               character_break_test, LEN(character_break_test), argv[0]) +
       +               run_unit_tests(unit_test_callback_next_character_break,
       +                              next_character_break, LEN(next_character_break),
       +                              "grapheme_next_character_break", argv[0]) +
       +               run_unit_tests(unit_test_callback_next_character_break_utf8,
       +                              next_character_break_utf8, LEN(next_character_break_utf8),
       +                              "grapheme_next_character_break_utf8", argv[0]);
        }
 (DIR) diff --git a/test/line.c b/test/line.c
       @@ -6,6 +6,110 @@
        #include "../grapheme.h"
        #include "util.h"
        
       +static const struct unit_test_next_break next_line_break[] = { /* codepoint-array cases for grapheme_next_line_break() */
       +        {
       +                .description = "NULL input",
       +                .input = {
       +                        .src    = NULL,
       +                        .srclen = 0,
       +                },
       +                .output = { 0 },
       +        },
       +        {
       +                .description = "empty input",
       +                .input = {
       +                        .src    = (uint_least32_t *)(uint_least32_t[]){ 0x0 },
       +                        .srclen = 0,
       +                },
       +                .output = { 0 },
       +        },
       +        {
       +                .description = "empty input, null-terminated",
       +                .input = {
       +                        .src    = (uint_least32_t *)(uint_least32_t[]){ 0x0 },
       +                        .srclen = SIZE_MAX, /* SIZE_MAX is used for the inputs described as null-terminated */
       +                },
       +                .output = { 0 },
       +        },
       +        {
       +                .description = "one opportunity",
       +                .input = {
       +                        .src    = (uint_least32_t *)(uint_least32_t[]){ 0x1F1E9, 0x1F1EA, 0x20, 0x2A }, /* regional-indicator pair, space, '*' */
       +                        .srclen = 4,
       +                },
       +                .output = { 3 }, /* opportunity expected right after the space */
       +        },
       +        {
       +                .description = "one opportunity, null-terminated",
       +                .input = {
       +                        .src    = (uint_least32_t *)(uint_least32_t[]){ 0x1F1E9, 0x1F1EA, 0x20, 0x2A, 0x0 },
       +                        .srclen = SIZE_MAX,
       +                },
       +                .output = { 3 },
       +        },
       +};
       +/*
       + * Test vectors for grapheme_next_line_break_utf8(), run through
       + * unit_test_callback_next_line_break_utf8(). The positional
       + * initializers presumably give { src, srclen } (matching the
       + * designated fields of the first entry) and the expected offset in
       + * bytes. "\xF0\x9F\x87\xA9" and "\xF0\x9F\x87\xAA" are the UTF-8
       + * encodings of U+1F1E9 and U+1F1EA (regional indicator symbols).
       + */
       +static const struct unit_test_next_break_utf8 next_line_break_utf8[] = {
       +        {
       +                .description = "NULL input",
       +                .input = {
       +                        .src    = NULL,
       +                        .srclen = 0,
       +                },
       +                .output = { 0 },
       +        },
       +        {
       +                .description = "empty input",
       +                .input = { "", 0 },
       +                .output = { 0 },
       +        },
       +        {
       +                .description = "empty input, NUL-terminated",
       +                .input = { "", SIZE_MAX },
       +                .output = { 0 },
       +        },
       +        {
       +                .description = "one opportunity",
       +                .input = { "\xF0\x9F\x87\xA9\xF0\x9F\x87\xAA *", 10 },
       +                .output = { 9 }, /* two 4-byte sequences plus the space */
       +        },
       +        {
       +                .description = "one opportunity, fragment",
       +                .input = { "\xF0\x9F\x87\xA9\xF0", 5 }, /* the trailing lone 0xF0 is an incomplete sequence */
       +                .output = { 4 },
       +        },
       +        {
       +                .description = "one opportunity, NUL-terminated",
       +                .input = { "\xF0\x9F\x87\xA9\xF0\x9F\x87\xAA A", SIZE_MAX },
       +                .output = { 9 },
       +        },
       +        {
       +                .description = "one opportunity, fragment, NUL-terminated",
       +                .input = { "\xF0\x9F\x87\xA9\xF0\x9F", SIZE_MAX }, /* second sequence is cut off after two bytes */
       +                .output = { 4 },
       +        },
       +};
       +/*
       + * Adapter with the callback signature used by run_unit_tests(): it
       + * forwards its arguments to the shared unit_test_callback_next_break()
       + * helper, adding grapheme_next_line_break as the function under test,
       + * and returns the helper's result unchanged.
       + */
       +static int
       +unit_test_callback_next_line_break(const void *t, size_t off,
       +                                             const char *name,
       +                                             const char *argv0)
       +{
       +        return unit_test_callback_next_break(t, off,
       +                                             grapheme_next_line_break,
       +                                             name, argv0);
       +}
       +/*
       + * Adapter with the callback signature used by run_unit_tests(): it
       + * forwards its arguments to the shared
       + * unit_test_callback_next_break_utf8() helper, adding
       + * grapheme_next_line_break_utf8 as the function under test, and
       + * returns the helper's result unchanged.
       + */
       +static int
       +unit_test_callback_next_line_break_utf8(const void *t, size_t off,
       +                                             const char *name,
       +                                             const char *argv0)
       +{
       +        return unit_test_callback_next_break_utf8(t, off,
       +                                                  grapheme_next_line_break_utf8,
       +                                                  name, argv0);
       +}
       +
        int
        main(int argc, char *argv[])
        {
       @@ -13,5 +117,11 @@ main(int argc, char *argv[])
        
                return run_break_tests(grapheme_next_line_break,
                                       line_break_test, LEN(line_break_test),
       -                               argv[0]);
       +                               argv[0]) +
       +               run_unit_tests(unit_test_callback_next_line_break,
       +                              next_line_break, LEN(next_line_break),
       +                              "grapheme_next_line_break", argv[0]) +
       +               run_unit_tests(unit_test_callback_next_line_break_utf8,
       +                              next_line_break_utf8, LEN(next_line_break_utf8),
       +                              "grapheme_next_line_break_utf8", argv[0]);
        }
 (DIR) diff --git a/test/sentence.c b/test/sentence.c
       @@ -6,6 +6,110 @@
        #include "../grapheme.h"
        #include "util.h"
        
       +static const struct unit_test_next_break next_sentence_break[] = {
       +        {
       +                .description = "NULL input",
       +                .input = {
       +                        .src    = NULL,
       +                        .srclen = 0,
       +                },
       +                .output = { 0 },
       +        },
       +        {
       +                .description = "empty input",
       +                .input = {
       +                        .src    = (uint_least32_t *)(uint_least32_t[]){ 0x0 },
       +                        .srclen = 0,
       +                },
       +                .output = { 0 },
       +        },
       +        {
       +                .description = "empty input, null-terminated",
       +                .input = {
       +                        .src    = (uint_least32_t *)(uint_least32_t[]){ 0x0 },
       +                        .srclen = SIZE_MAX,
       +                },
       +                .output = { 0 },
       +        },
       +        {
       +                .description = "one sentence",
       +                .input = {
       +                        .src    = (uint_least32_t *)(uint_least32_t[]){ 0x1F1E9, 0x1F1EA, 0x2E, 0x20, 0x2A },
       +                        .srclen = 5,
       +                },
       +                .output = { 4 },
       +        },
       +        {
       +                .description = "one sentence, null-terminated",
       +                .input = {
       +                        .src    = (uint_least32_t *)(uint_least32_t[]){ 0x1F1E9, 0x1F1EA, 0x2E, 0x20, 0x2A, 0x0 },
       +                        .srclen = SIZE_MAX,
       +                },
       +                .output = { 4 },
       +        },
       +};
       +
       +static const struct unit_test_next_break_utf8 next_sentence_break_utf8[] = {
       +        {
       +                .description = "NULL input",
       +                .input = {
       +                        .src    = NULL,
       +                        .srclen = 0,
       +                },
       +                .output = { 0 },
       +        },
       +        {
       +                .description = "empty input",
       +                .input = { "", 0 },
       +                .output = { 0 },
       +        },
       +        {
       +                .description = "empty input, NUL-terminated",
       +                .input = { "", SIZE_MAX },
       +                .output = { 0 },
       +        },
       +        {
       +                .description = "one sentence",
       +                .input = { "\xF0\x9F\x87\xA9\xF0\x9F\x87\xAA is the flag of Germany.  It", 36 },
       +                .output = { 34 },
       +        },
       +        {
       +                .description = "one sentence, fragment",
       +                .input = { "\xF0\x9F\x87\xA9\xF0", 5 },
       +                .output = { 4 },
       +        },
       +        {
       +                .description = "one sentence, NUL-terminated",
       +                .input = { "\xF0\x9F\x87\xA9\xF0\x9F\x87\xAA is the flag of Germany.  It", SIZE_MAX },
       +                .output = { 34 },
       +        },
       +        {
       +                .description = "one sentence, fragment, NUL-terminated",
       +                .input = { "\xF0\x9F\x87\xA9\xF0\x9F", SIZE_MAX },
       +                .output = { 6 },
       +        },
       +};
       +
       +static int
       +unit_test_callback_next_sentence_break(const void *t, size_t off,
       +                                             const char *name,
       +                                             const char *argv0)
       +{
       +        return unit_test_callback_next_break(t, off,
       +                                             grapheme_next_sentence_break,
       +                                             name, argv0);
       +}
       +
       +static int
       +unit_test_callback_next_sentence_break_utf8(const void *t, size_t off,
       +                                             const char *name,
       +                                             const char *argv0)
       +{
       +        return unit_test_callback_next_break_utf8(t, off,
       +                                                  grapheme_next_sentence_break_utf8,
       +                                                  name, argv0);
       +}
       +
        int
        main(int argc, char *argv[])
        {
       @@ -13,5 +117,11 @@ main(int argc, char *argv[])
        
                return run_break_tests(grapheme_next_sentence_break,
                                       sentence_break_test,
       -                               LEN(sentence_break_test), argv[0]);
       +                               LEN(sentence_break_test), argv[0]) +
       +               run_unit_tests(unit_test_callback_next_sentence_break,
       +                              next_sentence_break, LEN(next_sentence_break),
       +                              "grapheme_next_sentence_break", argv[0]) +
       +               run_unit_tests(unit_test_callback_next_sentence_break_utf8,
       +                              next_sentence_break_utf8, LEN(next_sentence_break_utf8),
       +                              "grapheme_next_sentence_break_utf8", argv[0]);
        }
 (DIR) diff --git a/test/utf8-decode.c b/test/utf8-decode.c
       @@ -310,7 +310,7 @@ main(int argc, char *argv[])
                                failed++;
                        }
                }
       -        printf("%s: %zu/%zu tests passed.\n", argv[0],
       +        printf("%s: %zu/%zu unit tests passed.\n", argv[0],
                       LEN(dec_test) - failed, LEN(dec_test));
        
                return (failed > 0) ? 1 : 0;
 (DIR) diff --git a/test/utf8-encode.c b/test/utf8-encode.c
       @@ -86,7 +86,7 @@ main(int argc, char *argv[])
                                failed++;
                        }
                }
       -        printf("%s: %zu/%zu tests passed.\n", argv[0],
       +        printf("%s: %zu/%zu unit tests passed.\n", argv[0],
                       LEN(enc_test) - failed, LEN(enc_test));
        
                return (failed > 0) ? 1 : 0;
 (DIR) diff --git a/test/util.c b/test/util.c
       @@ -23,7 +23,7 @@ run_break_tests(size_t (*next_break)(const uint_least32_t *, size_t),
                                /* check if our resulting offset matches */
                                if (j == test[i].lenlen ||
                                    res != test[i].len[j++]) {
       -                                fprintf(stderr, "%s: Failed test %zu \"%s\".\n",
       +                                fprintf(stderr, "%s: Failed conformance test %zu \"%s\".\n",
                                                argv0, i, test[i].descr);
                                        fprintf(stderr, "J=%zu: EXPECTED len %zu, got %zu\n", j-1, test[i].len[j-1], res);
                                        failed++;
       @@ -31,8 +31,68 @@ run_break_tests(size_t (*next_break)(const uint_least32_t *, size_t),
                                }
                        }
                }
       -        printf("%s: %zu/%zu tests passed.\n", argv0,
       +        printf("%s: %zu/%zu conformance tests passed.\n", argv0,
                       testlen - failed, testlen);
        
                return (failed > 0) ? 1 : 0;
        }
       +
       +int
       +run_unit_tests(int (*unit_test_callback)(const void *, size_t, const char *,
       +               const char *), const void *test, size_t testlen, const char *name,
       +               const char *argv0)
       +{
       +        size_t i, failed;
       +
       +        for (i = 0, failed = 0; i < testlen; i++) {
       +                failed += (unit_test_callback(test, i, name, argv0) == 0) ? 0 : 1;
       +        }
       +
       +        printf("%s: %s: %zu/%zu unit tests passed.\n", argv0, name,
       +               testlen - failed, testlen);
       +
       +        return (failed > 0) ? 1 : 0;
       +}
       +
       +int
       +unit_test_callback_next_break(const struct unit_test_next_break *t, size_t off,
       +                                   size_t (*next_break)(const uint_least32_t *, size_t),
       +                                   const char *name, const char *argv0)
       +{
       +        const struct unit_test_next_break *test = t + off;
       +
       +        size_t ret = next_break(test->input.src, test->input.srclen);
       +
       +        if (ret != test->output.ret) {
       +                goto err;
       +        }
       +
       +        return 0;
       +err:
       +        fprintf(stderr, "%s: %s: Failed unit test %zu \"%s\" "
       +                "(returned %zu instead of %zu).\n", argv0,
       +                name, off, test->description, ret, test->output.ret);
       +        return 1;
       +}
       +
       +int
       +unit_test_callback_next_break_utf8(const struct unit_test_next_break_utf8 *t,
       +                                   size_t off,
       +                                   size_t (*next_break_utf8)(const char *, size_t),
       +                                   const char *name, const char *argv0)
       +{
       +        const struct unit_test_next_break_utf8 *test = t + off;
       +
       +        size_t ret = next_break_utf8(test->input.src, test->input.srclen);
       +
       +        if (ret != test->output.ret) {
       +                goto err;
       +        }
       +
       +        return 0;
       +err:
       +        fprintf(stderr, "%s: %s: Failed unit test %zu \"%s\" "
       +                "(returned %zu instead of %zu).\n", argv0,
       +                name, off, test->description, ret, test->output.ret);
       +        return 1;
       +}
 (DIR) diff --git a/test/util.h b/test/util.h
       @@ -5,10 +5,45 @@
        #include "../gen/types.h"
        #include "../grapheme.h"
        
       +#undef MIN
       +#define MIN(x,y)  ((x) < (y) ? (x) : (y))
       +#undef LEN
        #define LEN(x) (sizeof(x) / sizeof(*(x)))
        
       +struct unit_test_next_break {
       +        const char *description;
       +        struct {
       +                const uint_least32_t *src;
       +                size_t srclen;
       +        } input;
       +        struct {
       +                size_t ret;
       +        } output;
       +};
       +
       +struct unit_test_next_break_utf8 {
       +        const char *description;
       +        struct {
       +                const char *src;
       +                size_t srclen;
       +        } input;
       +        struct {
       +                size_t ret;
       +        } output;
       +};
       +
        int run_break_tests(size_t (*next_break)(const uint_least32_t *, size_t),
                            const struct break_test *test, size_t testlen,
                            const char *);
       +int run_unit_tests(int (*unit_test_callback)(const void *, size_t, const char *,
       +                   const char *), const void *, size_t, const char *, const char *);
       +
       +int unit_test_callback_next_break(const struct unit_test_next_break *, size_t,
       +                                  size_t (*next_break)(const uint_least32_t *, size_t),
       +                                  const char *, const char *);
       +int unit_test_callback_next_break_utf8(const struct unit_test_next_break_utf8 *,
       +                                       size_t,
       +                                       size_t (*next_break_utf8)(const char *, size_t),
       +                                       const char *, const char *);
        
        #endif /* UTIL_H */
 (DIR) diff --git a/test/word.c b/test/word.c
       @@ -6,11 +6,121 @@
        #include "../grapheme.h"
        #include "util.h"
        
       +static const struct unit_test_next_break next_word_break[] = {
       +        {
       +                .description = "NULL input",
       +                .input = {
       +                        .src    = NULL,
       +                        .srclen = 0,
       +                },
       +                .output = { 0 },
       +        },
       +        {
       +                .description = "empty input",
       +                .input = {
       +                        .src    = (uint_least32_t *)(uint_least32_t[]){ 0x0 },
       +                        .srclen = 0,
       +                },
       +                .output = { 0 },
       +        },
       +        {
       +                .description = "empty input, null-terminated",
       +                .input = {
       +                        .src    = (uint_least32_t *)(uint_least32_t[]){ 0x0 },
       +                        .srclen = SIZE_MAX,
       +                },
       +                .output = { 0 },
       +        },
       +        {
       +                .description = "one word",
       +                .input = {
       +                        .src    = (uint_least32_t *)(uint_least32_t[]){ 0x1F1E9, 0x1F1EA, 0x20, 0x2A },
       +                        .srclen = 4,
       +                },
       +                .output = { 2 },
       +        },
       +        {
       +                .description = "one word, null-terminated",
       +                .input = {
       +                        .src    = (uint_least32_t *)(uint_least32_t[]){ 0x1F1E9, 0x1F1EA, 0x20, 0x2A, 0x0 },
       +                        .srclen = SIZE_MAX,
       +                },
       +                .output = { 2 },
       +        },
       +};
       +
       +static const struct unit_test_next_break_utf8 next_word_break_utf8[] = {
       +        {
       +                .description = "NULL input",
       +                .input = {
       +                        .src    = NULL,
       +                        .srclen = 0,
       +                },
       +                .output = { 0 },
       +        },
       +        {
       +                .description = "empty input",
       +                .input = { "", 0 },
       +                .output = { 0 },
       +        },
       +        {
       +                .description = "empty input, NUL-terminated",
       +                .input = { "", SIZE_MAX },
       +                .output = { 0 },
       +        },
       +        {
       +                .description = "one word",
       +                .input = { "\xF0\x9F\x87\xA9\xF0\x9F\x87\xAA is", 11 },
       +                .output = { 8 },
       +        },
       +        {
       +                .description = "one word, fragment",
       +                .input = { "\xF0\x9F\x87\xA9\xF0", 5 },
       +                .output = { 4 },
       +        },
       +        {
       +                .description = "one word, NUL-terminated",
       +                .input = { "\xF0\x9F\x87\xA9\xF0\x9F\x87\xAA is", SIZE_MAX },
       +                .output = { 8 },
       +        },
       +        {
       +                .description = "one word, fragment, NUL-terminated",
       +                .input = { "\xF0\x9F\x87\xA9\xF0\x9F", SIZE_MAX },
       +                .output = { 4 },
       +        },
       +};
       +
       +static int
       +unit_test_callback_next_word_break(const void *t, size_t off,
       +                                             const char *name,
       +                                             const char *argv0)
       +{
       +        return unit_test_callback_next_break(t, off,
       +                                             grapheme_next_word_break,
       +                                             name, argv0);
       +}
       +
       +static int
       +unit_test_callback_next_word_break_utf8(const void *t, size_t off,
       +                                             const char *name,
       +                                             const char *argv0)
       +{
       +        return unit_test_callback_next_break_utf8(t, off,
       +                                                  grapheme_next_word_break_utf8,
       +                                                  name, argv0);
       +}
       +
        int
        main(int argc, char *argv[])
        {
                (void)argc;
        
                return run_break_tests(grapheme_next_word_break, word_break_test,
       -                               LEN(word_break_test), argv[0]);
       +                               LEN(word_break_test), argv[0]) +
       +               run_unit_tests(unit_test_callback_next_word_break,
       +                              next_word_break, LEN(next_word_break),
       +                              "grapheme_next_word_break", argv[0]) +
       +               run_unit_tests(unit_test_callback_next_word_break_utf8,
       +                              next_word_break_utf8, LEN(next_word_break_utf8),
       +                              "grapheme_next_word_break_utf8", argv[0]);
        }