word.c - libgrapheme - unicode string library
(HTM) git clone git://git.suckless.org/libgrapheme
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
word.c (3038B)
---
1 /* See LICENSE file for copyright and license details. */
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5
6 #include "util.h"
7
8 #define FILE_EMOJI "data/emoji-data.txt"
9 #define FILE_WORD "data/WordBreakProperty.txt"
10
11 static const struct property_spec word_break_property[] = {
12 {
13 .enumname = "OTHER",
14 .file = NULL,
15 .ucdname = NULL,
16 },
17 {
18 .enumname = "ALETTER",
19 .file = FILE_WORD,
20 .ucdname = "ALetter",
21 },
22 {
23 .enumname = "BOTH_ALETTER_EXTPICT",
24 .file = NULL,
25 .ucdname = NULL,
26 },
27 {
28 .enumname = "CR",
29 .file = FILE_WORD,
30 .ucdname = "CR",
31 },
32 {
33 .enumname = "DOUBLE_QUOTE",
34 .file = FILE_WORD,
35 .ucdname = "Double_Quote",
36 },
37 {
38 .enumname = "EXTEND",
39 .file = FILE_WORD,
40 .ucdname = "Extend",
41 },
42 {
43 .enumname = "EXTENDED_PICTOGRAPHIC",
44 .file = FILE_EMOJI,
45 .ucdname = "Extended_Pictographic",
46 },
47 {
48 .enumname = "EXTENDNUMLET",
49 .file = FILE_WORD,
50 .ucdname = "ExtendNumLet",
51 },
52 {
53 .enumname = "FORMAT",
54 .file = FILE_WORD,
55 .ucdname = "Format",
56 },
57 {
58 .enumname = "HEBREW_LETTER",
59 .file = FILE_WORD,
60 .ucdname = "Hebrew_Letter",
61 },
62 {
63 .enumname = "KATAKANA",
64 .file = FILE_WORD,
65 .ucdname = "Katakana",
66 },
67 {
68 .enumname = "LF",
69 .file = FILE_WORD,
70 .ucdname = "LF",
71 },
72 {
73 .enumname = "MIDLETTER",
74 .file = FILE_WORD,
75 .ucdname = "MidLetter",
76 },
77 {
78 .enumname = "MIDNUM",
79 .file = FILE_WORD,
80 .ucdname = "MidNum",
81 },
82 {
83 .enumname = "MIDNUMLET",
84 .file = FILE_WORD,
85 .ucdname = "MidNumLet",
86 },
87 {
88 .enumname = "NEWLINE",
89 .file = FILE_WORD,
90 .ucdname = "Newline",
91 },
92 {
93 .enumname = "NUMERIC",
94 .file = FILE_WORD,
95 .ucdname = "Numeric",
96 },
97 {
98 .enumname = "REGIONAL_INDICATOR",
99 .file = FILE_WORD,
100 .ucdname = "Regional_Indicator",
101 },
102 {
103 .enumname = "SINGLE_QUOTE",
104 .file = FILE_WORD,
105 .ucdname = "Single_Quote",
106 },
107 {
108 .enumname = "WSEGSPACE",
109 .file = FILE_WORD,
110 .ucdname = "WSegSpace",
111 },
112 {
113 .enumname = "ZWJ",
114 .file = FILE_WORD,
115 .ucdname = "ZWJ",
116 },
117 };
118
119 static uint_least8_t
120 handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2)
121 {
122 uint_least8_t result;
123
124 (void)cp;
125
126 if ((!strcmp(word_break_property[prop1].enumname, "ALETTER") &&
127 !strcmp(word_break_property[prop2].enumname,
128 "EXTENDED_PICTOGRAPHIC")) ||
129 (!strcmp(word_break_property[prop1].enumname,
130 "EXTENDED_PICTOGRAPHIC") &&
131 !strcmp(word_break_property[prop2].enumname, "ALETTER"))) {
132 for (result = 0; result < LEN(word_break_property); result++) {
133 if (!strcmp(word_break_property[result].enumname,
134 "BOTH_ALETTER_EXTPICT")) {
135 break;
136 }
137 }
138 if (result == LEN(word_break_property)) {
139 fprintf(stderr, "handle_conflict: Internal error.\n");
140 exit(1);
141 }
142 } else {
143 fprintf(stderr, "handle_conflict: Cannot handle conflict.\n");
144 exit(1);
145 }
146
147 return result;
148 }
149
150 int
151 main(int argc, char *argv[])
152 {
153 (void)argc;
154
155 properties_generate_break_property(
156 word_break_property, LEN(word_break_property), NULL,
157 handle_conflict, NULL, "word_break", argv[0]);
158
159 return 0;
160 }