cp1252toutf8.c - randomcrap - random crap programs of varying quality
(HTM) git clone git://git.codemadness.org/randomcrap
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
cp1252toutf8.c (10973B)
---
1 /* convert text encoding cp1252 to utf-8 */
2
3 #include <stdio.h>
4
5 #ifdef WIN32
6 /* fcntl.h required for Windows binary mode */
7 #include <fcntl.h>
8 #endif
9
10 #ifdef __OpenBSD__
11 #include <err.h>
12 #include <unistd.h>
13 #endif
14
/*
 * cp1252 byte value -> Unicode code point, indexed by the input byte.
 *
 * 0x00-0x7f and 0xa0-0xff map to the code point with the same value.
 * 0x80-0x9f hold the cp1252-specific characters; the five bytes cp1252
 * leaves undefined (0x81, 0x8d, 0x8f, 0x90, 0x9d) map to the code point
 * with the same value.
 */
static const long charmap[256] = {
	0x0000, /* Null */
	0x0001, /* Start Of Heading */
	0x0002, /* Start Of Text */
	0x0003, /* End Of Text */
	0x0004, /* End Of Transmission */
	0x0005, /* Enquiry */
	0x0006, /* Acknowledge */
	0x0007, /* Bell */
	0x0008, /* Backspace */
	0x0009, /* Horizontal Tabulation */
	0x000a, /* Line Feed */
	0x000b, /* Vertical Tabulation */
	0x000c, /* Form Feed */
	0x000d, /* Carriage Return */
	0x000e, /* Shift Out */
	0x000f, /* Shift In */
	0x0010, /* Data Link Escape */
	0x0011, /* Device Control One */
	0x0012, /* Device Control Two */
	0x0013, /* Device Control Three */
	0x0014, /* Device Control Four */
	0x0015, /* Negative Acknowledge */
	0x0016, /* Synchronous Idle */
	0x0017, /* End Of Transmission Block */
	0x0018, /* Cancel */
	0x0019, /* End Of Medium */
	0x001a, /* Substitute */
	0x001b, /* Escape */
	0x001c, /* File Separator */
	0x001d, /* Group Separator */
	0x001e, /* Record Separator */
	0x001f, /* Unit Separator */
	0x0020, /* Space */
	0x0021, /* Exclamation Mark */
	0x0022, /* Quotation Mark */
	0x0023, /* Number Sign */
	0x0024, /* Dollar Sign */
	0x0025, /* Percent Sign */
	0x0026, /* Ampersand */
	0x0027, /* Apostrophe */
	0x0028, /* Left Parenthesis */
	0x0029, /* Right Parenthesis */
	0x002a, /* Asterisk */
	0x002b, /* Plus Sign */
	0x002c, /* Comma */
	0x002d, /* Hyphen-Minus */
	0x002e, /* Full Stop */
	0x002f, /* Solidus */
	0x0030, /* Digit Zero */
	0x0031, /* Digit One */
	0x0032, /* Digit Two */
	0x0033, /* Digit Three */
	0x0034, /* Digit Four */
	0x0035, /* Digit Five */
	0x0036, /* Digit Six */
	0x0037, /* Digit Seven */
	0x0038, /* Digit Eight */
	0x0039, /* Digit Nine */
	0x003a, /* Colon */
	0x003b, /* Semicolon */
	0x003c, /* Less-Than Sign */
	0x003d, /* Equals Sign */
	0x003e, /* Greater-Than Sign */
	0x003f, /* Question Mark */
	0x0040, /* Commercial At */
	0x0041, /* Latin Capital Letter A */
	0x0042, /* Latin Capital Letter B */
	0x0043, /* Latin Capital Letter C */
	0x0044, /* Latin Capital Letter D */
	0x0045, /* Latin Capital Letter E */
	0x0046, /* Latin Capital Letter F */
	0x0047, /* Latin Capital Letter G */
	0x0048, /* Latin Capital Letter H */
	0x0049, /* Latin Capital Letter I */
	0x004a, /* Latin Capital Letter J */
	0x004b, /* Latin Capital Letter K */
	0x004c, /* Latin Capital Letter L */
	0x004d, /* Latin Capital Letter M */
	0x004e, /* Latin Capital Letter N */
	0x004f, /* Latin Capital Letter O */
	0x0050, /* Latin Capital Letter P */
	0x0051, /* Latin Capital Letter Q */
	0x0052, /* Latin Capital Letter R */
	0x0053, /* Latin Capital Letter S */
	0x0054, /* Latin Capital Letter T */
	0x0055, /* Latin Capital Letter U */
	0x0056, /* Latin Capital Letter V */
	0x0057, /* Latin Capital Letter W */
	0x0058, /* Latin Capital Letter X */
	0x0059, /* Latin Capital Letter Y */
	0x005a, /* Latin Capital Letter Z */
	0x005b, /* Left Square Bracket */
	0x005c, /* Reverse Solidus */
	0x005d, /* Right Square Bracket */
	0x005e, /* Circumflex Accent */
	0x005f, /* Low Line */
	0x0060, /* Grave Accent */
	0x0061, /* Latin Small Letter A */
	0x0062, /* Latin Small Letter B */
	0x0063, /* Latin Small Letter C */
	0x0064, /* Latin Small Letter D */
	0x0065, /* Latin Small Letter E */
	0x0066, /* Latin Small Letter F */
	0x0067, /* Latin Small Letter G */
	0x0068, /* Latin Small Letter H */
	0x0069, /* Latin Small Letter I */
	0x006a, /* Latin Small Letter J */
	0x006b, /* Latin Small Letter K */
	0x006c, /* Latin Small Letter L */
	0x006d, /* Latin Small Letter M */
	0x006e, /* Latin Small Letter N */
	0x006f, /* Latin Small Letter O */
	0x0070, /* Latin Small Letter P */
	0x0071, /* Latin Small Letter Q */
	0x0072, /* Latin Small Letter R */
	0x0073, /* Latin Small Letter S */
	0x0074, /* Latin Small Letter T */
	0x0075, /* Latin Small Letter U */
	0x0076, /* Latin Small Letter V */
	0x0077, /* Latin Small Letter W */
	0x0078, /* Latin Small Letter X */
	0x0079, /* Latin Small Letter Y */
	0x007a, /* Latin Small Letter Z */
	0x007b, /* Left Curly Bracket */
	0x007c, /* Vertical Line */
	0x007d, /* Right Curly Bracket */
	0x007e, /* Tilde */
	0x007f, /* Delete */
	0x20ac, /* Euro Sign */
	0x0081, /* undefined in cp1252 */
	0x201a, /* Single Low-9 Quotation Mark */
	0x0192, /* Latin Small Letter F With Hook */
	0x201e, /* Double Low-9 Quotation Mark */
	0x2026, /* Horizontal Ellipsis */
	0x2020, /* Dagger */
	0x2021, /* Double Dagger */
	0x02c6, /* Modifier Letter Circumflex Accent */
	0x2030, /* Per Mille Sign */
	0x0160, /* Latin Capital Letter S With Caron */
	0x2039, /* Single Left-Pointing Angle Quotation Mark */
	0x0152, /* Latin Capital Ligature Oe */
	0x008d, /* undefined in cp1252 */
	0x017d, /* Latin Capital Letter Z With Caron */
	0x008f, /* undefined in cp1252 */
	0x0090, /* undefined in cp1252 */
	0x2018, /* Left Single Quotation Mark */
	0x2019, /* Right Single Quotation Mark */
	0x201c, /* Left Double Quotation Mark */
	0x201d, /* Right Double Quotation Mark */
	0x2022, /* Bullet */
	0x2013, /* En Dash */
	0x2014, /* Em Dash */
	0x02dc, /* Small Tilde */
	0x2122, /* Trade Mark Sign */
	0x0161, /* Latin Small Letter S With Caron */
	0x203a, /* Single Right-Pointing Angle Quotation Mark */
	0x0153, /* Latin Small Ligature Oe */
	0x009d, /* undefined in cp1252 */
	0x017e, /* Latin Small Letter Z With Caron */
	0x0178, /* Latin Capital Letter Y With Diaeresis */
	0x00a0, /* No-Break Space */
	0x00a1, /* Inverted Exclamation Mark */
	0x00a2, /* Cent Sign */
	0x00a3, /* Pound Sign */
	0x00a4, /* Currency Sign */
	0x00a5, /* Yen Sign */
	0x00a6, /* Broken Bar */
	0x00a7, /* Section Sign */
	0x00a8, /* Diaeresis */
	0x00a9, /* Copyright Sign */
	0x00aa, /* Feminine Ordinal Indicator */
	0x00ab, /* Left-Pointing Double Angle Quotation Mark */
	0x00ac, /* Not Sign */
	0x00ad, /* Soft Hyphen */
	0x00ae, /* Registered Sign */
	0x00af, /* Macron */
	0x00b0, /* Degree Sign */
	0x00b1, /* Plus-Minus Sign */
	0x00b2, /* Superscript Two */
	0x00b3, /* Superscript Three */
	0x00b4, /* Acute Accent */
	0x00b5, /* Micro Sign */
	0x00b6, /* Pilcrow Sign */
	0x00b7, /* Middle Dot */
	0x00b8, /* Cedilla */
	0x00b9, /* Superscript One */
	0x00ba, /* Masculine Ordinal Indicator */
	0x00bb, /* Right-Pointing Double Angle Quotation Mark */
	0x00bc, /* Vulgar Fraction One Quarter */
	0x00bd, /* Vulgar Fraction One Half */
	0x00be, /* Vulgar Fraction Three Quarters */
	0x00bf, /* Inverted Question Mark */
	0x00c0, /* Latin Capital Letter A With Grave */
	0x00c1, /* Latin Capital Letter A With Acute */
	0x00c2, /* Latin Capital Letter A With Circumflex */
	0x00c3, /* Latin Capital Letter A With Tilde */
	0x00c4, /* Latin Capital Letter A With Diaeresis */
	0x00c5, /* Latin Capital Letter A With Ring Above */
	0x00c6, /* Latin Capital Ligature Ae */
	0x00c7, /* Latin Capital Letter C With Cedilla */
	0x00c8, /* Latin Capital Letter E With Grave */
	0x00c9, /* Latin Capital Letter E With Acute */
	0x00ca, /* Latin Capital Letter E With Circumflex */
	0x00cb, /* Latin Capital Letter E With Diaeresis */
	0x00cc, /* Latin Capital Letter I With Grave */
	0x00cd, /* Latin Capital Letter I With Acute */
	0x00ce, /* Latin Capital Letter I With Circumflex */
	0x00cf, /* Latin Capital Letter I With Diaeresis */
	0x00d0, /* Latin Capital Letter Eth */
	0x00d1, /* Latin Capital Letter N With Tilde */
	0x00d2, /* Latin Capital Letter O With Grave */
	0x00d3, /* Latin Capital Letter O With Acute */
	0x00d4, /* Latin Capital Letter O With Circumflex */
	0x00d5, /* Latin Capital Letter O With Tilde */
	0x00d6, /* Latin Capital Letter O With Diaeresis */
	0x00d7, /* Multiplication Sign */
	0x00d8, /* Latin Capital Letter O With Stroke */
	0x00d9, /* Latin Capital Letter U With Grave */
	0x00da, /* Latin Capital Letter U With Acute */
	0x00db, /* Latin Capital Letter U With Circumflex */
	0x00dc, /* Latin Capital Letter U With Diaeresis */
	0x00dd, /* Latin Capital Letter Y With Acute */
	0x00de, /* Latin Capital Letter Thorn */
	0x00df, /* Latin Small Letter Sharp S */
	0x00e0, /* Latin Small Letter A With Grave */
	0x00e1, /* Latin Small Letter A With Acute */
	0x00e2, /* Latin Small Letter A With Circumflex */
	0x00e3, /* Latin Small Letter A With Tilde */
	0x00e4, /* Latin Small Letter A With Diaeresis */
	0x00e5, /* Latin Small Letter A With Ring Above */
	0x00e6, /* Latin Small Ligature Ae */
	0x00e7, /* Latin Small Letter C With Cedilla */
	0x00e8, /* Latin Small Letter E With Grave */
	0x00e9, /* Latin Small Letter E With Acute */
	0x00ea, /* Latin Small Letter E With Circumflex */
	0x00eb, /* Latin Small Letter E With Diaeresis */
	0x00ec, /* Latin Small Letter I With Grave */
	0x00ed, /* Latin Small Letter I With Acute */
	0x00ee, /* Latin Small Letter I With Circumflex */
	0x00ef, /* Latin Small Letter I With Diaeresis */
	0x00f0, /* Latin Small Letter Eth */
	0x00f1, /* Latin Small Letter N With Tilde */
	0x00f2, /* Latin Small Letter O With Grave */
	0x00f3, /* Latin Small Letter O With Acute */
	0x00f4, /* Latin Small Letter O With Circumflex */
	0x00f5, /* Latin Small Letter O With Tilde */
	0x00f6, /* Latin Small Letter O With Diaeresis */
	0x00f7, /* Division Sign */
	0x00f8, /* Latin Small Letter O With Stroke */
	0x00f9, /* Latin Small Letter U With Grave */
	0x00fa, /* Latin Small Letter U With Acute */
	0x00fb, /* Latin Small Letter U With Circumflex */
	0x00fc, /* Latin Small Letter U With Diaeresis */
	0x00fd, /* Latin Small Letter Y With Acute */
	0x00fe, /* Latin Small Letter Thorn */
	0x00ff, /* Latin Small Letter Y With Diaeresis */
};
273
/*
 * Encode code point r as UTF-8 into s.
 *
 * Returns the number of bytes stored in s (1 to 4). Nothing is stored and
 * 0 is returned when r is 0. The output is not NUL-terminated; s must have
 * room for 4 bytes.
 */
int
codepointtoutf8(long r, char *s)
{
	/* code point 0 (NUL) produces no output */
	if (r == 0)
		return 0;

	/* one byte: 0aaaaaaa */
	if (r <= 0x7F) {
		s[0] = r;
		return 1;
	}

	/* two bytes: 110aaaaa 10bbbbbb */
	if (r <= 0x07FF) {
		s[0] = 0xC0 | ((r >> 6) & 0x1F);
		s[1] = 0x80 | (r & 0x3F);
		return 2;
	}

	/* three bytes: 1110aaaa 10bbbbbb 10cccccc */
	if (r <= 0xFFFF) {
		s[0] = 0xE0 | ((r >> 12) & 0x0F);
		s[1] = 0x80 | ((r >> 6) & 0x3F);
		s[2] = 0x80 | (r & 0x3F);
		return 3;
	}

	/* everything above: 11110aaa 10bbbbbb 10cccccc 10dddddd */
	s[0] = 0xF0 | ((r >> 18) & 0x07);
	s[1] = 0x80 | ((r >> 12) & 0x3F);
	s[2] = 0x80 | ((r >> 6) & 0x3F);
	s[3] = 0x80 | (r & 0x3F);
	return 4;
}
303
304 int
305 main(void)
306 {
307 char buf[5];
308 int c, i, n;
309
310 #ifdef __OpenBSD__
311 if (pledge("stdio", NULL) == -1)
312 err(1, "pledge");
313 #endif
314
315 /* required for Windows binary mode aka more retarded bullshit. */
316 #if WIN32
317 _setmode(_fileno(stdin), _O_BINARY);
318 _setmode(_fileno(stdout), _O_BINARY);
319 _setmode(_fileno(stderr), _O_BINARY);
320 #endif
321
322 while ((c = getchar()) != EOF) {
323 n = codepointtoutf8(charmap[c], buf);
324 for (i = 0; i < n; i++)
325 putchar(buf[i]);
326 }
327
328 if (ferror(stdin)) {
329 perror(NULL);
330 return 1;
331 }
332 return 0;
333 }