utf8toxmlent.c - randomcrap - random crap programs of varying quality
(HTM) git clone git://git.codemadness.org/randomcrap
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
utf8toxmlent.c (1180B)
---
1 /* convert UTF-8 codepoints outside the ASCII range to XML numeric entities
2 Some caveats: it does not do any validation. only converts single
3 codepoints up to 4 bytes.
4 usage: curl -s 'https://www.w3.org/2001/06/utf-8-test/UTF-8-demo.html' | thisprogram
5 */
6 #include <stdio.h>
7 #include <stdlib.h>
8
/*
 * Read the next byte from stdin.
 *
 * Returns the byte value as a non-negative int. This function does not
 * return once input is exhausted: it terminates the process, with status 0
 * on a clean end of file and status 1 if the stream reports a read error.
 * exit() flushes stdout, so everything written so far is still emitted.
 */
int
getnext(void)
{
	int c;

	c = getchar();
	if (c == EOF) {
		/* getchar() returns EOF for both end of input and read errors;
		 * only ferror() tells them apart */
		exit(ferror(stdin) ? 1 : 0);
	}
	return c;
}
18
/*
 * Copy stdin to stdout, passing ASCII bytes through unchanged and replacing
 * every multi-byte UTF-8 sequence (2 to 4 bytes) with an XML decimal
 * character reference of the form "&#N;".
 *
 * The length of a sequence is taken from the lead byte. A byte that cannot
 * start a sequence (a stray continuation byte 0x80..0xBF, or 0xF8..0xFF) is
 * emitted as U+FFFD, the replacement character, instead of printing an
 * unset or stale codepoint. Continuation bytes are not validated; only
 * their low 6 bits are used.
 *
 * The loop never ends on its own: getnext() terminates the process when
 * input runs out, so a sequence truncated by end of input is dropped.
 */
int
main(void)
{
	long long cp;
	unsigned char b0, b1, b2, b3;

	while (1) {
		b0 = getnext();
		if (b0 < 0x80) { /* 0xxxxxxx: 1 byte, ASCII */
			putchar(b0);
			continue;
		}

		if ((b0 & 0xf8) == 0xf0) { /* 11110xxx: 4 bytes */
			b1 = getnext() & 0x3f;
			b2 = getnext() & 0x3f;
			b3 = getnext() & 0x3f;

			/* widen before shifting so the result does not depend
			 * on the width of int */
			cp = ((long long)(b0 & 0x07) << 18) |
			     ((long long)b1 << 12) |
			     ((long long)b2 << 6) |
			     (long long)b3;
		} else if ((b0 & 0xf0) == 0xe0) { /* 1110xxxx: 3 bytes */
			b1 = getnext() & 0x3f;
			b2 = getnext() & 0x3f;

			cp = ((long long)(b0 & 0x0f) << 12) |
			     ((long long)b1 << 6) |
			     (long long)b2;
		} else if ((b0 & 0xe0) == 0xc0) { /* 110xxxxx: 2 bytes */
			b1 = getnext() & 0x3f;

			cp = ((long long)(b0 & 0x1f) << 6) | (long long)b1;
		} else {
			/* not a valid lead byte: consume just this byte and
			 * substitute U+FFFD REPLACEMENT CHARACTER */
			cp = 0xfffd;
		}
		printf("&#%lld;", cp);
	}

	/* not reached: getnext() exits the process at end of input */
	return 0;
}