utf8toxmlent.c - randomcrap - random crap programs of varying quality
 (HTM) git clone git://git.codemadness.org/randomcrap
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       utf8toxmlent.c (1180B)
       ---
            1 /* Convert UTF-8 codepoints outside the ASCII range to XML numeric entities.
            2    Caveats: the input is not validated, and only single codepoints
            3    encoded in up to 4 bytes are converted.
            4    usage: curl -s 'https://www.w3.org/2001/06/utf-8-test/UTF-8-demo.html' | thisprogram
            5 */
            6 #include <stdio.h>
            7 #include <stdlib.h>
            8 
            9 int
           10 getnext(void)
           11 {
           12         int c;
           13 
           14         if ((c = getchar()) == EOF)
           15                 exit(0);
           16         return c;
           17 }
           18 
           19 int
           20 main(void)
           21 {
           22         long long cp;
           23         unsigned char b0, b1, b2, b3;
           24 
           25         while (1) {
           26                 b0 = getnext();
           27                 if (b0 < 0x80) { /* 1 byte, ASCII */
           28                         putchar(b0);
           29                         continue;
           30                 }
           31 
           32                 if ((b0 & 0xf0) == 0xf0) { /* 4 bytes */
           33                         b1 = getnext();
           34                         b2 = getnext();
           35                         b3 = getnext();
           36 
           37                         b0 = (b0 & ~0xf0);
           38                         b1 = (b1 & ~0x80);
           39                         b2 = (b2 & ~0x80);
           40                         b3 = (b3 & ~0x80);
           41 
           42                         cp = (b0 << 18) | (b1 << 12) | (b2 << 6) | b3;
           43                 } else if ((b0 & 0xe0) == 0xe0) { /* 3 bytes */
           44                         b1 = getnext();
           45                         b2 = getnext();
           46                         
           47                         b0 = (b0 & ~0xe0);
           48                         b1 = (b1 & ~0x80);
           49                         b2 = (b2 & ~0x80);
           50 
           51                         cp = (b0 << 12) | (b1 << 6) | b2;
           52                 } else if ((b0 & 0xc0) == 0xc0) { /* 2 bytes */
           53                         b1 = getnext();
           54                         
           55                         b0 = (b0 & ~0xc0);
           56                         b1 = (b1 & ~0x80);
           57 
           58                         cp = (b0 << 6) | b1;
           59                 }
           60                 printf("&#%lld;", cp);
           61         }
           62 
           63         return 0;
           64 }