cp1252toutf8.c - randomcrap - random crap programs of varying quality
 (HTM) git clone git://git.codemadness.org/randomcrap
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       cp1252toutf8.c (10973B)
       ---
            1 /* convert text encoding cp1252 to utf-8 */
            2 
#include <stdio.h>

#ifdef WIN32
/* fcntl.h required for Windows binary mode */
#include <fcntl.h>
/* io.h declares _setmode() */
#include <io.h>
#endif

#ifdef __OpenBSD__
#include <err.h>
#include <unistd.h>
#endif
           14 
           15 static long charmap[256] = {
           16         0x0000,        /* Null */
           17         0x0001,        /* Start Of Heading */
           18         0x0002,        /* Start Of Text */
           19         0x0003,        /* End Of Text */
           20         0x0004,        /* End Of Transmission */
           21         0x0005,        /* Enquiry */
           22         0x0006,        /* Acknowledge */
           23         0x0007,        /* Bell */
           24         0x0008,        /* Backspace */
           25         0x0009,        /* Horizontal Tabulation */
           26         0x000a,        /* Line Feed */
           27         0x000b,        /* Vertical Tabulation */
           28         0x000c,        /* Form Feed */
           29         0x000d,        /* Carriage Return */
           30         0x000e,        /* Shift Out */
           31         0x000f,        /* Shift In */
           32         0x0010,        /* Data Link Escape */
           33         0x0011,        /* Device Control One */
           34         0x0012,        /* Device Control Two */
           35         0x0013,        /* Device Control Three */
           36         0x0014,        /* Device Control Four */
           37         0x0015,        /* Negative Acknowledge */
           38         0x0016,        /* Synchronous Idle */
           39         0x0017,        /* End Of Transmission Block */
           40         0x0018,        /* Cancel */
           41         0x0019,        /* End Of Medium */
           42         0x001a,        /* Substitute */
           43         0x001b,        /* Escape */
           44         0x001c,        /* File Separator */
           45         0x001d,        /* Group Separator */
           46         0x001e,        /* Record Separator */
           47         0x001f,        /* Unit Separator */
           48         0x0020,        /* Space */
           49         0x0021,        /* Exclamation Mark */
           50         0x0022,        /* Quotation Mark */
           51         0x0023,        /* Number Sign */
           52         0x0024,        /* Dollar Sign */
           53         0x0025,        /* Percent Sign */
           54         0x0026,        /* Ampersand */
           55         0x0027,        /* Apostrophe */
           56         0x0028,        /* Left Parenthesis */
           57         0x0029,        /* Right Parenthesis */
           58         0x002a,        /* Asterisk */
           59         0x002b,        /* Plus Sign */
           60         0x002c,        /* Comma */
           61         0x002d,        /* Hyphen-Minus */
           62         0x002e,        /* Full Stop */
           63         0x002f,        /* Solidus */
           64         0x0030,        /* Digit Zero */
           65         0x0031,        /* Digit One */
           66         0x0032,        /* Digit Two */
           67         0x0033,        /* Digit Three */
           68         0x0034,        /* Digit Four */
           69         0x0035,        /* Digit Five */
           70         0x0036,        /* Digit Six */
           71         0x0037,        /* Digit Seven */
           72         0x0038,        /* Digit Eight */
           73         0x0039,        /* Digit Nine */
           74         0x003a,        /* Colon */
           75         0x003b,        /* Semicolon */
           76         0x003c,        /* Less-Than Sign */
           77         0x003d,        /* Equals Sign */
           78         0x003e,        /* Greater-Than Sign */
           79         0x003f,        /* Question Mark */
           80         0x0040,        /* Commercial At */
           81         0x0041,        /* Latin Capital Letter A */
           82         0x0042,        /* Latin Capital Letter B */
           83         0x0043,        /* Latin Capital Letter C */
           84         0x0044,        /* Latin Capital Letter D */
           85         0x0045,        /* Latin Capital Letter E */
           86         0x0046,        /* Latin Capital Letter F */
           87         0x0047,        /* Latin Capital Letter G */
           88         0x0048,        /* Latin Capital Letter H */
           89         0x0049,        /* Latin Capital Letter I */
           90         0x004a,        /* Latin Capital Letter J */
           91         0x004b,        /* Latin Capital Letter K */
           92         0x004c,        /* Latin Capital Letter L */
           93         0x004d,        /* Latin Capital Letter M */
           94         0x004e,        /* Latin Capital Letter N */
           95         0x004f,        /* Latin Capital Letter O */
           96         0x0050,        /* Latin Capital Letter P */
           97         0x0051,        /* Latin Capital Letter Q */
           98         0x0052,        /* Latin Capital Letter R */
           99         0x0053,        /* Latin Capital Letter S */
          100         0x0054,        /* Latin Capital Letter T */
          101         0x0055,        /* Latin Capital Letter U */
          102         0x0056,        /* Latin Capital Letter V */
          103         0x0057,        /* Latin Capital Letter W */
          104         0x0058,        /* Latin Capital Letter X */
          105         0x0059,        /* Latin Capital Letter Y */
          106         0x005a,        /* Latin Capital Letter Z */
          107         0x005b,        /* Left Square Bracket */
          108         0x005c,        /* Reverse Solidus */
          109         0x005d,        /* Right Square Bracket */
          110         0x005e,        /* Circumflex Accent */
          111         0x005f,        /* Low Line */
          112         0x0060,        /* Grave Accent */
          113         0x0061,        /* Latin Small Letter A */
          114         0x0062,        /* Latin Small Letter B */
          115         0x0063,        /* Latin Small Letter C */
          116         0x0064,        /* Latin Small Letter D */
          117         0x0065,        /* Latin Small Letter E */
          118         0x0066,        /* Latin Small Letter F */
          119         0x0067,        /* Latin Small Letter G */
          120         0x0068,        /* Latin Small Letter H */
          121         0x0069,        /* Latin Small Letter I */
          122         0x006a,        /* Latin Small Letter J */
          123         0x006b,        /* Latin Small Letter K */
          124         0x006c,        /* Latin Small Letter L */
          125         0x006d,        /* Latin Small Letter M */
          126         0x006e,        /* Latin Small Letter N */
          127         0x006f,        /* Latin Small Letter O */
          128         0x0070,        /* Latin Small Letter P */
          129         0x0071,        /* Latin Small Letter Q */
          130         0x0072,        /* Latin Small Letter R */
          131         0x0073,        /* Latin Small Letter S */
          132         0x0074,        /* Latin Small Letter T */
          133         0x0075,        /* Latin Small Letter U */
          134         0x0076,        /* Latin Small Letter V */
          135         0x0077,        /* Latin Small Letter W */
          136         0x0078,        /* Latin Small Letter X */
          137         0x0079,        /* Latin Small Letter Y */
          138         0x007a,        /* Latin Small Letter Z */
          139         0x007b,        /* Left Curly Bracket */
          140         0x007c,        /* Vertical Line */
          141         0x007d,        /* Right Curly Bracket */
          142         0x007e,        /* Tilde */
          143         0x007f,        /* Delete */
          144         0x20ac,        /* Euro Sign */
          145         0x0081,
          146         0x201a,        /* Single Low-9 Quotation Mark */
          147         0x0191,        /* Latin Capital Letter F With Hook */
          148         0x201e,        /* Double Low-9 Quotation Mark */
          149         0x2026,        /* Horizontal Ellipsis */
          150         0x2020,        /* Dagger */
          151         0x2021,        /* Double Dagger */
          152         0x02c6,        /* Modifier Letter Circumflex Accent */
          153         0x2030,        /* Per Mille Sign */
          154         0x0160,        /* Latin Capital Letter S With Caron */
          155         0x2039,        /* Single Left-Pointing Angle Quotation Mark */
          156         0x0152,        /* Latin Capital Ligature Oe */
          157         0x008d,
          158         0x017d,        /* Latin Capital Letter Z With Caron */
          159         0x008f,
          160         0x0090,
          161         0x2018,        /* Left Single Quotation Mark */
          162         0x2019,        /* Right Single Quotation Mark */
          163         0x201c,        /* Left Double Quotation Mark */
          164         0x201d,        /* Right Double Quotation Mark */
          165         0x2022,        /* Bullet */
          166         0x2013,        /* En Dash */
          167         0x2014,        /* Em Dash */
          168         0x02dc,        /* Small Tilde */
          169         0x2122,        /* Trade Mark Sign */
          170         0x0161,        /* Latin Small Letter S With Caron */
          171         0x203a,        /* Single Right-Pointing Angle Quotation Mark */
          172         0x0153,        /* Latin Small Ligature Oe */
          173         0x009d,
          174         0x017e,        /* Latin Small Letter Z With Caron */
          175         0x0178,        /* Latin Capital Letter Y With Diaeresis */
          176         0x00a0,        /* No-Break Space */
          177         0x00a1,        /* Inverted Exclamation Mark */
          178         0x00a2,        /* Cent Sign */
          179         0x00a3,        /* Pound Sign */
          180         0x00a4,        /* Currency Sign */
          181         0x00a5,        /* Yen Sign */
          182         0x00a6,        /* Broken Bar */
          183         0x00a7,        /* Section Sign */
          184         0x00a8,        /* Diaeresis */
          185         0x00a9,        /* Copyright Sign */
          186         0x00aa,        /* Feminine Ordinal Indicator */
          187         0x00ab,        /* Left-Pointing Double Angle Quotation Mark */
          188         0x00ac,        /* Not Sign */
          189         0x00ad,        /* Soft Hyphen */
          190         0x00ae,        /* Registered Sign */
          191         0x00af,        /* Macron */
          192         0x00b0,        /* Degree Sign */
          193         0x00b1,        /* Plus-Minus Sign */
          194         0x00b2,        /* Superscript Two */
          195         0x00b3,        /* Superscript Three */
          196         0x00b4,        /* Acute Accent */
          197         0x00b5,        /* Micro Sign */
          198         0x00b6,        /* Pilcrow Sign */
          199         0x00b7,        /* Middle Dot */
          200         0x00b8,        /* Cedilla */
          201         0x00b9,        /* Superscript One */
          202         0x00ba,        /* Masculine Ordinal Indicator */
          203         0x00bb,        /* Right-Pointing Double Angle Quotation Mark */
          204         0x00bc,        /* Vulgar Fraction One Quarter */
          205         0x00bd,        /* Vulgar Fraction One Half */
          206         0x00be,        /* Vulgar Fraction Three Quarters */
          207         0x00bf,        /* Inverted Question Mark */
          208         0x00c0,        /* Latin Capital Letter A With Grave */
          209         0x00c1,        /* Latin Capital Letter A With Acute */
          210         0x00c2,        /* Latin Capital Letter A With Circumflex */
          211         0x00c3,        /* Latin Capital Letter A With Tilde */
          212         0x00c4,        /* Latin Capital Letter A With Diaeresis */
          213         0x00c5,        /* Latin Capital Letter A With Ring Above */
          214         0x00c6,        /* Latin Capital Ligature Ae */
          215         0x00c7,        /* Latin Capital Letter C With Cedilla */
          216         0x00c8,        /* Latin Capital Letter E With Grave */
          217         0x00c9,        /* Latin Capital Letter E With Acute */
          218         0x00ca,        /* Latin Capital Letter E With Circumflex */
          219         0x00cb,        /* Latin Capital Letter E With Diaeresis */
          220         0x00cc,        /* Latin Capital Letter I With Grave */
          221         0x00cd,        /* Latin Capital Letter I With Acute */
          222         0x00ce,        /* Latin Capital Letter I With Circumflex */
          223         0x00cf,        /* Latin Capital Letter I With Diaeresis */
          224         0x00d0,        /* Latin Capital Letter Eth */
          225         0x00d1,        /* Latin Capital Letter N With Tilde */
          226         0x00d2,        /* Latin Capital Letter O With Grave */
          227         0x00d3,        /* Latin Capital Letter O With Acute */
          228         0x00d4,        /* Latin Capital Letter O With Circumflex */
          229         0x00d5,        /* Latin Capital Letter O With Tilde */
          230         0x00d6,        /* Latin Capital Letter O With Diaeresis */
          231         0x00d7,        /* Multiplication Sign */
          232         0x00d8,        /* Latin Capital Letter O With Stroke */
          233         0x00d9,        /* Latin Capital Letter U With Grave */
          234         0x00da,        /* Latin Capital Letter U With Acute */
          235         0x00db,        /* Latin Capital Letter U With Circumflex */
          236         0x00dc,        /* Latin Capital Letter U With Diaeresis */
          237         0x00dd,        /* Latin Capital Letter Y With Acute */
          238         0x00de,        /* Latin Capital Letter Thorn */
          239         0x00df,        /* Latin Small Letter Sharp S */
          240         0x00e0,        /* Latin Small Letter A With Grave */
          241         0x00e1,        /* Latin Small Letter A With Acute */
          242         0x00e2,        /* Latin Small Letter A With Circumflex */
          243         0x00e3,        /* Latin Small Letter A With Tilde */
          244         0x00e4,        /* Latin Small Letter A With Diaeresis */
          245         0x00e5,        /* Latin Small Letter A With Ring Above */
          246         0x00e6,        /* Latin Small Ligature Ae */
          247         0x00e7,        /* Latin Small Letter C With Cedilla */
          248         0x00e8,        /* Latin Small Letter E With Grave */
          249         0x00e9,        /* Latin Small Letter E With Acute */
          250         0x00ea,        /* Latin Small Letter E With Circumflex */
          251         0x00eb,        /* Latin Small Letter E With Diaeresis */
          252         0x00ec,        /* Latin Small Letter I With Grave */
          253         0x00ed,        /* Latin Small Letter I With Acute */
          254         0x00ee,        /* Latin Small Letter I With Circumflex */
          255         0x00ef,        /* Latin Small Letter I With Diaeresis */
          256         0x00f0,        /* Latin Small Letter Eth */
          257         0x00f1,        /* Latin Small Letter N With Tilde */
          258         0x00f2,        /* Latin Small Letter O With Grave */
          259         0x00f3,        /* Latin Small Letter O With Acute */
          260         0x00f4,        /* Latin Small Letter O With Circumflex */
          261         0x00f5,        /* Latin Small Letter O With Tilde */
          262         0x00f6,        /* Latin Small Letter O With Diaeresis */
          263         0x00f7,        /* Division Sign */
          264         0x00f8,        /* Latin Small Letter O With Stroke */
          265         0x00f9,        /* Latin Small Letter U With Grave */
          266         0x00fa,        /* Latin Small Letter U With Acute */
          267         0x00fb,        /* Latin Small Letter U With Circumflex */
          268         0x00fc,        /* Latin Small Letter U With Diaeresis */
          269         0x00fd,        /* Latin Small Letter Y With Acute */
          270         0x00fe,        /* Latin Small Letter Thorn */
          271         0x00ff,        /* Latin Small Letter Y With Diaeresis */
          272 };
          273 
          274 int
          275 codepointtoutf8(long r, char *s)
          276 {
          277         if (r == 0) {
          278                 return 0; /* NUL byte */
          279         } else if (r <= 0x7F) {
          280                 /* 1 byte: 0aaaaaaa */
          281                 s[0] = r;
          282                 return 1;
          283         } else if (r <= 0x07FF) {
          284                 /* 2 bytes: 00000aaa aabbbbbb */
          285                 s[0] = 0xC0 | ((r & 0x0007C0) >>  6); /* 110aaaaa */
          286                 s[1] = 0x80 |  (r & 0x00003F);        /* 10bbbbbb */
          287                 return 2;
          288         } else if (r <= 0xFFFF) {
          289                 /* 3 bytes: aaaabbbb bbcccccc */
          290                 s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
          291                 s[1] = 0x80 | ((r & 0x000FC0) >>  6); /* 10bbbbbb */
          292                 s[2] = 0x80 |  (r & 0x00003F);        /* 10cccccc */
          293                 return 3;
          294         } else {
          295                 /* 4 bytes: 000aaabb bbbbcccc ccdddddd */
          296                 s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
          297                 s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
          298                 s[2] = 0x80 | ((r & 0x000FC0) >>  6); /* 10cccccc */
          299                 s[3] = 0x80 |  (r & 0x00003F);        /* 10dddddd */
          300                 return 4;
          301         }
          302 }
          303 
          304 int
          305 main(void)
          306 {
          307         char buf[5];
          308         int c, i, n;
          309 
          310 #ifdef __OpenBSD__
          311         if (pledge("stdio", NULL) == -1)
          312                 err(1, "pledge");
          313 #endif
          314 
          315         /* required for Windows binary mode aka more retarded bullshit. */
          316 #if WIN32
          317         _setmode(_fileno(stdin), _O_BINARY);
          318         _setmode(_fileno(stdout), _O_BINARY);
          319         _setmode(_fileno(stderr), _O_BINARY);
          320 #endif
          321 
          322         while ((c = getchar()) != EOF) {
          323                 n = codepointtoutf8(charmap[c], buf);
          324                 for (i = 0; i < n; i++)
          325                         putchar(buf[i]);
          326         }
          327 
          328         if (ferror(stdin)) {
          329                 perror(NULL);
          330                 return 1;
          331         }
          332         return 0;
          333 }