cp1250toutf8.c - randomcrap - random crap programs of varying quality
 (HTM) git clone git://git.codemadness.org/randomcrap
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       cp1250toutf8.c (11177B)
       ---
            1 /* convert text encoding cp1250 to utf-8 */
            2 
#include <stdio.h>

#ifdef WIN32
/* Windows binary mode: fcntl.h provides _O_BINARY, io.h declares _setmode() */
#include <fcntl.h>
#include <io.h>
#endif

#ifdef __OpenBSD__
#include <err.h>
#include <unistd.h>
#endif
           14 
           15 static long charmap[256] = {
           16         0x0000, /* Null */
           17         0x0001, /* Start Of Heading */
           18         0x0002, /* Start Of Text */
           19         0x0003, /* End Of Text */
           20         0x0004, /* End Of Transmission */
           21         0x0005, /* Enquiry */
           22         0x0006, /* Acknowledge */
           23         0x0007, /* Bell */
           24         0x0008, /* Backspace */
           25         0x0009, /* Horizontal Tabulation */
           26         0x000a, /* Line Feed */
           27         0x000b, /* Vertical Tabulation */
           28         0x000c, /* Form Feed */
           29         0x000d, /* Carriage Return */
           30         0x000e, /* Shift Out */
           31         0x000f, /* Shift In */
           32         0x0010, /* Data Link Escape */
           33         0x0011, /* Device Control One */
           34         0x0012, /* Device Control Two */
           35         0x0013, /* Device Control Three */
           36         0x0014, /* Device Control Four */
           37         0x0015, /* Negative Acknowledge */
           38         0x0016, /* Synchronous Idle */
           39         0x0017, /* End Of Transmission Block */
           40         0x0018, /* Cancel */
           41         0x0019, /* End Of Medium */
           42         0x001a, /* Substitute */
           43         0x001b, /* Escape */
           44         0x001c, /* File Separator */
           45         0x001d, /* Group Separator */
           46         0x001e, /* Record Separator */
           47         0x001f, /* Unit Separator */
           48         0x0020, /* Space */
           49         0x0021, /* Exclamation Mark */
           50         0x0022, /* Quotation Mark */
           51         0x0023, /* Number Sign */
           52         0x0024, /* Dollar Sign */
           53         0x0025, /* Percent Sign */
           54         0x0026, /* Ampersand */
           55         0x0027, /* Apostrophe */
           56         0x0028, /* Left Parenthesis */
           57         0x0029, /* Right Parenthesis */
           58         0x002a, /* Asterisk */
           59         0x002b, /* Plus Sign */
           60         0x002c, /* Comma */
           61         0x002d, /* Hyphen-Minus */
           62         0x002e, /* Full Stop */
           63         0x002f, /* Solidus */
           64         0x0030, /* Digit Zero */
           65         0x0031, /* Digit One */
           66         0x0032, /* Digit Two */
           67         0x0033, /* Digit Three */
           68         0x0034, /* Digit Four */
           69         0x0035, /* Digit Five */
           70         0x0036, /* Digit Six */
           71         0x0037, /* Digit Seven */
           72         0x0038, /* Digit Eight */
           73         0x0039, /* Digit Nine */
           74         0x003a, /* Colon */
           75         0x003b, /* Semicolon */
           76         0x003c, /* Less-Than Sign */
           77         0x003d, /* Equals Sign */
           78         0x003e, /* Greater-Than Sign */
           79         0x003f, /* Question Mark */
           80         0x0040, /* Commercial At */
           81         0x0041, /* Latin Capital Letter A */
           82         0x0042, /* Latin Capital Letter B */
           83         0x0043, /* Latin Capital Letter C */
           84         0x0044, /* Latin Capital Letter D */
           85         0x0045, /* Latin Capital Letter E */
           86         0x0046, /* Latin Capital Letter F */
           87         0x0047, /* Latin Capital Letter G */
           88         0x0048, /* Latin Capital Letter H */
           89         0x0049, /* Latin Capital Letter I */
           90         0x004a, /* Latin Capital Letter J */
           91         0x004b, /* Latin Capital Letter K */
           92         0x004c, /* Latin Capital Letter L */
           93         0x004d, /* Latin Capital Letter M */
           94         0x004e, /* Latin Capital Letter N */
           95         0x004f, /* Latin Capital Letter O */
           96         0x0050, /* Latin Capital Letter P */
           97         0x0051, /* Latin Capital Letter Q */
           98         0x0052, /* Latin Capital Letter R */
           99         0x0053, /* Latin Capital Letter S */
          100         0x0054, /* Latin Capital Letter T */
          101         0x0055, /* Latin Capital Letter U */
          102         0x0056, /* Latin Capital Letter V */
          103         0x0057, /* Latin Capital Letter W */
          104         0x0058, /* Latin Capital Letter X */
          105         0x0059, /* Latin Capital Letter Y */
          106         0x005a, /* Latin Capital Letter Z */
          107         0x005b, /* Left Square Bracket */
          108         0x005c, /* Reverse Solidus */
          109         0x005d, /* Right Square Bracket */
          110         0x005e, /* Circumflex Accent */
          111         0x005f, /* Low Line */
          112         0x0060, /* Grave Accent */
          113         0x0061, /* Latin Small Letter A */
          114         0x0062, /* Latin Small Letter B */
          115         0x0063, /* Latin Small Letter C */
          116         0x0064, /* Latin Small Letter D */
          117         0x0065, /* Latin Small Letter E */
          118         0x0066, /* Latin Small Letter F */
          119         0x0067, /* Latin Small Letter G */
          120         0x0068, /* Latin Small Letter H */
          121         0x0069, /* Latin Small Letter I */
          122         0x006a, /* Latin Small Letter J */
          123         0x006b, /* Latin Small Letter K */
          124         0x006c, /* Latin Small Letter L */
          125         0x006d, /* Latin Small Letter M */
          126         0x006e, /* Latin Small Letter N */
          127         0x006f, /* Latin Small Letter O */
          128         0x0070, /* Latin Small Letter P */
          129         0x0071, /* Latin Small Letter Q */
          130         0x0072, /* Latin Small Letter R */
          131         0x0073, /* Latin Small Letter S */
          132         0x0074, /* Latin Small Letter T */
          133         0x0075, /* Latin Small Letter U */
          134         0x0076, /* Latin Small Letter V */
          135         0x0077, /* Latin Small Letter W */
          136         0x0078, /* Latin Small Letter X */
          137         0x0079, /* Latin Small Letter Y */
          138         0x007a, /* Latin Small Letter Z */
          139         0x007b, /* Left Curly Bracket */
          140         0x007c, /* Vertical Line */
          141         0x007d, /* Right Curly Bracket */
          142         0x007e, /* Tilde */
          143         0x007f, /* Delete */
          144         0x20ac, /* Euro Sign */
          145         0x0081, /*  */
          146         0x201a, /* Single Low-9 Quotation Mark */
          147         0x0083, /*  */
          148         0x201e, /* Double Low-9 Quotation Mark */
          149         0x2026, /* Horizontal Ellipsis */
          150         0x2020, /* Dagger */
          151         0x2021, /* Double Dagger */
          152         0x0088, /*  */
          153         0x2030, /* Per Mille Sign */
          154         0x0160, /* Latin Capital Letter S With Caron */
          155         0x2039, /* Single Left-Pointing Angle Quotation Mark */
          156         0x015a, /* Latin Capital Letter S With Acute */
          157         0x0164, /* Latin Capital Letter T With Caron */
          158         0x017d, /* Latin Capital Letter Z With Caron */
          159         0x0179, /* Latin Capital Letter Z With Acute */
          160         0x0090, /*  */
          161         0x2018, /* Left Single Quotation Mark */
          162         0x2019, /* Right Single Quotation Mark */
          163         0x201c, /* Left Double Quotation Mark */
          164         0x201d, /* Right Double Quotation Mark */
          165         0x2022, /* Bullet */
          166         0x2013, /* En Dash */
          167         0x2014, /* Em Dash */
          168         0x0098, /*  */
          169         0x2122, /* Trade Mark Sign */
          170         0x0161, /* Latin Small Letter S With Caron */
          171         0x203a, /* Single Right-Pointing Angle Quotation Mark */
          172         0x015b, /* Latin Small Letter S With Acute */
          173         0x0165, /* Latin Small Letter T With Caron */
          174         0x017e, /* Latin Small Letter Z With Caron */
          175         0x017a, /* Latin Small Letter Z With Acute */
          176         0x00a0, /* No-Break Space */
          177         0x02c7, /* Caron */
          178         0x02d8, /* Breve */
          179         0x0141, /* Latin Capital Letter L With Stroke */
          180         0x00a4, /* Currency Sign */
          181         0x0104, /* Latin Capital Letter A With Ogonek */
          182         0x00a6, /* Broken Bar */
          183         0x00a7, /* Section Sign */
          184         0x00a8, /* Diaeresis */
          185         0x00a9, /* Copyright Sign */
          186         0x015e, /* Latin Capital Letter S With Cedilla */
          187         0x00ab, /* Left-Pointing Double Angle Quotation Mark */
          188         0x00ac, /* Not Sign */
          189         0x00ad, /* Soft Hyphen */
          190         0x00ae, /* Registered Sign */
          191         0x017b, /* Latin Capital Letter Z With Dot Above */
          192         0x00b0, /* Degree Sign */
          193         0x00b1, /* Plus-Minus Sign */
          194         0x02db, /* Ogonek */
          195         0x0142, /* Latin Small Letter L With Stroke */
          196         0x00b4, /* Acute Accent */
          197         0x00b5, /* Micro Sign */
          198         0x00b6, /* Pilcrow Sign */
          199         0x00b7, /* Middle Dot */
          200         0x00b8, /* Cedilla */
          201         0x0105, /* Latin Small Letter A With Ogonek */
          202         0x015f, /* Latin Small Letter S With Cedilla */
          203         0x00bb, /* Right-Pointing Double Angle Quotation Mark */
          204         0x013d, /* Latin Capital Letter L With Caron */
          205         0x02dd, /* Double Acute Accent */
          206         0x013e, /* Latin Small Letter L With Caron */
          207         0x017c, /* Latin Small Letter Z With Dot Above */
          208         0x0154, /* Latin Capital Letter R With Acute */
          209         0x00c1, /* Latin Capital Letter A With Acute */
          210         0x00c2, /* Latin Capital Letter A With Circumflex */
          211         0x0102, /* Latin Capital Letter A With Breve */
          212         0x00c4, /* Latin Capital Letter A With Diaeresis */
          213         0x0139, /* Latin Capital Letter L With Acute */
          214         0x0106, /* Latin Capital Letter C With Acute */
          215         0x00c7, /* Latin Capital Letter C With Cedilla */
          216         0x010c, /* Latin Capital Letter C With Caron */
          217         0x00c9, /* Latin Capital Letter E With Acute */
          218         0x0118, /* Latin Capital Letter E With Ogonek */
          219         0x00cb, /* Latin Capital Letter E With Diaeresis */
          220         0x011a, /* Latin Capital Letter E With Caron */
          221         0x00cd, /* Latin Capital Letter I With Acute */
          222         0x00ce, /* Latin Capital Letter I With Circumflex */
          223         0x010e, /* Latin Capital Letter D With Caron */
          224         0x0110, /* Latin Capital Letter D With Stroke */
          225         0x0143, /* Latin Capital Letter N With Acute */
          226         0x0147, /* Latin Capital Letter N With Caron */
          227         0x00d3, /* Latin Capital Letter O With Acute */
          228         0x00d4, /* Latin Capital Letter O With Circumflex */
          229         0x0150, /* Latin Capital Letter O With Double Acute */
          230         0x00d6, /* Latin Capital Letter O With Diaeresis */
          231         0x00d7, /* Multiplication Sign */
          232         0x0158, /* Latin Capital Letter R With Caron */
          233         0x016e, /* Latin Capital Letter U With Ring Above */
          234         0x00da, /* Latin Capital Letter U With Acute */
          235         0x0170, /* Latin Capital Letter U With Double Acute */
          236         0x00dc, /* Latin Capital Letter U With Diaeresis */
          237         0x00dd, /* Latin Capital Letter Y With Acute */
          238         0x0162, /* Latin Capital Letter T With Cedilla */
          239         0x00df, /* Latin Small Letter Sharp S */
          240         0x0155, /* Latin Small Letter R With Acute */
          241         0x00e1, /* Latin Small Letter A With Acute */
          242         0x00e2, /* Latin Small Letter A With Circumflex */
          243         0x0103, /* Latin Small Letter A With Breve */
          244         0x00e4, /* Latin Small Letter A With Diaeresis */
          245         0x013a, /* Latin Small Letter L With Acute */
          246         0x0107, /* Latin Small Letter C With Acute */
          247         0x00e7, /* Latin Small Letter C With Cedilla */
          248         0x010d, /* Latin Small Letter C With Caron */
          249         0x00e9, /* Latin Small Letter E With Acute */
          250         0x0119, /* Latin Small Letter E With Ogonek */
          251         0x00eb, /* Latin Small Letter E With Diaeresis */
          252         0x011b, /* Latin Small Letter E With Caron */
          253         0x00ed, /* Latin Small Letter I With Acute */
          254         0x00ee, /* Latin Small Letter I With Circumflex */
          255         0x010f, /* Latin Small Letter D With Caron */
          256         0x0111, /* Latin Small Letter D With Stroke */
          257         0x0144, /* Latin Small Letter N With Acute */
          258         0x0148, /* Latin Small Letter N With Caron */
          259         0x00f3, /* Latin Small Letter O With Acute */
          260         0x00f4, /* Latin Small Letter O With Circumflex */
          261         0x0151, /* Latin Small Letter O With Double Acute */
          262         0x00f6, /* Latin Small Letter O With Diaeresis */
          263         0x00f7, /* Division Sign */
          264         0x0159, /* Latin Small Letter R With Caron */
          265         0x016f, /* Latin Small Letter U With Ring Above */
          266         0x00fa, /* Latin Small Letter U With Acute */
          267         0x0171, /* Latin Small Letter U With Double Acute */
          268         0x00fc, /* Latin Small Letter U With Diaeresis */
          269         0x00fd, /* Latin Small Letter Y With Acute */
          270         0x0163, /* Latin Small Letter T With Cedilla */
          271         0x02d9, /* Dot Above */
          272 };
          273 
          274 int
          275 codepointtoutf8(long r, char *s)
          276 {
          277         if (r == 0) {
          278                 return 0; /* NUL byte */
          279         } else if (r <= 0x7F) {
          280                 /* 1 byte: 0aaaaaaa */
          281                 s[0] = r;
          282                 return 1;
          283         } else if (r <= 0x07FF) {
          284                 /* 2 bytes: 00000aaa aabbbbbb */
          285                 s[0] = 0xC0 | ((r & 0x0007C0) >>  6); /* 110aaaaa */
          286                 s[1] = 0x80 |  (r & 0x00003F);        /* 10bbbbbb */
          287                 return 2;
          288         } else if (r <= 0xFFFF) {
          289                 /* 3 bytes: aaaabbbb bbcccccc */
          290                 s[0] = 0xE0 | ((r & 0x00F000) >> 12); /* 1110aaaa */
          291                 s[1] = 0x80 | ((r & 0x000FC0) >>  6); /* 10bbbbbb */
          292                 s[2] = 0x80 |  (r & 0x00003F);        /* 10cccccc */
          293                 return 3;
          294         } else {
          295                 /* 4 bytes: 000aaabb bbbbcccc ccdddddd */
          296                 s[0] = 0xF0 | ((r & 0x1C0000) >> 18); /* 11110aaa */
          297                 s[1] = 0x80 | ((r & 0x03F000) >> 12); /* 10bbbbbb */
          298                 s[2] = 0x80 | ((r & 0x000FC0) >>  6); /* 10cccccc */
          299                 s[3] = 0x80 |  (r & 0x00003F);        /* 10dddddd */
          300                 return 4;
          301         }
          302 }
          303 
          304 int
          305 main(void)
          306 {
          307         char buf[5];
          308         int c, i, n;
          309 
          310 #ifdef __OpenBSD__
          311         if (pledge("stdio", NULL) == -1)
          312                 err(1, "pledge");
          313 #endif
          314 
          315         /* required for Windows binary mode aka more retarded bullshit. */
          316 #if WIN32
          317         _setmode(_fileno(stdin), _O_BINARY);
          318         _setmode(_fileno(stdout), _O_BINARY);
          319         _setmode(_fileno(stderr), _O_BINARY);
          320 #endif
          321 
          322         while ((c = getchar()) != EOF) {
          323                 n = codepointtoutf8(charmap[c], buf);
          324                 for (i = 0; i < n; i++)
          325                         putchar(buf[i]);
          326         }
          327 
          328         if (ferror(stdin)) {
          329                 perror(NULL);
          330                 return 1;
          331         }
          332         return 0;
          333 }