Received: from danpost2.uni-c.dk by dkuug.dk via EUnet with SMTP (5.64+/8+bit/IDA-1.2.8) id AA12183; Wed, 4 Sep 91 02:03:43 +0200 Received: from vm.uni-c.dk by danpost2.uni-c.dk (5.65c+/1.34) id AA15375; Wed, 4 Sep 1991 00:04:03 GMT Message-Id: <199109040004.AA15375@danpost2.uni-c.dk> Received: from vm.uni-c.dk by vm.uni-c.dk (IBM VM SMTP V2R1) with BSMTP id 0815; Wed, 04 Sep 91 02:05:47 DNT Received: from SEARN.SUNET.SE by vm.uni-c.dk (Mailer R2.07) with BSMTP id 5755; Wed, 04 Sep 91 02:05:45 DNT Received: from SEARN.BITNET by SEARN.SUNET.SE (Mailer R2.05) with BSMTP id 0515; Wed, 04 Sep 91 02:07:34 +0200 Date: Tue, 3 Sep 1991 17:01:01 U Reply-To: Multi-byte Code Issues Sender: Multi-byte Code Issues From: Mark Davis Subject: ATM C Implementation X-To: unicore@Eng.Sun.COM, ISO10646%JHUVM.BITNET@cunyvm.cuny.edu, ansix3l2%JHUVM.BITNET@cunyvm.cuny.edu To: Multiple recipients of list ISO10646 X-Charset: US-DK X-Char-Esc: 29 Status: RO Subject: Time:4:34 PM OFFICE MEMO ATM C Implementation Date:9/3/91 Here is an implementation of the ATM algorithm, for those interested. Any feedback would be welcome.--Mark ///////////////////////////////////////////////////////////////// // A Transformation Method // Author: Mark Davis // Date: August 30, 1991 // The following is a C test implementation of the ATM algorithm // described in the C0 committee report (see that text for // details as to the purpose and requirements). // The details of the algorithm are somewhat changed from that // report, to correct some bugs and take into account some // results of the WG2 meeting. 
/////////////////////////////////////////////////////////////////

// NOTE(review): the two header names that followed #include were
// lost when this message was archived. Nothing below calls a
// library routine; <stdio.h> and <stdlib.h> are an assumption --
// confirm against the original posting.
#include <stdio.h>
#include <stdlib.h>

typedef unsigned char   ubyte;
typedef unsigned short  ushort;
typedef unsigned long   ucs;            // one UCS character value
typedef short           index;
typedef short           bufferLength;

// Define the home-grown Boolean constants only when neither the
// compiler nor <stdbool.h> already supplies true/false.
#if !defined(__cplusplus) && !defined(true) && !defined(false) && \
    !(defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L)
enum {false, true};
#endif
typedef unsigned char   Boolean;

// Byte ranges: C0 controls plus SPACE, G0 graphics, DEL plus the
// C1 controls, G1 graphics.
#define c0Start     0x00
#define c0End       0x20
#define g0Start     0x21
#define g0End       0x7E
#define c1Start     0x7F
#define c1End       0x9F
#define g1Start     0xA0
#define g1End       0xFF
#define uStart      0x100

#define g0Count     (c1Start - g0Start)
#define g1Count     (uStart - g1Start)
#define c0Count     (g0Start - c0Start)
#define c1Count     (g1Start - c1Start)
#define gCount      (g0Count + g1Count)
#define cCount      (c0Count + c1Count)

// A UCS value below sectionK is encoded with form K; on decoding,
// a lead byte below breakK selects form K.
//    form 0: 1 byte  (the value itself)
//    form 1: 2 bytes (break0, then the value itself)
//    form 2: 2 bytes (lead byte + 1 base-gCount digit)
//    form 3: 3 bytes (lead byte + 2 base-gCount digits)
//    form 4: 5 bytes (lead byte + 4 base-gCount digits)
#define section0    0x000000A0
#define section1    0x00000100
#define section2    0x00004016
#define section3    0x00038E2E
#define section4    0xFFFFFFFF

#define break0      0xA0
#define break1      0xA1
#define break2      0xF6
#define break3      0xFC
#define break4      0x100

// Returned by FromAtm on failure. Note that it is also the decoded
// value of the largest encodable character (section4); a caller
// can tell the two apart because *bufferStart only advances on
// success.
#define errorChar   0xFFFFFFFF

/////////////////////////////////////////////////////////////////
// SkipTable is used to map a contiguous range onto values that
// do not include control bytes.
// It maps the values from 0 to 255 as follows:
//    0 .. g0Count-1                => g0Start..g0End
//    g0Count .. gCount-1           => g1Start..g1End
//    gCount .. gCount+c0Count-1    => c0Start..c0End
//    gCount+c0Count .. 255         => c1Start..c1End
// UnskipTable reverses the effect of SkipTable.
// The last two ranges are not, strictly speaking, necessary,
// but make it injective and surjective, providing
// predictability for out-of-range cases.
// Call FillSkipTables before using any other routine.
/////////////////////////////////////////////////////////////////

ubyte   SkipTable   [256];
ucs     UnskipTable [256];

void FillSkipTables (void)
{
    index c;

    for (c = 0; c < 256; c++) {
        ubyte mapped;

        if (c < g0Count)
            mapped = (ubyte)(c + g0Start);
        else if (c < gCount)
            mapped = (ubyte)(c - g0Count + g1Start);
        else if (c < (gCount + c0Count))
            mapped = (ubyte)(c - gCount + c0Start);
        else
            mapped = (ubyte)(c - (gCount + c0Count) + c1Start);

        SkipTable[c] = mapped;
        UnskipTable[mapped] = (ucs)c;   // inverse entry
    }
}

/////////////////////////////////////////////////////////////////
// The procedure ToAtm takes a UCS character (0..4G) and writes
// its ATM byte sequence to a, storing the byte count in *len.
// Values below section0 (0xA0) are written unchanged as a single
// byte. For every other value none of the bytes written is a
// control character (C0 or C1), SPACE, or DEL.
// The sequence is 1, 2, 3 or 5 bytes long, so a must have room
// for 5 bytes. The length can be recovered from the first byte.
// Where ucs is wider than 32 bits, a value above section4 cannot
// be encoded: nothing is written and *len is set to 0.
/////////////////////////////////////////////////////////////////

void ToAtm(ucs ch, ubyte *a, bufferLength *len)
{
    ubyte        lead;      // first lead byte of the chosen form
    bufferLength count;     // total bytes in the chosen form
    bufferLength i;

    if (ch > section4) {
        // The lead byte would wrap around into the control range.
        *len = 0;
        return;
    }
    if (ch < section0) {
        a[0] = (ubyte)ch;
        *len = 1;
        return;
    }
    if (ch < section1) {
        a[0] = break0;
        a[1] = (ubyte)ch;
        *len = 2;
        return;
    }

    if (ch < section2) {
        ch -= section1;
        lead = break1;
        count = 2;
    } else if (ch < section3) {
        ch -= section2;
        lead = break2;
        count = 3;
    } else {
        ch -= section3;
        lead = break3;
        count = 5;
    }

    // Trailing bytes are base-gCount digits, least significant
    // last, pushed through SkipTable to avoid control bytes.
    for (i = (bufferLength)(count - 1); i > 0; i--) {
        a[i] = SkipTable[ch % gCount];
        ch /= gCount;
    }
    a[0] = (ubyte)(lead + ch);  // what is left selects the lead byte
    *len = count;
}

/////////////////////////////////////////////////////////////////
// The procedure FromAtm takes a sequence of ATM bytes (as
// generated by ToAtm) and maps it to the UCS character (0..4G)
// that generated it.
//    bufferStart  in: address of a pointer to the first byte;
//                 out: on success the pointer is moved past the
//                 bytes consumed, on failure it is left alone.
//    maxLength    number of readable bytes at *bufferStart.
// Returns the character, or errorChar when bufferStart or
// *bufferStart is null, or when maxLength is smaller than the
// length announced by the first byte.
// Note that there are a number of byte sequences that cannot
// be produced by the ATM algorithm, and are invalid input.
// As written, this procedure checks for some of the obvious
// invalid values, such as insufficient bufferLength (based on
// the first byte), but does not do full-fledged checking for
// invalid sequences (for example, trailing bytes that ToAtm
// would never emit are decoded through UnskipTable regardless).
/////////////////////////////////////////////////////////////////

ucs FromAtm(ubyte **bufferStart, bufferLength maxLength)
{
    ubyte        *p;
    ubyte        lead;
    ucs          result;
    ucs          base;      // first UCS value of the chosen form
    bufferLength need;      // total bytes in the chosen form
    bufferLength i;

    if (!bufferStart || !*bufferStart || maxLength < 1)
        return errorChar;

    p = *bufferStart;
    lead = *p++;

    if (lead < break0) {
        result = lead;
    } else if (lead < break1) {
        // break0 announces a second byte holding the value itself;
        // make sure that byte is really there before reading it.
        if (maxLength < 2)
            return errorChar;
        result = *p++;
    } else {
        if (lead < break2) {
            need = 2;
            result = (ucs)(lead - break1);
            base = section1;
        } else if (lead < break3) {
            need = 3;
            result = (ucs)(lead - break2);
            base = section2;
        } else {
            need = 5;
            result = (ucs)(lead - break3);
            base = section3;
        }
        if (maxLength < need)
            return errorChar;

        // Accumulate the trailing base-gCount digits.
        for (i = 1; i < need; i++)
            result = result * gCount + UnskipTable[*p++];
        result += base;
    }

    *bufferStart = p;       // pass back new starting point
    return result;
}