Message-Id: <199109040004.AA15375@danpost2.uni-c.dk>
Date:         Tue, 3 Sep 1991 17:01:01 U
Reply-To: Multi-byte Code Issues <ISO10646@JHUVM.BITNET>
Sender: Multi-byte Code Issues <ISO10646@JHUVM.BITNET>
From: Mark Davis <mark_davis@gateway.qm.apple.com>
Subject:      ATM C Implementation
To: Multiple recipients of list ISO10646 <ISO10646@JHUVM>
Status: RO
                       Subject:                               Time:4:34 PM
  OFFICE MEMO          ATM C Implementation                   Date:9/3/91
Here is an implementation of the ATM algorithm, for those interested.
Any feedback would be welcome.--Mark

/////////////////////////////////////////////////////////////////
// A Transformation Method
// Author:	Mark Davis
// Date:	August 30, 1991
// The following is a C test implementation of the ATM algorithm
// described in the C0 committee report (see that text for
// details as to the purpose and requirements).
// The details of the algorithm are somewhat changed from that
// report, to correct some bugs and take into account some
// results of the WG2 meeting.
/////////////////////////////////////////////////////////////////

#include <STDIO.H>
#include <TYPES.h>

// Basic scalar types used by the routines in this file.

// One byte of an encoded (ATM) sequence; also the element type of SkipTable.
typedef unsigned char ubyte;
// Not referenced in the code visible here.
typedef unsigned short ushort;
// A UCS character value.  The comments on ToAtm/FromAtm describe the
// intended range as 0..4G; the actual width of unsigned long is
// platform-dependent.
typedef unsigned long ucs;
// Loop counter / table index (used by FillSkipTables).
typedef short index;
// Number of bytes in an encoded sequence or available in a buffer.
typedef short bufferLength;
// Anonymous enumeration: false == 0, true == 1.
enum {false,true};
// Not referenced in the code visible here.
typedef unsigned char Boolean;

// Byte-value ranges.  Each xStart is the first value of a range and
// the matching xEnd is its last value (inclusive):
//	c0	0x00..0x20	(runs up to and including 0x20, SPACE)
//	g0	0x21..0x7E
//	c1	0x7F..0x9F	(begins at 0x7F, DEL)
//	g1	0xA0..0xFF
// uStart is the first value past g1End.
#define c0Start 0x00
#define c0End	0x20
#define g0Start 0x21
#define g0End	0x7E
#define c1Start 0x7F
#define c1End	0x9F
#define g1Start 0xA0
#define g1End	0xFF
#define uStart	0x100

// Number of values in each range, computed from the Start values
// only (each range ends where the next one begins):
//	g0Count = 94, g1Count = 96, c0Count = 33, c1Count = 33
#define g0Count (c1Start - g0Start)
#define g1Count (uStart  - g1Start)
#define c0Count (g0Start - c0Start)
#define c1Count (g1Start - c1Start)

// gCount = 190 is the number base used by ToAtm/FromAtm for the
// bytes that go through SkipTable.  cCount = 66 is not referenced
// in the code visible here.
#define gCount	(g0Count + g1Count)
#define cCount	(c0Count + c1Count)

// Exclusive upper bounds of the UCS value ranges that ToAtm tests,
// in order.  Each range selects an encoded form:
//	ch < section0	1 byte
//	ch < section1	2 bytes (break0 followed by the raw value)
//	ch < section2	2 bytes
//	ch < section3	3 bytes
//	otherwise		5 bytes
// The bounds follow from the lead-byte ranges below:
//	section2 - section1 == (break2 - break1) * gCount          (85 * 190)
//	section3 - section2 == (break3 - break2) * gCount * gCount (6 * 190 * 190)
// section4 is not referenced in the code visible here.
#define section0 0x000000A0
#define section1 0x00000100
#define section2 0x00004016
#define section3 0x00038E2E
#define section4 0xFFFFFFFF

// Lead-byte values.  ToAtm uses break0..break3 as the lowest lead
// byte of each multi-byte form; FromAtm compares the first byte
// against them to pick the form:
//	c < break0	single byte
//	c < break1	break0 followed by a raw byte
//	c < break2	2-byte form
//	c < break3	3-byte form
//	otherwise	5-byte form
// break4 is not referenced in the code visible here.
#define break0	0xA0
#define break1	0xA1
#define break2	0xF6
#define break3	0xFC
#define break4	0x100

// Returned by FromAtm when the buffer is too short for the sequence.
// Note that it has the same numeric value as section4.
#define errorChar 0xFFFFFFFF

/////////////////////////////////////////////////////////////////
// SkipTable is used to map a contiguous range onto values that
// do not include control bytes.
// It maps the values from 0 to 255 as follows:
//	0				..	g0Count-1			=>	g0Start..g0End
//	g0Count			..	gCount-1			=>	g1Start..g1End
//	gCount			..	gCount+c0Count-1	=>	c0Start..c0End
//	gCount+c0Count	..	g1End				=>	c1Start..c1End
// UnskipTable reverses the effect of SkipTable.
// The last two ranges are not, strictly speaking, necessary,
// but make it injective and surjective, providing
// predictability for out-of-range cases.
// Call FillSkipTables before using any other routine.
/////////////////////////////////////////////////////////////////

// SkipTable[i] is the byte assigned to index i (see the table above).
// UnskipTable[b] is the index i for which SkipTable[i] == b.
// Both are filled in by FillSkipTables; as file-scope objects they
// hold zeros until that routine has run.
ubyte	SkipTable [256];
ucs		UnskipTable [256];

void FillSkipTables (void) {
	index	c;
	for (c = 0; c < 256; c++) {
		if (c < g0Count)
			SkipTable[c] = (ubyte)(c + g0Start);
		else if (c < gCount)
			SkipTable[c] = (ubyte)(c - g0Count + g1Start);
		else if (c < (gCount + c0Count))
			SkipTable[c] = (ubyte)(c - gCount + c0Start);
		else
			SkipTable[c]
				= (ubyte)(c - (gCount + c0Count) + c1Start);
		UnskipTable[SkipTable[c]] = c;
	};
};

/////////////////////////////////////////////////////////////////
// ToAtm encodes one UCS value as a sequence of ATM bytes.
//	ch	the value to encode
//	a	output buffer.  Up to 5 bytes are written and no capacity
//		is passed in, so the caller must provide room for 5.
//	len	receives the number of bytes written (1, 2, 3 or 5)
// The encoded form is chosen by comparing ch with the section
// bounds:
//	ch < section0	1 byte:  ch itself, unchanged
//	ch < section1	2 bytes: break0, then ch itself
//	ch < section2	2 bytes: lead byte from break1 up + 1 more byte
//	ch < section3	3 bytes: lead byte from break2 up + 2 more bytes
//	otherwise		5 bytes: lead byte from break3 up + 4 more bytes
// In the last three forms the offset of ch within its section is
// written as base-gCount digits, most significant first: the
// leading digit is added to the lead byte and every other digit
// is translated through SkipTable.
// FillSkipTables must have been called beforehand.
/////////////////////////////////////////////////////////////////

void ToAtm(ucs ch, ubyte* a, bufferLength *len) {
	bufferLength	total;		// length of the encoded sequence
	bufferLength	pos;		// position in a[] being written
	ubyte			leadBase;	// lowest lead byte of the chosen form

	// Forms that store the raw value of ch.
	if (ch < section0) {
		*len = 1;
		a[0] = (ubyte) ch;
		return;
	}
	if (ch < section1) {
		*len = 2;
		a[1] = (ubyte) ch;
		a[0] = break0;
		return;
	}

	// Forms that store the offset of ch within its section.
	if (ch < section2) {
		total = 2;
		leadBase = break1;
		ch -= section1;
	} else if (ch < section3) {
		total = 3;
		leadBase = break2;
		ch -= section2;
	} else {
		total = 5;
		leadBase = break3;
		ch -= section3;
	}

	*len = total;
	// Emit the digits from the last byte back towards the first.
	for (pos = total - 1; pos > 0; pos--) {
		a[pos] = SkipTable[ch % gCount];
		ch /= gCount;
	}
	// What is left of ch is the leading digit.
	a[0] = (ubyte)(leadBase + ch);
}

/////////////////////////////////////////////////////////////////
// FromAtm decodes one ATM byte sequence (as generated by ToAtm)
// back into the UCS value that generated it.
//	bufferStart	address of a pointer to the first byte of the
//				sequence.  On success the pointer is advanced
//				past the bytes consumed; on error it is left
//				unchanged.
//	maxLength	number of bytes available at *bufferStart
// Returns the decoded value, or errorChar when maxLength is
// smaller than the sequence length implied by the first byte
// (1, 2, 3 or 5 bytes).
// errorChar (0xFFFFFFFF) is also what a valid encoding of
// 0xFFFFFFFF decodes to; a caller that must tell the two apart
// can check whether *bufferStart was advanced.
// Only the length is validated: byte sequences that ToAtm can
// never produce (such as <A0,21>) are decoded without complaint.
// FillSkipTables must have been called beforehand.
/////////////////////////////////////////////////////////////////

ucs FromAtm(ubyte** bufferStart, bufferLength maxLength) {
	ubyte			lead, *next;
	bufferLength	trailing;		// bytes that follow the lead byte
	ucs				sectionStart;	// first UCS value of the section
	ucs				result;

	if (maxLength < 1) return errorChar;
	next = *bufferStart;
	lead = *next++;

	if (lead < break0) {
		// single byte: the value itself
		result = lead;
	} else if (lead < break1) {
		// <break0, value>: make sure the second byte is there
		// before reading it
		if (maxLength < 2) return errorChar;
		result = *next++;
	} else {
		// The lead byte carries the leading base-gCount digit and
		// determines how many further digits follow.
		if (lead < break2) {
			trailing = 1;
			result = lead - break1;
			sectionStart = section1;
		} else if (lead < break3) {
			trailing = 2;
			result = lead - break2;
			sectionStart = section2;
		} else {
			trailing = 4;
			result = lead - break3;
			sectionStart = section3;
		}
		if (maxLength < trailing + 1) return errorChar;
		while (trailing-- > 0) {
			result *= gCount;
			result += UnskipTable[*next++];
		}
		result += sectionStart;
	}
	*bufferStart = next;	// pass back new starting point
	return result;
}