Subj : Re: spidermonkey: byte arrays - again To : netscape.public.mozilla.jseng From : Shanti Rao Date : Mon Sep 27 2004 01:47 am I'll volunteer my UCS-2 / UTF-8 conversion code if you want to go beyond iso-8859-1. Shanti size_t UTF8ToUCS2C(const char* in,uint16 &c) { if ((in[0] & 0x80) == 0) {c=in[0]; return 1;} //1 character OK // 110xxxxx 10xxxxxx if (((in[0] & 0xe0) == 0xc0) && ((in[1] & 0xc0) == 0x80)) { c = ((in[0] & 0x1f) << 6) | (in[1] & 0x3f); return 2;} // 1110xxxx 10xxxxxx 10xxxxxx if (((in[0] & 0xf0) == 0xe0) && ((in[1] & 0xc0) == 0x80) && ((in[2] & 0xc0 )== 0x80)) { c = ((in[0] & 0x0f) << 12) | ((in[1] & 0x3f) << 6)| (in[2] & 0x3f); return 3;} // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx if (((in[0] & 0xf8) == 0xf0) && ((in[1] & 0xc0) == 0x80) && ((in[2] & 0xc0 )== 0x80)&& ((in[3] & 0xc0) == 0x80)) { c = ((in[0] & 0x0f) << 18) | ((in[1] & 0x3f) << 12)| ((in[2] & 0x3f) <<6)| ((in[3] & 0x3f)); return 4;} return 0; //error } ///returns the number of characters (not bytes) needed (not including the final null) size_t UTF8ToUCS2Length(const char* in) { uint16 x; size_t length=0; while (*in) { size_t a = UTF8ToUCS2C(in,x); if (!a) break; length++; in += a; } return length; } ///returns the number of characters (not bytes) converted (not including the final null) size_t UTF8ToUCS2(const char* in,uint16 * out) { size_t length=0; while (*in) { size_t a = UTF8ToUCS2C(in,*out); if (!a) break; length++; out++; in += a; } *out = 0; return length; } size_t UCS2ToUTF8C(unsigned in, char* out) { int ret=0; if ((in & 0x80) == 0) { out[0]=(char)in; return 1; } else if (in < 0x800) { out[1] = (char)((in & 0x3f) | 0x80); out[0] = (char)((in >> 6) | 0xc0); return 2; } else { out[2] = (char)((in & 0x3f) | 0x80); out[1] = (char)(((in >> 6) & 0x3f) | 0x80); out[0] = (char)((in >> 12) | 0xe0); return 3; } } ///returns the number of bytes (not not characters) needed (not including the final null) size_t UCS2ToUTF8Length(const uint16* in) { size_t length =0; char d[3]; while (*in) { length += UCS2ToUTF8C(*in,d); in++; } return length; } ///returns the length of the result string (not including the final null) size_t UCS2ToUTF8(const uint16* in,char* out) { size_t length =0; while (*in) { size_t a = UCS2ToUTF8C(*in,out); out += a; in++; } *out = 0; return length; } Jens Thiele wrote: > Shanti Rao wrote: > >>Jens, >> >>What are you using it for? Arbitrary indexing and manipulation, or >>sequential scanning of text? >> > > > I contacted Sajid Raza (the poster in the thread I mentioned) by private > mail and he has written a ByteArray class which closely meets my > requirements / what I would have written. I now am awaiting his reply if > he wants to release this under some open source lincense so I could post > a link. > > My thoughts about what a ByteArray should have: > > Native operations needed: > (all operations must be safe - range checked, > additional operations could also be added in Javascript itself) > > get size > set size/resize (fill new bytes with zero or optionally given value) > get byte > set byte > > move bytes within byte array (like memmove) > copy bytes from other byte array (like memcpy) > > ByteArray <-> js string: > convert to / interpret as iso-8859-1 string (as 8-bit subset of ucs-2) > convert to / interpret as js string (ucs-2) - odd number of bytes? > set from ucs-2 js string > set from ucs-2 js string (interpreted as 8-bit iso-8859-1 string) > > ByteArray <-> js array? > > toSource()? > > Jens .