Subj : Re: spidermonkey: byte arrays - again
To   : netscape.public.mozilla.jseng
From : Shanti Rao
Date : Mon Sep 27 2004 01:47 am

I'll volunteer my UCS-2 / UTF-8 conversion code if you want to go beyond 
iso-8859-1.

Shanti

/// Decodes the UTF-8 sequence starting at `in` into one UCS-2 code unit.
///
/// in: UTF-8 bytes; a continuation byte is only examined after every byte
///     before it matched its pattern, so on a null-terminated string the
///     terminator is never read past.
/// c:  receives the decoded code unit; not written when 0 is returned.
///
/// Returns the number of input bytes consumed (1 to 4), or 0 if the bytes at
/// `in` do not form a lead byte followed by the required continuation bytes.
///
/// A 4-byte sequence encodes a code point above U+FFFF, which does not fit in
/// a single UCS-2 unit. The sequence is still consumed (return value 4) but
/// `c` is set to U+FFFD REPLACEMENT CHARACTER instead of a truncated value,
/// which could otherwise come out as 0 and look like a string terminator.
///
/// NOTE(review): overlong encodings and encoded surrogates are not rejected.
size_t UTF8ToUCS2C(const char* in,uint16 &c)
{
  // 0xxxxxxx
  if ((in[0] & 0x80) == 0)
  {
    c = in[0];
    return 1;
  }

  // 110xxxxx 10xxxxxx
  if (((in[0] & 0xe0) == 0xc0) && ((in[1] & 0xc0) == 0x80))
  {
    c = ((in[0] & 0x1f) << 6) | (in[1] & 0x3f);
    return 2;
  }

  // 1110xxxx 10xxxxxx 10xxxxxx
  if (((in[0] & 0xf0) == 0xe0) && ((in[1] & 0xc0) == 0x80)
      && ((in[2] & 0xc0) == 0x80))
  {
    c = ((in[0] & 0x0f) << 12) | ((in[1] & 0x3f) << 6) | (in[2] & 0x3f);
    return 3;
  }

  // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  if (((in[0] & 0xf8) == 0xf0) && ((in[1] & 0xc0) == 0x80)
      && ((in[2] & 0xc0) == 0x80) && ((in[3] & 0xc0) == 0x80))
  {
    c = 0xfffd; // code point is beyond the 16-bit range of UCS-2
    return 4;
  }

  return 0; // error
}

///returns the number of characters (not bytes) needed  (not including the 
final null)
size_t UTF8ToUCS2Length(const char* in)
{
  uint16 x;
  size_t length=0;
  while (*in)
   {
    size_t a = UTF8ToUCS2C(in,x);
    if (!a) break;
    length++;
    in += a;
   }
  return length;
}

///returns the number of characters (not bytes) converted (not including the 
final null)
size_t UTF8ToUCS2(const char* in,uint16 * out)
{
  size_t length=0;
  while (*in)
   {
    size_t a = UTF8ToUCS2C(in,*out);
    if (!a) break;
    length++;
    out++;
    in += a;
   }
  *out = 0;
  return length;
}

/// Encodes one UCS-2 code unit as UTF-8.
///
/// in:  the code unit to encode; values above 0xFFFF are not supported
///      (the 3-byte lead byte would be malformed).
/// out: receives 1 to 3 bytes; no terminator is written.
///
/// Returns the number of bytes written to `out`.
size_t UCS2ToUTF8C(unsigned in, char* out)
{
   // Compare the whole value: testing only bit 7 would misclassify values
   // such as 0x100 or 0x4E00 as single-byte ASCII.
   if (in < 0x80)
   {
      // 0xxxxxxx
      out[0] = (char)in;
      return 1;
   }

   if (in < 0x800)
   {
      // 110xxxxx 10xxxxxx
      out[0] = (char)((in >> 6)   | 0xc0);
      out[1] = (char)((in & 0x3f) | 0x80);
      return 2;
   }

   // 1110xxxx 10xxxxxx 10xxxxxx
   out[0] = (char)((in >> 12)  | 0xe0);
   out[1] = (char)(((in >> 6) & 0x3f) | 0x80);
   out[2] = (char)((in & 0x3f) | 0x80);
   return 3;
}

///returns the number of bytes (not not characters) needed (not including the 
final null)
size_t UCS2ToUTF8Length(const uint16* in)
{
  size_t length =0;
  char d[3];
  while (*in)
  {
   length += UCS2ToUTF8C(*in,d);
   in++;
  }
  return length;
}

/// Converts the 0-terminated UCS-2 string `in` to UTF-8 in `out` and appends
/// a terminating null byte.
///
/// Returns the length in bytes of the result string, not including the final
/// null.
///
/// No bounds checking is done here: `out` must have room for every encoded
/// byte plus the terminator.
size_t UCS2ToUTF8(const uint16* in,char* out)
{
  size_t length = 0;
  while (*in)
  {
    size_t a = UCS2ToUTF8C(*in, out);
    out += a;
    length += a;  // accumulate bytes written so the return value is the length
    in++;
  }
  *out = 0;
  return length;
}


Jens Thiele wrote:
> Shanti Rao wrote:
> 
>>Jens,
>>
>>What are you using it for? Arbitrary indexing and manipulation, or
>>sequential scanning of text?
>>
> 
> 
> I contacted Sajid Raza (the poster in the thread I mentioned) by private
> mail and he has written a ByteArray class which closely meets my
> requirements / what I would have written. I now am awaiting his reply if
> he wants to release this under some open source license so I could post
> a link.
> 
> My thoughts about what a ByteArray should have:
> 
> Native operations needed:
> (all operations must be safe - range checked,
> additional operations could also be added in Javascript itself)
> 
> get size
> set size/resize (fill new bytes with zero or optionally given value)
> get byte
> set byte
> 
> move bytes within byte array (like memmove)
> copy bytes from other byte array (like memcpy)
> 
> ByteArray <-> js string:
>   convert to / interpret as iso-8859-1 string (as 8-bit subset of ucs-2)
>   convert to / interpret as js string (ucs-2) - odd number of bytes?
>   set from ucs-2 js string
>   set from ucs-2 js string (interpreted as 8-bit iso-8859-1 string)
> 
> ByteArray <-> js array?
> 
> toSource()?
> 
> Jens

.