/* libeXtra/utils/utf8string.cpp
 *
 * Copyright (C) 2002 Francis James Franklin <fjf@alinameridon.com>
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <cstdlib>
#include <cstring>

#include <new>
#include <stdexcept>

#include "utf8string.h"

/* Decode the first UTF-8 sequence of utf8str into its UCS-4 code.
 *
 * Returns 0 when sequence_length () reports 0 (end-of-string), the byte
 * itself for a 1-byte sequence, and (UCS4Char)(-1) if utf8str is 0 or does
 * not start with a complete sequence: unrecognised lead byte, a non-trailing
 * byte inside the sequence, or the string ending before the sequence does.
 */
UTF8String::UCS4Char UTF8String::UCS4Code (const char * utf8str)
{
  const UCS4Char invalid = (UCS4Char) (-1);

  if (utf8str == 0) return invalid; // no string - return error

  const int seql = UTF8String::sequence_length (utf8str);

  if (seql == 0) return 0;                       // end-of-string
  if (seql == 1) return (UCS4Char) (*utf8str);   // plain us-ascii part of latin-1
  if ((seql < 2) || (seql > 6)) return invalid;  // not a lead byte

  // The lead byte of an n-byte sequence carries (7 - n) payload bits:
  // 0x1f for n=2, 0x0f for n=3, ... 0x01 for n=6.
  UCS4Char code = (UCS4Char) (*utf8str & (0x7f >> seql));

  // Each 'continuing' octet contributes its low six bits.
  const char * next = utf8str + 1;
  for (int remaining = seql - 1; remaining > 0; remaining--)
    {
      if (*next == 0) return invalid; // string ended inside the sequence
      if (!UTF8String::trailing_byte (next)) return invalid; // invalid byte - not UTF-8

      code = (code << 6) | (UCS4Char) (*next & 0x3f);
      next++;
    }
  return code;
}

/* Construct a value from a 0-terminated UTF-8 string; m_count starts at 1.
 * A null or empty utf8str leaves the value empty (append () returns at once).
 *
 * Throws std::bad_alloc if the byte buffer cannot be allocated or enlarged.
 */
UTF8String::UTF8StringValue::UTF8StringValue (const char * utf8str) :
  m_count(1),
  m_utf8length(0),
  m_bytelength(0),
  m_bytemaxlen(0),
  m_utf8str(0),
  m_ucs4str(0)
{
  if (!grow ()) return; // throws std::bad_alloc
  try
    {
      append (utf8str);
    }
  catch (...)
    {
      // The destructor is not run for an object whose constructor throws,
      // so the buffer obtained by grow () must be released here.
      free (m_utf8str);
      m_utf8str = 0;
      throw;
    }
}

/* Construct a value from a UCS-4 string; m_count starts at 1.
 * length is passed straight to append (), where 0 means ucs4str is
 * 0-terminated.
 *
 * Throws std::bad_alloc if the byte buffer cannot be allocated or enlarged.
 */
UTF8String::UTF8StringValue::UTF8StringValue (const UCS4Char * ucs4str, int length) :
  m_count(1),
  m_utf8length(0),
  m_bytelength(0),
  m_bytemaxlen(0),
  m_utf8str(0),
  m_ucs4str(0)
{
  if (!grow ()) return; // throws std::bad_alloc
  try
    {
      append (ucs4str, length);
    }
  catch (...)
    {
      // The destructor is not run for an object whose constructor throws,
      // so the buffer obtained by grow () must be released here.
      free (m_utf8str);
      m_utf8str = 0;
      throw;
    }
}

/* Construct a value holding a copy of the bytes between two iterators;
 * m_count starts at 1.
 *
 * Throws std::bad_alloc if the byte buffer cannot be allocated or enlarged,
 * and std::out_of_range if ci_end lies before ci_start.
 */
UTF8String::UTF8StringValue::UTF8StringValue (const const_iterator & ci_start,
					      const const_iterator & ci_end) :
  m_count(1),
  m_utf8length(0),
  m_bytelength(0),
  m_bytemaxlen(0),
  m_utf8str(0),
  m_ucs4str(0)
{
  if (!grow ()) return; // throws std::bad_alloc
  try
    {
      append (ci_start, ci_end);
    }
  catch (...)
    {
      // The destructor is not run for an object whose constructor throws,
      // so the buffer obtained by grow () must be released here.
      free (m_utf8str);
      m_utf8str = 0;
      throw;
    }
}

/* Release both representations of the string: the cached UCS-4 array
 * (allocated with new []) and the UTF-8 byte buffer (allocated with
 * malloc/realloc).
 */
UTF8String::UTF8StringValue::~UTF8StringValue ()
{
  // delete [] and free () both accept a null pointer, so no guards are needed
  delete [] m_ucs4str;
  free (m_utf8str);
}

void UTF8String::UTF8StringValue::clear ()
{
  m_utf8length = 0;
  m_bytelength = 0;

  if (m_ucs4str) // internal UCS-4 string no longer valid
    {
      delete [] m_ucs4str;
      m_ucs4str = 0;
    }
}

/* Want to store strings internally as *valid* UTF-8, so only copy valid
 * byte sequences.
 * Although perhaps better to throw UTF8Invalid exception ??
 */
bool UTF8String::UTF8StringValue::append (const char * utf8str)
{
  if ( utf8str == 0) return true; // or is it?
  if (*utf8str == 0) return true;

  if (m_ucs4str) // internal UCS-4 string no longer valid
    {
      delete [] m_ucs4str;
      m_ucs4str = 0;
    }

  int utf8length = 0;
  int bytelength = 0;

  const char * p = utf8str;
  while (true)
    {
      int seql = UTF8String::sequence_length (p);
      if (seql <= 0) break;
      if (seql == 1)
	{
	  utf8length++;
	  bytelength++;
	  p++;
	  continue;
	}
      // multi-byte sequence
      p++;
      bool valid = true;
      for (int s = 1; s < seql; s++)
	{
	  if (!UTF8String::trailing_byte (p))
	    {
	      valid = false;
	      break;
	    }
	  p++;
	}
      if (!valid) break;
      utf8length++;
      bytelength += seql;
    }

  if (bytelength == 0) return true;
  if (!grow (bytelength)) return false; // throws std::bad_alloc

  char * utf8end = m_utf8str + m_bytelength;

  memcpy (utf8end, utf8str, bytelength);

  m_utf8length += utf8length;
  m_bytelength += bytelength;

  utf8end += bytelength;
  *utf8end = 0;

  return true;
}

// length=0 indicates 0-termination
bool UTF8String::UTF8StringValue::append (const UCS4Char * ucs4str, int length)
{
  if (length < 0) return true; // or is it?

  if ( ucs4str == 0) return true; // or is it?
  if (*ucs4str == 0) return true;

  if (m_ucs4str) // internal UCS-4 string no longer valid
    {
      delete [] m_ucs4str;
      m_ucs4str = 0;
    }

  int bytelength = 0;
  int ucs4length = 0;
  if (length == 0)
    {
      // ucs4str is 0-terminated
      const UCS4Char * p = ucs4str;
      while (int seql = UTF8String::sequence_length (*p))
	{
	  if (seql <= 0) break;
	  bytelength += seql;
	  ucs4length++;
	  p++;
	}
    }
  else
    {
      const UCS4Char * p = ucs4str;
      for (int i = 0; i < length; i++)
	{
	  int seql = UTF8String::sequence_length (*p);
	  if (seql <= 0) break;
	  bytelength += seql;
	  ucs4length++;
	  p++;
	}
    }
  if (ucs4length == 0) return true; // or is it? (do we need a UCS4Invalid exception too??)

  if (bytelength == 0) return true; // shouldn't happen
  if (!grow (bytelength)) return false; // throws std::bad_alloc

  char * utf8end = m_utf8str + m_bytelength;

  const UCS4Char * p = ucs4str;

  for (int i = 0; i < ucs4length; i++)
    {
      UCS4Char u = *p;
      int seql = UTF8String::sequence_length (u);
      if (seql <= 0) break; // shouldn't happen
      switch (seql)
	{
	case 1:
	  *utf8end++ = (char)  (u & 0x0000007f);
	  break;
	case 2:
	  *utf8end++ = (char) (0xc0 | ((u & 0x000007c0) >>  6));
	  *utf8end++ = (char) (0x80 |  (u & 0x0000003f)       );
	  break;
	case 3:
	  *utf8end++ = (char) (0xe0 | ((u & 0x0000f000) >> 12));
	  *utf8end++ = (char) (0x80 | ((u & 0x00000fc0) >>  6));
	  *utf8end++ = (char) (0x80 |  (u & 0x0000003f)       );
	  break;
	case 4:
	  *utf8end++ = (char) (0xf0 | ((u & 0x001c0000) >> 18));
	  *utf8end++ = (char) (0x80 | ((u & 0x0003f000) >> 12));
	  *utf8end++ = (char) (0x80 | ((u & 0x00000fc0) >>  6));
	  *utf8end++ = (char) (0x80 |  (u & 0x0000003f)       );
	  break;
	case 5:
	  *utf8end++ = (char) (0xf8 | ((u & 0x03000000) >> 24));
	  *utf8end++ = (char) (0x80 | ((u & 0x00fc0000) >> 18));
	  *utf8end++ = (char) (0x80 | ((u & 0x0003f000) >> 12));
	  *utf8end++ = (char) (0x80 | ((u & 0x00000fc0) >>  6));
	  *utf8end++ = (char) (0x80 |  (u & 0x0000003f)       );
	  break;
	case 6:
	  *utf8end++ = (char) (0xfc | ((u & 0x40000000) >> 30));
	  *utf8end++ = (char) (0x80 | ((u & 0x3f000000) >> 24));
	  *utf8end++ = (char) (0x80 | ((u & 0x00fc0000) >> 18));
	  *utf8end++ = (char) (0x80 | ((u & 0x0003f000) >> 12));
	  *utf8end++ = (char) (0x80 | ((u & 0x00000fc0) >>  6));
	  *utf8end++ = (char) (0x80 |  (u & 0x0000003f)       );
	  break;
	default: // shouldn't happen
	  break;
	}
      p++;
    }
  *utf8end = 0;

  m_utf8length += ucs4length;
  m_bytelength += bytelength;

  return true;
}

/* the idea here being that iterators (provided they are used properly)
 * reference valid UTF-8 strings, so less validation required.
 */
bool UTF8String::UTF8StringValue::append (const const_iterator & ci_start,
					  const const_iterator & ci_end)
{
  int bytelength = &ci_end - &ci_start;
  if (bytelength == 0) return true;
  if (bytelength < 0)
    {
      throw std::out_of_range("UTF8String::const_iterator - end iterator before start!");
      return false;
    }
  if (!grow (bytelength)) return false; // throws std::bad_alloc

  if (m_ucs4str) // internal UCS-4 string no longer valid
    {
      delete [] m_ucs4str;
      m_ucs4str = 0;
    }

  int utf8length = 0;
  const_iterator ci = ci_start;
  while (ci != ci_end)
    {
      utf8length++;
      ++ci;
    }

  char * utf8end = m_utf8str + m_bytelength;
  memcpy (utf8end, &ci_start, bytelength);
  utf8end += bytelength;
  *utf8end = 0;

  m_utf8length += utf8length;
  m_bytelength += bytelength;

  return true;
}

/* Return the string as a 0-terminated UCS-4 array.
 *
 * The copy is kept, but generated only if requested, and given away on
 * demand: clear () and the append () methods delete it. The returned pointer
 * is owned by this value.
 *
 * Throws std::bad_alloc if the array cannot be allocated.
 */
const UTF8String::UCS4Char * UTF8String::UTF8StringValue::ucs4_str ()
{
  if (m_ucs4str) return m_ucs4str; // cached copy is still valid

  m_ucs4str = new UCS4Char[m_utf8length + 1];
  if (m_ucs4str == 0) throw std::bad_alloc(); // only reachable with a non-throwing new

  const char * p = m_utf8str;
  UCS4Char * ucs4end = m_ucs4str;

  // Decode at most m_utf8length characters: the array has room for exactly
  // that many plus the terminator, so never trust the byte buffer alone to
  // say where to stop.
  for (int i = 0; i < m_utf8length; i++)
    {
      int seql = UTF8String::sequence_length (p);
      if (seql <= 0) break;
      *ucs4end++ = UTF8String::UCS4Code (p);
      p += seql;
    }
  *ucs4end = 0;

  return m_ucs4str;
}

/* Make sure there is space to append a string of byte-length <size>, plus
 * the terminating 0, enlarging the byte buffer if necessary.
 *
 * Returns true when the space is available; throws std::bad_alloc when the
 * buffer cannot be allocated or enlarged (the existing buffer is kept).
 */
bool UTF8String::UTF8StringValue::grow (int size)
{
  int wanted = m_bytelength + size + 1;
  if (wanted <= m_bytemaxlen) return true; // already big enough

  /* A big string gets exactly what it asked for, but if it's a small
   * string then chances are there will be other small strings to be
   * appended also, so allocate in bunches of 32 bytes...
   */
  if (size < 32) wanted = m_bytemaxlen + 32;

  const bool first_allocation = (m_utf8str == 0);

  char * buffer = 0;
  if (first_allocation)
    buffer = (char *) malloc (wanted);
  else
    buffer = (char *) realloc ((void *) m_utf8str, wanted);

  if (buffer == 0) throw std::bad_alloc();

  if (first_allocation) *buffer = 0; // a brand new buffer starts out as ""

  m_utf8str = buffer;
  m_bytemaxlen = wanted;

  return true;
}

/* Point the iterator at utf8ptr after validating the position.
 *
 * Throws std::out_of_range if the iterator has no string value, if the value
 * has no byte buffer, if utf8ptr lies outside [start, start + byte_length]
 * of that buffer, or if sequence_length () reports a negative length for the
 * byte at utf8ptr. On any failure the iterator is left unchanged.
 */
void UTF8String::const_iterator::set_utf8ptr (const char * utf8ptr)
{
  if (m_value == 0)
    throw std::out_of_range("UTF8String::const_iterator unassociated with UTF8String!");

  const char * first = m_value->utf8_str ();
  if (first == 0)
    throw std::out_of_range("UTF8String::const_iterator unassociated with string!");

  // one-past-the-end (the terminating 0) is a legal position
  const char * last = first + m_value->byte_length ();
  if ((utf8ptr < first) || (utf8ptr > last))
    throw std::out_of_range("UTF8String::const_iterator out of range!");

  if (UTF8String::sequence_length (utf8ptr) < 0)
    throw std::out_of_range("UTF8String::const_iterator set to invalid position!");

  m_utf8ptr = utf8ptr;
}

/* Create an iterator over value, positioned at utf8ptr.
 * The position is checked by set_utf8ptr (), so construction throws
 * std::out_of_range if value is 0 or utf8ptr is not an acceptable position
 * within value's byte buffer.
 */
UTF8String::const_iterator::const_iterator (const UTF8StringValue * value, const char * utf8ptr) :
  m_value(value),
  m_utf8ptr(utf8ptr)
{
  set_utf8ptr (utf8ptr); // throws std::out_of_range
}

/* Prefix increment: step forward over one UTF-8 sequence.
 * Throws std::out_of_range if sequence_length () reports nothing to step
 * over (<= 0) at the current position; the iterator is then left unchanged.
 */
UTF8String::const_iterator & UTF8String::const_iterator::operator++ () // prefix operator
{
  const int step = UTF8String::sequence_length (m_utf8ptr);
  if (step > 0)
    {
      m_utf8ptr += step;
      return *this;
    }
  throw std::out_of_range("UTF8String::const_iterator at end of string!");
}

/* Prefix decrement: step back to the lead byte of the previous sequence.
 * Throws std::out_of_range if the iterator is already at (or before) the
 * start of the string's byte buffer.
 */
UTF8String::const_iterator & UTF8String::const_iterator::operator-- () // prefix operator
{
  const char * const start = m_value->utf8_str ();
  if (m_utf8ptr <= start)
    throw std::out_of_range("UTF8String::const_iterator at start of string!");

  // back up one byte, then keep going while still on a trailing byte
  do
    {
      m_utf8ptr--;
    }
  while (UTF8String::trailing_byte (m_utf8ptr));

  return *this;
}

/* Construct a string from a 0-terminated UTF-8 string by creating a fresh
 * UTF8StringValue for it.
 * Throws std::bad_alloc if the value cannot be allocated.
 */
UTF8String::UTF8String (const char * utf8str) :
  m_value(new UTF8StringValue(utf8str))
{
  // only reachable if operator new returns 0 instead of throwing
  if (m_value == 0) throw std::bad_alloc();
}

/* Construct a string from a UCS-4 string by creating a fresh UTF8StringValue
 * for it; length=0 indicates that ucs4str is 0-terminated.
 * Throws std::bad_alloc if the value cannot be allocated.
 */
UTF8String::UTF8String (const UCS4Char * ucs4str, int length) :
  m_value(new UTF8StringValue(ucs4str, length))
{
  // only reachable if operator new returns 0 instead of throwing
  if (m_value == 0) throw std::bad_alloc();
}

/* Copy constructor: no bytes are copied; the new string points at rhs's
 * value object and calls ref () on it.
 */
UTF8String::UTF8String (const UTF8String & rhs) :
  m_value(rhs.m_value)
{
  m_value->ref ();
}

/* Give up this string's hold on its value object; the value is deleted when
 * unref () returns 0 (presumably meaning that no other string refers to it).
 */
UTF8String::~UTF8String ()
{
  if (!m_value->unref ()) delete m_value;
}

/* Assign from a 0-terminated UTF-8 string: the current contents are
 * discarded (append () is called with clear=true) and utf8str is appended.
 * If utf8str is 0 nothing is appended after clearing.
 */
UTF8String & UTF8String::operator= (const char * utf8str)
{
  append (utf8str, true);
  return *this;
}

/* Assign from another string by pointing at its value object; no bytes are
 * copied. The previously held value is released, and deleted when unref ()
 * returns 0. Assigning a string that already uses the same value object is
 * a no-op.
 */
UTF8String & UTF8String::operator= (const UTF8String & rhs)
{
  if (m_value != rhs.m_value)
    {
      UTF8StringValue * previous = m_value;

      // take hold of the new value...
      m_value = rhs.m_value;
      m_value->ref ();

      // ...and let go of the old one
      if (!previous->unref ()) delete previous;
    }
  return *this;
}

/* Append a 0-terminated UTF-8 string, keeping the current contents
 * (append () is called with clear=false).
 * If utf8str is 0, it is treated as the empty string "".
 */
UTF8String & UTF8String::operator+= (const char * utf8str)
{
  append (utf8str, false);
  return *this;
}

/* Append another string: the range rhs.begin () .. rhs.end () is handed to
 * the iterator form of append ().
 */
UTF8String & UTF8String::operator+= (const UTF8String & rhs)
{
  append (rhs.begin (), rhs.end ());
  return *this;
}

bool UTF8String::append (const char * utf8str, bool clear)
{
  if (m_value->count () == 1)
    {
      if (clear) m_value->clear ();
    }
  else
    {
      m_value->unref ();
      if (clear)
	{
	  m_value = new UTF8StringValue(0);
	}
      else
	{
	  m_value = new UTF8StringValue(begin (), end ());
	}
      if (m_value == 0)
	{
	  throw std::bad_alloc();
	  return false;
	}
    }
  return m_value->append (utf8str);
}

// length=0 indicates 0-termination
bool UTF8String::append (const UCS4Char * ucs4str, int length)
{
  if (m_value->count () > 1)
    {
      m_value->unref ();
      m_value = new UTF8StringValue(begin (), end ());
      if (m_value == 0)
	{
	  throw std::bad_alloc();
	  return false;
	}
    }
  return m_value->append (ucs4str, length);
}

bool UTF8String::append (const const_iterator & ci_start, const const_iterator & ci_end)
{
  if (m_value->count () > 1)
    {
      m_value->unref ();
      m_value = new UTF8StringValue(begin (), end ());
      if (m_value == 0)
	{
	  throw std::bad_alloc();
	  return false;
	}
    }
  return m_value->append (ci_start, ci_end);
}

/* Copy str, replacing each substring that matches a pair's str1 with that
 * pair's str2.
 *
 * pairs must be {0,0}-terminated; counting also stops at the first pair
 * whose str1 or str2 has no byte buffer. At every position the pairs are
 * tried in order and the first match wins; scanning then resumes after the
 * whole matched substring, so replacement text is never itself re-scanned.
 * Pairs whose str1 is empty are ignored, since an empty pattern would match
 * at every position.
 *
 * Returns the new string; with no usable pairs this is a plain copy of str.
 */
UTF8String UTF8String::substring_replace (const UTF8String & str, const UTF8StringPair * pairs)
{
  int npair = 0;
  if (pairs)
    {
      const UTF8StringPair * p = pairs;
      while (true)
	{
	  if ((p->str1 == 0) || (p->str2 == 0)) break;
	  if ((p->str1->utf8_str () == 0) || (p->str2->utf8_str () == 0)) break; // hmm :(
	  npair++;
	  p++;
	}
    }
  if (npair == 0) return UTF8String(str);

  UTF8String r;

  const const_iterator ci_end = str.end ();
  const_iterator ci_start = str.begin (); // start of the pending unmatched run
  const_iterator ci = ci_start;
  while (ci != ci_end)
    {
      const char * utf8ptr = &ci;
      const UTF8String * match = 0;
      int match_bytes = 0;
      const UTF8StringPair * p = pairs;
      for (int i = 0; i < npair; i++, p++)
	{
	  const UTF8String * substr = p->str1;
	  const int len = substr->byte_length ();
	  if (len <= 0) continue; // an empty pattern cannot be replaced

	  // strncmp stops at str's terminating 0, so this never reads past
	  // the end of str even when the pattern is longer than what is left
	  if (strncmp (utf8ptr, substr->utf8_str (), len) == 0)
	    {
	      match = p->str2;
	      match_bytes = len;
	      break;
	    }
	}
      if (match == 0)
	{
	  ++ci;
	  continue;
	}

      // flush the unmatched text collected so far
      if (ci_start != ci) r.append (ci_start, ci);

      // Step over the WHOLE matched substring, not just its first character.
      // The matched bytes equal str1's bytes, so stepping one sequence at a
      // time lands exactly on the end of the match.
      const char * match_end = utf8ptr + match_bytes;
      while ((ci != ci_end) && (&ci < match_end)) ++ci;
      ci_start = ci;

      r.append (match->begin (), match->end ());
    }
  if (ci_start != ci_end) r.append (ci_start, ci_end);

  return r;
}
