/* UTF8-UCS4-String/lib/xp/ucs4string.cpp
 * 
 * Copyright (C) 2002 Francis James Franklin <fjf@alinameridon.com>
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <cstdlib>
#include <cstring>

#include <new>
#include <stdexcept>

#include "ucs4string.h"

/* Construct a value from a UTF-8 C string.
 *
 * The new value starts with a reference count of one, zero lengths and no
 * buffer. grow() is then asked for the initial buffer (it throws
 * std::bad_alloc if that fails) and the text is appended.
 */
UCS4String::UCS4StringValue::UCS4StringValue (const char * utf8str) :
  m_count(1),
  m_ucs4length(0),
  m_bytelength(0),
  m_bytemaxlen(0),
  m_ucs4str(0),
  m_utf8str(static_cast<const char *>(0))
{
  if (grow ())
    {
      append (utf8str);
    }
}

/* Construct a value from a raw UCS-4 buffer.
 *
 * ucs4str and length are handed unchanged to append (const UCS4Char *, int),
 * where a length of 0 means "ucs4str is 0-terminated". The new value starts
 * with a reference count of one; grow() throws std::bad_alloc if the initial
 * buffer cannot be obtained.
 */
UCS4String::UCS4StringValue::UCS4StringValue (const UCS4Char * ucs4str, int length) :
  m_count(1),
  m_ucs4length(0),
  m_bytelength(0),
  m_bytemaxlen(0),
  m_ucs4str(0),
  m_utf8str(static_cast<const char *>(0))
{
  if (grow ())
    {
      append (ucs4str, length);
    }
}

/* Construct a value holding a copy of the code points in [ci_start, ci_end).
 *
 * The new value starts with a reference count of one; grow() throws
 * std::bad_alloc if the initial buffer cannot be obtained, and append()
 * throws std::out_of_range if ci_end lies before ci_start.
 */
UCS4String::UCS4StringValue::UCS4StringValue (const iterator & ci_start, const iterator & ci_end) :
  m_count(1),
  m_ucs4length(0),
  m_bytelength(0),
  m_bytemaxlen(0),
  m_ucs4str(0),
  m_utf8str(static_cast<const char *>(0))
{
  if (grow ())
    {
      append (ci_start, ci_end);
    }
}

/* Release the UCS-4 buffer obtained with malloc()/realloc() in grow().
 */
UCS4String::UCS4StringValue::~UCS4StringValue ()
{
  // free() accepts a null pointer and does nothing in that case
  free (m_ucs4str);
}

void UCS4String::UCS4StringValue::clear () // set ucs4/bytelength to 0
{
  m_ucs4length = 0;
  m_bytelength = 0;

  if (m_utf8str.byte_length ()) m_utf8str = ""; // internal UTF-8 string no longer valid
}

/* not the most efficient way to do it, but this eliminates the need to validate the UTF-8
 * input string - the UTF8String class will take care of that.
 */
bool UCS4String::UCS4StringValue::append (const char * utf8str)
{
  UTF8String str(utf8str);
  return append (str.ucs4_str (), str.utf8_length ());
}

bool UCS4String::UCS4StringValue::append (const UCS4Char * ucs4str, int length) // length=0 indicates 0-termination
{
  if (length < 0) return true; // or is it?

  if ( ucs4str == 0) return true; // or is it?
  if (*ucs4str == 0) return true;

  int ucs4length = 0;
  if (length == 0)
    {
      // ucs4str is 0-terminated
      const UCS4Char * p = ucs4str;
      while (p)
	{
	  if ((*p) & ~0x7fffffff) break; // UCS-4 is only 31-bit!
	  ucs4length++;
	  p++;
	}
    }
  else
    {
      const UCS4Char * p = ucs4str;
      for (int i = 0; i < length; i++)
	{
	  if (*p == 0) break; // unexpected end-of-string
	  if ((*p) & ~0x7fffffff) break; // UCS-4 is only 31-bit!
	  ucs4length++;
	  p++;
	}
    }
  if (ucs4length == 0) return true; // or is it? (do we need a UCS4Invalid exception too??)

  if (!grow (ucs4length)) return false; // throws std::bad_alloc

  if (m_utf8str.byte_length ()) m_utf8str = ""; // internal UTF-8 string no longer valid

  int bytelength = ucs4length * sizeof (UCS4Char);

  UCS4Char * ucs4end = m_ucs4str + m_ucs4length;
  memcpy (ucs4end, ucs4str, bytelength);
  ucs4end += ucs4length;
  *ucs4end = 0;

  m_ucs4length += ucs4length;
  m_bytelength += bytelength;

  return true;
}

bool UCS4String::UCS4StringValue::append (const iterator & ci_start, const iterator & ci_end)
{
  int ucs4length = &ci_end - &ci_start;
  if (ucs4length == 0) return true;
  if (ucs4length < 0)
    {
      throw std::out_of_range("UCS4String::iterator - end iterator before start!");
      return false;
    }
  if (!grow (ucs4length)) return false; // throws std::bad_alloc

  if (m_utf8str.byte_length ()) m_utf8str = ""; // internal UTF-8 string no longer valid

  int bytelength = ucs4length * sizeof (UCS4Char);

  UCS4Char * ucs4end = m_ucs4str + m_ucs4length;
  memcpy (ucs4end, &ci_start, bytelength);
  ucs4end += ucs4length;
  *ucs4end = 0;

  m_ucs4length += ucs4length;
  m_bytelength += bytelength;

  return true;
}

/* Insert the code points in [ci_start, ci_end) in front of position ci.
 *
 * watch out for multiple references!
 *
 * Since ins() may need to realloc() the internal UCS-4 string buffer, the
 * reference insertion iterator (ci) is rebuilt against the (possibly moved)
 * buffer; any other iterator into this value may be left dangling.
 *
 * The source range may refer to this value's own buffer. Its position is
 * captured as an offset before the buffer can move, and the copy accounts
 * for the part of the source that the tail shift relocates.
 *
 * Returns true on success (an empty range is a successful no-op). Throws
 * std::out_of_range if ci_end lies before ci_start or if ci does not point
 * into this value; grow() throws std::bad_alloc.
 */
bool UCS4String::UCS4StringValue::ins (iterator & ci, const iterator & ci_start, const iterator & ci_end)
{
  int ucs4length = &ci_end - &ci_start;
  if (ucs4length == 0) return true;
  if (ucs4length < 0)
    {
      throw std::out_of_range("UCS4String::iterator - end iterator before start!");
    }

  int ci_offset = &ci - m_ucs4str;
  if ((ci_offset < 0) || (ci_offset > m_ucs4length))
    {
      throw std::out_of_range("UCS4String::iterator - iterator not for this string!");
    }

  // Capture the source now: ci_start may be the very same object as ci, and
  // the buffer it points into may be about to move.
  const UCS4Char * source = &ci_start;
  bool aliased = false;
  int src_offset = 0;
  if (m_ucs4str != 0)
    {
      const UCS4Char * buffer_end = m_ucs4str + (m_bytemaxlen / sizeof (UCS4Char));
      if ((source >= m_ucs4str) && (source < buffer_end))
        {
          aliased = true;
          src_offset = source - m_ucs4str;
        }
    }

  if (!grow (ucs4length)) return false; // throws std::bad_alloc
  ci = iterator (this, m_ucs4str + ci_offset); // throws std::out_of_range

  if (m_utf8str.byte_length ()) m_utf8str = ""; // internal UTF-8 string no longer valid

  int bytelength = ucs4length * sizeof (UCS4Char);

  // open a gap of ucs4length code points at the insertion point; the
  // terminating 0 travels with the tail
  UCS4Char * gap = m_ucs4str + ci_offset;
  memmove (gap + ucs4length, gap, (m_ucs4length + 1 - ci_offset) * sizeof (UCS4Char));

  if (!aliased)
    {
      memcpy (gap, source, bytelength);
    }
  else
    {
      // Source code points that sat before the insertion point have not
      // moved; those at or after it were shifted up by ucs4length.
      int head = ci_offset - src_offset;
      if (head < 0) head = 0;
      if (head > ucs4length) head = ucs4length;
      int tail = ucs4length - head;

      if (head) memcpy (gap, m_ucs4str + src_offset, head * sizeof (UCS4Char));
      if (tail) memcpy (gap + head, m_ucs4str + src_offset + head + ucs4length, tail * sizeof (UCS4Char));
    }

  m_ucs4length += ucs4length;
  m_bytelength += bytelength;

  return true;
}

/* Delete ucs4length code points starting at position ci.
 *
 * watch out for multiple references!
 *
 * Although the reference deletion iterator (ci) will remain valid, other
 * iterators may be affected and perhaps even invalidated - use with extreme
 * caution!
 *
 * Returns true on success. Throws std::out_of_range if ci does not point
 * into this value, if ucs4length is negative, or if the range to delete
 * extends beyond the end of the string.
 */
bool UCS4String::UCS4StringValue::del (const iterator & ci, int ucs4length)
{
  int ci_offset = &ci - m_ucs4str;
  if ((ci_offset < 0) || (ci_offset > m_ucs4length))
    {
      throw std::out_of_range("UCS4String::iterator - iterator not for this string!");
    }
  if (ucs4length < 0)
    {
      // a negative count would make the memmove below read from before ci
      // and write more code points than the buffer holds
      throw std::out_of_range("UCS4String::iterator - attempt to delete negative length!");
    }
  if (ci_offset + ucs4length > m_ucs4length)
    {
      throw std::out_of_range("UCS4String::iterator - attempt to delete beyond end of string!");
    }

  if (m_utf8str.byte_length ()) m_utf8str = ""; // internal UTF-8 string no longer valid

  int bytelength = ucs4length * sizeof (UCS4Char);

  // slide the tail (terminating 0 included) down over the deleted range
  memmove (const_cast<UCS4Char *>(&ci), &ci + ucs4length, ((m_ucs4length + 1) - (ci_offset + ucs4length)) * sizeof (UCS4Char));

  m_ucs4length -= ucs4length;
  m_bytelength -= bytelength;

  return true;
}

bool UCS4String::UCS4StringValue::set (const iterator & ci, UCS4Char ucs4)
{
  int ci_offset = &ci - m_ucs4str;
  if ((ci_offset < 0) || (ci_offset > m_ucs4length))
    {
      throw std::out_of_range("UCS4String::iterator - iterator not for this string!");
      return false;
    }
  if (m_ucs4str[ci_offset] == ucs4) return true; // no change!
  if (ci_offset == m_ucs4length)
    {
      throw std::out_of_range("UCS4String::iterator - attempt to set beyond end of string!");
      return false;
    }

  if (m_utf8str.byte_length ()) m_utf8str = ""; // internal UTF-8 string no longer valid

  if (ucs4 == 0) // hmm, truncation!
    {
      m_ucs4length = ci_offset;
      m_ucs4str[ci_offset] = ucs4;
      return true;
    }
  else if ((ucs4 & 0x7fffffff) == ucs4) // UCS-4 is 31-bit!
    {
      m_ucs4str[ci_offset] = ucs4;
      return true;
    }
  return false;
}

/* Return the string as UTF-8.
 *
 * The UTF-8 form is kept in m_utf8str and only rebuilt when that member is
 * empty while the UCS-4 string is not.
 */
const char * UCS4String::UCS4StringValue::utf8_str ()
{
  const bool cache_is_empty = (m_utf8str.byte_length () == 0);

  if (cache_is_empty && m_ucs4length)
    m_utf8str.append (m_ucs4str, m_ucs4length);

  return m_utf8str.utf8_str ();
}

/* size indicates required code points, *not* bytes
 */
bool UCS4String::UCS4StringValue::grow (int size)
{
  int new_size = m_bytelength + (size + 1) * sizeof (UCS4Char);
  if (new_size <= m_bytemaxlen) return true;

  /* If it's a big string then don't try to second-guess, but if it's a
   * small string then chances are there will be other small strings
   * to be appended also, so allocate in bunches...
   */
  if (size < 16) new_size = m_bytemaxlen + 128;

  if (m_ucs4str == 0)
    {
      m_ucs4str = (UCS4Char *) malloc (new_size);
      if (m_ucs4str == 0)
	{
	  throw std::bad_alloc();
	  return false;
	}
      *m_ucs4str = 0;
      m_bytemaxlen = new_size;
    }
  else
    {
      UCS4Char * more = (UCS4Char *) realloc ((void *) m_ucs4str, new_size);
      if (more == 0)
	{
	  throw std::bad_alloc();
	  return false;
	}
      m_ucs4str = more;
      m_bytemaxlen = new_size;
    }
  return true;
}

void UCS4String::iterator::set_ucs4ptr (const UCS4Char * ucs4ptr)
{
  if (m_value == 0)
    {
      throw std::out_of_range("UCS4String::iterator unassociated with UCS4String!");
      return;
    }
  if (m_value->ucs4_str () == 0)
    {
      throw std::out_of_range("UCS4String::iterator unassociated with string!");
      return;
    }
  if ((ucs4ptr <  m_value->ucs4_str ()) ||
      (ucs4ptr > (m_value->ucs4_str () + m_value->ucs4_length ())))
    {
      throw std::out_of_range("UCS4String::iterator out of range!");
      return;
    }
  m_ucs4ptr = const_cast<UCS4Char *>(ucs4ptr);
}

/* Construct an iterator over value, positioned at ucs4ptr.
 *
 * The position is validated by set_ucs4ptr(), which throws
 * std::out_of_range if value is null, has no buffer, or does not contain
 * ucs4ptr.
 */
UCS4String::iterator::iterator (const UCS4StringValue * value, const UCS4Char * ucs4ptr) :
  m_value(value),
  m_ucs4ptr(0)
{
  // set_ucs4ptr() stores the position once it has been checked
  set_ucs4ptr (ucs4ptr);
}

/* Advance the iterator by adv code points.
 *
 * The resulting position must lie within [start, end] of the string. Throws
 * std::out_of_range, leaving the iterator unchanged, if it would pass the
 * end or - for a negative adv - fall before the start.
 */
UCS4String::iterator & UCS4String::iterator::operator+= (int adv)
{
  if ((m_ucs4ptr - m_value->ucs4_str ()) + adv > m_value->ucs4_length ())
    {
      throw std::out_of_range("UCS4String::iterator too near end of string!");
    }
  if ((m_ucs4ptr - m_value->ucs4_str ()) + adv < 0)
    {
      // a negative advance is a retreat and gets the same protection as -=
      throw std::out_of_range("UCS4String::iterator too near start of string!");
    }
  m_ucs4ptr += adv;
  return *this;
}

/* Move the iterator back by ret code points.
 *
 * The resulting position must lie within [start, end] of the string. Throws
 * std::out_of_range, leaving the iterator unchanged, if it would fall before
 * the start or - for a negative ret - pass the end.
 */
UCS4String::iterator & UCS4String::iterator::operator-= (int ret)
{
  // compare offsets rather than forming a pointer that may lie outside the buffer
  if ((m_ucs4ptr - m_value->ucs4_str ()) < ret)
    {
      throw std::out_of_range("UCS4String::iterator too near start of string!");
    }
  if ((m_ucs4ptr - m_value->ucs4_str ()) - ret > m_value->ucs4_length ())
    {
      // a negative retreat is an advance and gets the same protection as +=
      throw std::out_of_range("UCS4String::iterator too near end of string!");
    }
  m_ucs4ptr -= ret;
  return *this;
}

/* Prefix increment: step forward one code point.
 *
 * Throws std::out_of_range if the iterator already sits on the terminating 0.
 */
UCS4String::iterator & UCS4String::iterator::operator++ ()
{
  const bool at_terminator = (*m_ucs4ptr == 0);
  if (at_terminator)
    throw std::out_of_range("UCS4String::iterator at end of string!");

  ++m_ucs4ptr;
  return *this;
}

/* Prefix decrement: step back one code point.
 *
 * Throws std::out_of_range if the iterator is already at (or before) the
 * start of the string.
 */
UCS4String::iterator & UCS4String::iterator::operator-- ()
{
  const bool can_step_back = (m_ucs4ptr > m_value->ucs4_str ());
  if (!can_step_back)
    throw std::out_of_range("UCS4String::iterator at start of string!");

  --m_ucs4ptr;
  return *this;
}

/* Compare the sequence between ci_start and ci_end with the string starting
 * at *this.
 *
 * ci_end itself takes part in the comparison when end_inclusive is set. The
 * comparison covers the shorter of that external sequence and the remainder
 * of this string (terminating 0 included), and the result is whatever
 * memcmp() returns for those bytes. Throws std::out_of_range if ci_end lies
 * before ci_start.
 */
int UCS4String::iterator::string_compare (const iterator & ci_start, const iterator & ci_end, bool end_inclusive) const
{
  int other_count = &ci_end - &ci_start;
  if (other_count < 0)
    throw std::out_of_range("UCS4String::iterator - end iterator before start!");

  if (end_inclusive) ++other_count;

  // code points left in this string from the current position, terminator included
  int own_count = (m_value->ucs4_length () + 1) - (m_ucs4ptr - m_value->ucs4_str ());

  int compare_count = other_count;
  if (own_count < other_count) compare_count = own_count;

  int compare_bytes = compare_count * sizeof (UCS4Char);

  return memcmp (m_ucs4ptr, &ci_start, compare_bytes);
}

bool UCS4String::iterator::ucs4_strcmp (const UCS4Char * str) const
{
  const UCS4Char * str_ptr = str;
  const UCS4Char * ucs4ptr = m_value->ucs4_str ();
  int length = m_value->ucs4_length ();
  bool match = true;
  for (int i = 0; i <= length; i++)
    {
      if (*str_ptr != *ucs4ptr)
	{
	  match = false;
	  break;
	}
      str_ptr++;
      ucs4ptr++;
    }
  return match;
}

/* Search forward from the current position for str.
 *
 * The iterator itself is moved: onto the first occurrence of str if there
 * is one, otherwise onto the terminating 0 (i.e. end()). An empty str leaves
 * the iterator where it is. Returns a reference to self.
 */
const UCS4String::iterator & UCS4String::iterator::ucs4_strstr (const UCS4String & str)
{
  const UCS4Char * needle = str.ucs4_str ();

  const int needle_length = str.ucs4_length ();
  if (needle_length == 0) return *this;

  for (; *m_ucs4ptr != 0; ++m_ucs4ptr)
    {
      // cheap filter: only positions matching the first code point are candidates
      if (*m_ucs4ptr != needle[0]) continue;

      // a premature terminator in the haystack mismatches and ends the scan
      int matched = 0;
      while ((matched < needle_length) && (needle[matched] == m_ucs4ptr[matched]))
        {
          ++matched;
        }
      if (matched == needle_length) break;
    }
  return *this;
}

/* Construct a string from a UTF-8 C string, held in a freshly allocated
 * UCS4StringValue. Throws std::bad_alloc if no value could be allocated.
 */
UCS4String::UCS4String (const char * utf8str) :
  m_value(new UCS4StringValue(utf8str))
{
  if (!m_value)
    {
      throw std::bad_alloc();
    }
}

/* Construct a string from a raw UCS-4 buffer, held in a freshly allocated
 * UCS4StringValue; length=0 indicates that ucs4str is 0-terminated.
 * Throws std::bad_alloc if no value could be allocated.
 */
UCS4String::UCS4String (const UCS4Char * ucs4str, int length) :
  m_value(new UCS4StringValue(ucs4str, length))
{
  if (!m_value)
    {
      throw std::bad_alloc();
    }
}

/* Copy constructor: share rhs's value instead of duplicating the text, and
 * register the additional reference on it.
 */
UCS4String::UCS4String (const UCS4String & rhs) : m_value(rhs.m_value)
{
  m_value->ref ();
}

/* Give up this string's reference; the value is deleted when unref()
 * reports that no references remain.
 */
UCS4String::~UCS4String ()
{
  if (m_value->unref () == 0)
    {
      delete m_value;
    }
}

/* Assign from a UTF-8 C string, replacing the current contents.
 * If utf8str is 0, create empty string "".
 */
UCS4String & UCS4String::operator= (const char * utf8str)
{
  const bool replace_contents = true;
  append (utf8str, replace_contents);
  return *this;
}

/* Assign from a UTF8String, replacing the current contents with its text.
 */
UCS4String & UCS4String::operator= (const UTF8String & utf8str)
{
  const bool replace_contents = true;
  append (utf8str.utf8_str (), replace_contents);
  return *this;
}

/* Assign from another UCS4String by sharing its value.
 *
 * Nothing happens when both strings already share one value (which covers
 * self-assignment). Otherwise the current value is released - and deleted
 * if this was its last reference - before rhs's value is adopted.
 */
UCS4String & UCS4String::operator= (const UCS4String & rhs)
{
  if (m_value != rhs.m_value)
    {
      if (m_value->unref () == 0)
        {
          delete m_value;
        }
      m_value = rhs.m_value;
      m_value->ref ();
    }
  return *this;
}

/* Assign from a 0-terminated UCS-4 string, replacing the current contents.
 */
UCS4String & UCS4String::operator= (const UCS4Char * ucs4str)
{
  const bool replace_contents = true;
  append (ucs4str, replace_contents);
  return *this;
}

/* Append a UTF-8 C string to the current contents.
 * If utf8str is 0, assume empty string "".
 */
UCS4String & UCS4String::operator+= (const char * utf8str)
{
  const bool replace_contents = false;
  append (utf8str, replace_contents);
  return *this;
}

/* Append the text of a UTF8String to the current contents.
 */
UCS4String & UCS4String::operator+= (const UTF8String & utf8str)
{
  const bool replace_contents = false;
  append (utf8str.utf8_str (), replace_contents);
  return *this;
}

/* Append the whole of another UCS4String, i.e. the range
 * [rhs.begin (), rhs.end ()), to the current contents.
 */
UCS4String & UCS4String::operator+= (const UCS4String & rhs)
{
  const iterator first = rhs.begin ();
  const iterator last = rhs.end ();

  append (first, last);
  return *this;
}

/* Append a 0-terminated UCS-4 string to the current contents.
 */
UCS4String & UCS4String::operator+= (const UCS4Char * ucs4str)
{
  const bool replace_contents = false;
  append (ucs4str, replace_contents);
  return *this;
}

/* Append a single code point, passed on as a one-element buffer.
 */
UCS4String & UCS4String::operator+= (UCS4Char ucs4)
{
  const UCS4Char * single = &ucs4;
  append (single, 1);
  return *this;
}

bool UCS4String::append (const char * utf8str, bool clear)
{
  if (m_value->count () == 1)
    {
      if (clear) m_value->clear ();
    }
  else
    {
      m_value->unref ();
      if (clear)
	{
	  m_value = new UCS4StringValue(0);
	}
      else
	{
	  m_value = new UCS4StringValue(begin (), end ());
	}
      if (m_value == 0)
	{
	  throw std::bad_alloc();
	  return false;
	}
    }
  return m_value->append (utf8str);
}

bool UCS4String::append (const UCS4Char * ucs4str, bool clear)
{
  if (m_value->count () == 1)
    {
      if (clear) m_value->clear ();
    }
  else
    {
      m_value->unref ();
      if (clear)
	{
	  m_value = new UCS4StringValue(0);
	}
      else
	{
	  m_value = new UCS4StringValue(begin (), end ());
	}
      if (m_value == 0)
	{
	  throw std::bad_alloc();
	  return false;
	}
    }
  return m_value->append (ucs4str, 0);
}

// length=0 indicates 0-termination
bool UCS4String::append (const UCS4Char * ucs4str, int length)
{
  if (m_value->count () > 1)
    {
      m_value->unref ();
      m_value = new UCS4StringValue(begin (), end ());
      if (m_value == 0)
	{
	  throw std::bad_alloc();
	  return false;
	}
    }
  return m_value->append (ucs4str, length);
}

bool UCS4String::append (const iterator & ci_start, const iterator & ci_end)
{
  if (m_value->count () > 1)
    {
      m_value->unref ();
      m_value = new UCS4StringValue(begin (), end ());
      if (m_value == 0)
	{
	  throw std::bad_alloc();
	  return false;
	}
    }
  return m_value->append (ci_start, ci_end);
}

/* Insert the code points in [ci_start, ci_end) in front of position ci.
 *
 * The reference insertion iterator (ci) will be updated if necessary;
 * other iterators may be affected - use with extreme caution!
 *
 * If the value is shared (count above 1) this string first switches to a
 * private copy of the current text and ci is re-pointed at the same offset
 * in that copy. Returns the result of the value's ins(). Throws
 * std::out_of_range if ci does not point into this string, std::bad_alloc
 * if the private copy cannot be allocated.
 */
bool UCS4String::ins (iterator & ci, const iterator & ci_start, const iterator & ci_end)
{
  if (m_value->count () > 1)
    {
      int ci_offset = &ci - m_value->ucs4_str ();
      if ((ci_offset < 0) || (ci_offset > m_value->ucs4_length ()))
        {
          throw std::out_of_range("UCS4String::iterator - iterator not for this string!");
        }

      // Allocate before releasing the shared value: if allocation throws,
      // this string must still own the reference it is holding.
      UCS4StringValue * fresh = new UCS4StringValue(begin (), end ());
      if (fresh == 0)
        {
          throw std::bad_alloc();
        }
      m_value->unref ();
      m_value = fresh;

      ci = iterator (m_value, m_value->ucs4_str () + ci_offset); // throws std::out_of_range
    }
  return m_value->ins (ci, ci_start, ci_end);
}

/* Delete ucs4length code points starting at position ci.
 *
 * The reference deletion iterator (ci) will be updated if necessary;
 * other iterators may be affected - use with extreme caution!
 *
 * If the value is shared (count above 1) this string first switches to a
 * private copy of the current text and ci is re-pointed at the same offset
 * in that copy. Returns the result of the value's del(). Throws
 * std::out_of_range if ci does not point into this string, std::bad_alloc
 * if the private copy cannot be allocated.
 */
bool UCS4String::del (iterator & ci, int ucs4length)
{
  if (m_value->count () > 1)
    {
      int ci_offset = &ci - m_value->ucs4_str ();
      if ((ci_offset < 0) || (ci_offset > m_value->ucs4_length ()))
        {
          throw std::out_of_range("UCS4String::iterator - iterator not for this string!");
        }

      // Allocate before releasing the shared value: if allocation throws,
      // this string must still own the reference it is holding.
      UCS4StringValue * fresh = new UCS4StringValue(begin (), end ());
      if (fresh == 0)
        {
          throw std::bad_alloc();
        }
      m_value->unref ();
      m_value = fresh;

      ci = iterator (m_value, m_value->ucs4_str () + ci_offset); // throws std::out_of_range
    }
  return m_value->del (ci, ucs4length);
}

/* Overwrite the code point at position ci with ucs4.
 *
 * The reference iterator (ci) will be updated if necessary; other iterators
 * may be affected - use with extreme caution!
 *
 * If the value is shared (count above 1) this string first switches to a
 * private copy of the current text and ci is re-pointed at the same offset
 * in that copy. Returns the result of the value's set(). Throws
 * std::out_of_range if ci does not point into this string, std::bad_alloc
 * if the private copy cannot be allocated.
 */
bool UCS4String::set (iterator & ci, UCS4Char ucs4)
{
  if (m_value->count () > 1)
    {
      int ci_offset = &ci - m_value->ucs4_str ();
      if ((ci_offset < 0) || (ci_offset > m_value->ucs4_length ()))
        {
          throw std::out_of_range("UCS4String::iterator - iterator not for this string!");
        }

      // Allocate before releasing the shared value: if allocation throws,
      // this string must still own the reference it is holding.
      UCS4StringValue * fresh = new UCS4StringValue(begin (), end ());
      if (fresh == 0)
        {
          throw std::bad_alloc();
        }
      m_value->unref ();
      m_value = fresh;

      ci = iterator (m_value, m_value->ucs4_str () + ci_offset); // throws std::out_of_range
    }
  return m_value->set (ci, ucs4);
}
