/* libeXtra/utils/utf8string.h
 *
 * Copyright (C) 2002 Francis James Franklin <fjf@alinameridon.com>
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef LIBEXTRA_UTF8STRING_H
#define LIBEXTRA_UTF8STRING_H

/* UTF-8 and UCS-4 have a defined logical & lossless mapping. In UTF-8 a code
 * point is represented by a sequence of up to 6 bytes. In UCS-4 the same code
 * point is represented by a single 31-bit number.
 * 
 * The neat thing about UTF-8 is of course that it can be handled as strings.
 */
class UTF8String
{
 public:
  /* Storage type for one UCS-4 code point. Assume nothing about it beyond
   * its ability to hold a 31-bit UCS-4 value: on some platforms wchar_t may
   * be a wiser choice than unsigned long, but wchar_t is not known to be
   * 32-bit and unsigned everywhere.
   */
  typedef unsigned long UCS4Char;

  /* Widens one char to UCS4Char by way of unsigned char, so a byte with its
   * top bit set is not sign-extended.
   */
  static UCS4Char UCS4Cast (char c)
    {
      const unsigned char byte = static_cast<unsigned char>(c);
      return static_cast<UCS4Char>(byte);
    }

  /* Decodes the first UTF-8 sequence of utf8str and returns its UCS-4 code,
   * or (UCS4Char)(-1) if utf8str is not valid UTF-8.
   */
  static UCS4Char UCS4Code (const char * utf8str);

  /* WARNING: sequence_length is overloaded on a pointer and on an integer:
   *            sequence_length (const char * p)
   *            sequence_length (UCS4Char u)
   *          so take care that the argument's type selects the overload
   *          you intend (a bare literal 0 fits both).
   */

  /* Classifies the byte that p points at:
   *   1..6  *p is the lead byte of a UTF-8 sequence of that many bytes
   *   0     *p is the end-of-string terminator (0)
   *  -1     p is 0, or *p cannot start a UTF-8 sequence (it is a trailing
   *         byte, or 0xfe / 0xff)
   * Only the lead byte is examined; the bytes that follow are not checked.
   */
  static int sequence_length (const char * p)
    {
      if (!p) return -1; // no string!

      // Work on the unsigned byte value so plain range tests can be used.
      const unsigned int lead = static_cast<unsigned char>(*p);

      if (lead == 0x00) return  0; // end-of-string
      if (lead <  0x80) return  1; // 0xxxxxxx - plain us-ascii
      if (lead <  0xc0) return -1; // 10xxxxxx - trailing byte
      if (lead <  0xe0) return  2; // 110xxxxx
      if (lead <  0xf0) return  3; // 1110xxxx
      if (lead <  0xf8) return  4; // 11110xxx
      if (lead <  0xfc) return  5; // 111110xx
      if (lead <  0xfe) return  6; // 1111110x
      return -1;                   // 0xfe and 0xff never occur in UTF-8
    }

  /* Number of bytes (1-6) the UTF-8 encoding of code point u occupies:
   *   0   u is end-of-string (i.e., 0)
   *  -1   u is not valid UCS-4 (some bit above bit 30 is set)
   */
  static int sequence_length (UCS4Char u)
    {
      if (u > 0x7fffffffUL) return -1; // UCS-4 is only 31-bit!
      if (u == 0) return 0;            // end-of-string

      if (u < 0x00000080UL) return 1;  // fits in  7 bits
      if (u < 0x00000800UL) return 2;  // fits in 11 bits
      if (u < 0x00010000UL) return 3;  // fits in 16 bits
      if (u < 0x00200000UL) return 4;  // fits in 21 bits
      if (u < 0x04000000UL) return 5;  // fits in 26 bits
      return 6;                        // needs up to 31 bits
    }

  /* True if *p is a trailing ('continuing') byte of a multi-byte UTF-8
   * sequence, i.e. has the bit pattern 10xxxxxx. False if p is 0.
   */
  static bool trailing_byte (const char * p)
    {
      return (p != 0) && ((static_cast<unsigned char>(*p) & 0xc0) == 0x80);
    }

  class const_iterator; // defined further down; declared here for UTF8StringValue

 private:
  /* Reference-counted holder of the string data. The counter is only
   * adjusted through ref()/unref(); nothing in this header shows what is
   * done when it changes.
   */
  class UTF8StringValue
    {
    public:
      UTF8StringValue (const char * utf8str);
      UTF8StringValue (const UCS4Char * ucs4str, int length); // length=0 means ucs4str is 0-terminated
      UTF8StringValue (const const_iterator & ci_start, const const_iterator & ci_end);

      ~UTF8StringValue ();

      void clear (); // set utf8/bytelength to 0

      /* Appending modifies this value in place - watch out for multiple
       * references!
       * NOTE(review): the bool result presumably reports success; confirm
       * against the implementation.
       */
      bool append (const char * utf8str);
      bool append (const UCS4Char * ucs4str, int length); // length=0 means ucs4str is 0-terminated
      bool append (const const_iterator & ci_start, const const_iterator & ci_end);

      /* current reference count */
      int count () const { return m_count; }

      /* increment / decrement the reference count; the new count is returned */
      int ref ()   { return ++m_count; }
      int unref () { return --m_count; }

      const char * utf8_str () const { return m_utf8str; }

      int utf8_length () const { return m_utf8length; } // in code points
      int byte_length () const { return m_bytelength; } // in bytes

      /* Not const. NOTE(review): presumably (re)builds m_ucs4str on
       * request; confirm against the implementation.
       */
      const UCS4Char * ucs4_str ();

    private:
      int m_count; // reference count

      int m_utf8length; // number of code points
      int m_bytelength; // length in bytes, excluding terminating 0
      int m_bytemaxlen; // buffer size

      char * m_utf8str;

      UCS4Char * m_ucs4str;

      /* NOTE(review): presumably enlarges the byte buffer; the meaning of
       * size is not visible in this header.
       */
      bool grow (int size = 32);
    };

  UTF8StringValue * m_value;

 public:
  /* Read-only, bidirectional walk over the code points of one string.
   *
   * WARNING: Do *not* perform any non-const operations on a string while
   *          using iterators. The results are undefined and probably
   *          catastrophic!
   */
  class const_iterator
    {
    private:
      const UTF8StringValue * m_value;

      const char * m_utf8ptr;

      /* Repositions the iterator. An iterator belongs to one particular
       * string: utf8ptr must lie within that string and must not point to
       * a trailing byte. Pointing at the terminating 0 is valid.
       */
      void set_utf8ptr (const char * utf8ptr); // throws std::out_of_range

    public:
      /* calls set_utf8ptr() - throws std::out_of_range
       */
      const_iterator (const UTF8StringValue * value, const char * utf8ptr);

      /* UCS-4 code of the sequence at the current position */
      inline UCS4Char operator* () const { return UTF8String::UCS4Code(m_utf8ptr); }

      /* Raw byte pointer of the current position. Because unary & is
       * overloaded, `&it' does NOT give the address of the iterator object;
       * the file-scope operator== and operator!= compare iterators by
       * means of this pointer.
       */
      inline const char * operator& () const { return m_utf8ptr; }

      const_iterator & operator++ (); // prefix; throws std::out_of_range
      const_iterator & operator-- (); // prefix; throws std::out_of_range

      /* Postfix forms: step via the prefix operator and hand back a copy of
       * the iterator as it was before the step.
       */
      inline const const_iterator operator++ (int /* */)
        {
          const const_iterator before (*this);
          operator++ ();
          return before;
        }
      inline const const_iterator operator-- (int /* */)
        {
          const const_iterator before (*this);
          operator-- ();
          return before;
        }
    };

  /* Iterators are obtained from the string itself, through begin(), end(),
   * or offset():
   *
   *   UTF8String utf8 = "Hello, World!";
   *   UTF8String::const_iterator ci = utf8.begin ();
   *
   * WARNING: When using offset(ptr), ptr must be a non-trailing byte within
   *          the string, or a std::out_of_range exception will be thrown!
   */
  inline const const_iterator begin () const
    {
      const char * first = m_value->utf8_str ();
      return const_iterator (m_value, first);
    }
  inline const const_iterator end () const
    {
      // one past the last byte, i.e. the position of the terminating 0
      const char * first = m_value->utf8_str ();
      const char * last  = first + m_value->byte_length ();
      return const_iterator (m_value, last);
    }
  inline const const_iterator offset (const char * utf8ptr) const // throws std::out_of_range
    {
      return const_iterator (m_value, utf8ptr);
    }

  UTF8String (const char * utf8str = 0);                 // 0 gives the empty string ""
  UTF8String (const UCS4Char * ucs4str, int length = 0); // length=0 means ucs4str is 0-terminated
  UTF8String (const UTF8String & rhs);                   // copy constructor

  ~UTF8String ();

  UTF8String & operator= (const char * utf8str);   // 0 gives the empty string ""
  UTF8String & operator= (const UTF8String & rhs);

  UTF8String & operator+= (const char * utf8str);  // 0 is treated as the empty string ""
  UTF8String & operator+= (const UTF8String & rhs);

  /* NOTE: Where possible, prefer the const_iterator form of append().
   */
 private:
  bool append (const char * utf8str, bool clear = false); // internal use only
 public:
  bool append (const UCS4Char * ucs4str, int length = 0); // length=0 means ucs4str is 0-terminated
  bool append (const const_iterator & ci_start, const const_iterator & ci_end);

  /* UTF-8 bytes of the string. The pointer stays valid until the next
   * non-const operation. A legal pointer is _always_ returned, even for an
   * empty (0) string (it shouldn't be 0 - unless we're suffering major
   * memory-allocation problems!).
   */
  const char * utf8_str () const { return m_value->utf8_str (); }

  /* length of string in codes/sequences
   */
  int utf8_length () const { return m_value->utf8_length (); }

  /* length of string in bytes
   */
  int byte_length () const { return m_value->byte_length (); }

  /* UCS-4 form of the string. The pointer stays valid until the next
   * non-const operation. A legal pointer is _always_ returned, even for an
   * empty (0) string (it shouldn't be 0 - unless we're suffering major
   * memory-allocation problems!).
   */
  const UCS4Char * ucs4_str () { return m_value->ucs4_str (); }

  /* One search/replace rule for substring_replace(). */
  struct UTF8StringPair
  {
    UTF8String * str1; // substring to look for
    UTF8String * str2; // what to put in its place
  };

  /* Returns a copy of str in which substrings matching a pair's str1 have
   * been replaced by that pair's str2.
   * Note: pairs must be {0,0}-terminated.
   */
  static UTF8String substring_replace (const UTF8String & str, const UTF8StringPair * pairs);
};

/* Concatenation of two strings: returns a new string holding a copy of s1
 * with s2 appended by operator+=; neither argument is modified.
 */
inline UTF8String operator+ (const UTF8String & s1, const UTF8String & s2)
{
  UTF8String joined (s1); // start from a copy of the left-hand side

  joined += s2;

  return joined;
}

/* Concatenation of a string and a C string: returns a new string holding a
 * copy of s1 with s2 appended by operator+= (const char *).
 */
inline UTF8String operator+ (const UTF8String & s1, const char * s2)
{
  UTF8String joined (s1); // start from a copy of the left-hand side

  joined += s2;

  return joined;
}

/* Concatenation of a C string and a string: builds a new string from s1
 * with the const char * constructor, then appends s2 by operator+=.
 */
inline UTF8String operator+ (const char * s1, const UTF8String & s2)
{
  UTF8String joined (s1); // construct the left-hand side from the C string

  joined += s2;

  return joined;
}

/* Two iterators differ when they refer to different byte positions.
 * const_iterator overloads unary & to yield its byte pointer, so it is
 * those pointers - not the addresses of the iterator objects - that are
 * compared here.
 */
inline bool operator!= (const UTF8String::const_iterator & ci1,
                        const UTF8String::const_iterator & ci2)
{
  const char * lhs = ci1.operator& ();
  const char * rhs = ci2.operator& ();

  return lhs != rhs;
}

/* Two iterators are equal when they refer to the same byte position.
 * const_iterator overloads unary & to yield its byte pointer, so it is
 * those pointers - not the addresses of the iterator objects - that are
 * compared here.
 */
inline bool operator== (const UTF8String::const_iterator & ci1,
                        const UTF8String::const_iterator & ci2)
{
  const char * lhs = ci1.operator& ();
  const char * rhs = ci2.operator& ();

  return lhs == rhs;
}

#endif /* ! LIBEXTRA_UTF8STRING_H */
