/* UTF8-UCS4-String/lib/xp/utf8test.cpp
 * 
 * Copyright (C) 2002 Francis James Franklin <fjf@alinameridon.com>
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <cstdio>
#include <cstring>

#include "utf8string.h"
#include "ucs4string.h"
#include "utf8stringpair.h"
#include "utf8stringmap.h"

/* Built-in test suites; main() runs them in this order when the program
 * is started without any command-line arguments.
 */
void utf8test ();
void ucs4test ();
void map_test ();
void cca_test ();

/* Command-line mode: prog() scans the argument vector and calls encode()
 * for the file name that follows an "--encode" option.
 */
int prog (int argc, const char * const * argv);
int encode (const char * filename);

/* Program entry point.
 *
 * Without command-line arguments every built-in test suite is run in turn
 * and 0 is returned. With one or more arguments the argument vector is
 * handed to prog() and its status becomes the exit code.
 */
int main (int argc, char ** argv)
{
  const bool have_arguments = (argc > 1);

  if (!have_arguments)
    {
      utf8test ();
      ucs4test ();
      map_test ();
      cca_test ();

      return 0;
    }

  return prog (argc, argv);
}

void utf8test ()
{
  fprintf (stdout, "===========\n");
  fprintf (stdout, "UTF-8 Tests\n");
  fprintf (stdout, "===========\n");
  fprintf (stdout, "\n");

  UTF8String empty;

  UTF8String hello("Hello,");
  UTF8String world("world!");

  UTF8String h2 = hello;
  h2 += " ";

  UTF8String h2w = h2 + world;

  UTF8String am = "Add before: " + h2w;
  UTF8String pm = h2w + " - add after.";

  UTF8String ampm = "Add before: " + h2w + " - and also add after.";

  UTF8String um(""); // iso not utf
  UTF8String laut = "\"P en grn ng\" means 'On a green meadow'"; // iso not utf
  UTF8String utf8a("äöü");
  UTF8String utf8b = "På en grön äng";

  fprintf (stdout, "empty: '%s'\n", empty.utf8_str ());
  fprintf (stdout, "hello: '%s'\n", hello.utf8_str ());
  fprintf (stdout, "world: '%s'\n", world.utf8_str ());
  fprintf (stdout, "   h2: '%s'\n",    h2.utf8_str ());
  fprintf (stdout, "  h2w: '%s'\n",   h2w.utf8_str ());
  fprintf (stdout, "   am: '%s'\n",    am.utf8_str ());
  fprintf (stdout, "   pm: '%s'\n",    pm.utf8_str ());
  fprintf (stdout, " ampm: '%s'\n",  ampm.utf8_str ());
  fprintf (stdout, "   um: '%s'\n",    um.utf8_str ());
  fprintf (stdout, " laut: '%s'\n",  laut.utf8_str ());
  fprintf (stdout, "utf8a: '%s'\n", utf8a.utf8_str ());
  fprintf (stdout, "utf8b: '%s'\n", utf8b.utf8_str ());

  fprintf (stdout, "\n");

  const UTF8String::const_iterator ci_end = utf8b.end ();
  for (UTF8String::const_iterator ci = utf8b.begin (); ci != ci_end; ++ci)
    {
      const char * utf8str = &ci;
      int seql = UTF8String::sequence_length (utf8str);
      for (int i = 0; i < seql; i++) fprintf (stdout, "'%c'", utf8str[i]);
      fprintf (stdout, "[%d]", seql);
    }
  fprintf (stdout, "\n");

  fprintf (stdout, "\n");

  UTF8String::UCS4Char junk[] = {
    0x00000043,
    0x00000430,
    0x000049f1,
    0x0007be33,
    0x00cb0d37,
    0x24b21f3a,
    0x6f8f6fab,
    0x00000000
  };
  UTF8String ucs4(junk);
  fprintf (stdout, " ucs4: '%s'\n",  ucs4.utf8_str ());
  fprintf (stdout, "codes: '%d'\n",  ucs4.utf8_length ());
  fprintf (stdout, "bytes: '%d'\n",  ucs4.byte_length ());

  const UTF8String::UCS4Char * back = ucs4.ucs4_str ();
  for (int i = 0; i < ucs4.utf8_length (); i++)
    {
      fprintf (stdout, " [0x%08lx,0x%08lx]", (unsigned long) junk[i], (unsigned long) back[i]);
      fprintf (stdout, " (%d)\n", UTF8String::sequence_length (back[i]));
    }

  fprintf (stdout, "\n");

  UTF8String i_lt = "<";
  UTF8String o_lt = "&lt;";
  UTF8String i_gt = ">";
  UTF8String o_gt = "&gt;";
  UTF8String i_amp = "&";
  UTF8String o_amp = "&amp;";
  UTF8String::UTF8StringPair pairs[4];
  pairs[0].str1 = &i_lt;
  pairs[0].str2 = &o_lt;
  pairs[1].str1 = &i_gt;
  pairs[1].str2 = &o_gt;
  pairs[2].str1 = &i_amp;
  pairs[2].str2 = &o_amp;
  pairs[3].str1 = 0;
  pairs[3].str2 = 0;
  UTF8String in = "<?xml version=\"1.0\"?>\n<html>\n<body>\n<h1>Search &amp; Replace</h1>\n</body>\n</html>\n";
  UTF8String out = UTF8String::substring_replace (in, pairs);

  fprintf (stdout, "   in: '%s'\n",    in.utf8_str ());
  fprintf (stdout, "  out: '%s'\n",   out.utf8_str ());

  UTF8String cmp_long("This is a long string.");
  UTF8String cmp_shor("This is a lo");

  fprintf (stdout, "'%s' == '%s': ", cmp_long.utf8_str (), cmp_shor.utf8_str ());
  if (cmp_long == cmp_shor)
    fprintf (stdout, "true, ");
  else
    fprintf (stdout, "false, ");
  if (cmp_long == cmp_shor.utf8_str ())
    fprintf (stdout, "true, ");
  else
    fprintf (stdout, "false, ");
  if (cmp_long == 0)
    fprintf (stdout, "true, ");
  else
    fprintf (stdout, "false, ");
  if (cmp_shor == cmp_long)
    fprintf (stdout, "true\n");
  else
    fprintf (stdout, "false\n");

  fprintf (stdout, "'%s' != '%s': ", cmp_long.utf8_str (), cmp_shor.utf8_str ());
  if (cmp_long != cmp_shor)
    fprintf (stdout, "true, ");
  else
    fprintf (stdout, "false, ");
  if (cmp_long != cmp_shor.utf8_str ())
    fprintf (stdout, "true, ");
  else
    fprintf (stdout, "false, ");
  if (cmp_long != 0)
    fprintf (stdout, "true, ");
  else
    fprintf (stdout, "false, ");
  if (cmp_shor != cmp_long)
    fprintf (stdout, "true\n");
  else
    fprintf (stdout, "false\n");

  fprintf (stdout, "'%s' < '%s': ", cmp_long.utf8_str (), cmp_shor.utf8_str ());
  if (cmp_long < cmp_shor)
    fprintf (stdout, "true, ");
  else
    fprintf (stdout, "false, ");
  if (cmp_shor < cmp_long)
    fprintf (stdout, "false, ");
  else
    fprintf (stdout, "true, ");
  if (cmp_shor < cmp_shor)
    fprintf (stdout, "true\n");
  else
    fprintf (stdout, "false\n");

  fprintf (stdout, "'%s' starts with '%s': ", cmp_long.utf8_str (), cmp_shor.utf8_str ());
  if (cmp_long.compare (cmp_shor, true) == 0)
    fprintf (stdout, "true, ");
  else
    fprintf (stdout, "false, ");
  if (cmp_shor.compare (cmp_long, true) == 0)
    fprintf (stdout, "false, ");
  else
    fprintf (stdout, "true, ");
  if (cmp_shor.compare (&(cmp_shor.begin()), &(cmp_shor.end())) == 0)
    fprintf (stdout, "true, ");
  else
    fprintf (stdout, "false, ");
  if (cmp_long.compare (&(cmp_shor.begin()), &(cmp_shor.end())) == 0)
    fprintf (stdout, "false, ");
  else
    fprintf (stdout, "true, ");
  if (cmp_long.compare (&(cmp_shor.begin()), &(cmp_shor.end()), true) == 0)
    fprintf (stdout, "true\n");
  else
    fprintf (stdout, "false\n");

  cmp_long.prepend ("] prepend: ");
  cmp_long.append (cmp_shor.begin(), cmp_shor.end(), true);
  cmp_long.prepend ("[");
  fprintf (stdout, "%s\n", cmp_long.utf8_str ());

  fprintf (stdout, "\n");

  UTF8String edit("The quick brown fox jumped over the lazy dogs.");
  fprintf (stdout,"edit: %s\n", edit.utf8_str ());

  UTF8String ins1("(Oh!)");
  UTF8String::const_iterator ci = edit.begin ();
  while (ci != edit.end ())
    {
      if (*ci == UTF8String::UCS4Cast ('o'))
	{
	  edit.del (ci);
	  edit.ins (ci, ins1);
	  ++++++++++ci;
	}
      else ++ci;
    }
  fprintf (stdout,"edit: %s\n", edit.utf8_str ());

  ci = edit.begin ();
  while (ci != edit.end ())
    {
      if (*ci == UTF8String::UCS4Cast ('O'))
	{
	  edit.set (ci, UTF8String::UCS4Cast (''));
	}
      else if (*ci == UTF8String::UCS4Cast ('!'))
	{
	  ++ci;
	  edit.ins (ci, UTF8String::UCS4Cast (''));
	}
      ++ci;
    }
  fprintf (stdout,"edit: %s\n", edit.utf8_str ());

  fprintf (stdout, "\n");
}

void ucs4test ()
{
  fprintf (stdout, "===========\n");
  fprintf (stdout, "UCS-4 Tests\n");
  fprintf (stdout, "===========\n");
  fprintf (stdout, "\n");

  UCS4String s1("Once upon a time");
  UCS4String s2("in a galaxy populated by a race of beings called dochits (pron. doe-chits not do-kits)");
  UCS4String s3 = "who looked basically human but they averaged only 1m tall.";
  UCS4String s4 = s1 + ", " + s2 + " " + s3;

  fprintf (stdout, "s1: '%s'\n", s1.utf8_str ());
  fprintf (stdout, "s2: '%s'\n", s2.utf8_str ());
  fprintf (stdout, "s3: '%s'\n", s3.utf8_str ());
  fprintf (stdout, "s4: '%s'\n", s4.utf8_str ());

  UCS4String s5 = s3;
  UCS4String::iterator ci = s5.begin ();
  while (ci != s5.end ())
    {
      if (*ci == UCS4String::UCS4Cast ('u'))
	{
	  s5.set (ci, UCS4String::UCS4Cast ('v'));
	}
      ++ci;
    }
  fprintf (stdout, "s3: '%s'\n", s3.utf8_str ());
  fprintf (stdout, "s5: '%s'\n", s5.utf8_str ());

  UCS4String s6 = s5;
  ci = s6.begin ();
  while (ci != s6.end ())
    {
      if (*ci == UCS4String::UCS4Cast ('v'))
	{
	  s6.del (ci, 1);
	}
      else ++ci;
    }
  fprintf (stdout, "s6: '%s'\n", s6.utf8_str ());

  UCS4String s8 = "Ah!";
  UCS4String s7 = s5;
  ci = s7.begin ();
  while (ci != s7.end ())
    {
      if (*ci == UCS4String::UCS4Cast ('a'))
	{
	  s7.del (ci, 1);
	  s7.ins (ci, s8);
	  ci += s8.ucs4_length ();
	}
      else ++ci;
    }
  fprintf (stdout, "s7: '%s'\n", s7.utf8_str ());

  UCS4String s9 = s7;
  ci = s9.begin ();
  while (ci.ucs4_strstr (s8) != s9.end ())
    {
      s9.del (ci, 2);
    }
  fprintf (stdout, "s9: '%s'\n", s9.utf8_str ());
  fprintf (stdout, "s7: '%s'\n", s7.utf8_str ());
  fprintf (stdout, "s5: '%s'\n", s5.utf8_str ());

  UCS4String v1 = "few";
  UCS4String v2 = "many";
  if (v1 < v2) fprintf (stdout, "vv: few < many\n");
  if (v2 < v1) fprintf (stdout, "vv: many < few\n");

  fprintf (stdout, "\n");
}

void map_test ()
{
  fprintf (stdout, "======================\n");
  fprintf (stdout, "UTF-8 String Map Tests\n");
  fprintf (stdout, "======================\n");
  fprintf (stdout, "\n");

  UTF8String sk1("1");
  UTF8String sk2("k2");
  UTF8String sk3("key3");
  UTF8String sk4("Key 4");
  UTF8String sk5("String Key 5");

  UTF8String sv1("Hello");
  UTF8String sv2(",");
  UTF8String sv3(" ");
  UTF8String sv4("World");
  UTF8String sv5("!");

  UTF8StringMap map;

  map.ins (sk1,sv1);
  map.ins (sk2,sv2);
  map.ins (sk3,sv3);
  map.ins (sk4,sv4);
  map.ins (sk5,sv5);

  map.ins (sv1,sk1);
  map.ins (sv2,sk2);
  map.ins (sv3,sk3);
  map.ins (sv4,sk4);
  map.ins (sv5,sk5);

  fprintf (stdout, "'%s' -> '%s'\n", sk1.utf8_str (), map.lookup (sk1.utf8_str ())->value().utf8_str ());
  fprintf (stdout, "'%s' -> '%s'\n", sk2.utf8_str (), map.lookup (sk2.utf8_str ())->value().utf8_str ());
  fprintf (stdout, "'%s' -> '%s'\n", sk3.utf8_str (), map.lookup (sk3.utf8_str ())->value().utf8_str ());
  fprintf (stdout, "'%s' -> '%s'\n", sk4.utf8_str (), map.lookup (sk4.utf8_str ())->value().utf8_str ());
  fprintf (stdout, "'%s' -> '%s'\n", sk5.utf8_str (), map.lookup (sk5.utf8_str ())->value().utf8_str ());

  fprintf (stdout, "'%s' -> '%s'\n", sv1.utf8_str (), map.lookup (sv1.utf8_str ())->value().utf8_str ());
  fprintf (stdout, "'%s' -> '%s'\n", sv2.utf8_str (), map.lookup (sv2.utf8_str ())->value().utf8_str ());
  fprintf (stdout, "'%s' -> '%s'\n", sv3.utf8_str (), map.lookup (sv3.utf8_str ())->value().utf8_str ());
  fprintf (stdout, "'%s' -> '%s'\n", sv4.utf8_str (), map.lookup (sv4.utf8_str ())->value().utf8_str ());
  fprintf (stdout, "'%s' -> '%s'\n", sv5.utf8_str (), map.lookup (sv5.utf8_str ())->value().utf8_str ());

  map.ins (sk1,sv1);
  map.ins (sk2,sv1 + sv2);
  map.ins (sk3,sv1 + sv2 + sv3);
  map.ins (sk4,sv1 + sv2 + sv3 + sv4);
  map.ins (sk5,sv1 + sv2 + sv3 + sv4 + sv5);

  fprintf (stdout, "'%s' -> '%s'\n", sk1.utf8_str (), map.lookup (sk1)->value().utf8_str ());
  fprintf (stdout, "'%s' -> '%s'\n", sk2.utf8_str (), map.lookup (sk2)->value().utf8_str ());
  fprintf (stdout, "'%s' -> '%s'\n", sk3.utf8_str (), map.lookup (sk3)->value().utf8_str ());
  fprintf (stdout, "'%s' -> '%s'\n", sk4.utf8_str (), map.lookup (sk4)->value().utf8_str ());

  UTF8StringMap copy(map);

  fprintf (stdout, "map.size() = %lu\n", map.size ());
  map.del (sk1);
  map.del (sk2);
  map.del (sk3.utf8_str ());
  map.del (sk4.utf8_str ());
  fprintf (stdout, "map.size() = %lu\n", map.size ());

  fprintf (stdout, "'%s' -> '%s'\n", sk5.utf8_str (), map.lookup (sk5)->value().utf8_str ());

  fprintf (stdout, "\n");

  map = copy;
  fprintf (stdout, "copy.size() = %lu\n", copy.size ());
  copy.clear ();
  fprintf (stdout, "copy.size() = %lu\n", copy.size ());
  fprintf (stdout, "'%s' -> '%s'\n", sk5.utf8_str (), map.lookup (sk5)->value().utf8_str ());

  fprintf (stdout, "\n");
}

/* Exercise the NSString conversions of UTF8String (Objective-C++).
 *
 * The body is compiled only when ENABLE_COCOA_ADDITIONS is defined;
 * otherwise the function does nothing.
 */
void cca_test ()
{
#ifdef ENABLE_COCOA_ADDITIONS
  printf ("========================\n");
  printf ("UTF-8 String Cocoa Tests\n");
  printf ("========================\n");
  printf ("\n");

  NSAutoreleasePool * pool = [NSAutoreleasePool alloc];
  if (pool)
    {
      [pool init];

      /* NSString -> UTF8String -> NSString */
      UTF8String from_ns = UTF8String::UTF8StringFromNSString (@"This is an objective string. Kinky, eh?");
      printf ("'%s' & '%s'\n", from_ns.utf8_str (), [(from_ns.ns_str ()) UTF8String]);

      /* C string -> UTF8String -> NSString */
      UTF8String plain("This is not an objective string.");
      printf ("'%s' & '%s'\n", plain.utf8_str (), [(plain.ns_str ()) UTF8String]);

      /* After re-assignment the conversion is requested and printed twice. */
      plain = "and neither is this...";
      for (int pass = 0; pass < 2; pass++)
        printf ("'%s' & '%s'\n", plain.utf8_str (), [(plain.ns_str ()) UTF8String]);

      [pool release];
    }
  printf ("\n");
#endif /* ENABLE_COCOA_ADDITIONS */
}

int prog (int argc, const char * const * argv)
{
  int status = 0;

  int arg = 0;
  while (++arg < argc)
    {
      if ((strcmp (argv[arg], "--encode") == 0) && (arg + 1 < argc))
	{
	  status = encode (argv[++arg]);
	  break;
	}
    }
  return status;
}

/* Write one byte to stdout as the two-byte UTF-8 sequence for the code
 * point of the same value (lead byte 110000xx, trail byte 10xxxxxx).
 * Used for input bytes that are not part of a valid UTF-8 sequence.
 */
static void put_byte_as_utf8 (unsigned char byte)
{
  const unsigned char lead  = static_cast<unsigned char>(0xc0 | ((byte & 0xc0) >> 6));
  const unsigned char trail = static_cast<unsigned char>(0x80 | (byte & 0x3f));

  putchar (static_cast<int>(lead));
  putchar (static_cast<int>(trail));
}

/* Copy a file to stdout, re-encoding bytes that are not valid UTF-8.
 *
 * The file is read in chunks of up to BUFLEN bytes. For each position
 * UTF8String::sequence_length() classifies the leading byte:
 *   - 1:  the byte is copied unchanged;
 *   - >1: the sequence is copied unchanged if all of its following bytes
 *         satisfy UTF8String::trailing_byte(); otherwise only the first
 *         byte is re-encoded via put_byte_as_utf8() and scanning resumes
 *         at the next byte;
 *   - <0: the byte is re-encoded via put_byte_as_utf8();
 *   - 0:  the byte is skipped.
 * A multi-byte sequence cut off by the end of the buffer is moved to the
 * front of the buffer and completed by the next read; at end of file its
 * bytes are treated as invalid.
 *
 * filename: path of the file to read; must be non-null and non-empty.
 *
 * Returns 0 on success, 1 if filename is null/empty, the file cannot be
 * opened, or a read error occurs.
 */
int encode (const char * filename)
{
  if ( filename == 0) return 1;
  if (*filename == 0) return 1;

  FILE * in = fopen (filename, "rb");
  if (in == 0)
    {
      fprintf (stderr, "unable to read from file `%s'\n", filename);
      return 1;
    }
  int status = 0;

#ifdef BUFLEN
#undef BUFLEN
#endif
#define BUFLEN 1024

  char buf[BUFLEN];
  int length = 0;    /* number of not yet processed bytes held in buf */
  bool done = false; /* set once fread() delivers less than requested */

  while (!done)
    {
      /* Top the buffer up behind any bytes carried over from last time. */
      const int space = BUFLEN - length;
      const int addlen = static_cast<int>(fread (buf + length, 1, static_cast<size_t>(space), in));
      if (addlen < space) done = true;
      length += addlen;

      char * ptr = buf;
      while (ptr - buf < length)
        {
          const int remaining = length - static_cast<int>(ptr - buf);
          const int seqlen = UTF8String::sequence_length (ptr);

          if (seqlen == 1)
            {
              putchar (static_cast<int>(static_cast<unsigned char>(*ptr++)));
              continue;
            }
          if (seqlen == 0) // weird
            {
              ptr++;
              continue;
            }
          if (seqlen < 0)
            {
              /* yikes! not UTF-8; let's just encode the byte as such...
               */
              put_byte_as_utf8 (static_cast<unsigned char>(*ptr++));
              continue;
            }

          /* seqlen > 1 from here on */
          if (seqlen > remaining)
            {
              /* Sequence runs past the buffered data: wait for more input,
               * unless the file is exhausted, in which case it can never
               * be completed and the byte is treated as invalid.
               */
              if (!done) break;

              put_byte_as_utf8 (static_cast<unsigned char>(*ptr++));
              continue;
            }

          bool valid = true;
          for (int i = 1; i < seqlen; i++)
            if (!UTF8String::trailing_byte (ptr + i))
              {
                valid = false;
                break;
              }

          if (valid)
            {
              for (int i = 0; i < seqlen; i++)
                putchar (static_cast<int>(static_cast<unsigned char>(*ptr++)));
            }
          else
            {
              /* yikes! not UTF-8; let's just encode the byte as such...
               */
              put_byte_as_utf8 (static_cast<unsigned char>(*ptr++));
            }
        }

      /* Carry any unprocessed tail over to the front of the buffer
       * (destination first, then source).
       */
      length -= static_cast<int>(ptr - buf);
      if (length > 0) memmove (buf, ptr, static_cast<size_t>(length));
    }

  if (ferror (in))
    {
      fprintf (stderr, "error while reading from file `%s'\n", filename);
      status = 1;
    }

  fclose (in);
  return status;
}
