#include <stdlib.h>
#include <ctype.h>
#include <string.h>

#include "../document.h"
#include "html_priv.h"

/* Global scratch buffer of BUFFERSIZE bytes.
   The URL helpers in this file write their results into BUF; presumably
   BUF is an alias for this array (TODO confirm in html_priv.h). Results
   stored there are overwritten by the next helper that writes to it. */
char html_buffer[ BUFFERSIZE ];

/*
  searches string for substring, regardless of whether the characters
  are upper or lower case
  e.g. finds "href" in "<A HREF="welcome.html">
  finds "SRC"  in "<img src="picture.gif">
  returns a pointer to the first matching character in string
  returns NULL if substring is not found in string

*/

/*const char *strlostr( const char *string, const char *sub )
{

  int i = 0;
  int j = 0;
  
  if ( ( string[0] == '\0' ) || ( sub[0] == '\0' ) )
    return NULL;
  
  while( *string )
  {
    while( 1 )
    {
      if( tolower( string[i] ) == tolower( sub[j] ) )
      {
	i++;
	j++;
	if( sub[j] == '\0' ) return (const char *)string;
	if( string[i] == '\0' ) return NULL;
      }
      else
      {
	i = 0;
	j = 0;
	break;
      }
    }
    string++;
  }
  return NULL;
}*/

/*
  Looks for a node with the given url among the nodes held in memory.

  The list starting at CH.root is followed through the nextList links and
  each node's url is compared with strcasecmp(), i.e. ignoring case.

  Returns the first matching node, or NULL if no node matches.
*/

TI *nodeInMem( const char *url )
{
  TI *node = CH.root;

  while( node != NULL )
  {
    if( strcasecmp( url, node->url ) == 0 )
    {
      dbg( DI, DBG_HTML|3, "found %s in mem", url);
      return node;
    }
    node = node->nextList;
  }

  /* walked off the end of the list without a match */
  dbg( DI, DBG_HTML|3, "didn't find %s in mem", url);
  return NULL;
}

/*
  Extracts the host part of url into the shared buffer BUF and returns BUF,
  e.g. "http://www.example.com/some/path" yields "www.example.com".

  The service prefix is removed with skipService(); the host is everything
  from there up to (not including) the next '/', or the rest of the string
  if no further '/' exists. Hosts longer than BUFFERSIZE - 1 characters are
  truncated so that the result always fits into BUF.

  An empty string is returned when url is NULL, when nothing follows the
  service prefix (e.g. "http://"), or when the temporary copy described
  below cannot be allocated.

  url may be BUF itself: in that case a temporary copy is taken so that the
  original text is still available for the debug message after BUF has been
  overwritten.

  The result lives in BUF and is overwritten by the next function that
  writes to BUF.
*/

const char *domainOfUrl( const char *url )
{
  const char *src = url, *start, *slash;
  char *copy = NULL;
  size_t len;

  if( url == NULL )
  {
    BUF[0] = '\0';
    return BUF;
  }

  if( url == (const char*)&BUF )
  {
    copy = strdup( url );
    if( copy == NULL )   /* out of memory: report "no domain" */
    {
      BUF[0] = '\0';
      return BUF;
    }
    src = copy;
  }

  start = skipService( src );

  /* the host ends at the next slash, or at the end of the string */
  slash = strchr( start, '/' );
  len = slash ? (size_t)( slash - start ) : strlen( start );
  if( len > (size_t)BUFFERSIZE - 1 )
    len = (size_t)BUFFERSIZE - 1;

  /* memmove: start may still point into BUF if the caller passed a
     pointer to somewhere inside it */
  memmove( BUF, start, len );
  BUF[len] = '\0';

  dbg( DI, DBG_HTML | 3, "domain of %s is %s", src, BUF );

  free( copy );
  return BUF;
}

/*
  Decides whether url passes the include/exclude domain filter.

  If CP.searchinclude is set, CP.searchincludelist is consulted and the
  default answer is FALSE; otherwise CP.searchexcludelist is consulted and
  the default answer is TRUE.

  Only the domain of url (see domainOfUrl()) is looked at. A list entry
  matches when the domain ends with the entry's domain, compared without
  regard to case. The first match returns the negated default. If no entry
  matches, or the domain is empty, the default is returned.
*/

int inexPassed( const char *url )
{
  int verdict, hostLen;
  const char *host;
  SL *entry;

  /* default: "not passed" for an include list, "passed" for an exclude list */
  verdict = CP.searchinclude ?  FALSE : TRUE;

  host = domainOfUrl( url );
  hostLen = strlen( host );
  if( hostLen == 0 )   /* no usable domain: keep the default */
    return verdict;

  entry = CP.searchinclude ? CP.searchincludelist : CP.searchexcludelist;
  while( entry )
  {
    int entryLen = strlen( entry->domain );

    /* an entry longer than the domain can never be its suffix */
    if( entryLen <= hostLen &&
        strncasecmp( (const char*)(host + ( hostLen - entryLen )),
                     (const char*)(entry->domain), entryLen ) == 0 )
    {
      dbg( DI, DBG_HTML | 3, "inex passed: %d", !verdict );
      return !verdict;
    }
    entry = entry->next;
  }

  dbg( DI, DBG_HTML | 3, "inex passed: %d", verdict );
  return verdict;
}

/*
  Checks whether url ends in something that looks like a filename, i.e.
  whether its last path component contains a '.' ("blabla/blabla.blo").

  Returns FALSE when
    - url is NULL or empty,
    - url ends with '/',
    - url contains no '/', or its last '/' is not located behind the
      position returned by skipService() (no path after the service part),
    - url contains no '.', or the last '.' is located before the last '/'.
  Returns TRUE otherwise.
*/

int urlHasFilename( const char *url )
{
  const char *afterService, *lastSlash, *lastDot;
  size_t len;

  if( url == NULL )
    return FALSE;

  /* empty url or slash at end: no filename */
  len = strlen( url );
  if( len == 0 || url[ len - 1 ] == '/' )
    return FALSE;

  afterService = skipService( url );
  lastSlash = strrchr( url, '/' );
  if( lastSlash == NULL || lastSlash <= afterService )  /* no path at all */
    return FALSE;

  /* the last dot has to belong to the last path component */
  lastDot = strrchr( url, '.' );
  if( lastDot == NULL || lastSlash > lastDot )
    return FALSE;

  return TRUE;
}

/*
  the super duper automatic document type recognition system
*/

enum Documenttype doctype( const char *url, const char *text )
{
  char *e;

  if( !urlHasFilename( url ) )
    return HTMLDoc;

  if( strchr( url, '#' ) )
    return HTMLDoc;

  if( strchr( url, '?' ) )
    return CGI;

  e = strrchr( url, '.' ) + 1;

  if( strcasecmp( e, "html" ) == 0 ||
      strcasecmp( e, "htm" ) == 0 ||
      strcasecmp( e, "shtml" ) == 0 ||
      strcasecmp( e, "shtm" ) == 0 )
    return HTMLDoc;
  else if( strcasecmp( e, "gif") == 0 ||
	   strcasecmp( e, "jpg") == 0 ||
	   strcasecmp( e, "png") == 0 ||
	   strcasecmp( e, "bmp") == 0 ||
	   strcasecmp( e, "pbm") == 0 ||
	   strcasecmp( e, "tif") == 0 ||
	   strcasecmp( e, "tiff") == 0 )
    return Graphic;
  else if( strcasecmp( e, "gz" ) == 0 ||
	   strcasecmp( e, "tar" ) == 0 ||
	   strcasecmp( e, "Z" ) == 0 ||
	   strcasecmp( e, "zip" ) == 0 ||
	   strcasecmp( e, "arj" ) == 0 ||
	   strcasecmp( e, "lha" ) == 0 ||
	   strcasecmp( e, "tgz" ) == 0 ||
	   strcasecmp( e, "jar" ) == 0 ||
	   strcasecmp( e, "gz" ) == 0 )
    return PackedFile;
  else if( strcasecmp( e, "txt" ) == 0 ||
	   strcasecmp( e, "text" ) == 0 ||
	   strcasecmp( e, "me" ) == 0 ||   /* read._me_ */
	   strcasecmp( e, "mich" ) == 0 )  /* lies._mich_ */
    return TextFile;
  else if( strcasecmp( e, "cgi" ) == 0 )
    return CGI;
  else if( strcasecmp( e, "class" ) == 0 )
    return Java;
  else if(  strcasecmp( e, "pl" ) == 0 ||
	    strcasecmp( e, "perl" ) == 0 )
    return Perl;
  else
  {
    if( strchr( text, '<') < ( text +3 ) )
      return HTMLDoc;
    
    if( ( ( e = strstr( text, "#!/") ) < ( text + 3 ) ) && ( e != NULL ) )
    {
      if( ( ( e = strstr( text, "/perl") ) < ( text + 20 ) ) && ( e != NULL ) )
	return Perl;
      else
	return CGI;
    }
      
  }
  return Unknown;
}

/*
  Makes sure that a url without a filename ends with a slash, so that it
  can be requested via HTTP.

  If url has no filename (see urlHasFilename()) and does not already end
  with '/', a copy of url with '/' appended is written to the shared buffer
  BUF and BUF is returned. The copy is truncated if necessary so that the
  url plus the slash fit into BUFFERSIZE bytes.

  In every other case (NULL or empty url, url with filename, url already
  ending with '/') url itself is returned and BUF is left untouched.

  url may be BUF itself or point into BUF; the copy is done with memmove().
*/

const char *makeOrderName( const char *url )
{
  size_t len;

  if( url == NULL )
    return url;

  len = strlen( url );
  if( len < 1 )
    return url;

  if( url[ len - 1 ] == '/' || urlHasFilename( url ) )
    return url;

  /* keep room for the slash and the terminating NUL */
  if( len > (size_t)BUFFERSIZE - 2 )
    len = (size_t)BUFFERSIZE - 2;

  memmove( BUF, url, len );
  BUF[ len ] = '/';
  BUF[ len + 1 ] = '\0';
  return BUF;
}

/*
  Calculates a simple checksum over text by XOR-ing int-sized words that
  are sampled from the text at regular distances.
  thanx to Philipp Reisner

  text    the data to sum up (need not be NUL terminated or aligned)
  length  number of bytes available at text

  The distance between two samples is length / CHECKSUMLEN, but at least 4.
  A word is sampled at offset i only while i + sizeof(int) < length, so the
  function never reads beyond text + length. A text too short to hold a
  sample yields 0.

  The words are fetched with memcpy() instead of dereferencing a casted
  pointer: this is valid for any alignment of text and does not violate
  the aliasing rules. The value depends on the byte order and on the size
  of int of the machine.
*/

int checksum( const char *text, int length )
{
  int sum = 0, word, step, i;

  step = (int)(length / CHECKSUMLEN);
  if( step < 4 )
    step = 4;

  for( i = 0; i + (int)sizeof word < length; i += step )
  {
    memcpy( &word, &text[i], sizeof word );
    sum ^= word;
  }

  return sum;
}


/*
  Determines the service (http, file, mailto, ftp, gopher) a url starts
  with. The prefix is compared without regard to case.

  service  receives the matching SERVICE_* value, or SERVICE_NONE if url
           starts with none of the known prefixes
  url      the url to inspect

  Returns a pointer to the first character behind the service prefix and
  behind any further '/' directly following it, e.g. for
  "http://test.com" a pointer to "test.com". If no known prefix is found,
  url itself is returned.
*/

char *urlToService( int *service, const char *url )
{
  static const struct
  {
    const char *prefix;
    int         id;
  } known[] =
  {
    { "http://",   SERVICE_HTTP   },
    { "file:/",    SERVICE_FILE   },
    { "mailto:",   SERVICE_MAILTO },
    { "ftp://",    SERVICE_FTP    },
    { "gopher://", SERVICE_GOPHER }
  };
  size_t k;

  for( k = 0; k < sizeof known / sizeof known[0]; k++ )
  {
    size_t n = strlen( known[k].prefix );

    if( strncasecmp( url, known[k].prefix, n ) == 0 )
    {
      const char *rest = url + n;

      *service = known[k].id;
      /* swallow surplus slashes behind the prefix */
      while( *rest == '/' )
        rest++;
      return (char*)rest;
    }
  }

  *service = SERVICE_NONE;
  return (char*)url;
}

/*
  Returns the url prefix that belongs to a SERVICE_* value, e.g.
  "http://" for SERVICE_HTTP. SERVICE_NONE and every value not listed
  here yield an empty string.
  The returned strings are string literals.
*/
char *service( int service )
{
  if( service == SERVICE_HTTP )
    return "http://";
  if( service == SERVICE_FILE )
    return "file:/";
  if( service == SERVICE_MAILTO )
    return "mailto:";
  if( service == SERVICE_FTP )
    return "ftp://";
  if( service == SERVICE_GOPHER )
    return "gopher://";

  /* SERVICE_NONE and anything unknown */
  return "";
}


/*
  generates a fully qualified name of url. if url is relative then the (fully
  qualified) father url furl is used to generate the missing parts.

  i.e. furl = http://www.test.com/test/t2/index.html
  url = pic/back.gif -> http://www.test.com/test/t2/pic/back.gif
  url = ../pic/back.gif -> http://www.test.com/test/pic/back.gif
  url = /pic/back.gif -> http://www.test.com/pic/back.gif

  Before anything else a "#..." fragment and a "?..." parameter list are
  cut off from url. NOTE: unless url is BUF this is done in place, i.e. the
  caller's string is modified and therefore has to be writable.

  Return value:
    - NULL if url is NULL
    - url itself if url already starts with a known service (see
      urlToService()) or if furl is NULL
    - BUF, holding the generated name, otherwise

  Everything written to BUF is limited to BUFFERSIZE bytes; names that
  would be longer are truncated.

  url may be BUF itself: a temporary copy is used while BUF is rebuilt. If
  that copy cannot be allocated, url is returned unchanged.
*/

/* Appends at most n characters of src to the string in BUF. The result is
   truncated so that it, including the terminating NUL, always fits into
   BUFFERSIZE bytes. */
static void genFullQualAppend( const char *src, size_t n )
{
  size_t used = strlen( BUF );
  size_t room = ( used < (size_t)BUFFERSIZE - 1 )
                ? (size_t)BUFFERSIZE - 1 - used : 0;

  strncat( BUF, src, n < room ? n : room );
}

char *genFullQualName( const char* furl, const char* url )
{
  char *u = (char*)url, *u2, *u4, *u6, *u7, *u8, *u9;
  const char *host, *hostEnd;
  size_t l;
  int s, s2, dbuf = FALSE;

  if( url == NULL )
    return NULL;

  if( url == (const char*)&BUF )
  {
    u = strdup( url );
    if( u == NULL )   /* out of memory: hand the url back untouched */
      return (char*)url;
    dbuf = TRUE;
  }

  /* remove ...#sdfsdff from url */
  if( ( u4 = strchr( u, '#' ) ) )
    *u4 = '\0';

  /* remove ? from cgi url ( parameter list ) */
  if( ( u4 = strchr( u, '?' ) ) )
    *u4 = '\0';

  u2 = urlToService( &s, u );
  /* is already fully qualif. */
  if( s != SERVICE_NONE || furl == NULL )
  {
    if( dbuf )
    {
      /* hand the stripped url back in BUF, as in the non-BUF case; u is
         never longer than what BUF held on entry */
      strcpy( BUF, u );
      free( u );
    }
    return (char*)url;
  }

  /* absolute path ? */
  if( *u2 == '/' )
  {
    /* take "service://host" from the father url ... */
    host = skipService( furl );
    hostEnd = strchr( host, '/' );
    l = hostEnd ? (size_t)( hostEnd - furl ) : strlen( furl );
    if( l > (size_t)BUFFERSIZE - 1 )
      l = (size_t)BUFFERSIZE - 1;
    memmove( BUF, furl, l );
    BUF[l] = '\0';
    /* ... and append the absolute path */
    genFullQualAppend( u2, strlen( u2 ) );
  }
  else /* relative path */
  {
    /* start with the father url, keeping room for one extra slash */
    l = strlen( furl );
    if( l > (size_t)BUFFERSIZE - 2 )
      l = (size_t)BUFFERSIZE - 2;
    memmove( BUF, furl, l );
    BUF[l] = '\0';

    /* remove filename at the end */
    if( BUF[0] != '\0' && urlHasFilename( BUF ) &&
        ( u4 = strrchr( BUF, '/' ) ) )
      *++u4 = '\0';

    /* ensure that there is a slash at the end of the url */
    l = strlen( BUF );
    if( l == 0 || BUF[l-1] != '/' )
    {
      BUF[l] = '/';
      BUF[l+1] = '\0';
    }
    /* BUF is now http://www.test.com/test/t2/ */

    /* u6 marks the first slash behind the host (/test/t2/); ".." must
       never cut in front of it. NULL if there is no such slash. */
    u6 = strchr( urlToService( &s2, BUF ), '/' );

    /* walk through the path and build url, interpreting ./ and ../ */
    u7 = u;
    while( ( u8 = strchr( u7, '/' ) ) )
    {
      l = (size_t)( u8 - u7 );
      if( strncmp( u7, ".", l ) == 0 )   /* "." (or empty part): skip it */
      {
        u7 = u8 + 1;
      }
      else if( strncmp( u7, "..", l ) == 0 )   /* "..": go one dir higher */
      {
        size_t end = strlen( BUF );

        /* search backwards for the slash in front of the last directory,
           starting before the trailing slash */
        u9 = ( end >= 2 ) ? BUF + end - 2 : BUF;
        while( u9 > BUF && *u9 != '/' )
          u9--;
        u7 = u8 + 1;
        /* only cut if we stay behind the host part */
        if( u6 != NULL && u9 >= u6 )
          u9[1] = '\0';
      }
      else
      {
        genFullQualAppend( u7, l );   /* append dirname */
        genFullQualAppend( "/", 1 );
        u7 = u8 + 1;
      }
    }
    genFullQualAppend( u7, strlen( u7 ) );   /* append rest of url */
  }

  dbg( DI, DBG_HTML | 3, "Generated full qual name %s of %s", BUF, u );
  if( dbuf ) free( u );
  return BUF;
}

/*
  returns a pointer to the url without the service part

  The first occurrence of ":/" in url is searched. If there is one, the
  result points behind the ':' and behind all '/' directly following it.
  If there is none, url itself is returned.

  http://asdasd/asd -> asdasd/asd
  adasd/fds -> adasd/fds
  /asd/sd -> /asd/sd   (no ":/", so nothing is skipped)

  returns NULL if url is NULL
*/

char *skipService( const char *url )
{
  const char *marker, *rest;

  if( !url )
    return NULL;

  marker = strstr( url, ":/" );
  if( !marker )
    return (char *)url;   /* no service part at all */

  /* step over the ':' and over as many '/' as follow it */
  for( rest = marker + 1; *rest == '/'; rest++ )
    ;

  return (char *)rest;
}
