/*
 * FILE:
 * parse.C
 *
 * FUNCTION:
 * Parse URIs into their component pieces.
 * Vaguely similar to the function provided by libwww.HTParse, but
 * simpler, more efficient, easier to use.
 *
 * Result of parse is in the form
 * access :// host : port / path # anchor
 * The input URI does not need to be absolute; it can be partial.
 *
 * HISTORY:
 * Created by Linas Vepstas linas@linas.org November 1998
 */

#include <stdlib.h>
#include <string.h>
#include <strings.h>

#include "generic.h"
#include "parse.h"


/* ============================================================ */

/*
 * Construct an empty wlURI: no component has been located yet, 
 * no default port or encryption setting has been chosen, and no 
 * parse error has been counted.
 *
 * NOTE(review): uri is not initialized here although Parse() tests 
 * it against NULL -- confirm that it is always set before Parse() 
 * is called.
 */
wlURI :: wlURI (void)
{
   /* no errors counted so far */
   error = 0;

   /* values in effect until Parse() recognizes an access method */
   encrypt = -1;
   portno = 0;

   /* each component is a (pointer, length) pair; all start out empty */
   path = NULL;
   path_len = 0;

   port = NULL;
   port_len = 0;

   host = NULL;
   host_len = 0;

   access = NULL;
   access_len = 0;
}

/* ============================================================ */
#define WHITESPACE " \t\v\f\n\r"

/*
 * Description of one access method (URI scheme) that 
 * wlURI::Parse() knows about.
 */
struct wlSchemeInfo
{
   const char *   name;     /* scheme text, including the trailing ':' */
   unsigned short portno;   /* default port number */
   int            encrypt;  /* value stored in the encrypt member */
   int            opaque;   /* non-zero: all text after ':' is the path */
};

/* portno and encrypt are not consulted for the opaque entries. */
static const struct wlSchemeInfo wl_schemes[] =
{
   /* http first because most likely to occur. save cycles */
   { "http:",     80,  0, 0 },
   { "https:",   443,  1, 0 },
   { "ftp:",      21,  0, 0 },
   { "telnet:",   23,  0, 0 },

   /* the less common access methods are not processed any further */
   { "about:",     0, -1, 1 },
   { "gopher:",    0, -1, 1 },
   { "mailto:",    0, -1, 1 },
   { "news:",      0, -1, 1 },
   { "nntp:",      0, -1, 1 }
};

/*
 * Return the table entry whose name is a case-insensitive prefix 
 * of str, or NULL if str starts with none of the known schemes.
 * Every name ends in ':' and contains no other ':', so at most 
 * one entry can match.
 */
static const struct wlSchemeInfo *
wl_lookup_scheme (const char *str)
{
   size_t i;

   for (i = 0; i < sizeof (wl_schemes) / sizeof (wl_schemes[0]); i++) {
      const char *name = wl_schemes[i].name;
      if (!strncasecmp (str, name, strlen (name))) return &wl_schemes[i];
   }
   return NULL;
}

/*
 * Split the string in uri into access method, host, port and path.
 *
 * Nothing is copied: access, host, port and path are set to point 
 * into the uri string, and the matching *_len members give the 
 * length of each piece (the pieces are not NUL-terminated).
 * portno and encrypt are set from the access method when it is 
 * one of http:, https:, ftp: or telnet:, and portno is replaced by
 * an explicit numeric port if one follows the host name.
 *
 * For about:, gopher:, mailto:, news: and nntp: everything after 
 * the colon is returned as the path, and nothing else is set.
 *
 * access and/or host may already be set on entry; they are then 
 * used as given instead of being searched for in uri.
 *
 * The only failure reported is an access string without a ':',
 * which increments error.
 */
void 
wlURI :: Parse (void)
{
   const struct wlSchemeInfo *scheme;
   char * curloc;

   if (!uri) return;

   /* strip off leading whitespace, if any */
   /* (strpskip is defined elsewhere; a NULL result is taken to 
    * mean that nothing is left to parse) */
   uri = strpskip (uri, WHITESPACE);
   if (!uri) return;

   /* determine which, if any, access method is specified */
   if (!access) {
      scheme = wl_lookup_scheme (uri);
      if (scheme) {
         access = uri;
         access_len = strlen (scheme->name);
      }
   }

   curloc = uri;

   /* if we found an access method, look for a host name */
   if (access) 
   {
      curloc = strchr (access, ':');
      if (!curloc) { error ++; return; }
      access_len = curloc - access + 1;

      /* step over the access method, including its colon */
      curloc = access + access_len;

      scheme = wl_lookup_scheme (access);
      if (scheme) 
      {
         /* opt out of some of the less common access methods */
         if (scheme->opaque) {
            path = curloc;
            path_len = strlen (path);
            return;
         }

         portno = scheme->portno;
         encrypt = scheme->encrypt;

         /* Step over the "//" that introduces the host name, but
          * only if it is really there: skipping two characters
          * blindly would run off the end of a string such as
          * "http:" */
         if (('/' == curloc[0]) && ('/' == curloc[1])) curloc += 2;
      }

      /* whats left had better be a machine name */
      host = curloc;
   }

   /* complete the processing of the hostname.  Note that 
    * host may have been set by the code above, or it may have
    * been set when it was passed in.  */

   if (host) 
   {
      /* length of host name is first occurrence of : or / */
      curloc = strpbrk (host, ":/" WHITESPACE);
      if (!curloc) {
         /* whoops! end of the line! */
         host_len = strlen (host);
         return;
      }
      host_len = curloc - host;

      if (':' == *curloc) {
         port = ++curloc;

         /* An empty or non-numeric port leaves the default port
          * of the access method in place. */
         if (('0' <= *curloc) && (*curloc <= '9')) {
            portno = (unsigned short) atoi (port);
         }

         /* the port number runs up to the path or to whitespace */
         curloc = strpbrk (curloc, "/" WHITESPACE);
         if (!curloc) {
            /* whoops! end of the line! */
            port_len = strlen (port);
            return;
         }
         port_len = curloc - port;
      }
   }

   /* at this point, we expect curloc to be pointing at a path,
    * with either a leading /, or not
    */

   /* strip off leading whitespace, if any */
   path = strpskip (curloc, WHITESPACE);
   if (path) {
      /* strip off trailing whitespace, if any */
      curloc = strpbrk (path, WHITESPACE);
      if (!curloc) {
         path_len = strlen (path);
      } else {
         path_len = curloc - path;
      }
   }
    
   /* OK, we are done */
}

/* ============================================================ */
/*
 * Find the start and end points of a url reference.
 *
 * On entry, *st points at the text where the reference is expected
 * to begin; leading whitespace is skipped.  Three forms are handled:
 *
 *    (url)             delimited by parentheses
 *    'url' or "url"    delimited by a pair of identical quote marks
 *    url               not delimited; it ends at the first quote
 *                      mark, '>', ',' or whitespace character
 *
 * The last form is needed to parse things like
 * document.write ("<img src=http://some/img/" + var + ".gif");
 *
 * On success, zero is returned, *st points at the first character
 * of the url, and, if en is not NULL, *en points at the character 
 * that terminates it (the closing delimiter).  en is used for 
 * output only; the value of *en on entry is never looked at, so the 
 * caller need not initialize it.
 *
 * A non-zero return value indicates failure: st or *st was NULL, 
 * or no terminating character was found.  In that case neither 
 * *st nor *en is modified.
 */

int
wl_extract_link (char **st, char **en)
{
   char * start;
   char * end;
   char delim;

   if (!st) return 1;
   start = *st;
   if (!start) return 1;

   /* skip whitespace */
   start += strspn (start, WHITESPACE);

   delim = *start;
   switch (delim) 
   {
      case '(':
         /* the matching delimiter is the close-paren */
         start ++;
         end = strchr (start, ')');
         break;

      case '\'':
      case '\"':
         /* the matching delimiter is the same kind of quote mark */
         start ++;
         end = strchr (start, delim);
         break;

      default:
         /* no delimiter; look for a char that is not a valid
          * url-encoded char, such as a quote mark. */
         end = strpbrk (start, "\"\'>," WHITESPACE);
         break;
   }
   if (!end) return 1;

   *st = start;
   if (en) *en = end;

   return 0;
}

/* ============================================================ */
/* 
 * scan_for_links searches the indicated buffer for each occurrence 
 * of the token "toka".  If tokb is not NULL, then tokb must come 
 * right after toka (whitespace in between is allowed); likewise, 
 * if tokc is not NULL, it must come right after that.  Tokens are 
 * compared without regard to case.  Whatever follows the tokens 
 * is handed to wl_extract_link() to find the limits of the link, 
 * and the callback, if not NULL, is called as 
 * callback (page, start, end), where start points at the first 
 * character of the link and end at the character that terminates it.
 *
 * If the limits of a link cannot be found, an error is reported 
 * through perr() and prt(), and scanning resumes after toka.
 *
 * The return value points at the terminator of the last link that 
 * was found, or is buffstart if no link was found.  buffstart is
 * also returned, without scanning, if buffstart or toka is NULL,
 * or if toka is the empty string.
 *
 * Only the search for toka is limited to bufflen bytes; the rest
 * of the scanning uses the ordinary string functions, so the text 
 * at buffstart has to be NUL-terminated.
 */

char *
wl_scan_for_links (wlString &page, 
                   char *buffstart, size_t bufflen, 
                   char *toka, char * tokb, char * tokc,
                   void (*callback)(wlString&, char *, char *))
{
	/* longest piece of text quoted in an error message, and the
	 * length quoted when the end of the line cannot be found */
	enum { SNIPPET_MAX = 120, SNIPPET_DEFAULT = 60 };

	/* An empty toka would match without ever advancing the search. */
	if (!buffstart || !toka || !*toka) return buffstart;

	/* the token lengths do not change; measure them just once */
	const size_t tokalen = strlen (toka);
	const size_t tokblen = tokb ? strlen (tokb) : 0;
	const size_t tokclen = tokc ? strlen (tokc) : 0;

	char *current = buffstart;
	char *buffend = buffstart + bufflen;
	char *retval = buffstart;

	/* A link may end beyond buffend, so make sure that there is 
	 * still something left of the buffer before searching it.
	 * (strncasestr is defined elsewhere; presumably a length-limited, 
	 * case-insensitive strstr.) */
	while ((current < buffend) &&
	       (current = strncasestr(current, toka, buffend-current)))
	{
		char *start = current + tokalen;
		char *end = NULL;

		if (tokb) {
			/* skip over white space */
			start += strspn (start, " \t\n\r");

			/* next non-blank character had better be tokb */
			if (strncasecmp (start, tokb, tokblen)) {
				current += tokalen;
				continue;
			}
			start += tokblen;
		}

		if (tokc) {
			/* skip over white space */
			start += strspn (start, " \t\n\r");

			/* next non-blank character had better be tokc */
			if (strncasecmp (start, tokc, tokclen)) {
				current += tokalen;
				continue;
			}
			start += tokclen;
		}

		/* find the delimiters of the URL */
		if (wl_extract_link (&start, &end)) {
			char tmpbuff[SNIPPET_MAX +1];
			char *eol;
			size_t len;

			/* we did not find matching delimiters        */
			/* .... this should not happen. ............. */
			perr ("Error: scan_for_links(): "
				"while scanning for %s, "
				"could not find matching delimiters \n",
				toka);

			/* Quote the text up to the end of the line, but no 
			 * more than what is left of the buffer, and no more 
			 * than fits into tmpbuff.  Work with lengths, so that 
			 * no pointer beyond the buffer is ever formed. */
			eol = strpbrk (current, "\r\n");
			len = eol ? (size_t) (eol - current) : (size_t) SNIPPET_DEFAULT;
			if (len > (size_t) (buffend - current)) {
				len = (size_t) (buffend - current);
			}
			if (len > SNIPPET_MAX) len = SNIPPET_MAX;

			strncpy (tmpbuff, current, len);
			tmpbuff [len] = 0x0;
			prt ("The offending line was: %s \n", tmpbuff);

			current += tokalen;
			continue;
		}

		/* update search pointer for next time through loop. */ 
		current = end;
		retval = current;

		/* we now have the link name; start and end point */
		/* to its start and end. ........................ */
		if (callback) (*callback) (page, start, end);
	}

	return retval;
}

/* =========================  END OF FILE ======================= */

