/* W: a single white space character, as allowed between the parts of a tag. */
W               [ \t\r\n]
/* F: a single character accepted as part of a URL value; a URL is the longest
   run of these characters.  Only lower case letters are listed.
   NOTE(review): presumably the scanner is generated case-insensitively
   (e.g. flex -i) - confirm against the build rules. */
F               [-a-z0-9$_.!*''(),%;/?:@&=+]

/* Exclusive start conditions used by the rules:
   HEADER                - the header lines before the first blank line.
   COMMENT               - after "<!--", up to the next ">".
   ANGLE_START, ANGLE    - just after "<" / inside a tag that is not examined.
   IMAGE, IMAGE_SRC      - inside an "img" tag / just after its "src=".
   ANCHOR, ANCHOR_HREF   - inside an "a" or "link" tag / just after its "href=".
   BODY, BODY_BACK       - inside a "body" tag / just after its "background=". */
%x HEADER
%x COMMENT
%x ANGLE_START ANGLE
%x IMAGE IMAGE_SRC
%x ANCHOR ANCHOR_HREF
%x BODY BODY_BACK

%{
/***************************************
  $Header: /home/amb/wwwoffle/RCS/html.l 1.11 1997/03/24 08:42:34 amb Exp $

  WWWOFFLE - World Wide Web Offline Explorer - Version 1.1.
  Parse the HTML and look for the images, links and end of body.
  ******************/ /******************
  Written by Andrew M. Bishop

  This file Copyright 1997 Andrew M. Bishop
  It may be distributed under the GNU Public License, version 2, or
  any higher version.  See section COPYING of the GNU Public license
  for conditions under which this file may be redistributed.
  ***************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "wwwoffle.h"

/* Disabled stand-alone test driver: parses standard input with the host "foo"
   and the path "bar/" and prints every image and link that was found. */
#if 0
int main()
{
 char **list=NULL;
 int i;

 ParseHTML(stdin,"foo","bar/");

 /* Both lists are NULL terminated; a NULL list means nothing was found. */
 if((list=ListImages()))
    for(i=0;list[i];i++)
       printf("Image: %s\n",list[i]);

 if((list=ListLinks()))
    for(i=0;list[i];i++)
       printf("Link : %s\n",list[i]);
}
#endif

/* The scanner input stream and entry point that are used by ParseHTML(). */
extern FILE* yyin;
extern int yylex(void);

/* There is never a second input file, so always report that the input is finished. */
#define yywrap() 1

/* Local helper functions, defined further down. */
static void add_image_or_link(char* name,int is_image);
static void canonicalise_image_or_link(char ***list);

/*+ Set to 1 by the scanner when a "content-type: text/html" header line is seen, reset by ParseHTML(). +*/
static int is_html=0;

/*+ The list of image URLs found, NULL if there are none. +*/
static char **images=NULL;

/*+ The number of entries used in the list of images. +*/
static int nimages=0;

/*+ The list of link URLs found, NULL if there are none. +*/
static char **links=NULL;

/*+ The number of entries used in the list of links. +*/
static int nlinks=0;

/*+ The position, counted in characters after the header, just before the end of body tag
    (or the end of html tag (or the end of file)); set by the scanner at the end of the file. +*/
static int body_or_html_end=0;

/*+ The host that this is referenced from, the pointer that was passed to ParseHTML(). +*/
static char *refhost=NULL;

/*+ The path that this is referenced from, the pointer that was passed to ParseHTML(). +*/
static char *refpath=NULL;


/*++++++++++++++++++++++++++++++++++++++
  Scan a document for images and links, replacing the results of any earlier scan.

  int ParseHTML Returns 1 if a text/html content type header was seen, else 0.

  FILE *file The stream that the document is read from.

  char *host The host that the document came from (the pointer is kept, not copied).

  char *path The path of the document on that host (the pointer is kept, not copied).
  ++++++++++++++++++++++++++++++++++++++*/

int ParseHTML(FILE *file,char *host,char *path)
{
 char **entry;

 /* Discard the images that were collected by a previous call. */

 if(images)
   {
    for(entry=images;*entry;entry++)
       free(*entry);
    free(images);
    images=NULL;
   }
 nimages=0;

 /* Discard the links that were collected by a previous call. */

 if(links)
   {
    for(entry=links;*entry;entry++)
       free(*entry);
    free(links);
    links=NULL;
   }
 nlinks=0;

 /* Reset the remaining state and remember where the document came from;
    the host and path are used later to make the relative URLs absolute. */

 is_html=0;
 body_or_html_end=0;

 refhost=host;
 refpath=path;

 /* Run the scanner over the whole of the file. */

 yyin=file;
 yyrestart(file);
 yylex();

 /* Terminate with a NULL entry each list that has something in it. */

 if(images)
    add_image_or_link(NULL,1);

 if(links)
    add_image_or_link(NULL,0);

 return(is_html);
}


/*++++++++++++++++++++++++++++++++++++++
  Return the images that the last call of ParseHTML() found, as absolute URLs without duplicates.

  char **ListImages Returns a NULL terminated list of images, or NULL if there are none.
  ++++++++++++++++++++++++++++++++++++++*/

char **ListImages(void)
{
 if(!images)
    return(NULL);

 /* The list is fixed up in place, so it still belongs to this file. */
 canonicalise_image_or_link(&images);

 return(images);
}


/*++++++++++++++++++++++++++++++++++++++
  Return the links that the last call of ParseHTML() found, as absolute URLs without duplicates.

  char **ListLinks Returns a NULL terminated list of links, or NULL if there are none.
  ++++++++++++++++++++++++++++++++++++++*/

char **ListLinks(void)
{
 if(!links)
    return(NULL);

 /* The list is fixed up in place, so it still belongs to this file. */
 canonicalise_image_or_link(&links);

 return(links);
}


/*++++++++++++++++++++++++++++++++++++++
  Report the position that the scanner recorded as the end of the html.

  int GetHTMLEnd Returns the position just before the end of body tag, the end of html tag or the end of file.
  ++++++++++++++++++++++++++++++++++++++*/

int GetHTMLEnd(void)
{
 int end=body_or_html_end;

 return(end);
}


/*++++++++++++++++++++++++++++++++++++++
  A function to add a name to the list of images or links.

  char* name The name to add (it is copied), or NULL to add the terminating NULL entry.

  int is_image Set to true if it is an image, false if it is a link.

  Names that start with a scheme other than "http:" are ignored.  If memory runs out
  then the name is dropped, but the list is always left so that it can be NULL terminated.
  ++++++++++++++++++++++++++++++++++++++*/

static void add_image_or_link(char* name,int is_image)
{
 char ***list;
 int *nlist;
 char *copy=NULL;

 if(name)
   {
    char *colon=strchr(name,':');

    if(colon)
      {
       char *slash=strchr(name,'/');

       /* A colon before the first slash is the end of a scheme name (mailto:, ftp: ...). */
       if((!slash || slash>colon) && strncasecmp("http:",name,5))
          return;
      }

    copy=malloc(strlen(name)+1);
    if(!copy)
       return;
    strcpy(copy,name);
   }

 if(is_image)
   {
    list=&images;
    nlist=&nimages;
   }
 else
   {
    list=&links;
    nlist=&nlinks;
   }

 /* The list grows by 16 entries each time that it becomes full. */

 if(((*nlist)%16)==0)
   {
    char **bigger;

    if((*nlist)==0)
       bigger=malloc(16*sizeof(char*));
    else
       bigger=realloc((*list),((size_t)(*nlist)+16)*sizeof(char*));

    if(!bigger)
      {
       free(copy);

       /* There is no room for the terminator, so give up the last name to make room for it. */
       if(!name && (*nlist)>0)
         {
          free((*list)[(*nlist)-1]);
          (*list)[(*nlist)-1]=NULL;
         }

       return;
      }

    (*list)=bigger;
   }

 (*list)[(*nlist)]=copy;

 (*nlist)++;
}


/*++++++++++++++++++++++++++++++++++++++
  Fix the list up with canonical URLs not the relative ones.

  char ***list The list to modify.
  ++++++++++++++++++++++++++++++++++++++*/

static void canonicalise_image_or_link(char ***list)
{
 int i,j;

 for(i=0;(*list)[i];i++)
   {
    char **item=&(*list)[i];
    char *name=*item;

    if(strncasecmp("http:",name,5))
      {
       if(*name=='/')
         {
          *item=(char*)malloc(strlen(refhost)+strlen(name)+8);
          sprintf(*item,"http://%s%s",refhost,name);
         }
       else
         {
          int i=0,j;
          char *path=(char*)malloc(strlen(refpath)+1);

          strcpy(path,refpath);

          for(j=strlen(path);j>0;j--)
             if(path[j]=='/')
                break;
          path[j]=0;

          while(name[i]=='.')
            {
             if(!strncmp("./",&name[i],2))
                i+=2;
             else if(!strncmp("../",&name[i],3))
               {
                i+=3;
                for(j=strlen(path)-1;j>0;j--)
                   if(path[j]=='/')
                      break;
                path[j]=0;
               }
            }

          *item=(char*)malloc(strlen(refhost)+strlen(path)+strlen(&name[i])+12);
          if(*path)
             sprintf(*item,"http://%s/%s/%s",refhost,path,&name[i]);
          else
             sprintf(*item,"http://%s/%s",refhost,&name[i]);
          free(path);
         }

       free(name);
      }
   }

 /* remove the duplicates */

 for(i=0;(*list)[i];i++)
   {
    for(j=i+1;(*list)[j];j++)
       if(!strcmp((*list)[i],(*list)[j]))
          break;

    if((*list)[j])
      {
       free((*list)[j]);
       do
         {
          (*list)[j]=(*list)[j+1];
         }
       while((*list)[j++]);
       i--;
      }
   }
}

%}

%%
 /* Local variables of yylex():
    open_angle - the length of the last "<" (plus white space) that started a tag.
    position   - the number of characters consumed since the end of the header.
    html_end   - the position of the "<" of the last end of html tag seen (0 if none).
    body_end   - the position of the "<" of the last end of body tag seen (0 if none).
    Every rule after the header adds yyleng to position so that it stays exact;
    for the same reason no character may be left to the default (echo) rule. */
 int open_angle=0,position=0,html_end=0,body_end=0;
 BEGIN(HEADER);

 /* The header: a blank line ends it, the rest is only parsed if it was text/html.
    The last pattern swallows a final header line that has no newline. */

<HEADER>\r*\n                                           { if(is_html) BEGIN(INITIAL); else return(EOF); }
<HEADER>"content-type:"[ \t]+"text/html"[ \t]*\r*\n     { is_html=1; }
<HEADER>.+\r*\n                                         { }
<HEADER>.+                                              { }

 /* Plain text, up to the start of a comment or a tag. */

[^<]+                                   { position+=yyleng; }
"<!--"                                  { position+=yyleng; BEGIN(COMMENT); }
"<"{W}*                                 { position+=yyleng; BEGIN(ANGLE_START); open_angle=yyleng; }

 /* A comment, taken to end at the first ">". */

<COMMENT>">"                            { position+=yyleng; BEGIN(INITIAL); }
<COMMENT>[^>]+                          { position+=yyleng; }

 /* The tag name decides which attributes are looked for. */

<ANGLE_START>"img"{W}                   { position+=yyleng; BEGIN(IMAGE); }
<ANGLE_START>"a"{W}                     { position+=yyleng; BEGIN(ANCHOR); }
<ANGLE_START>"link"{W}                  { position+=yyleng; BEGIN(ANCHOR); }
<ANGLE_START>"body"{W}                  { position+=yyleng; BEGIN(BODY); }
<ANGLE_START>"/body"                    { position+=yyleng; BEGIN(ANGLE); body_end=position-yyleng-open_angle; }
<ANGLE_START>"/html"                    { position+=yyleng; BEGIN(ANGLE); html_end=position-yyleng-open_angle; }
<ANGLE_START>">"                        { position+=yyleng; BEGIN(INITIAL); }
<ANGLE_START>.                          { position+=yyleng; BEGIN(ANGLE); }

 /* Any other tag is skipped up to its ">". */

<ANGLE>">"                              { position+=yyleng; BEGIN(INITIAL); }
<ANGLE>[^>]+                            { position+=yyleng; }

 /* An image tag: the value of the src attribute is an image.  In the value state
    a ">" (empty value) ends the tag and a newline is consumed like any other character. */

<IMAGE>">"                              { position+=yyleng; BEGIN(INITIAL); }
<IMAGE>"src"{W}*"="{W}*"\""*            { position+=yyleng; BEGIN(IMAGE_SRC); }
<IMAGE>.|\n                             { position+=yyleng; }
<IMAGE_SRC>{F}+                         { position+=yyleng; BEGIN(IMAGE); add_image_or_link(yytext,1); }
<IMAGE_SRC>">"                          { position+=yyleng; BEGIN(INITIAL); }
<IMAGE_SRC>.|\n                         { position+=yyleng; BEGIN(IMAGE); }

 /* An anchor or link tag: the value of the href attribute is a link. */

<ANCHOR>">"                             { position+=yyleng; BEGIN(INITIAL); }
<ANCHOR>"href"{W}*"="{W}*"\""*          { position+=yyleng; BEGIN(ANCHOR_HREF); }
<ANCHOR>.|\n                            { position+=yyleng; }
<ANCHOR_HREF>{F}+                       { position+=yyleng; BEGIN(ANCHOR); add_image_or_link(yytext,0); }
<ANCHOR_HREF>">"                        { position+=yyleng; BEGIN(INITIAL); }
<ANCHOR_HREF>.|\n                       { position+=yyleng; BEGIN(ANCHOR); }

 /* A body tag: the value of the background attribute is an image. */

<BODY>">"                               { position+=yyleng; BEGIN(INITIAL); }
<BODY>"background"{W}*"="{W}*"\""*      { position+=yyleng; BEGIN(BODY_BACK); }
<BODY>.|\n                              { position+=yyleng; }
<BODY_BACK>{F}+                         { position+=yyleng; BEGIN(BODY); add_image_or_link(yytext,1); }
<BODY_BACK>">"                          { position+=yyleng; BEGIN(INITIAL); }
<BODY_BACK>.|\n                         { position+=yyleng; BEGIN(BODY); }

 /* The end of the file: an end of body or end of html tag only counts as the end of
    the document if fewer than 16 characters follow it, otherwise the end of file is used. */

<<EOF>>                                 { if(body_end && html_end && (html_end-body_end)<16 && (position-html_end)<16)
                                             body_or_html_end=body_end;
                                          else if(body_end && (position-body_end)<16)
                                             body_or_html_end=body_end;
                                          else if(html_end && (position-html_end)<16)
                                             body_or_html_end=html_end;
                                          else
                                             body_or_html_end=position;
                                          return(EOF); }

%%
