/* Character classes used by the rules below: W is a whitespace character,
   Q is a single or double quote, F is a character accepted as part of a URL
   and FA is the same set as F without the comma (the ARCHIVE rules use FA
   so that a comma ends one archive name). */
W               [ \t\r\n]
Q               [\"\']
F               [-a-z0-9$_.!*(),%;/?:@&=+~|]
FA              [-a-z0-9$_.!*()%;/?:@&=+~|]

/* Exclusive start conditions; each group is entered from ANGLE_START or from
   an attribute name matched inside the corresponding tag. */
%x ANGLE_START ANGLE
%x ANCHOR ANCHOR_HREF
%x AREA AREA_HREF
%x BASE BASE_HREF
%x BODY BODY_BACK
%x COMMENT
%x FRAME FRAME_SRC FRAME_LINK
%x IMAGE IMAGE_SRC IMAGE_LINK
%x INPUT INPUT_SRC INPUT_LINK
%x LINK LINK_STYLE LINK_STYLE_HREF
%x META META_REFRESH META_REFRESH_URL
%x OBJECT OBJECT_SRC OBJECT_PART ARCHIVE
%x PARAM PARAM_VALUE
%x SCRIPT SCRIPT_SRC
%x QUOTE QUOTE_CITE

%{
/***************************************
  $Header: /home/amb/wwwoffle/RCS/html.l 2.34 1999/02/19 19:55:09 amb Exp $

  WWWOFFLE - World Wide Web Offline Explorer - Version 2.4c.
  Parse the HTML and look for the images, links and other things.
  ******************/ /******************
  Written by Andrew M. Bishop
  Object handling by Walter Pfannenmueller

  This file Copyright 1997,98,99 Andrew M. Bishop
  It may be distributed under the GNU Public License, version 2, or
  any higher version.  See section COPYING of the GNU Public license
  for conditions under which this file may be redistributed.
  ***************************************/


#define DEBUG_HTML 0

#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#include <sys/stat.h>
#include <unistd.h>
#include <time.h>

#include "wwwoffle.h"
#include "document.h"
#include "config.h"
#include "misc.h"

#include "errors.h"

extern int html_yylex(void);
#define html_yywrap() 1


/*+ The refresh content of a Meta tag. +*/
static char *meta_refresh=NULL;

/*+ The content of a Base tag. +*/
static char *base_url=NULL;

/*+ The file descriptor that we are reading from. +*/
static int html_yyfd=-1;

/*+ The base URL of this page. +*/
static URL *baseUrl=NULL;


/*++++++++++++++++++++++++++++++++++++++
  Parse the HTML and look for references to image/links/frames.

  int fd The file descriptor of the file to parse.

  URL *Url The reference URL to use.

  The Base href and Meta Refresh strings remembered from the previous document
  are released before the new document is scanned.  If the document contains a
  Base href then the base URL is replaced by the result of SplitURL() on it.
  ++++++++++++++++++++++++++++++++++++++*/

void ParseHTML(int fd,URL *Url)
{
 PrintMessage(Debug,"Parsing document using HTML parser.");

 baseUrl=Url;

 SetBaseURL(baseUrl);

 /* The lexer stores malloc'ed copies in base_url and meta_refresh;
    drop the ones left over from the previous document. */

 free(base_url);
 base_url=NULL;

 free(meta_refresh);
 meta_refresh=NULL;

 ResetReferences();

 html_yyfd=fd;
 html_yyrestart(NULL);
 html_yylex();

 if(base_url)
   {
    /* NOTE(review): the URL returned by SplitURL() is never released in this file - confirm who owns it. */
    baseUrl=SplitURL(base_url);
    SetBaseURL(baseUrl);
   }
}


/*++++++++++++++++++++++++++++++++++++++
  Return the URL from the Meta Refresh tag if there is one.

  char *MetaRefresh Returns the new URL or NULL if none.

  The stored string is passed through LinkURL() together with the base URL; when
  LinkURL() hands back a different string the old one is freed and replaced.
  ++++++++++++++++++++++++++++++++++++++*/

char *MetaRefresh(void)
{
 char *resolved;

 if(!meta_refresh)
    return(NULL);

 resolved=LinkURL(baseUrl,meta_refresh);

 if(resolved!=meta_refresh)
   {
    free(meta_refresh);
    meta_refresh=resolved;
   }

 return(meta_refresh);
}


/*++++++++++++++++++++++++++++++++++++++
  Object and Param treatment:
  this is the attempt to extract all 
  valid URIs from the OBJECT or PARAM tag
  defined in HTML 4.0.

  there is a problem with inline data, classes, ..
  or
  URIs of the form
  java:...
  data:...
  clsid:...
  and inline data
 
  these will be taken care of later.

  (c) Walter Pfannenmueller
  ++++++++++++++++++++++++++++++++++++++*/

/* no more than obj_archives_max are accepted: seems enough */
#define obj_archives_max 32

/* Object */
/* Indices into obj_parts[]: one slot per attribute collected from the tag,
   followed by obj_archives_max slots for the entries of the archive list. */
enum {
    obj_classid = 0,
    obj_codetype,
    obj_codebase,
    obj_code,
    obj_object,
    obj_data,
    obj_usemap,
    obj_type,
    obj_longdesc,

    obj_archives_start,
    obj_archives_end = obj_archives_start + obj_archives_max,
    obj_parts_size
};

/* Set to RefImage by the lexer when the codetype / type value starts with "image". */
static int obj_codetype_Type = RefObject; 
static int obj_type_Type = RefObject; 
/* Number of archive slots currently filled by add_obj_archive(). */
static int obj_narchives = 0;

/* The attribute values collected for the tag being scanned (malloc'ed, or NULL when absent). */
static char *obj_parts[obj_parts_size] = { NULL, }; 

/* Param */

/* Indices into param_parts[]. */
enum {
   param_type = 0,
   param_value,
   param_parts_size
};

/* Non-zero when the lexer decided that the value of the current param is a URL. */
static int param_valuetype_is_ref = 0;
/* The type and value strings collected for the current param tag (malloc'ed, or NULL when absent). */
static char *param_parts[param_parts_size] = { NULL, }; 

/*+ The slot (an entry of obj_parts or param_parts) that op_free() and op_malloc() work on. +*/
static char **op_ = NULL;

/*+++++++++++++++++++++++++++++++++++++++++++++++
  Release the string held in the slot selected by op_ and mark the slot empty.
  Nothing happens when no slot has been selected yet or the slot is already empty.
  +++++++++++++++++++++++++++++++++++++++++++++++*/
static void op_free(void)
{
    if(op_ == NULL)
        return;

    free(*op_);
    *op_ = NULL;
}

/*+++++++++++++++++++++++++++++++++++++++++++++++
  Store a private copy of a string in the slot selected by op_,
  freeing whatever the slot held before.

  char *text The NUL terminated string to copy.

  When no memory is available the slot is left empty (NULL); the code that
  reads obj_parts / param_parts tests for NULL before using an entry.
  +++++++++++++++++++++++++++++++++++++++++++++++*/
static void op_malloc(char *text)
{
    size_t size = strlen(text) + 1;

    if(op_ == NULL)
        return;

    op_free();
    *op_ = (char *)malloc(size);
    if(*op_ != NULL)
        memcpy(*op_,text,size);
}

/*+++++++++++++++++++++++++++++++++++++++++++++++
  turn dots to slash and add .class to Applets

  char *norm_applet_class Returns the normalised string now stored in obj_parts[part]
                          (NULL if that slot is empty).

  int part The index into obj_parts of the class name to normalise.

  ".class" is appended unless the name already ends with it (names shorter than
  the suffix are handled), then every dot in front of the suffix becomes a slash.
  If the longer string cannot be allocated the entry is left untouched.
  +++++++++++++++++++++++++++++++++++++++++++++++*/
static const char class_suffix[] = ".class";
static char *norm_applet_class(int part)
{
    const size_t suffix_len = sizeof(class_suffix) - 1;
    char *applet = obj_parts[part];
    char *stem_end;
    char *dot;
    size_t len;

    if(applet == NULL)
        return NULL;

    len = strlen(applet);

    /* Test the length first so that the suffix comparison never starts before the string. */
    if(len < suffix_len || strcmp(applet + (len - suffix_len),class_suffix) != 0)
    {
        char *suffixed = (char *)malloc(len + suffix_len + 1);

        if(suffixed == NULL)
            return applet;

        memcpy(suffixed,applet,len);
        memcpy(suffixed + len,class_suffix,suffix_len + 1);

        op_ = &obj_parts[part];
        op_free();
        *op_ = suffixed;

        applet = suffixed;
        len += suffix_len;
    }

    /* The string ends with the suffix here, so stem_end lies inside it and marks the suffix's own dot. */
    stem_end = applet + (len - suffix_len);
    for(dot = strchr(applet,'.'); dot != NULL && dot < stem_end; dot = strchr(dot + 1,'.'))
        *dot = '/';

    return applet;
}
 
/*+++++++++++++++++++++++++++++++++++++++++++++++
  add codebase

  char *add_obj_codebase Returns the string now stored in obj_parts[part].

  int part The index into obj_parts of the entry to prefix.

  When a codebase was collected the entry is replaced by codebase + "/" + entry
  (the slash is only inserted if the codebase does not already end with one).
  The entry is left unchanged when there is no codebase, the codebase is empty,
  the entry itself is empty (NULL) or no memory is available.
  +++++++++++++++++++++++++++++++++++++++++++++++*/
static char *add_obj_codebase(int part)
{
    const char *base = obj_parts[obj_codebase];
    const char *name = obj_parts[part];

    if(base != NULL && name != NULL && *base != '\0')
    {
        size_t base_len = strlen(base);
        size_t name_len = strlen(name);
        int need_slash = (base[base_len - 1] != '/');
        char *url = (char *)malloc(base_len + (size_t)need_slash + name_len + 1);

        if(url != NULL)
        {
            memcpy(url,base,base_len);
            if(need_slash)
                url[base_len++] = '/';
            memcpy(url + base_len,name,name_len + 1);

            op_ = &obj_parts[part];
            op_free();
            *op_ = url;
        }
    }
    return obj_parts[part];
}

/*++++++++++++++++++++++++++++++++++++++
  take the object's info and add codebase

  int part The index into obj_parts of the entry to use (ignored when empty).

  RefType refType The reference type passed on to AddReference().
  ++++++++++++++++++++++++++++++++++++++*/
static void codebase_url(int part, RefType refType)
{
    if(!obj_parts[part])
        return;

    /* add_obj_codebase() returns the (possibly replaced) obj_parts[part]. */
    AddReference(add_obj_codebase(part), refType);
}

/*++++++++++++++++++++++++++++++++++++++
  take the object's info and build an applet url

  int part The index into obj_parts of the class name (ignored when empty).
  ++++++++++++++++++++++++++++++++++++++*/
static void applet_url(int part)
{
    if(obj_parts[part] == NULL)
        return;

    norm_applet_class(part);
    codebase_url(part,RefInlineObject);
}

/*++++++++++++++++++++++++++++++++++++++
  take the object's info and build urls

  Called when the end of the tag is reached: adds a reference for every part
  that was collected, then empties all slots and resets the per-tag state.
  ++++++++++++++++++++++++++++++++++++++*/
static void build_obj_urls()
{
    int slot;
    int is_image = (obj_codetype_Type == RefImage || obj_type_Type == RefImage);

    /* classid and data are image references for image types, applet classes otherwise */
    if(!is_image)
    {
        applet_url(obj_classid);
        applet_url(obj_data);
    }
    else
    {
        codebase_url(obj_classid,RefImage);
        codebase_url(obj_data,RefImage);
    }

    applet_url(obj_code);
    applet_url(obj_object);

    codebase_url(obj_usemap,RefLink);
    codebase_url(obj_longdesc,RefLink);

    for(slot = obj_archives_start; slot < obj_archives_start + obj_narchives; slot++)
        codebase_url(slot, RefObject);

    /* forget everything that belonged to this tag */
    for(slot = 0; slot < obj_parts_size; slot++)
    {
        op_ = &obj_parts[slot];
        op_free();
    }

    obj_narchives = 0;
    obj_type_Type = RefObject;
    obj_codetype_Type = RefObject;
}

/*+++++++++++++++
  storing archives

  char *text The archive name to remember; silently dropped
             once obj_archives_max names are stored.
  +++++++++++++++*/
static void add_obj_archive(char *text)
{
    if(obj_narchives >= obj_archives_max)
        return;

    op_ = &obj_parts[obj_archives_start + obj_narchives];
    obj_narchives++;
    op_malloc(text);
}

/*+++++++++++++++++++++++++++++++++++++++++
  take the param's info and build urls

  Called when the end of the tag is reached: the value is added as a reference
  if it was flagged as a URL, then the flag and both slots are cleared.
  +++++++++++++++++++++++++++++++++++++++++*/
static void build_param_urls()
{
    int slot;
    char *value = param_parts[param_value];

    if(value != NULL && param_valuetype_is_ref)
        AddReference(value,RefObject);

    param_valuetype_is_ref = 0;

    for(slot = 0; slot < param_parts_size; slot++)
    {
        op_ = &param_parts[slot];
        op_free();
    }
}


/*+ A macro to read data that can be used by the lexer. +*/
/* Replaces the scanner's default input: asks read_data() for up to max_size
   bytes from html_yyfd into buf and stores the count in result.  A return of
   -1 from read_data() is turned into 0, which flex's documentation describes
   as the end-of-file indication for YY_INPUT. */
#define YY_INPUT(buf,result,max_size) \
        if((result=read_data(html_yyfd,buf,max_size))==-1) \
           result=0;

%}

%%
 /* Handle comments and other angle brackets */
 /* Text outside of tags is discarded; "<" followed by optional whitespace starts tag name recognition. */

[^<]+                                   { }
"<!--"                                  { BEGIN(COMMENT); }
"<"{W}*                                 { BEGIN(ANGLE_START); }

 /* Comments */
 /* Everything is skipped until the closing sequence; a lone '-' or '>' does not end the comment. */

<COMMENT>"-->"                          { BEGIN(INITIAL); }
<COMMENT>">"                            { }
<COMMENT>"-"                            { }
<COMMENT>[^->]+                         { }

 /* Angle brackets */
 /* The tag name selects the start condition that scans the attributes.  Every name must be */
 /* followed by whitespace; alternatives are grouped so that this applies to each of them.  */
 /* Any other tag is skipped up to its closing '>' in the ANGLE condition.                  */

<ANGLE_START>">"                        { BEGIN(INITIAL); }
<ANGLE_START>"a"{W}                     { BEGIN(ANCHOR); }
<ANGLE_START>"area"{W}                  { BEGIN(AREA); }
<ANGLE_START>"base"{W}                  { BEGIN(BASE); }
<ANGLE_START>"body"{W}                  { BEGIN(BODY); }
<ANGLE_START>("frame"|"iframe"){W}      { BEGIN(FRAME); }
<ANGLE_START>"img"{W}                   { BEGIN(IMAGE); }
<ANGLE_START>"input"{W}                 { BEGIN(INPUT); }
<ANGLE_START>"link"{W}                  { BEGIN(LINK); }
<ANGLE_START>"meta"{W}                  { BEGIN(META); }
<ANGLE_START>("object"|"applet"|"embed"|"xml"){W} { BEGIN(OBJECT); }
<ANGLE_START>"param"{W}                 { BEGIN(PARAM); }
<ANGLE_START>"script"{W}                { BEGIN(SCRIPT); }
<ANGLE_START>("q"|"blockquote"){W}      { BEGIN(QUOTE); }
<ANGLE_START>.|\r|\n                    { BEGIN(ANGLE); }

<ANGLE>">"                              { BEGIN(INITIAL); }
<ANGLE>[^>]+                            { }

 /* Base */
 /* The value of the last href seen is kept in base_url (malloc'ed); an earlier value is freed first. */

<BASE>">"                               { BEGIN(INITIAL); }
<BASE>"href"{W}*"="{W}*{Q}*             { BEGIN(BASE_HREF); }
<BASE>.|\r|\n                           { }

<BASE_HREF>">"                          { BEGIN(INITIAL); }
<BASE_HREF>{F}+                         { BEGIN(BASE);
                                          free(base_url);
                                          base_url=(char*)malloc(strlen(html_yytext)+1);
                                          if(base_url) strcpy(base_url,html_yytext); }
<BASE_HREF>.|\r|\n                      { BEGIN(BASE); }

 /* Meta refresh */
 /* After http-equiv=refresh has been seen, the text following "URL=" is kept in meta_refresh */
 /* (malloc'ed); an earlier value is freed first.  A '>' always ends the tag.                 */

<META>">"                               { BEGIN(INITIAL); }
<META>"HTTP-EQUIV"{W}*"="{W}*{Q}*"Refresh"{Q}* { BEGIN(META_REFRESH); }
<META>.|\r|\n                           { }

<META_REFRESH>">"                       { BEGIN(INITIAL); }
<META_REFRESH>"URL"{W}*"="{W}*{Q}*      { BEGIN(META_REFRESH_URL); }
<META_REFRESH>.|\r|\n                   { }

<META_REFRESH_URL>">"                   { BEGIN(INITIAL); }
<META_REFRESH_URL>{F}+                  { BEGIN(META);
                                          free(meta_refresh);
                                          meta_refresh=(char*)malloc(strlen(html_yytext)+1);
                                          if(meta_refresh) strcpy(meta_refresh,html_yytext); }
<META_REFRESH_URL>.|\r|\n               { BEGIN(META); }

 /* Stylesheets */
 /* Only an href that follows rel=stylesheet inside the same tag is recorded.  A '>' always ends the tag. */

<LINK>">"                               { BEGIN(INITIAL); }
<LINK>"REL"{W}*"="{W}*{Q}*"Stylesheet"{Q}* { BEGIN(LINK_STYLE); }
<LINK>.|\r|\n                           { }

<LINK_STYLE>">"                         { BEGIN(INITIAL); }
<LINK_STYLE>"href"{W}*"="{W}*{Q}*       { BEGIN(LINK_STYLE_HREF); }
<LINK_STYLE>.|\r|\n                     { }

<LINK_STYLE_HREF>">"                    { BEGIN(INITIAL); }
<LINK_STYLE_HREF>{F}+                   { BEGIN(LINK); AddReference(html_yytext,RefStyleSheet); }
<LINK_STYLE_HREF>.|\r|\n                { BEGIN(LINK); }

 /* Images */
 /* img: src is recorded as an image, longdesc and usemap as links. */

<IMAGE>">"                              { BEGIN(INITIAL); }
<IMAGE>"src"{W}*"="{W}*{Q}*             { BEGIN(IMAGE_SRC); }
<IMAGE>"longdesc"{W}*"="{W}*{Q}*        { BEGIN(IMAGE_LINK); }
<IMAGE>"usemap"{W}*"="{W}*{Q}*          { BEGIN(IMAGE_LINK); }
<IMAGE>.|\r|\n                          { }

<IMAGE_SRC>">"                          { BEGIN(INITIAL); }
<IMAGE_SRC>{F}+                         { BEGIN(IMAGE); AddReference(html_yytext,RefImage); }
<IMAGE_SRC>.|\r|\n                      { BEGIN(IMAGE); }

<IMAGE_LINK>">"                         { BEGIN(INITIAL); }
<IMAGE_LINK>{F}+                        { BEGIN(IMAGE); AddReference(html_yytext,RefLink); }
<IMAGE_LINK>.|\r|\n                     { BEGIN(IMAGE); }

 /* input: src is recorded as an image, usemap as a link. */

<INPUT>">"                              { BEGIN(INITIAL); }
<INPUT>"src"{W}*"="{W}*{Q}*             { BEGIN(INPUT_SRC); }
<INPUT>"usemap"{W}*"="{W}*{Q}*          { BEGIN(INPUT_LINK); }
<INPUT>.|\r|\n                          { }

<INPUT_SRC>">"                          { BEGIN(INITIAL); }
<INPUT_SRC>{F}+                         { BEGIN(INPUT); AddReference(html_yytext,RefImage); }
<INPUT_SRC>.|\r|\n                      { BEGIN(INPUT); }

<INPUT_LINK>">"                         { BEGIN(INITIAL); }
<INPUT_LINK>{F}+                        { BEGIN(INPUT); AddReference(html_yytext,RefLink); }
<INPUT_LINK>.|\r|\n                     { BEGIN(INPUT); }

 /* body: background is recorded as an image. */

<BODY>">"                               { BEGIN(INITIAL); }
<BODY>"background"{W}*"="{W}*{Q}*       { BEGIN(BODY_BACK); }
<BODY>.|\r|\n                           { }

<BODY_BACK>">"                          { BEGIN(INITIAL); }
<BODY_BACK>{F}+                         { BEGIN(BODY); AddReference(html_yytext,RefImage); }
<BODY_BACK>.|\r|\n                      { BEGIN(BODY); }

 /* Frames */
 /* src is recorded as a frame, longdesc as a link. */

<FRAME>">"                              { BEGIN(INITIAL); }
<FRAME>"src"{W}*"="{W}*{Q}*             { BEGIN(FRAME_SRC); }
<FRAME>"longdesc"{W}*"="{W}*{Q}*        { BEGIN(FRAME_LINK); }
<FRAME>.|\r|\n                          { }

<FRAME_SRC>">"                          { BEGIN(INITIAL); }
<FRAME_SRC>{F}+                         { BEGIN(FRAME); AddReference(html_yytext,RefFrame); }
<FRAME_SRC>.|\r|\n                      { BEGIN(FRAME); }

<FRAME_LINK>">"                         { BEGIN(INITIAL); }
<FRAME_LINK>{F}+                        { BEGIN(FRAME); AddReference(html_yytext,RefLink); }
<FRAME_LINK>.|\r|\n                     { BEGIN(FRAME); }

 /* Scripts */
 /* src is recorded as a script reference. */

<SCRIPT>">"                             { BEGIN(INITIAL); }
<SCRIPT>"src"{W}*"="{W}*{Q}*            { BEGIN(SCRIPT_SRC); }
<SCRIPT>.|\r|\n                         { }

<SCRIPT_SRC>">"                         { BEGIN(INITIAL); }
<SCRIPT_SRC>{F}+                        { BEGIN(SCRIPT); AddReference(html_yytext,RefScript); }
<SCRIPT_SRC>.|\r|\n                     { BEGIN(SCRIPT); }

 /* Objects */
 /* Attribute values are collected in obj_parts[] (op_ selects the slot that OBJECT_PART fills) */
 /* and turned into references by build_obj_urls() when the tag ends.  Every '>' that ends the  */
 /* tag calls build_obj_urls() so that no collected part is carried over into the next tag.     */
 /* src is the exception: it is recorded directly as an inline object.                          */

<OBJECT>">"                             { BEGIN(INITIAL); build_obj_urls(); }
<OBJECT>"code"{W}*"="{W}*{Q}*           { BEGIN(OBJECT_PART); op_ = &obj_parts[obj_code]; }
<OBJECT>"classid"{W}*"="{W}*{Q}*"java:" { BEGIN(OBJECT_PART); op_ = &obj_parts[obj_classid]; }
<OBJECT>"classid"{W}*"="{W}*{Q}*"clsid:" { BEGIN(OBJECT_PART); op_ = &obj_parts[obj_classid]; }
<OBJECT>"classid"{W}*"="{W}*{Q}*        { BEGIN(OBJECT_PART); op_ = &obj_parts[obj_classid]; }
<OBJECT>"codetype"{W}*"="{W}*{Q}*"image" { BEGIN(OBJECT_PART); op_ = &obj_parts[obj_codetype];
                                          obj_codetype_Type = RefImage; }
<OBJECT>"codetype"{W}*"="{W}*{Q}*       { BEGIN(OBJECT_PART); op_ = &obj_parts[obj_codetype]; }
<OBJECT>"codebase"{W}*"="{W}*{Q}*       { BEGIN(OBJECT_PART); op_ = &obj_parts[obj_codebase]; }
<OBJECT>"object"{W}*"="{W}*{Q}*         { BEGIN(OBJECT_PART); op_ = &obj_parts[obj_object]; }
<OBJECT>"data"{W}*"="{W}*{Q}*           { BEGIN(OBJECT_PART); op_ = &obj_parts[obj_data]; }
<OBJECT>"src"{W}*"="{W}*{Q}*            { BEGIN(OBJECT_SRC);}
<OBJECT>"type"{W}*"="{W}*{Q}*"image"    { BEGIN(OBJECT_PART); op_ = &obj_parts[obj_type];
                                          obj_type_Type = RefImage; }
<OBJECT>"type"{W}*"="{W}*{Q}*           { BEGIN(OBJECT_PART); op_ = &obj_parts[obj_type]; }
<OBJECT>"usemap"{W}*"="{W}*{Q}*         { BEGIN(OBJECT_PART); op_ = &obj_parts[obj_usemap]; }
<OBJECT>"longdesc"{W}*"="{W}*{Q}*       { BEGIN(OBJECT_PART); op_ = &obj_parts[obj_longdesc]; }
<OBJECT>"archive"{W}*"="{W}*{Q}*        { BEGIN(ARCHIVE); }
<OBJECT>.|\r|\n                         { }

<OBJECT_SRC>">"                         { BEGIN(INITIAL); build_obj_urls(); }
<OBJECT_SRC>{F}+                        { BEGIN(OBJECT); AddReference(html_yytext,RefInlineObject); }
<OBJECT_SRC>.|\r|\n                     { BEGIN(OBJECT); }

<OBJECT_PART>">"                        { BEGIN(INITIAL); build_obj_urls(); }
<OBJECT_PART>{F}+                       { BEGIN(OBJECT); op_malloc(html_yytext); }
<OBJECT_PART>.|\r|\n                    { }

 /* The archive list: names are separated by commas and/or whitespace.  The last rule can match */
 /* the empty string, so anything not handled by the rules before it returns to OBJECT.         */

<ARCHIVE>">"                            { BEGIN(INITIAL); build_obj_urls(); }
<ARCHIVE>{W}*","{W}*                    { }
<ARCHIVE>{W}+                           { }
<ARCHIVE>{FA}+                          { add_obj_archive(html_yytext); }
<ARCHIVE>{W}*|{Q}                       { BEGIN(OBJECT); }

<PARAM>">"                              { BEGIN(INITIAL); build_param_urls(); }
<PARAM>"type"{W}*"="{W}*{Q}*            { BEGIN(PARAM_VALUE); op_ = &param_parts[param_type]; }
<PARAM>"valuetype"{W}*"="{W}*{Q}*"ref"  { param_valuetype_is_ref = 1; } 
<PARAM>"value"{W}*"="{W}*{Q}*           { BEGIN(PARAM_VALUE); op_ = &param_parts[param_value]; }
<PARAM>"name"{W}*"="{W}*{Q}*"href"|"file"|"ref" { param_valuetype_is_ref = 1; } 
<PARAM>.|\r|\n                          { }

<PARAM_VALUE>{F}+                       { BEGIN(PARAM); op_malloc(html_yytext); }
<PARAM_VALUE>.|\r|\n                    { }

 /* Links */
 /* a and area: href is recorded as a link (a leading '#' or any other character ends the value). */

<ANCHOR>">"                             { BEGIN(INITIAL); }
<ANCHOR>"href"{W}*"="{W}*{Q}*           { BEGIN(ANCHOR_HREF); }
<ANCHOR>.|\r|\n                         { }

<ANCHOR_HREF>">"                        { unput('>'); BEGIN(ANCHOR); }
<ANCHOR_HREF>{F}+                       { BEGIN(ANCHOR); AddReference(html_yytext,RefLink); }
<ANCHOR_HREF>"#"                        { BEGIN(ANCHOR); }
<ANCHOR_HREF>.|\r|\n                    { BEGIN(ANCHOR); }

<AREA>">"                               { BEGIN(INITIAL); }
<AREA>"href"{W}*"="{W}*{Q}*             { BEGIN(AREA_HREF); }
<AREA>.|\r|\n                           { }

<AREA_HREF>">"                          { BEGIN(INITIAL); }
<AREA_HREF>{F}+                         { BEGIN(AREA); AddReference(html_yytext,RefLink); }
<AREA_HREF>.|\r|\n                      { BEGIN(AREA); }

 /* q and blockquote: cite is recorded as a link. */

<QUOTE>">"                              { BEGIN(INITIAL); }
<QUOTE>"cite"{W}*"="{W}*{Q}*            { BEGIN(QUOTE_CITE); }
<QUOTE>.|\r|\n                          { }

<QUOTE_CITE>">"                          { BEGIN(INITIAL); }
<QUOTE_CITE>{F}+                         { BEGIN(QUOTE); AddReference(html_yytext,RefLink); }
<QUOTE_CITE>.|\r|\n                      { BEGIN(QUOTE); }

%%
