/* Whitespace characters. */
W               [ \t\r\n]
/* Characters accepted in an unquoted attribute value (see the TAG_ATTR_VAL rules). */
F               [-a-z0-9$_.!*(),%;/?:@&=+~|#]
/* Characters accepted in tag names and attribute names (see the TAG_START and TAG rules).
   NOTE(review): only lower-case letters are listed here and in F, so the scanner is
   presumably generated case-insensitively (flex -i) - confirm against the build rules. */
K               [a-z0-9-]

/* Exclusive start conditions: one per syntactic context that the scanner can be in. */
%x DOCTYPE
%x COMMENT COMMENT_BAD
%x TAG_START TAG TAG_ATTR_KEY TAG_ATTR_VAL
%x DQUOTED SQUOTED

%{
/***************************************
  $Header: /home/amb/wwwoffle/RCS/htmlmodify.l 1.9 1999/09/30 16:31:17 amb Exp $

  WWWOFFLE - World Wide Web Offline Explorer - Version 2.5a.
  Parse the HTML and modify the source.
  ******************/ /******************
  Written by Andrew M. Bishop

  This file Copyright 1997,98,99 Andrew M. Bishop
  It may be distributed under the GNU Public License, version 2, or
  any higher version.  See section COPYING of the GNU Public license
  for conditions under which this file may be redistributed.
  ***************************************/


#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#include <sys/stat.h>
#include <unistd.h>
#include <time.h>

#include "wwwoffle.h"
#include "document.h"
#include "config.h"
#include "misc.h"


/* Token codes returned by htmlmodify_yylex(); zero is reserved for end of input. */

enum
{
 LEX_PLAINTEXT = 1,   /* Text outside of any tag. */
 LEX_COMMENT   = 2,   /* A complete comment. */
 LEX_DOCTYPE   = 3,   /* A complete DOCTYPE declaration. */

 LEX_TAG_BEGIN = 4,   /* The name of a tag has been read. */
 LEX_TAG_END   = 5,   /* The end of a tag has been reached. */

 LEX_ATTR_KEY  = 6,   /* The name of an attribute has been read. */
 LEX_ATTR_VAL  = 7    /* The value (possibly absent) of an attribute has been read. */
};

/* The tags that are acted upon.  The values are used as indexes into the tags[]
   string table, so the two lists must be kept in the same order. */

typedef enum _HTMLTags
{
 tag_a,           /*  0 : "a"         */
 tag__a,          /*  1 : "/a"        */
 tag_base,        /*  2 : "base"      */
 tag_blink,       /*  3 : "blink"     */
 tag__blink,      /*  4 : "/blink"    */
 tag__body,       /*  5 : "/body"     */
 tag__html,       /*  6 : "/html"     */
 tag_noscript,    /*  7 : "noscript"  */
 tag__noscript,   /*  8 : "/noscript" */
 tag_script,      /*  9 : "script"    */
 tag__script,     /* 10 : "/script"   */
 tag_ntags        /* 11 : the number of tags, also used to mean "no recognised tag" */
}
HTMLTags;

/* Tag strings, indexed by the HTMLTags value of the same position.  The table and
   the strings are never modified so both levels are declared const. */

static const char *const tags[]=
{
 /* tag_a         = 0  */  "a"         ,
 /* tag__a        = 1  */  "/a"        ,
 /* tag_base      = 2  */  "base"      ,
 /* tag_blink     = 3  */  "blink"     ,
 /* tag__blink    = 4  */  "/blink"    ,
 /* tag__body     = 5  */  "/body"     ,
 /* tag__html     = 6  */  "/html"     ,
 /* tag_noscript  = 7  */  "noscript"  ,
 /* tag__noscript = 8  */  "/noscript" ,
 /* tag_script    = 9  */  "script"    ,
 /* tag__script   =10  */  "/script"
};

/* The attributes that are acted upon.  The values are used as indexes into the
   attributes[] string table, so the two lists must be kept in the same order. */

typedef enum _HTMLAttributes
{
 att_href,          /*  0 : "href"        */
 att_onblur,        /*  1 : "onblur"      */
 att_onchange,      /*  2 : "onchange"    */
 att_onclick,       /*  3 : "onclick"     */
 att_ondblclick,    /*  4 : "ondblclick"  */
 att_onfocus,       /*  5 : "onfocus"     */
 att_onkeydown,     /*  6 : "onkeydown"   */
 att_onkeypress,    /*  7 : "onkeypress"  */
 att_onload,        /*  8 : "onload"      */
 att_onmousedown,   /*  9 : "onmousedown" */
 att_onmousemove,   /* 10 : "onmousemove" */
 att_onmouseout,    /* 11 : "onmouseout"  */
 att_onmouseover,   /* 12 : "onmouseover" */
 att_onmouseup,     /* 13 : "onmouseup"   */
 att_onreset,       /* 14 : "onreset"     */
 att_onselect,      /* 15 : "onselect"    */
 att_onsubmit,      /* 16 : "onsubmit"    */
 att_onunload,      /* 17 : "onunload"    */
 att_natts          /* 18 : the number of attributes, also used to mean "no recognised attribute" */
}
HTMLAttributes;

/* Attribute strings, indexed by the HTMLAttributes value of the same position.  The
   table and the strings are never modified so both levels are declared const. */

static const char *const attributes[]=
{
 /* att_href        = 0 */ "href"        ,
 /* att_onblur      = 1 */ "onblur"      ,
 /* att_onchange    = 2 */ "onchange"    ,
 /* att_onclick     = 3 */ "onclick"     ,
 /* att_ondblclick  = 4 */ "ondblclick"  ,
 /* att_onfocus     = 5 */ "onfocus"     ,
 /* att_onkeydown   = 6 */ "onkeydown"   ,
 /* att_onkeypress  = 7 */ "onkeypress"  ,
 /* att_onload      = 8 */ "onload"      ,
 /* att_onmousedown = 9 */ "onmousedown" ,
 /* att_onmousemove =10 */ "onmousemove" ,
 /* att_onmouseout  =11 */ "onmouseout"  ,
 /* att_onmouseover =12 */ "onmouseover" ,
 /* att_onmouseup   =13 */ "onmouseup"   ,
 /* att_onreset     =14 */ "onreset"     ,
 /* att_onselect    =15 */ "onselect"    ,
 /* att_onsubmit    =16 */ "onsubmit"    ,
 /* att_onunload    =17 */ "onunload"
};


/*+ The function that consumes the tokens from the lexer and writes the modified page. +*/
static void modify_html(void);

/*+ The text that goes with the token most recently returned by the lexer;
    it is NULL when an attribute has no value. +*/
static char *htmlmodify_yylval=NULL;
/*+ The lexer generated from the rules in this file; it returns one of the LEX_* codes or 0 at the end of the input. +*/
extern int htmlmodify_yylex(void);

/*+ The file descriptor to output to. +*/
static int output_fd=-1;

/*+ The add-cache-info optional footer. +*/
static char *cache_info=NULL;

/*+ The file descriptor that we are reading from. +*/
static int htmlmodify_yyfd=-1;

/*+ The base URL of this page. +*/
static URL *baseUrl=NULL;

/*+ The quote character used; set by the lexer for each attribute value, an empty string if the value was not quoted. +*/
static char *quote="";

/*+ Set this to disable the output; YY_OUTPUT writes nothing while it is non-zero. +*/
static int disable_output=0;


/*++++++++++++++++++++++++++++++++++++++
  Output the spooled page with the HTML modifications applied.

  If the AddCacheInfo option is set then a footer giving the URL, the date of the
  spool file and its age is prepared for modify_html() to insert.  The footer is
  left out if the spool file cannot be examined with fstat().

  int client The file to write to.

  int spool The file to read from.

  URL *Url The URL that we are parsing.
  ++++++++++++++++++++++++++++++++++++++*/

void OutputHTMLWithModifications(int client,int spool,URL *Url)
{
 static int first=1;
 struct stat buf;

 /* Only use the modification time if fstat() filled it in. */

 if(AddCacheInfo && fstat(spool,&buf)==0)
   {
    time_t t_ago=time(NULL)-buf.st_mtime;
    char *date=RFC822Date(buf.st_mtime,0);
    char *timeunit;
    char timeago[24];           /* Large enough for any long, its sign and the terminator. */

    /* time_t is not necessarily a long so convert it explicitly for the "%ld" format. */

    if(t_ago<0)
      {strcpy(timeago,"?");timeunit="";}
    else if(t_ago<3600)
      {sprintf(timeago,"%ld",(long)(t_ago/60));timeunit="m";}
    else if(t_ago<(24*3600))
      {sprintf(timeago,"%ld",(long)(t_ago/3600));timeunit="h";}
    else if(t_ago<(14*24*3600))
      {sprintf(timeago,"%ld",(long)(t_ago/(24*3600)));timeunit="d";}
    else if(t_ago<(30*24*3600))
      {sprintf(timeago,"%ld",(long)(t_ago/(7*24*3600)));timeunit="w";}
    else
      {sprintf(timeago,"%ld",(long)(t_ago/(30*24*3600)));timeunit="M";}

    cache_info=HTMLMessageBody(-1,"AddCacheInfo",
                               "url",Url->name,
                               "date",date,
                               "time",timeago,
                               "unit",timeunit,
                               NULL);
   }

 baseUrl=Url;

 output_fd=client;
 htmlmodify_yyfd=spool;

 /* The lexer keeps its input buffer between calls so it is restarted for every page after the first. */

 if(!first)
    htmlmodify_yyrestart(NULL);

 modify_html();

 /* modify_html() normally uses up the footer, this is for the case that it did not. */

 free(cache_info);
 cache_info=NULL;

 first=0;
}


/*+ A macro to output the data if valid to do so.
    It is wrapped in do{}while(0) so that it is a single statement and can be used
    in an unbraced if/else without capturing the else. +*/
#define YY_OUTPUT(text) \
           do \
             { \
              if(!disable_output) \
                 write_string(output_fd,text); \
             } \
           while(0)

/*++++++++++++++++++++++++++++++++++++++
  Modify the HTML looking for all of the things to be changed.
  ++++++++++++++++++++++++++++++++++++++*/

static void modify_html(void)
{
 HTMLTags tag=tag_ntags;
 HTMLAttributes key=att_natts;
 int url_cached=0;
 int yychar;
 int disable_key_val;
 char *key_string=NULL;

 /* The actual parser. */

 while((yychar=htmlmodify_yylex()))
    switch(yychar)
      {
      case LEX_PLAINTEXT:
       break;

      case LEX_COMMENT:
       break;

      case LEX_DOCTYPE:
       break;

      case LEX_TAG_BEGIN:
       for(tag=0;tag<tag_ntags;tag++)
          if(!strcasecmp(htmlmodify_yylval,tags[tag]))
             break;

       if(tag==tag__a)
         {
          if(url_cached==1)
            {if(AnchorModifyEnd[0]) YY_OUTPUT(AnchorModifyEnd[0]);}
          else if(url_cached==2)
            {if(AnchorModifyEnd[1]) YY_OUTPUT(AnchorModifyEnd[1]);}
          else if(url_cached==-1)
            {if(AnchorModifyEnd[2]) YY_OUTPUT(AnchorModifyEnd[2]);}
          url_cached=0;
         }
       else if(tag==tag__body && cache_info)
         {YY_OUTPUT(cache_info); free(cache_info); cache_info=NULL;}
       else if(tag==tag__html && cache_info)
         {YY_OUTPUT(cache_info); free(cache_info); cache_info=NULL;}
       else if(DisableHTMLBlink && (tag==tag_blink || tag==tag__blink))
          disable_output++;
       else if(DisableHTMLScript && tag==tag_script)
          disable_output++;
       else if(DisableHTMLScript && (tag==tag_noscript || tag==tag__noscript))
         disable_output++;

       YY_OUTPUT("<");
       YY_OUTPUT(htmlmodify_yylval);
       break;

      case LEX_TAG_END:
       YY_OUTPUT(">");

       if(tag==tag_a)
         {
          if(url_cached==1)
            {if(AnchorModifyBegin[0]) YY_OUTPUT(AnchorModifyBegin[0]);}
          else if(url_cached==2)
            {if(AnchorModifyBegin[1]) YY_OUTPUT(AnchorModifyBegin[1]);}
          else if(url_cached==-1)
            {if(AnchorModifyBegin[2]) YY_OUTPUT(AnchorModifyBegin[2]);}
         }

       if(DisableHTMLBlink && (tag==tag_blink || tag==tag__blink))
         {if(disable_output) disable_output--;}
       else if(DisableHTMLScript && tag==tag__script)
         {if(disable_output) disable_output--;}
       else if(DisableHTMLScript && (tag==tag_noscript || tag==tag__noscript))
         {if(disable_output) disable_output--;}

       tag=tag_ntags;
       key=att_natts;
       break;

      case LEX_ATTR_KEY:
       key_string=(char*)realloc((void*)key_string,strlen(htmlmodify_yylval)+1);
       strcpy(key_string,htmlmodify_yylval);

       for(key=0;key<att_natts;key++)
          if(!strcasecmp(htmlmodify_yylval,attributes[key]))
             break;
      break;

      case LEX_ATTR_VAL:
       disable_key_val=0;

       if(htmlmodify_yylval && key!=att_natts)
         {
          /* Links */

          if(key==att_href && tag==tag_a)
            {
             char *link=NULL,*p,oldp=0;
             URL *Url=NULL;

             for(p=htmlmodify_yylval;*p;p++)
                if(*p=='#')
                  {
                   oldp=*p;
                   *p=0;
                   break;
                  }

             if(*htmlmodify_yylval)
               {
                link=LinkURL(baseUrl,htmlmodify_yylval);
                Url=SplitURL(link);
               }

             if(!Url || !Url->Protocol)
                url_cached=0;
             else if(ExistsWebpageSpoolFile(Url) || IsLocalNetHost(Url->host))
                url_cached=1;
             else if(ExistsOutgoingSpoolFile(Url))
                url_cached=2;
             else
                url_cached=-1;

             if(link!=htmlmodify_yylval)
                free(link);
             if(Url)
                FreeURL(Url);

             *p=oldp;
            }

          /* Base tag */

          else if(key==att_href && tag==tag_base)
             baseUrl=SplitURL(htmlmodify_yylval);

          /* Script events */

          else if(DisableHTMLScript &&
                  (key==att_onblur || key==att_onchange || key==att_onclick || key==att_ondblclick || key==att_onfocus ||
                   key==att_onkeydown || key==att_onkeypress || key==att_onload || key==att_onmousedown ||
                   key==att_onmousemove || key==att_onmouseout || key==att_onmouseover || key==att_onmouseup ||
                   key==att_onreset || key==att_onselect || key==att_onsubmit || key==att_onunload))
             disable_key_val=1;
         }

       if(!disable_key_val)
         {
          YY_OUTPUT(key_string);
          if(htmlmodify_yylval)
            {
             YY_OUTPUT("=");
             if(*quote)
                YY_OUTPUT(quote);
             YY_OUTPUT(htmlmodify_yylval);
             if(*quote)
                YY_OUTPUT(quote);
            }
         }

       key=att_natts;
       break;

      default:
      }

 if(cache_info)
   {YY_OUTPUT(cache_info); free(cache_info); cache_info=NULL;}

 if(key_string)
    free(key_string);
}


#ifndef htmlmodify_yywrap
/*+ Needed in lex but does nothing. +*/
#define htmlmodify_yywrap() 1
#endif

/* The string macros below work on the variables string, stringlen, stringused and
   newlen that are declared at the start of the lexer rules.  Each is wrapped in
   do{}while(0) so that it is a single statement wherever it is used. */

/*+ Reset the current string. +*/
#define reset_string \
 do \
   { \
    *string=0; \
    stringused=0; \
   } \
 while(0)

/*+ append information to the current string, growing it if needed;
    the argument is evaluated once and nothing is appended if the memory cannot be obtained. +*/
#define append_string(xx) \
 do \
   { \
    const char *append_src=(xx); \
    newlen=strlen(append_src); \
    if((stringused+newlen)>=stringlen) \
      { \
       char *append_new=(char*)realloc((void*)string,stringused+newlen+1); \
       if(!append_new) \
          break; \
       string=append_new; \
       stringlen=stringused+newlen+1; \
      } \
    strcpy(string+stringused,append_src); \
    stringused+=newlen; \
   } \
 while(0)

/*+ A macro to read data that can be used by the lexer; a read error is treated as the end of the input. +*/
#define YY_INPUT(buf,result,max_size) \
 do \
   { \
    if((result=read_data(htmlmodify_yyfd,buf,max_size))==-1) \
       result=0; \
   } \
 while(0)

%}

%%
 /* The buffer that quoted attribute values are collected in.  Code placed before the
    first rule is run on every entry to the scanner, so the buffer is static and is
    only allocated when there is not one already; allocating it unconditionally here
    would leak one buffer for every token returned.  It is released at the end of
    the input. */

 static char *string=NULL;
 static int stringlen=0,stringused=0;
 int newlen;

 if(!string)
   {
    string=malloc(128);
    if(!string)
       return(0);
    stringlen=128;
    reset_string;
   }

 /* Handle comments and other tags */

[^<]+                       { YY_OUTPUT(htmlmodify_yytext); /* htmlmodify_yylval=htmlmodify_yytext; return(LEX_PLAINTEXT); */ }
"<!DOCTYPE"                 { YY_OUTPUT(htmlmodify_yytext); BEGIN(DOCTYPE); reset_string; }
"<!--"                      { YY_OUTPUT(htmlmodify_yytext); BEGIN(COMMENT); reset_string; }
"<!"{W}*"-"*                { YY_OUTPUT(htmlmodify_yytext); BEGIN(COMMENT_BAD); reset_string; }
"<"{W}*                     { BEGIN(TAG_START); }

 /* Doctype (DTD) */

<DOCTYPE>">"                { YY_OUTPUT(htmlmodify_yytext); BEGIN(INITIAL); /* htmlmodify_yylval=string; return(LEX_DOCTYPE); */ }
<DOCTYPE>[^>]+              { YY_OUTPUT(htmlmodify_yytext); /* append_string(htmlmodify_yytext); */ }

 /* Comments - COMMENT_BAD is not a legal comment format (except <!>) but people use it as one.
               COMMENT is not strictly correct, but works better than the real thing. */

<COMMENT>"--"{W}*">"        { YY_OUTPUT(htmlmodify_yytext); BEGIN(INITIAL); /* htmlmodify_yylval=string; return(LEX_COMMENT); */ }
<COMMENT>">"                { YY_OUTPUT(htmlmodify_yytext); /* append_string(htmlmodify_yytext); */ }
<COMMENT>"-"                { YY_OUTPUT(htmlmodify_yytext); /* append_string(htmlmodify_yytext); */ }
<COMMENT>[^->]+             { YY_OUTPUT(htmlmodify_yytext); /* append_string(htmlmodify_yytext); */ }

<COMMENT_BAD>">"            { YY_OUTPUT(htmlmodify_yytext); BEGIN(INITIAL); /* htmlmodify_yylval=string; return(LEX_COMMENT); */ }
<COMMENT_BAD>[^>]+          { YY_OUTPUT(htmlmodify_yytext); /* append_string(htmlmodify_yytext); */ }

 /* Tags */

<TAG_START>">"              { BEGIN(INITIAL); YY_OUTPUT(htmlmodify_yytext); }
<TAG_START>{K}+             { BEGIN(TAG); htmlmodify_yylval=htmlmodify_yytext; return(LEX_TAG_BEGIN); }
<TAG_START>"/"{K}+          { BEGIN(TAG); htmlmodify_yylval=htmlmodify_yytext; return(LEX_TAG_BEGIN); }
<TAG_START>(.|\n)           { BEGIN(INITIAL); YY_OUTPUT(htmlmodify_yytext); }

<TAG>">"                    { BEGIN(INITIAL); htmlmodify_yylval=""; return(LEX_TAG_END); }
<TAG>"<"                    { BEGIN(INITIAL); unput(htmlmodify_yytext[0]); htmlmodify_yylval=""; return(LEX_TAG_END); }
<TAG>{K}+                   { BEGIN(TAG_ATTR_KEY); htmlmodify_yylval=htmlmodify_yytext; return(LEX_ATTR_KEY); }
<TAG>(.|\n)                 { YY_OUTPUT(htmlmodify_yytext); }

<TAG_ATTR_KEY>{W}+          { }
<TAG_ATTR_KEY>=             { BEGIN(TAG_ATTR_VAL); }
<TAG_ATTR_KEY>(.|\n)        { BEGIN(TAG); unput(htmlmodify_yytext[0]); unput(' '); htmlmodify_yylval=NULL; return(LEX_ATTR_VAL); }

 /* Attribute values - if there is no usable value after the '=' then an empty one is
    returned (in the writable buffer) so that the attribute name is still output. */

<TAG_ATTR_VAL>\"            { BEGIN(DQUOTED); reset_string; }
<TAG_ATTR_VAL>\'            { BEGIN(SQUOTED); reset_string; }
<TAG_ATTR_VAL>{W}+          { }
<TAG_ATTR_VAL>{F}+          { BEGIN(TAG); htmlmodify_yylval=htmlmodify_yytext; quote=""; return(LEX_ATTR_VAL); }
<TAG_ATTR_VAL>(.|\n)        { BEGIN(TAG); unput(htmlmodify_yytext[0]); reset_string; htmlmodify_yylval=string; quote=""; return(LEX_ATTR_VAL); }

 /* Quoted strings */

<DQUOTED>\\\"               { append_string(htmlmodify_yytext); }
<DQUOTED>\\                 { append_string(htmlmodify_yytext); }
<DQUOTED>\"                 { BEGIN(TAG); htmlmodify_yylval=string; quote="\""; return(LEX_ATTR_VAL); }
<DQUOTED>(\r|\n)+           { }
<DQUOTED>[^\\\"\r\n]+       { append_string(htmlmodify_yytext); }

<SQUOTED>\\\'               { append_string(htmlmodify_yytext); }
<SQUOTED>\\                 { append_string(htmlmodify_yytext); }
<SQUOTED>\'                 { BEGIN(TAG); htmlmodify_yylval=string; quote="'"; return(LEX_ATTR_VAL); }
<SQUOTED>(\r|\n)+           { }
<SQUOTED>[^\\\'\r\n]+       { append_string(htmlmodify_yytext); }

 /* End of file - release the buffer and forget it so that the next input gets a new one. */

<<EOF>>                     { free(string); string=NULL; stringlen=0; stringused=0; BEGIN(INITIAL); return(0); }
%%
