/***************************************************************************
*
* Copyright 2001,2013 by Sean Conner.
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 3 of the License, or (at your
* option) any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
* License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this library; if not, see <http://www.gnu.org/licenses/>.
*
* Comments, questions and criticisms can be sent to: sean@conman.org
*
*************************************************************************/
#ifdef __GNUC__
# define _GNU_SOURCE
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>
#include "../nodelist.h"
#include "../htmltok.h"
#include "../util.h"
/************************************************************************/
/*-----------------------------------------------------------------
; Handler used once the tokenizer is in its end-of-input state.
; Records T_EOF as the current token kind and returns T_EOF; no
; input is read.
;-----------------------------------------------------------------*/
static inline HToken ht_nexteof(HtmlToken token)
{
  HToken const eof = T_EOF;

  assert(token != NULL);

  token->token = eof;
  return eof;
}
/**********************************************************************/
/*-----------------------------------------------------------------
; Handler for the comment state.  Nothing in this file ever stores
; that state (comments are consumed inside ht_nexttag()), so if we
; do get here the tokenizer is simply shut down: the state is
; forced to S_EOF and T_EOF is returned.
;
; token->token is set as well, so that the token kind stored in
; the structure agrees with the value returned, as it does in
; ht_nexteof() and on the error exit of ht_nexttag().
;-----------------------------------------------------------------*/
static inline HToken ht_nextcom(HtmlToken token)
{
  assert(token != NULL);

  token->token = T_EOF;
  token->state = S_EOF;
  return T_EOF;
}
/*********************************************************************/
/*-----------------------------------------------------------------
; Build a name/value pair with PairCreate() and append it to the
; tail of the token's pair list.
;
; token - tokenizer whose pair list receives the new entry
; name  - option name (must not be NULL)
; value - option value (must not be NULL; "" for a bare option)
;
; NOTE(review): callers in this file free name and value right
; after this call, so PairCreate() presumably copies both strings
; - confirm against its definition.  Its result is used without a
; NULL check.
;-----------------------------------------------------------------*/
static inline void ht_makepair(HtmlToken token,char const *restrict name,char const *restrict value)
{
  assert(token != NULL);
  assert(name  != NULL);
  assert(value != NULL);

  struct pair *const entry = PairCreate(name,value);

  ListAddTail(&token->pairs,&entry->node);
}
/******************************************************************/
/*-----------------------------------------------------------------
; Append one character to the token's accumulation buffer, growing
; the buffer by 64 bytes whenever it is full.
;
; token - tokenizer owning the buffer (data/idx/max)
; c     - character to append; must not be EOF
;
; If the buffer cannot be grown the character is dropped and the
; existing buffer, idx and max are left untouched.  The result of
; realloc() is checked before it replaces token->data, so a failed
; allocation neither leaks the old buffer nor leads to a write
; through a NULL pointer.
;-----------------------------------------------------------------*/
static void ht_acc(HtmlToken token,int c)
{
  assert(token != NULL);
  assert(c     != EOF);

  if (token->idx == token->max)
  {
    void *grown = realloc(token->data,token->max + 64);

    if (grown == NULL)
      return;	/* out of memory: keep what we have, drop this character */

    token->data  = grown;
    token->max  += 64;
  }

  token->data[token->idx++] = c;
}
/*********************************************************************/
/*-----------------------------------------------------------------
; Return a freshly malloc()ed, NUL terminated copy of the
; characters accumulated so far and reset the accumulator to empty.
;
; Returns NULL if the allocation fails; in that case the
; accumulator is left as it was.  The caller owns (and must free)
; the returned string.
;
; The copy is skipped when nothing has been accumulated: in that
; case token->data may never have been allocated, and handing a
; NULL source pointer to memcpy() is undefined even for a length
; of zero.
;-----------------------------------------------------------------*/
static char *ht_accdup(HtmlToken token)
{
  char *text;

  assert(token != NULL);

  text = malloc(token->idx + 1);
  if (text != NULL)
  {
    if (token->idx > 0)
    {
      memcpy(text,token->data,token->idx);
    }
    text[token->idx] = '\0';
    token->idx       = 0;
  }
  return text;
}
/**********************************************************************/
/*-----------------------------------------------------------------
; Collect plain text from the input up to (but not including) the
; next '<' or the end of the input, whichever comes first.
;
; The collected text becomes token->value and the token kind is
; T_STRING.  The next state is S_TAG when a '<' stopped the scan
; and S_EOF otherwise.  Always returns T_STRING, even when no
; characters were collected.
;-----------------------------------------------------------------*/
static HToken ht_nextstr(HtmlToken token)
{
  int next = S_EOF;	/* assume we run out of input */
  int ch;

  assert(token        != NULL);
  assert(token->input != NULL);

  while(!feof(token->input) && ((ch = fgetc(token->input)) != EOF))
  {
    if (ch == '<')
    {
      next = S_TAG;
      break;
    }
    ht_acc(token,ch);
  }

  token->value = ht_accdup(token);
  token->token = T_STRING;
  token->state = next;
  return T_STRING;
}
/********************************************************************/
/*-----------------------------------------------------------------------
; Parse one tag, or one "<!...>" comment, from token->input.  The opening
; '<' has already been consumed when this is called.
;
; On a tag:     token->value is the tag name passed through up_string(),
;               each option is appended to token->pairs (name passed
;               through up_string(), value "" when none was given), the
;               next state is S_STRING and T_TAG is returned.
; On a comment: token->value is the text between "<!" and the matching
;               '>', the next state is S_STRING and T_COMMENT is returned.
; On error:     (end of input inside the tag, or a misplaced '=' or '>')
;               the token and state are set to EOF and T_EOF is returned.
;
; The function is written as an explicit goto-driven state machine; each
; label is one state.  The option name being assembled is held in 't'.
; 't' is NULL whenever this function does not own a name, so that the
; error exit can release a name that was read but never turned into a
; pair (previously that string was leaked).
;
; NOTE(review): the results of ht_accdup() are passed on without a NULL
; check - confirm that up_string() and ht_makepair() tolerate that.
;-----------------------------------------------------------------------*/
static HToken ht_nexttag(HtmlToken token)
{
  int   c;
  char *t = NULL;	/* option name awaiting its value, or NULL */
  char *tt;		/* option value, short lived */
  int   level;		/* nesting depth of '<' inside a comment */

  assert(token != NULL);

  /*-----------------------------------------------------------
  ; Parsing HTML has proven to be hard to get right, especially
  ; testing for unexpected end of files and what not, so this
  ; drops down a level, so to speak, to a hand written state
  ; machine.  It is not pretty, but nothing else has worked
  ; reliably.
  ;------------------------------------------------------------*/

  /*-------------------------------------------------
  ; STAGE ALPHA - we just read a '<', so skip past
  ; any white space till we hit the tag
  ;--------------------------------------------------*/

htnt_alpha:
  c = fgetc(token->input);
  if (c == EOF)    goto htnt_error;
  if (isspace(c))  goto htnt_alpha;
  if (c == '>')    goto htnt_alpha10;	/* done - return tag */
  if (c == '!')    goto htnt_comment;
  goto htnt_gottag;

htnt_alpha10:
  token->value = up_string(ht_accdup(token));
  goto htnt_done;

  /*-----------------------------------------------
  ; STAGE BETA - read the tag and save
  ;----------------------------------------------*/

htnt_gottag:
  ht_acc(token,c);
  c = fgetc(token->input);
  if (c == EOF)    goto htnt_error;
  if (c == '=')    goto htnt_error;
  if (c == '>')    goto htnt_alpha10;	/* done - return tag */
  if (!isspace(c)) goto htnt_gottag;
  token->value = up_string(ht_accdup(token));

  /*-------------------------------------------------
  ; STAGE GAMMA - skip space to options
  ;------------------------------------------------*/

htnt_bopts:
  c = fgetc(token->input);
  if (c == EOF)    goto htnt_error;
  if (isspace(c))  goto htnt_bopts;
  if (c == '>')    goto htnt_done;
  if (c == '=')    goto htnt_error;

  /*-----------------------------------------------
  ; STAGE DELTA - read in option
  ;-----------------------------------------------*/

htnt_gotopt:
  ht_acc(token,c);
  c = fgetc(token->input);
  if (c == EOF)    goto htnt_error;
  if (isspace(c))  goto htnt_boptval;
  if (c == '>')    goto htnt_gotoptd;
  if (c == '=')    goto htnt_voptval;
  goto htnt_gotopt;

htnt_gotoptd:
  t = up_string(ht_accdup(token));
  ht_makepair(token,t,"");
  free(t);
  t = NULL;
  goto htnt_done;

  /*------------------------------------------------
  ; STAGE EPSILON - between an option and possibly its
  ; value - or another option
  ;-------------------------------------------------*/

htnt_boptval:
  c = fgetc(token->input);
  if (c == EOF)    goto htnt_error;
  if (isspace(c))  goto htnt_boptval;
  if (c == '>')    goto htnt_gotoptd;
  if (c == '=')    goto htnt_voptval;

  /* a new option starts here, so the previous one had no value */
  t = up_string(ht_accdup(token));
  ht_makepair(token,t,"");
  free(t);
  t = NULL;
  goto htnt_gotopt;

  /*-------------------------------------------------
  ; STAGE ZETA4[1] - skip space between '=' and value.
  ; Also determine if the value is quoted or not.
  ;
  ; [1] - local rock radio station in Lower Sheol
  ;--------------------------------------------------*/

htnt_voptval:
  t = up_string(ht_accdup(token));	/* owned until paired or error */

htnt_voptval10:
  c = fgetc(token->input);
  if (c == EOF)    goto htnt_error;
  if (isspace(c))  goto htnt_voptval10;
  if (c == '>')    goto htnt_error;
  if (c == '\'')   goto htnt_valsq;
  if (c == '"')    goto htnt_valdq;

  /*-------------------------------------------------
  ; STAGE ETA - read in non-quoted value
  ;-------------------------------------------------*/

htnt_nqval:
  ht_acc(token,c);
  c = fgetc(token->input);
  if (c == EOF)    goto htnt_error;
  if (c == '>')    goto htnt_nqvald;
  if (!isspace(c)) goto htnt_nqval;
  tt = ht_accdup(token);
  ht_makepair(token,t,tt);
  free(tt);
  free(t);
  t = NULL;
  goto htnt_bopts;

htnt_nqvald:
  tt = ht_accdup(token);
  ht_makepair(token,t,tt);
  free(tt);
  free(t);
  t = NULL;
  goto htnt_done;

  /*------------------------------------------------
  ; STAGE THETA - read in single quoted value
  ; (control characters are stored as spaces)
  ;------------------------------------------------*/

htnt_valsq10:
  ht_acc(token,c);

htnt_valsq:
  c = fgetc(token->input);
  if (c == EOF)    goto htnt_error;
  if (iscntrl(c))  c = ' ';
  if (c != '\'')   goto htnt_valsq10;
  tt = ht_accdup(token);
  ht_makepair(token,t,tt);
  free(tt);
  free(t);
  t = NULL;
  goto htnt_bopts;

  /*------------------------------------------------
  ; STAGE IOTA - read in a double quoted value
  ; (control characters are stored as spaces)
  ;------------------------------------------------*/

htnt_valdq10:
  ht_acc(token,c);

htnt_valdq:
  c = fgetc(token->input);
  if (c == EOF)    goto htnt_error;
  if (iscntrl(c))  c = ' ';
  if (c != '"')    goto htnt_valdq10;
  tt = ht_accdup(token);
  ht_makepair(token,t,tt);
  free(tt);
  free(t);
  t = NULL;
  goto htnt_bopts;

  /*-------------------------------------------------
  ; STAGE KAPPA - read in a comment (kludgy for now).
  ; Every '<' raises the nesting level and every '>'
  ; lowers it; the '>' seen at level 1 ends the comment
  ; and is not stored.
  ;------------------------------------------------*/

htnt_comment:
  level = 1;

htnt_comm10:
  c = fgetc(token->input);
  if (c == EOF)    goto htnt_error;
  if (c != '<')    goto htnt_comm20;
  level++;
  goto htnt_comm30;

htnt_comm20:
  if (c != '>')    goto htnt_comm30;
  if (level == 1)  goto htnt_commdone;
  level--;

htnt_comm30:
  ht_acc(token,c);
  goto htnt_comm10;

htnt_commdone:
  token->value = ht_accdup(token);
  token->token = T_COMMENT;
  token->state = S_STRING;
  return T_COMMENT;

  /*--------------------------------------------------
  ; STAGE OMEGA - done - return
  ;--------------------------------------------------*/

htnt_done:
  token->token = T_TAG;
  token->state = S_STRING;
  return T_TAG;

  /*-----------------------------------------------
  ; Abandon All Hope Ye Who Enter Here
  ;
  ; Release an option name that was read but never
  ; paired with a value (t is NULL otherwise, and
  ; free(NULL) does nothing).
  ;-----------------------------------------------*/

htnt_error:
  free(t);
  token->token = T_EOF;
  token->state = S_EOF;
  return T_EOF;
}
/**********************************************************************/
/*-----------------------------------------------------------------
; Advance the tokenizer by one token.
;
; The value and option pairs of the previous token are released
; and the accumulator is emptied; then the handler for the current
; state reads the next token.  Returns the kind of token read (the
; handler's result).
;
; NOTE(review): the switch tests token->state against the T_*
; names although the rest of this file stores S_* values in it;
; presumably the two sets of constants have matching values -
; confirm against htmltok.h.
;-----------------------------------------------------------------*/
int HtmlParseNext(HtmlToken token)
{
  assert(token != NULL);

  /* discard whatever the previous call produced */
  free(token->value);		/* free(NULL) is a no-op */
  token->value = NULL;
  PairListFree(&token->pairs);
  token->idx = 0;

  switch(token->state)
  {
    case T_EOF:
         return ht_nexteof(token);

    case T_STRING:
         return ht_nextstr(token);

    case T_TAG:
         return ht_nexttag(token);

    case T_COMMENT:
         return ht_nextcom(token);

    default:
         assert(0);	/* this shouldn't happen */
         break;
  }

  assert(0);		/* and this shouldn't happen either */
  return T_EOF;		/* shut up -Wall -pedantic -ansi options to gcc */
}
/********************************************************************/
.