/*
 *	Fixdoc v0.02.
 *
 *	Warn about and fix the common stuff that Nautilus needs for handling
 *	docbook. Of course (g)Nautilus ought to do some of this ;)
 *
 *	We do the following:
 *	
 *	Force ENTITY to be upper case
 *	Turn CDATA into XML form
 *	Check the id="foo" tags are present and invent them if needed
 *	Check the id tag is an acceptable value and fix it if not
 *
 *	BUGS
 *	The parsing of [] sections within DOCTYPE is a hack
 *	This is not a real SGML/XML parser
 *	On no account show this code to Daniel Veillard
 *	Tags can only be 512 bytes long between < and >. I contend it's
 *		_your_ problem if they are.
 *
 *	(c) Copyright 2000 Alan Cox,  All Rights Reserved.
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	gcc -O2 -Wall -pedantic fixdoc.c -o fixdoc
 */

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

/* File-wide parser state shared by main() and the helpers below. */
static int in_tag;	/* non-zero while main() is collecting the text of a tag */
static char *tagptr;	/* set to NULL in main(); not read by any code visible in this file */
static int line;	/* input line counter, bumped for every '\n' read; used in diagnostics */
static int look_away;	/* set when a '<' turns up inside an open <!DOCTYPE; lets main() accept one stray '>' quietly */
static int suckfactor;	/* running count of problems found, reported by main() before exit */

/*
 *	Pull the next name=value pair out of the attribute text of a tag.
 *
 *	p	NUL terminated attribute text; it is cut up in place
 *	en	set to the start of the attribute name
 *	ev	set to the start of the value, or to "" for a bare name
 *	quoted	set to 1 if the value was wrapped in double quotes, else 0
 *
 *	Returns the place to resume scanning from, or NULL when the end of
 *	the tag ('>' or NUL) is reached or a quoted value is never closed.
 *	A name that runs straight into the end of the tag also ends the scan.
 *	*en and *ev are only both valid after a non-NULL return.
 */
static char *find_vpair(char *p, char **en, char **ev, int *quoted)
{
	char *ep;

	*quoted = 0;

	/*
	 * Find a foo=bar pair. The <ctype.h> calls are given unsigned char
	 * values because plain char may be negative for 8 bit text.
	 */
	while(*p && isspace((unsigned char)*p))
		p++;

	if(*p=='>' || *p==0)
		return NULL;

	*en=p;

	while(*p && !isspace((unsigned char)*p) && *p!='=' && *p!='>')
		p++;

	if(*p=='>' || *p==0)
		return NULL;

	ep = p;		/* Character to zap for name end */

	while(*p && isspace((unsigned char)*p))
		p++;

	if(*p!='=')
	{
		/*
		 * Bare name with no value. ep must be the white space after
		 * the name here (an '=' at ep would have been seen just above)
		 * so zapping it cannot damage the token p now points at.
		 * p is left alone so the next call starts on that token and
		 * we never step over the terminating NUL.
		 */
		*ep=0;
		*ev="";
		return p;
	}
	p++;		/* step over the '=' */

	*ep=0; 	/* break the name string off */

	/* Ok we have the tag and the space marker */

	while(*p && isspace((unsigned char)*p))
		p++;

	if(*p=='"')
	{
		*quoted=1;
		p++;
	}

	*ev = p;

	if(*quoted)
	{
		/* Find the closing quote */
		p=strchr(p,'"');
		if(p==NULL)
		{
			fprintf(stderr, "Missing close quote on line %d.\n",line);
			return NULL;
		}
		*p++=0;
		return p;
	}

	/* Hunt the spaces; an unquoted value ends at white space or '>' */
	while(*p && !isspace((unsigned char)*p) && *p!='>')
		p++;
	if(*p)
		*p++=0;
	return p;
}
	
/*
 *	Write out the opening tag "tag" with its attributes taken from
 *	"data", making sure it carries an id attribute.
 *
 *	tag	tag name, printed after the '<'
 *	data	attribute text, walked (and cut up) by find_vpair()
 *	id	the only acceptable id value, or NULL if any value will do
 *
 *	An id with the wrong value is replaced, a missing id is added (as
 *	"unnamedN" when no particular value is required). Each repair is
 *	reported on stderr and counted in suckfactor.
 */
static void require_id(char *tag, char *data, char *id)
{
	static int unnamed_seq = 1;	/* number used for the next invented id */
	char *name, *value;
	int in_quotes;
	int have_id = 0;

	printf("<%s", tag);

	for(data = find_vpair(data, &name, &value, &in_quotes);
	    data != NULL;
	    data = find_vpair(data, &name, &value, &in_quotes))
	{
		int is_id = (strcasecmp(name, "id") == 0);

		if(is_id)
			have_id = 1;

		/* An id is present but is not the one this tag has to have */
		if(is_id && id != NULL && strcmp(id, value) != 0)
		{
			fprintf(stderr, "Tag %s on line %d has an id of '%s' but only '%s' is acceptable. I've changed it.\n",
				tag, line, value, id);
			printf(" id=\"%s\"", id);
			suckfactor++;
			continue;
		}

		/* Everything else goes out the way it came in */
		if(in_quotes)
			printf(" %s=\"%s\"", name, value);
		else
			printf(" %s=%s", name, value);
	}

	if(have_id)
	{
		putchar('>');
		return;
	}

	/* No id attribute at all: supply one */
	if(id != NULL)
	{
		fprintf(stderr, "Tag %s on line %d had no id but requires the id of '%s'. I've fixed it for you.\n",
			tag, line, id);
		printf(" id=\"%s\"", id);
	}
	else
	{
		fprintf(stderr, "Tag %s on line %d has no id and is sad. I've called it 'unnamed%d'\n",
			tag, line, unnamed_seq);
		printf(" id=\"unnamed%d\"", unnamed_seq++);
	}
	suckfactor++;
	putchar('>');
}

/*
 *	Decide whether the NUL terminated text at pt is the start of a
 *	CDATA section: "<![", optional white space, "CDATA", optional white
 *	space, then '['.
 *
 *	Returns 1 if it is, 0 if not. When it is CDATA but is not spelled
 *	exactly "<![CDATA[" (the form the caller writes out instead) the
 *	difference is counted in suckfactor.
 */
static int looks_like_cdata(char *pt)
{
	char *p=pt;

	if(strncmp(pt, "<![", 3))
		return 0;
	/* ok so its an xml/sgml tag not a dtd tag */

	pt=pt+3;
	while(*pt && isspace((unsigned char)*pt))
		pt++;
	if(strncmp(pt, "CDATA", 5))
		return 0;
	/* Ok this is cdata - but have we got the [ */

	pt+=5;
	while(*pt && isspace((unsigned char)*pt))
		pt++;
	if(*pt!='[')
		return 0;
	/*
	 * Ok we have an SGML cdata. "<![CDATA[" is nine characters; all
	 * nine are compared so that white space before the final '[' is
	 * counted as a change too.
	 */
	if(strncmp(p, "<![CDATA[", 9))
		suckfactor++;
	return 1;
}

/*
 *	Force the NUL terminated string p to lower case in place.
 *	suckfactor goes up by one if anything had to be changed, however
 *	many characters that was.
 */
static void lowercase(char *p)
{
	int n=1;	/* 1 until the first change has been counted */
	while(*p)
	{
		/* <ctype.h> needs an unsigned char value; plain char may be negative */
		unsigned char c=(unsigned char)*p;

		if(isupper(c))
		{
			*p=(char)tolower(c);
			suckfactor+=n;
			n=0;
		}
		p++;
	}
}

/*
 *	Tidy up one complete tag and write it to stdout.
 *
 *	p	the tag text, modified in place. main() hands over a buffer
 *		that starts with '<' and ends with '>'.
 *
 *	<!ENTITY is forced to upper case. For everything else the tag name
 *	is split off, lower cased (unless it starts with '!') and, for the
 *	DocBook elements that need one, passed to require_id(). Other tags
 *	are written back out unchanged apart from the name's case.
 */
static void mungify(char *p)
{
	char *n;
	char sep;

	/* Entity capitals */
	if(strncasecmp(p, "<!ENTITY", 8)==0)
	{
		/* Case sensitive this time: only count it if the case was wrong */
		if(strncmp(p, "<!ENTITY", 8))
			suckfactor++;
		memcpy(p, "<!ENTITY", 8);
		printf("%s", p);
		return;
	}

	/*
	 * Other tags lower case. The name ends at '>' or at any white
	 * space, so a newline straight after the name splits it too.
	 */
	n=p+1;
	while(*n!=0 && *n!='>' && !isspace((unsigned char)*n))
		n++;

	/* Ok p+1 to n-1 is the word we want to look at */
	sep=*n;		/* remembered so it can be written back out */
	if(sep=='>')
		*n=0;
	else if(sep)
		*n++=0;

	/* Dont dork with system stuff just duckbook */
	if(p[1]!='!')
		lowercase(p+1);

	if(strcmp(p+1, "book")==0)
		require_id(p+1, n, "index");

	else if(strcmp(p+1, "article")==0)
		require_id(p+1, n, "index");

	else if(strcmp(p+1, "chapter")==0)
		require_id(p+1, n, NULL);

	else if(strcmp(p+1, "appendix")==0)
		require_id(p+1, n, NULL);

	else if(strcmp(p+1, "preface")==0)
		require_id(p+1, n, NULL);

	else if(strcmp(p+1, "legalnotice")==0)
		require_id(p+1, n, "legalnotice");

	/* sect0 to sect5; p[5] is at worst the NUL after "sect" */
	else if(strncmp(p+1,"sect",4)==0 && p[5]>='0' && p[5]<'6')
		require_id(p+1, n, NULL);

	/* Anything else: name, the separator we cut out, then the rest */
	else if(*n)
		printf("%s%c%s", p, sep, n);
	else
		printf("%s>", p);
}

/*
 *	Copy stdin to stdout untouched up to and including the "-->" that
 *	ends a comment. Called once the opening "<!--" has been read.
 *	Keeps the line counter up to date, and complains (and bumps
 *	suckfactor) if the input ends before the comment does.
 */
static void munch_comments(void)
{
	int c;
	int state=0;	/* how many '-' in a row we have just seen, capped at 2 */
	int l2 = line;

	while((c=getchar())!=EOF)
	{
		if(c=='\n')
			line++;
		putchar(c);
		if(c=='-')
		{
			/*
			 * A third or later '-' still leaves "--" as the last
			 * two characters, so stay at 2 rather than start again.
			 * That way "--->" ends the comment as well.
			 */
			if(state<2)
				state++;
			continue;
		}
		if(c=='>' && state==2)
			return;
		state=0;
	}
	fprintf(stderr, "Unterminated comment at end of file (start is probably line %d).\n", l2);
	suckfactor++;
}

/*
 *	Copy stdin to stdout untouched up to and including the "]]>" that
 *	ends a CDATA section. Called once the opening of the section has
 *	been read. Keeps the line counter up to date, and complains (and
 *	bumps suckfactor) if the input ends before the section does.
 */
static void munch_cdata(void)
{
	int c;
	int state=0;	/* how many ']' in a row we have just seen, capped at 2 */
	int l2=line;

	while((c=getchar())!=EOF)
	{
		if(c=='\n')
			line++;
		putchar(c);
		if(c==']')
		{
			/*
			 * Data may itself end in ']', giving "]]]>". The last
			 * two characters are still "]]" so stay at 2 instead of
			 * starting over and missing the end of the section.
			 */
			if(state<2)
				state++;
			continue;
		}
		if(c=='>' && state==2)
			return;
		state=0;
	}
	fprintf(stderr, "Unterminated CDATA at end of file (start is probably line %d).\n", l2);
	suckfactor++;
}

/*
 *	Filter stdin to stdout. Text outside tags is copied through with
 *	'%' turned into "&percnt;". Text from '<' to '>' is collected in
 *	tagbuf and handed to mungify(); comments and CDATA sections are
 *	spotted as they are collected and copied through by munch_comments()
 *	and munch_cdata(). Problems are reported on stderr and counted in
 *	suckfactor.
 *
 *	Takes no arguments; any argument gets a usage message and exit
 *	status 1. Otherwise exits 0, or 1 if a tag does not fit in tagbuf.
 */
int main(int argc, char *argv[])
{
	char tagbuf[512];
	int c;
	int tp = 0;
	const char *prog = (argc>0 && argv[0]!=NULL) ? argv[0] : "fixdoc";

	if(argc!=1)
	{
		fprintf(stderr, "%s: this command is a filter.\n", prog);
		if(argc>1)
		{
			/*
			 * Anything with 'h' as its second character, or --help,
			 * is taken as a plea for help. argv[1][0] is tested
			 * first so an empty argument is never read past its NUL.
			 */
			int wants_help = (argv[1][0]!=0 && argv[1][1]=='h')
				|| strcmp(argv[1], "--help")==0;

			if(!wants_help)
				fprintf(stderr, "I've no idea what '%s' is all about.\n", argv[1]);
		}
		fprintf(stderr, "Try %s <mydocument >nicedocument.\n", prog);
		exit(1);
	}

	in_tag = 0;
	tagptr = NULL;
	line = 1;	/* the first line of input is line 1, not line 0 */

	while((c=getchar())!=EOF)
	{
		if(c=='\n')
			line++;
		if(c=='<')
		{
			tagbuf[tp]=0;
			if(in_tag)
			{
				/*
				 * A '<' inside <!DOCTYPE is the start of the [ ]
				 * section. Write out what we have and remember
				 * that an unmatched '>' is still to come.
				 */
				if(strncasecmp(tagbuf, "<!DOCTYPE ", 10)==0)
				{
					printf("%s", tagbuf);
					look_away=1;
				}
				else
				{
					fprintf(stderr, "%s: tag in tag on line %d. Bad bad bad.\n", prog, line);
					suckfactor++;
				}
			}
			in_tag=1;
			tp=0;
		}
		if(c=='>')
		{
			if(!in_tag)
			{
				if(!look_away)
				{
					fprintf(stderr, "%s: tag end without a tag on line %d. Naughty naughty.\n", prog, line);
					suckfactor++;
				}
				else
					look_away--;
			}
			else
			{
				tagbuf[tp++]='>';
				tagbuf[tp]=0;
				in_tag=0;
				mungify(tagbuf);
				continue;
			}
		}
		if(!in_tag)
		{
			/* % to entity */
			if(c=='%')
			{
				suckfactor++;
				printf("&percnt;");
			}
			else
				putchar(c);
		}
		else
		{
			tagbuf[tp++]=(char)c;
			/*
			 * Keep the buffer terminated. looks_like_cdata() reads
			 * it as a string and must not see what an earlier tag
			 * left behind. tp is at most 511 here so this fits.
			 */
			tagbuf[tp]=0;
			if(tp==4 && memcmp(tagbuf, "<!--", 4)==0)
			{
				printf("<!--");
				munch_comments();
				in_tag=0;
				tp=0;
				continue;
			}
			if(looks_like_cdata(tagbuf))
			{
				/* tagbuf is SGML , output XML form */
				printf("<![CDATA[");
				munch_cdata();
				in_tag=0;
				tp=0;
			}
			else if(tp==(int)sizeof(tagbuf)-1)
			{
				fprintf(stderr, "%s: tag too long for my poor little mind. Boom!.\n", prog);
				exit(1);
			}
		}
	}
	if(in_tag)
	{
		/* Input ran out inside a tag: say so and don't lose the text */
		tagbuf[tp]=0;
		fprintf(stderr, "%s: unterminated tag at end of file (line %d).\n", prog, line);
		printf("%s", tagbuf);
		suckfactor++;
	}
	if(suckfactor!=0)
		fprintf(stderr, "Document required %d change%s.\n", suckfactor,
			suckfactor==1?"":"s");
	exit(0);
}
				
