#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "options.h"

/*
 * NAME
 *	fmt - simple text formatter
 *
 * SYNOPSIS
 *	usage: fmt [ -cs ] [ -width ] [ -p prefix ] [ file ... ]
 *
 * DESCRIPTION
 *	Fmt is a simple text formatter, similar to the BSD program fmt(1),
 *	but uses best-fit line breaking, by a simple version of
 *
 *		"Breaking Paragraphs into Lines",
 *		Donald E. Knuth and Michael F. Plass,
 *		"Software--Practice and Experience" 11 (1981) 1119-1184.
 *
 *	Tabs are expanded on input and re-introduced on output.
 *
 * OPTIONS
 *	-c	Crown margin mode.  The indentation of the first line
 *		of a paragraph must be different from the indentation
 *		of the second.  Subsequent lines must have the same
 *		indentation as the second line.
 *
 *	-s	Split lines only.
 *
 *	-width	Maximum line width (default 75).  Fmt prefers to make
 *		lines about 7% shorter, to give it room to balance line
 *		lengths.
 *
 *	-p prefix
 *		Only lines whose non-white text begins with the prefix are
 *		re-arranged; the prefix is stripped for the formatting and
 *		re-attached to each formatted output line.  For example,
 *
 *			fmt -p ' * '
 *
 *		formats C comments laid out in the normal way, leaving
 *		the code unchanged.  The -p option may also be combined
 *		with the other options.
 *
 * BUGS
 *	Very long paragraphs break fmt's limits.  Please send bug reports
 *	and suggestions to the author.
 *
 * AUTHOR
 *	Ross Paterson <rap@doc.ic.ac.uk>
 */

/*
 * The following parameters represent the program's idea of what is "best".
 * Adjust to taste, subject to the caveats given.
 *
 * WIDTH is the initial value of max_width.  LEEWAY fixes the preferred
 * width: main() computes best_width as max_width*(LEEWAY-1)/LEEWAY.
 */

#define	WIDTH	75	/* longest permitted line length */
#define	LEEWAY	15	/* 1/LEEWAY of line is best left unused */

/*
 * Costs and bonuses are expressed as the equivalent departure from
 * the optimal line length, multiplied by 10.
 * e.g. assigning something a cost of 50 means that it is as bad as
 * a line 5 characters too short or long.
 * The definition of SHORT_COST(n) should not be changed.
 * However, EQUIV(n) may need tuning.
 */

/* type in which all layout costs are computed */
typedef	long	COST;
/*
 * Largest representable COST; fmt_paragraph() starts each search for
 * the cheapest break from this value.  It is derived from the unsigned
 * type so that no bit is shifted into the sign bit of a signed long and
 * no assumption is made about the number of bits in a byte.
 */
#define	MAXCOST	((COST)(~(unsigned long)0 >> 1))

/* square of n; note that the argument is evaluated twice */
#define	SQR(n)		((n)*(n))
/*
 * cost, computed in type COST, of a departure of n tenths of a
 * character from the optimal length: the square of n
 */
#define	EQUIV(n)	SQR((COST)(n))

/* cost of a filled line longer or shorter than best_width */
#define	SHORT_COST(n)	EQUIV((n)*10)
/* cost of the difference between adjacent filled lines */
#define	RAGGED_COST(n)	(SHORT_COST(n)/2)

/* basic cost per line */
#define	LINE_COST	EQUIV(70)
/* line break after the first word of a sentence */
#define	WIDOW_COST	EQUIV(200)
/* line break before the last word of a sentence */
#define	ORPHAN_COST	EQUIV(150)
/* line break at the end of a sentence */
#define	SENTENCE_BONUS	EQUIV(50)
/* line break after close parenthesis */
#define	PAREN_BONUS	EQUIV(40)
/* line break after other punctuation */
#define	PUNCT_BONUS	EQUIV(40)

/*
 * program limits: max. words and characters in a paragraph.
 * These are not checked.
 * MAXWORDS is the size of the word[] array and MAXCHARS the size of the
 * paragraph text buffer in get_paragraph(), so a paragraph exceeding
 * either limit overruns that storage.
 */

#define	MAXWORDS	2000
#define	MAXCHARS	10000

/*
 * Miscellaneous definitions
 */

/* shorthand for the register storage class */
#define	reg	register

/* NOTE(review): natural is not used in the code visible here */
typedef	unsigned natural;
/* truth values are plain ints holding TRUE or FALSE */
typedef	int	bool;
#define	TRUE	1
#define	FALSE	0

/* small versions for use in records */
typedef	char	sbool, sint;

/*
 * Switch sugar: "when A or B:" expands to "break; case A : case B:",
 * and "otherwise:" expands to "break; default:".
 */
#define	when	break; case
#define	or	: case
#define	otherwise	break; default

/*
 * Loop sugar: "repeat { ... }" is an endless loop, left with
 * "until(cond);", which expands to "if (cond) break;".
 */
#define	repeat	for (;;)
#define	until(c)	if (c) break

/*
 * Forward declarations of the functions defined below.  They are
 * old-style (non-prototype) declarations: only the return type is
 * given, so argument types are not checked at the call sites.
 */
extern	void	trim_prefix();
extern	void	fmt();
extern	bool	get_paragraph();
extern	int	get_line(), get_prefix(), get_space();
extern	int	copy_rest();
extern	bool	same_para();
extern	void	fmt_paragraph();
extern	void	check_punctuation();
extern	COST	base_cost(), line_cost();
extern	void	put_paragraph(), put_line(), put_prefix(), put_space();

FILE	*infile;	/* stream currently being formatted; set in main() */
int	in_column = 0;	/* input column, kept up to date by the get_* routines */
int	out_column = 0;	/* output column, kept up to date by the put_* routines */

bool	crown = FALSE;		/* -c: crown margin mode */
bool	split = FALSE;		/* each line is a paragraph on its own */
char	*prefix = "";		/* -p text; trim_prefix() strips surrounding spaces */
int	prefix_length;		/* length of prefix after trimming */
int	prefix_lead_space;	/* number of leading spaces trimmed off */
int	prefix_full_length;	/* length without leading, with trailing spaces */
int	max_width = WIDTH;	/* longest permitted output line */
int	best_width;		/* preferred line length, derived in main() */

/*
 * Parse the command line, derive the formatting parameters and format
 * either standard input or each named file in turn.
 *
 * After option parsing, argc == 1 is taken to mean "no file operands"
 * (presumably the OPTIONS macros from options.h remove the options they
 * consume -- confirm there).
 *
 * A file that cannot be opened is reported on stderr and skipped; the
 * remaining files are still processed, and the exit status is then 1
 * instead of 0 so that callers can detect the failure.
 */
int
main(argc, argv)
reg	int	argc;
reg	char	*argv[];
{
reg	int	i;
	int	exit_status = 0;	/* 1 once any input could not be opened */

	OPTIONS	("-cs -width -p prefix [ file ... ]")
		FLAG	('c', crown)
		FLAG	('s', split)
		NUM_OPT	(max_width)
		STRING	('p', prefix)
	ENDOPTS
	best_width = max_width*(LEEWAY-1)/LEEWAY;
	trim_prefix();

	if (argc == 1) {
		infile = stdin;
		fmt();
	}
	else
		for (i = 1; i < argc; i++) {
			infile = fopen(argv[i], "r");
			if (infile == NULL) {
				fprintf(stderr, "%s: can't read file '%s'\n",
					argv[0], argv[i]);
				exit_status = 1;
			}
			else {
				fmt();
				fclose(infile);
			}
		}
	exit(exit_status);
}

/*
 * Split the prefix into its parts: count and skip the leading spaces,
 * record the length with the trailing spaces still attached, then cut
 * the trailing spaces off so that prefix is the bare text.
 *
 * Sets prefix_lead_space, prefix_full_length and prefix_length, and
 * advances prefix past its leading spaces.
 */
void
trim_prefix()
{
reg	char	*s;

	prefix_lead_space = 0;
	while (*prefix == ' ') {
		prefix_lead_space++;
		prefix++;
	}
	prefix_full_length = (int)strlen(prefix);
	s = prefix + prefix_full_length;
	while (s > prefix && s[-1] == ' ')
		s--;
	/*
	 * Only store when there really is something to cut off: the
	 * default prefix is a string literal, which must not be written
	 * to, and in that case s already points at its terminator.
	 */
	if (*s != '\0')
		*s = '\0';
	prefix_length = s - prefix;
}

int	next_char;	/* one-character look-ahead */
/*
 * Position reached in prefix while matching the current input line
 * (see get_prefix()); it points at the terminating '\0' once the whole
 * prefix has been matched.
 */
char	*to_match = "";

/*
 * Format the whole of infile: prime the look-ahead with the first
 * significant character, then read, lay out and print one paragraph
 * at a time until get_paragraph() reports that there are no more.
 */
void
fmt()
{
	next_char = get_prefix();
	for (;;) {
		if (! get_paragraph())
			break;
		fmt_paragraph();
		put_paragraph();
	}
}

#define	WORD	struct _WORD

/*
 * One word of the current paragraph.
 *
 * The lengths are held in ints: a char-sized field cannot represent a
 * word, or a line width, longer than CHAR_MAX, and a wrapped-around
 * value corrupts the column tracking and the cost computations.
 */
WORD {
	/* static attributes determined during input */
	char	*text;		/* NUL-terminated text of the word */
	int	length;		/* number of characters in text */
	int	space;		/* the size of the following space */
	sbool	paren;		/* starts with open paren */
	sbool	period;		/* ends in [.?!])* */
	sbool	punct;		/* ends in punctuation */
	/* the remaining fields are computed during the optimization */
	int	line_length;	/* length of the best line starting here */
	COST	best_cost;	/* cost of the best layout from this word on */
	WORD	*next_break;	/* break which achieves best_cost */
};

WORD	word[MAXWORDS];	/* the words of the current paragraph */
WORD	*word_limit;	/* one past the last word read */
char	*wptr;		/* next free byte in the paragraph text buffer */

int	first_indent;	/* indentation of first line */
int	indent;		/* indentation of rest of current paragraph */
int	prefix_indent;	/* next_prefix_indent as it was when the paragraph began */
int	next_prefix_indent;	/* column at which get_prefix() began matching the prefix */
bool	tabs = FALSE;	/* tabs in input? */

/*
 * Definitions.
 *
 * A <paragraph> is a maximal non-empty set of consecutive non-blank
 * lines at the same indent.
 * In split mode, a paragraph is a single non-blank line.
 * In crown mode, the second and subsequent lines must have the same
 * indentation, but differing from the first line.
 * If a prefix is in effect, it must also be at the same indent for
 * each line in the paragraph.
 *
 * A <word> is a maximal non-empty set of non-white characters.
 *
 * A <sentence break> is either the end of a paragraph or a word ending
 * in [.?!], possibly followed by ) or ], followed by a word beginning
 * with a capital.
 */

bool
get_paragraph()
{
reg	int	c;
static	char	parabuf[MAXCHARS];	/* space for the paragraph text */

	c = next_char;
	/*
	 * Scan (and copy) blank lines,
	 * and lines not introduced by the prefix.
	 */
	while (c == '\n' || *to_match ||
	       next_prefix_indent < prefix_lead_space ||
	       in_column < next_prefix_indent + prefix_full_length) {
		if (prefix_length != 0)
			c = copy_rest(c);
	until(c == EOF);
		putchar('\n');
		c = get_prefix();
	}
	if (c == EOF) {
		next_char = EOF;
		return FALSE;
	}
	prefix_indent = next_prefix_indent;
	wptr = parabuf;
	word_limit = word;
	if (split) {
		first_indent = indent = in_column;
		c = get_line(c);
	}
	else if (crown) {
		first_indent = in_column;
		c = get_line(c);
		if (same_para(c) && in_column != first_indent) {
			indent = in_column;
			do {	/* for each line till the end of the para */
				c = get_line(c);
			} while (same_para(c) && in_column == indent);
		}
		/*
		 * If only one line, use the secondary indent
		 * from last time (initially 0) if it splits.
		 */
	}
	else {
		first_indent = indent = in_column;
		do {	/* for each line till the end of the para */
			c = get_line(c);
		} while (same_para(c) && in_column == indent);
	}
	(word_limit-1)->period = TRUE;
	next_char = c;
	return TRUE;
}

/*
 * Copy a line which failed to match the prefix to the output,
 * or which was blank after the prefix.
 *
 * In the former case, c is the character that failed to match *to_match.
 * In the latter, c is \n or EOF.
 * Returns the character ending the line.
 */
int
copy_rest(c)
reg	int	c;
{
reg	char	*s;

	out_column = 0;
	put_space(next_prefix_indent);
	for (s = prefix; s != to_match; s++)
		putchar(*s);
	if (c != '\n' && c != EOF) {
		out_column += to_match - prefix;
		put_space(in_column - out_column);
		do {
			putchar(c);
			c = getc(infile);
		} while (c != '\n' && c != EOF);
	}
	return c;
}

bool
same_para(c)
reg	int	c;
{
	return next_prefix_indent == prefix_indent && *to_match == '\0' &&
	       c != '\n' && c != EOF;
}

/*
 * Read a line, given first non-blank character c, and the following
 * indent, returning the first non-blank character of the next line
 * (as delivered by get_prefix()), or EOF if the input ends first.
 *
 * Each word is appended to word[] with its text, punctuation flags,
 * length and the width of the space that follows it.  The space after
 * the last word of the line is set to 2 if that word ends a sentence
 * and the next line does not begin with a lower-case letter, else 1.
 */
int
get_line(c)
reg	int	c;
{
	int	start;
	int	len;

	do {	/* for each word in a line */
		/*
		 * A word starting in lower case means that the previous
		 * word did not end a sentence after all.
		 */
		if (islower(c) && word_limit != word)
			(word_limit-1)->period = FALSE;
		/* scan word */
		word_limit->text = wptr;
		do {
			*wptr++ = c;
			c = getc(infile);
		} while (c != EOF && ! isspace(c));
		check_punctuation(word_limit, wptr-1);
		/*
		 * Advance in_column by the true length, not by the value
		 * read back from the length field, which may be narrower
		 * than int and so wrap around for a very long word.
		 */
		len = wptr - word_limit->text;
		word_limit->length = len;
		in_column += len;
		*wptr++ = '\0';
		/* scan inter-word space */
		start = in_column;
		c = get_space(c);
		word_limit->space = in_column - start;
		word_limit++;
		if (c == EOF)
			return EOF;
	} while (c != '\n');
	c = get_prefix();
	(word_limit-1)->space =
		c != EOF && ! islower(c) && (word_limit-1)->period ? 2 : 1;
	return c;
}

/*
 * Start a new input line: skip its leading blanks and, if a prefix is
 * in effect, match as much of the prefix as possible and skip the
 * blanks after a complete match.
 *
 * Resets in_column and leaves it at the column reached; sets
 * next_prefix_indent to the column where the prefix match began (0
 * when no prefix is in effect) and to_match to the first unmatched
 * prefix character.
 *
 * Returns the first character not consumed, which may be '\n' or EOF.
 */
int
get_prefix()
{
reg	int	c;

	in_column = 0;
	c = get_space(getc(infile));
	to_match = prefix;
	if (prefix_length == 0)
		next_prefix_indent = 0;
	else {
		next_prefix_indent = in_column;
		/*
		 * getc() yields unsigned char values, so the prefix byte
		 * is compared as an unsigned char too.  Where plain char
		 * is signed, a byte above 0x7f would otherwise never
		 * match, and the byte 0xff would compare equal to EOF.
		 */
		while (*to_match && c == (unsigned char)*to_match) {
			in_column++;
			to_match++;
			c = getc(infile);
		}
		if (*to_match == '\0')
			c = get_space(c);
	}
	return c;
}

/*
 * Scan blank characters, keeping in_column up-to-date.
 */
int
get_space(c)
reg	int	c;
{
	repeat {
		if (c == ' ')
			in_column++;
		else if (c == '\t') {
			tabs = TRUE;
			in_column = (in_column/8 + 1)*8;
		}
		else
			return c;
		c = getc(infile);
	}
}

/*
 * Set the punctuation flags of word w, whose text runs from w->text
 * to finish inclusive:
 *	paren	the word starts with ( or [
 *	punct	the last character is a punctuation character
 *	period	the word ends in . ? or !, ignoring any closing
 *		brackets and quotes that follow it
 */
void
check_punctuation(w, finish)
reg	WORD	*w;
reg	char	*finish;
{
reg	char	*start;
reg	int	c;

	start = w->text;

	c = *start;
	w->paren = c == '(' || c == '[';

	c = *finish;
	/*
	 * The <ctype.h> functions need an unsigned char value, and only
	 * promise a non-zero result for "true"; normalise it to 0 or 1
	 * so that nothing is lost when it is stored in the small flag.
	 */
	w->punct = ispunct((unsigned char)c) != 0;
	while ((c == ')' || c == ']' || c == '\'' || c == '"') &&
	       finish > start)
		c = *--finish;
	w->period = c == '.' || c == '?' || c == '!';
}

/*
 * Compute the optimal formatting for the whole paragraph by computing and
 * remembering the optimal formatting for each suffix from the empty one
 * to the whole paragraph.
 *
 * For every word, taken as the start of a line, each possible following
 * break is costed; the cheapest is recorded in next_break, together
 * with the resulting line_length and best_cost.
 */
void
fmt_paragraph()
{
reg	WORD	*start, *w;
reg	int	len;
reg	COST	wcost, best;

	/* the empty suffix costs nothing */
	word_limit->best_cost = 0;

	/* decrement inside the body so start never points below word[0] */
	for (start = word_limit; start > word; ) {
		start--;
		best = MAXCOST;
		len = start == word ? first_indent : indent;
		/* at least one word, however long, in the line */
		w = start;
		len += w->length;
		do {
			w++;
			/* consider breaking before w */
			wcost = line_cost(w, len) + w->best_cost;
			if (wcost < best) {
				best = wcost;
				start->next_break = w;
				start->line_length = len;
			}
			/*
			 * Stop explicitly at the end of the paragraph
			 * rather than relying on a sentinel length kept
			 * in word_limit, whose field may be too narrow
			 * to hold max_width.
			 */
			if (w == word_limit)
				break;
			len += (w-1)->space;	/* w > start >= word */
			len += w->length;
		} while (len < max_width);
		start->best_cost = best + base_cost(start);
	}
}

/* constant component of cost of breaking before this word */
COST
base_cost(this)
reg	WORD	*this;
{
reg	COST	cost;

	cost = LINE_COST;

	if (this > word)
		if ((this-1)->period)
			cost -= SENTENCE_BONUS;
		else if ((this-1)->punct)
			cost -= PUNCT_BONUS;
		else if (this > word+1 && (this-2)->period)
			cost += WIDOW_COST/((this-1)->length+2);

	if (this->paren)
		cost -= PAREN_BONUS;
	else if (this->period)
		cost += ORPHAN_COST/(this->length+2);

	return cost;
}

/* length-dependent component of cost of breaking at next */
COST
line_cost(next, len)
reg	WORD	*next;
reg	int	len;
{
reg	int	n;
reg	COST	cost;

	if (next == word_limit)
		return 0;
	n = best_width - len;
	cost = SHORT_COST(n);
	if (next->next_break != word_limit) {
		n = len - next->line_length;
		cost += RAGGED_COST(n);
	}
	return cost;
}

void
put_paragraph()
{
reg	WORD	*w;

	out_column = 0;
	put_prefix(first_indent);
	put_line(word);
	for (w = word->next_break; w != word_limit; w = w->next_break) {
		put_prefix(indent);
		put_line(w);
	}
}

void
put_line(w)
reg	WORD	*w;
{
reg	WORD	*endline;

	endline = w->next_break-1;
	out_column += w->length;
	fputs(w->text, stdout);
	while (w != endline) {
		put_space(w->space);
		w++;
		fputs(w->text, stdout);
		out_column += w->length;
	}
	putchar('\n');
}

/*
 * Begin an output line: indent to prefix_indent, write the prefix,
 * then pad with blanks until column space is reached.
 */
void
put_prefix(space)
	int	space;
{
	int	padding;

	out_column = 0;
	put_space(prefix_indent);
	fputs(prefix, stdout);
	out_column = out_column + prefix_length;
	padding = space - out_column;
	put_space(padding);
}

void
put_space(space)
	int	space;
{
reg	int	space_target, tab_target;

	space_target = out_column + space;
	if (tabs) {
		tab_target = space_target/8*8;
		if (out_column+1 < tab_target)
			while (out_column < tab_target) {
				putchar('\t');
				out_column = (out_column/8 + 1)*8;
			}
	}
	while (out_column < space_target) {
		putchar(' ');
		out_column++;
	}
}
