/*
 * text.c
 *
 * Text manipulation routines
 *
 */

#include "frotz.h"

/* String types accepted by decode_text. The type selects how the
   encoded words of the string are fetched. */

/* Words are fetched with CODE_WORD; the address argument is unused */
#define EMBEDDED_STRING 0
/* Words are fetched with HIGH_WORD from an address computed from
   h_strings_offset and the (packed) address argument */
#define STATIC_STRING 1
/* Words are fetched with LOW_WORD directly from the address argument */
#define LOW_STRING 2

/* Built-in alphabet used when h_version == V1. The first index is the
   character set (0 = lowercase, 1 = uppercase, 2 = punctuation), the
   second index is the Z-character code minus 6, giving 26 entries per
   set. */

static char *v1_alphabet[3] = {
    "abcdefghijklmnopqrstuvwxyz",
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
    " 0123456789.,!?_#'\"/\\<-:()"
};

/* Built-in alphabet used for later versions when the story supplies no
   alphabet table of its own (h_alphabet == 0). Indexed like the V1
   table; the punctuation set differs in that it holds a newline in its
   second slot and has no '<'. */

static char *v2_alphabet[3] = {
    "abcdefghijklmnopqrstuvwxyz",
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
    " \n0123456789.,!?_#'\"/\\-:()"
};

/*
 * encode_text
 *
 * Encode a string into 2 or 3 words. This is used to look up
 * words from the input line in the dictionary.
 *
 */

static void encode_text (int length, zword ascii_text, zword *encoded)
{
    zbyte zchars[12];
    zbyte ch, candidate;
    zword table_addr;
    zword packed;
    int resolution;
    int used;
    int set, index;
    int matched;
    int pos, word, n;

    /* V1-V3 dictionaries keep six Z-characters per word, V4+ keep nine */

    resolution = (h_version <= V3) ? 6 : 9;

    /* Translate the input one character at a time until either the
       text or the dictionary resolution is used up. One character
       produces at most four codes and translation only starts while
       fewer than "resolution" codes are stored, so at most 8 + 4 = 12
       codes are ever written. */

    used = 0;

    for (pos = 0; pos < length && used < resolution; pos++) {

	LOW_BYTE (ascii_text, ch)
	ascii_text++;

	/* Walk through the three character sets in order (26 entries
	   each) and stop at the first entry equal to the character. */

	matched = 0;
	set = 0;
	index = 0;

	while (set < 3) {

	    if (h_version == V1)
		candidate = v1_alphabet[set][index];
	    else if (h_alphabet == 0)
		candidate = v2_alphabet[set][index];
	    else {
		table_addr = h_alphabet + 26 * set + index;
		LOW_BYTE (table_addr, candidate)
	    }

	    if (ch == candidate) {
		matched = 1;
		break;
	    }

	    if (++index == 26) {
		index = 0;
		set++;
	    }
	}

	if (matched) {

	    /* Characters outside the first set need a shift code in
	       front; the shift codes depend on the story version. */

	    if (set != 0)
		zchars[used++] = set + ((h_version <= V2) ? 1 : 3);

	    zchars[used++] = index + 6;

	} else {

	    /* Not in any character set: emit the codes 5 and 6
	       followed by the top three and the bottom five bits of
	       the character. */

	    zchars[used++] = 5;
	    zchars[used++] = 6;
	    zchars[used++] = ch >> 5;
	    zchars[used++] = ch & 0x1f;
	}
    }

    /* Short words are filled up with code 5 */

    while (used < resolution)
	zchars[used++] = 5;

    /* Pack three five bit codes into each output word, first code in
       the most significant position. */

    for (word = 0; word < resolution / 3; word++) {
	packed = 0;
	for (n = 0; n < 3; n++)
	    packed = (packed << 5) | zchars[3 * word + n];
	encoded[word] = packed;
    }

    /* The top bit marks the final word */

    encoded[word - 1] |= 0x8000;

}/* encode_text */

/*
 * z_encode_text
 *
 * Convert ASCII text to encoded text, suitable for use in a
 * dictionary.
 *
 */

void z_encode_text (zword ascii_text, zword word_length, zword from, zword coded_text)
{
    zword encoded[3];
    int words;
    int i;

    /* encode_text produces two words for V1-V3 stories and three words
       for V4+. Only that many elements of "encoded" are written, so
       only that many may be read back and stored; copying a fixed
       three words would read an uninitialised element for V1-V3. */

    words = (h_version <= V3) ? 2 : 3;

    /* Encode the word, which starts "from" bytes into the text */

    encode_text (word_length, ascii_text + from, encoded);

    /* Move the encoded word into the destination buffer */

    for (i = 0; i < words; i++)
	z_storew (coded_text, i, encoded[i]);

}/* z_encode_text */

/*
 * decode_text
 *
 * decode_text is a helper function to convert encoded text to ASCII.
 * Text is encoded by squeezing each character into 5 bits. 3 * 5 bit
 * encoded characters can fit in one word with a spare bit left over.
 * The spare bit is used to signal the end of a string. The 5 bit encoded
 * characters can either be actual character codes or prefix codes that
 * affect the following code.
 *
 */

static void decode_text (int string_type, zword addr)
{
    /* string_type is one of EMBEDDED_STRING, STATIC_STRING or LOW_STRING
       and selects where the encoded words are read from. addr is the
       string address for LOW_STRING, the packed address for
       STATIC_STRING, and is not used for EMBEDDED_STRING. The decoded
       text is emitted through z_print_char and z_new_line. */

    long real_addr;
    zword encoded;
    zword char_addr;
    zword ptr_addr;
    zword abbr_addr;
    zbyte c, c2;
    int shift_state = 0;
    int shift_lock = 0;
    int abbreviation_flag = 0;
    int abbreviation = 0;
    int ascii_flag = 0;
    int ascii = 0;
    int i;

    /* Calculate the real address if it's a static string. For the other
       string types real_addr stays unset; it is only read in the
       STATIC_STRING branch below. */

    if (string_type == STATIC_STRING)
	real_addr = ((long) h_strings_offset << 3) + ((long) addr << story_shift);

    do {

	/* Strings can be found at three different places: in the lower
	   64KB, at the end of the paged memory ("static strings") or
	   embedded in the instruction stream. */

	if (string_type == LOW_STRING) {
	    LOW_WORD (addr, encoded)
	    addr += 2;
	} else if (string_type == STATIC_STRING) {
	    HIGH_WORD (real_addr, encoded)
	    real_addr += 2;
	} else
	    CODE_WORD (encoded)

	/* Every word contains 3 Z-characters. They are taken from the
	   word starting with the most significant one (bits 14-10, then
	   9-5, then 4-0). */

	for (i = 10; i >= 0; i -= 5) {

	    c = (encoded >> i) & 0x1f;

	    /* The order of the tests below matters: a pending
	       abbreviation or ASCII escape consumes the code before it
	       can be interpreted as a character or a shift. */

	    if (abbreviation_flag != 0) {

		/* There are three tables of abbreviations ("synonyms"),
		   each table consisting of 32 abbreviations. (Precisely,
		   the tables simply hold addresses which point to strings
		   in the resident memory. Uniquely, these addresses must
		   be multiplied with 2 in order to obtain the absolute
		   address of the abbreviation). */

		/* The abbreviation is printed by a recursive call, which
		   starts with its own, freshly initialised decoder state. */

		ptr_addr = h_abbreviations + 64 * (abbreviation - 1) + 2 * c;
		LOW_WORD (ptr_addr, abbr_addr)
		decode_text (LOW_STRING, abbr_addr << 1);
		abbreviation_flag = 0;

	    } else if (ascii_flag == 1) {

		/* If this is the first part ASCII code then remember it.
		   Since the codes are only 5 bits you need two codes to
		   make one ASCII character. The first code contains the
		   top 3 bits, the second code contains the bottom 5 bits. */

		ascii = c;
		ascii_flag = 2;

	    } else if (ascii_flag == 2) {

		/* If this is the second part ASCII code then assemble
		   the character from the two codes and output it. */

		z_print_char ((ascii << 5) | c);
		ascii_flag = 0;

	    } else if (shift_state == 2 && c == 6) {

		/* The character 6 in the punctuation set indicates
		   that the next two codes make an ASCII character. */

		shift_state = shift_lock;
		ascii_flag = 1;

	    } else if (h_version >= V2 && shift_state == 2 && c == 7 || h_version == 1 && c == 1) {

		/* Since Z-code 2, character 7 in the punctuation set is
		   newline. In earliest Z-code format, character 1 is
		   newline in all three character sets. */

		/* Note that && binds tighter than ||, so the condition
		   above is the disjunction of the two version cases. */

		z_new_line ();
		shift_state = shift_lock;

	    } else if (c >= 6) {

		/* This is a normal character, so select it from the
		   currently active character set. */

		if (h_version == V1)
		    c2 = v1_alphabet[shift_state][c - 6];
		else if (h_alphabet == 0)
		    c2 = v2_alphabet[shift_state][c - 6];
		else {
		    char_addr = h_alphabet + 26 * shift_state + c - 6;
		    LOW_BYTE (char_addr, c2)
		}

		z_print_char (c2);
		shift_state = shift_lock;

	    } else if (c == 0) {

		/* Character 0 means space regardless of the current
		   shift state (ie. in all character sets). */

		z_print_char (' ');
		shift_state = shift_lock;

	    } else if (h_version >= V3 && c <= 3 || h_version == V2 && c == 1) {

		/* Select the table for the following abbreviation code.
		   Z-code versions differ: V1 has no abbreviations, V2 has
		   only one abbreviation table, whereas later versions
		   have three tables. */

		abbreviation_flag = 1;
		abbreviation = c;
		shift_state = shift_lock;

	    } else {

		/* Select the shift state for the next character. The shift
		   state automatically gets reset back to lowercase for V3+
		   games. Those games only use shift keys 4 and 5, since 2
		   and 3 are reserved for abbreviations. This is different
		   in V1 and V2. Shift keys 2 & 3 only shift the next char,
		   shift keys 4 & 5 lock the shift until reset. */

		/* Even codes advance the character set by one, odd codes
		   by two, counted cyclically from the locked set. */

		shift_state = (shift_lock + (c & 1) + 1) % 3;

		if (h_version <= V2 && c >= 4)
		    shift_lock = shift_state;
	    }
	}

    /* The top bit of a word is set on the last word of the string */

    } while ((encoded & 0x8000) == 0);

}/* decode_text */

/*
 * z_print
 *
 * Print the string embedded in the instruction stream at this point.
 * All strings that do not need to be referenced by address are embedded
 * in the instruction stream.
 *
 */

void z_print (void)
{

    /* For EMBEDDED_STRING the decoder fetches its words with CODE_WORD
       and never looks at the address argument, so 0 is passed. */

    decode_text (EMBEDDED_STRING, 0);

}/* z_print */

/*
 * z_print_addr
 *
 * Print using a real address. Real addresses point to strings within
 * the lower 64KB of the memory.
 *
 */

void z_print_addr (zword addr)
{

    /* addr is used as it is; the decoder reads the string with
       LOW_WORD starting at this address. */

    decode_text (LOW_STRING, addr);

}/* z_print_addr */

/*
 * z_print_num
 *
 * Print a (signed) 16bit number.
 *
 */

void z_print_num (zword value)
{
    int divisor = 10000;

    /* A value that is negative when read as a signed short gets a
       leading minus sign; printing continues with the negated value. */

    if ((short) value < 0) {
	z_print_char ('-');
	value = -value;
    }

    /* Go through the decimal places from ten-thousands down to units.
       Places above the most significant digit are skipped, but the
       units digit is always printed so that zero appears as "0". */

    while (divisor != 0) {
	if (divisor == 1 || value >= divisor)
	    z_print_char ('0' + (value / divisor) % 10);
	divisor /= 10;
    }

}/* z_print_num */

/*
 * z_print_obj
 *
 * Print an object description. Object descriptions are stored
 * at the front of the property list for the object.
 *
 */

void z_print_obj (zword obj)
{
    zword name_addr = object_name (obj);
    zbyte name_words;

    /* The object name is preceded by a byte holding its length in
       words. */

    LOW_BYTE (name_addr, name_words)

    /* An anonymous object (length zero) gets a generic name built from
       its number. This is useful for cheat functions. */

    if (name_words == 0) {
	display_string ("object#");
	z_print_num (obj);
	return;
    }

    /* Otherwise print the encoded name that follows the length byte */

    decode_text (LOW_STRING, name_addr + 1);

}/* z_print_obj */

/*
 * z_print_paddr
 *
 * Print using a packed address. Packed addresses refer to so-called
 * static strings at the end of the paged memory.
 *
 */

void z_print_paddr (zword packed_address)
{

    /* The packed address is not unpacked here; for STATIC_STRING the
       decoder itself computes the real address from h_strings_offset
       and story_shift. */

    decode_text (STATIC_STRING, packed_address);

}/* z_print_paddr */

/*
 * z_print_ret
 *
 * Print a string embedded in the instruction stream followed by
 * a newline. Finally, return from the current routine with true.
 *
 */

void z_print_ret (void)
{

    /* Print the embedded string, end the line, then hand the value 1
       ("true") to z_ret. */

    z_print ();
    z_new_line ();
    z_ret (1);

}/* z_print_ret */

/*
 * z_print_table
 *
 * Write text into a rectangular window.
 *
 *    argv[0] = start of text address
 *    argv[1] = rectangle width
 *    argv[2] = rectangle height (default = 1)
 *    argv[3] = number of characters to skip between lines (default = 0)
 *
 */

void z_print_table (int argc, zword *argv)
{
    zword src;
    zbyte ch;
    int cursor_row, cursor_col;
    int line, column;

    /* Fill in the optional arguments: one line, nothing skipped */

    if (argc < 3)
	argv[2] = 1;
    if (argc < 4)
	argv[3] = 0;

    src = argv[0];

    /* The current cursor position is the top left corner of the
       rectangle. */

    os_get_cursor (&cursor_row, &cursor_col);

    for (line = 0; line < argv[2]; line++) {

	/* Every line after the first starts one row further down in
	   the column of the corner; the skipped characters between two
	   lines are stepped over at the same time. */

	if (line > 0) {
	    cursor_row++;
	    os_set_cursor (cursor_row, cursor_col);
	    src += argv[3];
	}

	/* Print one line of argv[1] characters */

	for (column = 0; column < argv[1]; column++) {
	    LOW_BYTE (src, ch)
	    z_print_char (ch);
	    src++;
	}
    }

}/* z_print_table */

/*
 * tokenise_text
 *
 * Translate a single word to a token and append the token to the token
 * buffer. Unrecognised words cause empty slots if the flag is set.
 *
 */

static void tokenise_text (zword text, int length, int from, zword parse, zword dictionary, int flag)
{
    zword encoded[3];
    zword entry_addr;
    zword word_addr;
    zword entry_count;
    zword entry;
    zbyte token_max;
    zbyte token_count;
    int entry_length;
    int resolution;
    int sorted;
    int found;
    int lo, hi, probe;
    int w;

    /* A dictionary word is two words long in V1-V3, three in V4+ */

    resolution = (h_version <= V3) ? 2 : 3;

    /* The dictionary header gives the size of an entry (one byte)
       followed by the number of entries (one word). An entry count
       that is not positive as a signed value marks a dictionary whose
       entries are _not_ in alphabetical order; its negation is the
       real number of entries. */

    LOW_BYTE (dictionary, entry_length)
    dictionary += 1;
    LOW_WORD (dictionary, entry_count)
    dictionary += 2;

    sorted = ((short) entry_count > 0);
    if (!sorted)
	entry_count = -entry_count;

    /* Encode the word that starts "from" bytes into the text */

    encode_text (length, text + from, encoded);

    /* One loop serves both search methods: a sorted dictionary is
       searched by bisection, an unsorted one entry by entry. */

    found = 0;
    lo = 0;
    hi = entry_count - 1;

    while (!found && lo <= hi) {

	probe = sorted ? (lo + hi) >> 1 : lo;
	entry_addr = dictionary + probe * entry_length;

	/* Compare word by word; on a mismatch "w" and "entry" identify
	   the first differing word. */

	word_addr = entry_addr;

	for (w = 0; w < resolution; w++) {
	    LOW_WORD (word_addr, entry)
	    if (encoded[w] != entry)
		break;
	    word_addr += 2;
	}

	if (w == resolution)
	    found = 1;
	else if (!sorted)
	    lo++;
	else if (encoded[w] > entry)
	    lo = probe + 1;
	else
	    hi = probe - 1;
    }

    /* An unrecognised word is represented by address zero */

    if (!found)
	entry_addr = 0;

    /* The token buffer starts with its capacity and the number of
       tokens currently stored. Nothing is done when it is full. */

    LOW_BYTE (parse, token_max)
    parse++;
    LOW_BYTE (parse, token_count)
    parse--;

    if (token_count >= token_max)
	return;

    /* The slot is always counted. It is filled in unless the word is
       unrecognised and the flag is set, in which case the slot is
       left as it was. */

    z_storeb (parse, 1, token_count + 1);

    if (entry_addr != 0 || flag == 0) {
	z_storew (parse, 1 + (token_count << 1), entry_addr);
	z_storeb (parse, 4 + (token_count << 2), length);
	z_storeb (parse, 5 + (token_count << 2), from);
    }

}/* tokenise_text */

/*
 * z_tokenise
 *
 * Analyse a text string (first argument), divide it into words,
 * translate the words to tokens and store the tokens in the token
 * buffer (second argument). The default dictionary is given by a
 * header field but can be replaced by an optional user dictionary
 * (third argument). If the flag (fourth argument) is set, then
 * unrecognised words cause "empty" slots in the token buffer. An
 * empty slot is left unchanged; this way it is possible to analyse
 * the same text string using several different dictionaries.
 *
 */

void z_tokenise (int argc, zword *argv)
{
    zword sep_addr;
    zword word_start;
    zword cursor;
    zbyte sep_total;
    zbyte text_len;
    zbyte ch;
    zbyte sep;
    int is_sep;
    int ends_word;
    int n;

    /* Fall back to the standard dictionary and a clear flag */

    if (argc < 3 || argv[2] == 0)
	argv[2] = h_dictionary;
    if (argc < 4)
	argv[3] = 0;

    /* A dictionary begins with a count byte and that many word
       separators; argv[2] is moved past them to the dictionary
       header that tokenise_text expects. */

    sep_addr = argv[2];
    LOW_BYTE (sep_addr, sep_total)
    argv[2] += 1 + sep_total;

    /* Start with an empty token buffer */

    z_storeb (argv[1], 1, 0);

    /* Scan the text buffer with "cursor". "word_start" holds the
       address where the current word began, or zero while the cursor
       is between words. Whenever a word ends it is translated to a
       token; scanning stops at the end of the text. */

    word_start = 0;
    cursor = argv[0];

    /* From V5 on the second byte of the text buffer holds the text
       length, and the text itself starts one byte later. */

    if (h_version >= V5) {
	cursor++;
	LOW_BYTE (cursor, text_len)
    }

    for (;;) {

	/* Fetch the next character; in V5+ the position just past the
	   text is treated as a zero byte. */

	cursor++;

	if (h_version >= V5 && cursor - 2 == argv[0] + text_len) {
	    ch = 0;
	} else {
	    LOW_BYTE (cursor, ch)
	}

	/* Compare the character against every word separator */

	is_sep = 0;

	for (n = 0; n < sep_total; n++) {
	    sep_addr++;
	    LOW_BYTE (sep_addr, sep)
	    if (ch == sep)
		is_sep = 1;
	}

	sep_addr -= sep_total;

	/* Separators, spaces and the terminating zero end a word;
	   anything else begins one if none is in progress. */

	ends_word = (is_sep != 0 || ch == ' ' || ch == 0);

	if (word_start == 0) {
	    if (!ends_word)
		word_start = cursor;
	} else if (ends_word) {
	    tokenise_text (argv[0], cursor - word_start, word_start - argv[0], argv[1], argv[2], argv[3]);
	    word_start = 0;
	}

	/* A separator is a one character word in its own right */

	if (is_sep)
	    tokenise_text (argv[0], 1, cursor - argv[0], argv[1], argv[2], argv[3]);

	if (ch == 0)
	    break;
    }

}/* z_tokenise */
