/* charset.c - process a regular expression character set
 * Copyright (C) 1995-99 Andrew Pipkin (minitrue@pagesz.net)
 * MiniTrue is free software released with no warranty. See COPYING for details
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#include "minitrue.h"
#include "charset.h"

/* Forward declarations of the file-local helpers defined further down */

/* Parse the text of a character set starting at src; set may be NULL when
 *   the text only has to be recognised */
static char *parse(CharSet *set, const char *src);
/* Add ch to the set, or remove it from the set when remove_ch is nonzero */
static void new_ch(CharSet *set, int ch, int remove_ch);
/* Recognise a predefined-class abbreviation (backslash + letter, or .) at
 *   src, applying it to set when set is not NULL */
static char *set_abbrev(CharSet *set, const char *src, int remove_ch);

/* Flag table, one byte per character, describing the word set; it is filled
 *   in by CharSet_Init and consulted by is_word_ch */
unsigned char CharSet_Word_set[NCHAR];

/* Set up word set, use default if null */
static char far set_err[] = "Cannot have any text following set for -w option";
void CharSet_Init(const char *word_set)
{
    CharSet temp_set;
    temp_set.array = CharSet_Word_set;
    temp_set.bit   = 1;

    word_set = parse(&temp_set, word_set ? word_set : "[a-zA-Z0-9_]");
    if(*word_set != '\0')
        error_msg(set_err);
}

/* Allocator state shared by every set handed out by CharSet_init.
 *   Each array holds one flag byte per character followed by one extra byte
 *   at index BIT_USED; the bits of that last byte record which bit positions
 *   of the array are currently owned by a set */
static ucharf *Crnt_array = NULL;  /* array from which new sets get their bit */
static int Crnt_bit = 256;         /* next bit to hand out, 256 = none left */
enum { BIT_USED = 256 };           /* index of the "bits in use" byte */

/* Constructor for a character set: give the set the next unused bit of the
 *   shared flag array, starting a fresh array once the bits 1..128 of the
 *   current one have all been handed out, then parse the description at src.
 *   Returns whatever parse() returns for src */
char *CharSet_init(CharSet *set, const char *src)
{
 /* No bit left in the current array: allocate a new one and clear both the
  *   character flags and the trailing bits-in-use byte */
    if(Crnt_bit == 256)
    {   Crnt_array = x_farmalloc(NCHAR + 1);
        _fmemset(Crnt_array, 0, BIT_USED + 1);
        Crnt_bit = 1;
    }

 /* Hand the current bit to this set and record it as being in use */
    set->array = Crnt_array;
    set->bit   = Crnt_bit;
    Crnt_array[BIT_USED] |= set->bit;

    Crnt_bit <<= 1;  /* the next set gets the next bit */
    return parse(set, src);
}

/* test if the characters at src represent a valid character set, return
 *  NULL if not a character set, return non-null if valid */
char *CharSet_test(const char *src)
{
    return parse(NULL, src);
}

/* Destructor for a character set: mark the set's bit as no longer in use
 *   and free the array once no other set is using any of its bits.
 *   When the freed array is the one new sets are carved from, the allocator
 *   state is reset so that the next CharSet_init starts a fresh array */
void CharSet_kill(CharSet *set)
{
    set->array[BIT_USED] &= ~set->bit;
    if(!set->array[BIT_USED])
    {/* Decide this before freeing: the pointer value must not be inspected
      *   once the memory it refers to has been released */
        int was_current = (set->array == Crnt_array);

        farfree(set->array);
        if(was_current)
        {   Crnt_array = NULL;  /* do not keep a dangling pointer around */
            Crnt_bit   = 256;   /* forces a new array on next CharSet_init */
        }
    }
}
/* Messages handed to input_error by parse() for malformed set text */
static const charf Invalid_class[] = "Character set must begin with [";
static const charf Unclosed_class[] = "Character set missing closing ]";

/* Range-tracking states used by parse() while scanning between the brackets:
 *   NEED_START - no character is available to begin a range
 *   HAVE_START - an ordinary character was just read; a following - turns it
 *                into the start of a range
 *   NEED_END   - a - was read; the next character is the end of the range */
enum range_states { NEED_START, HAVE_START, NEED_END};

/* Parse the text of a character set beginning at src.
 *   set - set to fill in, or NULL to only test whether src starts a set
 *   src - either a predefined-class abbreviation (see set_abbrev) or a
 *         bracketed list such as [a-z0-9_]
 * Inside the brackets:
 *   - a ^ directly after [ starts from the set of all characters
 *   - any ^ not directly before ] makes the following characters be removed
 *   - a - after an ordinary character and not directly before ] is a range
 *   - backslash abbreviations accepted by set_abbrev may be included
 *   - every character is entered as both up_casE(ch) and low_casE(ch)
 * Returns a pointer just past the set.  When set is NULL: returns the end of
 * an abbreviation, src itself if it points at [, NULL otherwise.
 * Malformed text is reported through input_error */
static char *parse(CharSet *set, const char *src)
{
    char ch, prev_ch = 0;
    int range_status = NEED_START, remove_ch = FALSE;
    char *abbrev_end = set_abbrev(set, src, remove_ch);

 /* A bare abbreviation is a complete set in its own right */
    if(abbrev_end)
        return abbrev_end;

 /* Otherwise need []; in test mode just report whether one starts here */
    if(!set)
        return (*src == '[') ? (char *)src : NULL;

    if(*src++ != '[')
        input_error(Invalid_class);

 /* ^ at start of set means to set all values, then remove subsequent values.
  *   The . abbreviation leaves out newline, so it is added separately.  The
  *   ^ itself is consumed by the loop below, which switches to removal */
    if(*src == '^')
    {   set_abbrev(set, ".", remove_ch);
        set->array['\n'] |= set->bit;
    }

    while((ch = (uchar)*src) != ']' && ch)
    { /* ^ will remove all subsequent characters from the set */
        if(ch == '^' && src[1] != ']')
        {   range_status = NEED_START;
            remove_ch    = TRUE;
            ++src;
        }
     /* - indicates range if previous char normal character */
        else if(ch == '-' && src[1] != ']' && range_status == HAVE_START)
        {   range_status = NEED_END;
            ++src;
        }
     /* backslash abbreviations can be included in set; a backslash that is
      *   not an abbreviation falls through to esc_to_ch below */
        else if(   ch == '\\'
                && (abbrev_end = set_abbrev(set, src, remove_ch)) != NULL)
        {   src          = abbrev_end;
            range_status = NEED_START;
        }
        else
        {   src = esc_to_ch(&ch, src);

         /* If not in range, just set one value.  ch is converted to uchar
          *   first so that a byte >= 0x80 held in a signed char is never
          *   passed on as a negative value */
            if(range_status != NEED_END)
            {   new_ch(set, up_casE((uchar)ch), remove_ch);
                new_ch(set, low_casE((uchar)ch), remove_ch);
                prev_ch      = ch;
                range_status = HAVE_START;
            }
         /* Set all range values if in range - [A-Z] is equivalent to [Z-A] */
            else
            {   int ch_i     = miN((uchar)ch, (uchar)prev_ch);
                int range_hi = maX((uchar)ch, (uchar)prev_ch);
                for(; ch_i <= range_hi; ++ch_i)
                {   new_ch(set, up_casE(ch_i), remove_ch);
                    new_ch(set, low_casE(ch_i), remove_ch);
                }
                range_status = NEED_START;
            }
        }
    }
 /* The loop also stops at the end of the string, which means no ] was seen */
    if(*src++ != ']')
        input_error(Unclosed_class);
    return (char *)src;
}

/* Add the character ch to the character set if remove_ch is false, otherwise
 * remove ch from the set.
 *   set       - set whose bit is set or cleared in its flag array
 *   ch        - character code; reduced to the range of unsigned char before
 *               use so that a sign-extended char (a negative value) can
 *               never index in front of the array
 *   remove_ch - nonzero to clear the set's bit, zero to set it */
static void new_ch(CharSet *set, int ch, int remove_ch)
{
    unsigned char bit = set->bit;
    unsigned char idx = (unsigned char)ch;

    if(!remove_ch)
        set->array[idx] |= bit;
    else
    {   unsigned char mask = ~bit;  /* every bit except this set's */
        set->array[idx] &= mask;
    }
}

/* Test whether src points at an abbreviation for a predefined character
 * set.  No set is modified.
 * Returns the end of the abbreviation, or NULL if there is none at src */
char *CharSet_Abbrev(const char *src)
{
    char *abbrev_end = set_abbrev(NULL, src, 0);

    return abbrev_end;
}

static int is_word_ch(int ch)
{
    return CharSet_Word_set[(uchar)(ch)] & WORD_BIT;
}

/* See if src points at an abbreviation for a predefined character class:
 *   a backslash followed by one of the letters c d l p s u w (tested with
 *   isalpha, isdigit, islower, ispunct, isspace, isupper and is_word_ch
 *   respectively), or a . standing for every character except NL.
 *   An upper case letter after the backslash selects the complement of the
 *   class.
 *   set       - if not NULL, every character of the class is passed to new_ch
 *   src       - text to examine
 *   remove_ch - handed through to new_ch (add when zero, remove otherwise)
 * Returns the end of the abbreviation if there is one at src, NULL otherwise */
static char *set_abbrev(CharSet *set, const char *src, int remove_ch)
{
 /* This array maps the letters a-z to the is* functions they represent */
    static int (* const is_ch_fns[])(int) =
    {   0, 0, isalpha, isdigit,   0, 0, 0, 0,   0, 0, 0, islower,
        0, 0, 0, ispunct,  0, 0, isspace, 0,      isupper, 0, is_word_ch, 0,
        0, 0
    };

    if(*src == '\\')
    {/* The <ctype.h> functions need a value in the range of unsigned char,
      *   a plain char may be negative */
        int sym = (unsigned char)src[1];
        int (*is_ch)(int) = NULL;

        if(isalpha(sym))
        {/* A locale may have letters beyond a-z, these have no table slot */
            int slot = tolower(sym) - 'a';
            if(   slot >= 0
               && slot < (int)(sizeof is_ch_fns / sizeof is_ch_fns[0]))
                is_ch = is_ch_fns[slot];
        }
        if(!is_ch)
            return NULL;

        if(set)
        {/* If char is upper case, use complement of class */
            int ch_i, class_comp = isupper(sym);
            for(ch_i = 0; ch_i < NCHAR; ++ch_i)
            {   if(!is_ch(ch_i) ^ !class_comp)
                    new_ch(set, ch_i, remove_ch);
            }
        }
        return (char *)src + 2;
    }

 /* dot represents set of all chars with the exception of newline */
    if(*src == '.')
    {   if(set)
        {   int ch_i;
            for(ch_i = 0; ch_i < NCHAR; ++ch_i)
            {   if(ch_i != NL)
                    new_ch(set, ch_i, remove_ch);
            }
        }
        return (char *)src + 1;
    }
    return NULL;
}

/* Print a textual representation of a character set to stdout.
 *   The members are written between [ and ] in ascending order, each one
 *   converted with ch_to_esc.  A run of three or more consecutive members
 *   is written as first-last, shorter runs are written character by
 *   character.  The output ends with a newline */
void CharSet_dump(CharSet *set)
{
    int ch_i;
    putchar('[');
    for(ch_i = 0; ch_i < 0x100; ++ch_i)
    {   if(set->array[ch_i] & set->bit)
        {   int range_end = ch_i;
            char buf[10];
            ch_to_esc(buf, ch_i);
            printf("%s", buf);

         /* Find the first non-member after ch_i.  The bound is tested
          *   before the array is read so index 0x100 is never touched */
            while(++range_end < 0x100 && (set->array[range_end] & set->bit))
                ;

         /* Step back to the last member of the run */
            if(--range_end > ch_i + 1)
            {   ch_to_esc(buf, range_end);
                printf("-%s", buf);
                ch_i = range_end;  /* continue after the run */
            }
        }
    }
    printf("]\n");
}

#ifdef CHARSET_TEST
/* Test driver, only built when CHARSET_TEST is defined: every line read from
 *   stdin is turned into a character set, dumped and destroyed again */
int main(void)
{
    char line_buf[512];
    CharSet char_set;

    for(;;)
    {   if(fgets(line_buf, sizeof line_buf, stdin) == NULL)
            break;
        CharSet_init(&char_set, line_buf);
        CharSet_dump(&char_set);
        CharSet_kill(&char_set);
    }
    return EXIT_SUCCESS;
}
#endif /* CHARSET_TEST */
