/* rxchars.c - determine if two regular expressions are ambiguous
 * Copyright (C) 1995-99 Andrew Pipkin (minitrue@pagesz.net)
 * MiniTrue is free software released with no warranty. See COPYING for details
 */

/* This module determines the set of characters which can occur at each
 * position of a regular expression */

#include <string.h>
#include <stdlib.h>

#include "minitrue.h"
#include "rxchars.h"
#include "regexp.h"
#include "charset.h"

/* Forward declarations of the helpers private to this file. The functions
 * returning RxState * all return NULL when the end of the regular
 * expression has been reached */
static RxState *add_state(RxChars *rxc, RxAtom *atom, QuantParen *quant_paren);
static RxState *change_atom(RxChars *rxc, RxAtom *atom, RxState *state);
static void new_quant_paren(RxChars *rxc, RxState *state, Paren *paren);
static QuantParen *copy_quant_paren(QuantParen *src);
static RxState *close_quant_paren(RxChars *rxc, RxState *state);

/* Initialize the structure used to walk the regular expression rx,
 * beginning at start_ptr and covering len atoms. If dir is 1 move
 * forwards, if -1 move backwards.
 * Returns FALSE if len is 0 (nothing to walk), TRUE otherwise. In both
 * cases every field of *rxc is set, so the structure may always be
 * handed to RxChars_kill afterwards */
int RxChars_init(RxChars *rxc, RxAtom *start_ptr,
                  int len, int dir, RegExp *rx)
{
 /* Fill in the bookkeeping fields before the early return so that a
  * failed init never leaves *rxc holding indeterminate values */
    rxc->states = NULL;
    rxc->nused  = rxc->nalloc = 0;
    rxc->dir    = dir;
    rxc->rx     = rx;

    if(!len)
    {   rxc->end = start_ptr;
        return FALSE;
    }
 /* end is the first atom past the walked range in the direction of travel */
    rxc->end    = start_ptr + len * dir;

 /* Create the initial state. NOTE(review): the result is not checked,
  * so TRUE is returned even if add_state reports that the end of the
  * expression was reached while positioning the state */
    add_state(rxc, start_ptr, NULL);
    return TRUE;
}

/* Set ch_array to the values which can appear in the regular expression
 * at the current position, then move to the next position.
 * ch_array must hold at least NCHAR elements; it is cleared first and the
 * entry for every character which may occur is set to 1.
 * Returns 0 if the end of the regular expression has been reached by any
 * state (or if there are no states), 1 otherwise */
int RxChars_next(RxChars *rxc, char ch_array[])
{
    int state_i;
    memset(ch_array, 0, NCHAR);

 /* A structure without any states has nothing left to report */
    if(!rxc->nused)
        return 0;

 /* Examine the states, if the count for the current atom in the state
  * has reached the maximum, move to the next atom. If the count is above
  * the minimum value continue with the current atom but add a state
  * for the next atom. States appended during this loop are visited too,
  * because rxc->nused is re-read on every iteration */
    for(state_i = 0; state_i < rxc->nused; ++state_i)
    {   RxState *state = &rxc->states[state_i];

        if(state->atom == rxc->end)
            return 0;

        if(state->count == state->atom->max)
        {   if(!change_atom(rxc, state->atom + rxc->dir, state))
                return 0;
         /* change_atom may have added states, which can move the state
          * array; look the state up again before using it */
            state = &rxc->states[state_i];
        }

     /* str_i is nonzero while part way through a string atom; only branch
      * off to the next atom on a string boundary */
        if(state->count >= state->atom->min && !state->str_i)
        {   if(!add_state(rxc, state->atom + rxc->dir, state->quant_paren))
                return 0;
        }
    }
 /* Go through the states, setting the values of the array to correspond to
  * the atoms which the states are looking at */
    for(state_i = 0; state_i < rxc->nused; ++state_i)
    {   RxState *state = &rxc->states[state_i];
        RxAtom *atom = state->atom;
        int data_type = Rx_type(atom);
        if(data_type == RX_SET)
        {   int ch_i;
            for(ch_i = 0; ch_i < NCHAR; ++ch_i)
            {   if(CharSet_iN(atom->data.set, ch_i))
                    ch_array[ch_i] = 1;
            }
            ++state->count;
        }
        else if(data_type == RX_CH)
        {   int ch = atom->data.ch;
         /* Both case variants of the character are marked */
            ch_array[low_casE(ch)] = ch_array[up_casE(ch)] = 1;
            ++state->count;
        }
        else if(data_type & RX_STR)
        {   const char *str = atom->data.str.start;
            int len = atom->data.str.len, str_i = state->str_i;
         /* When moving backwards the string is consumed from its last
          * character towards its first */
            int ch = str[ (rxc->dir == 1) ? str_i : len - str_i - 1 ];

            ch_array[low_casE(ch)] = ch_array[up_casE(ch)] = 1;

         /* The atom counts as matched once only after the whole string
          * has been stepped through */
            if(++state->str_i == len)
            {   state->str_i = 0;
                ++state->count;
            }
        }
    }
    return 1;
}

/* Add a state to the end of the state array, growing the array when it is
 *  full. The quantified parentheses list at quant_paren (if any) is deep
 *  copied so the new state owns its own list.
 *  If atom is non-NULL the new state is positioned with change_atom and
 *  that call's result is returned (NULL if the end of the regular
 *  expression was reached). If atom is NULL the new state is returned with
 *  only its quant_paren member set.
 *  NOTE: growing the array may move it (x_realloc is presumably a realloc
 *  wrapper), so RxState pointers held by the caller must be looked up
 *  again after this call */
static RxState *add_state(RxChars *rxc, RxAtom *atom, QuantParen *quant_paren)
{
    RxState *state;
    /* Allocate more states if no unused state */
    if(rxc->nused == rxc->nalloc)
    {   rxc->nalloc = !rxc->nalloc ? 32 : 2 * rxc->nalloc;
        /* Size the array by its element type (RxState), not by the
         * containing RxChars structure */
        rxc->states = x_realloc(rxc->states,
                                rxc->nalloc * sizeof *rxc->states);
    }
    state = &rxc->states[rxc->nused++];
    if(quant_paren)
    {   /* Copy the innermost node, then walk outwards replacing each
         * outer link by a private copy of the node it pointed to */
        state->quant_paren = quant_paren = copy_quant_paren(quant_paren);
        while(quant_paren->outer)
        {   quant_paren->outer = copy_quant_paren(quant_paren->outer);
            quant_paren        = quant_paren->outer;
        }
    }
    else
        state->quant_paren = NULL;

    return atom ? change_atom(rxc, atom, state) : state;
}

/* Return a freshly allocated shallow duplicate of the quantified paren
 * record at *src; the outer link of the duplicate still points at the
 * same node as the original's */
static QuantParen *copy_quant_paren(QuantParen *src)
{
    QuantParen *dup = x_malloc(sizeof(QuantParen));

    memcpy(dup, src, sizeof(QuantParen));
    return dup;
}

/* Replace the atom of state and reset the count.
 * Starting from atom, step in direction rxc->dir until an atom carrying
 * data (RX_CH, RX_SET or RX_STR) is found, handling parentheses,
 * alternatives and backreferences on the way; extra states may be added
 * for alternatives and optional constructs.
 * Returns NULL if the end of the regular expression is reached, otherwise
 * a non-NULL state pointer (normally the repositioned state; when a
 * quantified paren is closed, whatever close_quant_paren returns).
 * Any add_state call may move the state array, so state is tracked by its
 * index and looked up again after each such call */
static RxState *change_atom(RxChars *rxc, RxAtom *atom, RxState *state)
{
    int state_i = state - rxc->states;

 /* Move to an atom containing data */
    while(atom != rxc->end && !(atom->flags & (RX_CH | RX_SET | RX_STR)))
    {/* If parentheses containing alternatives entered, set up a state
      * for each alternative in the parentheses */
        if(Rx_is_paren(atom))
        {/* True when the paren is being entered in the direction of
          * travel: a left paren going forwards or a right paren going
          * backwards */
            if(!Rx_is_lparen(atom) ^ (rxc->dir == 1))
            {   int paren_i = atom->fail.paren_i;
                Paren *paren = &rxc->rx->data->parens[paren_i];
                if(Rx_paren_quant(paren))
                {/* A paren which may occur zero times can be skipped
                  * entirely, add a state starting just past it */
                    if(!paren->min)
                    {   int exit_i = Rx_exit_i(paren, rxc->dir) + rxc->dir;
                        if(!add_state(rxc, &rxc->rx->start[exit_i],
                                      state->quant_paren))
                            return NULL;
                        state = &rxc->states[state_i];
                    }
                    new_quant_paren(rxc, state, paren);
                }

                if(paren->nbar)
                {   int alt_i;

                 /* One new state per alternative except the last, which
                  * is left for the current state to continue with */
                    for(alt_i = 0; alt_i < paren->nbar; ++alt_i)
                    {   atom += rxc->dir;
                     /* NOTE(review): the result is not checked here,
                      * unlike the other add_state calls in this function */
                        add_state(rxc, atom, state->quant_paren);
                     /* add_state may have moved the state array; refresh
                      * the pointer before it is dereferenced again */
                        state = &rxc->states[state_i];
                        while(   !(atom->flags & RX_ALTERN)
                              || atom->fail.paren_i != paren_i)
                            atom += rxc->dir;
                    }
                }
            }
         /* If end of quantified paren */
            else if(state->quant_paren && atom == state->quant_paren->end)
                return close_quant_paren(rxc, state);
        }
     /* If end of alternative reached, move to end of parentheses containing
      * alternation. The paren atom itself is then examined by the next
      * iteration, hence no step past it here */
        else if(Rx_is_alt(atom))
        {   Paren *paren = &rxc->rx->data->parens[atom->fail.paren_i];
            atom = &rxc->rx->start[ Rx_exit_i(paren, rxc->dir) ];
            continue;
        }
     /* If backreference encountered, move to the start of the corresponding
      * parentheses */
        else if(atom->flags & RX_BREF)
        {   Paren *paren = &rxc->rx->data->parens[atom->data.bref.num];
            int bref_i   = Rx_exit_i(paren, -rxc->dir);

         /* If backreference optional, add a state for the atom following
          * the backreference */
            if(!atom->min)
            {   if(!add_state(rxc, atom + rxc->dir, state->quant_paren))
                    return NULL;
                state = &rxc->states[state_i];
            }
         /* Remember the backreference atom so that close_quant_paren uses
          * its min/max and resumes after it */
            new_quant_paren(rxc, state, paren);
            state->quant_paren->back_ref = atom;

            atom = &rxc->rx->start[bref_i];
        }
        atom += rxc->dir;
    }

    if(atom == rxc->end)
        return NULL;

    state        = &rxc->states[state_i];
    state->atom  = atom;
    state->str_i = state->count = 0;

    return state;
}

/* Push a new quantified paren record for paren onto the list owned by
 * state. The record starts with a zero count and no backreference, and
 * its end is the atom at which paren is exited when moving in rxc->dir */
static void new_quant_paren(RxChars *rxc, RxState *state, Paren *paren)
{
    QuantParen *fresh = x_malloc(sizeof(QuantParen));
    int exit_idx = Rx_exit_i(paren, rxc->dir);

    fresh->end      = &rxc->rx->start[exit_idx];
    fresh->count    = 0;
    fresh->back_ref = NULL;

 /* Link in front of whatever the state was already inside */
    fresh->outer       = state->quant_paren;
    state->quant_paren = fresh;
}

/* If the atom contained in state is at the end of a quantified paren,
 * exit the paren and/or go back to the start of the paren. Return NULL
 * if the end of the regular expression is reached, non-NULL if not.
 * state->quant_paren must be non-NULL. States are tracked by index rather
 * than by pointer because both add_state and change_atom can grow the
 * state array and thereby move it */
static RxState *close_quant_paren(RxChars *rxc, RxState *state)
{
    RxAtom *atom   = state->quant_paren->end;
    int paren_i    = atom->fail.paren_i;
    Paren *paren   = &rxc->rx->data->parens[paren_i];
    QuantParen *qp = state->quant_paren;
 /* reset_i: state which goes round the paren again; exit_i: state which
  * leaves the paren. They are the same state unless a clone is made */
    int reset_i = state - rxc->states, exit_i = reset_i;
    int min, max, count = ++qp->count;

 /* If paren is backreference, need to use the min/max where
  * the backreference occurs, otherwise use the parentheses counts */
    if(!qp->back_ref)
    {   min = paren->min;
        max = paren->max;
    }
    else
    {   min = qp->back_ref->min;
        max = qp->back_ref->max;
    }
 /* If next parentheses is optional, need to clone state with one
  * state going back to paren start, the other exiting the
  * parentheses */
    if(min <= count && count < max)
    {/* With a NULL atom add_state returns the state it just appended,
      * which is the last one in use */
        add_state(rxc, NULL, state->quant_paren);
        exit_i = rxc->nused - 1;
    }
 /* If count less than max, reset to start of paren (the count was
  * already incremented above) */
    if(count < max)
    {   int start_i = (rxc->dir == 1) ? paren->start_i : paren->end_i;
        change_atom(rxc, &rxc->rx->start[start_i + rxc->dir],
                    &rxc->states[reset_i]);
    }
 /* If exiting quantified paren, free the topmost quantified paren
  * structure and move to the next atom */
    if(count >= min)
    {/* Looked up only now: the change_atom call above may have moved
      * the state array */
        RxState *exit_par = &rxc->states[exit_i];
        QuantParen *outer;
        qp    = exit_par->quant_paren;
     /* For a backreference, resume after the backreference atom rather
      * than after the referenced paren */
        if(qp->back_ref)
            atom = qp->back_ref;

        outer = qp->outer;
        free(qp);
        exit_par->quant_paren = outer;
        return change_atom(rxc, atom + rxc->dir, exit_par);
    }
    else
        return &rxc->states[reset_i];
}

/* Release the memory owned by rxc: the quantified paren list of every
 * state in use, then the state array itself. The RxChars structure
 * itself is not freed */
void RxChars_kill(RxChars *rxc)
{
    int idx;

    for(idx = 0; idx < rxc->nused; ++idx)
    {   QuantParen *node = rxc->states[idx].quant_paren;

     /* Walk from the innermost record outwards, fetching the link
      * before the node holding it is freed */
        while(node != NULL)
        {   QuantParen *next = node->outer;
            free(node);
            node = next;
        }
    }
    free(rxc->states);
}

/* This test program prints all the characters which can occur at each
 *  position of the regexps on the command line until a possible end has
 *  been reached. If - is the first argument, go in reverse.
 *  Each output line has the form "<position>: <char> <char> ..." */
#ifdef RXCHARS_TEST
#include <stdio.h>
int main(int argc, char *argv[])
{
    int arg_i = 1, dir = 1;
    if(argc > 1 && !strcmp(argv[1], "-"))
    {   dir = -1;
        ++arg_i;
    }

    for(; arg_i < argc; ++arg_i)
    {   RegExp rx;
        RxData rxd;
        RxChars rxc;
        int rx_i = 0, init_ok;
        char ch_array[NCHAR];

     /* NOTE(review): the outcome of Rx_parse is not checked here; its
      * error reporting convention is not visible from this file */
        Rx_parse(&rx, &rxd, argv[arg_i]);
        if(dir == 1)
            init_ok = RxChars_init(&rxc, rx.start, rx.len, 1, &rx);
        else
            init_ok = RxChars_init(&rxc, rx.start + rx.len - 1,
                                   rx.len, -1, &rx);

     /* An empty expression has no positions to print, and rxc must not
      * be used after a failed init */
        if(!init_ok)
            continue;

        while(RxChars_next(&rxc, ch_array))
        {   int ch_i;
            printf("%d:", rx_i);
            for(ch_i = 0; ch_i < NCHAR; ++ch_i)
            {   if(ch_array[ch_i])
                {   char esc_buf[20];
                    ch_to_esc(esc_buf, ch_i);
                    putchar(' ');
                    fputs(esc_buf, stdout);
                }
            }
            putchar('\n');
            ++rx_i;
        }
     /* Release the states allocated for this expression */
        RxChars_kill(&rxc);
    }
    return 0;
}
#endif /* RXCHARS_TEST */
