/* rxparse.c - parse regular expression
 * Copyright (C) 1995-99 Andrew Pipkin (minitrue@pagesz.net)
 * MiniTrue is free software released with no warranty. See COPYING for details
 */

#include <string.h>
#include <ctype.h>

#include "minitrue.h"
#include "regexp.h"
#include "charset.h"

/* Helpers private to this file; each is described at its definition below */
static char *read_string(RxAtom *rx_ptr, char *src, int nparen);
static int is_quant_char(int ch);
static char *have_anchor(const char *src, int *opcode);
static char *have_bref(const char *src, int *bref_num, int nparen);
static char *parse_quant(const char *src, int *min, int *max);
static void open_paren(RegExp *rx);
static char *close_paren(RegExp *rx, const char *src);
static void mark_alt(RegExp *rx);
static void set_paren_len(Paren *par);
static void rx_parse_error(const char far *msg);
/* Number of errors reported through rx_parse_error(); Rx_parse() tests it
 * once parsing is finished and resets it to 0 on its failure path */
static int NErrors;

/* Parse the regular expression text at src into atoms appended to rx.
 *
 * rx  - regular expression being built; rx->data carries the parenthesis
 *       and alternation bookkeeping that is updated while parsing
 * src - NUL-terminated pattern text; non-const because read_string()
 *       rewrites literal runs in place
 *
 * Returns the maximum length accumulated for parens[0] (the group opened
 * before any pattern text is read), or -1 if any error was reported through
 * rx_parse_error() */
int Rx_parse(RegExp *rx, char *src)
{
    RxData *rxd = rx->data;
 /* Open the implicit outermost group before looking at the pattern */
    open_paren(rx);

    while(*src)
    {   char *anchor_end, *bref_end;
        int bref_num;

     /* Initialize atom */
        RxAtom *rx_ptr = Rx_init_atom2(rx, 0, rxd->open_par_i);

     /* If ( present, start new parenthesis data structure. nparen is read
      * before open_paren() so the atom records the index of the group that
      * open_paren() is about to create */
        if(*src == '(')
        {   rx_ptr->flags       |= RX_LPAREN;
            rx_ptr->fail.paren_i = rxd->nparen;
            open_paren(rx);
            ++src;
        }
     /* If ) found, finish corresponding parentheses data structure;
      * close_paren() also consumes the ) and any quantifier after it */
        else if(*src == ')')
        {   rx_ptr->flags       |= RX_RPAREN;
            src = close_paren(rx, src);
        }

     /* Record presence of | in alternation data structure. nalt is read
      * before mark_alt() so the atom records the index of the alternative
      * that mark_alt() fills in */
        else if(*src == '|')
        {   rx_ptr->flags        |= RX_ALTERN;
            rx_ptr->data.altern_i = rxd->nalt;
            mark_alt(rx);
            ++src;
        }
     /* Now see if anchor present; have_anchor() stores the opcode straight
      * into the atom when it finds one */
        else if((anchor_end = have_anchor(src, &rx_ptr->opcode)) != NULL)
        {   rx_ptr->flags |= RX_ANCHOR;
            src = anchor_end;
        }
     /* If back reference present, test validity. have_bref() only succeeds
      * for numbers below nparen, so bref_num indexes an existing group */
        else if((bref_end = have_bref(src, &bref_num, rxd->nparen)) != NULL)
        {   rx_ptr->flags |= RX_BREF;
            src = bref_end;
         /* Make sure that the parenthesis corresponding to the back
          * reference has been closed: end_i is zeroed by open_paren() and
          * only set by close_paren() */
            if(!rxd->parens[bref_num].end_i)
                rx_parse_error("Backreference corresponds to unclosed parenthesis");
            rx_ptr->data.bref.num = bref_num;
        }

     /* If character set, parse it */
        else if(*src == '[' || CharSet_Abbrev(src))
        {   rx_ptr->flags  |= RX_SET;
            src             = CharSet_init(&rx_ptr->data.set, src);
            rx_ptr->max     = 1;
        }
     /* Otherwise a literal string is present, literal string will be ended
      * by non-escaped quantifier, character set or other regex symbol */
        else
            src = read_string(rx_ptr, src, rxd->nparen);

     /* After parsing quantifier following regexp atom, calculate the
      * maximum & minimum length of the atom and add to the max & min
      * length of the current parentheses. Parenthesis and alternation
      * atoms are presumably excluded by Rx_is_data()/Rx_is_anchor(); their
      * lengths are accounted for in close_paren() and mark_alt() */
        if(Rx_is_data(rx_ptr) || Rx_is_anchor(rx_ptr))
        {   Paren *open_par = &rxd->parens[rxd->open_par_i];
            src     = parse_quant(src, &rx_ptr->min, &rx_ptr->max);
            open_par->curr_max = add_truncate(open_par->curr_max,
                                              Rx_max_len(rx, rx_ptr));
            open_par->curr_min = add_truncate(open_par->curr_min,
                                              Rx_min_len(rx, rx_ptr));

         /* If anchor is optional (0 minimum occurrences), it can be dropped
          * from the regular expression so overwrite the anchor; any other
          * quantifier on an anchor is reduced to exactly one occurrence.
          * rx_ptr is not used again after Rx_drop() */
            if(Rx_is_anchor(rx_ptr))
            {   if(!rx_ptr->min)
                    Rx_drop(rx);
                else
                    rx_ptr->min = rx_ptr->max = 1;
            }
        }
    }
 /* Any group still open at the end of the pattern is an error */
    if(rxd->paren_level > 0)
        rx_parse_error("Unclosed (");

 /* Close the group that is still open. *src is '\0' here, so close_paren()
  * takes its no-quantifier branch and gives the group min = max = 1 */
    close_paren(rx, src);
    rxd->nparse = rx->len;

 /* If error occurs, clean up and return -1 */
    if(NErrors)
    {   int rx_i;
     /* Release the character sets built for RX_SET atoms */
        for(rx_i = 0; rx_i < rx->len; ++rx_i)
        {   RxAtom *rx_ptr = &rx->start[rx_i];
            if(Rx_type(rx_ptr) == RX_SET)
                CharSet_kill(&rx_ptr->data.set);
        }
     /* NOTE(review): rx->start is cleared here without being freed in this
      * function - confirm who owns the atom array */
        rx->start = NULL;
        rx->len   = rxd->nparse = 0;
     /* Reset the counter so that the next parse starts clean */
        NErrors   = 0;
        return -1;
    }
    return rxd->parens[0].curr_max;
}

/* Report a parse error: hand msg to input_error() and count the error in
 * NErrors so that Rx_parse() can tell afterwards that parsing failed */
static void rx_parse_error(const char far *msg)
{
    input_error(msg);
    NErrors += 1;
}

/* Read a run of literal characters starting at src, stopping at the end of
 * the pattern or at the first regular expression symbol. Escapes are decoded
 * with esc_to_ch() and all other characters go through low_casE(); the
 * result is written back over the pattern text beginning at the first
 * character of the run.
 *
 * rx_ptr - atom to fill in: RX_CH with data.ch when exactly one character
 *          was stored, RX_STR with data.str otherwise (including none)
 * src    - pattern text, modified in place
 * nparen - number of groups opened so far, used to recognise back references
 *
 * Returns the first character that was not consumed */
static char *read_string(RxAtom *rx_ptr, char *src, int nparen)
{
    char *const text = src;     /* first byte of the literal */
    char *out        = src;     /* next byte to be written */
    int unused_op, unused_bref; /* outputs of the look-ahead tests, ignored */

 /* A quantifier here has nothing to apply to; report it and step over it */
    if(is_quant_char(*src))
    {   rx_parse_error("+*?{} follows nothing");
        ++src;
    }

    for(;;)
    {   char ch = *src;
        char *next;

     /* Anything that begins another kind of atom ends the literal */
        if(ch == '\0' || CharSet_Abbrev(src) || ch == '(' || ch == ')'
           || ch == '[' || is_quant_char(ch) || have_anchor(src, &unused_op)
           || have_bref(src, &unused_bref, nparen) || ch == '|')
            break;

     /* Decode an escape sequence, otherwise fold the case of the char */
        if(ch == '\\')
            next = esc_to_ch(&ch, src);
        else
        {   ch   = low_casE(ch);
            next = src + 1;
        }

     /* A quantifier applies only to the character right before it, so
      * when one follows, that character is left for the next atom unless
      * it is the first one of this literal */
        if(src != text && is_quant_char(*next))
            break;

        *out++ = ch;
        src    = next;
    }

 /* Exactly one stored character is treated as a single char atom */
    if(out - text == 1)
    {   rx_ptr->flags  |= RX_CH;
        rx_ptr->data.ch = *text;
    }
    else
    {   rx_ptr->flags         |= RX_STR;
        rx_ptr->data.str.start = text;
        rx_ptr->data.str.end   = out;
        rx_ptr->data.str.len   = out - text;
    }
    return src;
}

/* Return 1 if ch begins a quantifier (one of * + ? {), 0 otherwise */
static int is_quant_char(int ch)
{
    switch(ch)
    {   case '*':
        case '+':
        case '?':
        case '{':
            return 1;
        default:
            return 0;
    }
}

/* Test whether an anchor starts at src. Recognised forms are ^ and $ (one
 * character) and \b \B \A \Z (two characters).
 *
 * Returns a pointer just past the anchor and stores its opcode in
 * *anchor_op; returns NULL and leaves *anchor_op untouched if there is no
 * anchor at src */
static char *have_anchor(const char *src, int *anchor_op)
{
    int found = -1;     /* -1 = nothing recognised */

    switch(*src)
    {   case '^':
            found = LINE_START;
            break;
        case '$':
            found = LINE_END;
            break;
        case '\\':
            switch(src[1])
            {   case 'b': found = WORD_BREAK; break;
                case 'B': found = NON_BREAK;  break;
                case 'A': found = FILE_START; break;
                case 'Z': found = FILE_END;   break;
                default:                      break;
            }
            break;
        default:
            break;
    }
    if(found == -1)
        return NULL;

    *anchor_op = found;
 /* The line anchors take one character, the backslash forms take two */
    return (char *)src + ((found == LINE_START || found == LINE_END) ? 1 : 2);
}

/* Test whether a back reference starts at src.
 *
 * src      - pattern text to examine
 * bref_num - whenever src starts with a backslash, receives the number that
 *            was read (0 if none was); only meaningful when the function
 *            returns non-NULL
 * nparen   - number of groups opened so far; a reference must be below it
 *
 * Returns the end of the back reference, or NULL if src does not hold a
 * back reference to one of the groups 1 .. nparen - 1. A \v that is not
 * followed by two digits is reported through rx_parse_error() and is not
 * treated as a back reference */
static char *have_bref(const char *src, int *bref_num, int nparen)
{
    int num = 0;
    if(*src == '\\')
    {/* Single digit back references are just preceded by a \ ; the casts
      * keep isdigit() defined for characters with negative values */
        if(isdigit((unsigned char)src[1]) && src[1] != '0')
        {   num = src[1] - '0';
            src += 2;
        }
     /* Back references between 10 and 99 are represented by \v[num][num] */
        else if(src[1] == 'v')
        {   if(!isdigit((unsigned char)src[2]) || !isdigit((unsigned char)src[3]))
                rx_parse_error("\\v must be followed by 2 digits specifying back reference");
         /* The digits are src[2] and src[3] (src[1] is the 'v'); only step
          * over them once they are known to exist, so that src never moves
          * past the end of the pattern */
            else
            {   num = 10 * (src[2] - '0') + (src[3] - '0');
                src += 4;
            }
        }
        *bref_num = num;
    }
    return (num && num < nparen) ? (char *)src : NULL;
}

/* Error messages reported by parse_quant() for a malformed {} quantifier:
 * unclosed_rbrack when the pattern ends before the closing }, bad_brack
 * when some other character stands where the } is expected */
const char far unclosed_rbrack[] = "Missing closing }";
const char far bad_brack[] = "Undefined characters in {} quantifier";

/* Parse an optional quantifier at src.
 *
 * src     - pattern text directly after an atom or closing parenthesis
 * min_ptr - receives the minimum number of occurrences
 * max_ptr - receives the maximum number of occurrences (INT_MAX = no limit)
 *
 * Without a quantifier at src, both are set to 1 and src is returned
 * unchanged. Otherwise returns the end of the quantifier. A {} quantifier
 * that is not terminated by } is reported through rx_parse_error() */
static char *parse_quant(const char *src, int *min_ptr, int *max_ptr)
{
    int min = 1, max = 1; /* default quantifier */
    char ch = *src;

 /* * = can appear any # of times */
    if(ch == '*')
    {   min = 0;
        max = INT_MAX;
        ++src;
    }
 /* + = can appear any # of times, as long as it appears once */
    else if(ch == '+')
    {   max = INT_MAX;
        ++src;
    }
 /* ? = can appear once, optional */
    else if(ch == '?')
    {   min = 0;
        ++src;
    }
 /* {m} = must appear exactly m times, {m,} = must appear m or more times
  * {m,n} can appear between m and n times inclusive */
    else if(ch == '{')
    {   int nread = 0;
     /* sscanf() returns EOF (non-zero) when the text runs out before a
      * number is found, so success must be tested as == 1 */
        if(sscanf(++src, " %d %n", &min, &nread) == 1)
        {   src += nread;
            if(*src == ',')
            {/* Reset the count so that a failed conversion can never make
              * src advance by the length of the previous number */
                nread = 0;
                if(sscanf(++src, " %d %n", &max, &nread) == 1)
                    src += nread;
                else
                {   src = skip_ws(src);
                    max = INT_MAX;
                }
            }
            else
                max = min;
        }
        if(!*src)
            rx_parse_error(unclosed_rbrack);
        else if(*src++ != '}')
            rx_parse_error(bad_brack);
    }
    *min_ptr = min;
    *max_ptr = max;
    return (char *)src;
}

/* After ( encountered, set up a parentheses data structure: append a new
 * Paren to rx->data->parens (growing the array 16 entries at a time), make
 * it the currently open group, raise the nesting level by one and record
 * the index of the group that encloses it */
static void open_paren(RegExp *rx)
{
    RxData *rxd = rx->data;
    Paren *paren;
 /* Allocate some more paren structures if none available */
    if(rxd->nparen == rxd->par_alloc)
    {   rxd->par_alloc += 16;
        rxd->parens     = x_realloc(rxd->parens, rxd->par_alloc * sizeof(Paren));
    }
    paren            = &rxd->parens[rxd->nparen];
    rxd->open_par_i  = rxd->nparen;
    memset(paren, 0, sizeof(Paren));
 /* Start at INT_MAX so that the first alternative's length always lowers it */
    paren->min_len   = INT_MAX;
    paren->start_i   = !rxd->nparen ? 0 : rx->len - 1;
    paren->level     = ++rxd->paren_level;
    paren->enter_i   = paren->store_i     = paren->first_alt_i = -1;
    paren->fail_i    = paren->backtrack_i[0] = paren->backtrack_i[1] = -1;

 /* Determine the index of the enclosing parenthesis: the nearest earlier
  * group that is one level up. After an unmatched ) the nesting level has
  * already been lowered and no such group may exist, so the search stops
  * at index 0 instead of running off the front of the array */
    if(rxd->nparen)
    {   int enclose_i = rxd->nparen - 1;
        while(enclose_i > 0
              && rxd->parens[enclose_i].level != rxd->paren_level - 1)
            --enclose_i;

        paren->enclose_i = enclose_i;
    }
    ++rxd->nparen;
}

/* Finish the currently open group and make its enclosing group current.
 *
 * src points either at the ) that closes the group, in which case the ) and
 * a following quantifier are consumed, or at something else (Rx_parse()
 * calls this at the end of the pattern), in which case nothing is consumed
 * and the group occurs exactly once. The group's length, multiplied by its
 * quantifier, is added to the running length of the enclosing group.
 *
 * Returns the first character that was not consumed */
static char *close_paren(RegExp *rx, const char *src)
{
    RxData *data = rx->data;
    Paren *inner = data->parens + data->open_par_i;
    Paren *outer;

 /* The enclosing group becomes the open one again */
    data->open_par_i = inner->enclose_i;
    outer            = data->parens + data->open_par_i;

 /* Make sure that ) do not outnumber ( */
    if(data->paren_level == 0 && *src != '\0')
        rx_parse_error("Unmatched )");

 /* Fold the last alternative into the group's lengths; this has to come
  * before the additions below because inner and outer can be the same */
    set_paren_len(inner);
    inner->end_i = rx->len;

    if(*src != ')')
        inner->min = inner->max = 1;
    else
    {   src = parse_quant(src + 1, &inner->min, &inner->max);
        inner->end_i      -= 1;
        data->paren_level -= 1;
    }

 /* Add the group's max/min length times its max/min repeat count to the
  * running totals of the enclosing group */
    outer->curr_max = add_truncate(outer->curr_max,
                                   mult_truncate(inner->max_len, inner->max));
    outer->curr_min = add_truncate(outer->curr_min,
                                   mult_truncate(inner->min_len, inner->min));
    return (char *)src;
}

/* Record a | in the currently open group: append an Altern entry to
 * rx->data->alts (growing the array 16 entries at a time), link it to the
 * group and fold the lengths of the alternative that just ended into the
 * group's min/max lengths */
static void mark_alt(RegExp *rx)
{
    RxData *data = rx->data;
    Paren *group = data->parens + data->open_par_i;
    Altern *entry;

 /* Allocate more alternative structures if none available */
    if(data->nalt == data->alt_alloc)
    {   data->alt_alloc += 16;
        data->alts = x_realloc(data->alts, data->alt_alloc * sizeof(Altern));
    }
    entry = data->alts + data->nalt;

 /* Count this bar within its group and number the entry accordingly */
    group->nbar     += 1;
    entry->i         = group->nbar;
    entry->paren     = data->open_par_i;
    entry->parse_i   = rx->len - 1;
    entry->encoded_i = -1;
    entry->jumped_to = FALSE;

 /* Keep the lengths of the alternative that ends here; set_paren_len()
  * resets the running totals afterwards */
    entry->max_len   = group->curr_max;
    entry->min_len   = group->curr_min;

 /* Track the first and the most recent alternative of the group */
    if(group->first_alt_i == -1)
        group->first_alt_i = data->nalt;
    group->last_alt_i = data->nalt;

    set_paren_len(group);
    data->nalt += 1;
}

/* Fold the running lengths of the alternative that just ended into the
 * group: max_len grows to curr_max if that is larger, min_len shrinks to
 * curr_min if that is smaller. Both running lengths are then reset to 0
 * for the next alternative */
static void set_paren_len(Paren *par)
{
    if(par->max_len < par->curr_max)
        par->max_len = par->curr_max;

    if(par->min_len > par->curr_min)
        par->min_len = par->curr_min;

    par->curr_max = 0;
    par->curr_min = 0;
}

