/* rxanalys.c - analyze regular expression
 * Copyright (C) 1995-99 Andrew Pipkin (minitrue@pagesz.net)
 * MiniTrue is free software released with no warranty. See COPYING for details
 *
 * Facts, shmacts! You can use facts to prove anything that's even remotely
 * true.
 */

#include <string.h>
#include <stdlib.h>

#include "minitrue.h"
#include "regexp.h"
#include "rxchars.h"

/* Forward declarations for the helpers private to this file. They are
 * grouped roughly by purpose: backtrack location and linking, overlap and
 * ambiguity tests between atoms, and rarity / backreference measurements. */
static void locate_backtrack(RegExp *rx, int rx_i, int start_i,
                             int stop_i, int end_i, int dir);
static int atoms_overlap(RxAtom *opt, RxAtom *req, int dir);
static int str_in_set(RxAtom *set_atom, RxAtom *str_atom, int dir);
static int sets_overlap(RxAtom *set_atom1, RxAtom *set_atom2);
static int strs_overlap(RxAtom *req_atom, RxAtom *opt_atom, int dir);
static int link_backtracks(RegExp *rx, int paren_i, int dir);
static int alt_backtrack(RegExp *rx, int rx_i, int paren_i,
                         int end_i, int dir);
static int alt_ambig(int paren_i, int dir, RegExp *rx);
static int is_ambig(RxAtom *rx1, int len1, RxAtom *rx2,
                    int len2, int dir, RegExp *rx);
static void atom_rarity(RxAtom *rx_atom, AtomData *atom_data);
static int alt_rarity(int paren_i, RegExp *rx);
static int case_ins_freq(unsigned char ch);
static int bref_distance(RegExp *rx, int bref_i);

/* Allocate the per-atom analysis data for a compiled regular expression,
 * compute the rarity of every atom, then locate the backtrack points for
 * both scan directions.
 *
 * rx - the expression to analyse. rx->data->atom_data (rx->len entries) and
 *      rx->data->paren_list (rx->data->nparen entries) are allocated here
 *      with x_malloc and stored in rx->data.
 *
 * Backreference atoms get their distance to the referenced group stored in
 * data.bref.dist; a required backreference at a fixed distance is given a
 * rarity of three quarters of FREQ_TOTAL, any other backreference 0.  All
 * other atoms are rated by atom_rarity(). */
void Rx_analyse(RegExp *rx)
{
    int rx_i;
    RxData *rxd = rx->data;
    RxAtom *rx_ptr;

    AtomData *atom_data = rxd->atom_data = x_malloc(sizeof(AtomData) *
                                                    rx->len);

    rxd->paren_list = x_malloc(sizeof(AtomLoc) * rxd->nparen);

    for(rx_ptr = rx->start, rx_i = 0; rx_i < rx->len; ++rx_ptr, ++rx_i)
    {   int slot;

        atom_data->nchar = -1;

     /* Give both direction slots defined values before the backtrack passes
      * run.  find_backtracks() derives the slot from the order of its two
      * indices, so for a one-atom expression (start == last) both passes
      * below resolve to slot 0 and slot 1 would otherwise keep whatever
      * x_malloc returned. */
        for(slot = 0; slot < 2; ++slot)
        {   atom_data->ngive_up[slot]    = 0;
            atom_data->nbacktrack[slot]  = 0;
            atom_data->backtrack_i[slot] = -1;
        }
        atom_data->next_alt_i = -1;

        if(rx_ptr->flags & RX_BREF)
        {   rx_ptr->data.bref.dist = bref_distance(rx, rx_i);
            if(rx_ptr->min && rx_ptr->data.bref.dist != -1)
                atom_data->rarity = (FREQ_TOTAL / 4) * 3;
            else
                atom_data->rarity = 0;
        }
        else
            atom_rarity(rx_ptr, atom_data);
        atom_data++;
    }

 /* With no atoms the passes would index atom 0 and atom -1, so skip them */
    if(rx->len > 0)
    {   find_backtracks(0, rx->len - 1, rx);
        find_backtracks(rx->len - 1, 0, rx);
    }
}

/* Walk the atoms from start_i to last_i (inclusive) and record where
 * backtracking can happen for that scan direction.  The direction is taken
 * from the order of the two indices: ascending uses dir 1 and slot 0 of the
 * per-direction arrays, descending uses dir -1 and slot 1.
 *
 * For every required data or anchor atom, locate_backtrack() searches the
 * atoms already passed for something it may backtrack into.  When the scan
 * leaves a parenthesis that is quantified or holds an alternation, the
 * ambiguity of the alternatives is recorded and the backtracks pointing
 * outside the parenthesis are linked together.
 *
 * NOTE(review): when start_i == last_i, dir evaluates to -1 while rev
 * evaluates to 0, so the two values disagree for a single-atom range. */
void find_backtracks(int start_i, int last_i, RegExp *rx)
{
    int dir = (last_i > start_i) ? 1 : -1;
    int rev = (last_i < start_i);
    int rx_i;

    for(rx_i = start_i; (last_i - rx_i) * dir >= 0; rx_i += dir)
    {
        RxAtom *atom   = &rx->start[rx_i];
        AtomData *info = &rx->data->atom_data[rx_i];

        /* Clear this direction's bookkeeping before looking at the atom */
        info->nbacktrack[rev]  = 0;
        info->ngive_up[rev]    = 0;
        info->backtrack_i[rev] = info->next_alt_i = -1;

        if(Rx_is_data(atom) || Rx_is_anchor(atom))
        {
            /* Only an atom that must match can force a backtrack */
            if(atom->min)
                locate_backtrack(rx, rx_i, rx_i - dir, start_i, last_i, dir);
        }
        else if(Rx_is_paren(atom))
        {
            int paren_i  = atom->fail.paren_i;
            Paren *paren = &rx->data->parens[paren_i];
            int leaving;

            /* Plain grouping parentheses need no extra work */
            if(!Rx_paren_quant(paren) && !paren->nbar)
                continue;

            /* Act only on the parenthesis atom through which the scan
             * leaves the group, and only if the whole group lies inside
             * the range being scanned */
            if(dir == 1)
                leaving = Rx_is_rparen(atom) && paren->start_i >= start_i;
            else
                leaving = Rx_is_lparen(atom) && paren->end_i <= start_i;

            if(!leaving)
                continue;

            /* For an alternation, note whether its alternatives are
             * ambiguous; a repeatable ambiguous alternation is flagged as
             * backtracked */
            if(paren->nbar)
            {
                paren->alt_ambig[rev]  = alt_ambig(paren_i, dir, rx);
                paren->alt_btrack[rev] = FALSE;
                if(paren->max >= 2 && paren->alt_ambig[rev])
                {
                    paren->alt_btrack[rev]  = TRUE;
                    paren->backtracked[rev] = TRUE;
                }
            }

            /* Chain the backtracks that leave the parenthesis; the nearest
             * one becomes the backtrack of the group and of this atom */
            info->backtrack_i[rev]  = link_backtracks(rx, paren_i, dir);
            paren->backtrack_i[rev] = info->backtrack_i[rev];
        }
    }
#ifdef RX_ANALYS
    for(rx_i = start_i; rx_i != last_i + dir; rx_i += dir)
    {   RegExp_print_atom(rx_i, rx);
        printf(" %d %d %d %d %d\n", rx->data->atom_data[rx_i].rarity,
               rx->data->atom_data[rx_i].backtrack_i[0],
               rx->data->atom_data[rx_i].backtrack_i[1],
               rx->data->atom_data[rx_i].nbacktrack[0],
               rx->data->atom_data[rx_i].nbacktrack[1]);
    }
#endif

}

/* Try to find atoms between start_i and stop_i which overlap with the
 * required atom at rx_i, and record them as places the matcher can
 * backtrack into.
 *
 * rx      - expression being analysed
 * rx_i    - index of the required atom looking for backtrack sources
 * start_i - index where the search begins; the search steps by -dir
 * stop_i  - last index examined (inclusive)
 * end_i   - far end of the range under analysis; bounds the atoms following
 *           a parenthesis in the ambiguity test and is handed on to
 *           alt_backtrack()
 * dir     - 1 for the forward scan, -1 for the reverse scan; selects slot
 *           (dir == -1) of the per-direction arrays
 *
 * Each overlapping optional atom found is chained on: the previous link's
 * backtrack_i is pointed at it, the overlap size is stored in the optional
 * atom's ngive_up and in the previous link's nbacktrack, and
 * rx->data->nbacktrack is incremented. */
static void locate_backtrack(RegExp *rx, int rx_i, int start_i,
                             int stop_i, int end_i, int dir)
{
    int dest_i = rx_i, src_i, rev = (dir == -1);
    RxAtom *rx_ptr = &rx->start[rx_i];
    RxAtom anchor_atom;

 /* If data is a backreference, use the first non-optional item of the
  * referenced parentheses */
    if(rx_ptr->flags & RX_BREF)
    {   Paren *bref_par = &rx->data->parens[rx_ptr->data.bref.num];
        for(rx_ptr = &rx->start[bref_par->start_i];
            rx_ptr < &rx->start[bref_par->end_i]; ++rx_ptr)
        {   if(rx_ptr->min && Rx_type(rx_ptr))
                break;
        }
    }
 /* If atom is zero-length anchor, use zero-length string which will
  *   result in all possible backtracks being tried */
 /*^ Should not need to try all backtracks for anchors with better
  *^  analysis routine */
    if(Rx_is_anchor(rx_ptr))
    {/* Backtracking won't work if anchor is at start or end of file */
        if(rx_ptr->opcode == FILE_START || rx_ptr->opcode == FILE_END)
            return;

     /* Zero the stand-in first so the overlap helpers never read fields
      * (such as the string pointer) that were left unset */
        memset(&anchor_atom, 0, sizeof anchor_atom);
        anchor_atom.flags        = RX_STR;
        anchor_atom.data.str.len = 0;
        rx_ptr = &anchor_atom;
    }
    for(src_i = start_i; (src_i - stop_i) * dir >= 0; src_i -= dir)
    {   RxAtom *src_ptr     = &rx->start[src_i];
        int src_paren_i     = src_ptr->fail.paren_i;
        Paren *src_paren    = &rx->data->parens[ src_paren_i ];
        AtomData *src_data  = &rx->data->atom_data[src_i];
        AtomData *dest_data = &rx->data->atom_data[dest_i];

     /* An atom with a variable repeat count can give characters back */
        if(src_ptr->min != src_ptr->max)
        {   int noverlap = atoms_overlap(src_ptr, rx_ptr, dir);
            if(noverlap)
            {   src_data->ngive_up[rev]     = noverlap;
                dest_data->nbacktrack[rev]  = noverlap;
                dest_data->backtrack_i[rev] = dest_i = src_i;
                ++rx->data->nbacktrack;
            }
        }
     /* If in alternation, and end of alternative reached, skip to the
      * start or end of the alternation containing the paren */
        if(Rx_is_alt(src_ptr))
            src_i = Rx_exit_i(src_paren, -dir);

     /* See if backtracking through an alternative will be necessary.
      * Backtracking will be required if the alternatives are ambiguous
      * or an optional item in an alternative contains the current
      * atom */
        else if(src_i == Rx_exit_i(src_paren, dir) && src_paren->nbar)
        {   if(   alt_backtrack(rx, rx_i, src_paren_i, end_i, dir)
               || src_paren->alt_ambig[rev])
            {   Paren *qpar = Rx_closest_quant(rx, src_paren);
                src_paren->alt_btrack[rev]  = TRUE;
                dest_data->backtrack_i[rev] = dest_i = src_i;
                dest_data->nbacktrack[rev]  = INT_MAX;

             /* If ambiguous alternation in quantified paren which does
              * not include backtracker, need to backtrack by dropping
              * parentheses */
                while(qpar && !Rx_in_paren(rx, rx_i, qpar))
                {   qpar->backtracked[rev] = TRUE;
                    qpar = Rx_closest_quant(rx, Rx_outer_paren(rx, qpar));
                }
            }
            break;
        }
     /* If at start of optional quantified parentheses, see if backtracking
      *  by dropping the entire parentheses is possible */
        else if(   src_i == Rx_exit_i(src_paren, -dir)
                && src_paren->min != src_paren->max
                && !Rx_in_paren(rx, rx_i, src_paren))
        {
         /* If the parentheses contents and the atoms following the
          * parentheses are ambiguous, backtrack by dropping the contents
          * of the parentheses */
            int par_end_i = Rx_exit_i(src_paren, dir) + dir;
            if(is_ambig(src_ptr + dir,
                        src_paren->end_i - src_paren->start_i - 1,
                        &rx->start[ par_end_i ],
                        abs(end_i - par_end_i) + 1, dir, rx))
            {   src_data->ngive_up[rev] = 1;
                if(dest_i != Rx_exit_i(src_paren, dir))
                {   dest_data->nbacktrack[rev]  = INT_MAX;
                    dest_data->backtrack_i[rev] = src_i;
                }
                dest_i = src_i;
                src_paren->backtracked[rev] = TRUE;
            }
         /* If backtracked into paren not optional, go to entrance of
          * nearest quantified paren unless paren has a backtrack */
            if(src_paren->min)
            {   if(src_data->backtrack_i[rev] != -1)
                    src_i = src_data->backtrack_i[rev];
                else
                {   Paren *enclose = &rx->data->parens[src_paren->enclose_i];
                    Paren *inner_quant = Rx_closest_quant(rx, enclose);
                    if(inner_quant && !Rx_in_paren(rx, rx_i, inner_quant))
                        src_i = Rx_exit_i(inner_quant, -dir) + dir;
                    else
                        break;
                }
            }
        }

     /* If required atom encountered, stop unless atom has backtracks.
      * If atom is backtracked, continue from its backtrack atom (the
      * loop step then moves onto it) */
        if(Rx_type(src_ptr) && src_ptr->min)
        {   int bt_i = src_data->backtrack_i[rev];
            if(   bt_i != -1
               && (   (dir == 1 && bt_i >= stop_i)
                   || (dir == -1 && bt_i <= stop_i)))
                src_i = src_data->backtrack_i[rev] + dir;
            else
            {   Paren *inner_quant = Rx_closest_quant(rx, src_paren);
                if(inner_quant && !Rx_in_paren(rx, rx_i, inner_quant))
                    src_i = Rx_exit_i(inner_quant, -dir) + dir;
                else
                    break;
            }
        }
    }
}

/* Gather the backtracks of the atoms strictly inside parentheses paren_i
 * that point outside the parentheses and thread them into one chain through
 * the backtrack_i fields of the target atoms, ordered along dir.  Each
 * inner atom whose backtrack was absorbed has its own backtrack_i reset
 * to -1.  The scan stops early if a target equal to the current head of the
 * chain is met again.
 *
 * Returns the index at the head of the chain, or -1 if no atom inside the
 * parentheses backtracks to the outside. */
static int link_backtracks(RegExp *rx, int paren_i, int dir)
{
    Paren *par      = &rx->data->parens[paren_i];
    AtomData *atoms = rx->data->atom_data;
    int rev         = (dir == -1);
    int head_i      = -1;
    int atom_i;

    for(atom_i = par->start_i + 1; atom_i < par->end_i; ++atom_i)
    {
        AtomData *cur = &atoms[atom_i];
        int target_i  = cur->backtrack_i[rev];

        /* Ignore atoms with no backtrack or one that stays inside */
        if(target_i == -1)
            continue;
        if(target_i >= par->start_i && target_i <= par->end_i)
            continue;

        if(head_i == -1)
        {
            /* First external backtrack starts the chain */
            head_i = target_i;
        }
        else if(dir * (target_i - head_i) > 0)
        {
            /* Further along dir than the head: becomes the new head */
            atoms[target_i].backtrack_i[rev] = head_i;
            head_i = target_i;
        }
        else if(target_i == head_i)
        {
            break;
        }
        else
        {
            /* Walk the chain to find the place where target_i belongs */
            AtomData *node = &atoms[head_i];
            for( ; ; )
            {
                int next_i = node->backtrack_i[rev];

                if(next_i == target_i)
                    break;                      /* already linked */
                if(next_i == -1)
                {
                    node->backtrack_i[rev] = target_i;   /* append */
                    break;
                }
                if(dir * (target_i - next_i) > 0)
                {
                    /* Splice in between node and its successor */
                    node->backtrack_i[rev]           = target_i;
                    atoms[target_i].backtrack_i[rev] = next_i;
                    break;
                }
                node = &atoms[next_i];
            }
        }
        cur->backtrack_i[rev] = -1;
    }
    return head_i;
}

/* Check each alternative of parentheses paren_i in turn to see whether the
 * atom at rx_i can backtrack into it.  An alternative runs from the current
 * start up to the next parenthesis or alternation atom belonging to
 * paren_i; locate_backtrack() is run over that span.  When it finds a
 * backtrack, the result is moved from the atom at rx_i onto the atom that
 * terminates the alternative (unless that atom already holds a backtrack
 * which rx_i has not yet passed in direction dir) and rx_i's own
 * backtrack is cleared again.
 *
 * Returns the number of alternatives that can be backtracked into. */
static int alt_backtrack(RegExp *rx, int rx_i, int paren_i,
                         int last_i, int dir)
{
    Paren *paren    = &rx->data->parens[paren_i];
    AtomData *atoms = rx->data->atom_data;
    AtomData *req   = &atoms[rx_i];
    int rev         = (dir == -1);
    int branch_i    = Rx_exit_i(paren, -dir) + dir;
    int found       = 0;
    int n;

    for(n = 0; n <= paren->nbar; ++n)
    {
        int edge_i = branch_i;

        /* Advance to the atom closing this alternative: a parenthesis or
         * alternation atom that belongs to paren_i itself */
        while(!(   (   Rx_is_paren(&rx->start[edge_i])
                    || Rx_is_alt(&rx->start[edge_i]))
                && rx->start[edge_i].fail.paren_i == paren_i))
            edge_i += dir;

        locate_backtrack(rx, rx_i, edge_i - dir, branch_i, last_i, dir);

        if(req->backtrack_i[rev] != -1)
        {
            AtomData *edge = &atoms[edge_i];
            int prev_i     = edge->backtrack_i[rev];

            ++found;
            if(   prev_i == -1
               || (dir == 1 && rx_i > prev_i)
               || (dir == -1 && rx_i < prev_i))
                edge->backtrack_i[rev] = req->backtrack_i[rev];

            req->backtrack_i[rev] = -1;
        }
        branch_i = edge_i + dir;
    }
    return found;
}

/* Decide whether required atom req can back/fortrack into optional atom
 * opt.  dir is 1 when testing for backtracking and -1 when testing for
 * fortracking; it is passed on to the string comparisons.
 *
 * Returns 0 when either atom has no Rx_type() or nothing overlaps.
 * Otherwise:
 *   set vs set       - INT_MAX if the two sets share at least one byte
 *   set vs string    - the result of str_in_set() for the string atom
 *   string vs string - the result of strs_overlap(req, opt, dir) */
static int atoms_overlap(RxAtom *opt, RxAtom *req, int dir)
{
    int opt_type = Rx_type(opt);
    int req_type = Rx_type(req);
    int opt_is_set, req_is_set;

    if(!opt_type || !req_type)
        return 0;

    opt_is_set = (opt_type == RX_SET);
    req_is_set = (req_type == RX_SET);

    if(opt_is_set && req_is_set)
        return sets_overlap(opt, req) ? INT_MAX : 0;

    if(opt_is_set)
        return str_in_set(opt, req, dir);

    if(req_is_set)
        return str_in_set(req, opt, dir);

    return strs_overlap(req, opt, dir);
}

/* Count the leading characters of a string atom that belong to a character
 * set.
 *
 * set_atom - atom whose data.set is the character set to test against
 * str_atom - atom whose text is obtained with Rx_atom_str()
 * dir      - 1 to examine the string from its start, -1 to examine it from
 *            its end
 *
 * Returns the number of characters examined before the first one that is
 * not in the set, or INT_MAX if every character (including the case of an
 * empty string) is in the set. */
static int str_in_set(RxAtom *set_atom, RxAtom *str_atom, int dir)
{
    int str_len, nsame;
    const char *str = Rx_atom_str(str_atom, &str_len);

    for(nsame = 0; nsame < str_len; ++nsame)
    {/* Index from the far end when going backwards; this never forms a
      * pointer outside the string */
        int pos = (dir == -1) ? str_len - 1 - nsame : nsame;

     /* Convert through unsigned char so bytes above 0x7F are not
      * sign-extended into a negative set index */
        int ch  = (unsigned char)str[pos];

        if(!CharSet_iN(set_atom->data.set, ch))
            return nsame;
    }
    return INT_MAX;
}

/* Count how many byte values, out of 0 through UCHAR_MAX, are members of
 * both atoms' character sets. */
static int sets_overlap(RxAtom *set_atom1, RxAtom *set_atom2)
{
    int shared = 0;
    int byte   = 0;

    while(byte <= UCHAR_MAX)
    {
        if(CharSet_iN(set_atom1->data.set, byte))
        {
            if(CharSet_iN(set_atom2->data.set, byte))
                ++shared;
        }
        ++byte;
    }
    return shared;
}

/* Measure how much of the required string can be covered by the optional
 * string, comparing case-insensitively with strn_cmp_ci().
 *
 * req_atom - required atom; its text is obtained with Rx_atom_str()
 * opt_atom - optional atom; its text is obtained with Rx_atom_str()
 * dir      - 1 to line the strings up at their starts, -1 at their ends
 *
 * If the required string is no longer than the optional one, returns
 * INT_MAX when it matches the corresponding end of the optional string and
 * 0 otherwise.  If it is longer, it is compared against repeated copies of
 * the optional string, stepping in direction dir; the result is the number
 * of characters covered by whole matching copies, or INT_MAX when the
 * copies cover the required string exactly. */
static int strs_overlap(RxAtom *req_atom, RxAtom *opt_atom, int dir)
{
    int req_len, opt_len;
    const char *req_str = Rx_atom_str(req_atom, &req_len);
    const char *opt_str = Rx_atom_str(opt_atom, &opt_len);

    if(req_len <= opt_len)
    {   int opt_i = (dir == -1) ? opt_len - req_len : 0;
        return strn_cmp_ci(req_str, &opt_str[opt_i], req_len) ? INT_MAX : 0;
    }
    else
    {   int overlap_len = 0, req_i = (dir == -1) ? req_len - opt_len : 0;

     /* An empty optional string covers nothing, and stepping by zero
      * would never leave the loop below */
        if(opt_len <= 0)
            return 0;

        while(strn_cmp_ci(&req_str[req_i], opt_str, opt_len))
        {   req_i       += dir * opt_len;
            overlap_len += opt_len;

         /* Stop once no whole copy is left to compare.  Going backwards,
          * req_i is the start of the next copy, so a copy remains while
          * req_i >= 0; this mirrors the forward test and lets the copy at
          * offset 0 be examined. */
            if(   (dir == 1 && req_len - req_i < opt_len)
               || (dir == -1 && req_i < 0))
                break;
        }
        return (overlap_len == req_len) ? INT_MAX : overlap_len;
    }
}

/* Compare every pair of alternatives inside parentheses paren_i with
 * is_ambig().  For dir 1 the alternatives are compared from their first
 * atoms, for dir -1 from their last atoms.
 *
 * Returns TRUE as soon as one pair is reported ambiguous, FALSE if no pair
 * is. */
static int alt_ambig(int paren_i, int dir, RegExp *rx)
{
    Paren *paren = &rx->data->parens[paren_i];
    int left_i   = paren->start_i + 1;   /* first atom of left alternative */

    for( ; ; )
    {
        /* bar_i is the alternation atom ending the left alternative, or
         * -1 when the left alternative is the last one */
        int bar_i    = Rx_next_alt_i(left_i, paren_i, 1, rx);
        int left_len = bar_i - left_i;
        RxAtom *left = &rx->start[left_i];

        /* Pair the left alternative with each alternative after it */
        while(bar_i != -1)
        {
            int right_i     = bar_i + 1;
            RxAtom *right   = &rx->start[right_i];
            int next_bar_i  = Rx_next_alt_i(right_i, paren_i, 1, rx);
            int right_end_i = (next_bar_i == -1) ? paren->end_i : next_bar_i;
            int right_len   = right_end_i - right_i;
            int ambiguous   = FALSE;

            if(dir == 1)
                ambiguous = is_ambig(left, left_len, right, right_len,
                                     dir, rx);
            else if(dir == -1)
                ambiguous = is_ambig(left + left_len - 1, left_len,
                                     right + right_len - 1, right_len,
                                     dir, rx);
            if(ambiguous)
                return TRUE;

            bar_i = next_bar_i;
        }

        /* Move on to the alternative after the next alternation atom */
        left_i = Rx_next_alt_i(left_i, paren_i, 1, rx);
        if(left_i == -1)
            break;
        ++left_i;
    }
    return FALSE;
}

/* Decide whether two runs of atoms can match text that begins (dir 1) or
 * ends (dir -1) the same way.
 *
 * start1, len1 - first atom and atom count of the first run
 * start2, len2 - first atom and atom count of the second run
 * dir          - passed to atoms_overlap() and RxChars_init()
 * rx           - expression the atoms belong to
 *
 * Returns TRUE when either run is empty, or when at every position stepped
 * through by RxChars_next() the two runs have at least one character in
 * common until one of them is exhausted.  Returns FALSE when both initial
 * atoms are required data atoms that do not overlap, or when some position
 * has no character in common. */
static int is_ambig(RxAtom *start1, int len1, RxAtom *start2, int len2,
                    int dir, RegExp *rx)
{
    char chars1[NCHAR], chars2[NCHAR];
    RxChars walker1, walker2;

    /* An empty run is treated as ambiguous with anything */
    if(len1 == 0 || len2 == 0)
        return TRUE;

    /* Cheap rejection: two required leading atoms that cannot overlap */
    if(Rx_type(start1) && start1->min && Rx_type(start2) && start2->min)
    {
        if(!atoms_overlap(start1, start2, dir))
            return FALSE;
    }

    /* Step both runs position by position, comparing the characters each
     * one allows at that position */
    RxChars_init(&walker1, start1, len1, dir, rx);
    RxChars_init(&walker2, start2, len2, dir, rx);

    for( ; ; )
    {
        int ch_i;
        int shared = FALSE;

        if(!RxChars_next(&walker1, chars1))
            break;
        if(!RxChars_next(&walker2, chars2))
            break;

        for(ch_i = 0; ch_i < NCHAR; ++ch_i)
        {
            if(chars1[ch_i] & chars2[ch_i])
            {
                shared = TRUE;
                break;
            }
        }
        if(!shared)
            return FALSE;
    }
    return TRUE;
}

/* Compute the rarity of one atom and store it in atom_data->rarity.  For a
 * character set, atom_data->nchar is also set to the number of characters
 * in the set.
 *
 * rx_atom   - atom to rate
 * atom_data - analysis record belonging to rx_atom
 *
 * The starting value is default_skip * FREQ_TOTAL, where default_skip is
 * the atom's minimum length (atom length times min) capped at MAX_SKIP;
 * character frequencies are then subtracted according to the atom type. */
static void atom_rarity(RxAtom *rx_atom, AtomData *atom_data)
{
    int rx_type      = Rx_type(rx_atom);
    int min_len      = mult_truncate(Rx_atom_len(rx_atom), rx_atom->min);
    int default_skip = miN((int)MAX_SKIP, min_len);
    int rarity       = default_skip * FREQ_TOTAL;
    int ch_i;

    switch(rx_type)
    { case RX_CH:
        {   int ch = rx_atom->data.ch;
            rarity -= case_ins_freq(ch) * default_skip;
         /* If the char has no distinct upper/lower case form, boost the
          * rarity because it can be looked for with memchr */
            if(ch == low_casE(ch) && ch == up_casE(ch))
                rarity = (rarity / 2) * 3;
        }
        break;

      case RX_SET:
        {   rarity = FREQ_TOTAL;
            atom_data->nchar = 0;
            for(ch_i = 0; ch_i < NCHAR; ++ch_i)
            {   if(CharSet_iN(rx_atom->data.set, ch_i))
                {   ++atom_data->nchar;
                    rarity   -= char_freq(ch_i);
                }
            }
         /* A set whose rarity is above half of FREQ_TOTAL is scaled by
          * the skip length.  A more common set keeps the rarity of a
          * single occurrence, to offset the cost of all the backtracking
          * that will be needed, or 0 if the set is entirely optional */
            if(rarity > FREQ_TOTAL/2)
                rarity *= default_skip;
            else if(!default_skip)
                rarity = 0;
        }
        break;

      case RX_STR:
        {   int str_len = rx_atom->data.str.len;
            int chs_used[NCHAR], str_i;
            memset(chs_used, 0, sizeof chs_used);
         /* Work back from the end of the string; each character not yet
          * seen is charged for the skip distance it takes away */
            for(str_i = str_len - 1; str_i >= 0; --str_i)
            {/* Go through unsigned char so a byte above 0x7F cannot turn
              * into a negative index into chs_used[] */
                int ch       = (unsigned char)rx_atom->data.str.start[str_i];
                int new_skip = str_len - str_i - 1;
                if(new_skip > default_skip)
                    break;
                if(!chs_used[ch])
                {   rarity -= case_ins_freq(ch) * (default_skip - new_skip);
                    chs_used[ch] = TRUE;
                }
            }
        }
        break;

      default:
        if(rx_atom->flags & RX_ANCHOR)
        {   int anch_op = rx_atom->opcode;
         /* NOTE(review): UINT_MAX is stored into an int here; find_rarest
          * reads the rarity back as unsigned - confirm this round trip is
          * intended */
            if(anch_op == FILE_START || anch_op == FILE_END)
                rarity = UINT_MAX;
            else if(anch_op == LINE_START || anch_op == LINE_END)
                rarity = FREQ_TOTAL - 2 * char_freq(NL);
            else
                rarity = FREQ_TOTAL / 3;
        }
        else
            rarity = 0;
        break;
    }
    atom_data->rarity = rarity;
}

/* Return the frequency of a character counted without regard to case: if
 * the lower-case and upper-case forms of ch differ, the frequencies of both
 * forms are added together, otherwise the single frequency is returned. */
static int case_ins_freq(unsigned char ch)
{
    unsigned char lower = low_casE(ch);
    unsigned char upper = up_casE(ch);
    int freq            = char_freq(lower);

    if(lower == upper)
        return freq;

    freq += char_freq(upper);
    return freq;
}

/* Look for the rarest atom in the range start_i up to, but not including,
 * end_i.
 *
 * rx           - expression whose atom_data has already been filled in
 * start_i      - first index examined
 * end_i        - index at which the search stops
 * rarest_i_ptr - receives the index of the rarest atom, or -1 if no atom
 *                had a rarity above 0
 *
 * An atom that has a forward backtrack is rated at two thirds of its
 * rarity.  Parentheses with a minimum count of 0 are skipped entirely;
 * required parentheses holding an alternation are rated as a whole by
 * alt_rarity() and credited to the parse index of their first alternative.
 *
 * Returns the highest rarity found.  Two early exits leave *rarest_i_ptr
 * unwritten: an empty range returns 0, and an alternation atom whose
 * paren index is 0 makes the function return that atom's index.
 * NOTE(review): confirm callers expect an index rather than a rarity in
 * the second case. */
int find_rarest(RegExp *rx, int start_i, int end_i, int *rarest_i_ptr)
{
    AtomData *atom_data = rx->data->atom_data;
    unsigned best       = 0;
    int best_i          = -1;
    int cur_i, step_i;

    if(start_i == end_i)
        return 0;

    for(cur_i = start_i; cur_i < end_i; cur_i = step_i)
    {
        unsigned score = atom_data[cur_i].rarity;
        RxAtom *atom   = &rx->start[cur_i];

        step_i = cur_i + 1;

        if((atom->flags & RX_ALTERN) && !atom->fail.paren_i)
            return cur_i;

        /* Atoms that can be backtracked out of are worth less */
        if(atom_data[cur_i].backtrack_i[0] != -1)
            score = (score * 2) / 3;

        if(Rx_is_lparen(atom))
        {
            Paren *paren = &rx->data->parens[atom->fail.paren_i];

            if(!paren->min)
            {
                /* Optional group: continue after its closing paren */
                step_i = paren->end_i + 1;
            }
            else if(paren->nbar)
            {
                /* Required alternation: rate the group as one unit */
                score  = alt_rarity(atom->fail.paren_i, rx);
                cur_i  = rx->data->alts[paren->first_alt_i].parse_i;
                step_i = paren->end_i + 1;
            }
        }

        if(score > best)
        {
            best   = score;
            best_i = cur_i;
        }
    }

    *rarest_i_ptr = best_i;
    return best;
}

/* Return the rarity of the alternation inside parentheses paren_i: the sum
 * of the rarities of the alternatives divided by the square of the number
 * of alternatives.  Each alternative is rated with find_rarest().
 *
 * Returns 0 as soon as any alternative has no rarest atom or a rarity that
 * is not positive. */
static int alt_rarity(int paren_i, RegExp *rx)
{
    Paren *paren = &rx->data->parens[paren_i];
    int nalt = 0, alt_start_i = paren->start_i + 1, alt_end_i;
    int rarity_sum = 0;
    do
    {   int alt_rar;
     /* find_rarest() can return without writing its output index (for
      * instance when the alternative is empty), so start from "none"
      * for every alternative */
        int rarest_i = -1;

     /* Find the alternation atom of this paren ending the alternative,
      * or the closing paren for the last alternative */
        for(alt_end_i = alt_start_i; alt_end_i < paren->end_i ;++alt_end_i)
        {   RxAtom *atom = &rx->start[alt_end_i];
            if(atom->flags & RX_ALTERN && atom->fail.paren_i == paren_i)
                break;
        }
        alt_rar = find_rarest(rx, alt_start_i, alt_end_i, &rarest_i);

        if(rarest_i != -1 && alt_rar > 0)
            rarity_sum += alt_rar;
        else
            return 0;

        ++nalt;
        alt_start_i = alt_end_i + 1;
    }while(alt_end_i < paren->end_i);

    return rarity_sum / (nalt * nalt);
}

/* Return the number of characters between the start of the parentheses a
 * backreference refers to and the backreference itself.
 *
 * rx     - expression containing the backreference
 * bref_i - index of the backreference atom in rx->start
 *
 * The atoms from the opening of the referenced parentheses up to bref_i
 * are summed by their minimum lengths; a parenthesised group contributes
 * its min_len and is then skipped as a whole.
 *
 * Returns -1 if the distance is variable: the referenced parentheses are
 * quantified, or any group or atom on the way is quantified or has
 * differing minimum and maximum lengths. */
static int bref_distance(RegExp *rx, int bref_i)
{
    int bref_paren = rx->start[bref_i].data.bref.num;
    int rx_i = rx->data->parens[bref_paren].start_i;
    int len = 0;
    if(Rx_paren_quant(&rx->data->parens[bref_paren]))
        return -1;

    while(rx_i < bref_i)
    {   RxAtom *rx_ptr = &rx->start[rx_i];
        if(Rx_is_paren(rx_ptr))
        {   int paren_num    = rx_ptr->fail.paren_i;
            Paren *paren = &rx->data->parens[ abs(paren_num) ];

         /* A group of varying length or with a quantifier makes the
          * distance unpredictable */
            if(paren->min_len != paren->max_len || Rx_paren_quant(paren))
                return -1;

         /* Count the whole group once, at its opening paren, and jump to
          * its closing paren; add_truncate keeps the sum handled the same
          * way as for plain atoms below */
            if(Rx_is_lparen(rx_ptr))
            {   len  = add_truncate(len, paren->min_len);
                rx_i = paren->end_i;
            }
        }
        else if(Rx_atom_len(rx_ptr))
        {   int min_len = Rx_min_len(rx, rx_ptr);
            if(min_len != Rx_max_len(rx, rx_ptr))
                return -1;
            len = add_truncate(len, min_len);
        }
        ++rx_i;
    }
    return len;
}
