/* rxencode.c - generate program for regular expression
 * Copyright (C) 1995-99 Andrew Pipkin (minitrue@pagesz.net)
 * MiniTrue is free software released with no warranty. See COPYING for details
 */

#include <stdlib.h>
#include <string.h>
#include <assert.h>

#include "minitrue.h"
#include "regexp.h"

/* Forward declarations of the helpers private to this file */
static int encode_quantified_anchor(RegExp *rx, int backward_i, int forward_i,
                                    int start_i, Paren *quant_paren);
static void make_top_alt(RegExp *rx, int paren_i);
static int  encode_forward(RegExp *rx, int src_i);
static int  encode_backward(RegExp *rx, int src_i);
static void encode_atom(RegExp *rx, int rx_i);
static int encode_rarest(RegExp *rx, int rarest_i);
static void set_opcode(RxAtom *rx);
static int link_backtrack(RegExp *rx, int parse_i, int encoded_i);
static void set_backtrack(RxAtom *atom, int bt_i);
static void encode_paren(RegExp *rx, int rx_i);
static void encode_alt(RegExp *rx, int rx_i);
static int unfinished_backref(RegExp *rx, RxAtom *atom);
static void redo(RegExp *rx, int start_i, int end_i, int rarest_i);
static void clear_unused_parens(RegExp *rx, RxAtom *atom);
static void link(RegExp *rx, int start_i, int len);
static int find_fail_i(RegExp *rx, int rx_i);
static Altern *find_next_alt(RegExp *rx, int paren_i, int rx_i, int dir);
static void init_stack(RegExp *rx, RxAtom *rx_ptr);
static int have_case_sense(void);

/* Cached result of have_case_sense(). Holds -1 until Rx_encode() has asked
 * for the value the first time */
static int Case_sense = -1;

/* Generate the program for the parsed atoms start_i .. end_i - 1.
 *
 * find_rarest() selects the anchor atom (rarest_i, or -1 if there is none).
 * Code which locates the anchor is emitted first; the remaining atoms are
 * then encoded outward from the anchor, one step at a time, picking either
 * the atom after the encoded region (forward_i) or the atom before it
 * (backward_i). Afterwards the atoms appended by this call are linked and
 * redo() may append a second, purely forward copy of the program.
 *
 * The function is re-entered through make_top_alt(), which calls it once
 * for every alternative of a top level alternation */
void Rx_encode(RegExp *rx, int start_i, int end_i)
{
    RxData *rxd    = rx->data;
    AtomData *data = rxd->atom_data;
    Paren *paren;
    int forward_i  = 0, backward_i, rarest_i, orig_len = rx->len;
    find_rarest(rx, start_i, end_i, &rarest_i);

 /* Case sensitivity is queried once, on the first call */
    if(Case_sense == -1)
        Case_sense = have_case_sense();

 /* No anchor: emit a RX_ADVANCE atom, record its index as the fail index
  * of paren 0 and leave everything to be encoded forward from index 0 */
    if(rarest_i == -1)
    {   (Rx_init_atom(rx, RX_ADVANCE))->fail.i = -1;
        rx->data->parens[0].fail_i = rx->len - 1;
        backward_i = -1;
    }
    else
    {   int paren_i        = rx->start[rarest_i].fail.paren_i, clear_bt_i;
        Paren *quant_paren = Rx_quant_paren(rx, rarest_i, start_i, end_i);

     /* Clear all the backtracks which cross the rarest atom except for
      *  parentheses popping */
     /* Atoms after the anchor: drop backtracks (slot 0) which land before
      * the anchor on a non-parenthesis atom */
        for(clear_bt_i = rarest_i + 1; clear_bt_i < end_i; ++clear_bt_i)
        {   int bt_i = rx->data->atom_data[clear_bt_i].backtrack_i[0];

            if(bt_i < rarest_i && bt_i != -1 && !Rx_is_paren(&rx->start[bt_i]))
                rx->data->atom_data[clear_bt_i].backtrack_i[0] = -1;
        }
     /* Atoms before the anchor: drop fortracks (slot 1) which land on or
      * after the anchor on a non-parenthesis atom */
        for(clear_bt_i = start_i; clear_bt_i < rarest_i; ++clear_bt_i)
        {   int bt_i = rx->data->atom_data[clear_bt_i].backtrack_i[1];

            if(bt_i >= rarest_i && !Rx_is_paren(&rx->start[bt_i]))
                rx->data->atom_data[clear_bt_i].backtrack_i[1] = -1;
        }

     /* If rarest atom occurs in alternative, set up code for finding top
      * level alternatives */
        if(rx->start[rarest_i].flags & RX_ALTERN)
        {   paren      = &rxd->parens[paren_i];
            make_top_alt(rx, paren_i);
         /* The whole alternation is now encoded; continue on either side
          * of it. For paren 0 the index before start_i is used as is */
            forward_i  = paren->end_i;
            backward_i = paren->start_i - (paren_i == 0);
        }
     /* Otherwise set up code to locate rarest atom and determine fortracks
      * up to rarest atom, if atom is in quantified parentheses find fortracks
      * before start of quantified parentheses */
        else
        {   forward_i = rarest_i + encode_rarest(rx, rarest_i);
            if(rarest_i != start_i)
                find_backtracks(rarest_i - 1, start_i, rx);

            backward_i = rarest_i - 1;
        }
     /* If rarest enclosed in quantified parentheses, set up code to unroll
      *   first instance of quantified paren */
        if(quant_paren)
        {
            backward_i = encode_quantified_anchor(rx, backward_i, forward_i,
                                                  start_i, quant_paren) - 1;
            forward_i  = quant_paren->end_i + 1;
        }
    }

 /* Convert the atoms surrounding the anchor, go backwards or forwards
  * depending on the rarities of the atoms */
 /* Forward is chosen when nothing is left on the backward side, or when
  * the forward atom has the larger rarity value, is not a backreference to
  * unfinished parentheses and rxd->nbacktrack is zero */
    while(backward_i >= start_i || forward_i < end_i)
    {   if(   backward_i < start_i
           || (   forward_i < end_i
               && data[forward_i].rarity > data[backward_i].rarity
               && !unfinished_backref(rx, &rx->start[forward_i])
               && !rxd->nbacktrack))
            forward_i  = encode_forward(rx, forward_i);
        else
            backward_i = encode_backward(rx, backward_i);
    }
 /* Link only the atoms appended by this call */
    link(rx, orig_len, rx->len - orig_len);
    redo(rx, start_i, end_i, rarest_i);
}

/* Tell whether atom is a backreference whose parentheses are not completely
 * encoded yet, i.e. the enter_i or the store_i of the referenced paren is
 * still -1. Any atom which is not a backreference yields FALSE */
static int unfinished_backref(RegExp *rx, RxAtom *atom)
{
    Paren *target;

    if(!(atom->flags & RX_BREF))
        return FALSE;

    target = &rx->data->parens[atom->data.bref.num];
    return !(target->enter_i != -1 && target->store_i != -1);
}

/* If the anchor can be backtracked over, it is necessary to retry the
 * regular expression in case the anchor can be found later. When that is
 * the case (or when parentheses positions could be wrong, see below) a
 * RX_RESET atom followed by a purely forward encoding of atoms
 * start_i .. end_i - 1 is appended to the program. Nothing is done when
 * there is no anchor (rarest_i == -1) */
static void redo(RegExp *rx, int start_i, int end_i, int rarest_i)
{
    int fortrack_i, overlapped = FALSE;
    RxData *data = rx->data;
    if(rarest_i == -1)
        return;

 /* If rarest has backtracks, it is necessary to refind the regexp starting
  * at the beginning of the match because there might be another match longer
  * than the current match */
    if(rx->start[rarest_i].flags & RX_ALTERN)
    {   int paren_i  = rx->start[rarest_i].fail.paren_i;
        Paren *paren = &data->parens[paren_i];
        overlapped   = (paren->backtrack_i[0] != -1);
    }
    else
    {   AtomData *anchor_data = &data->atom_data[rarest_i];
        int bt_i = anchor_data->backtrack_i[0];
     /* Only a backtrack target inside start_i .. end_i - 1 whose
      * nbacktrack[0] has magnitude INT_MAX counts */
        if(   start_i <= bt_i && bt_i < end_i
           && abs(anchor_data->nbacktrack[0]) == INT_MAX)
            overlapped = TRUE;
    }

 /* If there are two fortracks with a left paren between them, the parentheses
  * position might be in an incorrect position, so it is necessary to refind
  * the regexp */
 /* Stage 1: walking back from the anchor, find the nearest atom with a
  * non-zero ngive_up[1] and step past it */
    for(fortrack_i = rarest_i - 1; fortrack_i >= start_i; --fortrack_i)
    {   if(data->atom_data[fortrack_i].ngive_up[1])
        {   --fortrack_i;
            break;
        }
    }
 /* Stage 2: continue back to a left parenthesis */
    for( ; fortrack_i >= start_i; --fortrack_i)
    {   if(Rx_is_lparen(&rx->start[fortrack_i]))
            break;
    }
 /* Stage 3: continue back to another atom with a non-zero ngive_up[1];
  * fortrack_i ends below start_i when any stage found nothing */
    for( ; fortrack_i >= start_i; --fortrack_i)
    {   if(data->atom_data[fortrack_i].ngive_up[1])
            break;
    }

    if(overlapped || fortrack_i >= start_i)
    {   int orig_len, rx_i;

        Paren *paren_ptr = data->parens;
     /* Reset parentheses data */
        for( ; paren_ptr < &data->parens[data->nparen]; ++paren_ptr)
        {   paren_ptr->enter_i = paren_ptr->store_i = -1;
            paren_ptr->jump_store_i  = -1;
            paren_ptr->alt_btrack[0] = paren_ptr->alt_ambig[0] = FALSE;
            paren_ptr->backtracked[0] = FALSE;
            paren_ptr->is_top_alt = FALSE;
        }
     /* RX_RESET will set match_end to match_start */
        Rx_init_atom(rx, RX_RESET);

     /* Everything appended from here on is linked as one unit below */
        orig_len = rx->len;
        rx_i     = start_i;

        find_backtracks(start_i, end_i - 1, rx);
        while(rx_i < end_i)
            rx_i = encode_forward(rx, rx_i);

        link(rx, orig_len, rx->len - orig_len);
    }
}

/* Encode quantified parentheses containing anchor, backward_i is index
 * preceding anchor, forward_i is index following anchor and *quant_paren
 * is outermost quantified parentheses enclosing anchor.
 *
 * The atoms from backward_i down to stop_i are first encoded in reverse,
 * then the complete quantified parentheses are encoded forward, with a
 * RX_JUMP leading into the forward copy at the atom forward_i.
 *
 * Returns stop_i, the lowest parse index which was encoded in reverse, or
 * -1 without emitting anything when backward_i is -1. Rx_encode() continues
 * encoding backward at the returned index minus one */
static int encode_quantified_anchor(RegExp *rx, int backward_i, int forward_i,
                                    int start_i, Paren *quant_paren)
{
    RxData *rxd      = rx->data;
    Paren *paren_ptr = &rxd->parens[1];
    int src_i, stop_i, link_loc, par_i;

    if(backward_i == -1)
        return -1;

 /* Begin encoding at atom which fortracks into the quantified paren */
 /* stop_i ends up equal to quant_paren->start_i if no such atom exists */
    for(stop_i = start_i; stop_i < quant_paren->start_i; ++stop_i)
    {   if(rx->data->atom_data[stop_i].backtrack_i[1] >= quant_paren->start_i)
            break;
    }

 /* Encode backwards to start of outermost quantified parentheses */
    for(src_i = backward_i; src_i >= stop_i; --src_i)
    {   RxAtom *src_ptr  = &rx->start[src_i];
        src_ptr->flags |= RX_REV;
        encode_atom(rx, src_i);

     /* Atoms inside the quantified paren are encoded a second time below:
      * flag them RX_ALIAS and take RX_REV off again. The pointer is taken
      * afresh after encode_atom() */
        if(src_i >= quant_paren->start_i)
        {   src_ptr         = &rx->start[src_i];
            src_ptr->flags |= RX_ALIAS;
            src_ptr->flags &= ~RX_REV;
        }
    }
    link(rx, rxd->nparse, rx->len - rxd->nparse);

 /* All encoded parentheses will be aliased. Set up instructions to copy
  * paren info to proper location */
    for(par_i = 1; par_i < rxd->nparen; ++par_i, ++paren_ptr)
    {   if(Rx_in_paren(rx, paren_ptr->start_i, quant_paren))
        {   if(paren_ptr->enter_i != -1)
            {   if(paren_ptr->store_i != -1)
                {   RxAtom *copy_loc        = Rx_init_atom(rx, COPY_PAREN);
                    copy_loc->flags         = (RX_BREF | RX_INDEX);
                    copy_loc->fail.paren_i  = paren_ptr->store_i;
                    copy_loc->data.bref.num = par_i;
                }
            }
         /* Mark the paren as not encoded so the forward pass starts it
          * from scratch */
            paren_ptr->enter_i    = paren_ptr->store_i = -1;
            paren_ptr->is_top_alt = FALSE;
        }
    }
 /* Set up link into middle of quantified paren */
 /* The jump target is filled in below once the atom at forward_i is
  * reached */
    link_loc = rx->len;
    Rx_init_atom3(rx, RX_JUMP, 0, RX_INDEX);

 /* Set backtracks in quantified paren before anchor */
    find_backtracks(quant_paren->start_i, forward_i, rx);

 /* Now encode entire quantified parentheses forward */
    for(src_i = quant_paren->start_i; src_i <= quant_paren->end_i; ++src_i)
    {   if(src_i == forward_i)
            rx->start[link_loc].fail.paren_i = rx->len;
        rx->start[src_i].flags &= ~RX_REV;
        encode_atom(rx, src_i);
    }
    return stop_i;
}

/* Set up a top-level alternation. Treat all the alternatives as if they
 * were separate expressions and set up the instructions so that the
 * nearest alternative will be chosen.
 *
 * paren_i is the index of the parentheses holding the alternation. The
 * atoms emitted first are, relative to top_alt_match_i:
 *   - 2  TOP_ALT_INIT   (owns the sub_rxs array, one SubRX per alternative)
 *   - 1  TOP_ALT_FAIL
 *   + 0  TOP_ALT_MATCH
 *   + 1  TOP_ALT_NEXT
 *   + 2  RX_JUMP        (only if paren->alt_btrack[1] is set)
 * followed by the code of every alternative, each generated by a recursive
 * call of Rx_encode() */
static void make_top_alt(RegExp *rx, int paren_i)
{
    Paren *paren = &rx->data->parens[paren_i];
    int nalt = paren->nbar + 1;
    int alt_i, top_alt_match_i, alt_start_i, alt_end_i, rx_i;
    int orig_first_alt_i = paren->first_alt_i, sub_paren_i = paren_i + 1;
    int nsub_paren = Rx_nsub_paren(rx, paren_i);

    RxAtom *atom;
    SubRX *sub_rxs;

    paren->is_top_alt = TRUE;

 /* Now set up instructions */
    atom    = Rx_init_atom(rx, TOP_ALT_INIT);
    sub_rxs = atom->data.top_alt.sub_rxs = x_malloc(nalt * sizeof(SubRX));
    atom->data.top_alt.len = nalt;
    atom                   = Rx_init_atom(rx, TOP_ALT_FAIL);
    atom->fail.i           = -1;
    top_alt_match_i        = rx->len;
    atom                   = Rx_init_atom(rx, TOP_ALT_MATCH);
    atom->max              = nsub_paren;
    atom->data.sub_parens.start_i = sub_paren_i;

 /* The closing atom of the paren maps to TOP_ALT_NEXT */
    rx->data->atom_data[paren->end_i].encoded_i = rx->len;
    atom                   = Rx_init_atom(rx, TOP_ALT_NEXT);

 /* With alt_btrack[1] set, the opening atom of the paren maps to an extra
  * RX_JUMP */
    if(paren->alt_btrack[1])
    {   rx->data->atom_data[paren->start_i].encoded_i = rx->len;
        Rx_init_atom(rx, RX_JUMP);
    }

 /* Prevent link from trying to link to the subsequent alternatives */
    paren->first_alt_i = -1;

 /* First alternative starts after the opening atom, except for paren 0
  * where it starts at paren->start_i itself */
    alt_start_i = paren->start_i + (paren_i != 0);

 /* Generate code for each alternative, treat as if each alternative a
  * standalone regular expression */
    for(alt_i = 0; alt_i < nalt; ++alt_i)
    {   int alt_sub_parens = 0;

     /* Find end of alternative, count the number of subparens in the
      * alternative */
        for(alt_end_i = alt_start_i; alt_end_i < paren->end_i ;++alt_end_i)
        {   atom = &rx->start[alt_end_i];
            if(Rx_is_lparen(atom))
                ++alt_sub_parens;
            if(Rx_is_alt(atom) && atom->fail.paren_i == paren_i)
                break;
        }
     /* Describe the alternative: where its code starts and which
      * sub-parentheses belong to it */
        sub_rxs[alt_i].code.i      = rx->len;
        sub_rxs[alt_i].nsub_paren  = alt_sub_parens;
        sub_rxs[alt_i].sub_paren_i = sub_paren_i;

        Rx_encode(rx, alt_start_i, alt_end_i);

     /* If atoms following paren can backtrack into an alternative
      * set up atom which will store the atom to backtrack into, if
      * no backtracks possible, jump will be set to try next alternative*/
        if(paren->alt_btrack[0])
        {   int bt_i = rx->data->atom_data[alt_end_i].backtrack_i[0];
            atom = Rx_init_atom(rx, SET_JUMP);
            atom->data.bref.num = paren_i;
            atom->min           = atom->max = 1;
            if(bt_i == -1)
            {   atom->fail.i = top_alt_match_i + 1;
                atom->flags |= RX_INDEX;
            }
            else
                link_backtrack(rx, alt_end_i, rx->len - 1);
        }
     /* Same for atoms preceding the paren which can fortrack into the
      * alternative. bref.num holds the negated index of the RX_JUMP
      * emitted after TOP_ALT_NEXT */
        if(paren->alt_btrack[1])
        {   int ft_i = rx->data->atom_data[alt_start_i - 1].backtrack_i[1];
            atom = Rx_init_atom3(rx, SET_JUMP, 0, RX_BREF);
            atom->data.bref.num = -(top_alt_match_i + 2);
            atom->min           = atom->max = 1;
            if(ft_i == -1)
            {   atom->fail.i = top_alt_match_i + 1;
                atom->flags |= RX_INDEX;
            }
            else
            {/* link_backtrack() reads the direction from RX_REV, so the
              * flag is set just for the duration of the call */
                rx->start[alt_start_i - 1].flags |= RX_REV;
                link_backtrack(rx, alt_start_i - 1, rx->len - 1);
                rx->start[alt_start_i - 1].flags &= ~RX_REV;
            }
        }
     /* After alternative matched, jump back so that the next alternative
      * can be matched or the closest alternative can be found & used */
        Rx_init_atom3(rx, RX_JUMP, top_alt_match_i, RX_INDEX);
        alt_start_i  = alt_end_i + 1;
        sub_paren_i += alt_sub_parens;
    }
    paren->first_alt_i = orig_first_alt_i;

 /* Convert the fail indices in the rovers to jump back to TOP_ALT_FAIL */
 /* Every atom after TOP_ALT_NEXT whose fail.i is still -1 is redirected */
    for(rx_i = top_alt_match_i + 2; rx_i < rx->len; ++rx_i)
    {   atom = &rx->start[rx_i];
        if(atom->fail.i == -1)
        {   atom->fail.i  = top_alt_match_i - 1;
            atom->flags  |= RX_INDEX;
        }
    }
 /* Have failures outside alternation result in search for next instance
  *  of alternative */
    rx->data->parens[0].fail_i = top_alt_match_i + 1;
    if(paren_i)
        rx->data->atom_data[paren->end_i].encoded_i = top_alt_match_i + 1;

 /* On successful match of top level alternative, jump to end of code related
  * to alternatives */
    atom         = &rx->start[top_alt_match_i];
    atom->flags |= RX_INDEX;
    atom->fail.i = rx->len;
}

/* Emit forward code (RX_REV cleared) for the atom at atom_i. Should an
 * encoded atom open parentheses which are quantified or hold alternatives,
 * encoding carries on through the end of those parentheses. Returns the
 * index of the first atom which was not encoded */
static int encode_forward(RegExp *rx, int atom_i)
{
    int limit_i = atom_i + 1, cur_i;

    for(cur_i = atom_i; cur_i < limit_i; ++cur_i)
    {
        RxAtom *cur = &rx->start[cur_i];

        if(Rx_is_lparen(cur))
        {
            Paren *group    = &rx->data->parens[ cur->fail.paren_i ];
            int must_finish = (Rx_paren_quant(group) || group->nbar);

         /* The limit only ever moves further forward */
            if(must_finish && group->end_i > limit_i)
                limit_i = group->end_i + 1;
        }
        cur->flags &= ~RX_REV;
        encode_atom(rx, cur_i);
    }
    return limit_i;
}

/* Emit backward code (RX_REV set) for the atom at atom_i. Should an encoded
 * atom close parentheses which are quantified or hold alternatives,
 * encoding carries on down to the start of those parentheses. Returns the
 * index of the next atom to encode in the backward direction */
static int encode_backward(RegExp *rx, int atom_i)
{
    int limit_i = atom_i - 1, cur_i;

    for(cur_i = atom_i; cur_i > limit_i; --cur_i)
    {
        RxAtom *cur = &rx->start[cur_i];

        if(Rx_is_rparen(cur))
        {
            Paren *group    = &rx->data->parens[ abs(cur->fail.paren_i) ];
            int must_finish = (Rx_paren_quant(group) || group->nbar);

         /* The limit only ever moves further backward */
            if(must_finish && group->start_i < limit_i)
                limit_i = group->start_i - 1;
        }
        cur->flags |= RX_REV;
        encode_atom(rx, cur_i);
    }
    return cur_i;
}

/* Emit the code for the parsed atom at rx_i. The direction is taken from
 * the RX_REV flag of the atom, which the callers set or clear beforehand.
 *
 * Data atoms and anchors are copied to the end of the program. If atom's
 * minimum # of occurrences & maximum are not the same and minimum is
 * greater than 0, it is necessary to split the atom into a mandatory and
 * optional part. Set the opcode appropriately. Parentheses are handed to
 * encode_paren() and alternation bars to encode_alt() */
static void encode_atom(RegExp *rx, int rx_i)
{
    RxAtom *atom;
    AtomData *data = &rx->data->atom_data[rx_i];
    Rx_set_nfree(rx, 8); /* Have at least 8 unused atoms */
 /* Pointer into rx->start is taken only after the space was reserved */
    atom = &rx->start[rx_i];
    clear_unused_parens(rx, atom);

 /* If atom a data atom first encode mandatory portion */
    if(Rx_is_data(atom) || Rx_is_anchor(atom))
    {   int min = atom->min, max = atom->max, opt_no = atom->max - min;
        int len = Rx_atom_len(atom), orig_len = rx->len;
        int rev = (atom->flags & RX_REV) ? 1 : 0;
        RxAtom *dest = Rx_copy_atom(rx, rx_i);

     /* If atom a set with 255 or 256 values and can occur multiple times,
      * use special instructions to locate set */
        if(max > 1 && rx->data->atom_data[rx_i].nchar >= NCHAR - 1)
        {   int rev_op = rev ? (UNIV_SET_REV - UNIV_SET) :0;
            int op, ch_i = 0;
         /* If set is a single negated char, determine char and look
          *   for set by doing a memchr for char not in set */
            if(rx->data->atom_data[rx_i].nchar == NCHAR - 1)
            {   for(ch_i = 0; ch_i < NCHAR; ++ch_i)
                {   if(!CharSet_iN(dest->data.set, ch_i))
                        break;
                }
                op = NEG_CHAR_SET;
            }
         /* Otherwise set has all characters and chars do not need to be
          * examined, just make sure there are enough bytes for set */
            else
                op = UNIV_SET;

         /* data.ch replaces the set; every flag except RX_INDEX is
          * dropped from the copy */
            dest->data.ch = ch_i;
            dest->opcode  = op + rev_op;
            dest->flags  &= RX_INDEX;
         /* Need to have an extra atom just to store start/end of optional
          * region */
            Rx_init_atom(rx, OPT_START_STORE);
        }
        else
        {/* Mandatory part: the copy gets max = min * len (truncated) */
            if(min)
            {   dest->max    = mult_truncate(min, len);
                set_opcode(dest);
                atom->flags |= RX_ALIAS;
            }
         /* Then encode optional part */
         /* A second copy is needed only if a mandatory part was emitted */
            if(opt_no)
            {   if(min)
                {   dest      = Rx_copy_atom(rx, rx_i);
                    dest->min = 0;
                }
                dest->max    = mult_truncate(opt_no, len);
                set_opcode(dest);
                atom->flags |= RX_ALIAS;
            }
        }
     /* Set link to backtrack atom if backtracking possible */
     /* orig_len is the index of the first atom copied above */
        if(min)
            link_backtrack(rx, rx_i, orig_len);

     /* If atom can be backtracked into, add code to store the
      * start & end of the backtrack region */
        if(data->ngive_up[rev])
        {   int rev_op = (atom->flags & RX_REV) ? FORTRACK - BACKTRACK :0;
            Rx_init_atom(rx, INIT_BACKTRACK + rev_op);
            dest = Rx_init_atom(rx, BACKTRACK + rev_op);
            dest->fail.paren_i = atom->fail.paren_i;
            link_backtrack(rx, rx_i, rx->len - 1);

         /* Remember where the BACKTRACK/FORTRACK atom of rx_i lives */
            data->encoded_i = rx->len - 1;
        }
    }
    else if(Rx_is_paren(atom))
        encode_paren(rx, rx_i);
    else if(atom->flags & RX_ALTERN)
        encode_alt(rx, rx_i);
}

/* Choose the opcode of an encoded atom. For data atoms the opcode is looked
 * up in a table indexed by the atom type (low two bits), the repetition
 * class (+4 if min > 1, +8 if min is 0) and the direction (+16 if RX_REV is
 * set). Anchors already carry their forward opcode and are only shifted to
 * the reverse opcode when RX_REV is set. Other atoms are left alone */
static void set_opcode(RxAtom *rx)
{
    static const int opcode_table[] =
    {
     /* forward: exactly once, more than once, optional */
        SING_BREF,  SING_CH,  SING_SET,  SING_STR,
        MULT_BREFS, MULT_CHS, MULT_SETS, MULT_STRS,
        OPT_BREFS,  OPT_CHS,  OPT_SETS,  OPT_STRS,
        0, 0, 0, 0,
     /* reverse: exactly once, more than once, optional */
        0, SING_CH_REV,  SING_SET_REV,  SING_STR_REV,
        0, MULT_CHS_REV, MULT_SETS_REV, MULT_STRS_REV,
        0, OPT_CHS_REV,  OPT_SETS_REV,  OPT_STRS_REV
    };
    int slot = Rx_type(rx);

    if(Rx_is_data(rx))
    {
        if(rx->min == 0)
            slot |= 8;
        else if(rx->min > 1)
            slot |= 4;

        if(rx->flags & RX_REV)
            slot |= 16;

        rx->opcode = opcode_table[slot];
        return;
    }

 /* Opcode of an anchor was set during parsing */
    if(Rx_is_anchor(rx) && (rx->flags & RX_REV))
        rx->opcode += LINE_START_REV - LINE_START;
}

/* Make atom backtrack on failure: its fail index becomes bt_i, the original
 * (parse) index of the atom to backtrack into, and RX_BACKTRACK is added to
 * its flags */
static void set_backtrack(RxAtom *atom, int bt_i)
{
    atom->fail.i = bt_i;
    atom->flags  = atom->flags | RX_BACKTRACK;
}

/* Make the encoded atom at encoded_i fall back, on failure, to the
 * backtrack target recorded for the parsed atom at parse_i. The direction
 * (backtrack_i[0] or backtrack_i[1]) follows the RX_REV flag of the parsed
 * atom.
 *
 * Returns
 *   -1  if nothing was linked: there is no target, or the atom is not a
 *       parenthesis and the target lies outside the closest quantified
 *       parentheses around it
 *    0  if the encoded atom was pointed straight at the target with
 *       set_backtrack()
 *   >0  if quantified parentheses have to be re-entered on the way: the
 *       value is the index of the first REENTER_PAREN atom, which is where
 *       the encoded atom's fail index now points. The emitted sequence is
 *       RX_JUMP (skips the sequence in normal flow), one REENTER_PAREN per
 *       paren, and a RX_JUMP flagged RX_BACKTRACK to the target */
static int link_backtrack(RegExp *rx, int parse_i, int encoded_i)
{
    AtomData *data = &rx->data->atom_data[parse_i];
    int rev  = (rx->start[parse_i].flags & RX_REV) ? 1 : 0;
    int bt_i = data->backtrack_i[rev];
    int jump_i, nparen_reset = 0;
    Paren *par, *bt_par, *quant_par;
    RxAtom *atom;

    if(bt_i == -1)
        return -1;

    jump_i    = rx->len;
    par       = Rx_paren(rx, parse_i);
    bt_par    = Rx_paren(rx, bt_i);
    quant_par = Rx_closest_quant(rx, par);

 /* Backtrack cannot jump outside of quantified paren unless atom is
  *  a parentheses */
    if(   quant_par  && !Rx_is_paren(&rx->start[parse_i])
       && (quant_par->start_i >= bt_i || bt_i >= quant_par->end_i))
        return -1;

 /* A target which is a parenthesis atom other than the exit of its paren
  * is treated as belonging to the enclosing paren */
    if(Rx_is_paren(&rx->start[bt_i]) && bt_i != Rx_exit_i(bt_par, 1))
        bt_par = Rx_outer_paren(rx, bt_par);

 /* Walk outward from the target up to the nesting level of the atom. Every
  * quantified paren on the way which already has a store atom must be
  * re-entered; the leading RX_JUMP is emitted before the first of them */
    while(bt_par->level > par->level)
    {   if(Rx_paren_quant(bt_par) && bt_par->store_i != -1)
        {   if(!(nparen_reset++))
                Rx_init_atom3(rx, RX_JUMP, 0, RX_INDEX);
            Rx_init_atom3(rx, REENTER_PAREN, bt_par - rx->data->parens,
                          RX_PAREN_LOC);
        }
        bt_par = Rx_outer_paren(rx, bt_par);
    }

 /* Take the address of the encoded atom only now: the loop above appends
  * atoms, and a pointer obtained before it would be left dangling should
  * appending ever relocate rx->start */
    atom = &rx->start[encoded_i];

    if(!nparen_reset)
    {   set_backtrack(atom, bt_i);
        return 0;
    }

 /* Fail into the REENTER_PAREN sequence, finish the sequence with the
  * backtracking jump and let the leading RX_JUMP skip all of it. atom is
  * not touched again after the append below */
    atom->fail.i = jump_i + 1;
    atom->flags |= RX_INDEX;
    Rx_init_atom3(rx, RX_JUMP, bt_i, RX_BACKTRACK);
    rx->start[jump_i].fail.i = rx->len;
    return jump_i + 1;
}

/* Generate the instructions needed to handle the atom at rx_i, which is an
 * opening or closing parenthesis. The direction is taken from the RX_REV
 * flag of the atom.
 *
 * Whichever parenthesis of a pair is encoded first (par->enter_i is still
 * -1) produces the entry code and sets par->enter_i; the other one produces
 * the exit code and sets par->store_i. Parentheses whose contents have a
 * fixed length and no alternatives need a single FIXED_LEN_PAR atom only.
 *
 * NOTE(review): atom is taken before atoms are appended and is read again
 * further down (Rx_is_lparen(atom)); other places re-index rx->start[rx_i]
 * instead. Presumably safe because encode_atom() reserves space with
 * Rx_set_nfree() before calling - confirm */
static void encode_paren(RegExp *rx, int rx_i)
{
    RxAtom *atom   = &rx->start[rx_i];
    int paren_i    = atom->fail.paren_i, opcode, paren_loc_i;
    int rev        = (atom->flags & RX_REV) ? 1 : 0;
    int rev_op     = rev ? PAREN_START_REV - PAREN_START : 0;

    Paren *par  = &rx->data->parens[ paren_i ];
    RxAtom *dest;

 /* Set up atom which will store alternative to try next if paren alternative
  * contains alternative which are ambiguous or can be backtracked into */
    if(par->alt_btrack[rev] && par->enter_i != -1)
    {   if(!par->is_top_alt || par->min > 1)
        {   dest = Rx_init_atom2(rx, SET_JUMP, paren_i);
            dest->data.bref.num = paren_i;
            link_backtrack(rx, rx_i, rx->len - 1);
        }
        else
            par->is_top_alt = FALSE;

        Rx_init_atom(rx, SKIP_NEXT);
     /* The atom emitted next is both the encoded location of rx_i and the
      * place where the alternative to try is stored */
        rx->data->atom_data[rx_i].encoded_i = rx->len;
        par->jump_store_i = rx->len;
        if(par->min || !par->backtracked[rev])
            Rx_init_atom(rx, RX_JUMP);
        else
        {   dest = Rx_init_atom3(rx, JUMP_NULL, 0, RX_BREF);
            dest->data.bref.num = paren_i;
        }
    }

 /* If parentheses not quantified, just need to record start & end */
    if(!Rx_paren_quant(par))
    {   paren_loc_i        = rx->len;
        dest               = Rx_copy_atom(rx, rx_i);
        dest->fail.paren_i = paren_i;

     /* If parentheses contents have a fixed length, only one atom
      * will be needed to record the location of the parentheses */
        if(par->max_len == par->min_len && !par->nbar)
        {/* Forward encoding keeps the atom of the right parenthesis,
          * reverse encoding the atom of the left one; the copy made for
          * the other parenthesis is dropped again */
            if((Rx_is_lparen(atom) != 0) ^ (rev_op != 0))
            {   Rx_drop(rx);
                    return;
            }
            opcode       = FIXED_LEN_PAR;
            dest->max    = par->max_len;
            par->store_i = par->enter_i = paren_loc_i;
        }
     /* Otherwise need atom to record start of parentheses, set up link
      *   to eventual location */
        else if(par->enter_i == -1)
        {   par->enter_i = paren_loc_i;
            opcode       = PAREN_START;
            dest->flags |= RX_PAREN_LOC;
         /* Left parenthesis reached first while going backward: the
          * forward PAREN_START_SPLIT opcode is used instead */
            if(rev_op && Rx_is_lparen(&rx->start[rx_i]))
            {   opcode = PAREN_START_SPLIT;
                rev_op = 0;
            }
        }
        else
        {   par->store_i = paren_loc_i;
            opcode       = PAREN_END;
        }
    }
 /* If parentheses quantified, need to keep track of count as well as
  * start & end of parentheses */
    else
    {/* Set up quantified paren entry if paren not started */
        if(par->enter_i == -1)
        {   opcode = GROUP_START;
         /* As above, but the rest of the entry code is generated as
          * forward code too */
            if(rev_op && Rx_is_lparen(&rx->start[rx_i]))
            {   opcode = GROUP_START_SPLIT;
                rev_op = rev = 0;
            }
            if(par->backtracked[rev])
            {   Paren *enc_par = par;
             /* Look for an enclosing paren which is backtracked as well,
              * stopping at paren 0 */
                do
                {   enc_par = &rx->data->parens[enc_par->enclose_i];
                }while(   enc_par != rx->data->parens
                       && !enc_par->backtracked[rev]);

             /* None found: emit RESET_STACK for this paren and each of
              * its sub-parentheses which is backtracked */
                if(enc_par == rx->data->parens)
                {   Paren *sub_par = par;
                    int sub_par_i = paren_i;
                    int end_sub_i = paren_i + Rx_nsub_paren(rx, paren_i) + 1;

                    for(; sub_par_i < end_sub_i; ++sub_par_i, ++sub_par)
                    {   if(sub_par->backtracked[rev])
                        {   dest = Rx_init_atom3(rx, RESET_STACK,
                                                 sub_par_i, RX_PAREN_LOC);
                        }
                    }
                }
                Rx_init_atom3(rx, START_STACK, paren_i, RX_PAREN_LOC);
            }
            par->enter_i = paren_loc_i = rx->len;
            dest         = Rx_copy_atom(rx, rx_i);
            dest->max    = paren_i;
            dest->flags |= RX_PAREN_LOC;
        }
     /* End of quantified parentheses requires two atoms, first atom
      * contains the min, max and current number of occurrences, the
      * second stores the start and end of the current parentheses */
        else
        {/* If paren contents can be zero, need to test for that and
          * exit the paren */
            if(!par->min_len)
                Rx_init_atom2(rx, TEST_ZERO_PAREN + rev_op, paren_i);

            if(par->backtracked[rev])
            {   dest = Rx_init_atom3(rx, ADVANCE_STACK,
                                     par->enter_i, RX_INDEX);
            }

         /* Set up link for end of a quantified paren to go to back to start
          * also store the minimum & maximum # of occurrences */
            dest = Rx_init_atom3(rx, GROUP_END + rev_op,
                                 par->enter_i + 1, RX_INDEX);
            dest->max    = par->max;
            dest->min    = par->min;

         /* Now set up the atom to jump to if the quantified paren fails */
            opcode       = GROUP_FAIL;
            par->store_i = paren_loc_i = rx->len;
            dest         = Rx_init_atom2(rx, GROUP_FAIL, par->enclose_i);
         /* Two entries for the paren itself and for each sub-paren */
            dest->min    = 2 * (Rx_nsub_paren(rx, paren_i) + 1);
            if(rev)
            {   size_t nalloc = dest->min * sizeof(char * *);
                dest->data.paren.rmost = x_malloc(nalloc);
            }

         /* If backtrack possible, backtrack on failure. Otherwise do failure
          * for enclosing parentheses */
            link_backtrack(rx, rx_i, paren_loc_i);

            if(par->backtracked[rev])
            {   int begin_i  = (rev_op == 0) ? par->start_i : par->end_i;
                int skip_i   = rx->len;

             /* Set previous group failure atom to do pop */
             /* rx->len + 1 is the first atom after the SKIP_NEXT below */
                dest->flags  = RX_INDEX;
                dest->fail.i = rx->len + 1;

                Rx_init_atom(rx, SKIP_NEXT);

                if(par->alt_ambig[rev])
                {   Rx_init_atom3(rx, TEST_UNTRIED,
                                  par->jump_store_i, RX_INDEX);
                }
                rx->data->atom_data[begin_i].encoded_i = rx->len;

             /* For atom containing pop opcode, count will contain relative
              * offset of atom containing stack */
                dest = Rx_init_atom2(rx, POP_PAREN + rev_op, par->enclose_i);
                dest->data.count = par->enter_i - rx->len + 1;
                link_backtrack(rx, rx_i, rx->len - 1);

             /* If paren contains ambiguous alternative, set up code to
              * try next untried alternative after popping */
                if(par->alt_ambig[rev])
                    Rx_init_atom3(rx, RX_JUMP, par->jump_store_i, RX_INDEX);

             /* More than one atom follows the SKIP_NEXT: turn it into a
              * jump past all of them */
                if(rx->len > skip_i + 2)
                    Rx_set_atom3(&rx->start[skip_i], RX_JUMP,
                                 rx->len, RX_INDEX);
            }
        }
    }
 /* Opcode is stored last because rev_op may have been cleared above */
    rx->start[paren_loc_i].opcode = opcode + rev_op;
}

/* For an alternation, two operations will be set up. If the alternation
 * is reached, indicating a match, jump to the end of the alternatives.
 * After the jump, set an instruction to reset the match start/end to the
 * start/end of the parenthesis containing the alternative, which will
 * be jumped to when the preceding alternative fails.
 *
 * rx_i is the index of the alternation atom; the direction is taken from
 * its RX_REV flag. An alternation directly in paren 0 gets RX_MATCH and
 * RX_RESET atoms instead */
static void encode_alt(RegExp *rx, int rx_i)
{
    RxAtom *atom   = &rx->start[rx_i];
    AtomData *data = &rx->data->atom_data[rx_i];
    int rev        = (atom->flags & RX_REV) ? 1 : 0;
    int rev_op     = rev ? HAVE_ALT_REV - HAVE_ALT : 0;
    int paren_i    = atom->fail.paren_i;
    Paren *paren   = &rx->data->parens[paren_i];
    RxAtom *dest;

 /* If alternation is backtracked through, set up the code which will
  * set the next location in the alternation to try */
    if(paren->alt_btrack[rev])
    {   dest = Rx_init_atom2(rx, SET_JUMP, paren_i);
        dest->data.bref.num = paren_i;

        if(data->backtrack_i[rev] != -1)
            set_backtrack(dest, data->backtrack_i[rev]);
    }

 /* Record code end index for alternative */
 /* encoded_i is the index following the copy made here */
    dest = Rx_copy_atom(rx, rx_i);
    rx->data->alts[atom->data.altern_i].encoded_i = rx->len;

    if(paren_i)
    {   dest->flags |= RX_PAREN_LOC;
     /* Set up jump to end of alternative after success for preceding
      * alternative. If paren quantified jump to counter, otherwise jump
      * to block after paren storage, setting location */
        if(!Rx_paren_quant(paren))
            dest->opcode = HAVE_ALT + rev_op;
        else
        {/* min holds a negative offset: -1, one further back if the paren
          * is backtracked and one more if its contents can be empty */
            int off = -1;
            dest->opcode = JUMP_OFF;
            if(paren->backtracked[rev])
                --off;
            if(!paren->min_len)
                --off;
            dest->min = off;
        }

     /* Set up jump destination when preceding alternative fails which will
      * reset the location to the start of the parentheses containing the
      * alternative */
     /* The atom just emitted is duplicated and the copy turned into
      * ALT_FAIL */
        dest = Rx_copy_atom(rx, dest - rx->start);
        dest->opcode = ALT_FAIL + rev_op;
    }
    else
    {   dest->opcode = RX_MATCH;
        (Rx_init_atom(rx, RX_RESET))->flags |= RX_INDEX;
    }
}

/* When atom closes an alternative, emit an instruction which nulls every
 * sub-parenthesis of the enclosing alternation that is not contained in the
 * alternative atom belongs to. With exactly one paren to null, its storage
 * index is placed in the instruction's fail field (CLEAR_PAREN); with
 * several, a private copy of the list is attached (CLEAR_PARENS).
 * Nothing is emitted when the enclosing paren is the top-level alternation,
 * holds no bars, has an enter index of -1, or atom is not an alternative
 * end. The function returns nothing. */
static void clear_unused_parens(RegExp *rx, RxAtom *atom)
{
    int owner_i   = atom->fail.paren_i;
    Paren *owner  = &rx->data->parens[owner_i];
    RxAtom *first = atom, *last = atom;
    RxAtom *inst;
    int first_i, last_i, sub_i, count = 0;

    if(   owner->is_top_alt || !Rx_is_alt_end(atom, owner_i)
       || !owner->nbar || owner->enter_i == -1)
        return;

 /* atom is one boundary of the alternative; scan for the other boundary.
  * Reversed atoms scan forward, all others scan backward */
    if(atom->flags & RX_REV)
    {   do
            ++last;
        while(!Rx_is_alt_end(last, owner_i));
    }
    else
    {   do
            --first;
        while(!Rx_is_alt_end(first, owner_i));
    }
    first_i = first - rx->start;
    last_i  = last  - rx->start;

 /* Walk the parens following the owner until one at the owner's own level
  * is met, queueing every paren not wholly inside [first_i, last_i] */
    for(sub_i = owner_i + 1; sub_i < rx->data->nparen; ++sub_i)
    {   Paren *sub = &rx->data->parens[sub_i];
        if(sub->level == owner->level)
            break;
        if(sub->start_i >= first_i && sub->end_i <= last_i)
            continue;
        rx->data->paren_list[count].i = sub->store_i;
        ++count;
    }
    if(!count)
        return;

    inst = Rx_init_atom(rx, CLEAR_PAREN);
    if(count == 1)
    {/* Lone paren: its location travels in the fail field */
        inst->fail.paren_i = rx->data->paren_list[0].i;
        inst->flags       |= RX_INDEX;
        return;
    }
 /* Several parens: the instruction keeps its own copy of the list */
    inst->opcode          = CLEAR_PARENS;
    inst->max             = count;
    inst->data.clear_list = x_malloc(count * sizeof(AtomLoc));
    memcpy(inst->data.clear_list, rx->data->paren_list,
           count * sizeof(AtomLoc));
}

/* Make the regexp atom at index rarest_i the anchor. The atom is duplicated
 * with Rx_copy_atom and the copy is given one of the FIND* opcodes so that
 * it is searched for. Any part of the atom's repetition count not covered
 * by the search instruction is then encoded with encode_atom.
 *
 *   rx       - regular expression being encoded
 *   rarest_i - index in rx->start of the atom chosen as anchor
 *
 * Return 0 if the rarest atom needs to be re-encoded (the only such case
 * here is a backreference whose count is not exactly one), 1 otherwise */
static int encode_rarest(RegExp *rx, int rarest_i)
{
    RxAtom *anchor = &rx->start[rarest_i], *dest;
    int data_type  = Rx_type(anchor);
    int len = Rx_atom_len(anchor);
    /* nmult is the number of repetitions folded into the search itself */
    int min = anchor->min, max = anchor->max, nmult = min;
    char ch = anchor->data.ch;

    /* NOTE(review): presumably reserves room for the atoms appended below
     * so rx->start is not moved while dest is in use - confirm */
    Rx_set_nfree(rx, 4);
    dest = Rx_copy_atom(rx, rarest_i);
    dest->fail.paren_i = -1;
    dest->flags        = 0;
    /* Re-fetch the anchor from rx->start after the copy was made */
    anchor = &rx->start[rarest_i];
    /* Top-level failure index becomes the last atom currently encoded */
    rx->data->parens[0].fail_i = rx->len - 1;

 /* If just a single char whose upper and lower case forms are identical
  * and which must occur at least once (min == 1), use FIND_CH (memchr
  * according to the original author) to find */
    if(data_type == RX_CH && up_casE(ch) == low_casE(ch) && min == 1)
        dest->opcode = FIND_CH;

 /* Otherwise use boyer-moore routines to locate char or string */
    else if(data_type == RX_CH || data_type == RX_STR)
    {   const char *str = ( (data_type == RX_CH)
                            ? &anchor->data.ch
                            : anchor->data.str.start);

     /* Cap the repeat count so the multiplied string stays within 1024,
      * but always keep at least one copy */
        if(mult_truncate(len, min) >= 1024)
        {   nmult = 1024 / len;
            if(!nmult)
                nmult = 1;
        }

     /* If string must occur multiple times, "multiply" the string to
      * obtain a longer string and bigger skips, maximum length of
      *  multiplied string is 1024 */
        if(nmult > 1)
        {   str      = str_mult(str, len, nmult);
            dest->data.bmoore.new_str = str;
        }
        else
            dest->data.bmoore.new_str = NULL;

     /* Search table is built over len * nmult bytes of str; the same
      * length is kept in dest->max */
        dest->data.bmoore.table = x_malloc(sizeof(FixedStr));
        dest->opcode            = FIND_STR;
        dest->max               = len * nmult;
        FixedStr_init(dest->data.bmoore.table, str, len * nmult);
    }
 /* If atom set, set up Boyer-Moore style skip table for character set */
    else if(data_type == RX_SET)
    {   int far *set_skips = x_farmalloc(sizeof(int) * NCHAR), ch_i;
     /* Members of the set get a skip of 0, all other characters get the
      * minimum repeat count limited to 1024 */
        int skip = miN(min, 1024);
        for(ch_i = 0; ch_i < NCHAR; ++ch_i)
            set_skips[ch_i] = (CharSet_iN(anchor->data.set, ch_i) ? 0 : skip);

        dest->data.skips = set_skips;

     /* Choose the opcode from the atom's rarity per required repetition:
      * above FREQ_TOTAL/2 a single FIND_SET is used, otherwise a
      * FIND_FIRST_SETS / RX_JUMP / FIND_NEXT_SETS sequence is emitted */
        if(rx->data->atom_data[rarest_i].rarity / min > FREQ_TOTAL/2)
            dest->opcode = FIND_SET;
        else
        {   dest->opcode     = FIND_FIRST_SETS;
            dest             = Rx_init_atom3(rx, RX_JUMP,
                                             rx->len + 2, RX_INDEX);
            dest             = Rx_init_atom2(rx, FIND_NEXT_SETS, -1);
         /* FIND_NEXT_SETS shares the skip table built above */
            dest->data.skips = set_skips;
            dest->min        = min;
         /* Two more atoms were appended, move the failure index along */
            rx->data->parens[0].fail_i += 2;
        }
    }
 /* Anchor atoms: shift the copied opcode by the distance between
  * LINE_START and FIND_LINE_START */
    else if(anchor->flags & RX_ANCHOR)
        dest->opcode += FIND_LINE_START - LINE_START;

 /* Backreference: opcode is FIND_BREF offset by Case_sense, length is
  * the minimum length of the referenced parentheses */
    else if(anchor->flags & RX_BREF)
    {   Paren *bref_paren   = &rx->data->parens[anchor->data.bref.num];
        dest->opcode        = FIND_BREF + Case_sense;
        dest->data.bref.len = bref_paren->min_len;
     /* If backreference is quantified, need to re-encode atom for
      * other instances. The test is true unless min == max == 1 */
        if(min != 1 || min != max)
        {   Rx_init_atom(rx, RX_RESET);
            return 0;
        }
    }

 /* Process optional part of atom, also process remaining required strings
  * which weren't included in the multiplied string. The atom's counts are
  * lowered by nmult for the call to encode_atom and restored afterwards;
  * anchor is re-fetched from rx->start after the call */
    if(max - nmult && data_type)
    {   anchor->min -= nmult;
        anchor->max -= nmult;
        encode_atom(rx, rarest_i);
        anchor      = &rx->start[rarest_i];
        anchor->min = min;
        anchor->max = max;
    }
    return 1;
}

/* Resolve the failure link of each of the len atoms starting at index
 * start_i. Backreference and SET_JUMP operands are first converted to
 * negated storage indices; then, unless the atom already carries RX_INDEX,
 * its fail field is replaced by a program index and RX_INDEX is set */
static void link(RegExp *rx, int start_i, int len)
{
    RxAtom *cur = rx->start + start_i;
    int stop_i  = start_i + len, cur_i;

    for(cur_i = start_i; cur_i < stop_i; ++cur_i, ++cur)
    {   int owner_i = cur->fail.paren_i, target_i;

     /* A positive backreference number is swapped for the negated index
      * at which the referenced parentheses are stored. The sign keeps the
      * value from being mistaken for a backreference number should the
      * atom be linked again */
        if((cur->flags & RX_BREF) && cur->data.bref.num > 0)
        {   Paren *ref = &rx->data->parens[ cur->data.bref.num ];
            if(ref->store_i != -1)
                cur->data.bref.num = -ref->store_i;
        }
     /* SET_JUMP not yet flagged RX_BREF: point it at the jump storage of
      * its parentheses, when there is one, and flag it as converted */
        if(cur->opcode == SET_JUMP && !(cur->flags & RX_BREF))
        {   Paren *jump_par = &rx->data->parens[cur->data.bref.num];
            if(jump_par->jump_store_i)
            {   cur->data.bref.num = -jump_par->jump_store_i;
                cur->flags |= RX_BREF;
                cur->min = cur->max = 1;
            }
        }

     /* Link already resolved */
        if(cur->flags & RX_INDEX)
            continue;

        if(cur->flags & RX_BACKTRACK)
         /* Link goes to the encoded location of the backtrack */
            target_i = Rx_encoded_i(rx, owner_i);

        else if(owner_i == -1 && !Rx_is_paren(cur))
            target_i = -1;

        else if(!(cur->flags & RX_PAREN_LOC))
            target_i = find_fail_i(rx, cur_i);

        else
        {/* Link goes to where the parentheses are stored; leave the atom
          * untouched if they have no storage */
            Paren *owner = &rx->data->parens[ owner_i ];
            if(owner->store_i == -1)
                continue;
            if(   cur->opcode == GROUP_START
               || cur->opcode == GROUP_START_REV)
                init_stack(rx, cur);

            target_i = owner->store_i;
         /* JUMP_OFF turns into a plain jump displaced by its min field */
            if(cur->opcode == JUMP_OFF)
            {   cur->opcode = RX_JUMP;
                target_i   += cur->min;
            }
        }

        cur->fail.i = target_i;
        cur->flags |= RX_INDEX;
    }
}

/* Work out which program index should be tried next when the atom stored
 * at rx_i fails to match */
static int find_fail_i(RegExp *rx, int rx_i)
{
    RxAtom *failed = &rx->start[rx_i];
    Paren *all     = rx->data->parens, *cur;
    int cur_i      = failed->fail.paren_i;
    int backward   = (failed->flags & RX_REV) ? 1 : 0;
    int target_i   = -1;

 /* Climb outward until reaching paren 0, a quantified paren which has
  * storage, or a paren containing alternatives */
    for( ; ; )
    {   cur = &all[cur_i];
        if(cur == all)
            break;
        if(Rx_paren_quant(cur) && cur->store_i != -1)
            break;
        if(cur->nbar)
            break;
        cur_i = cur->enclose_i;
    }

 /* Top-level alternation: fall back on the failure index of paren 0 */
    if(cur->is_top_alt)
        target_i = all[0].fail_i;

 /* Inside an alternation: go to the next alternative encoded after rx_i,
  * searching backward for reversed atoms. The first jump made to an
  * alternative lands one index further when the failing atom has a type
  * or is a backreference */
    else if(cur->nbar)
    {   Altern *next = find_next_alt(rx, cur_i, rx_i, backward ? -1 : 1);
        if(next)
        {   target_i = next->encoded_i;
            assert(target_i != -1);
            if(!next->jumped_to)
            {   next->jumped_to = TRUE;
                if(Rx_type(failed) || (failed->flags & RX_BREF))
                    ++target_i;
            }
        }
    }
    if(target_i != -1)
        return target_i;

 /* Nothing found yet: continue outward to paren 0, a quantified paren or
  * a paren with a backtrack in this direction */
    while(   cur_i && !Rx_paren_quant(cur)
          && cur->backtrack_i[backward] == -1)
    {   cur_i = cur->enclose_i;
        cur   = &all[cur_i];
    }
    if(!cur_i)
        return cur->fail_i;
    if(Rx_paren_quant(cur))
        return cur->store_i;
    return Rx_encoded_i(rx, cur->backtrack_i[backward]);
}

/* Return the first alternative belonging to parentheses paren_i whose
 * encoded index lies beyond rx_i, or NULL if there is none. The paren's
 * alternatives are examined from first to last when dir is 1 and from last
 * to first when dir is -1 */
static Altern *find_next_alt(RegExp *rx, int paren_i, int rx_i, int dir)
{
    Paren *owner = &rx->data->parens[paren_i];
    Altern *alts = rx->data->alts, *cand;
    int cur_i    = (dir == 1) ? owner->first_alt_i : owner->last_alt_i;
    int final_i  = (dir == 1) ? owner->last_alt_i  : owner->first_alt_i;

    cand = &alts[cur_i];
    while(cand->paren != paren_i || cand->encoded_i <= rx_i)
    {/* Candidate rejected; give up once the final one has been tested */
        if(cur_i == final_i)
            return NULL;
        cur_i += dir;
        cand   = &alts[cur_i];
    }
    return cand;
}

/* Build the save stack for the group-start atom rx_ptr. A list of addresses
 * is collected: the start & end fields of the storage atoms of the paren
 * and its sub-parentheses, the start & end fields of every backtrack
 * region initialised between the paren's enter and store indices, and the
 * fail pointer of the jump that follows a SET_JUMP / SKIP_NEXT pair. The
 * list and a value buffer twice its length are attached to rx_ptr */
static void init_stack(RegExp *rx, RxAtom *rx_ptr)
{
    int group_i  = rx_ptr->fail.paren_i;
    Paren *group = &rx->data->parens[group_i];
    int ngroup   = Rx_nsub_paren(rx, group_i) + 1;
    int cap      = ngroup * 2, used = 0, k, code_i;
    char const * * *slots = x_malloc(cap * sizeof(char * *));

 /* Addresses of the paren itself followed by those of its subparens */
    for(k = 0; k < ngroup; ++k)
    {   RxAtom *store = &rx->start[group[k].store_i];
        slots[used++] = &store->data.paren.start;
        slots[used++] = &store->data.paren.end;
    }

 /* Scan the code between entering the paren and its storage atom */
    for(code_i = group->enter_i; code_i < group->store_i; ++code_i)
    {   RxAtom *code = &rx->start[code_i];
        int op = code->opcode;

        if(op == INIT_BACKTRACK || op == INIT_FORTRACK)
        {/* Two slots needed, double the list if they do not fit */
            if(used + 2 > cap)
            {   cap  *= 2;
                slots = x_realloc(slots, cap * sizeof(char * *));
            }
            slots[used++] = &code->data.backtrack.start;
            slots[used++] = &code->data.backtrack.end;
            continue;
        }

     /* Only SET_JUMP, SKIP_NEXT, then RX_JUMP or JUMP_NULL, all lying
      * before the storage atom, contributes a slot */
        if(   op != SET_JUMP || code_i >= group->store_i - 2
           || code[1].opcode != SKIP_NEXT)
            continue;
        if(code[2].opcode != RX_JUMP && code[2].opcode != JUMP_NULL)
            continue;

        if(used >= cap)
        {   cap  *= 2;
            slots = x_realloc(slots, cap * sizeof(char * *));
        }
        slots[used++] = (char const * *)&code[2].fail.ptr;
    }

 /* Trim the list to size and hang it, with its value buffer, on the atom */
    rx_ptr->data.paren_buf.locs  = x_realloc(slots, used *sizeof(char * *));
    rx_ptr->data.paren_buf.start = x_malloc(used * 2 * sizeof(char *));
    rx_ptr->data.paren_buf.i     = used;
    rx_ptr->min = used;
    rx_ptr->max = used * 2;

    *rx_ptr->data.paren_buf.start = (char *)rx->start;
}

/* Report whether matching is case sensitive. Returns 1 when every character
 * code below NCHAR is left unchanged by both low_casE and up_casE, 0 as
 * soon as one of the mappings alters a character */
static int have_case_sense(void)
{
    int code = 0;

    while(code < NCHAR && code == low_casE(code) && code == up_casE(code))
        ++code;

    return (code == NCHAR);
}
