/* Copyright (C) 1995-99 Andrew Pipkin (minitrue@pagesz.net)
 * MiniTrue is free software released with no warranty. See COPYING for details
 */

#ifndef REGEXP_H
#define REGEXP_H 1

#include <stddef.h>

#include "charset.h"
#include "fixedstr.h"

/* Bookkeeping for a top-level alternation; embedded in RxAtom as
 * data.top_alt. */
typedef struct top_alt {
    struct sub_rx *sub_rxs; /* SubRX record(s) describing the alternatives
                             * NOTE(review): presumably an array - confirm */
    int i;                  /* NOTE(review): presumably the index of the
                             * alternative currently in use - confirm */
    int len;                /* NOTE(review): presumably the number of
                             * entries in sub_rxs - confirm */
} TopAlt;

/* Sub-parenthesis bookkeeping for an alternation; embedded in RxAtom as
 * data.sub_parens. */
typedef struct sub_paren {
    char **buf;                /* storage for the locations of the subparens */
    struct rx_atom **par_locs; /* atoms in which the parens are stored */
    int start_i;               /* index of the first subparen */
} SubParens;

/* Location of a parenthesis that belongs to an unused alternative.  The
 * same slot holds either an index or the pointer that index corresponds
 * to. */
typedef union {
    int i;               /* index at which the paren is stored */
    struct rx_atom *ptr; /* pointer corresponding to that index */
} AtomLoc;

/* One atom of a regular expression.
 * Every atom carries flags (see enum rx_flags), an opcode (see enum
 * opcodes), min/max bounds, a `data` union whose active member depends on
 * the kind of atom, and a `fail` union describing where to go when the
 * atom is not found.
 * NOTE(review): which `data` member is live for a given opcode/flag
 * combination is not visible in this header - confirm in the matcher. */
typedef struct rx_atom
{   int flags;             /* flags for regular atom (enum rx_flags) */
    int opcode;            /* opcode for operation used to find RxAtom */
    int min;               /* minimum # of bytes atom can occupy
                            * NOTE(review): "bytes" here vs "times" for max
                            * looks inconsistent - confirm the unit */
    int max;               /* maximum # of times atom can occur */
    union
    {   char ch;           /* single character */
     /* start, end and length of fixed substring */
        struct
        {   const char *start;
            const char *end;
            int len;
        }str;
        CharSet set;       /* character set */
        struct
        {   FixedStr *table;     /* pointer to Boyer-Moore find structure */
            const char *new_str; /* points to string created for Boyer-Moore,
                                  * NULL if no string created */
        }bmoore;
        int count;         /* Number of times quantified paren has occurred */

        struct
        {   const char *start; /* Start of atoms enclosed in parentheses */
            const char *end;   /* End of atoms enclosed in parentheses */
            char const * * rmost; /* If paren is quantified and precedes
                                   * the anchor, this will store the rightmost
                                   * instance of the paren and the parens
                                   * nested in it */
        }paren;

        struct
        {   int num;  /* Number of the parentheses the back reference
                       * refers to */
            int dist; /* # of bytes between backreference and corresponding
                       * parentheses, -1 if variable */
            int len;  /* Length of backreference if corresponding parentheses
                       * length is fixed */
        }bref;

     /* Buffer used to store information about quantified parens.
      * If the paren is backtracked, the buffer will be a stack.
      * The atom's max will contain the size of the buffer, its min will
      * contain the number of values the paren will store in the buffer. */
        struct
        {   char const * *start;    /* buffer start */
            int i;                  /* index of current position in buffer */
            char const * * *locs;   /* array of locations to restore
                                     * stack contents to */
        }paren_buf;

        struct rx_atom *paren_loc; /* Atom in which desired paren is stored */
        int altern_i;      /* Index for alternation */

        struct
        {   const char *start; /* start of region backtracking is possible */
            const char *end;   /* end of region backtracking is possible */
            const char *orig;  /* original match start if backtrack, original
                                * match end if fortrack */
        }backtrack;

        int far *skips;      /* Skip table used to find character set(s)
                              * NOTE(review): `far` is not standard C; it is
                              * presumably defined away by an included
                              * header on flat-memory targets - confirm */

        TopAlt top_alt;       /* Data for top-level alternation */
        AtomLoc *clear_list; /* List of parentheses in unused alternatives
                               * which will be cleared when the other
                               * alternative is used */
        SubParens sub_parens; /* list of subparens in alternation */
    } data;

 /* During compilation this union holds an index identifying the
  *   subsequent item to go to, because the atoms may be moved by a
  *   realloc call. Once all RxAtoms have been allocated, the indices
  *   are converted to pointers.  */
    union
    {   struct rx_atom *ptr;  /* Next RxAtom if current RxAtom not found */
        int i;                /* Index form of ptr, used before the indices
                               * are converted to pointers */
        int paren_i;          /* Index of nearest enclosing paren */
        const char *opt;      /* For optional atoms, start */
    }fail;

}RxAtom;

/* Per-alternative record for a top-level alternation. */
typedef struct sub_rx {
    AtomLoc code;       /* first atom of the alternative, held as either a
                         * pointer or an index */
    const char *start;  /* where in the buffer the alternative was found */
    const char *end;    /* end of the alternative */
    const char *anchor; /* anchor for the alternative */
    int nsub_paren;     /* how many subparens the alternative contains */
    int sub_paren_i;    /* index of the alternative's first subparen */
} SubRX;

/* Buffer-position constants.
 * NOTE(review): the distinct values 1 and 2 suggest these are bit flags,
 * presumably combined in the buf_loc argument of RegExp_Buf_init - confirm
 * against the callers. */
enum {
    BUF_EOF = 1,
    BUF_BOF = 2
};

/* Values stored in RxAtom.flags.  The two lowest bits form a small type
 * field (RX_CH / RX_SET / RX_STR); each remaining constant occupies a
 * single bit of its own. */
enum rx_flags {
    /* two-bit type field */
    RX_CH        = 0x001, /* atom is a single character */
    RX_SET       = 0x002, /* atom contains a character set */
    RX_STR       = 0x003, /* atom is a fixed string */

    /* single-bit flags */
    RX_BREF      = 0x004, /* atom is a backreference */
    RX_ANCHOR    = 0x008, /* atom is a zero-length assertion such as
                           * $, ^, \b or \B */
    RX_LPAREN    = 0x010, /* atom is a left parenthesis */
    RX_RPAREN    = 0x020, /* atom is a right parenthesis */
    RX_ALTERN    = 0x040, /* atom begins an alternative (|) */
    RX_REV       = 0x080, /* use backwards find operations */
    RX_INDEX     = 0x100, /* link has been determined */
    RX_BACKTRACK = 0x200, /* backtracking should occur on failure */
    RX_PAREN_LOC = 0x400, /* link is to be set to paren storage location */
    RX_ALIAS     = 0x800  /* atom is aliased, so do not free any pointers
                           * it contains */
};

/* Operation codes stored in RxAtom.opcode.
 * The enumerators are numbered implicitly from 0, so their order is
 * significant; the ranges in the group labels give the resulting values.
 * NOTE(review): names ending in _REV are presumably the backwards
 * counterparts used with RX_REV - confirm in the matcher. */
enum opcodes {
    /* 0-15: character, character-set and fixed-string opcodes */
    SING_CH, MULT_CHS, OPT_CHS,
    SING_SET, MULT_SETS, OPT_SETS,
    SING_STR, MULT_STRS, OPT_STRS,
    FIND_CH, FIND_SET, FIND_COMMON_SET,
    FIND_FIRST_SETS, FIND_NEXT_SETS, FIND_SING_SET, FIND_STR,

    /* 16-24: _REV forms of the first nine opcodes */
    SING_CH_REV, MULT_CHS_REV, OPT_CHS_REV,
    SING_SET_REV, MULT_SETS_REV, OPT_SETS_REV,
    SING_STR_REV, MULT_STRS_REV, OPT_STRS_REV,

    /* 25-29 */
    UNIV_SET, NEG_CHAR_SET,
    UNIV_SET_REV, NEG_CHAR_SET_REV,
    OPT_START_STORE,

    /* 30-36: backreference opcodes */
    SING_BREF, SING_BREF_CS, SING_BREF_CH,
    MULT_BREFS, OPT_BREFS,
    FIND_BREF, FIND_BREF_CS,

    /* 37-44: parenthesis / group opcodes */
    PAREN_START, PAREN_END, FIXED_LEN_PAR,
    GROUP_START, GROUP_END, GROUP_FAIL,
    TEST_ZERO_PAREN, POP_PAREN,

    /* 45-53 */
    PAREN_START_SPLIT, GROUP_START_SPLIT,
    CLEAR_SUB_PARENS, COPY_PAREN,
    RESET_STACK, START_STACK, ADVANCE_STACK,
    REENTER_PAREN, TEST_UNTRIED,

    /* 54-61: _REV forms of the parenthesis / group opcodes */
    PAREN_START_REV, PAREN_END_REV, FIXED_LEN_PAR_REV,
    GROUP_START_REV, GROUP_END_REV, GROUP_FAIL_REV,
    TEST_ZERO_PAREN_REV, POP_PAREN_REV,

    /* 62-69: alternation opcodes */
    HAVE_ALT, ALT_FAIL,
    HAVE_ALT_REV, ALT_FAIL_REV,
    TOP_ALT_INIT, TOP_ALT_MATCH, TOP_ALT_FAIL, TOP_ALT_NEXT,

    /* 70-85: file / line / word-break opcodes */
    FILE_START, FILE_END, LINE_START, LINE_END,
    WORD_BREAK, NON_BREAK,
    LINE_START_REV, LINE_END_REV,
    WORD_BREAK_REV, NON_BREAK_REV,
    FIND_FILE_START, FIND_FILE_END,
    FIND_LINE_START, FIND_LINE_END,
    FIND_WORD_BREAK, FIND_NON_BREAK,

    /* 86-100 */
    RX_ADVANCE, RX_MATCH, RX_FAIL, RX_RESET,
    RX_JUMP, JUMP_OFF, JUMP_NULL,
    SKIP_NEXT, SET_JUMP,
    INIT_BACKTRACK, BACKTRACK,
    INIT_FORTRACK, FORTRACK,
    CLEAR_PAREN, CLEAR_PARENS
};

/* Description of one pair of parentheses. */
typedef struct {
    int start_i;      /* original index of the regex atom after the
                       * opening ( */
    int end_i;        /* original index of the regex atom after the
                       * closing ) */
    int min;          /* fewest times the parenthesised data must occur */
    int max;          /* most times the parenthesised data can occur */
    int level;        /* number of enclosing parens, plus one */
    int enclose_i;    /* index of the nearest enclosing paren */
    int enter_i;      /* index of the atom in which the paren is entered */
    int store_i;      /* index of the atom in which the paren is exited */
    int fail_i;       /* index to jump to on failure */
    int max_len;      /* maximum length of the paren contents */
    int curr_max;     /* current maximum length of the paren contents */
    int min_len;      /* minimum length of the paren contents */
    int curr_min;     /* current minimum length of the paren contents */

    int nbar;         /* number of | inside the parentheses */
    int first_alt_i;  /* index of the first alternation in the paren,
                       * -1 if there is none */
    int last_alt_i;   /* index of the last alternation in the paren */
    int jump_store_i; /* index of the atom storing the next alternative to
                       * try when backtracking into an alternative */

    /* NOTE(review): the two-element arrays below are presumably indexed
     * by direction (backtrack, fortrack), as documented for AtomData's
     * backtrack_i - confirm. */
    int backtrack_i[2]; /* index of the atom to backtrack into */
    int backtracked[2]; /* set if the entire contents of the paren might
                         * be relinquished in backtracking */
    int alt_btrack[2];  /* set if the alternation has an atom which is
                         * backtracked into */
    int alt_ambig[2];   /* set if an alternative can be the start of
                         * another alternative */

    int is_top_alt;   /* set if the parentheses contain a top-level
                       * alternation */
} Paren;

/* Location and paren level of one alternation operator (|). */
typedef struct {
    int i;          /* how many alternatives precede this | */
    int parse_i;    /* index, after parsing, of the regex atom holding
                     * the | */
    int encoded_i;  /* index, after compilation, of the atom holding the
                     * start of the alternation */
    int paren;      /* parentheses in which the alternation is found */
    int jumped_to;  /* set once a jump has been set to the atom following
                     * the alternation */
    int min_len;    /* minimum length of the alternative before the | */
    int max_len;    /* maximum length of the alternative before the | */
    int next_alt_i; /* if ambiguous, index of the next alternative to try */
} Altern;

/* Per-atom scratch data used during the analysis phase of compilation.
 * In each two-element array, element 0 applies to backtracks and element 1
 * to fortracks (stated for backtrack_i; NOTE(review): presumably the same
 * for ngive_up and nbacktrack - confirm). */
typedef struct {
    unsigned rarity;    /* NOTE(review): undocumented in the original;
                         * presumably a rarity score consulted by
                         * find_rarest - confirm */
    int nchar;          /* number of characters in the character set */
    int encoded_i;      /* index at which the atom is encoded */
    int backtrack_i[2]; /* on failure, index of the atom to "backtrack
                         * into" */
    int ngive_up[2];    /* maximum number of chars the atom might need to
                         * relinquish to obtain a match */
    int nbacktrack[2];  /* maximum number of chars the atom might need to
                         * backtrack to be found */
    int next_alt_i;     /* if the atom is in an alternative, index of the
                         * next alternative to try when the atom fails;
                         * -1 if not in an alternation or no alternatives
                         * remain */
} AtomData;

/* State that is needed only while a regular expression is being
 * compiled. */
typedef struct {
    /* parentheses */
    Paren *parens;       /* array with information about the parens */
    int nparen;          /* number of ( encountered */
    int par_alloc;       /* size of the parens array */
    int paren_level;     /* number of enclosing parens */
    int open_par_i;      /* index of the nearest unclosed parentheses */

    /* alternations */
    Altern *alts;        /* array with information about each | */
    int nalt;            /* number of | encountered */
    int alt_alloc;       /* size of the alts array */

    /* atoms */
    int nparse;          /* number of parsed atoms */
    int nalloc;          /* number of encoded atoms */

    int nbacktrack;      /* number of items which can be backtracked into */
    AtomData *atom_data; /* NOTE(review): undocumented in the original;
                          * presumably one AtomData per atom - confirm */
    AtomLoc *paren_list; /* list of parens, used to record the parens in
                          * unused alternatives */
} RxData;

/* A regular expression: its atoms, the atoms that hold its parentheses
 * locations, and the data used while compiling it. */
typedef struct {
    RxAtom *start;     /* initial regexp atom */
    int len;           /* number of regexp atoms */
    RxAtom **par_locs; /* addresses of the atoms in which the parentheses
                        * locations are found */
    int nparen;        /* number of parentheses */
    RxData *data;      /* structure holding the compile-time data */
} RegExp;

/* NOTE(review): only prototypes are visible in this header.  The section
 * labels below are inferred from the names and signatures and should be
 * confirmed against the implementation files. */
/* RegExp_* functions: buffer setup, then init / find / paren / nparens /
 * kill, each of which takes a RegExp. */
void RegExp_Buf_init(const char *buf_start, const char *buf_end, int buf_loc);
int RegExp_init(RegExp *reg_exp, char *src);
const char *RegExp_find(RegExp *reg_exp, const char *start, size_t *len_ptr);
char *RegExp_paren(RegExp *reg_exp, int paren_no, int *paren_len);
int RegExp_nparens(RegExp *reg_exp);
void RegExp_kill(RegExp *reg_exp);

/* Presumably the compilation stages (parse, analyse, encode) - confirm. */
int Rx_parse(RegExp *reg_exp, char *src);
void Rx_analyse(RegExp *reg_exp);
void Rx_encode(RegExp *rx, int start_i, int end_i);

/* Rx_* helpers operating on RegExp, RxAtom and Paren objects. */
void RegExp_print_atom(int rx_i, RegExp *reg_exp);
void Rx_init(RegExp *rx);
int Rx_atom_len(RxAtom *rx_ptr);
char *Rx_atom_str(RxAtom *str_atom, int *str_len);
int Rx_paren_quant(Paren *paren);
int Rx_min_len(RegExp *rx, RxAtom *rx_ptr);
int Rx_max_len(RegExp *rx, RxAtom *rx_ptr);
void Rx_set_nfree(RegExp *rx, int min_nfree);
RxAtom *Rx_init_atom(RegExp *rx, int opcode);
RxAtom *Rx_init_atom2(RegExp *rx, int opcode, int paren_i);
RxAtom *Rx_init_atom3(RegExp *rx, int opcode, int paren_i, int flags);
void Rx_set_atom3(RxAtom *atom, int opcode, int paren_i, int flags);
RxAtom *Rx_copy_atom(RegExp *rx, int rx_i);
void Rx_drop(RegExp *rx);
int Rx_type(RxAtom *atom);
int Rx_is_data(RxAtom *atom);
int Rx_is_lparen(RxAtom *atom);
int Rx_is_paren(RxAtom *atom);
int Rx_is_rparen(RxAtom *atom);
int Rx_is_alt(RxAtom *atom);
int Rx_is_anchor(RxAtom *atom);
int Rx_is_alt_end(RxAtom *atom, int paren_i);
Paren *Rx_paren(RegExp *rx, int paren_i);
int Rx_in_paren(RegExp *rx, int rx_i, Paren *paren);
Paren *Rx_outer_paren(RegExp *rx, Paren *paren);
Paren *Rx_closest_quant(RegExp *rx, Paren *paren);
Paren *Rx_quant_paren(RegExp *rx, int rx_i, int start_i, int end_i);
int Rx_encoded_i(RegExp *rx, int rx_i);
int Rx_nsub_paren(RegExp *rx, int paren_i);
int Rx_exit_i(Paren *paren, int dir);
int Rx_next_alt_i(int start_i, int paren_i, int dir, RegExp *rx);

/* Presumably analysis-phase routines (rarest-atom search and backtrack
 * discovery) - confirm. */
int find_rarest(RegExp *rx, int start_i, int end_i, int *rarest_i_ptr);
void find_backtracks(int start_i, int last_i, RegExp *rx);
int alt_fortrack(RegExp *rx, int paren_i, int start_i);

#endif /* REGEXP_H */
