(* File: pcre.mli

   Copyright (C) 1999  Markus Mottl
   email: mottl@miss.wu-wien.ac.at
   WWW: http://miss.wu-wien.ac.at/~mottl

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2 of the License, or (at your option) any later version.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*)

(* $Id: pcre.mli,v 1.9 1999/08/31 22:10:06 mottl Exp $ *)

(*** IMPORTANT INFORMATION:

  Many functions support different interfaces depending on the
  parameters one can optionally use with them. This is reflected in
  their names. These are the postfix conventions:

  ...on   -   Function additionally expects runtime options and an offset
  ...o    -   Function additionally expects runtime options
  ...n    -   Function additionally expects an offset

  These postfixes are available for the following functions:

    exec, next_match, extract, pmatch, replace_first, replace_all,
    qreplace_first, qreplace_all, substitute_first, substitute_all,
    bounded_psplit, psplit, bounded_full_psplit, full_psplit,
    bounded_split, split, bounded_split_delim, split_delim,
    bounded_full_split, full_split

  Per default, no runtime options and an offset of '0' will be used if
  there is no additional postfix.

  All functions taking an offset will raise [Invalid_argument] if the
  offset is outside of the subject string.

  Some functions may also take the prefix 'a', which stands for "anchored
  match". This ensures that the pattern must match immediately at the
  position where matching is started. This is useful with the following
  functions:

    exec, next_match, extract, pmatch, replace_first, replace_all,
    qreplace_first, qreplace_all, substitute_first, substitute_all

  See the function comments for details on their other parameters.
*)


(*** Exceptions *)

(* Raised when the regular expression is malformed.
   NOTE(review): the [string] is presumably the error message and the
   [int] the position of the error within the pattern - confirm against
   the implementation. *)
exception BadPattern of string * int


(*** Compilation and runtime flags and their conversion functions *)

(* [icflag] and [irflag] are abstract. Values are built from flag lists
   with [cflags] and [rflags] and can be converted back to lists with
   [cflag_list] and [rflag_list]. *)
type icflag (* Internal representation of compilation flags *)
and  irflag (* Internal representation of runtime flags *)

(* Compilation flags: options given when a pattern is compiled *)
and cflag =
  | CASELESS       (* Case insensitive matching *)
  | MULTILINE      (* '^' and '$' match before/after newlines,
                      not just at the beginning/end of a string *)
  | DOTALL         (* '.' matches all characters (newlines, too) *)
  | EXTENDED       (* Ignores whitespace and PERL-comments. Behaves
                      like the '/x'-option in PERL *)
  | C_ANCHORED     (* Pattern matches only at start of string *)
  | DOLLAR_ENDONLY (* '$' in pattern matches only at end of string *)
  | EXTRA          (* Reserved for future extensions of PCRE *)
  | UNGREEDY       (* Quantifiers are not greedy anymore; they only
                      become greedy if followed by '?' *)

val cflags : cflag list -> icflag
        (* [cflags cflag_list] converts the list of compilation flags
           [cflag_list] to its internal representation, which is the
           form expected by e.g. [compile] and [regexpo] *)

val cflag_list : icflag -> cflag list
        (* [cflag_list cflags] converts the internal representation
           [cflags] of compilation flags back to a list of flags *)

(* Runtime flags *)
type rflag =
  | R_ANCHORED (* Treats pattern as if it were anchored *)
  | NOTBOL     (* Beginning of string is not treated as beginning of line *)
  | NOTEOL     (* End of string is not treated as end of line *)
  | NOTEMPTY   (* Empty strings are not considered to be a valid match *)
        (* These flags are given when a pattern is matched rather than
           when it is compiled. Convert a list of them with [rflags] to
           obtain the [irflag] value expected by the matching functions. *)

val rflags : rflag list -> irflag
        (* [rflags rflag_list] converts the list of runtime flags
           [rflag_list] to its internal representation, which is the
           form expected by all functions with the postfixes "o" and
           "on" *)

val rflag_list : irflag -> rflag list
        (* [rflag_list rflags] converts the internal representation
           [rflags] of runtime flags back to a list of flags *)


(*** Information on patterns *)

(* Information on matching of "first chars" in patterns, as found in
   the [firstchar] field of [info] *)
type firstchar_info =
  | Char of char (* Fixed first character *)
  | StartOnly    (* Pattern matches at the beginning of the string and
                    at newlines.
                    NOTE(review): presumably this is the PCRE case
                    where a match can only start at the beginning of
                    the subject or directly after a newline - confirm *)
  | Anchored     (* Pattern is anchored *)

(* Information on the study status of patterns, as found in the
   [study_stat] field of [info]. See [study] for studying a pattern. *)
type study_stat =
  | Not_studied (* Pattern has not yet been studied *)
  | Studied     (* Pattern has been studied successfully *)
  | Optimal     (* Pattern could not be improved by studying *)

(* Information on a compiled pattern, as returned by [info] *)
type info =
  { subpats : int;              (* Number of subpatterns *)
    cflags : icflag;            (* Compilation flags in internal
                                   representation; use [cflag_list]
                                   to convert them to a list *)
    firstchar : firstchar_info; (* Information on "first chars" *)
    study_stat : study_stat }   (* Study status of pattern *)

type regexp (* Compiled regular expressions (abstract); created by
               [compile] and the [regexp...] / [sregexp...] functions *)

(* [info regexp] returns information on the compiled regular expression
   [regexp]: number of subpatterns, compilation flags, "first char"
   information and study status (see type [info]) *)
external info : regexp -> info = "pcre_info_wrapper"


(*** Compilation of patterns *)

(* For detailed documentation on how you can specify PERL-style
   regular expressions (=patterns), please consult PERL-manuals or the
   man-page of PCRE! *)

type chartables (* Alternative set of chartables for pattern matching
                   (abstract); created by [maketables] and passed to
                   [compile] *)

external maketables : unit -> chartables = "pcre_maketables_wrapper"
        (* [maketables ()] generates a new set of char tables for the
           current locale *)

external compile :
  icflag -> chartables option -> string -> regexp = "pcre_compile_wrapper"
        (* [compile icflags (Some chartables) str] returns the compiled
           regular expression for the pattern [str], compiled with
           options [icflags] and the alternative set of chartables
           [chartables].
           NOTE(review): passing [None] instead presumably selects the
           default table set, and a malformed pattern presumably raises
           [BadPattern] - confirm against the C wrapper. *)

val study : regexp -> regexp
        (* [study regexp] studies the compiled regular expression
           [regexp] and returns it again. Studying is especially useful
           for patterns with many alternations. The study status of a
           pattern is available in the [study_stat] field of the result
           of [info]. *)

val regexpo : icflag -> string -> regexp
        (* [regexpo cflags str] compiles the regular expression
           [str] with the default table set and the compilation
           options [cflags] (see function [cflags] for building them) *)

val regexp : string -> regexp
        (* [regexp str] compiles the regular expression [str]
           with the default table set.
           NOTE(review): presumably no compilation flags are set in
           this case - confirm. *)

val regexp_case_fold : string -> regexp
        (* [regexp_case_fold str] compiles the regular expression
           [str] with the default table set for case-insensitive
           matching *)

val sregexpo : icflag -> string -> regexp
val sregexp : string -> regexp
val sregexp_case_fold : string -> regexp
        (* [sregexpo], [sregexp] and [sregexp_case_fold] are the same as
           [regexpo], [regexp] and [regexp_case_fold] respectively, but
           the patterns will be studied (see [study]). *)

val quote : string -> string
        (* [quote str] returns the quoted string of [str].
           NOTE(review): presumably this escapes the characters that are
           special in patterns, so that the result matches [str]
           literally when compiled - confirm. *)


(*** Matching of patterns and subpattern extraction *)

type substrings (* Information on substrings after pattern matching
                   (abstract); returned by the [exec...] and
                   [next_match...] functions *)

val num_of_subs : substrings -> int
        (* [num_of_subs substrings] returns the number of substrings
           in [substrings], including the whole match *)

val get_substring : substrings -> int -> string
        (* [get_substring substrings n] returns the [n]th substring
           of [substrings] (0 is the whole match).
           NOTE(review): behaviour for an [n] that is out of range is
           not visible from the interface - confirm. *)

val get_substrings : substrings -> string array
        (* [get_substrings substrings] returns the array of all
           substrings of [substrings] (the whole match is at index 0) *)

val pcre_exec :
  irflag -> regexp -> int -> string -> int array
        (* [pcre_exec rflags regexp offset subject] returns an array
           of offsets that describe the position of matched subpatterns
           in the string [subject] starting at position [offset] with
           regular expression [regexp] and runtime flags [rflags].
           Raises [Not_found] if pattern does not match.
           Raises [Invalid_argument] if [offset] is outside of
           [subject].
           NOTE(review): the exact layout of the returned array is not
           visible from the interface - consult the implementation. *)

val execon : irflag -> regexp -> int -> string -> substrings
        (* [execon rflags regexp offset subject] returns substring
           information on string [subject] starting at position [offset]
           with regular expression [regexp] and runtime flags [rflags].
           Raises [Not_found] if pattern does not match.
           Raises [Invalid_argument] if [offset] is outside of
           [subject]. *)
val execo : irflag -> regexp -> string -> substrings
        (* [execo rflags regexp subject] is like [execon], but uses an
           offset of 0 *)
val execn : regexp -> int -> string -> substrings
        (* [execn regexp offset subject] is like [execon], but uses no
           runtime options *)
val exec : regexp -> string -> substrings
        (* [exec regexp subject] is like [execon], but uses no runtime
           options and an offset of 0 *)
val aexecn : regexp -> int -> string -> substrings
        (* [aexecn regexp offset subject] is the anchored variant of
           [execn]: the pattern must match immediately at the position
           where matching is started *)
val aexec : regexp -> string -> substrings
        (* [aexec regexp subject] is the anchored variant of [exec] *)

val next_matchon : irflag -> regexp -> int -> substrings -> substrings
        (* [next_matchon rflags regexp offset substrings] returns
           substring info on the match that follows the last match
           denoted by [substrings], jumping over [offset] characters
           and using pattern [regexp] with runtime flags [rflags].
           Raises [Not_found] if pattern does not match. *)
val next_matcho : irflag -> regexp -> substrings -> substrings
        (* [next_matcho rflags regexp substrings] is like
           [next_matchon], but uses an offset of 0 *)
val next_matchn : regexp -> int -> substrings -> substrings
        (* [next_matchn regexp offset substrings] is like
           [next_matchon], but uses no runtime options *)
val next_match : regexp -> substrings -> substrings
        (* [next_match regexp substrings] is like [next_matchon], but
           uses no runtime options and an offset of 0 *)
val anext_matchn : regexp -> int -> substrings -> substrings
        (* [anext_matchn regexp offset substrings] is the anchored
           variant of [next_matchn]: the pattern must match immediately
           at the position where matching is started *)
val anext_match : regexp -> substrings -> substrings
        (* [anext_match regexp substrings] is the anchored variant of
           [next_match] *)

val extracton : irflag -> regexp -> int -> string -> string array
        (* [extracton rflags regexp offset subject] returns the array of
           substrings that match the string [subject] starting at position
           [offset] with regular expression [regexp] and runtime flags
           [rflags].
           Raises [Invalid_argument] if [offset] is outside of
           [subject].
           NOTE(review): presumably raises [Not_found] like [execon] if
           the pattern does not match - confirm. *)
val extracto : irflag -> regexp -> string -> string array
        (* [extracto rflags regexp subject] is like [extracton], but
           uses an offset of 0 *)
val extractn : regexp -> int -> string -> string array
        (* [extractn regexp offset subject] is like [extracton], but
           uses no runtime options *)
val extract : regexp -> string -> string array
        (* [extract regexp subject] is like [extracton], but uses no
           runtime options and an offset of 0 *)
val aextractn : regexp -> int -> string -> string array
        (* [aextractn regexp offset subject] is the anchored variant of
           [extractn]: the pattern must match immediately at the
           position where matching is started *)
val aextract : regexp -> string -> string array
        (* [aextract regexp subject] is the anchored variant of
           [extract] *)

val pmatchon : irflag -> regexp -> int -> string -> bool
        (* [pmatchon rflags regexp offset subject] returns [true]
           if pattern [regexp] matches string [subject] starting at
           position [offset] with runtime flags [rflags], [false]
           otherwise.
           Raises [Invalid_argument] if [offset] is outside of
           [subject]. *)
val pmatcho : irflag -> regexp -> string -> bool
        (* [pmatcho rflags regexp subject] is like [pmatchon], but uses
           an offset of 0 *)
val pmatchn : regexp -> int -> string -> bool
        (* [pmatchn regexp offset subject] is like [pmatchon], but uses
           no runtime options *)
val pmatch : regexp -> string -> bool
        (* [pmatch regexp subject] is like [pmatchon], but uses no
           runtime options and an offset of 0 *)
val apmatchn : regexp -> int -> string -> bool
        (* [apmatchn regexp offset subject] is the anchored variant of
           [pmatchn]: the pattern must match immediately at the
           position where matching is started *)
val apmatch : regexp -> string -> bool
        (* [apmatch regexp subject] is the anchored variant of
           [pmatch] *)


(*** String substitution *)

(* Internal representation of substitution patterns (abstract).
   Values are created from strings with [subst] and are consumed by
   the [replace_first...] and [replace_all...] functions. *)
type substitution

val subst : string -> substitution
        (* [subst str] converts the string [str] representing a
           substitution pattern to the internal representation, which
           can then be passed to the [replace_first...] and
           [replace_all...] functions.

           The contents of the substitution string [str] can be normal
           text mixed with any of the following (mostly as in PERL):

           $[0-9]+    - a "$" immediately followed by an arbitrary
                        number.  "$0" stands for the name of the
                        executable, any other number for the n-th
                        backreference.
           $&         - the whole matched pattern
           $`         - the text before the match
           $'         - the text after the match
           $+         - the last group that matched
           $$         - a single "$"
           $!         - Delimiter which does not appear in the
                        substitution. Can be used to separate "$[0-9]+"
                        from an immediately following other number. *)

val replace_firston :
  irflag -> regexp -> int -> substitution -> string -> string
        (* [replace_firston rflags regexp offset repl subject] replaces
           the first occurrence of a pattern that matches [regexp]
           with runtime flags [rflags] starting at position [offset]
           in string [subject] with the substitution [repl] (see
           [subst]) and returns the resulting string.
           Raises [Failure] if there are backreferences to nonexistent
           subpatterns.
           Raises [Invalid_argument] if [offset] is outside of
           [subject]. *)
val replace_firsto : irflag -> regexp -> substitution -> string -> string
        (* [replace_firsto rflags regexp repl subject] is like
           [replace_firston], but uses an offset of 0 *)
val replace_firstn : regexp -> int -> substitution -> string -> string
        (* [replace_firstn regexp offset repl subject] is like
           [replace_firston], but uses no runtime options *)
val replace_first : regexp -> substitution -> string -> string
        (* [replace_first regexp repl subject] is like
           [replace_firston], but uses no runtime options and an
           offset of 0 *)
val areplace_firstn : regexp -> int -> substitution -> string -> string
        (* [areplace_firstn regexp offset repl subject] is the anchored
           variant of [replace_firstn]: the pattern must match
           immediately at the position where matching is started *)
val areplace_first : regexp -> substitution -> string -> string
        (* [areplace_first regexp repl subject] is the anchored variant
           of [replace_first] *)

val qreplace_firston :
  irflag -> regexp -> int -> string -> string -> string
        (* [qreplace_firston rflags regexp offset str subject] replaces
           the first occurrence of a pattern that matches [regexp]
           with runtime flags [rflags] starting at position [offset]
           in string [subject] with the string [str]. In contrast to
           [replace_firston] the replacement is a plain string rather
           than a [substitution].
           Raises [Invalid_argument] if [offset] is outside of
           [subject].
           NOTE(review): presumably [str] is inserted literally, i.e.
           without any "$"-expansion - confirm. *)
val qreplace_firsto : irflag -> regexp -> string -> string -> string
        (* [qreplace_firsto rflags regexp str subject] is like
           [qreplace_firston], but uses an offset of 0 *)
val qreplace_firstn : regexp -> int -> string -> string -> string
        (* [qreplace_firstn regexp offset str subject] is like
           [qreplace_firston], but uses no runtime options *)
val qreplace_first : regexp -> string -> string -> string
        (* [qreplace_first regexp str subject] is like
           [qreplace_firston], but uses no runtime options and an
           offset of 0 *)
val aqreplace_firstn : regexp -> int -> string -> string -> string
        (* [aqreplace_firstn regexp offset str subject] is the anchored
           variant of [qreplace_firstn]: the pattern must match
           immediately at the position where matching is started *)
val aqreplace_first : regexp -> string -> string -> string
        (* [aqreplace_first regexp str subject] is the anchored variant
           of [qreplace_first] *)

val replace_allon : irflag -> regexp -> int -> substitution -> string -> string
        (* [replace_allon rflags regexp offset repl subject] replaces all
           occurrences of a pattern that matches [regexp] with runtime
           flags [rflags] starting at position [offset] in string
           [subject] with the substitution [repl] (see [subst]) and
           returns the resulting string.
           Raises [Failure] if there are backreferences to nonexistent
           subpatterns.
           Raises [Invalid_argument] if [offset] is outside of
           [subject]. *)
val replace_allo : irflag -> regexp -> substitution -> string -> string
        (* [replace_allo rflags regexp repl subject] is like
           [replace_allon], but uses an offset of 0 *)
val replace_alln : regexp -> int -> substitution -> string -> string
        (* [replace_alln regexp offset repl subject] is like
           [replace_allon], but uses no runtime options *)
val replace_all : regexp -> substitution -> string -> string
        (* [replace_all regexp repl subject] is like [replace_allon],
           but uses no runtime options and an offset of 0 *)
val areplace_alln : regexp -> int -> substitution -> string -> string
        (* [areplace_alln regexp offset repl subject] is the anchored
           variant of [replace_alln]: the pattern must match
           immediately at the position where matching is started *)
val areplace_all : regexp -> substitution -> string -> string
        (* [areplace_all regexp repl subject] is the anchored variant
           of [replace_all] *)

val qreplace_allon : irflag -> regexp -> int -> string -> string -> string
        (* [qreplace_allon rflags regexp offset str subject] replaces all
           occurrences of a pattern that matches [regexp] with runtime
           flags [rflags] starting at position [offset] in string
           [subject] with the string [str]. In contrast to
           [replace_allon] the replacement is a plain string rather
           than a [substitution].
           Raises [Invalid_argument] if [offset] is outside of
           [subject].
           NOTE(review): presumably [str] is inserted literally, i.e.
           without any "$"-expansion - confirm. *)
val qreplace_allo : irflag -> regexp -> string -> string -> string
        (* [qreplace_allo rflags regexp str subject] is like
           [qreplace_allon], but uses an offset of 0 *)
val qreplace_alln : regexp -> int -> string -> string -> string
        (* [qreplace_alln regexp offset str subject] is like
           [qreplace_allon], but uses no runtime options *)
val qreplace_all : regexp -> string -> string -> string
        (* [qreplace_all regexp str subject] is like [qreplace_allon],
           but uses no runtime options and an offset of 0 *)
val aqreplace_alln : regexp -> int -> string -> string -> string
        (* [aqreplace_alln regexp offset str subject] is the anchored
           variant of [qreplace_alln]: the pattern must match
           immediately at the position where matching is started *)
val aqreplace_all : regexp -> string -> string -> string
        (* [aqreplace_all regexp str subject] is the anchored variant
           of [qreplace_all] *)

val substitute_firston :
  irflag -> regexp -> int -> (string -> string) -> string -> string
        (* [substitute_firston rflags regexp offset f subject] replaces
           the first occurrence of a pattern that matches [regexp]
           with runtime flags [rflags] starting at position [offset]
           in string [subject] with the result of function [f] applied
           to the match, and returns the resulting string.
           Raises [Invalid_argument] if [offset] is outside of
           [subject]. *)
val substitute_firsto :
  irflag -> regexp -> (string -> string) -> string -> string
        (* [substitute_firsto rflags regexp f subject] is like
           [substitute_firston], but uses an offset of 0 *)
val substitute_firstn : regexp -> int -> (string -> string) -> string -> string
        (* [substitute_firstn regexp offset f subject] is like
           [substitute_firston], but uses no runtime options *)
val substitute_first : regexp -> (string -> string) -> string -> string
        (* [substitute_first regexp f subject] is like
           [substitute_firston], but uses no runtime options and an
           offset of 0 *)
val asubstitute_firstn : regexp -> int -> (string -> string) -> string -> string
        (* [asubstitute_firstn regexp offset f subject] is the anchored
           variant of [substitute_firstn]: the pattern must match
           immediately at the position where matching is started *)
val asubstitute_first : regexp -> (string -> string) -> string -> string
        (* [asubstitute_first regexp f subject] is the anchored variant
           of [substitute_first] *)

val substitute_allon :
  irflag -> regexp -> int -> (string -> string) -> string -> string
        (* [substitute_allon rflags regexp offset f subject]
           replaces all occurrences of a pattern that matches [regexp]
           with runtime flags [rflags] starting at position [offset]
           in string [subject] with the result of function [f] applied
           to the match, and returns the resulting string.
           Raises [Invalid_argument] if [offset] is outside of
           [subject]. *)
val substitute_allo : irflag -> regexp -> (string -> string) -> string -> string
        (* [substitute_allo rflags regexp f subject] is like
           [substitute_allon], but uses an offset of 0 *)
val substitute_alln : regexp -> int -> (string -> string) -> string -> string
        (* [substitute_alln regexp offset f subject] is like
           [substitute_allon], but uses no runtime options *)
val substitute_all : regexp -> (string -> string) -> string -> string
        (* [substitute_all regexp f subject] is like
           [substitute_allon], but uses no runtime options and an
           offset of 0 *)
val asubstitute_alln : regexp -> int -> (string -> string) -> string -> string
        (* [asubstitute_alln regexp offset f subject] is the anchored
           variant of [substitute_alln]: the pattern must match
           immediately at the position where matching is started *)
val asubstitute_all : regexp -> (string -> string) -> string -> string
        (* [asubstitute_all regexp f subject] is the anchored variant
           of [substitute_all] *)


(*** Splitting *)
      

(** Splitting compatible with PERL *)

val bounded_pspliton : irflag -> regexp -> int -> int -> string -> string list
        (* [bounded_pspliton rflags regexp bound offset subject] splits
           [subject] beginning at position [offset] and considering
           delimiter [regexp] with runtime flags [rflags] at most [bound]
           times into substrings (PERL-compatible).
           Raises [Invalid_argument] if [offset] is outside of
           [subject]. *)
val bounded_psplito : irflag -> regexp -> int -> string -> string list
        (* [bounded_psplito rflags regexp bound subject] is like
           [bounded_pspliton], but uses an offset of 0 *)
val bounded_psplitn : regexp -> int -> int -> string -> string list
        (* Like [bounded_pspliton], but uses no runtime options *)
val bounded_psplit : regexp -> int -> string -> string list
        (* [bounded_psplit regexp bound subject] is like
           [bounded_pspliton], but uses no runtime options and an
           offset of 0 *)

val pspliton : irflag -> regexp -> int -> string -> string list
        (* [pspliton rflags regexp offset subject] is like
           [bounded_pspliton], but assumes an infinite bound and strips
           trailing null fields.
           Raises [Invalid_argument] if [offset] is outside of
           [subject]. *)
val psplito : irflag -> regexp -> string -> string list
        (* [psplito rflags regexp subject] is like [pspliton], but uses
           an offset of 0 *)
val psplitn : regexp -> int -> string -> string list
        (* [psplitn regexp offset subject] is like [pspliton], but uses
           no runtime options *)
val psplit : regexp -> string -> string list
        (* [psplit regexp subject] is like [pspliton], but uses no
           runtime options and an offset of 0 *)

(* Elements of the lists returned by the [bounded_full_psplit...] and
   [full_psplit...] functions *)
type psplit_result = PText of string        (* Text part of split string *)
                   | PDelim of string       (* Delimiter part of split
                                               string *)
                   | PGroup of int * string (* Subgroup of matched delimiter
                                               (subgroup_nr, subgroup_str) *)
                   | PNoGroup               (* Unmatched subgroup *)

val bounded_full_pspliton :
  irflag -> regexp -> int -> int -> string -> psplit_result list
        (* [bounded_full_pspliton rflags regexp bound offset subject]
           splits [subject] beginning at position [offset] and considering
           delimiter [regexp] with runtime flags [rflags] at most [bound]
           times and returns a list of [psplit_result] values, i.e.
           text parts as well as delimiters and their subgroups.
           Raises [Invalid_argument] if [offset] is outside of
           [subject]. *)
val bounded_full_psplito :
  irflag -> regexp -> int -> string -> psplit_result list
        (* [bounded_full_psplito rflags regexp bound subject] is like
           [bounded_full_pspliton], but uses an offset of 0 *)
val bounded_full_psplitn : regexp -> int -> int -> string -> psplit_result list
        (* Like [bounded_full_pspliton], but uses no runtime options *)
val bounded_full_psplit : regexp -> int -> string -> psplit_result list
        (* [bounded_full_psplit regexp bound subject] is like
           [bounded_full_pspliton], but uses no runtime options and an
           offset of 0 *)

val full_pspliton : irflag -> regexp -> int -> string -> psplit_result list
        (* [full_pspliton rflags regexp offset subject] is like
           [bounded_full_pspliton], but assumes an infinite bound.
           Raises [Invalid_argument] if [offset] is outside of
           [subject]. *)
val full_psplito : irflag -> regexp -> string -> psplit_result list
        (* [full_psplito rflags regexp subject] is like
           [full_pspliton], but uses an offset of 0 *)
val full_psplitn : regexp -> int -> string -> psplit_result list
        (* [full_psplitn regexp offset subject] is like
           [full_pspliton], but uses no runtime options *)
val full_psplit : regexp -> string -> psplit_result list
        (* [full_psplit regexp subject] is like [full_pspliton], but
           uses no runtime options and an offset of 0 *)


(** Splitting compatible with the "Str"-module.
    The only difference (besides kind and order of parameters):
    no crash on null patterns - behaves like PERL in this case. *)

val bounded_spliton : irflag -> regexp -> int -> int -> string -> string list
        (* [bounded_spliton rflags regexp bound offset subject] splits
           [subject] into a list of substrings, using the runtime flags
           [rflags], the delimiter pattern [regexp], the bound [bound]
           and the start position [offset].
           Raises [Invalid_argument] if [offset] is outside of
           [subject].
           NOTE(review): presumably the counterpart of
           [Str.bounded_split] - confirm the exact semantics there. *)
val bounded_splito : irflag -> regexp -> int -> string -> string list
        (* [bounded_splito rflags regexp bound subject] is like
           [bounded_spliton], but uses an offset of 0 *)
val bounded_splitn : regexp -> int -> int -> string -> string list
        (* Like [bounded_spliton], but uses no runtime options *)
val bounded_split : regexp -> int -> string -> string list
        (* [bounded_split regexp bound subject] is like
           [bounded_spliton], but uses no runtime options and an
           offset of 0 *)

val spliton : irflag -> regexp -> int -> string -> string list
        (* [spliton rflags regexp offset subject] splits [subject] into
           a list of substrings, using the runtime flags [rflags], the
           delimiter pattern [regexp] and the start position [offset].
           Raises [Invalid_argument] if [offset] is outside of
           [subject].
           NOTE(review): presumably the counterpart of [Str.split],
           i.e. [bounded_spliton] without a bound - confirm. *)
val splito : irflag -> regexp -> string -> string list
        (* [splito rflags regexp subject] is like [spliton], but uses
           an offset of 0 *)
val splitn : regexp -> int -> string -> string list
        (* [splitn regexp offset subject] is like [spliton], but uses
           no runtime options *)
val split : regexp -> string -> string list
        (* [split regexp subject] is like [spliton], but uses no
           runtime options and an offset of 0 *)

val bounded_split_delimon :
  irflag -> regexp -> int -> int -> string -> string list
        (* [bounded_split_delimon rflags regexp bound offset subject]
           splits [subject] into a list of substrings, using the
           runtime flags [rflags], the delimiter pattern [regexp], the
           bound [bound] and the start position [offset].
           Raises [Invalid_argument] if [offset] is outside of
           [subject].
           NOTE(review): presumably the counterpart of
           [Str.bounded_split_delim] - confirm the exact semantics
           there. *)
val bounded_split_delimo : irflag -> regexp -> int -> string -> string list
        (* [bounded_split_delimo rflags regexp bound subject] is like
           [bounded_split_delimon], but uses an offset of 0 *)
val bounded_split_delimn : regexp -> int -> int -> string -> string list
        (* Like [bounded_split_delimon], but uses no runtime options *)
val bounded_split_delim : regexp -> int -> string -> string list
        (* [bounded_split_delim regexp bound subject] is like
           [bounded_split_delimon], but uses no runtime options and an
           offset of 0 *)

val split_delimon : irflag -> regexp -> int -> string -> string list
        (* [split_delimon rflags regexp offset subject] splits
           [subject] into a list of substrings, using the runtime flags
           [rflags], the delimiter pattern [regexp] and the start
           position [offset].
           Raises [Invalid_argument] if [offset] is outside of
           [subject].
           NOTE(review): presumably the counterpart of
           [Str.split_delim], i.e. [bounded_split_delimon] without a
           bound - confirm. *)
val split_delimo : irflag -> regexp -> string -> string list
        (* [split_delimo rflags regexp subject] is like
           [split_delimon], but uses an offset of 0 *)
val split_delimn : regexp -> int -> string -> string list
        (* [split_delimn regexp offset subject] is like
           [split_delimon], but uses no runtime options *)
val split_delim : regexp -> string -> string list
        (* [split_delim regexp subject] is like [split_delimon], but
           uses no runtime options and an offset of 0 *)

(* Elements of the lists returned by the [bounded_full_split...] and
   [full_split...] functions.
   NOTE(review): by analogy with [psplit_result], [Text] presumably
   carries a text part and [Delim] a matched delimiter - confirm. *)
type split_result = Text of string | Delim of string

val bounded_full_spliton :
  irflag -> regexp -> int -> int -> string -> split_result list
        (* [bounded_full_spliton rflags regexp bound offset subject]
           splits [subject] into a list of [split_result] values, using
           the runtime flags [rflags], the delimiter pattern [regexp],
           the bound [bound] and the start position [offset].
           Raises [Invalid_argument] if [offset] is outside of
           [subject].
           NOTE(review): presumably the counterpart of
           [Str.bounded_full_split] - confirm the exact semantics
           there. *)
val bounded_full_splito : irflag -> regexp -> int -> string -> split_result list
        (* [bounded_full_splito rflags regexp bound subject] is like
           [bounded_full_spliton], but uses an offset of 0 *)
val bounded_full_splitn : regexp -> int -> int -> string -> split_result list
        (* Like [bounded_full_spliton], but uses no runtime options *)
val bounded_full_split : regexp -> int -> string -> split_result list
        (* [bounded_full_split regexp bound subject] is like
           [bounded_full_spliton], but uses no runtime options and an
           offset of 0 *)

val full_spliton : irflag -> regexp -> int -> string -> split_result list
        (* [full_spliton rflags regexp offset subject] splits [subject]
           into a list of [split_result] values, using the runtime
           flags [rflags], the delimiter pattern [regexp] and the start
           position [offset].
           Raises [Invalid_argument] if [offset] is outside of
           [subject].
           NOTE(review): presumably the counterpart of
           [Str.full_split], i.e. [bounded_full_spliton] without a
           bound - confirm. *)
val full_splito : irflag -> regexp -> string -> split_result list
        (* [full_splito rflags regexp subject] is like [full_spliton],
           but uses an offset of 0 *)
val full_splitn : regexp -> int -> string -> split_result list
        (* [full_splitn regexp offset subject] is like [full_spliton],
           but uses no runtime options *)
val full_split : regexp -> string -> split_result list
        (* [full_split regexp subject] is like [full_spliton], but uses
           no runtime options and an offset of 0 *)


(*** Version information *)

val version : string
        (* Version of the underlying PCRE-C-library as a string *)


(*** Additional convenience functions useful in combination with this
     library *)

(* [foreach_line ch f] applies [f] to each line read from the input
   channel [ch] until the end-of-file is reached.
   NOTE(review): the channel is presumably left open for the caller to
   close - confirm. *)
val foreach_line : in_channel -> (string -> unit) -> unit

(* [foreach_file filenames f] opens each file in the list [filenames]
   for input and applies [f] to the pair [(filename, file_channel)]
   consisting of the file name and the opened input channel.
   The files get closed again afterwards.
   NOTE(review): whether a file is also closed when [f] raises an
   exception is not visible from the interface - confirm. *)
val foreach_file : string list -> (string * in_channel -> unit) -> unit


(*** UNSAFE STUFF - USE WITH CAUTION! *)

external unsafe_pcre_exec :
  irflag -> regexp -> int -> string ->
  int -> int array -> unit = "pcre_exec_wrapper_bc" "pcre_exec_wrapper"
        (* [unsafe_pcre_exec rflags regexp offset subject
                             subgroup_offsets offset-vector]
           You should read the C-source to know what happens.
           If you do not understand this - don't use this function!

           Two stub names are given because the function takes more
           than five arguments: the first one is used by bytecode
           programs, the second one by native-code programs.
           NOTE(review): the last two arguments are presumably the pair
           computed by [make_ovector], and the result of the match is
           presumably written into the offset vector - confirm against
           the C-source. *)

(* [make_ovector regexp] calculates the pair [(subgroups2, ovector)]
   for the pattern [regexp], where [subgroups2] is the number of
   subgroup offsets and [ovector] is the offset array.
   NOTE(review): presumably intended to provide the last two arguments
   of [unsafe_pcre_exec] - confirm. *)
val make_ovector : regexp -> int * int array
