(*
   PCRE-OCAML - Perl Compatibility Regular Expressions for OCaml

   Copyright (C) 1999  Markus Mottl
   email: mottl@miss.wu-wien.ac.at
   WWW: http://miss.wu-wien.ac.at/~mottl

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2 of the License, or (at your option) any later version.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*)

(* $Id: pcre.ml,v 1.13 1999/09/27 12:07:40 mottl Exp $ *)

(*** IMPORTANT INFORMATION:

  Many functions support different interfaces depending on the
  parameters one can optionally use with them. This is reflected in
  their names. These are the postfix conventions:

  ...on   -   Function additionally expects runtime options and an offset
  ...o    -   Function additionally expects runtime options
  ...n    -   Function additionally expects an offset

  These postfixes are available for the following functions:

    exec, next_match, extract, pmatch, replace_first, replace_all,
    qreplace_first, qreplace_all, substitute_first, substitute_all,
    bounded_psplit, psplit, full_psplit, bounded_split, split,
    bounded_split_delim, split_delim, bounded_full_split, full_split

  Per default, no runtime options and an offset of '0' will be used if
  there is no additional postfix.

  Some functions may also take the prefix 'a', which stands for "anchored
  match". This ensures that the pattern must match immediately at the
  position where matching is started. This is useful with the following
  functions:

    exec, next_match, extract, pmatch, replace_first, replace_all,
    qreplace_first, qreplace_all, substitute_first, substitute_all,

  See the function comments for details on their other parameters.
*)


(*** Public exceptions and their registration with the C runtime *)

(* Gets raised when the regular expression is malformed *)
exception BadPattern of string * int

(* Gets raised when the C-library exhibits undefined behaviour *)
exception InternalError of string

(* Puts exceptions into global C-variables for fast retrieval *)
external init_caml_names : unit -> unit = "pcre_init_caml_names"

(* Registers exceptions with the C runtime *)
let _ =
  Callback.register_exception "Pcre.Not_found" Not_found;
  Callback.register_exception "Pcre.BadPattern" (BadPattern ("", 0));
  Callback.register_exception "Pcre.InternalError" (InternalError "");
  init_caml_names ()


(*** Compilation and runtime flags and their conversion functions *)

type icflag = int (* Internal representation of compilation flags *)
and  irflag = int (* Internal representation of runtime flags *)

(* Compilation flags *)
and cflag =
  | CASELESS       (* Case insensitive matching *)
  | MULTILINE      (* '^' and '$' match before/after newlines,
                      not just at the beginning/end of a string *)
  | DOTALL         (* '.' matches all characters (newlines, too) *)
  | EXTENDED       (* Ignores whitespace and PERL-comments. Behaves
                      like the '/x'-option in PERL *)
  | C_ANCHORED     (* Pattern matches only at start of string *)
  | DOLLAR_ENDONLY (* '$' in pattern matches only at end of string *)
  | EXTRA          (* Reserved for future extensions of PCRE *)
  | UNGREEDY       (* Quantifiers not greedy anymore, only
                      if followed by '?' *)

(* [cflags cflag_list] converts a list of compilation flags to their
   internal representation *)
let cflags =
  let int_of_cflag = function
    | CASELESS   -> 0x0001 | MULTILINE      -> 0x0002
    | DOTALL     -> 0x0004 | EXTENDED       -> 0x0008
    | C_ANCHORED -> 0x0010 | DOLLAR_ENDONLY -> 0x0020
    | EXTRA      -> 0x0040 | UNGREEDY       -> 0x0200 in
  List.fold_left (fun icflag flag -> int_of_cflag flag lor icflag) 0

(* [cflag_list cflags] converts internal representation of compilation
   flags to a list *)
let cflag_list icflags =
  let cflag_of_int = function
    | 0x0001 -> CASELESS   | 0x0002 -> MULTILINE
    | 0x0004 -> DOTALL     | 0x0008 -> EXTENDED
    | 0x0010 -> C_ANCHORED | 0x0020 -> DOLLAR_ENDONLY
    | 0x0040 -> EXTRA      | 0x0200 -> UNGREEDY
    | _ -> failwith "Pcre.cflag_list: unknown compilation flag"
  and all_cflags =
    [0x0001; 0x0002; 0x0004; 0x0008; 0x0010; 0x0020; 0x0040; 0x0200] in
  let collect flag_list flag =
    if icflags land flag <> 0 then cflag_of_int flag :: flag_list
    else flag_list in
  List.fold_left collect [] all_cflags

(* Runtime flags *)
type rflag =
  | R_ANCHORED (* Treats pattern as if it were anchored *)
  | NOTBOL     (* Beginning of string is not treated as beginning of line *)
  | NOTEOL     (* End of string is not treated as end of line *)
  | NOTEMPTY   (* Empty strings are not considered to be a valid match *)

(* [rflags rflag_list] converts a list of runtime flags to their internal
   representation *)
let rflags =
  let int_of_rflag = function
    | R_ANCHORED  -> 0x0010 | NOTBOL   -> 0x0080
    | NOTEOL      -> 0x0100 | NOTEMPTY -> 0x0400 in
  List.fold_left (fun irflag flag -> int_of_rflag flag lor irflag) 0

(* [rflag_list rflags] converts internal representation of runtime flags
   to a list *)
let rflag_list irflags =
  let rflag_of_int = function
    | 0x0010 -> R_ANCHORED | 0x0080 -> NOTBOL
    | 0x0100 -> NOTEOL     | 0x0400 -> NOTEMPTY
    | _ -> failwith "Pcre.rflag_list: unknown runtime flag"
  and all_rflags = [0x0010; 0x0080; 0x0100; 0x0400] in
  let collect flag_list flag =
    if irflags land flag <> 0 then rflag_of_int flag :: flag_list
    else flag_list in
  List.fold_left collect [] all_rflags


(*** Information on patterns *)

(* Information on matching of "first chars" in patterns *)
type firstchar_info =
  | Char of char (* fixed first character *)
  | StartOnly    (* pattern matches at beginning and end of newlines *)
  | Anchored     (* pattern is anchored *)

(* Information on the study status of patterns *)
type study_stat =
  | Not_studied (* Pattern has not yet been studied *)
  | Studied     (* Pattern has been studied successfully *)
  | Optimal     (* Pattern could not be improved by studying *)

(* Information on patterns *)
type info =
  { subpats : int;              (* Number of subpatterns *)
    cflags : icflag;            (* Compilation flags *)
    firstchar : firstchar_info; (* Information on "first chars" *)
    study_stat : study_stat }   (* Study status of pattern *)

type regexp (* Compiled regular expressions *)

(* [info regexp] returns information on regular expression *)
external info : regexp -> info = "pcre_info_wrapper"


(*** Compilation of patterns *)

(* For detailed documentation on how you can specify PERL-style
   regular expressions (=patterns), please consult PERL-manuals or the
   man-page of PCRE! *)

type chartables (* Alternative set of chartables for pattern matching *)

external maketables : unit -> chartables = "pcre_maketables_wrapper"
        (* Generates new set of char tables for the current locale *)

external compile :
  icflag -> chartables option -> string -> regexp = "pcre_compile_wrapper"
        (* [compile cflags (Some chartables) str] returns the compiled
           regular expression, compiled with options [cflags] and an
           optional alternative set of chartables [chartables] *)

external pcre_study : regexp -> unit = "pcre_study_wrapper"
        (* [pcre_study regexp] studies the compiled regular expression -
           especially useful for patterns with many alternations *)

(* [study regexp] studies the compiled regular expression and returns
   it again - especially useful for patterns with many alternations *)
let study rex = pcre_study rex; rex

(* [regexpo cflags str] compiles the regular expression [str]
   with the default table set and options [cflags] *)
let regexpo flags = compile flags None

let regexp = regexpo 0
let regexp_case_fold = regexpo 0x0001

(* Same as the functions above, but patterns will be studied. *)
let sregexpo flags rex = study (regexpo flags rex)
let sregexp = sregexpo 0
let sregexp_case_fold = sregexpo 0x0001

(* [quote str] returns the quoted string of [str] *)
let quote s =
  let len = String.length s in
  let buf = String.create (len lsl 1)
  and pos = ref 0 in
  for i = 0 to pred len do
    match String.unsafe_get s i with
    | '\\' | '|' | '(' | ')' | '[' | '{' | '^' | '$' | '*'
    | '+' | '?' | '.' as c ->
      String.unsafe_set buf !pos '\\'; incr pos;
      String.unsafe_set buf !pos c; incr pos
    | c -> String.unsafe_set buf !pos c; incr pos
  done;
  String.sub buf 0 !pos


(*** Matching of patterns and subpattern extraction *)

(* Information on substrings after pattern matching *)
type substrings = string * int array

(* [num_of_subs substrings] returns number of substrings (whole match
   inclusive) *)
let num_of_subs (_, ovector) = Array.length ovector lsr 1

(* [get_substring substrings n] returns the [n]th substring (0 is whole
   match) *)
let get_substring (subj, ovector) str_num =
  let offset = str_num lsl 1 in
  let start = ovector.(offset) in
  if start < 0 then ""
  else String.sub subj start (ovector.(succ offset) - start)

(* [get_substrings substrings] returns the array of substrings (whole
   match on index 0) *)
let get_substrings (_, ovector as substrings) =
  Array.init (Array.length ovector lsr 1) (get_substring substrings)

external unsafe_pcre_exec :
  irflag -> regexp -> int -> string ->
  int -> int array -> unit = "pcre_exec_wrapper_bc" "pcre_exec_wrapper"
        (* [unsafe_pcre_exec rflags regexp offset subject
                             subgroup_offsets offset-vector]
           You should read the C-source to know what happens.
           If you do not understand this - don't use this function! *)


(* [make_ovector regexp] calculates the tuple (subgroups2, ovector)
   which is the number of subgroup offsets and the offset array *)
let make_ovector rex =
  let subgroups1 = succ (info rex).subpats in
  let subgroups2 = subgroups1 lsl 1 in
  subgroups2, Array.create (subgroups1 + subgroups2) 0

(* [pcre_exec rflags regexp offset subject] returns an array of offsets
   that describe the position of matched subpatterns in the string
   [subject] starting at position [offset] with regular expression
   [regexp] and runtime flags [rflags].
   Raises [Not_found] if pattern does not match. *)
let pcre_exec flags rex n subj =
  let subgroups2, ovector = make_ovector rex in
  unsafe_pcre_exec flags rex n subj subgroups2 ovector;
  Array.sub ovector 0 subgroups2

(* [execon rflags regexp offset subject] returns substring information on
   string [subject] starting at position [offset] with regular expression
   [regexp] and runtime flags [rflags].
   Raises [Not_found] if pattern does not match. *)
let execon flags rex n subj = subj, pcre_exec flags rex n subj
let execo flags rex = execon flags rex 0
let execn = execon 0x0000
let exec = execo 0x0000
let aexecn = execon 0x0010
let aexec = execo 0x0010

(* [next_matchon rflags regexp offset substrings] returns substring info
   on the match that follows on the last match denoted by [substrings],
   jumping over [offset] characters and using pattern [regexp] with
   runtime flags [rflags].
   Raises [Not_found] if pattern does not match. *)
let next_matchon flags rex n (subj, ovector) =
  let subj_len = String.length subj in
  if n < 0 || n > subj_len then invalid_arg "Pcre.next_matchon: illegal offset";
  let real_ofs = max (Array.unsafe_get ovector 1 + n) 1 in
  if real_ofs > subj_len then raise Not_found
  else subj, pcre_exec flags rex real_ofs subj

let next_matcho flags rex = next_matchon flags rex 0
let next_matchn = next_matchon 0x0000
let next_match = next_matcho 0x0000
let anext_matchn = next_matchon 0x0010
let anext_match = next_matcho 0x0010

(* [extracton rflags regexp offset subject] returns the array of
   substrings that match the string [subject] starting at position
   [offset] with regular expression [regexp] and runtime flags [rflags] *)
let extracton rflags rex n subj = get_substrings (execon rflags rex n subj)
let extracto rflags rex subj = get_substrings (execo rflags rex subj)
let extractn rex n subj = get_substrings (execn rex n subj)
let extract rex subj = get_substrings (exec rex subj)
let aextractn rex n subj = get_substrings (aexecn rex n subj)
let aextract rex subj = get_substrings (aexec rex subj)

(* [pmatchon rflags regexp offset subject] returns [true] if pattern
   [regexp] matches string [subject] starting at position [offset]
   with runtime flags [rflags], [false] otherwise *)
let pmatchon flags rex n subj =
  try ignore (pcre_exec flags rex n subj); true with Not_found -> false

let pmatcho flags rex = pmatchon flags rex 0
let pmatchn = pmatchon 0x0000
let pmatch = pmatcho 0x0000
let apmatchn = pmatchon 0x0010
let apmatch = pmatcho 0x0010


(*** String substitution *)

(* Elements of a substitution pattern *)
type subst =
  | SubstString of int * int (* Denotes a substring in the substitution *)
  | Backref of int           (* nth backreference ($0 is program name!) *)
  | Match                    (* The whole matched string *)
  | PreMatch                 (* The string before the match *)
  | PostMatch                (* The string after the match *)
  | LastParenMatch           (* The last matched group *)

(* Information on substitution patterns *)
type substitution = string     (* The substitution string *)
                  * int        (* Highest group number of backreferences *)
                  * bool       (* Makes use of "LastParenMatch" *)
                  * subst list (* The list of substitution elements *)

(* Only used internally in "subst" *)
exception FoundAt of int

(* [subst str] converts the string [str] representing a
   substitution pattern to the internal representation

   The contents of the substitution string [str] can be normal
   text mixed with any of the following (mostly as in PERL):

   $[0-9]+    - a "$" immediately followed by an arbitrary
                number.  "$0" stands for the name of the
                executable, any other number for the n-th
                backreference.
   $&         - the whole matched pattern
   $`         - the text before the match
   $'         - the text after the match
   $+         - the last group that matched
   $$         - a single "$"
   $!         - Delimiter which does not appear in the
                substitution. Can be used to part "$[0-9]+"
                from an immediately following other number.
*)
let subst str =
  let max_br = ref 0
  and with_lp = ref false
  and lix = pred (String.length str)
  and zero = Char.code '0' in

  let rec subst acc n =
    if lix < n then acc
    else
      try
        for i = n to lix do
          if String.unsafe_get str i = '$' then raise (FoundAt i)
        done;
        SubstString (n, succ lix - n) :: acc
      with FoundAt i ->
        if i = lix then SubstString (n, succ lix - n) :: acc
        else
          let i1 = succ i
          and acc = if n = i then acc else SubstString (n, i - n) :: acc in
          match String.unsafe_get str i1 with
          | '0'..'9' as c ->
              let subpat_nr = ref (Char.code c - zero) in
              begin try
                  for j = succ i1 to lix do
                    let c = String.unsafe_get str j in
                    if c >= '0' && c <= '9' then
                      subpat_nr := 10 * !subpat_nr + Char.code c - zero
                    else raise (FoundAt j)
                  done;
                  max_br := max !subpat_nr !max_br;
                  Backref !subpat_nr :: acc
                with FoundAt j ->
                  max_br := max !subpat_nr !max_br;
                  subst (Backref !subpat_nr :: acc) j end
          | '!'  -> subst acc (succ i1)
          | '$'  -> subst (SubstString (i1, 1) :: acc) (succ i1)
          | '&'  -> subst (Match :: acc) (succ i1)
          | '`'  -> subst (PreMatch :: acc) (succ i1)
          | '\'' -> subst (PostMatch :: acc) (succ i1)
          | '+'  ->
              with_lp := true;
              subst (LastParenMatch :: acc) (succ i1)
          | _    -> subst acc i1 in
  str, !max_br, !with_lp, subst [] 0

(* Calculates a list of tuples (str, offset, len) which contain
   substrings to be copied on substitutions. Internal use only! *)
let calc_trans_list subgroups2 ovector subj sstr subst_list =
  let prefix_len = Array.unsafe_get ovector 0
  and last = Array.unsafe_get ovector 1 in
  let collect (res_len, trans_list as accu) =
    let return_lst (str, ix, len as el) =
      if len = 0 then accu else res_len + len, el :: trans_list in
    function
    | SubstString (ix, len) -> return_lst (sstr, ix, len)
    | Backref 0 ->
        let prog_name = Sys.argv.(0) in
        return_lst (prog_name, 0, String.length prog_name)
    | Backref n ->
        let offset = n lsl 1 in
        let start = Array.unsafe_get ovector offset in
        let len = Array.unsafe_get ovector (succ offset) - start in
        return_lst (subj, start, len)
    | Match -> return_lst (subj, prefix_len, last - prefix_len)
    | PreMatch -> return_lst (subj, 0, prefix_len)
    | PostMatch -> return_lst (subj, last, String.length subj - last)
    | LastParenMatch ->
        let pos = ref (subgroups2 - 2) in
        let ix = ref (Array.unsafe_get ovector !pos) in
        while !ix < 0 do
          decr pos; decr pos;
          ix := Array.unsafe_get ovector !pos
        done;
        return_lst (subj, !ix, Array.unsafe_get ovector (succ !pos) - !ix) in
  List.fold_left collect (0, []) subst_list

(* [replace_firston rflags regexp offset repl subject] replaces
   the first occurrence of a pattern that matches [regexp] with runtime
   flags [rflags] starting at position [offset] in string [subject]
   with the substitution [repl].
   Raises [Failure] if there are backreferences to nonexistent subpatterns. *)
let replace_firston flags rex n (sstr, max_br, with_lp, subst_list) subj =
  let subgroups2, ovector = make_ovector rex in
  let nsubs = pred (subgroups2 lsr 1) in
  if max_br > nsubs then
    failwith
      "Pcre.replace_firston: backreference denotes nonexistent subpattern";
  if with_lp && nsubs = 0 then
    failwith "Pcre.replace_firston: no backreferences";

  try
    unsafe_pcre_exec flags rex n subj subgroups2 ovector;
    let res_len, trans_list =
      calc_trans_list subgroups2 ovector subj sstr subst_list in
    let first = Array.unsafe_get ovector 0
    and last = Array.unsafe_get ovector 1 in
    let rest = String.length subj - last in
    let res = String.create (first + res_len + rest) in
    String.unsafe_blit subj 0 res 0 first;
    let pos = ref first in
    let action (sstr, ix, len) =
      String.unsafe_blit sstr ix res !pos len;
      pos := !pos + len in
    List.iter action trans_list;
    String.unsafe_blit subj last res !pos rest;
    res
  with Not_found -> String.copy subj

let replace_firsto flags rex = replace_firston flags rex 0
let replace_firstn = replace_firston 0x0000
let replace_first = replace_firsto 0x0000
let areplace_firstn = replace_firston 0x0010
let areplace_first = replace_firsto 0x0010

(* [qreplace_firston rflags regexp offset str subject] replaces
   the first occurrence of a pattern that matches [regexp] with runtime
   flags [rflags] starting at position [offset] in string [subject]
   with the string [str] *)
let qreplace_firston flags rex n str subj =
  let subgroups2, ovector = make_ovector rex in
  try
    unsafe_pcre_exec flags rex n subj subgroups2 ovector;
    let first = Array.unsafe_get ovector 0
    and last = Array.unsafe_get ovector 1
    and len = String.length str in
    let rest = String.length subj - last
    and postfix_start = first + len in
    let res = String.create (postfix_start + rest) in
    String.unsafe_blit subj 0 res 0 first;
    String.unsafe_blit str 0 res first len;
    String.unsafe_blit subj last res postfix_start rest;
    res
  with Not_found -> String.copy subj

let qreplace_firsto flags rex = qreplace_firston flags rex 0
let qreplace_firstn = qreplace_firston 0x0000
let qreplace_first = qreplace_firsto 0x0000
let aqreplace_firstn = qreplace_firston 0x0010
let aqreplace_first = qreplace_firsto 0x0010

(* [replace_allon rflags regexp offset repl subject] replaces all
   occurrences of a pattern that matches [regexp] with runtime flags
   [rflags] starting at position [offset] in string [subject] with the
   substitution [repl].
   Raises [Failure] if there are backreferences to nonexistent subpatterns. *)
let replace_allon flags rex n (sstr, max_br, with_lp, subst_list) subj =
  let subj_len = String.length subj in
  if n < 0 || n > subj_len then
    invalid_arg "Pcre.replace_allon: illegal offset!";
  let subgroups2, ovector = make_ovector rex in
  let nsubs = pred (subgroups2 lsr 1) in
  if max_br > nsubs then
    failwith
      "Pcre.replace_allon: backreference denotes nonexistent subpattern";
  if with_lp && nsubs = 0 then failwith "Pcre.replace_allon: no backreferences";

  let full_len, trans_lists, last =
    let rec find_all_pats (full_len, trans_lists, pos as accu) =
      if pos > subj_len then accu
      else
        try
          unsafe_pcre_exec flags rex pos subj subgroups2 ovector;
          let first = Array.unsafe_get ovector 0 in
          let len = first - pos in
          let trans_lists =
            if len > 0 then (len, [subj, pos, len]) :: trans_lists
            else trans_lists
          and last = Array.unsafe_get ovector 1
          and res_len, _ as trans_list_el =
            calc_trans_list subgroups2 ovector subj sstr subst_list in
          let full_len = full_len + len + res_len in
          let find_accu =
            let next = succ first
            and trans_lists = trans_list_el :: trans_lists in
            if last < next then
              if pos < subj_len then
                succ full_len, (1, [subj, pos, 1]) :: trans_lists, next
              else full_len, trans_lists, next
            else full_len, trans_lists, last in
          find_all_pats find_accu
        with Not_found -> accu in
    find_all_pats (0, [], n) in

  let postfix_len = max (subj_len - last) 0 in
  let left = n + full_len in
  let res = String.create (left + postfix_len) in
  String.unsafe_blit subj 0 res 0 n;
  String.unsafe_blit subj last res full_len postfix_len;
  let pos = ref left in
  let action (res_len, trans_list) =
    pos := !pos - res_len;
    let action (sstr, ix, len) = 
      String.unsafe_blit sstr ix res !pos len;
      pos := !pos + len in
    List.iter action trans_list;
    pos := !pos - res_len; in
  List.iter action trans_lists;
  res

let replace_allo flags rex = replace_allon flags rex 0
let replace_alln = replace_allon 0x0000
let replace_all = replace_allo 0x0000
let areplace_alln = replace_allon 0x0010
let areplace_all = replace_allo 0x0010

(* [qreplace_allon rflags regexp offset str subject] replaces all
   occurrences of a pattern that matches [regexp] with runtime flags
   [rflags] starting at position [offset] in string [subject] with the
   string [str] *)
let qreplace_allon flags rex n sstr subj =
  let subj_len = String.length subj in
  if n < 0 || n > subj_len then
    invalid_arg "Pcre.qreplace_allon: illegal offset!";
  let sstr_len = String.length sstr in

  let full_len, subst_list, last =
    let subgroups2, ovector = make_ovector rex in
    let rec find_all_pats (full_len, subst_list, pos as accu) =
      if pos > subj_len then accu
      else
        try
          unsafe_pcre_exec flags rex pos subj subgroups2 ovector;
          let first = Array.unsafe_get ovector 0 in
          let len = first - pos in
          let subst_list =
            if len > 0 then Some (subj, pos, len) :: subst_list
            else subst_list
          and last = Array.unsafe_get ovector 1
          and full_len = full_len + len + sstr_len in
          let find_accu =
            let next = succ first
            and subst_list = None :: subst_list in
            if last < next then
              if pos < subj_len then
                succ full_len, Some (subj, pos, 1) :: subst_list, next
              else full_len, subst_list, next
            else full_len, subst_list, last in
          find_all_pats find_accu
        with Not_found -> accu in
    find_all_pats (0, [], n) in

  let postfix_len = max (subj_len - last) 0 in
  let left = n + full_len in
  let res = String.create (left + postfix_len) in
  String.unsafe_blit subj 0 res 0 n;
  String.unsafe_blit subj last res full_len postfix_len;
  let pos = ref left in
  let action = function
    | Some (substr, ix, len) ->
        pos := !pos - len;
        String.unsafe_blit substr ix res !pos len
    | None ->
        pos := !pos - sstr_len;
        String.unsafe_blit sstr 0 res !pos sstr_len in
  List.iter action subst_list;
  res

let qreplace_allo flags rex = qreplace_allon flags rex 0
let qreplace_alln = qreplace_allon 0x0000
let qreplace_all = qreplace_allo 0x0000
let aqreplace_alln = qreplace_allon 0x0010
let aqreplace_all = qreplace_allo 0x0010

(* [substitute_firston rflags regexp offset f subject] replaces
   the first occurrence of a pattern that matches [regexp] with runtime
   flags [rflags] starting at position [offset] in string [subject]
   with the result of function [f] applied to the match *)
let substitute_firston flags rex n f subj =
  let subgroups2, ovector = make_ovector rex in
  try
    unsafe_pcre_exec flags rex n subj subgroups2 ovector;
    let subj_len = String.length subj
    and prefix_len = Array.unsafe_get ovector 0
    and last = Array.unsafe_get ovector 1 in
    let repl = f (String.sub subj prefix_len (last - prefix_len))
    and postfix_len = subj_len - last in
    let repl_len = String.length repl in
    let postfix_start = prefix_len + repl_len in
    let res = String.create (postfix_start + postfix_len) in
    String.unsafe_blit subj 0 res 0 prefix_len;
    String.unsafe_blit repl 0 res prefix_len repl_len;
    String.unsafe_blit subj last res postfix_start postfix_len;
    res
  with Not_found -> String.copy subj

let substitute_firsto flags rex = substitute_firston flags rex 0
let substitute_firstn = substitute_firston 0x0000
let substitute_first = substitute_firsto 0x0000
let asubstitute_firstn = substitute_firston 0x0010
let asubstitute_first = substitute_firsto 0x0010

(* [substitute_allon rflags regexp offset repl subject] replaces all
   occurrences of a pattern that matches [regexp] with runtime flags
   [rflags] starting at position [offset] in string [subject] with the
   result of function [f] applied to the match *)
let substitute_allon flags rex n f subj =
  let subj_len = String.length subj in
  if n < 0 || n > subj_len then
    invalid_arg "Pcre.substitute_allon: illegal offset!";
  let full_len, subst_list, last =
    let subgroups2, ovector = make_ovector rex in
    let rec find_all_pats (full_len, subst_list, pos as accu) =
      if pos > subj_len then accu
      else
        try
          unsafe_pcre_exec flags rex pos subj subgroups2 ovector;
          let first = Array.unsafe_get ovector 0 in
          let len = first - pos in
          let subst_list =
            if len > 0 then (subj, pos, len) :: subst_list else subst_list
          and last = Array.unsafe_get ovector 1 in
          let sstr = f (String.sub subj first (last - first)) in
          let sstr_len = String.length sstr in
          let full_len = full_len + len + sstr_len in
          let find_accu =
            let next = succ first
            and subst_list = (sstr, 0, sstr_len) :: subst_list in
            if last < next then
              if pos < subj_len then
                succ full_len, (subj, pos, 1) :: subst_list, next
              else full_len, subst_list, next
            else full_len, subst_list, last in
          find_all_pats find_accu
        with Not_found -> accu in
    find_all_pats (0, [], n) in

  let postfix_len = max (subj_len - last) 0 in
  let left = n + full_len in
  let res = String.create (left + postfix_len) in
  String.unsafe_blit subj 0 res 0 n;
  String.unsafe_blit subj last res full_len postfix_len;
  let pos = ref left in
  let action (sstr, ix, len) = 
    pos := !pos - len;
    String.unsafe_blit sstr ix res !pos len in
  List.iter action subst_list;
  res

let substitute_allo flags rex = substitute_allon flags rex 0
let substitute_alln = substitute_allon 0x0000
let substitute_all = substitute_allo 0x0000
let asubstitute_alln = substitute_allon 0x0010
let asubstitute_all = substitute_allo 0x0010


(*** Splitting *)


(** Splitting compatible to PERL *)

(* [internal_psplit rflags regexp bound offset subject] splits [subject]
   beginning at position [offset] and considering delimiter [regexp]
   with runtime flags [rflags] at most [bound] times into substrings.
   The result of the list is reversed! *)
let internal_psplit flags rex bound n subj =
  let subj_len = String.length subj in
  if subj_len = 0 then []
  else if bound = 1 then [String.copy subj]
  else
    let subgroups2, ovector = make_ovector rex in

    (* Adds contents of subgroups to the string accumulator *)
    let handle_subgroups strs =
      let strs = ref strs
      and i = ref 2 in
      while !i < subgroups2 do
        let first = Array.unsafe_get ovector !i in
        incr i;
        let last = Array.unsafe_get ovector !i in
        let str =
          if first < 0 then "" else String.sub subj first (last - first) in
        strs := str :: !strs; incr i
      done;
      !strs in

    (* Performs the recursive split *)
    let rec do_split (strs, cnt, pos, prematch) =
      let len = subj_len - pos in
      if len < 0 then strs
      else
        try
          (* Checks termination due to bound restriction *)
          if cnt = 0 then
            if prematch then begin
              unsafe_pcre_exec flags rex pos subj subgroups2 ovector;
              let first = Array.unsafe_get ovector 0
              and last = Array.unsafe_get ovector 1 in
              String.sub subj last (subj_len-last) :: handle_subgroups strs end
            else raise Not_found

          (* Calculates next accumulator state for splitting *)
          else begin
            unsafe_pcre_exec flags rex pos subj subgroups2 ovector;
            let first = Array.unsafe_get ovector 0
            and last = Array.unsafe_get ovector 1 in

            if first = pos then
              if last = pos then
                let strs = if prematch then handle_subgroups strs else strs in
                if len = 0 then "" :: strs
                else
                  let accu =
                    try
                      unsafe_pcre_exec (flags lor 0x0410) rex pos subj
                                        subgroups2 ovector;
                      handle_subgroups ("" :: strs), pred cnt,
                      Array.unsafe_get ovector 1, false
                    with Not_found ->
                      String.sub subj pos 1 :: strs, pred cnt, succ pos, true in
                  do_split accu
              else let accu =
                     if prematch then handle_subgroups strs, cnt, last, false
                     else handle_subgroups (""::strs), pred cnt, last, false in
                   do_split accu
            else let accu =
                   handle_subgroups (String.sub subj pos (first - pos) :: strs),
                   pred cnt, last, false in
                 do_split accu end
        with Not_found -> String.sub subj pos len :: strs in
    do_split ([], pred bound, n, false)

let rec strip_all_empty = function "" :: t -> strip_all_empty t | l -> l
let strip_first_empty = function "" :: t -> t | l -> l

(* [bounded_pspliton rflags regexp bound offset subject] splits [subject]
   beginning at position [offset] and considering delimiter [regexp]
   with runtime flags [rflags] at most [bound] times into substrings
   (PERL-compatible). *)
let bounded_pspliton flags rex bound n subj =
  let res = internal_psplit flags rex bound n subj in
  List.rev (if bound = 0 then strip_all_empty res else res)
let bounded_psplito flags rex bound = bounded_pspliton flags rex bound 0
let bounded_psplitn = bounded_pspliton 0x0000
let bounded_psplit = bounded_psplito 0x0000

(* [pspliton rflags regexp offset subject]
   Like [bounded_pspliton] but assumes infinite bound and strips trailing
   null fields. *)
let pspliton flags rex = bounded_pspliton flags rex 0
let psplito flags rex = pspliton flags rex 0
let psplitn = pspliton 0x0000
let psplit = psplito 0x0000

type psplit_result = PText of string        (* Text part of splitted string *)
                   | PDelim of string       (* Delimiter part of splitted
                                               string *)
                   | PGroup of int * string (* Subgroup of matched delimiter
                                               (subgroup_nr, subgroup_str) *)
                   | PNoGroup               (* Unmatched subgroup *)

(* [bounded_full_pspliton rflags regexp bound offset subject] splits
   [subject] beginning at position [offset] and considering delimiter
   [regexp] with runtime flags [rflags] at most [bound] times and returns
   a list of "psplit_result". *)
let bounded_full_pspliton flags rex bound n subj =
  let subj_len = String.length subj in
  if subj_len = 0 then []
  else if bound = 1 then [PText (String.copy subj)]
  else
    let subgroups2, ovector = make_ovector rex in

    (* Adds contents of subgroups to the string accumulator *)
    let handle_subgroups strs =
      let strs = ref strs
      and i = ref 2 in
      while !i < subgroups2 do
        let group_nr = !i lsr 1
        and first = Array.unsafe_get ovector !i in
        incr i;
        let last = Array.unsafe_get ovector !i in
        let str =
          if first < 0 then PNoGroup
          else let group_str = String.sub subj first (last - first) in
               PGroup (group_nr, group_str) in
        strs := str :: !strs; incr i
      done;
      !strs in

    (* Performs the recursive split *)
    let rec do_split (strs, cnt, pos, prematch) =
      let len = subj_len - pos in
      if len < 0 then strs
      else
        try
          (* Checks termination due to bound restriction *)
          if cnt = 0 then
            if prematch then begin
              unsafe_pcre_exec flags rex pos subj subgroups2 ovector;
              let first = Array.unsafe_get ovector 0
              and last = Array.unsafe_get ovector 1 in
              let delim = PDelim (String.sub subj first (last - first)) in
              PText (String.sub subj last (subj_len - last))
                :: handle_subgroups (delim :: strs) end
            else raise Not_found

          (* Calculates next accumulator state for splitting *)
          else begin
            unsafe_pcre_exec flags rex pos subj subgroups2 ovector;
            let first = Array.unsafe_get ovector 0
            and last = Array.unsafe_get ovector 1 in

            if first = pos then
              if last = pos then
                if len = 0 then handle_subgroups (PDelim "" :: strs)
                else
                  let accu =
                    let empty_groups = handle_subgroups [] in
                    try
                      unsafe_pcre_exec (flags lor 0x0410) rex pos subj
                                        subgroups2 ovector;
                      let first = Array.unsafe_get ovector 0
                      and last = Array.unsafe_get ovector 1 in
                      let delim = PDelim (String.sub subj first (last-first)) in
                      handle_subgroups (
                        delim :: (if prematch then strs
                                  else empty_groups @ (PDelim "" :: strs))),
                      pred cnt, last, false
                    with Not_found ->
                      PText (String.sub subj pos 1) :: empty_groups @ PDelim ""
                        :: strs, pred cnt, succ pos, true in
                  do_split accu 
              else
                let accu =
                  let delim = PDelim (String.sub subj first (last - first)) in
                  handle_subgroups (delim :: strs), cnt, last, false in
                do_split accu
            else
              let accu = 
                handle_subgroups
                  (PDelim (String.sub subj first (last - first))
                   :: PText (String.sub subj pos (first - pos)) :: strs),
                pred cnt, last, false in
              do_split accu end
        with Not_found ->
          if len = 0 then strs else PText (String.sub subj pos len) :: strs in
    List.rev (do_split ([], pred bound, n, true))

let bounded_full_psplito flags rex bound =
  bounded_full_pspliton flags rex bound 0
let bounded_full_psplitn = bounded_full_pspliton 0x0000
let bounded_full_psplit = bounded_full_psplito 0x0000

let full_pspliton flags rex = bounded_full_pspliton flags rex 0
let full_psplito flags rex = full_pspliton flags rex 0
let full_psplitn = full_pspliton 0x0000
let full_psplit = full_psplito 0x0000


(** Splitting compatible to the "Str"-module.
    Only difference (besides kind and order of parameters):
    no crash on null patterns - behaves like PERL in this case. *)

let internal_split flags rex bound n subj =
  let subj_len = String.length subj in
  if subj_len = 0 then []
  else if bound = 1 then [String.copy subj]
  else
    let subgroups2, ovector = make_ovector rex in

    (* Performs the recursive split *)
    let rec do_split (strs, cnt, pos, prematch) =
      let len = subj_len - pos in
      if len < 0 then strs
      else
        try
          (* Checks termination due to bound restriction *)
          if cnt = 0 then
            if prematch then begin
              unsafe_pcre_exec flags rex pos subj subgroups2 ovector;
              let first = Array.unsafe_get ovector 0
              and last = Array.unsafe_get ovector 1 in
              String.sub subj last (subj_len - last) :: strs end
            else raise Not_found

          (* Calculates next accumulator state for splitting *)
          else begin
            unsafe_pcre_exec flags rex pos subj subgroups2 ovector;
            let first = Array.unsafe_get ovector 0
            and last = Array.unsafe_get ovector 1 in

            if first = pos then
              if last = pos then
                if len = 0 then "" :: strs
                else
                  try
                    unsafe_pcre_exec (flags lor 0x0410) rex pos subj
                                      subgroups2 ovector;
                    let accu =
                      "" :: strs, pred cnt, Array.unsafe_get ovector 1, false in
                    do_split accu
                  with Not_found ->
                    let accu =
                      String.sub subj pos 1 :: strs, pred cnt, succ pos, true in
                    do_split accu
              else let accu = if prematch then strs, cnt, last, false
                              else "" :: strs, pred cnt, last, false in
                   do_split accu
            else let accu = String.sub subj pos (first - pos) :: strs,
                            pred cnt, last, false in
                 do_split accu end
        with Not_found ->
          if len = 0 then "" :: strs else String.sub subj pos len :: strs in
    do_split ([], pred bound, n, false)

let bounded_spliton flags rex bound n subj =
  let res = internal_split flags rex bound n subj in
  strip_first_empty (List.rev (strip_first_empty res))

let bounded_splito flags rex bound = bounded_spliton flags rex bound 0
let bounded_splitn = bounded_spliton 0x0000
let bounded_split = bounded_splito 0x0000

let spliton flags rex = bounded_spliton flags rex 0
let splito flags rex = spliton flags rex 0
let splitn = spliton 0x0000
let split = splito 0x0000

let bounded_split_delimon flags rex bound n subj =
  List.rev (internal_split flags rex bound n subj)
let bounded_split_delimo flags rex bound =
  bounded_split_delimon flags rex bound 0
let bounded_split_delimn = bounded_split_delimon 0x0000
let bounded_split_delim = bounded_split_delimo 0x0000

let split_delimon flags rex = bounded_split_delimon flags rex 0
let split_delimo flags rex = split_delimon flags rex 0
let split_delimn = split_delimon 0x0000
let split_delim = split_delimo 0x0000

type split_result = Text of string | Delim of string

let bounded_full_spliton flags rex bound n subj =
  let subj_len = String.length subj in
  if subj_len = 0 then []
  else if bound = 1 then [Text (String.copy subj)]
  else
    let subgroups2, ovector = make_ovector rex in

    (* Performs the recursive split *)
    let rec do_split (strs, cnt, pos, prematch) =
      let len = subj_len - pos in
      if len < 0 then strs
      else
        try
          (* Checks termination due to bound restriction *)
          if cnt = 0 then
            if prematch then begin
              unsafe_pcre_exec flags rex pos subj subgroups2 ovector;
              let first = Array.unsafe_get ovector 0
              and last = Array.unsafe_get ovector 1 in
              Text (String.sub subj last (subj_len - last))
                :: Delim (String.sub subj first (last - first)) :: strs end
            else raise Not_found

          (* Calculates next accumulator state for splitting *)
          else begin
            unsafe_pcre_exec flags rex pos subj subgroups2 ovector;
            let first = Array.unsafe_get ovector 0 in
            let last = Array.unsafe_get ovector 1 in

            if first = pos then
              if last = pos then
                if len = 0 then Delim "" :: strs
                else
                  let accu =
                    try
                      unsafe_pcre_exec (flags lor 0x0410) rex pos subj
                                        subgroups2 ovector;
                      let first = Array.unsafe_get ovector 0
                      and last = Array.unsafe_get ovector 1 in
                      let delim = Delim (String.sub subj first (last-first)) in
                      delim :: (if prematch then strs else Delim "" :: strs),
                      pred cnt, last, false
                    with Not_found ->
                      Text (String.sub subj pos 1) :: Delim "" :: strs,
                      pred cnt, succ pos, true in
                  do_split accu
              else
                let delim = Delim (String.sub subj first (last - first)) in
                let accu = delim :: strs, pred cnt, last, true in
                do_split accu
            else
              let delim = Delim (String.sub subj first (last - first)) in
              let accu =
                delim :: Text (String.sub subj pos (first - pos)) :: strs,
                pred cnt, last, true in
              do_split accu end
        with Not_found ->
          if len = 0 then strs else Text (String.sub subj pos len) :: strs in
    List.rev (do_split ([], pred bound, n, false))

let bounded_full_splito flags rex bound = bounded_full_spliton flags rex bound 0
let bounded_full_splitn = bounded_full_spliton 0x0000
let bounded_full_split = bounded_full_splito 0x0000

let full_spliton flags rex = bounded_full_spliton flags rex 0
let full_splito flags rex = full_spliton flags rex 0
let full_splitn = full_spliton 0x0000
let full_split = full_splito 0x0000


(*** Version information *)

external pcre_version : unit -> string = "pcre_version_wrapper"
        (* Returns version of the PCRE-C-library as string *)

(* Version of the PCRE-C-library as string *)
let version = pcre_version ()


(*** Additional convenience functions useful in combination with this
     library *)

(* [foreach_line ch f] applies [f] to each line in channel [ch] until
   the end-of-file is reached *)
let foreach_line ch f =
  try while true do f (input_line ch) done with End_of_file -> ()

(* [foreach_file filenames f] opens each file in the list [filenames]
   for input and applies [f] to each tuple (filename, file_channel) *)
let foreach_file filenames f =
  let do_with_file filename =
    let file = open_in filename in f (filename, file); close_in file in
  List.iter do_with_file filenames
