unit RegEx;

interface

type
  TRegEx = String;

{ function Compile

Purpose:
  Create a compiled representation of a regular expression.

Parameters:
  RegEx - The regular expression string to be compiled.

Return value:
  Compiled regular expression string (binary).

Remarks:
  This function will process regular expressions of the following form.
          .      match any character*
          *      Kleene closure (0 or more)*
          (...)  group
          [...]  character class**
          |      choose
          \      escape next character***

  The precedence of operations is, from highest to lowest, Kleene closure,
  concatenation, choose.

  *   '.' by itself matches any character, but '.*' only matches characters
      that cannot follow it in sequence.  Thus '".*"' means 'a " followed by
      any sequence of characters not including a ", followed by a "'.  The
      expressions '.*(a|b)' and '[^ab]*(a|b)' will match the same strings.

  **  The syntax for a character class is:
      '[' ['^'] ['-'] [']'] (C1(|'-'C2))* ']'
      where C1 is any character except ']', and C2 is any character.  A term
      of the form C1'-'C2 includes (or excludes) all characters between and
      including C1 and C2 in (from) the class.
      If a '^' appears at the beginning of the class, then the sense of the
      class is inverted.  If a '-' appears at the beginning of the class, then
      '-' is included in the class.  If ']' appears at the beginning of the
      class, then ']' is included in the class.  Any combination of the three
      may appear at the beginning, but they must be in the order specified to
      retain their special meanings.  A '^' anywhere except the beginning of
      the class indicates that '^' is included in (or excluded from) the class.
      Within a character class, the only special characters are '-', which
      indicates ranges, and ']' which ends the class; all other special
      regular expression characters (i.e. '.', '\', etc.) are considered
      literals.

  *** '\' must be used to represent special characters (including '\') as
      literal.  It is used to give special meaning to regular literal
      characters.  The following special meanings are defined:
          '\t'   Tab character (#9)
}
function Compile(RegEx: String): TRegEx;

{ function MatchLength

Purpose:
  Determine the length of the regular expression match starting at Pos.

Parameters:
  S - String to match.
  Pos - Position in S from which to start (1-based).
  CompiledRegEx - Compiled (binary) regular expression as returned by
                  Compile.

Return value:
  Length of match.

Remarks:
  MatchLength will return 0 if the expression does not match beginning at Pos.
}
function MatchLength(S: String; Pos: Integer; CompiledRegEx: TRegEx): Integer;

{ procedure Match

Purpose:
  Find a match for a regular expression in S.

Parameters:
  S - String to match against.
  CompiledRegEx - Compiled (binary) regular expression as returned by Compile.
  Start - Starting position for matching (1-based).

Return values:
  Start - Index of first character of match, 0 if no match.
  Length - Length of match, 0 if no match.

Remarks:
  Match begins scanning S for a match at Start and continues until it finds
  a substring that matches the specified regular expression.
}
procedure Match(S: String; CompiledRegEx: TRegEx; var Start: Integer;
  out Length: Integer);

implementation

uses SysUtils, Classes;

type
  TRegExOperation = (roLiteral,roOption,roClass,roAnythingBut);
  PCharClass = ^TCharClass;
  TCharClass = set of Char;
  PRegExpStep = ^TRegExpStep;
  PRegExpCompStep = ^TRegExpCompStep;
  TStepRef = record
  case Boolean of
    False: (Ptr: PRegExpCompStep);
    True: (Index: Integer);
  end;
  TCharClassRef = record
  case Boolean of
    False: (Ptr: PChar);
    True: (Index: Integer);
  end;
  TRegExpStep = record
    Next: TStepRef;
  case StepType: TRegExOperation of
    roLiteral: (Literal: Char);
    roOption: (OtherBranch: TStepRef);
    roClass: (Literals: TCharClassRef);
  end;
  TStepArray = array of TRegExpStep;
  TRegExpCompStep = record
    Step: TRegExpStep;
    Prev: PRegExpCompStep;
  end;

function Compile(RegEx: String): TRegEx;
const
  StepPitch: Integer = -(-SizeOf(TRegExpStep) and -4);
var
  StepList: TList;

  function Compile(var Start: Integer): PRegExpCompStep;
  var
    Last, NewStep: PRegExpCompStep;

    procedure CompileLiteral(Ch: Char);
    begin
      New(NewStep);
      StepList.Add(NewStep);
      with NewStep^, Step do begin
        Next.Ptr := nil;
        StepType := roLiteral;
        Literal := Ch;
      end;
      if Last <> nil then begin
        while Last^.Step.Next.Ptr <> nil do
            Last := Last^.Step.Next.Ptr;
        Last^.Step.Next.Ptr := NewStep;
      end else
        Result := NewStep;
      NewStep^.Prev := Last;
      Inc(Start);
    end;

    function Escape(Ch: Char): Char;
    begin
      case Ch of
        't': Result := #9;
        else Result := Ch;
      end;
    end;

    procedure CompileEscaped(Literal: Char);
    begin
      CompileLiteral(Escape(Literal));
      Inc(Start);
    end;

    procedure CompileClass;
    var
      Inverse: Boolean;
      Ch: Char;
      CharClass: TCharClass;
      Encoded: array [0..256] of Char;
      I: Integer;
    begin
      New(NewStep);
      StepList.Add(NewStep);
      with NewStep^, Step do begin
        Next.Ptr := nil;
        StepType := roClass;
        CharClass := [];
      end;
      if Last <> nil then begin
        while Last^.Step.Next.Ptr <> nil do
          Last := Last^.Step.Next.Ptr;
        Last^.Step.Next.Ptr := NewStep;
      end else
        Result := NewStep;
      NewStep^.Prev := Last;
      Inc(Start);
      Inverse := RegEx[Start] = '^';
      if Inverse then
        Inc(Start);
      Ch := RegEx[Start];
      if Ch = '-' then begin
        Include(CharClass,Ch);
        Inc(Start);
        Ch := RegEx[Start];
      end;
      Include(CharClass,Ch);
      Inc(Start);
      while RegEx[Start] <> ']' do begin
        if RegEx[Start] = '-' then begin
          Inc(Start);
          CharClass := CharClass + [Ch..RegEx[Start]];
        end else begin
          Ch := RegEx[Start];
          Include(CharClass,Ch);
        end;
        Inc(Start);
      end;
      Inc(Start);
      I := 1;
      for Ch := #0 to #255 do
        if (I and 1) = 1 then
      begin
        if Ch in CharClass then begin
          Encoded[I] := Ch;
          Inc(I);
          if (Ch = High(Ch)) or not (Succ(Ch) in CharClass) then begin
            Encoded[I] := Ch;
            Inc(I);
          end;
        end;
      end else begin
        if (Ch = High(Ch)) or not (Succ(Ch) in CharClass) then begin
          Encoded[I] := Ch;
          Inc(I);
        end;
      end;
      if not Inverse then
        Encoded[0] := Chr((I-3) shr 1)
      else
        Encoded[0] := Chr(not ((I-3) shr 1));
      NewStep^.Step.Literals.Ptr := StrAlloc(I);
      Move(Encoded,NewStep^.Step.Literals.Ptr^,I);
    end;

    procedure CloseOtherBranch(Option, Closure: PRegExpCompStep);
    var
      Step, Last: PRegExpCompStep;
    begin
      Last := nil;
      Step := Option^.Step.OtherBranch.Ptr;
      if Step = nil then begin
        Option^.Step.OtherBranch.Ptr := Closure;
        Exit;
      end;
      while Step <> nil do begin
        // The OtherBranch of Kleene closures is already closed.
        if Step = Option then
          Exit;

        // If it is an option, close its OtherBranch.
        if Step^.Step.StepType = roOption then
          CloseOtherBranch(Step,Closure);

        // Remember the last non-nil pointer
        Last := Step;
        Step := Step^.Step.Next.Ptr;
      end;
      // Close the branch for which we were called.
      Last^.Step.Next.Ptr := Closure;
    end;

    procedure CompileNested;
    begin
      Inc(Start);
      NewStep := Compile(Start);
      NewStep^.Prev := Last;
      if Last <> nil then begin
        while Last^.Step.Next.Ptr <> nil do begin
          if Last^.Step.StepType = roOption then
            CloseOtherBranch(Last,NewStep);
          Last := Last^.Step.Next.Ptr;
        end;
        Last^.Step.Next.Ptr := NewStep;
      end else
        Result := NewStep;
      Inc(Start);
    end;

    procedure CompileKleene;
    begin
      if (Last = nil) or
        ((Last^.Step.StepType = roOption) and
        (Last^.Step.Next.Ptr = nil)) then
          raise Exception.Create(
            'Illegal expression: Kleene closure of empty expression.');
      New(NewStep);
      StepList.Add(NewStep);
      with NewStep^, Step do begin
        Next.Ptr := nil;
        StepType := roOption;
        OtherBranch.Ptr := Last;
        Prev := Last.Prev;
      end;
      if Last^.Prev <> nil then begin
        Last^.Prev^.Step.Next.Ptr := NewStep;
        Last^.Prev := NewStep;
      end else
        Result := NewStep;
      CloseOtherBranch(NewStep, NewStep);
      Inc(Start);
    end;

    procedure CompileOption;
    begin
      New(NewStep);
      StepList.Add(NewStep);
      with NewStep^, Step do begin
        Next.Ptr := nil;
        StepType := roOption;
        if Last <> nil then begin
          while Last^.Prev <> nil do
            Last := Last^.Prev;
          Last^.Prev := NewStep;
        end;
        OtherBranch.Ptr := Last;
        Prev := nil;
      end;
      Result := NewStep;
      Inc(Start);
    end;

    procedure CompileAnythingBut;
    begin
      New(NewStep);
      StepList.Add(NewStep);
      with NewStep^, Step do begin
        Next.Ptr := nil;
        StepType := roAnythingBut;
      end;
      if Last <> nil then begin
        while Last^.Step.Next.Ptr <> nil do
          Last := Last^.Step.Next.Ptr;
        Last^.Step.Next.Ptr := NewStep;
      end;
      NewStep^.Prev := Last;
      Inc(Start);
    end;

  begin
    Result := nil;
    Last := nil;
    while Start <= Length(RegEx) do begin
      case RegEx[Start] of
        '\': CompileEscaped(RegEx[Start + 1]);
        '(': CompileNested;
        ')': Exit;
        '*': CompileKleene;
        '|': CompileOption;
        '[': CompileClass;
        '.': CompileAnythingBut;
        else CompileLiteral(RegEx[Start]);
      end;
      Last := NewStep;
    end;
  end;

  procedure Resolve(var Ref: TStepRef); overload;
  begin
    Ref.Index := StepList.IndexOf(Ref.Ptr);
  end;

  procedure Resolve(var Ref: TCharClassRef); overload;
  var
    Start, Size: Integer;
  begin
    Start := Length(Result);
    Size := ((Ord(Ref.Ptr^) + 1) shl 1) + 1;
    SetLength(Result, Start + Size);
    Inc(Start);
    Move(Ref.Ptr^, Result[Start], Size);
    StrDispose(Ref.Ptr);
    Ref.Index := Start;
  end;

var
  I: Integer;
  Head: PRegExpCompStep;
begin
  StepList := TList.Create;
  try
    try
      I := 1;
      Head := Compile(I);
      // Put the first step in the first slot
      StepList.Exchange(StepList.IndexOf(Head),0);
    except
      for I := 0 to StepList.Count - 1 do begin
        with PRegExpCompStep(StepList[I]).Step do
          if StepType = roClass then
            Dispose(Literals.Ptr);
        Dispose(StepList[I]);
      end;
      raise;
    end;

    SetLength(Result,StepList.Count * StepPitch);
    for I := 0 to StepList.Count - 1 do begin
      Move(StepList[I]^, TStepArray(Pointer(Result))[I],
        SizeOf(TRegExpStep));
      with TStepArray(Result)[I] do begin
        Resolve(Next);
        case StepType of
          roOption: Resolve(OtherBranch);
          roClass: Resolve(Literals);
        end;
      end;
      Dispose(StepList[I]);
    end;
  finally
    StepList.Free;
  end;
end;

function MatchLength(S: String; Pos: Integer; CompiledRegEx: TRegEx): Integer;
var
  Current: Integer;
  Steps: TList;
  StepArray: TStepArray;
  AnythingButCheck: Boolean;

  function MatchStep(StepIndex: Integer): Boolean; forward;

  function MatchLiteral(Step: PRegExpStep; StepIndex: Integer): Boolean;
  var
    Next: Integer;
  begin
    Next := Step^.Next.Index;
    Result := S[Current] = Step^.Literal;
    if AnythingButCheck or not Result or (Next < 0) then
      Steps.Delete(StepIndex)
    else
      Steps[StepIndex] := @StepArray[Next];
  end;

  function MatchClass(Step: PRegExpStep; StepIndex: Integer): Boolean;
  var
    Next, I, Index: Integer;
  begin
    Next := Step^.Next.Index;
    Index := Step^.Literals.Index;
    I := BYTE(CompiledRegEx[Index]);
    Inc(Index);
    Result := (I and $80) <> 0;
    if Result then
      I := (not I) and $FF;
    for I := 0 to I do begin
      if (S[Current] >= CompiledRegEx[Index]) and
        (S[Current] <= CompiledRegEx[Index + 1]) then
      begin
        Result := not Result;
        Break;
      end;
      Inc(Index,2);
    end;
    if AnythingButCheck or not Result or (Next < 0) then
      Steps.Delete(StepIndex)
    else
      Steps[StepIndex] := @StepArray[Next];
  end;

  function MatchAnythingBut(Step: PRegExpStep; StepIndex: Integer): Boolean;
  var
    Next: Integer;
  begin
    Next := Step^.Next.Index;
    Result := True;
    if (Next >= 0) and (StepArray[Next].StepType = roOption) and
      (StepArray[Next].OtherBranch.Index >= 0) and
      (@StepArray[StepArray[Next].OtherBranch.Index] = Step) then
    begin
      Result := False;
      if not AnythingButCheck then begin
        AnythingButCheck := True;
        if Next >= 0 then begin
          Steps.Add(@StepArray[Next]);
          Result := not MatchStep(Steps.Count - 1);
        end else
          Result := True;
        AnythingButCheck := False;
      end;
    end;
    if not Result then
      Steps.Delete(StepIndex)
    else if Next >= 0 then
      Steps[StepIndex] := @StepArray[Next];
  end;

  function MatchOption(Option: PRegExpStep; StepIndex: Integer): Boolean;
  var
    Index: Integer;
    Branch: PRegExpStep;
  begin
    Result := False;
    // Go on the Next branch
    // We have to do this first to avoid infinite recursion with '.*'
    Index := Option^.Next.Index;
    if Index > 0 then begin
      Branch := @StepArray[Index];
      Steps.Add(Branch);
      Result := MatchStep(Steps.Count - 1);
    end;
    // Go on the OtherBranch
    Index := Option^.OtherBranch.Index;
    if Index >= 0 then begin
      Branch := @StepArray[Index];
      Steps[StepIndex] := Branch;
      Result := MatchStep(StepIndex) or Result;
    end else
      Steps.Delete(StepIndex);
  end;

  function MatchStep(StepIndex: Integer): Boolean;
  var
    Step: PRegExpStep;
  begin
    Step := PRegExpStep(Steps[StepIndex]);
    case Step^.StepType of
      roLiteral: Result := MatchLiteral(Step,StepIndex);
      roOption: Result := MatchOption(Step,StepIndex);
      roClass: Result := MatchClass(Step,StepIndex);
      roAnythingBut: Result := MatchAnythingBut(Step,StepIndex);
      else Result := False;
    end;
  end;

var
  I: Integer;
  Matched: Boolean;
begin
  AnythingButCheck := False;
  Current := Pos;
  StepArray := TStepArray(CompiledRegEx);
  Steps := TList.Create;
  try
    // Initialize Steps with the first step in the regular expression
    Steps.Add(@StepArray[0]);

    while (Steps.Count > 0) and (Current <= Length(S)) do begin
      Matched := False;
      for I := Steps.Count - 1 downto 0 do begin
        Matched := MatchStep(I) or Matched;
      end;
      if not Matched then begin
        Result := Current - Pos;
        Exit;
      end;
      Inc(Current);
    end;
    Result := Current - Pos;
  finally
    Steps.Free;
  end;
end;

procedure Match(S: String; CompiledRegEx: TRegEx; var Start: Integer;
  out Length: Integer);
var
  I: Integer;
begin
  for I := Start to System.Length(S) do begin
    Length := MatchLength(S,I,CompiledRegEx);
    if Length > 0 then begin
      Start := I;
      Exit;
    end;
  end;
  Start := 0;
end;

end.
