{
@abstract(provides simplified Pascal tokenizer)
@created(7 Mar 1999)
@lastmod(13 Apr 2000)
@author(Marco Schmidt (marcoschmidt@geocities.com))

The @link(TTokenizer) object creates @link(TToken) objects (tokens) for the
Pascal programming language from a character input stream.
The @link(Scanning) unit does the same (it actually uses this unit's
tokenizer), with the exception that it evaluates compiler directives,
which are comments that start with a dollar sign.
}

unit Tokenizi;

{$I platform.inc}

interface

uses
  Objects,
  Streams,
  Texts;

type
  { enumeration type that provides all types of tokens; each token's name
    starts with TOK_ }
  TTokenType = (TOK_WHITESPACE, TOK_COMMENT, TOK_IDENTIFIER,
    TOK_NUMBER, TOK_STRING, TOK_SYMBOL, TOK_DIRECTIVE, TOK_RESERVED);

const
  TOKEN_TYPE_NAMES: array[TTokenType] of string[15] =
  ('Whitespace', 'Comment', 'Identifier',
    'Number', 'String', 'Symbol', 'Directive', 'Reserved');

type
  { enumeration type that provides all types of symbols; each
    symbol's name starts with SYM_ }
  TSymbolType = (SYM_PLUS, SYM_MINUS, SYM_ASTERISK, SYM_SLASH, SYM_EQUAL,
    SYM_LESS_THAN, SYM_LESS_THAN_EQUAL, SYM_GREATER_THAN,
    SYM_GREATER_THAN_EQUAL, SYM_LEFT_BRACKET, SYM_RIGHT_BRACKET,
    SYM_COMMA, SYM_LEFT_PARENTHESIS, SYM_RIGHT_PARENTHESIS, SYM_COLON,
    SYM_SEMICOLON, SYM_ROOF, SYM_PERIOD, SYM_AT, SYM_LEFT_BRACE,
    SYM_RIGHT_BRACE, SYM_DOLLAR, SYM_NUMBER, SYM_ASSIGN, SYM_RANGE);
  { a pointer to @link(TToken) }
  PToken = ^TToken;
  { Stores the exact type and additional information on one token.
    Additionally, @link(Data) stores the array of characters }
  TToken = object(TObject)
    { the exact character representation of this token as it was found in the
      input file }
    Data: PText;
    { additional information on this token as a record of variant fields }
    Info: record
      case Integer of
        0: (WhitespaceRows: LongInt);
        2: (SymbolType: TSymbolType);
        3: (ReservedKey: LongInt);
    end;
    { the type of this token as @link(TTokenType) }
    MyType: TTokenType;
    { Create a token of and assign the argument token type to @link(MyType) }
    constructor Init(TT: TTokenType);
    function GetTypeName: string;
    { Returns if argument ST equals @link(MyType) }
    function IsSymbol(st: TSymbolType): Boolean;
  end;
  { a set of Char values }
  TCharSet = set of Char;
  { pointer to a @link(TScanner) }
  PTokenizer = ^TTokenizer;
  { @abstract(converts a @link(TInputStream) to a sequence of @link(TToken) objects)
  }
  TTokenizer = object(TObject)
    { if @link(IsCharBuffered) is true, this field contains the buffered
   character }
    BufferedChar: Char;
    { the last error message }
    ErrorMessage: string;
    { true if end of stream @link(Stream) has been reached, false otherwise }
    EOS: Boolean;
    { if this is true, @link(BufferedChar) contains a buffered character;
      the next call to @link(GetChar) or @link(PeekChar) will return this
      character, not the next in the associated stream @link(Stream) }
    IsCharBuffered: Boolean;
    { current row in stream @link(Stream); useful when giving error messages }
    Row: LongInt;
    { the input stream this tokenizer is working on }
    Stream: PInputStream;
    { Creates a TTokenizer and associates it with given @link(TInputStream). }
    constructor Init(s: PInputStream);
    { Releases all dynamically allocated memory. }
    destructor Done; virtual;
    procedure CheckForDirective(var t: TToken; Offset: Integer);
    procedure ConsumeChar;
    function CreateSymbolToken(st: TSymbolType; s: string; var t: PToken): Boolean;
    function CreateToken(TT: TTokenType; var t: PToken): Boolean;
    function getChar(var c: Char): Boolean;
    function HasData: Boolean;
    function GetStreamInfo: string;
    function GetToken(var t: PToken): Boolean;
    function PeekChar(var c: Char): Boolean;
    function ReadCommentType1(var t: PToken): Boolean;
    function ReadCommentType2(var t: PToken): Boolean;
    function ReadCommentType3(var t: PToken): Boolean;
    function ReadLiteralString(var t: PToken): Boolean;
    function ReadToken(c: Char; s: TCharSet; TT: TTokenType; var t: PToken): Boolean;
  end;

const
  TokenTypeNames: array[TOK_WHITESPACE..TOK_RESERVED] of string[15] =
  ('whitespace', 'comment', 'identifier', 'number', 'string',
    'symbol', 'directive', 'reserved');
  KEY_AND = 0;
  KEY_ARRAY = 1;
  KEY_AS = 2;
  KEY_ASM = 3;
  KEY_BEGIN = 4;
  KEY_CASE = 5;
  KEY_CLASS = 6;
  KEY_CONST = 7;
  KEY_CONSTRUCTOR = 8;
  KEY_DESTRUCTOR = 9;
  KEY_DISPINTERFACE = 10;
  KEY_DIV = 11;
  KEY_DO = 12;
  KEY_DOWNTO = 13;
  KEY_ELSE = 14;
  KEY_END = 15;
  KEY_EXCEPT = 16;
  KEY_EXPORTS = 17;
  KEY_FILE = 18;
  KEY_FINALIZATION = 19;
  KEY_FINALLY = 20;
  KEY_FOR = 21;
  KEY_FUNCTION = 22;
  KEY_GOTO = 23;
  KEY_IF = 24;
  KEY_IMPLEMENTATION = 25;
  KEY_IN = 26;
  KEY_INHERITED = 27;
  KEY_INITIALIZATION = 28;
  KEY_INLINE = 29;
  KEY_INTERFACE = 30;
  KEY_IS = 31;
  KEY_LABEL = 33;
  KEY_LIBRARY = 33;
  KEY_MOD = 34;
  KEY_NIL = 35;
  KEY_NOT = 36;
  KEY_OBJECT = 37;
  KEY_OF = 38;
  KEY_ON = 39;
  KEY_OR = 40;
  KEY_PACKED = 41;
  KEY_PROCEDURE = 42;
  KEY_PROGRAM = 43;
  KEY_PROPERTY = 44;
  KEY_RAISE = 45;
  KEY_RECORD = 46;
  KEY_REPEAT = 47;
  KEY_RESOURCESTRING = 48;
  KEY_SET = 49;
  KEY_SHL = 50;
  KEY_SHR = 51;
  KEY_STRING = 52;
  KEY_THEN = 53;
  KEY_THREADVAR = 54;
  KEY_TO = 55;
  KEY_TRY = 56;
  KEY_TYPE = 57;
  KEY_UNIT = 58;
  KEY_UNTIL = 59;
  KEY_USES = 60;
  KEY_VAR = 61;
  KEY_WHILE = 62;
  KEY_WITH = 63;
  KEY_XOR = 64;
  NUM_KEYS = 65;
  KeyWordArray: array[0..NUM_KEYS - 1] of string[15] =
  ('AND', 'ARRAY', 'AS', 'ASM', 'BEGIN', 'CASE', 'CLASS', 'CONST',
    'CONSTRUCTOR', 'DESTRUCTOR', 'DISPINTERFACE', 'DIV', 'DO', 'DOWNTO', 'ELSE', 'END',
    'EXCEPT', 'EXPORTS', 'FILE', 'FINALIZATION', 'FINALLY', 'FOR', 'FUNCTION',
    'GOTO', 'IF', 'IMPLEMENTATION', 'IN', 'INHERITED', 'INITIALIZATION',
    'INLINE', 'INTERFACE', 'IS', 'LABEL', 'LIBRARY', 'MOD', 'NIL', 'NOT',
    'OBJECT', 'OF', 'ON', 'OR', 'PACKED', 'PROCEDURE', 'PROGRAM', 'PROPERTY',
    'RAISE', 'RECORD', 'REPEAT', 'RESOURCESTRING', 'SET', 'SHL', 'SHR',
    'STRING', 'THEN', 'THREADVAR', 'TO', 'TRY', 'TYPE', 'UNIT', 'UNTIL', 'USES',
    'VAR', 'WHILE', 'WITH', 'XOR');
  SD_ABSOLUTE = 0;
  SD_ABSTRACT = 1;
  SD_APIENTRY = 2;
  SD_ASSEMBLER = 3;
  SD_AUTOMATED = 4;
  SD_CDECL = 5;
  SD_DEFAULT = 6;
  SD_DISPID = 7;
  SD_DYNAMIC = 8;
  SD_EXPORT = 9;
  SD_EXTERNAL = 10;
  SD_FAR = 11;
  SD_FORWARD = 12;
  SD_INDEX = 13;
  SD_MESSAGE = 14;
  SD_NAME = 15;
  SD_NEAR = 16;
  SD_NODEFAULT = 17;
  SD_OVERLOAD = 18;
  SD_OVERRIDE = 19;
  SD_PASCAL = 20;
  SD_PRIVATE = 21;
  SD_PROTECTED = 22;
  SD_PUBLIC = 23;
  SD_PUBLISHED = 24;
  SD_READ = 25;
  SD_REGISTER = 26;
  SD_REINTRODUCE = 27;
  SD_RESIDENT = 28;
  SD_STDCALL = 29;
  SD_STORED = 30;
  SD_VIRTUAL = 31;
  SD_WRITE = 32;
  NUM_STANDARD_DIRECTIVES = 33;
  StandardDirectiveArray: array[0..NUM_STANDARD_DIRECTIVES - 1] of string[15] =
  ('ABSOLUTE', 'ABSTRACT', 'APIENTRY', 'ASSEMBLER', 'AUTOMATED', 'CDECL',
    'DEFAULT', 'DISPID', 'DYNAMIC', 'EXPORT', 'EXTERNAL', 'FAR', 'FORWARD',
    'INDEX', 'MESSAGE', 'NAME', 'NEAR', 'NODEFAULT', 'OVERLOAD', 'OVERRIDE',
    'PASCAL', 'PRIVATE', 'PROTECTED', 'PUBLIC', 'PUBLISHED', 'READ', 'REGISTER',
    'REINTRODUCE', 'RESIDENT', 'STDCALL', 'STORED', 'VIRTUAL', 'WRITE');

var
  Keywords: PStringCollection = nil;
  StandardDirectives: PStringCollection = nil;

implementation

uses
  SysUtils,

  Arrays,
  Numbers;

const
  Whitespace = [#9, #10, #13, ' '];
  Letters = ['A'..'Z', 'a'..'z'];
  DecimalDigits = ['0'..'9'];
  HexadecimalDigits = DecimalDigits + ['A'..'F', 'a'..'f'];
  IdentifierStart = ['_'] + Letters;
  IdentifierOther = IdentifierStart + DecimalDigits;
  CharOther = HexadecimalDigits + ['$'];
  NumberStart = DecimalDigits + ['$'];
  NumberOther = HexadecimalDigits + ['.', '+', '-'];
  QuoteChar = '''';
  NUM_SINGLE_CHAR_SYMBOLS = 10;
  SingleCharSymbols: array[0..NUM_SINGLE_CHAR_SYMBOLS - 1] of
  record
    c: Char;
    s: TSymbolType;
  end =
  ((c: ';'; s: SYM_SEMICOLON),
    (c: ','; s: SYM_COMMA),
    (c: '['; s: SYM_LEFT_BRACKET),
    (c: ']'; s: SYM_RIGHT_BRACKET),
    (c: '+'; s: SYM_PLUS),
    (c: '-'; s: SYM_MINUS),
    (c: '*'; s: SYM_ASTERISK),
    (c: '='; s: SYM_EQUAL),
    (c: '^'; s: SYM_ROOF),
    (c: '@'; s: SYM_AT));

  { TToken }

constructor TToken.Init(TT: TTokenType);
begin
  inherited Init;
  MyType := TT;
end;

function TToken.GetTypeName: string;
begin
  GetTypeName := TOKEN_TYPE_NAMES[MyType];
end;

function TToken.IsSymbol(st: TSymbolType): Boolean;
begin
  IsSymbol := (MyType = TOK_SYMBOL) and (Info.SymbolType = st);
end;

{ TTokenizer }

constructor TTokenizer.Init(s: PInputStream);
begin
  inherited Init;
  EOS := False;
  ErrorMessage := '';
  IsCharBuffered := False;
  Row := 1;
  Stream := s;
end;

destructor TTokenizer.Done;
begin
  if Assigned(Stream) then Dispose(Stream, Done);
  inherited Done;
end;

procedure TTokenizer.CheckForDirective(var t: TToken; Offset: Integer);
begin
  if Assigned(t.Data) and (t.Data^.Content > Offset) and
    (t.Data^.Data[Offset] = '$') then
    begin
      t.MyType := TOK_DIRECTIVE;
      {WriteLn('Found directive: ', T.Data^.GetString);}
    end;
end;

procedure TTokenizer.ConsumeChar;
begin
  IsCharBuffered := False;
end;

function TTokenizer.CreateSymbolToken(st: TSymbolType; s: string; var t: PToken): Boolean;
begin
  if (not CreateToken(TOK_SYMBOL, t)) then
    begin
      Result := False;
      Exit;
    end;
  t^.Data^.AppendString(s);
  t^.Info.SymbolType := st;
  Result := True;
end;

function TTokenizer.CreateToken(TT: TTokenType; var t: PToken): Boolean;
begin
  Result := False;
  t := New(PToken, Init(TT));
  if (not Assigned(t)) then Exit;
  t^.Data := New(PText, Init);
  if (not Assigned(t^.Data)) then
    begin
      Dispose(t, Done);
      t := nil;
      Exit;
    end;
  Result := True;
end;

function TTokenizer.getChar(var c: Char): Boolean;
var
  s: TSInt32;
begin
  if IsCharBuffered then
    begin
      c := BufferedChar;
      IsCharBuffered := False;
      getChar := True;
    end
  else
    begin
      s := Stream^.ReadByte;
      c := Char(s);
      getChar := (s <> -1);
    end;
end;

function TTokenizer.GetStreamInfo: string;
begin
  GetStreamInfo := Stream^.GetName + '(' + IntToStr(Row) + ')';
end;

function TTokenizer.HasData: Boolean;
begin
  HasData := IsCharBuffered or Stream^.HasData;
end;

function TTokenizer.GetToken(var t: PToken): Boolean;
var
  c: Char;
  i: Integer;
  s: string {[255]};
begin
  Result := False;
  t := nil;
  if (not getChar(c)) then Exit;
  if (c in Whitespace) then
    begin
      if ReadToken(c, Whitespace, TOK_WHITESPACE, t) then
        begin
          { after successful reading all whitespace characters, update
            internal row counter to be able to state current row on errors;
            caveat: will fail on Mac files (row is 13) }
          i := 0;
          while (i < t^.Data^.Content) do
            begin
              if (t^.Data^.Data[i] = #10) then
                Inc(Row);
              Inc(i);
            end;
          Result := True;
        end
      else
        Result := False;
    end
  else
    if (c in IdentifierStart) then
      begin
        if ReadToken(c, IdentifierOther, TOK_IDENTIFIER, t) then
          begin
            { check if identifier is a reserved identifier }
            s := t^.Data^.GetString;
            Arrays.StringToUpper(s, s);
            i := Keywords^.IndexOf(@s);
            if (i <> -1) then
              begin
                t^.MyType := TOK_RESERVED;
                t^.Info.ReservedKey := i;
              end;
            Result := True;
          end
      end
    else
      if (c in NumberStart) then
        Result := ReadToken(c, NumberOther, TOK_NUMBER, t)
      else
        case c of
          QuoteChar: Result := ReadLiteralString(t);
          '#': Result := ReadToken(c, CharOther, TOK_STRING, t);
          '{':
            begin
              if ReadCommentType1(t) then
                begin
                  CheckForDirective(t^, 1);
                  Result := True;
                end;
            end;
          '(':
            begin
              c := ' ';
              if HasData and (not PeekChar(c)) then Exit;
              case c of
                '*':
                  begin
                    ConsumeChar;
                    if ReadCommentType2(t) then
                      begin
                        CheckForDirective(t^, 2);
                        Result := True;
                      end;
                  end;
                '.':
                  begin
                    ConsumeChar;
                    Result := CreateSymbolToken(SYM_LEFT_BRACKET, '(.', t);
                  end;
              else
                Result := CreateSymbolToken(SYM_LEFT_PARENTHESIS, '(', t);
              end;
            end;
          ')':
            begin
              c := ' ';
              Result := CreateSymbolToken(SYM_RIGHT_PARENTHESIS, ')', t);
            end;
          '.':
            begin
              c := ' ';
              if HasData and (not PeekChar(c)) then Exit;
              case c of
                '.':
                  begin
                    ConsumeChar;
                    Result := CreateSymbolToken(SYM_RANGE, '..', t);
                  end;
                ')':
                  begin
                    ConsumeChar;
                    Result := CreateSymbolToken(SYM_RIGHT_BRACKET, '.)', t);
                  end;
              else
                Result := CreateSymbolToken(SYM_PERIOD, '.', t);
              end;
            end;
          '/':
            begin
              c := ' ';
              if HasData and (not PeekChar(c)) then Exit;
              case c of
                '/':
                  begin
                    ConsumeChar;
                    Result := ReadCommentType3(t);
                  end;
              else
                Result := CreateSymbolToken(SYM_SLASH, '/', t);
              end;
            end;
          ':':
            begin
              c := ' ';
              if HasData and (not PeekChar(c)) then Exit;
              case c of
                '=':
                  begin
                    ConsumeChar;
                    Result := CreateSymbolToken(SYM_ASSIGN, ':=', t);
                  end;
              else
                Result := CreateSymbolToken(SYM_COLON, ':', t);
              end;
            end;
          '<':
            begin
              c := ' ';
              if HasData and (not PeekChar(c)) then Exit;
              case c of
                '=':
                  begin
                    ConsumeChar;
                    Result := CreateSymbolToken(SYM_LESS_THAN_EQUAL, '<=', t);
                  end;
              else
                Result := CreateSymbolToken(SYM_LESS_THAN, '<', t);
              end;
            end;
          '>':
            begin
              c := ' ';
              if HasData and (not PeekChar(c)) then Exit;
              case c of
                '=':
                  begin
                    ConsumeChar;
                    Result := CreateSymbolToken(SYM_GREATER_THAN_EQUAL, '<=', t);
                  end;
              else
                Result := CreateSymbolToken(SYM_GREATER_THAN, '<', t);
              end;
            end;
        else
          begin
            for i := 0 to NUM_SINGLE_CHAR_SYMBOLS - 1 do
              begin
                if (c = SingleCharSymbols[i].c) then
                  begin
                    Result := CreateSymbolToken(SingleCharSymbols[i].s,
                      SingleCharSymbols[i].c, t);
                    Exit;
                  end;
              end;
            ErrorMessage := 'Error: Invalid character in Pascal input stream - ';
          end;
        end;
end;

function TTokenizer.PeekChar(var c: Char): Boolean;
var
  s: TSInt32;
begin
  if IsCharBuffered then
    begin
      c := BufferedChar;
      PeekChar := True;
    end
  else
    begin
      if (Stream^.HasData) then
        begin
          s := Stream^.ReadByte;
          c := Char(s);
          BufferedChar := c;
          IsCharBuffered := True;
          PeekChar := True;
        end
      else
        begin
          EOS := True;
          PeekChar := False;
        end;
    end;
end;

function TTokenizer.ReadCommentType1(var t: PToken): Boolean;
var
  c: Char;
begin
  ReadCommentType1 := False;
  if (not CreateToken(TOK_COMMENT, t)) then Exit;
  t^.Data^.AppendChar('{');
  repeat
    if (not HasData) or (not getChar(c)) then
      begin
        Exit;
      end;
    if (c = #10) then
      Inc(Row);
    if (c = '{') then
      WriteLn(GetStreamInfo + 'Warning: Found opening brace { within comment, nested comments are not supported.');
    t^.Data^.AppendChar(c);
  until (c = '}');
  ReadCommentType1 := True;
end;

function TTokenizer.ReadCommentType2(var t: PToken): Boolean;
var
  c: Char;
  Finished: Boolean;
begin
  ReadCommentType2 := False;
  if (not CreateToken(TOK_COMMENT, t)) then Exit;
  t^.Data^.AppendString('(*');
  Finished := False;
  repeat
    if (not HasData) or (not getChar(c)) then
      begin
        Exit;
      end;
    t^.Data^.AppendChar(c);
    if (c = #10) then
      Inc(Row);
    if (c = '*') then
      begin
        if (not HasData) or (not PeekChar(c)) then
          begin
            Exit;
          end;
        if (c = ')') then
          begin
            ConsumeChar;
            t^.Data^.AppendChar(c);
            Finished := True;
          end;
      end;
  until Finished;
  ReadCommentType2 := True;
end;

function TTokenizer.ReadCommentType3(var t: PToken): Boolean;
var
  c: Char;
  Finished: Boolean;
begin
  ReadCommentType3 := False;
  if (not CreateToken(TOK_COMMENT, t)) then Exit;
  t^.Data^.AppendString('//');
  Finished := False;
  repeat
    if (not HasData) or (not getChar(c)) then
      begin
        Exit;
      end;
    if (c <> #10) and (c <> #13) then
      t^.Data^.AppendChar(c)
    else
      Finished := True;
  until Finished;
  ReadCommentType3 := True;
end;

function TTokenizer.ReadLiteralString(var t: PToken): Boolean;

  procedure ReleaseToken;
  begin
    Dispose(t, Done);
    t := nil;
  end;

var
  c: Char;
  Finished: Boolean;
begin
  ReadLiteralString := False;
  Finished := False;
  if (not CreateToken(TOK_STRING, t)) then Exit;
  t^.Data := New(PText, Init);
  if (not Assigned(t^.Data)) then
    begin
      ReleaseToken;
      Exit;
    end;
  t^.Data^.AppendChar('''');
  repeat
    if (not Stream^.HasData) then
      begin
        ErrorMessage := 'Error: tokenizer: unexpected end of stream.';
        ReleaseToken;
        Exit;
      end;
    if (not getChar(c)) then
      begin
        ErrorMessage := 'Error: tokenizer: could not read character.';
        ReleaseToken;
        Exit;
      end;
    if (c = QuoteChar) then
      begin
        if (not PeekChar(c)) then
          begin
            ErrorMessage := 'Error: tokenizer: could not peek character.';
            ReleaseToken;
            Exit;
          end;
        if (c = QuoteChar) then { escaped single quote within string }
          begin
            ConsumeChar;
            t^.Data^.AppendChar(QuoteChar);
          end
        else { end of string }
          begin
            Finished := True;
          end;
        t^.Data^.AppendChar(QuoteChar);
      end
    else
      begin
        t^.Data^.AppendChar(c);
      end;
  until Finished;
  ReadLiteralString := True;
end;

function TTokenizer.ReadToken(c: Char; s: TCharSet; TT: TTokenType; var t: PToken): Boolean;
begin
  Result := False;
  if (not CreateToken(TT, t)) then Exit;
  t^.Data^.AppendChar(c);
  repeat
    if (not PeekChar(c)) then
      begin
        if EOS then
          begin
            Result := True;
          end
        else
          begin
            Dispose(t, Done);
            t := nil;
          end;
        Exit;
      end;
    if (c in s) then
      begin
        t^.Data^.AppendChar(c);
        ConsumeChar;
      end
    else
      begin
        Result := True;
        Exit;
      end;
  until False;
end;

{ global functions and procedures }

procedure InitArrays;
var
  i: Integer;
begin
  Keywords := New(PStringCollection, Init(NUM_KEYS, 1));
  for i := 0 to NUM_KEYS - 1 do
    begin
      Keywords^.Insert(Objects.NewStr(KeyWordArray[i]));
    end;
  StandardDirectives := New(PStringCollection, Init(NUM_STANDARD_DIRECTIVES, 1));
  for i := 0 to NUM_STANDARD_DIRECTIVES - 1 do
    begin
      StandardDirectives^.Insert(Objects.NewStr(StandardDirectiveArray[i]));
    end;
end;

begin
  InitArrays;
end.

