{
  Scanner.pas

  Written by Frank Plagge
  Copyright (c) 1998 by Frank Plagge, Elsterweg 39, 38446 Wolfsburg, Germany
  All rights reserved

  Please send comments to plagge@positiv.escape.de

  V 1.01 - Jan 3rd, 1998
           first implementation, never trust a version 1.00 :-)
  V 1.02 - Jan 12th, 1998
           hexdecimals with the new token ttHexDecimal added
           the hex numbers are defined like C e.g. 0x12AB or like
           Pascal e.g. $12AB
           the state machine is extended by states 13, 14, 15 and 16

  *****************************************************************************
  Permission to use, copy,  modify, and distribute this software and its
  documentation without fee for any purpose is hereby granted, provided that
  the above copyright notice appears on all copies and that both that copyright
  notice and this permission notice appear in all supporting documentation.

  NO REPRESENTATIONS ARE MADE ABOUT THE SUITABILITY OF THIS SOFTWARE FOR ANY
  PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY.
  NEITHER FRANK PLAGGE OR ANY OTHER PERSON SHALL BE LIABLE FOR ANY DAMAGES
  SUFFERED BY THE USE OF THIS SOFTWARE.
  *****************************************************************************

   description:
   This module contains the component TScanner. It contains the complete
   lexical analysis of an ascii file. This is a necessary basis to realize
   a parser for any language. Ths analysis of an ascii file produces a stream
   of token.


   properties and events at design time and runtime:

    property AllowIdentifier: Boolean  -- default: true
    if this property is true, every identifier which is not a keyword will
    result a ttIdentifier token. if this property is false every non keyword
    identifier will cause a ttError token

    property CommentLine: string  -- default: '//'
    this property contains the leading char for a comment. every comment is
    introduced with this string and is ended by the end of line

    property Filename: string  -- default: ''
    this property contains the filename for the source file to analyze with
    the method analyze

    property Keywords: TStringList  -- default: empty
    the keyword of the scanner are stored in the property Keywords. the
    scanner component sorts list of keywords automatical. if the scanner reads
    an identifier and the identifier is a member of the keywords it will result
    a token ttKeyword

    property KeywordsCaseSensitive: Boolean -- default: false
    if this property is true the compare if an identifier is made case.

    property SpecialChars: string  -- default: ''
    every special chars that results a ttSpecialChar token have to be entered
    in this string. the scanner component sort the string automatical.

    property OnTokenRead: TTokenEvent
    this user defined event is called if a new token is read from the input
    file. this event is called atfer the token is read and before it is stored
    in the internal list of the scanner component. if any application dependent
    changes before registering the new token are neccessary it is possible to
    change every part of the new token.


   properties and methods at runtime:

    procedure Analyze
    this is the main method taking a source file to tokens. the name of the
    source file to analyze is the contents of the property filename. the
    number of read tokens is available via the property count. the read
    tokens are available in the array Token. before analyzing a new source file
    the results of a previously analysis are deleted.

    property Count: Integer
    this property contains the number of token read by the method analyze.

    property Token[Index: Integer]
    this property gives an easy access to the available token. a little example
    shows the access:

        for i := 1 to Count do
          WorkWithToken( Token[i-1] );

}


unit Scanner;

interface

uses
  Windows, Messages, SysUtils, Classes, Graphics, Controls, Forms, Dialogs,
  FileIO;

type

  // exception from scanner is the filename is empty or the file does not exist
  EScannerError = class(Exception);

  // set type for the special chars
  TCharSet = set of Char;

  // enumeration with the possible result token
  TTokenType = ( ttComment, ttEof, ttError,
                 ttHexDecimal, ttIdentifier, ttInteger,
                 ttKeyword, ttReal, ttSpecialChar, ttString );

  // a token contains the token type, the belonging text and the position
  // within the source file
  TToken = class
             Token:       TTokenType;   // token type
             Text:        string;       // representing text
             Row, Column: Integer;      // row and column within source file
             Tag:         Integer;      // user information
           end;

  // this event type is called if a token from the input file is read
  TTokenEvent = procedure (Sender: TObject; Token: TToken) of object;

  // decralation of the main scanner component
  TScanner = class(TComponent)
  private
    { Private-Decarations}
    FFilename:  string;              // filename of source file
    TokenList:  TList;               // list of read token
    EAState:    Byte;                // condition of the state machine
    EAText:     string;              // string that belongs to a token
    EAToken:    TTokenType;          // recognized token
    EARow:      Integer;             // Row of the first token char
    EAColumn:   Integer;             // Column of the first token char
    NextChar:   Char;                // next not processed char in source file
    FAllowIdentifier: Boolean;       // allow identifiers
    FCaseSensitive:   Boolean;       // detecting keywords case sensitive
    FCharacters:      TCharSet;      // allowed special chars
    FCommentLine:     string;        // introducing comment chars
    FKeywords:        TStringList;   // list of defined keywords
    FOnTokenRead: TTokenEvent;       // user defined event if a new token is read
    CommentIndex: Integer;           // index for comment recognition
    SourceFile:   TFile;             // file for lexical analysis
    SourceFileY:  Integer;           // actual row within source file
    SourceFileX:  Integer;           // actual column within source file
    procedure EASwitch0( ch: Char ); // process a char at state 0 ( start)
    procedure EASwitch1( ch: Char ); // process a char at state 1
    procedure EASwitch3( ch: Char ); // process a char at state 3
    procedure EASwitch5( ch: Char ); // process a char at state 5
    procedure EASwitch7( ch: Char ); // process a char at state 7
    procedure EASwitch9( ch: Char ); // process a char at state 9
    procedure EASwitch11( ch: Char );// process a char at state 11
    procedure EASwitch13( ch: Char );// process a char at state 13
    procedure EASwitch14( ch: Char );// process a char at state 14
    procedure EASwitch16( ch: Char );// process a char at state 16
    function  GetCount: Integer;     // read method for property Count
    function  GetToken( Index: Integer ) : TToken;  // read method for property Token
    function  GetSpecialChars: string;  // read method for property SpecialChars
    function  IsKeyword( Value: string ) : Boolean; // test if a string is a keyword
    procedure ProcessChar;                      // process the available char
    procedure ReadChFromFile( var ch: Char );   // read a char from file
    procedure ReadToken;                        // read the next token
    procedure SetKeywords( Value: TStringList );// write method for property Keywords
    procedure SetSpecialChars( Value: string ); // write method for SpecialChars
  protected
    { Protected-Deklarationen }
  public
    { Public-Deklarationen }
    constructor Create(AOwner: TComponent); override;     // create the scanner component
    destructor  Destroy; override;                        // destroy the scanner component
    procedure Analyze;                                    // analyze the input file
    property Count: Integer read GetCount;                // number of found token
    property Token[Index: Integer] : TToken read GetToken;// array with found token
  published
    { Published-Deklarationen }
    property AllowIdentifier: Boolean read FAllowIdentifier write FAllowIdentifier;
    property CommentLine: string read FCommentLine write FCommentLine;
    property Filename: string read FFilename write FFilename;
    property Keywords: TStringList read FKeywords write SetKeywords;
    property KeywordsCaseSensitive: Boolean read FCaseSensitive write FCaseSensitive;
    property SpecialChars: string read GetSpecialChars write SetSpecialChars;
    property OnTokenRead: TTokenEvent read FOnTokenRead write FOnTokenRead;
  end;

procedure Register;

implementation

const
  WhiteSpace: TCharSet = [' ', #9, #10, #13];   // known white spaces

// this is create constructor of the scanner. no changes to the
// inherited create, only initialization of internal and external
// variables
constructor TScanner.Create(AOwner: TComponent);
begin
  inherited Create( AOwner );       // create the class
  TokenList := TList.Create;        // create the list of read token
  FKeywords := TStringList.Create;  // create the list of keywords
  FKeywords.Sorted := true;         // sort the keywords
  FAllowIdentifier := true;         // default no identifier is allowed
  FCaseSensitive := false;          // default no case sensitive keyword compare
  FCharacters := [];                // default there are no special chars
  FCommentLine := '//';             // the default comment begin is '//'
  FFilename := '';                  // the filename is empty
end;

// this is destructor of the scanner. it is neccessary to free the internal
// dynamic data structures
destructor TScanner.Destroy;
begin
  FKeywords.Free;     // deallocate the memory used by the list of keywords
  TokenList.Free;     // deallocate the memory used by the list of read token
  inherited Destroy;  // destroy the class
end;

// this is the main analysis method
procedure TScanner.Analyze;
var NewToken: TToken;             // dynamic creation of read token
begin
  if Filename = '' then begin     // the filename must not be empty
    raise EScannerError.Create( 'Leerer Dateiname fr Scanner' );
  end;
  if not FileExists( Filename ) then begin  // and the file name must exists
    raise EScannerError.Create( 'Datei '+Filename+' ist nicht vorhanden' );
  end;
  SourceFile := TFile.Open( Filename ); // open the source file
  TokenList.Clear;                      // delete old results
  SourceFileY := 1;                     // first row is 1
  SourceFileX := 1;                     // first column is 1
  repeat                                // repeat until ttEOF Token is read
    ReadToken;                          // read the next token
    NewToken := TToken.Create;          // create the token structure
    NewToken.Token := EAToken;          // save the read token
    NewToken.Text := EAText;            // save the belonging text
    NewToken.Row := EARow;              // save column
    NewToken.Column := EAColumn;        // save row
    NewToken.Tag := 0;                  // initialize the user information
    // if an identifier is read and the belonging text is a keyword
    // then change the token type into ttKeyword
    if (NewToken.Token=ttIdentifier) and IsKeyword(NewToken.Text) then begin
      NewToken.Token := ttKeyword;
    end;
    // if an identifier is read and no identifiers are allowed then
    // the token type is changed to ttError
    if (NewToken.Token = ttIdentifier) and (not FAllowIdentifier) then begin
      NewToken.Token := ttError;
    end;
    // if a user defined event is available this event is called before the
    // token is put in the tokenlist
    if Assigned( FOnTokenRead ) then begin
      FOnTokenRead( Self, NewToken );
    end;
    TokenList.Add( NewToken );  // put the token in the tokenlist
  until EAToken = ttEof;
  SourceFile.Close;             // close the open source file
end;

// get the number of read token
function TScanner.GetCount: Integer;
begin
  Result := TokenList.Count; // read token are saved in internal list
end;

// get the already read token at index Index
function TScanner.GetToken( Index: Integer ) : TToken;
begin
  if (Index < 0 ) or (Index >= Count) then begin  // if the index is invalid
    Result := nil;                                // return nil
  end else begin
    Result := TokenList.Items[Index];             // else return the token
  end;
end;

// internal the specail char are stored in a set of char
// this method converts the set of char into a string
function TScanner.GetSpecialChars: string;
var i: Integer;
begin
  Result := '';                          // first there are no special chars
  for i := 0 to 255 do begin             // for all possible chars
    if Chr(i) in FCharacters then begin  // if the char is in the set
      Result := Result + Chr(i);         // add the char to the string
    end;
  end;
end;

// this method tests if a string is a keyword. the keywords are defined in
// the list Keywords
function TScanner.IsKeyword( Value: string ) : Boolean;
var KeyCompare: string;
    i:          Integer;
begin
  Result := false;                        // first the string is not a keyword
  for i := 1 to FKeywords.Count do begin  // for all defined keywords
    KeyCompare := FKeywords.Strings[i-1]; // get the keyword at index i-1
    if not FCaseSensitive then begin      // if no case sensitive compare
      Value := Uppercase( Value );        // only the uppercase strings are compared
      KeyCompare := Uppercase( KeyCompare );
    end;
    if Value = KeyCompare then begin      // if the two strings are equal
       Result := true;                    // the given string is a keyword
       Break;                             // exit the for loop
    end;
  end;
end;

// this method reads a char from the source file and adds it to the
// actual token text
procedure TScanner.ProcessChar;
var ch: Char;
begin
  ReadChFromFile( ch );    // read a char
  EAText := EAText + ch;   // add the char to the actual token text
end;

// read a new char from the input file
// the char #10 is used as global linefeed; MAC file has only #13 as linefeed,
// havn't they? Sorry!
// this procedure count the actual row and colum of the input file.
procedure TScanner.ReadChFromFile( var ch: Char );
begin
  SourceFile.ReadCh( ch );         // read the next char
  if not SourceFile.Eof then begin // if not Eof the calculate row/column
    if ch = #10 then begin         // if the read char a linefeed
      Inc(SourceFileY, 1);         // increment the row
      SourceFileX := 1;            // next column is 1
    end else begin                 // if no lienfeed is found
      if ch <> #13 then begin      // and no other linefeed is found
        Inc(SourceFileX, 1);       // increment the column
      end;
    end;
  end;
end;

// read the next token with a state machine
procedure TScanner.ReadToken;
begin
  EAState := 0;  // the first state is zero
  EAText := '';  // first the token text is empty
  if SourceFile.Eof then begin  // if the end of file is reached
    EAToken := ttEof;           // create a ttEof token
  end else begin
    while true do begin              // endless loop for reading, leaved by a final state
      NextChar := SourceFile.NextCh; // get the next char without reading it (look ahead)
      case EAState of                // process the char in notice to the actual state
        0: EASwitch0( NextChar );
        1: EASwitch1( NextChar );
        3: EASwitch3( NextChar );
        5: EASwitch5( NextChar );
        7: EASwitch7( NextChar );
        9: EASwitch9( NextChar );
       11: EASwitch11( NextChar );
       13: EASwitch13( NextChar );
       14: EASwitch14( NextChar );
       16: EASwitch16( NextChar );
      end;
      // check if a final state is reached
      case EAState of
        2: begin      // final state 2 represents a read identifier
             EAToken := ttIdentifier;
             Break;
           end;
        4: begin     // final state 4 represents a read integer
             EAToken := ttInteger;
             Break;
           end;
        6: begin     // final state 6 represents a read real number
             EAToken := ttReal;
             Break;
           end;
        8: begin     // final state 8 represents a read string
             EAToken := ttString;
             Break;
           end;
       10: begin     // final state 10 represents a read special char
             EAToken := ttSpecialChar;
             Break;
           end;
       12: begin     // final state 12 represents a read comment
             EAToken := ttComment;
             Break;
           end;
       15: begin     // final state 15 represents a read hexadecimal number
             EAToken := ttHexdecimal;
             Break;
           end;
       98: begin     // final state 98 represents a lexical error
             EAToken := ttError;
             Break;
           end;
       99: begin     // final state 99 represents the end of the source file
             EAToken := ttEOF;
             Break;
           end;
      end;
    end;
  end;
end;

// this method seems to be unnecessary, but it is very important for
// the correct work of TStringList in the object inspector
procedure TScanner.SetKeywords( Value: TStringList );
begin
  FKeywords.Assign( Value );
end;

// the user defines the special char in a string. this string must be converted
// into a set of char. working with a set of char is much easier, but i
// do not want to implement a new property editor for die usage with the
// object inspector
procedure TScanner.SetSpecialChars( Value: string );
var i: Integer;
begin
  FCharacters := [];                          // first the set of char is empty
  for i := 1 to Length(Value) do begin        // for every char in the string
    FCharacters := FCharacters + [Value[i]];  // add the char to the set
  end;
end;

// the following methods are characterizing the internal state machine

// process a char if the state machine has the state 0
// state 0 is the starting state
procedure TScanner.EASwitch0( ch: Char );
begin
  if SourceFile.Eof then begin    // if the end of source file is reached
    EAState := 99;                // switch to state 99
  end else begin
    case ch of
      'a'..'z',                   // if a normal char is read switch to state 1
      'A'..'Z'  : begin           // and try to read an identifier
                    EAState := 1;
                    EARow := SourceFileY;
                    EAColumn := SourceFileX;
                    ProcessChar;
                  end;
      '$'       : begin            // if a '$' is available switch to state 16
                    EAState := 16; // and try to read a hex number or a spacial char
                    EARow := SourceFileY;
                    EAColumn := SourceFileX;
                    ProcessChar;
                  end;  
      '0'       : begin            // if a '0' is read switch to state 13
                    EAState := 13; // and try to read an integer, real or hex number
                    EARow := SourceFileY;
                    EAColumn := SourceFileX;
                    ProcessChar;
                  end;
      '1'..'9'  : begin           // if number is read switch to state 3
                    EAState := 3; // and try to read a integer or a floting point number
                    EARow := SourceFileY;
                    EAColumn := SourceFileX;
                    ProcessChar;
                  end;
      ''''      : begin           // if the char ' is read switch to state 7
                    EAState := 7; // and try to read a string limited by '
                    EARow := SourceFileY;
                    EAColumn := SourceFileX;
                    ReadChFromFile( ch );
                  end;
      '"'       : begin           // if the char " is read switch to state 9
                    EAState := 9; // and try to read a string limited by "
                    EARow := SourceFileY;
                    EAColumn := SourceFileX;
                    ReadChFromFile( ch );
                  end;
    else          begin
                    if ch in WhiteSpace then begin // if a white space is read
                      ReadChFromFile( ch );        // read the next char
                    // if a comment beginning is defined and if the char is its first char
                    end else if (Length(FCommentLine) <> 0) and (ch = FCommentLine[1]) then begin
                      EAState := 11;             // switch to state 11
                      EARow := SourceFileY;      // save row and column
                      EAColumn := SourceFileX;
                      CommentIndex := 1;         // the first char of the comment beginning is processed
                      ReadChFromFile( ch );      // read the next char
                    // if the actual char is a member of the special chars
                    end else if ch in FCharacters then begin
                      EAState := 10;             // switch to state 10
                      EARow := SourceFileY;      // save row and column
                      EAColumn := SourceFileX;
                      ProcessChar;               // process this char
                    // else an illegal char is read and this will cause an error
                    end else begin
                      EAState := 98;             // switch to state 98
                      EARow := SourceFileY;      // save row and column
                      EAColumn := SourceFileX;
                      ProcessChar;               // process this char
                    end;
                  end;
    end;
  end;
end;

// process a char if the state machine has the state 1.
// in this state the state machines tries to read an identifier. an identifier
// consists of a leading char and any following number or char
procedure TScanner.EASwitch1( ch: Char );
begin
  case ch of
    'a'..'z',
    'A'..'Z',
    '0'..'9'  : ProcessChar;      // if a char or a number is read stay in this state
  else          EAState := 2;     // else switch to final state 2
  end;
end;

// process a char if the state machine has the state 3.
// in this state a integer or a floating point number is read
procedure TScanner.EASwitch3( ch: Char );
begin
  case ch of
    '0'..'9'  : ProcessChar;    // if a number is read the char is processed
    '.',','   : begin           // if a '.' or a ',' is read the char is processed
                  ProcessChar;  // and the state is switched to state 5 in order
                  EAState := 5; // to read a floating point number
                end;
  else          EAState := 4;   // the state is switched to final state 4
  end;
end;

// process a char if the state machine has the state 5.
// in this state floating point number is read
procedure TScanner.EASwitch5( ch: Char );
begin
  case ch of
    '0'..'9'  : ProcessChar    // if a number is read process the char
  else          EAState := 6;  // else the state is switched to final state 6
  end;
end;

// process a char if the state machine has the state 7
// in this state string enclosed in ' is read
procedure TScanner.EASwitch7( ch: Char );
begin
  case ch of
    #0, #10,                    // if a #0 (eof) or a linefeed char is read there is an
    #13     : EAState := 98;    // error because the string is not finished
    ''''    : begin             // if the final ' is read
                EAState := 8;   // switch to final state 8 and read the next char
                ReadChFromFile( ch );
              end;
  else        ProcessChar;      // else the char is a member of the string
  end;
end;

// process a char if the state machine has the state 9
// in this state string enclosed in " is read
procedure TScanner.EASwitch9( ch: Char );
begin
  case ch of
    #0, #10,                    // if a #0 (eof) or a linefeed char is read there is an
    #13     : EAState := 98;    // error because the string is not finished
    '"'  : begin                // if the final " is read
             EAState := 8;      // switch to final state 8 and read the next char
             ReadChFromFile( ch );
           end;
  else     ProcessChar;         // else the char is a member of the string
  end;
end;

// process a char if the state machine has the state 11
// in this state the state machines tries to read a comment.
// a comment begins with the introducing user defined comment string. the
// first char of the comment is already read by the state 0. it is possible
// that the first char of the comment string is a special char. the state
// machine can handle this in state 11.
procedure TScanner.EASwitch11( ch: Char );
begin
  // is the introducing string read complete
  if CommentIndex < Length(FCommentLine) then begin
    // is the next char the next expected comment char within the comment string
    if ch = FCommentLine[CommentIndex+1] then begin
      Inc(CommentIndex);       // increment the comment index counter
      ReadChFromFile( ch );    // read the next char
    end else begin
      // if not the expected introducing char is available but the char is
      // the first char within the comment string and the char is a member
      // of the special chars, not a comment is read but a special char
      if (CommentIndex = 1) and (ch in FCharacters+WhiteSpace) then begin
        EAState := 10;             // switch to final state 10
        EAText := FCommentLine[1]; // the read special char is the first comment char
      end else begin               // else an error is occured
        EAState := 98;             // switch to the error state 98
        ProcessChar;               // process the available char
      end;
    end;
  end else begin                   // the introducing comment is complete
    case ch of                     // every linefeed finishes the comment
       #10, #13 : begin
                    EAState := 12;        // switch to final state 12
                    ReadChFromFile( ch ); // read the next char
                    // delete every leading blank from the comment line
                    while (Length(EAText)<> 0) and (EAText[1] = ' ') do begin
                      System.Delete( EAText, 1, 1);
                    end;
                    // delete every last blank from the comment line
                    while (Length(EAText)<> 0) and (EAText[Length(EAText)] = ' ') do begin
                      System.Delete( EAText, Length(EAText), 1);
                    end;
                  end;
    else     ProcessChar;  // if no linefeed is read add the char to the uncomplete comment line
    end;
  end;
end;

// process a char if the state machine has the state 13
// in this state a zero is already read and it is possible that it is
// an integer, a real or a hex number
procedure TScanner.EASwitch13( ch: Char );
begin
  case ch of
    'x', 'X': begin           // if a 'x' is read a hexnumer is found
               EAState := 14; // switch to state 14 to read rest of the hex number
               ProcessChar;
               EAText := '';  // clear the input text '0x' because this is not a part of the hex number
             end;
   '0'..'9': begin            // if another number is read it is an integer or
               EAState := 3;  // real number will follow
               ProcessChar;
             end;
   '.', ',': begin            // if a '.' or a ',' is read the char is processed
               ProcessChar;   // and the state is switched to state 5 in order
               EAState := 5;  // to read a floating point number
             end;
  else       EAState := 4;  // the state is switched to final state 4 ( single '0' read)
  end;
end;

// process a char if the state machine has the state 14
// in this state a hex number is read
procedure TScanner.EASwitch14( ch: Char );
begin
  case ch of
    'a'..'f',
    'A'..'F',
    '0'..'9' : ProcessChar;
    'g'..'z',
    'G'..'Z' : begin             // an error occured because
                 ProcessChar;    // another char without whitespace
                 EAState := 98;  // is not allowed
               end;
  else         EAState := 15;  // the state is switched to final state 15 (single '0')
  end;
end;

// process a char if the state machine has the state 14
// in this state a hex number is read
procedure TScanner.EASwitch16( ch: Char );
begin
  case ch of
    'a'..'f',                    // is an allowed hex char is read
    'A'..'F',
    '0'..'9' : begin
                 EAState := 14;  // switch to state 14 and read the complete hex number
                 EAText := '';   // clear the formerly read '$'
                 ProcessChar;
               end;
  else         begin
                 if '$' in FCharacters then begin // if $ is defined as a special char
                   EAState := 10;                 // switch to state final 10, do not read next char!!
                 end else begin
                   ProcessChar;     // if $ is not the first char of a hex number
                   EAState := 98;   // and no special char there is an error
                 end;
               end;
  end;
end;

// register the component at the component palette
// perhaps you want to change the palette register name
procedure Register;
begin
  RegisterComponents('Erweitert', [TScanner]);
end;

end.
