(***********************************************************************)
(*                                                                     *)
(*                           Htmlc                                     *)
(*                                                                     *)
(*                    Pierre Weis, INRIA Rocquencourt                  *)
(*                                                                     *)
(*  Copyright 2007 Institut National de Recherche en Informatique et   *)
(*  en Automatique.  Distributed only by permission.                   *)
(*                                                                     *)
(***********************************************************************)

(* $Id: parser.ml,v 1.32 2012-10-12 08:14:03 weis Exp $ *)

(** {A very basic paring engine for Htmlc source files} *)

type marker = char;;

type error =
   | Invalid_lexeme_start of char
   | Invalid_parenthesis of char
   | Unterminated_comment
   | Unterminated_lexeme
;;

type reading_mode =
   | Input_line_mode
   | Get_line_mode
(** How do we read (multi)-lines: the resulting string may or may not have a
   final newline character, according to the reading mode. More precisely:

   - [Get_line_mode]: the result includes the final newline if any (as could be
   normal to distinguish between a true proper line and the final ``line'' of a
   file which could be without any newline character).

   - [Input_line_mode]: the result does not include a final newline (as in the
     basic [Pervasives.input_line function)].
*)
;;

type read_line_mode =
   | Verbatim_multi_line of reading_mode
   | Escaped_multi_line of reading_mode
(** How do we treat escaped end of lines: the resulting line may or may not
    have escaped end of lines and additional material in it.

   - [Verbatim_multi_line mode]: the result still have escaped end of
     lines (including spaces and backslashes).
   - [Escaped_multi_line mode]: the result is free of escaped end of
     lines (including spaces and backslashes).
*)
;;

(** In a definition "<ident> = <expression>", the <expression> can be parsed
   - as usual ([Parsing_mode]) or
   - just spreading to the next newline character ([End_of_line_mode]).
*)
type binding_mode =
   | Parsing_mode
   | End_of_line_mode
;;

type lexeme =
   | Number of float
   | String of string
   | Char of string
   | Bool of string
   | Ident of string
   | Symbol of string
   | Quoted of string
   | Forced of string
   | Let | Rec | Quote
   | Eq | SemiSemi
(*   | If | Then | Else*)
   | Eof
;;

exception Error of error;;

let string_of_exc = function
  | Error x ->
    begin match x with
    | Invalid_lexeme_start c ->
      Printf.sprintf "syntax error: ``%c'' cannot start a token." c
    | Invalid_parenthesis c ->
      Printf.sprintf "syntax error: ``%c'' cannot be a parenthesis marker." c
    | Unterminated_comment ->
      Printf.sprintf "syntax error: unterminated comment."
    | Unterminated_lexeme ->
      Printf.sprintf "syntax error: unterminated lexeme." end
  | x -> Printexc.to_string x
;;

let error e = raise (Error e);;

let lexeme_of_ident = function
  | "let" -> Let | "rec" -> Rec
(*  | "if" -> If | "then" -> Then | "else" -> Else*)
  | "true" | "false" as ident -> Bool ident
  | "quote" -> Quote
  | ident ->
    Debug.printf "Found ident %S.\n" ident;
    Ident ident
;;

let symbol_of_ident = function
  | "=" -> Eq
  | ";;" -> SemiSemi
  | ident -> Symbol ident
;;

(** Simplified version of a Caml comment lexer:
   we parse ".*" followed by an end of comment marker * ), i.e. until
   encountering a closing comment marker.
   We also skip strings in comments (although this could be overkill). *)
let rec skip_caml_comments sb =
  if Scanf.Scanning.end_of_input sb then error Unterminated_comment else
  Scanf.bscanf sb "%_[^\"*(]%0c" (function
   | '\"' ->
     Scanf.bscanf sb "%_S" (); skip_caml_comments sb
   | '*' ->
     Scanf.bscanf sb "%_c%0c" (function
     | ')' -> Scanf.bscanf sb "%_c" ()
     | _ -> skip_caml_comments sb)
   | '(' ->
     Scanf.bscanf sb "%_c%0c" (function
     | '*' ->
       Scanf.bscanf sb "%_c" ();
       skip_caml_comments sb;
       skip_caml_comments sb
     | _ -> skip_caml_comments sb)
   | _ -> skip_caml_comments sb)
;;

let quote_buff = Buffer.create 1024;;

let return_quote () =
  let res = Buffer.contents quote_buff in
  Buffer.clear quote_buff;
  Debug.printf "Returning quote_buff : %S" res;
  res
;;

let scan_quote sb =
  Debug.printf "Scan_quote.\n";
  let rec q sb =
    Scanf.bscanf sb "%[^\\\']%c" (fun s c ->
      match c with
      | '\'' ->
        Debug.printf "Found a \'.\n";
        Buffer.add_string quote_buff s;
        Quoted (return_quote ())
      | '\\' ->
        Debug.printf "Found a \\.\n";
        Buffer.add_char quote_buff c;
        Scanf.bscanf sb "%c" (fun c ->
          Buffer.add_char quote_buff c;
          q sb)
      | c ->
        Debug.printf "Found char %C\n" c;
        Buffer.add_char quote_buff c; q sb) in
  q sb
;;

let scan_backquote sb =
  let rec bq sb =
    Scanf.bscanf sb "%[^\\`]%c" (fun s c ->
      match c with
      | '`' -> (* ` helping emacs *)
        Buffer.add_string quote_buff s;
        Forced (return_quote ())
      | '\\' ->
        Buffer.add_char quote_buff c;
        Scanf.bscanf sb "%c" (fun c ->
          Buffer.add_char quote_buff c;
          bq sb)
      | c ->
        Buffer.add_char quote_buff c; bq sb) in
  bq sb
;;

let rec find_closing sb closing accu =
  Scanf.bscanf sb "%0c" (function
  | c when c = closing ->
    Scanf.bscanf sb "%_c"
     (Ident (String.concat "" (List.rev accu)))
  | '\"' ->
    Scanf.bscanf sb "%S" (fun s ->
    find_closing sb closing (Printf.sprintf "%S" s :: accu))
  | _c ->
    (if closing = ')'
     then Scanf.bscanf sb "%[^)\"]"
     else Scanf.bscanf sb "%[^}\"]")
     (fun ident -> find_closing sb closing (ident :: accu)))
;;

let closing = function
  | '(' -> ')'
  | '{' -> '}'
  | '[' -> ']'
  | '<' -> '>'
  | c -> raise (Error (Invalid_parenthesis c))
;;

let rec scan_lexeme_other marker sb = function
  | '\"' ->
    (* Looking for a string *)
    Debug.printf "Expecting a string.\n";
    Scanf.bscanf sb "%S" (fun s -> String s)
  | '0' .. '9' | '-' | '+' ->
    (* Looking for a number *)
    Debug.printf "Expecting a number.\n";
    Scanf.bscanf sb "%f" (fun f -> Number f)
  | '\'' ->
    (* Looking for a char *)
    Debug.printf "Expecting a char.\n";
    Scanf.bscanf sb "%C" (fun s -> Char (Char.escaped s))
  | 'a' .. 'z' | 'A' .. 'Z' | '_' ->
    (* Looking for an identifier *)
    Debug.printf "Expecting a regular ident.\n";
    Scanf.bscanf sb "%[a-zA-Z_0-9]" lexeme_of_ident
  | '(' | '[' | '{' | '}' | ']' | ')' ->
    (* Looking for a monocharacter symbol *)
    Debug.printf "Expecting a symbol.\n";
    Scanf.bscanf sb "%1[][{()}]" symbol_of_ident
  | c when c = marker ->
    (* Looking for an Htmlc ident or expression (marker is $ by default). *)
    Debug.printf "Found a %C sign.\n" marker;
    Scanf.bscanf sb "%_c" ();
    if Scanf.Scanning.end_of_input sb
      then Symbol (String.make 1 marker) else
    Scanf.bscanf sb "%0c" (function
    (* Looking for a parenthesized $(ident) or $(expression). *)
    | '(' | '{' as c -> (* Helping emacs '}' ')' *)
      Debug.printf "Expecting a %c(ident).\n" marker;
      Scanf.bscanf sb "%_c%0c" (lex_paren_ident_expr sb (closing c))
    (* Looking for a regular regular Hmlc ident. *)
    | 'a' .. 'z' | 'A' .. 'Z' | '_' ->
      Debug.printf "Expecting a %c ident.\n" marker;
      Scanf.bscanf sb "%[a-zA-Z_0-9]" (fun ident -> Ident ident)
    | c -> error (Invalid_lexeme_start c))
  | '~' | '`' (* helping emacs ` *)
  | '@' | '$' | '%' | '^' | '&'
  | '*' | '=' | '|' | ':'
  | ';' | ',' | '<' | '>' | '.' | '?' | '/' ->
    (* Looking for a terminator symbol *)
    Debug.printf "Expecting a symbol.\n";
    Scanf.bscanf sb "%[~`%@$%%^&*|:;,<>.?/-_]" symbol_of_ident
  | c -> error (Invalid_lexeme_start c)

and lex_paren_ident_expr sb closing = function
  | '\'' ->
    (* Looking for a $('...') quote expression. *)
    Debug.printf "Expecting a quoted expression.\n";
    let tok = Scanf.bscanf sb "%_c" scan_quote sb in
    Scanf.bscanf sb "%_c" tok
  | '`' -> (* ` helping emacs *)
    (* Looking for a $(`...`) backquote expression. *)
    Debug.printf "Expecting a BACKquoted expression.\n";
    let tok = Scanf.bscanf sb "%_c" scan_backquote sb in
    Scanf.bscanf sb "%_c" tok
  | 'a' .. 'z' | 'A' .. 'Z' | '_' ->
    (* Looking for a $(ident) or a ${ident} or a $( expression ). *)
    Debug.printf "Expecting a %c(ident).\n" (Htmlc_options.get_marker_char ());
    (if closing = ')'
     then Scanf.bscanf sb "%[^)\"]"
     else Scanf.bscanf sb "%[^}\"]")
      (fun ident -> find_closing sb closing [ident])
  | c -> error (Invalid_lexeme_start c)
;;

let rec scan_lexeme marker sb =
  if Scanf.Scanning.end_of_input sb then Eof else
  Scanf.bscanf sb " %0c" (function
  (* Our own uni-line comments. *)
  | '#' -> Scanf.bscanf sb "%_[^\n\r] " (); scan_lexeme marker sb
  (* Regular Caml comments starting with a left paren. *)
  | '(' -> (* ')' *)
    Scanf.bscanf sb "%_c" ();
    if Scanf.Scanning.end_of_input sb then Symbol "(" (* ")" *) else
    Scanf.bscanf sb "%0c" (function
    | '*' ->
      Scanf.bscanf sb "%_c" skip_caml_comments sb; scan_lexeme marker sb
    | _ -> Symbol "(" (* ")" *))
  (* Scan a lexeme that begins with char [c]. *)
  | c -> scan_lexeme_other marker sb c)
;;

let parse_expression marker sb =
  try scan_lexeme marker sb with
  | End_of_file -> error Unterminated_lexeme
;;

let add_final_newline mode ob c =
  match mode with
  | Verbatim_multi_line Input_line_mode
  | Escaped_multi_line Input_line_mode -> ()
  | Verbatim_multi_line Get_line_mode
  | Escaped_multi_line Get_line_mode ->
    Buffer.add_char ob c
;;

let add_escaped_newline mode ob s =
  match mode with
  | Escaped_multi_line Get_line_mode
  | Escaped_multi_line Input_line_mode -> ()
  | Verbatim_multi_line Get_line_mode
  | Verbatim_multi_line Input_line_mode ->
    Buffer.add_string ob s
;;

let rec scan_multi_line mode ob sb =

  let rec got_string sb =
    (* If end of input is found, we got the line. *)
    if not (Scanf.Scanning.end_of_input sb) then
    (* Otherwise look at the next char. *)
    Scanf.bscanf sb "%c" (function
    (* Next character is a '\\' char. *)
    | '\\' -> got_back_slash sb
    (* Next character is not a '\\' char: it must be '\n'.
       Return since we got the line. *)
    | c -> add_final_newline mode ob c)

  and got_back_slash sb =
    (* If end of input is found, we got the line. *)
    if Scanf.Scanning.end_of_input sb then Buffer.add_char ob '\\' else
      (* Otherwise look at the next char. *)
      Scanf.bscanf sb "%c" (function
      (* We found two '\\' in a row. *)
      | '\\' -> got_back_slash_back_slash sb
      (* We found a line continuation, i.e.
         a '\\' char followed by an end of line char;
         we handle (may be skip) the '\\' and the '\n' and continue
         scanning the line. *)
      | '\n' ->
        add_escaped_newline mode ob "\\\n";
        scan_multi_line mode ob sb
      (* We found a '\\' char which does not start a line continuation.
         We store the '\\' and the char and go on scanning the line. *)
      | c ->
        Buffer.add_char ob '\\';
        Buffer.add_char ob c;
        scan_multi_line mode ob sb)

  and got_back_slash_back_slash sb =
    (* If end of input is found, we got the line. *)
    if Scanf.Scanning.end_of_input sb then Buffer.add_string ob "\\\\" else
    (* Otherwise look at the next char. *)
    Scanf.bscanf sb "%c" (function
    (* We found three '\\' in a row:
       we output one and go on to the next char after 2 '\\', in order to
       properly escape the next '\\' or '\n' or copy all those '\\' if
       nothing special occurs afterwards. *)
    | '\\' ->
      Buffer.add_char ob '\\';
      got_back_slash_back_slash sb
    (* We found two back_slash chars before an end of line:
       this is precisely the mark of a single escaped '\\' char at the end
       of a line.
       Output the corresponding material and return since we got the line
       we were looking at. *)
    | '\n' as c ->
      Buffer.add_char ob '\\';
      add_final_newline mode ob c
    (* Two '\\' in a row before a char with no special meaning: nothing
       special to do; store chars read and go on scanning the line. *)
    | c ->
      Buffer.add_string ob "\\\\";
      Buffer.add_char ob c;
      scan_multi_line mode ob sb) in

  Scanf.bscanf sb "%[^\n\\]" (fun s ->
    Buffer.add_string ob s;
    got_string sb)
;;

let read_multi_line mode sb =
  let ob = Buffer.create Configuration.line_buffer_length in
  scan_multi_line mode ob sb;
  let res = Buffer.contents ob in
  Debug.printf "read_multi_line: (%s)\n" res;
  res
;;

let input_multi_line sb =
  read_multi_line (Verbatim_multi_line Input_line_mode) sb
(** [input_line] that supports to read a line as a sequence of lines that all
  end by escaped new lines.
  The multi-lines of the line are kept verbatim in the result.
  The resulting string has no final new line character. *)
;;
let get_multi_line sb =
  read_multi_line (Escaped_multi_line Get_line_mode) sb
(** [input_line] that supports to read a line as a sequence of lines that all
  end by escaped new lines.
  Multi-lines are treated as Caml regular multi-line string (the characters
  '\\' and '\n' at the end of a multi-line are omitted from the result,
  as well as all leading blanks of the next multi-line).
  The resulting string has a final new line character if the multi-line read
  ends by one. *)
;;

let get_multi_line_keep sb =
  read_multi_line (Verbatim_multi_line Get_line_mode) sb
(** [input_line] that supports to read a line as a sequence of lines that all
  end by escaped new lines.
  Multi-lines of the line are kept verbatim in the result.
  The resulting string has a final new line character if the multi-line read
  ends by one. *)
;;

let get_line sb =
  if Setting.default.Setting.expand_mode
  then get_multi_line_keep sb
  else get_multi_line sb
;;

(** Parsing an implicitely quoted identifier binding: <ident> = .*'\n',
   for instance, the line
   PROJECT_ROOT_DIR = .
   binds ident PROJECT_ROOT_DIR to the string ".".
   We use [input_multi_line] since:
   - we must not keep the final newline character,
   - we must properly keep all escaped newlines.
 *)
let parse_expression_to_end_of_line sb = String (input_multi_line sb);;

let rec parse_ident_list sb =
  Scanf.bscanf sb "%[^ =\"\t\n\r] " (fun bound_name ->
  if Scanf.Scanning.end_of_input sb then bound_name, [] else
  Scanf.bscanf sb "%0c" (function
  | 'a' .. 'z' | 'A' .. 'Z' | '_' ->
    let name, lname = parse_ident_list sb in
    bound_name, name :: lname
  | _ -> bound_name, []))
;;

(** To be enhanced (may be by using a ``true'', ``real'', ``effective'', and
   so on parsing machinery. *)

let rec parse_expression_list marker sb =
  let lxm = parse_expression marker sb in
  if lxm = Eof then Eof, [] else
  let lexeme, lexemes = parse_expression_list marker sb in
  lxm, if lexeme = Eof then lexemes else lexeme :: lexemes
;;

let unquote_string s =
  let len = String.length s in
  let rec loop in_slash i =
    if i < len then
    begin match s.[i] with
    | '\\' -> loop true (i + 1)
    | '\'' when in_slash ->
      Buffer.add_char quote_buff '\'';
      loop false (i + 1)
    | c when in_slash ->
      Buffer.add_char quote_buff '\\';
      Buffer.add_char quote_buff c;
      loop false (i + 1)
    | c ->
      Buffer.add_char quote_buff c;
      loop false (i + 1)
    end else
    if in_slash then Buffer.add_char quote_buff '\\' in

  loop false 0;
  return_quote ()
;;

let unforce_string s =
  let len = String.length s in
  if Substitute.is_forced_string s
  && len > 3 && s.[1] = '\'' && s.[len - 2] = '\''
  then String.sub s 2 (len - 4) else
  failwith (Printf.sprintf "unforce_string: cannot unforce string %S." s)
;;

let string_of_lexeme marker = function
  | Number float -> Printf.sprintf " %f" float
  | String string -> Printf.sprintf " %S" string
  | Char char -> Printf.sprintf " '%s'" char
  | Bool bool -> Printf.sprintf " %s" bool
  | Ident string -> Printf.sprintf " %c(%s)" marker string
  | Symbol string -> Printf.sprintf " %s " string
  | Quoted string -> unquote_string string
  | Quote -> Printf.sprintf "quote "
  | Forced string -> Printf.sprintf "%c(`%s`)" marker string
  | Let -> Printf.sprintf "let "
  | Rec -> Printf.sprintf " rec "
  | Eq -> Printf.sprintf " = "
  | SemiSemi -> Printf.sprintf "\n;;\n"
 (*   | If | Then | Else*)
  | Eof -> Printf.sprintf ""
;;

let unquote s =
  let sb = Scanf.Scanning.from_string s in
  let marker = Htmlc_options.get_marker_char () in
  let lexeme, lexemes = parse_expression_list marker sb in
  Buffer.clear quote_buff;
  List.iter
    (function lxm ->
     let s = string_of_lexeme marker lxm in
     Buffer.add_string quote_buff s)
    (lexeme :: lexemes);
  return_quote ()
;;
