(***********************************************************************)
(*                                                                     *)
(*                           CIME Caml                                 *)
(*                                                                     *)
(*            Pierre Weis, projet Cristal, INRIA Rocquencourt          *)
(*                                                                     *)
(*  Copyright 1997 Institut National de Recherche en Informatique et   *)
(*  en Automatique.  Distributed only by permission.                   *)
(*                                                                     *)
(***********************************************************************)

(* $Id: htmlc.ml,v 1.34 2004/09/01 08:04:58 weis Exp $ *)

open Arg;;
open Lib_strings;;
open Lang;;
open Date;;

(* "DATE_LOCAL" "DATE_GMT" "LAST_MODIFIED" "DOCUMENT_URI"
   "DOCUMENT_PATH_INFO" "USER_NAME" (default is "<unknown>")
   "DOCUMENT_NAME" "QUERY_STRING_UNESCAPED"

when echoing vars

encoding = none ou encoding = url ou encoding = entity

Formats:
"timefmt" not yet done (need metas in Ocaml...)

"sizefmt": "bytes" "abbrev"
*)

type encoding = | E_none | E_url | E_entity;;
type sizefmt = | S_bytes | S_abbrev;;

(* Given a floating point number, try to figure out if it is indeed an
   integer to return the corresponding string. Otherwise returns the
   (Caml compatible) string representation of the number. *)
let string_of_number f =
 let i = int_of_float f in
 let ifl = float_of_int i in
 if ifl = f then string_of_int i else string_of_float f;;

let warning s = prerr_endline (Printf.sprintf "Warning: %s" s);;

(* Currently supported languages. *)
let get_supported_languages () = [Fr; Uk];;

(* Reporting supported languages. *)
let print_supported_languages () =
 prerr_endline
  (Printf.sprintf
     "Htmlc currently supports the following languages:\n%s.\n\
      To volunteer for a new translation, \
      please contact Pierre.Weis@inria.fr."
     (String.concat ", "
        (List.map string_of_lang (get_supported_languages ()))));;

let compile_to_read_only, set_writable_target =
  let compile_to_read_only = ref true in
  (fun () -> !compile_to_read_only),
  (fun () -> compile_to_read_only := false);;

let make_writable_file fname = Unix.chmod fname 0o0666;;

(* stdout and stdin are (conventionally) associated to the filename
   "-". We redefine open, close, and remove to implement this convention. *)
let my_remove fname =
 if fname <> "-" then begin
   make_writable_file fname;
   Sys.remove fname
 end;;

let my_open_out fname =
  if fname = "-" then stdout else begin
    try
      make_writable_file fname;
      open_out fname
    with _ -> open_out fname
  end;;

let my_close_out fname oc =
  if fname <> "-" then begin
    close_out oc;
    if compile_to_read_only () then Unix.chmod fname 0o0444
  end;;

let my_open_in fname =
  if fname = "-" then stdin else open_in fname;;

let my_close_in fname ic =
  if fname <> "-" then close_in ic;;

(* Variables environment to expand into files. *)
let htmlc_env = Hashtbl.create 17;;

let htmlc_get as get_env = Hashtbl.find htmlc_env;;

let htmlc_bind name value = Hashtbl.add htmlc_env name value;;

(* Predefined bindings: the Htmlc's version. *)
htmlc_bind "htmlc_version" Version.htmlc_version;
htmlc_bind "htmlc_full_version" Version.htmlc_full_version;;

let get_env_files, add_e_filename =
  let env_files = ref [] in
  (fun () -> !env_files),
  (fun s ->
     env_files := s :: !env_files);;

let add_e_name, add_e_value, bind_names_to_values =
  let names = ref [] in
  let values = ref [] in
  (fun s -> names := s :: !names),
  (fun s -> values := s :: !values),
  (fun () ->
    List.iter2 htmlc_bind !names !values;
    names := [];
    values := []);;

(* Find a dollar ident into a name found in the input. *)
let find_dollar_ident =
 let htmlc_var = Buffer.create 100 in
 function name ->
   Substitute.add_string (fun s -> s) htmlc_var name;
   let s = Buffer.contents htmlc_var in
   Buffer.clear htmlc_var;
   s;;

(* Specialized buffer handling functions. *)
let htmlc_add_string b s = Substitute.add_string_substitute htmlc_env b s;;

let htmlc_output_line b line =
  htmlc_add_string b line; Buffer.add_char b '\n';;

let htmlc_substitute_string =
 let htmlc_line = Buffer.create 100 in
 function s ->
  htmlc_add_string htmlc_line s;
  let s = Buffer.contents htmlc_line in
  Buffer.clear htmlc_line;
  s;;

(* File inclusion functions. *)
let cannot_include_file s x =
  failwith
    (Printf.sprintf
       "htmlc: aborting while loading file %s: %s" s (Printexc.to_string x));;

let load_included_file include_ic b name =
  try
   let ic = Path.open_in name in
   include_ic ic b
  with
  | x -> cannot_include_file name x;;

let html_encode_escape b s =
 let l = String.length s in
 for i = 0 to l - 1 do
  match s.[i] with
  | '<' -> Buffer.add_string b "&lt;"
  | '>' -> Buffer.add_string b "&gt;"
  | '&' -> Buffer.add_string b "&amp;"
  (* Should be somewhat more complex and use stuff from dehtml ... *)
  | c -> Buffer.add_char b c
 done;;

let decimal_to_hexa i =
  if i < 10 then char_of_int (i + int_of_char '0')
  (* 55 = int_of_char 'A' - 10 *)
  else char_of_int (i + 55);;

let hexa_char c =
  let s = String.make 3 '%'
  and i = int_of_char c in
  s.[1] <- decimal_to_hexa (i / 16);
  s.[2] <- decimal_to_hexa (i mod 16);
  s;;

let url_encode b s =
  let l = String.length s in
  for i = 0 to l - 1 do
    match s.[i] with
    | ' ' -> Buffer.add_char b '+'
    | '0'..'9' | 'a'..'z' | 'A'..'Z' as c -> Buffer.add_char b c
    | '\n' -> Buffer.add_string b "%0D%0A"
    | c -> Buffer.add_string b (hexa_char c)
    done;;

let add_verbatim_ic ic b =
 try
  while true do
   let line = input_line ic in
   html_encode_escape b line;
   Buffer.add_char b '\n'
  done
 with End_of_file -> close_in ic;;

let add_verbatim_file b name =
 load_included_file add_verbatim_ic b name;;

(* Given a scanning buffer sb, an ident [name], and a(n unread) character,
   find the specified value for the name, then add the corresponding
   binding to the environment.
   We parse ident *[= *]value[;;]\n
   and value is either a string, a char, a boolean, or a floating point
   number. *)
let rec bind_name sb name = function
  | '=' ->
     Scanf.bscanf sb "= %0c" (bind_name sb name)
  | '"' -> (* Binding to a string *)
     Scanf.bscanf sb "%S %_[;\n]"
       (fun v -> htmlc_bind name v)
  | '0' .. '9' | '-' | '+' -> (* Binding to a number *)
     Scanf.bscanf sb "%f %_[;\n]"
       (fun v -> htmlc_bind name (string_of_number v))
  | '\'' -> (* Binding to a char *)
     Scanf.bscanf sb "%C %_[;\n]"
       (fun v -> htmlc_bind name (Char.escaped v))
  | 'a' .. 'z' | 'A' .. 'Z' -> (* Binding to another identifier *)
     Scanf.bscanf sb "%[^; \n] %_[;\n]"
       (fun ident ->
          match ident with
          | "true" | "false" -> htmlc_bind name ident
          | _ -> htmlc_bind name (Printf.sprintf "$(%s)" ident))
  | _ ->
     (* Binding to something else, for instance another $ identifier *)
     Scanf.bscanf sb "%[^; \n] %_[;\n]"
       (fun v -> htmlc_bind name v);;

(* Simplified version of a Caml comment lexer:
   we parse ".*\*)", i.e. until encountering a closing comment marker.
   We also skip strings in comments (although this could be overkill). *)
let rec skip_caml_comments sb =
  Scanf.bscanf sb "%_[^\"*(]%0c" (function
   | '"' -> Scanf.bscanf sb "%_S" (); skip_caml_comments sb
   | '*' ->
       Scanf.bscanf sb "%_c%0c" (function
       | ')' -> Scanf.bscanf sb "%_c" ()
       | _ -> skip_caml_comments sb)
   | '(' ->
       Scanf.bscanf sb "%_c%0c" (function
       | '*' ->
           Scanf.bscanf sb "%_c" ();
           skip_caml_comments sb;
           skip_caml_comments sb
       | _ -> skip_caml_comments sb)
   | _ -> skip_caml_comments sb);;

let load_env_sb ic sb =
  try
   while true do
     Scanf.bscanf sb " %0c" (function
     (* Skip our own comments *)
     | '#' -> Scanf.bscanf sb "%_s@\n " ()
     (* Skip also a simplified version of Caml comments:
        "(\*.*\n". *)
     | '(' (* ')' helping emacs *) ->
        Scanf.bscanf sb "%2s" (fun delim ->
         match delim with
         (* This is a Caml comment: skip it. *)
         | "(*" -> skip_caml_comments sb
         (* This is a syntax error: report it. *)
         | delim ->
            Scanf.bscanf sb "%s@\n" (fun line ->
              raise (Scanf.Scan_failure (Printf.sprintf "%s%s" delim line))))
     (* Otherwise find an ident, and call [bind_name] to bind it. *)
     | _ ->
        Scanf.bscanf sb " %[^ =\"\t\n\r] " (fun ident ->
          match ident with
          | "let" -> Scanf.bscanf sb " %[^ =\"\t\n\r] %0c" (bind_name sb)
          | ident -> Scanf.bscanf sb "%0c" (bind_name sb ident)))
   done
  with
  | End_of_file -> close_in ic
  | Scanf.Scan_failure s ->
      close_in ic;
      prerr_endline (Printf.sprintf "htmlc: wrong environment entry, %s" s);
      failwith "load environment";;

let load_env_file_in_line b name =
 let load_env_ic ic b = load_env_sb ic (b ic) in
 load_included_file load_env_ic (Scanf.Scanning.from_channel) name;;

let load_env_file fname =
 let load_env_ic ic b = load_env_sb ic (b ic) in
 load_included_file load_env_ic (Scanf.Scanning.from_channel) fname;;

let load_env_files () =
 try List.iter load_env_file (get_env_files ()) with
 | Failure s -> prerr_endline s; exit 4;;

let launch_cgi b name = failwith "Cgi not yet implemented";;

let launch_command b name =
 let ic = Unix.open_process_in name in
 let ib = String.create 1024 in
 let status =
   let rec loop () =
     let n = input ic ib 0 1024 in
     if n > 0 then (htmlc_add_string b (String.sub ib 0 n); loop ())
     else raise End_of_file in
   try loop (); 2 with
   | End_of_file ->
      (match Unix.close_process_in ic with
       | Unix.WEXITED n -> n
       | _ -> 2) in
 if status <> 0 then
   warning
    (Printf.sprintf "htmlc: command %s failed to execute properly." name);;

let size_of_file_in_byte name = (Unix.stat name).Unix.st_size;;

let size_of_file encoding name =
  let sz = size_of_file_in_byte name in
  match encoding with
  | S_bytes -> string_of_int sz
  | S_abbrev ->
     if sz < 1000000
     then Printf.sprintf "%dKB" ((sz + 500) / 1000) else
     if sz < 1000000000
     then Printf.sprintf "%dMB" ((sz + 500000) / 1000000) else
     Printf.sprintf "%d.%dGB"
       (sz / 1000000000)
       ((sz mod 1000000000) / 100000000);;

let htmlc_unknown s1 s2 =
  failwith (Printf.sprintf "htmlc: unknown %s %s" s1 s2);;

let unknown_qualifier m q =
  htmlc_unknown
   (Printf.sprintf "qualifier %s" q)
   (Printf.sprintf "for %s" m);;

let unknown_command c = htmlc_unknown "commande" c;;

let unknown_modifier m = htmlc_unknown "modifier" m;;

let echo_var b name modifiers =
  let encoding =
    let is = Scanf.Scanning.from_string modifiers in
    Scanf.kscanf is
     (fun ib exc -> E_none)
     " %[^= ] = %S"
     (fun mode encoding ->
       match String.lowercase mode with
       | "encoding" ->
         (match String.lowercase encoding with
          | "none" -> E_none
          | "url" -> E_url
          | "entity" -> E_entity
          | _ -> unknown_qualifier mode encoding)
       | _ -> unknown_modifier mode) in
  match encoding with
  | E_none -> htmlc_add_string b name
  | E_url -> url_encode b (htmlc_substitute_string name)
  | E_entity -> html_encode_escape b (htmlc_substitute_string name);;

let treat_fsize b mode name modifiers =
  match mode with
  | "file" as m ->
    let sizefmt =
      let is = Scanf.Scanning.from_string modifiers in
      Scanf.kscanf is
        (fun ib exc -> S_bytes)
         " %[^= ] = %S"
        (fun mode encoding ->
           match String.lowercase mode with
           | "sizefmt" ->
             (match String.lowercase encoding with
              | "bytes" -> S_bytes
              | "abbrev" -> S_abbrev
              | _ -> unknown_qualifier mode encoding)
           | _ -> unknown_qualifier m mode) in
    htmlc_add_string b (size_of_file sizefmt name)
  | _ -> unknown_modifier mode;;

(* The language may be set explicitly through:

   (0) setting the $lang variable into the source file,
   (1) setting the -lang option of htmlc,
   (2) setting the $LANG system variable,

  or

   (3) implicitely via the file name extension.

  The function [htmlc_get_lang] implements this logic (omitting case
  (3)), and sets the respective priorities of those different
  methods. *)

let htmlc_get_lang src =
  try Lang.lang_of_string (htmlc_get "lang") with
  | Not_found -> Lang.get_with_file_name src;;

(* The main routine to expand files. *)
let rec treat_channel src ic b =
 let ib = Scanf.Scanning.from_channel ic in
 let within_if, incr_within_if, decr_within_if =
   let within_if = ref 0 in
   (fun () -> !within_if > 0),
   (fun () -> incr within_if),
   (fun () -> decr within_if) in
 let rec scan_line b line =
   let is = Scanf.Scanning.from_string line in
   Scanf.kscanf is
     (fun ib exc -> htmlc_output_line b line)
     " <!-- # %s %[^= ] = %S%s@-->%[^\n]"
     (fun command mode name modifiers rest_of_line ->
        let treat_command = function
          | "include" ->
              let include_file name =
                load_included_file (treat_channel src) b name in
              let treat_include mode name =
                match mode with
                | "verbatim" -> add_verbatim_file b name
                | "environment" -> load_env_file_in_line b name
                | "virtual" -> include_file name
                | _ -> include_file name in
              treat_include mode name
          | "echo" as c ->
              let treat_echo mode name =
                match mode with
                | "var" ->
                    begin match String.lowercase name with
                    | "last_modified" ->
                         let lang = htmlc_get_lang src in
                         htmlc_add_string b
                          (last_modification_date_of_file lang src)
                    | "date_local" ->
                         let lang = htmlc_get_lang src in
                         htmlc_add_string b
                          (Date.local_date_of_day lang)
                    | "date_gmt" ->
                         let lang = htmlc_get_lang src in
                         htmlc_add_string b
                          (Date.date_of_day lang)
                    | "user_name" ->
                         let user =
                           try Sys.getenv "USER" with
                           | _ -> "<unknown>" in
                         htmlc_add_string b user
                    | "document_path_info" ->
                         htmlc_add_string b (Filename.dirname src)
                    | "document_name" ->
                         htmlc_add_string b (Filename.basename src)
                    | "query_string_unescaped"
                    | "document_uri"
                    | _ -> echo_var b name modifiers end
                | q -> unknown_qualifier c q in
              treat_echo mode name
          | "set" | "define" as c ->
              let value =
                if c = "define" then name else scan_line_to_string name in
              let name = find_dollar_ident mode in
              htmlc_bind name value
          | "exec" as c ->
              let treat_exec mode name =
                match mode with
                (* Should set the Unix environment of commands and CGIs before
                   launching them:
                   - for commands DOCUMENT_NAME, DOCUMENT_URI,
                     DATE_LOCAL, DATE_GMT, LAST_MODIFIED should be available.
                   - for CGI, in addition to those vriables we should
                     have the usual CGI's envrionment. *)
                | "cmd" -> launch_command b name
                | "cgi" -> launch_cgi b name
                | q -> unknown_qualifier c q in
              treat_exec mode name
          | "fsize" -> treat_fsize b mode name modifiers
          | "if" | "elif" | "else" as c ->
              let treat_if mode name =
                match mode with
                | "expr" ->
                    incr_within_if ();
                    failwith "Conditional not yet implemented"
                | q -> unknown_qualifier c q in
              treat_if mode name
          | "endif" ->
              if within_if () then decr_within_if ()
              else failwith "htmlc: endif not inside a conditional"
          | "config" as c ->
             (* errmsg or timefmt or sizefmt (bytes or abbrev) *)
             (match mode with
             | "timefmt" -> warning "timefmt not supported"
             | "sizefmt" -> raise Exit (* FIXME *)
             | q -> unknown_qualifier c q)
          | "flastmod" as c ->
             (* File last modification time *)
             let src =
               match mode with
               | "file" -> htmlc_substitute_string name
               | q -> unknown_qualifier c q in
             let lang = htmlc_get_lang src in
             htmlc_add_string b (last_modification_date_of_file lang src)      
          | "printenv" (* Prints the entire environment
                          (of the HTTP server process ?) *)
            -> raise Exit (* FIXME *)
          | c -> unknown_command c in
        try
          (* If we know the command, we treat it
             and output the rest of the line. *)
          treat_command command;
          if rest_of_line <> "" then htmlc_output_line b rest_of_line
        with
          (* If we don't know the command, we output the entire line. *)
        | Exit ->
            htmlc_output_line b rest_of_line)

 and scan_line_to_string name =
   let b = Buffer.create 100 in
   scan_line b name;
   Buffer.contents b in

 try
  while true do
   let line = Scanf.bscanf ib "%[^\n]\n" (function s -> s) in 
   scan_line b line
  done
 with End_of_file -> my_close_in src ic;;

let treat_file =
 let b = Buffer.create (20 * 1024) in
 fun src tgt ->
 (* printf "Source %s, Target %s\n" src
     (if tgt = "-" then "stdout" else tgt); *)
   Buffer.reset b;
   try
    let ic = my_open_in src in
    try
     let oc = my_open_out tgt in
     try
       treat_channel src ic b;
       my_close_in src ic;
       output_string oc (Buffer.contents b);
       my_close_out tgt oc
     with x -> my_remove tgt; raise x
    with
    | x -> my_close_in src ic; raise x
   with
   | Sys_error s | Failure s -> prerr_endline s; exit 2;;

let get_usage () =
  "Usage: htmlc <options> <files>\n       \
          htmlc -s <source.shtml> (output is <source.html>) or\n       \
          htmlc -c <source.html> (output is source.htm) or\n       \
          htmlc -tpl <source> (output is source.html) or\n       \
          htmlc <source> (output is stdout) or\n       \
          htmlc <source> <destination> or\n       \
          htmlc [-i|-f|-from] <source> | - [-o|-t|-to] <destination> | -\n\
  ";;

let usage () = prerr_endline (get_usage ()); exit 1;;

let input_output_names_collision so =
  prerr_endline
   (Printf.sprintf
      "Output file %s collides with input file: \
       htmlc would overwrite input file."
      so);
  exit 3;;

let get_filenames, set_filenames, add_o_filename, add_i_filename =
  let args = ref []
  and source = ref "-"
  and target = ref "-" in
  (fun () -> (* get_filenames *)
     (* Only one argument is provided: print the result on std_out *)
     if !args = [] then
       if !source = "-" then
         if !Arg.current = 1 then usage () else ()
       else args := [!source, !target];
     List.rev !args), (* get_filenames *)
  (fun s -> (* set_filenames *)
    if !source = "-" then source := s else target := s;
    if !target <> "-" then begin
      args := (!source, !target) :: !args;
      source := "-"; target := "-"
    end),
  (fun so -> (* add_o_filename *)
     match !args with
     | [] -> args := ("-", so) :: !args
     | (si, "-") :: rest ->
          if so <> si then args := (si, so) :: rest else
          input_output_names_collision so
     | (_, _) :: rest -> args := ("-", so) :: !args),
  (fun si -> (* add_i_filename *)
     match !args with
     | [] -> args := (si, "-") :: !args
     | ("-", so) :: rest ->
          if so <> si then args := (si, so) :: rest else
          input_output_names_collision so
     | (_, _) :: rest -> args := (si, "-") :: !args);;

let my_chop_extension fname =
  try String.sub fname 0 (String.rindex fname '.')
  with Not_found -> fname;;

(* [add_file_input_output suffix fname.xxx] adds
   - [fname.xxx] as input
   - [fname.suffix] as output
   [fname] is not even supposed to have an extension,
   in this case [fname] is added as input
   and [fname.suffix] as output. *)
let add_file_input_output suffix fname =
  add_i_filename fname;
  let fname = my_chop_extension fname in
  add_o_filename (fname ^ suffix);;

(* Given a file name [fname] of the form [name.xxx]
   [add_html_file_input_output fname]
      adds [fname.xxx] as input file and [fname.htm] as output file
   [add_shtml_file_input_output fname]
      adds [fname.xxx] as input file and [fname.html] as output file. *)
let add_html_file_input_output, add_shtml_file_input_output =
  (add_file_input_output ".htm"),
  (add_file_input_output ".html");;

(* [add_filename_input_output add_file suffix fname]
   given [fname] with suffix [suffix] as [name.suffix]
   adds relevant input and output files for [htmlc]
   according to function [add_files]. *)
let add_filename_input_output add_files suffix fname =
 if Filename.check_suffix fname suffix then add_files fname else
  begin
    prerr_endline
      (Printf.sprintf "File name %s has not the required %s suffix."
         fname suffix);
    exit 3
  end;;

let add_html_filename =
  add_filename_input_output add_html_file_input_output ".html";;
let add_shtml_filename =
  add_filename_input_output add_shtml_file_input_output ".shtml";;

let add_filename = add_file_input_output ".html";;

let set_lang s = Lang.set (Lang.lang_of_string s);;

let main () =
 parse [
  ("-I", String Path.push,
   "<dirname>: add directory <dirname> in front of the list\n        \
    of directories searched for include files.");
  ("-w", Unit set_writable_target,
   ": expanded file should be writable (default is read only).");
  ("-env", String add_e_filename,
   "<filename>: \
    read <filename> to define variable bindings for substitution.");
  ("-D", Tuple [String add_e_name; String add_e_value],
   "<ident> <value>: \
    bind <ident> to <value> for substitution.");
  ("-lang", String Lang.set_from_string,
   "<language>: language is set to <language>\n        \
    (no more guess from file-name extension).");
  ("-default-lang", String Lang.set_default_from_string,
   "<language>: default language is set to <language>.");
  ("--languages", Unit print_supported_languages,
   ": list of htmlc supported languages.");

  ("-s", String add_filename,
   "<filename.shtml>: input file is set to <filename.shtml>,\n        \
    output file is set to <filename.html>.");
  ("-c", String add_html_filename,
   "<filename.html>: input file is set to <filename.html>,\n        \
    output file is set to <filename.htm>.");
  ("-tpl", String add_filename,
   "<filename>: input file is set to <filename>,\n        \
    output file is set to <file.html>, where <file>\n        \
    designates <filename> without its extension (if any).");
  ("-o", String add_o_filename,
   "<filename>: output file is set to <filename>.");
  ("-t", String add_o_filename,
   "<filename>: output file is set to <filename>.");
  ("-to", String add_o_filename,
   "<filename>: output file is set to <filename>.");
  ("-i", String add_i_filename,
   "<filename>: input file is set to <filename>.");
  ("-f", String add_i_filename,
   "<filename>: input file is set to <filename>.");
  ("-from", String add_i_filename,
   "<filename>: input file is set to <filename>.");
  ("-v", Unit Version.print_version,
   ": print the htmlc version number.");
  ("--version", Unit Version.print_full_version,
   ": print the htmlc long version number.");
  ]
  set_filenames
  (get_usage ());
 (* Setting environment to expand variables. *)
 load_env_files ();
 bind_names_to_values ();
 List.iter (function (src, tgt) -> treat_file src tgt) (get_filenames ());
 exit 0;;

Printexc.print main ();;
