#open "camlwin";;


type DebugType = HTML_BREAK
               | SEARCH
               | DECOMPOSE
               | NONE
;;

let debug = NONE;; 


let rec debug_loop List =
  match List with
    [] -> print_newline ()
  | x::y -> print_string (x ^ "\n");
            debug_loop y
;;



(***************************************************************************)
(*                                                                         *)
(*      conversion of extended chars                                       *)
(*                                                                         *)
(***************************************************************************)
type HTML_cvt_type == (string * char);;

let HTML_cvt =
[ ("&lt",`<`);
  ("&gt",`>`);
  ("&amp",`&`);
  ("&quot",`"`);
  ("&AElig",`A`);
  ("&Aacute",`A`);
  ("&Agrave",`A`);
  ("&Aring",`A`);
  ("&Atilde",`A`);
  ("&Auml",`A`);
  ("&Ccedil",`C`);
  ("",` `)
];;

let HTML_extchar Str =
  let rec search_loop List =
    match List with
      [] -> ` `
    | x::y -> if fst x = Str 
              then (snd x)
              else search_loop y
  in
    search_loop HTML_cvt
;;


(***************************************************************************)
(*                                                                         *)
(*      to_upper : string -> string                                        *)
(*                                                                         *)
(***************************************************************************)
let int_of_A = int_of_char `A`;;
let int_of_a = int_of_char `a`;;

let char_to_upper a =
  match a with
   `a`..`z` -> char_of_int ( int_of_A + (int_of_char a) - int_of_a)
  | _      -> a
;;

let to_upper Str =
  let len = string_length Str in
    let Mstr = make_string len ` ` in
      for i = 0 to (len - 1) do
        set_nth_char Mstr i (char_to_upper (nth_char Str i))
      done;
  Mstr
;;





(***************************************************************************)
(*                                                                         *)
(*      search_ended_tag : string -> string list -> (string * string list) *)
(*                                                                         *)
(***************************************************************************)
let search_ended_tag Str List =
  if debug=SEARCH
  then print_string ("search_ended_tag: " ^ Str ^ "\n"); 
  
  let STR = to_upper Str in
    let rec search_str_loop SearchStr List Result Cpt =
      match List with
        []   -> (Result, List)
      | x::y -> 
          let X = to_upper x in
            if X = STR
             then 
             (
               if debug=SEARCH
               then 
               (
                 print_string ("search_ended_tag: found=" ^ STR ^ ", Cpt=");
                 print_int (Cpt+1);
                 print_newline ()
               );
               search_str_loop SearchStr y (Result ^ x) (Cpt+1)
             )
             else if X = SearchStr
                  then 
                  (
                    if Cpt=0
                    then 
                    (
                      if debug=SEARCH
                      then 
                      (
                          print_string ("search_ended_tag: return=" ^ Result);
                          print_newline ()
                      );
                      (Result, y) 
                    )
                    else 
                    (
                      if debug=SEARCH
                      then 
                      (
                          print_string ("search_ended_tag: found=" ^ 
                                        SearchStr ^ ", Cpt=");
                          print_int (Cpt-1);
                          print_newline ()
                      );
                      search_str_loop SearchStr y (Result ^ x) (Cpt-1)
                    )
                  )
                  else search_str_loop SearchStr y (Result ^ x) Cpt
    in  
      if nth_char Str 0 = `<`
      then
      (
       let SearchStr = to_upper ("</" ^ 
                                 (sub_string Str 1 ((string_length Str) - 1)))
       in
         search_str_loop SearchStr List "" 0
      )
      else (Str, List)
;;




(***************************************************************************)
(*                                                                         *)
(*  search_not_ended_tag : string list -> (string * string list)           *)
(*                                                                         *)
(***************************************************************************)
let search_not_ended_tag List =
  if debug=SEARCH
  then print_string ("search_not_ended_tag\n"); 
  
  let rec search_str_loop List Result =
    match List with
      []   ->           
          if debug=SEARCH
          then print_string ("search_not_ended_tag: return=" ^ Result ^ "\n");
          (Result, List)
    | x::y -> let X = to_upper x in
        if X = "<LI>" or X = "<DT>" or x = "<DD>"
        then 
        (
          if debug=SEARCH
          then print_string ("search_not_ended_tag: return=" ^ Result ^ "\n");
          (Result, List)
        )
        else if string_length X > 2
             then
             (
               if nth_char X 0 = `<` & not nth_char X 1 = `/`
               then 
               (
                 let res = search_ended_tag x y in
                   search_str_loop (snd res) (Result ^ x ^ (fst res) ^
                                    "</" ^ (sub_string x 1 
                                                       ((string_length x)-1)))
               )
               else search_str_loop y (Result ^ x)
            )
            else search_str_loop y (Result ^ x)
  in  
   search_str_loop List ""
;;







(***************************************************************************)
(*                                                                         *)
(*      html_break_str : string -> string list                             *)
(*                                                                         *)
(***************************************************************************)
let html_break_str  String =
  let len = string_length String in
  let rec break_loop pos pos0 Out =
    if pos = len
    then 
    (
      let str = sub_string String pos0 (pos - pos0) in
        if pos = pos0 
        then Out
        else Out @ [str ]
    )
    else
    (
      let a = nth_char String pos in
        match a with 
          `<`  -> if pos = pos0
                  then break_loop (pos+1) pos0 Out
                  else 
                    let str = sub_string String pos0 (pos - pos0) in
                      break_loop (pos+1) pos (Out @ [str]) 
        | `>`  -> let str = sub_string String pos0 (pos - pos0) in
                    break_loop (pos+1) (pos+1) (Out @ [(str ^ ">")]) 
        | _    -> break_loop (pos+1) pos0 Out
    )
  in
    break_loop 0 0 []
;;







(***************************************************************************)
(*                                                                         *)
(*      html_break_stream : in_stream -> string list                       *)
(*                                                                         *)
(***************************************************************************)
let add_char Str x =
  let str  = make_string 1 ` ` in
    set_nth_char str 0 x;
  Str ^ str
;;

let add_char_ext Str x =
  let a = if x=`\n` or x=`\t` then ` ` else x
  and len=string_length Str in
    if len < 1
    then add_char Str a
    else if a = ` ` & nth_char Str (len - 1) = ` `
         then Str
         else add_char Str a
;;



type html_break_state = Html_Cmd
                      | Html_String
                      | Html_Preformated
;;

let html_break_stream Stream =
  let rec html_break Out Str State =
    try
    (
      let a = input_char Stream in
        match a with 
          `<`  -> if Str=""
                  then html_break Out "<" (if State=Html_Preformated
                                           then Html_Preformated
                                           else Html_Cmd)
                  else html_break (Out @ [Str]) "<" (if State=Html_Preformated
                                                     then Html_Preformated
                                                     else Html_Cmd)
        | `>`  -> html_break (Out @ [(Str ^ ">")]) 
                             "" 
                             (if to_upper Str="<PRE" 
                              then Html_Preformated 
                              else if State=Html_Preformated & 
                                      not ((to_upper Str)="</PRE")
                                   then Html_Preformated
                                   else Html_String) 
        | _    -> html_break Out 
                             (if State=Html_Preformated 
                              then add_char Str a 
                              else add_char_ext Str a) 
                             State 
    )
    with End_of_file -> (Out @ [Str])
  in
    html_break [] "" Html_String
;;







(***************************************************************************)
(*                                                                         *)
(*      extract_anchor : string -> AnchorType                              *)
(*                                                                         *)
(***************************************************************************)
let extract_anchor Str =
  if debug=SEARCH
  then
  (
    print_string ("extract_anchor :" ^ Str);
    print_newline()
  );
  let len = string_length Str in
    let rec search_begin_type pos =
      match (nth_char Str pos) with 
        ` ` -> search_begin_type (pos+1)
      | _   -> pos

    and search_end_type pos =
      match (nth_char Str pos) with 
        ` ` -> (pos-1)
      | `=` -> (pos-1)
      | _   -> search_end_type (pos+1)

    and extract_name pos0 pos1 first=
      match (first, nth_char Str pos1) with
        (true, `"`)    -> extract_name (pos1+1) (pos1+1) false
      | (true, `>`)    -> (pos0, (pos1-1)) (* no quote surround name *)
      | (true, _)      -> extract_name pos0 (pos1+1) true 
      | (false, `"`)   -> (pos0, (pos1-1))
      | (false, _)     -> extract_name pos0 (pos1+1) false

    and search_end_cmd pos =
      match (nth_char Str pos) with 
        `>` -> (pos+1)
      | _   -> search_end_cmd (pos+1)

    and search_end_anchor pos =
      match to_upper (sub_string Str pos 4) with
        "</A>" -> pos
      | _      -> search_end_anchor (pos+1)

    in
      let b_type = search_begin_type 3 in
        let e_type = search_end_type b_type in
          let name_pos = extract_name e_type e_type true in
            let end_cmd = search_end_cmd (snd name_pos) in
              let aend = search_end_anchor end_cmd in
                let atype = sub_string Str b_type (e_type-b_type+1)
                and name = sub_string Str (fst name_pos) 
                                      ((snd name_pos)-(fst name_pos)+1) 
                and str  = sub_string Str end_cmd (aend-end_cmd) in
                  if debug=SEARCH
                  then
                  (
                    print_string ("extract_anchor type=" ^ atype ^ 
                                  ", name=" ^ name ^ ", label=" ^ str ^ "\n");
                    print_newline ()
                  );
  
    { AnchorType = (match to_upper atype with
                     "HREF" -> HREF
                   | _      -> NAME); 
      AnchorFile = name; 
      AnchorName = str
    }
;;





(***************************************************************************)
(*                                                                         *)
(*      extract_image : string -> ImageType                                *)
(*                                                                         *)
(***************************************************************************)
let extract_image Str =
  if debug=SEARCH
  then
  (
    print_string ("extract_image :" ^ Str);
    print_newline()
  );
(* <IMG SRC = "xxx.gif" ALT = "text rem" ALIGN = TOP | MIDDLE | BOTTOM> *)
  let len = string_length Str 
  and image_name = ref ""
  and alt_text = ref ""
  and align = ref AlignBottom in
    let rec break_str bpos epos state result=
      if epos = len 
      then (result @ [sub_string Str bpos (epos-bpos)])
      else
      (
         match (nth_char Str epos) with 
          ` ` | `=` | `<` | `>` -> 
           if state (* between quotes *)
           then break_str bpos (epos+1) state result
           else if bpos=epos (* serach the beginning of a new word *)
                then break_str (bpos+1) (epos+1) state result
                else break_str (epos+1) (epos+1) state
                               (result @ [sub_string Str bpos (epos-bpos)])
        | `"` -> 
           if state
           then break_str (epos+1) (epos+1) false
                           (result @ [sub_string Str bpos (epos-bpos)])
           else break_str (epos+1) (epos+1) true result

        | _   -> break_str bpos (epos+1) state result
      )
    and read_info str_list =
    match str_list with
      []   -> ()
    | x::y -> 
         (
           match to_upper x with
             "SRC"   -> image_name := hd y; read_info (tl y)
           | "ALT"   -> alt_text := hd y; read_info (tl y)
           | "ALIGN" -> align := ( match to_upper (hd y) with 
                                    "TOP" -> AlignTop
                                  | "MIDDLE" -> AlignMiddle
                                  | _        -> AlignBottom
                                 );
                        read_info (tl y)
           | _       -> read_info y
         )
    in
      let strings = break_str 0 0 false [] in
        read_info strings;
        if debug=SEARCH
        then
        (
          print_string ("extract_image file_name=" ^ !image_name ^ 
                        ", alt text=" ^ !alt_text ^ 
                        ", align=" ^ (match !align with
                                        AlignTop -> "Top"
                                      | AlignMiddle -> "Middle"
                                      | AlignBottom -> "Bottom"
                                     ) ^ "\n");
          print_newline ()
        );

    { 
     ImageAlign = !align; 
     ImageFile = !image_name; 
     ImageAlt = !alt_text 
    }
;;













(***************************************************************************)
(*                                                                         *)
(*      html_decompose : string list -> HTML_type list                     *)
(*                                                                         *)
(***************************************************************************)
let rec html_decompose List =
  match List with
   [] -> []
  | x::y -> 
       if debug=DECOMPOSE
       then 
       (
         print_string ("html_decompose : fst=" ^ x ^ "\n");
         debug_loop y;
         print_newline ()
       );

      let len = string_length x in
        let rec first_word pos =
          if pos = len 
          then to_upper x
          else if nth_char x pos = ` `
               then to_upper ((sub_string x 0 pos) ^ ">")
               else first_word (pos+1)
          in

      let UString = first_word 0 in
      (
        match UString with
          "<A>"     -> let res = search_ended_tag UString y in
                  let result = extract_anchor ( x ^ (fst res) ^ "</A>") in
                           (ANCHOR result)::(html_decompose (snd res))

        | "<IMG>"   -> let result = extract_image x  in
                           (IMG result)::(html_decompose y)

        | "<H1>"    -> let res = search_ended_tag UString y in
                  let result = html_decompose (html_break_str (fst res)) in
                         (H1 result)::(html_decompose (snd res))

        | "<H2>"    -> let res = search_ended_tag UString y in
                  let result = html_decompose (html_break_str (fst res)) in
                         (H2 result)::(html_decompose (snd res))

        | "<H3>"    -> let res = search_ended_tag UString y in
                  let result = html_decompose (html_break_str (fst res)) in
                         (H3 result)::(html_decompose (snd res))

        | "<H4>"    -> let res = search_ended_tag UString y in
                  let result = html_decompose (html_break_str (fst res)) in
                         (H4 result)::(html_decompose (snd res))

        | "<H5>"    -> let res = search_ended_tag UString y in
                  let result = html_decompose (html_break_str (fst res)) in
                         (H5 result)::(html_decompose (snd res))

        | "<H6>"    -> let res = search_ended_tag UString y in
                  let result = html_decompose (html_break_str (fst res)) in
                         (H6 result)::(html_decompose (snd res))

        | "<UL>"     -> let res = search_ended_tag UString y in
                  let result = html_decompose (html_break_str (fst res)) in
                         (UList result)::(html_decompose (snd res))

        | "<OL>"     -> let res = search_ended_tag UString y in
                  let result = html_decompose (html_break_str (fst res)) in
                         (OList result)::(html_decompose (snd res))

        | "<DL>"     -> let res = search_ended_tag UString y in
                  let result = html_decompose (html_break_str (fst res)) in
                         (DList result)::(html_decompose (snd res))

        | "<DD>"    -> let res = search_not_ended_tag y in
                   let result = html_decompose (html_break_str (fst res))in
                          (DD result)::(html_decompose (snd res))

        | "<DT>"    -> let res = search_not_ended_tag y in
                   let result = html_decompose (html_break_str (fst res))in
                          (DT result)::(html_decompose (snd res))

        | "<LI>"    -> let res = search_not_ended_tag y in
                   let result = html_decompose (html_break_str (fst res))in
                          (LI result)::(html_decompose (snd res))

        | "<PRE>"    -> let res = search_ended_tag UString y in
                   let result = html_decompose (html_break_str (fst res))in
                          (PRE result)::(html_decompose (snd res))

        | "<STRONG>" -> let res = search_ended_tag UString y in
                   let result = html_decompose (html_break_str (fst res))in
                          (STRONG result)::(html_decompose (snd res))

        | "<BLOCKQUOTE>" -> let res = search_ended_tag UString y in
                   let result = html_decompose (html_break_str (fst res))in
                          (BLOCKQUOTE result)::(html_decompose (snd res))

        | "<ADDRESS>" -> let res = search_ended_tag UString y in
                   let result = html_decompose (html_break_str (fst res))in
                          (ADDRESS result)::(html_decompose (snd res))

        | "<DFN>"    -> let res = search_ended_tag UString y in
                   let result = html_decompose (html_break_str (fst res))in
                          (DFN result)::(html_decompose (snd res))

        | "<EM>"     -> let res = search_ended_tag UString y in
                   let result = html_decompose (html_break_str (fst res))in
                          (EM result)::(html_decompose (snd res))

        | "<CITE>"  -> let res = search_ended_tag UString y in
                   let result = html_decompose (html_break_str (fst res))in
                          (CITE result)::(html_decompose (snd res))

        | "<CODE>"  -> let res = search_ended_tag UString y in
                   let result = html_decompose (html_break_str (fst res))in
                          (CODE result)::(html_decompose (snd res))

        | "<KBD>"   -> let res = search_ended_tag UString y in
                   let result = html_decompose (html_break_str (fst res))in
                          (KBD result)::(html_decompose (snd res))

        | "<SAMP>"  -> let res = search_ended_tag UString y in
                   let result = html_decompose (html_break_str (fst res))in
                          (SAMP result)::(html_decompose (snd res))

        | "<VAR>"    -> let res = search_ended_tag UString y in
                   let result = html_decompose (html_break_str (fst res))in
                          (VAR result)::(html_decompose (snd res))

        | "<B>"      -> let res = search_ended_tag UString y in
                   let result = html_decompose (html_break_str (fst res))in
                          (BOLD result)::(html_decompose (snd res))

        | "<I>"      -> let res = search_ended_tag UString y in
                   let result = html_decompose (html_break_str (fst res))in
                          (ITALIC result)::(html_decompose (snd res))

        | "<TT>"     -> let res = search_ended_tag UString y in
                   let result = html_decompose (html_break_str (fst res))in
                          (TT result)::(html_decompose (snd res))

        | "<HR>"     -> HLine::(html_decompose y)

        | "<P>"      -> LineBreak::(html_decompose y)

        | "<BR>"     -> LineBreak::(html_decompose y)

        | "<TITLE>"  -> let res = search_ended_tag UString y in
                          html_decompose (snd res)

        | "<HEAD>"   -> let res = search_ended_tag UString y in
                          html_decompose (snd res)

        | _          -> if string_length x > 0 
                        then 
                        (
                          if nth_char x 0 = `<`
                          then (html_decompose y)
                          else (HtmlText x)::(html_decompose y)
                        )
                        else (html_decompose y)
      )
;;




(****************************************************************************)
(*                                                                          *)
(*    html_exchar : HTML_type list -> HTML_type list                        *)
(*                                                                          *)
(****************************************************************************)
let html_exchar Html_list =
  let rec cvrt_str Str =
    let str = ref "&"
    and result = ref "" 
    and pos = ref 0
    and add = " "
    and len = string_length Str in
      while(!pos<len) do
        if nth_char Str !pos = `&`
        then
        (
          str := "";
          while(not (nth_char Str !pos = `;` or !pos=len)) do
            set_nth_char add 0 (nth_char Str !pos);
            str := !str ^ add;
            incr pos
          done;
          set_nth_char add 0 (HTML_extchar !str);
          result := ! result ^ add
        )
        else
        (
          set_nth_char add 0 (nth_char Str !pos);
          result := ! result ^ add
        );
        incr pos
      done;
    !result

  and extract_loop List Result=
    match List with
      []   -> rev Result
    | x::y ->
        (
          match x with
           HtmlText str -> 
                let str = cvrt_str str in
                  extract_loop y ((HtmlText str):: Result)
         | ANCHOR Anchor -> 
             let anchor = { 
                            AnchorType = Anchor.AnchorType; 
                            AnchorFile  = Anchor.AnchorFile;
                            AnchorName = cvrt_str Anchor.AnchorName
                          }
             in
               extract_loop y ((ANCHOR anchor) :: Result)
         | LineBreak -> extract_loop y (x :: Result)
         | STRONG HtmlList -> 
              extract_loop y ((STRONG (extract_loop HtmlList [])) :: Result)
         | Ignore str -> extract_loop y (x :: Result)
         | H1 HtmlList ->  
              extract_loop y ((H1 (extract_loop HtmlList [])) :: Result)
         | H2 HtmlList ->  
              extract_loop y ((H2 (extract_loop HtmlList [])) :: Result)
         | H3 HtmlList ->  
              extract_loop y ((H3 (extract_loop HtmlList [])) :: Result)
         | H4 HtmlList ->  
              extract_loop y ((H4 (extract_loop HtmlList [])) :: Result)
         | H5 HtmlList ->  
              extract_loop y ((H5 (extract_loop HtmlList [])) :: Result)
         | H6 HtmlList ->  
              extract_loop y ((H6 (extract_loop HtmlList [])) :: Result)
         | UList HtmlList ->  
              extract_loop y ((UList (extract_loop HtmlList [])) :: Result)
         | OList HtmlList ->  
              extract_loop y ((OList (extract_loop HtmlList [])) :: Result)
         | HLine    -> extract_loop y (x :: Result)
         | PRE HtmlList -> 
              extract_loop y ((PRE (extract_loop HtmlList [])) :: Result)
         | BLOCKQUOTE HtmlList -> 
              extract_loop y ((BLOCKQUOTE (extract_loop HtmlList [])) :: Result) 
         | CITE HtmlList ->  
              extract_loop y ((CITE (extract_loop HtmlList [])) :: Result)
         | KBD HtmlList ->   
              extract_loop y ((KBD (extract_loop HtmlList [])) :: Result)
         | CODE HtmlList ->  
              extract_loop y ((CODE (extract_loop HtmlList [])) :: Result)
         | DFN HtmlList ->  
              extract_loop y ((DFN (extract_loop HtmlList [])) :: Result)
         | EM HtmlList ->  
              extract_loop y ((EM (extract_loop HtmlList [])) :: Result)
         | VAR HtmlList ->  
              extract_loop y ((VAR (extract_loop HtmlList [])) :: Result)
         | ADDRESS HtmlList ->  
              extract_loop y ((ADDRESS (extract_loop HtmlList [])) :: Result)
         | SAMP HtmlList ->  
              extract_loop y ((SAMP (extract_loop HtmlList [])) :: Result)
         | DList HtmlList ->  
              extract_loop y ((DList (extract_loop HtmlList [])) :: Result)
         | DT HtmlList ->  
              extract_loop y ((DT (extract_loop HtmlList [])) :: Result)
         | DD HtmlList ->  
              extract_loop y ((DD (extract_loop HtmlList [])) :: Result)
         | LI HtmlList ->  
              extract_loop y ((LI (extract_loop HtmlList [])) :: Result)
         | BOLD HtmlList  ->  
              extract_loop y ((BOLD (extract_loop HtmlList [])) :: Result)
         | ITALIC HtmlList ->  
              extract_loop y ((ITALIC (extract_loop HtmlList [])) :: Result)
         | TT   HtmlList ->  
              extract_loop y ((TT (extract_loop HtmlList [])) :: Result)
         | IMG  img  -> extract_loop y (x :: Result)
        )
  in
    extract_loop Html_list []
;;


(****************************************************************************)
(*                                                                          *)
(*    html_interpreter : in_stream -> HTML_type list                        *)
(*                                                                          *)
(****************************************************************************)
let html_interpreter Stream =
    let StrList = html_break_stream Stream in
      if debug=HTML_BREAK
      then debug_loop StrList;
      html_exchar (html_decompose StrList)  
;;








