{
open Py_parse
open Py_string

(* 'zero_is_octal' determines the treatment of
   zero prefixed digits strings.

   If true (default), Python and C rules apply,
   and a zero prefixed digit string is treated
   as octal. Viper standard is decimal.
   A warning is issued in either case.
*)
   
type warning_mode = WarnAlways | WarnOnce | WarnNever;;
let octal_warning_mode = ref WarnOnce;;
let octal_warning_count = ref 0;;
let zero_is_octal = ref true;;
let set_zero_is_octal v = zero_is_octal := v;;
let set_octal_warning_mode x =
  octal_warning_mode :=
    begin 
      if x > 0 then WarnAlways
      else if x = 0 then WarnOnce
      else WarnNever
    end
;;

let do_octal_warning () = 
  !octal_warning_mode = WarnAlways
  or (!octal_warning_mode = WarnOnce && !octal_warning_count = 0)
;;

let comment_nesting_level = ref 0;;
let comment_text = ref "";;

let lexeme = Lexing.lexeme;;

let buf_pos = ref 0;;
let last_buf_pos = ref 0;;
let line_no = ref 0;;
let filename = ref "Unknown filename";;

let incr_lex_counters lexbuf =
  line_no := !line_no + 1;
  last_buf_pos := !buf_pos;
  buf_pos := Lexing.lexeme_end lexbuf
;;

let reset filename' =
  buf_pos :=0;
  last_buf_pos :=0;
  line_no := 0;
  filename := filename';
  comment_nesting_level := 0
;;

let set_buf_pos x = buf_pos := x;;
let get_buf_pos () = !buf_pos;;

let substr = String.sub
let len = String.length

(* special hackery to adjust the line count when a newline is
present _inside_ a string token -- allowed in some python strings *)
let lfcount s = 
  let n = ref 0 in
  for i = 0 to (String.length s) - 1 do
    if s.[i] = '\n' then incr n
  done;
  !n
;;

let adj s = line_no := !line_no + (lfcount s);;

let decode_qstring s = adj s; let n = len s in unescape (substr s 0 (n-1)) 
let decode_dqstring s = adj s; let n = len s in unescape (substr s 0 (n-1)) 
let decode_qqqstring s = adj s; let n = len s in unescape (substr s 0 (n-3)) 
let decode_dddstring s = adj s; let n = len s in unescape (substr s 0 (n-3)) 

let decode_raw_qstring s = adj s; let n = len s in substr s 0 (n-1) 
let decode_raw_dqstring s = adj s; let n = len s in substr s 0 (n-1) 
let decode_raw_qqqstring s = adj s; let n = len s in substr s 0 (n-3) 
let decode_raw_dddstring s = adj s; let n = len s in substr s 0 (n-3) 

} 

(* special characters *)
let quote = '\''
let dquote = '"'
let slosh = '\\'
let linefeed = '\n'
let tab = '\t'
let space = ' '
let formfeed = '\012'
let vtab = '\011'
let carriage_return = '\013'
let underscore = '_'

(* character sets *)
let bindigit = ['0'-'1']
let octdigit = ['0'-'7'] 
let digit = ['0'-'9']
let nzdigit = ['1'-'9']
let hexdigit = digit | ['A'-'F'] | ['a'-'f']
let lower = ['a'-'z']
let upper = ['A'-'Z']
(* let letter = lower | upper *)
let letter = lower | upper
let hichar = ['\128'-'\255']
let white = space | tab

(* nasty: form control characters *)
let form_control = linefeed | carriage_return | vtab | formfeed
let newline_prefix = linefeed | carriage_return
let newline = formfeed | linefeed  | carriage_return linefeed
(* let newline = newline_prefix form_control * *)

let ordinary = letter | digit | hichar |
  '!' | '#' | '$' | '%' | '&' | '(' | ')' | '*' |
  '+' | ',' | '-' | '.' | '/' | ':' | ';' | '<' |
  '=' | '>' | '?' | '@' | '[' | ']' | '^' | '_' |
  '`' | '{' | '|' | '}' | '~'

let printable = ordinary | quote | dquote | slosh

(* identifiers *)
let idletter = letter | underscore
let identifier = idletter (idletter | digit)*

(* integers *)
let bin_lit  = '0' ('b' | 'B') (underscore? bindigit) +
let oct_lit  = '0' ('o' | 'O') (underscore? octdigit) +
let dec_lit  = '0' ('d' | 'D') (underscore? digit) +
let nz_dec_lit  = nzdigit (underscore? digit) *
let z_dec_lit  = '0' (underscore? digit) *
let hex_lit  = '0' ('x' | 'X') (underscore? hexdigit)  +

(* floats *)
let decimal_string = digit (underscore? digit) *
let fixed_literal = decimal_string ? '.' decimal_string | 
  decimal_string '.' decimal_string?
let exponent = 'E' ('+'|'-')? decimal_string
let floating_literal = fixed_literal exponent? | decimal_string exponent

(* Python strings *)
let qqq = quote quote quote
let ddd = dquote dquote dquote 

let escape = slosh _ 

let dddnormal = ordinary | quote | escape | white | newline
let dddspecial = dddnormal | dquote dddnormal | dquote dquote dddnormal

let qqqnormal = ordinary | dquote | escape | white | newline
let qqqspecial = qqqnormal | quote qqqnormal | quote quote qqqnormal

let raw_dddnormal = ordinary | quote | slosh | white | newline
let raw_dddspecial = raw_dddnormal | dquote raw_dddnormal | dquote dquote raw_dddnormal

let raw_qqqnormal = ordinary | dquote | slosh | space | newline
let raw_qqqspecial = raw_qqqnormal | quote raw_qqqnormal | quote quote raw_qqqnormal

let qstring = (ordinary | dquote | escape | white) * quote
let dqstring = (ordinary | quote | escape | white) * dquote
let qqqstring = qqqspecial * qqq
let dddstring = dddspecial * ddd

let raw_qstring = (ordinary | dquote | escape | white) * quote
let raw_dqstring =  (ordinary | quote | escape | white) * dquote

let raw_qqqstring = raw_qqqspecial * qqq
let raw_dddstring = raw_dddspecial * ddd

(* string lexers *)
rule parse_qstring = parse
| qstring { STRING (decode_qstring (lexeme lexbuf)) }
| _ { ERRORTOKEN ("' string")}

and parse_dqstring = parse
| dqstring { STRING (decode_dqstring (lexeme lexbuf)) }
| _ { ERRORTOKEN ("\" string")}

and parse_qqqstring = parse
| qqqstring { STRING (decode_qqqstring (lexeme lexbuf)) }
| _ { ERRORTOKEN ("''' string")}

and parse_dddstring = parse
| dddstring { STRING (decode_dddstring (lexeme lexbuf)) }
| _ { ERRORTOKEN ("\"\"\" string")}

and parse_raw_qstring = parse
| raw_qstring { STRING (decode_raw_qstring (lexeme lexbuf)) }
| _ { ERRORTOKEN ("raw ' string")}

and parse_raw_dqstring = parse
| raw_dqstring { STRING (decode_raw_dqstring (lexeme lexbuf)) }
| _ { ERRORTOKEN ("raw \" string")}

and parse_raw_qqqstring = parse
| raw_qqqstring { STRING (decode_raw_qqqstring (lexeme lexbuf)) }
| _ { ERRORTOKEN ("raw ''' string")}

and parse_raw_dddstring = parse
| raw_dddstring { STRING (decode_raw_dddstring (lexeme lexbuf)) }
| _ { ERRORTOKEN (lexeme lexbuf)}

and parse_hash_comment = parse 
| [^'\n'] * newline {
    incr_lex_counters lexbuf;
    let lex = lexeme lexbuf in
    let n = String.length lex in
    COMMENT_NEWLINE  ((!line_no, !filename), String.sub lex 0 (n-1))
  }
| _ { ERRORTOKEN (lexeme lexbuf)}

and parse_C_comment = parse 
| newline {
    incr_lex_counters lexbuf;
    comment_text := !comment_text ^ (lexeme lexbuf);
    parse_C_comment lexbuf
  }
| "/*" { 
    comment_text := !comment_text ^ (lexeme lexbuf);
    incr comment_nesting_level; 
    parse_C_comment lexbuf 
  }
| "*/" { 
    comment_text := !comment_text ^ (lexeme lexbuf);
    decr comment_nesting_level; 
    if !comment_nesting_level > 0 
    then parse_C_comment lexbuf 
    else ()
  }
| _ {
    comment_text := !comment_text ^ (lexeme lexbuf);
    parse_C_comment lexbuf 
  }

and pre_pylex = parse
| '#' { parse_hash_comment lexbuf }
| "/*" { 
    comment_text := lexeme lexbuf;
    comment_nesting_level := 1; 
    parse_C_comment lexbuf; 
    COMMENT !comment_text  
  }

| identifier { Py_keywords.map_keywords (lexeme lexbuf)  } 

| bin_lit ("L"|"l") { 
  let s = lexeme lexbuf in
  let n = String.length s in
  let d = String.sub s 2 (n-3) in
  LONG (binbig_int_of_string d)}

| bin_lit { 
  let s = lexeme lexbuf in
  let n = String.length s in
  let d = String.sub s 2 (n-2) in
  INTEGER (binint_of_string d)}

| oct_lit ("L"|"l") { 
  let s = lexeme lexbuf in
  let n = String.length s in
  let d = String.sub s 2 (n-3) in
  LONG (octbig_int_of_string d)}

| oct_lit { 
  let s = lexeme lexbuf in
  let n = String.length s in
  let d = String.sub s 2 (n-2) in
  INTEGER (octint_of_string d)}

| dec_lit ("L"|"l") { 
  let s = lexeme lexbuf in
  let n = String.length s in
  let d = String.sub s 2 (n-3) in
  LONG (decbig_int_of_string d)}

| dec_lit { 
  let s = lexeme lexbuf in
  let n = String.length s in
  let d = String.sub s 2 (n-2) in
  INTEGER (decint_of_string d)}

| hex_lit ("L"|"l") { 
  let s = lexeme lexbuf in
  let n = String.length s in
  let d = String.sub s 2 (n-3) in
  LONG (hexbig_int_of_string d)}

| hex_lit { 
  let s = lexeme lexbuf in
  let n = String.length s in
  let d = String.sub s 2 (n-2) in
  INTEGER (hexint_of_string d)}

| nz_dec_lit ("L"|"l") { 
  let s = lexeme lexbuf in
  let n = String.length s in
  let d = String.sub s 0 (n-1) in
  LONG (decbig_int_of_string d)}

| nz_dec_lit { 
  let s = lexeme lexbuf in
  let n = String.length s in
  INTEGER (decint_of_string s)}

| z_dec_lit ("L"|"l") { 
  let s = lexeme lexbuf in
  let n = String.length s in
  let d = String.sub s 0 (n-1) in
  if n = 2 then LONG (decbig_int_of_string d)
  else let converter = 
    if !zero_is_octal 
    then begin
      if do_octal_warning()
      then begin
        print_endline ("WARNING: " ^ s ^ " treated as octal (deprecated)");
        incr octal_warning_count
      end
      ;
      octbig_int_of_string
    end else begin
      if do_octal_warning()
      then begin
        print_endline ("WARNING: " ^ s ^ " treated as decimal (standard)");
        incr octal_warning_count
      end
      ;
      decbig_int_of_string
    end
  in LONG (converter d)}

| z_dec_lit { 
  let s = lexeme lexbuf in
  let n = String.length s in
  if n = 1 then INTEGER 0
  else let converter = 
    if !zero_is_octal 
    then begin 
      if do_octal_warning()
      then begin
        print_endline ("WARNING: " ^ s ^ " treated as octal (deprecated)");
        incr octal_warning_count
      end
      ;
      octint_of_string
    end else begin
      if do_octal_warning()
      then begin
        print_endline ("WARNING: " ^ s ^ " treated as decimal (standard)");
        incr octal_warning_count
      end
      ;
      decint_of_string
    end
  in INTEGER (converter s)}

| floating_literal { FLOAT (floating_of_string (lexeme lexbuf)) }

| floating_literal ("J"|"j") { 
  let s = lexeme lexbuf in
  let n = String.length s in
  let d = String.sub s 0 (n-1) in
  COMPLEX (0.0, floating_of_string d) }

| decimal_string ("J"|"j") { 
  let s = lexeme lexbuf in
  let n = String.length s in
  let d = String.sub s 0 (n-1) in
  COMPLEX (0.0, floating_of_string d) }

(* one character sequences *)
| "(" { LPAR }
| ")" { RPAR }
| "[" { LSQB }
| "]" { RSQB }
| ":" { COLON }
| "," { COMMA }
| ";" { SEMI }
| "+" { PLUS }
| "-" { MINUS }
| "*" { STAR }
| "/" { SLASH }
| "|" { VBAR }
| "&" { AMPER }
| "<" { LESS }
| ">" { GREATER }
| "=" { EQUAL }
| "." { DOT }
| "%" { PERCENT }
| "`" { BACKQUOTE }
| "{" { LBRACE }
| "}" { RBRACE }
| "~" { TILDE }
| "^" { CIRCUMFLEX }

(* two character sequences *)
| "==" { EQEQUAL }
| "<>" | "!="  { NOTEQUAL }
| "<=" { LESSEQUAL }
| ">=" { GREATEREQUAL }
| "<<" { LEFTSHIFT }
| ">>" { RIGHTSHIFT }
| "**" { POWER }
| "\\" { SLOSH }
| "++" { PLUSPLUS }
| "--" { MINUSMINUS}
| "+=" { PLUSEQUAL}
| "-=" { MINUSEQUAL}
| "*=" { STAREQUAL}
| "/=" { SLASHEQUAL}
| "%=" { PERCENTEQUAL}
| "^=" { CARETEQUAL}
| "|=" { VBAREQUAL}
| "&=" { AMPEREQUAL}
| "~=" { TILDEEQUAL}
| ":=" { COLONEQUAL}
| "<-" { LEFTARROW }
| "->" { RIGHTARROW }

(* three character sequences *)
| "<<=" { LEFTSHIFTEQUAL}
| ">>=" { RIGHTSHIFTEQUAL}

| quote { parse_qstring lexbuf }
| qqq { parse_qqqstring lexbuf }
| dquote { parse_dqstring lexbuf }
| ddd { parse_dddstring lexbuf }

| ('r'|'R') quote { parse_raw_qstring lexbuf }
| ('r'|'R') qqq { parse_raw_qqqstring lexbuf }
| ('r'|'R') dquote { parse_raw_dqstring lexbuf }
| ('r'|'R') ddd { parse_raw_dddstring lexbuf }

| white + { 
    let spaces=lexeme lexbuf in
    let column = ref 0 in
    let n = String.length spaces in
    for i=0 to n-1 do match spaces.[i] with
      | '\t' -> column := ((!column + 8) / 8) * 8
      | ' ' -> incr column
      | _ -> raise (Failure "Error in lexer, bad white space character")
    done;
    WHITE !column 
  }

| newline {incr_lex_counters lexbuf; NEWLINE (!line_no, !filename) }
| eof { ENDMARKER }
| _ { ERRORTOKEN (lexeme lexbuf)}

{
}
