diff options
Diffstat (limited to 'tools/lexer301.mll')
-rw-r--r-- | tools/lexer301.mll | 478 |
1 files changed, 478 insertions, 0 deletions
diff --git a/tools/lexer301.mll b/tools/lexer301.mll new file mode 100644 index 0000000000..991bd24020 --- /dev/null +++ b/tools/lexer301.mll @@ -0,0 +1,478 @@ +(***********************************************************************) +(* *) +(* Objective Caml *) +(* *) +(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) +(* *) +(* Copyright 1996 Institut National de Recherche en Informatique et *) +(* en Automatique. All rights reserved. This file is distributed *) +(* under the terms of the Q Public License version 1.0. *) +(* *) +(***********************************************************************) + +(* $Id$ *) + +(* The lexer definition *) + +{ +open Misc + +type token = + AMPERAMPER + | AMPERSAND + | AND + | AS + | ASSERT + | BACKQUOTE + | BAR + | BARBAR + | BARRBRACKET + | BEGIN + | CHAR of (char) + | CLASS + | COLON + | COLONCOLON + | COLONEQUAL + | COLONGREATER + | COMMA + | CONSTRAINT + | DO + | DONE + | DOT + | DOTDOT + | DOWNTO + | ELSE + | END + | EOF + | EQUAL + | EXCEPTION + | EXTERNAL + | FALSE + | FLOAT of (string) + | FOR + | FUN + | FUNCTION + | FUNCTOR + | GREATER + | GREATERRBRACE + | GREATERRBRACKET + | IF + | IN + | INCLUDE + | INFIXOP0 of (string) + | INFIXOP1 of (string) + | INFIXOP2 of (string) + | INFIXOP3 of (string) + | INFIXOP4 of (string) + | INHERIT + | INITIALIZER + | INT of (int) + | LABEL of (string) + | LAZY + | LBRACE + | LBRACELESS + | LBRACKET + | LBRACKETBAR + | LBRACKETLESS + | LESS + | LESSMINUS + | LET + | LIDENT of (string) + | LPAREN + | MATCH + | METHOD + | MINUS + | MINUSDOT + | MINUSGREATER + | MODULE + | MUTABLE + | NEW + | OBJECT + | OF + | OPEN + | OPTLABEL of (string) + | OR + | PARSER + | PLUS + | PREFIXOP of (string) + | PRIVATE + | QUESTION + | QUESTION2 + | QUOTE + | RBRACE + | RBRACKET + | REC + | RPAREN + | SEMI + | SEMISEMI + | SHARP + | SIG + | STAR + | STRING of (string) + | STRUCT + | THEN + | TILDE + | TO + | TRUE + | TRY + | TYPE + | UIDENT of (string) + | UNDERSCORE + | VAL + | VIRTUAL + | WHEN + | WHILE + | WITH + +type error = + | Illegal_character of char + | Unterminated_comment + | Unterminated_string + | Unterminated_string_in_comment + | Keyword_as_label of string +;; + +exception Error of error * int * int + +(* The table of keywords *) + +let keyword_table = + create_hashtable 149 [ + "and", AND; + "as", AS; + "assert", ASSERT; + "begin", BEGIN; + "class", CLASS; + "constraint", CONSTRAINT; + "do", DO; + "done", DONE; + "downto", DOWNTO; + "else", ELSE; + "end", END; + "exception", EXCEPTION; + "external", EXTERNAL; + "false", FALSE; + "for", FOR; + "fun", FUN; + "function", FUNCTION; + "functor", FUNCTOR; + "if", IF; + "in", IN; + "include", INCLUDE; + "inherit", INHERIT; + "initializer", INITIALIZER; + "lazy", LAZY; + "let", LET; + "match", MATCH; + "method", METHOD; + "module", MODULE; + "mutable", MUTABLE; + "new", NEW; + "object", OBJECT; + "of", OF; + "open", OPEN; + "or", OR; + "parser", PARSER; + "private", PRIVATE; + "rec", REC; + "sig", SIG; + "struct", STRUCT; + "then", THEN; + "to", TO; + "true", TRUE; + "try", TRY; + "type", TYPE; + "val", VAL; + "virtual", VIRTUAL; + "when", WHEN; + "while", WHILE; + "with", WITH; + + "mod", INFIXOP3("mod"); + "land", INFIXOP3("land"); + "lor", INFIXOP3("lor"); + "lxor", INFIXOP3("lxor"); + "lsl", INFIXOP4("lsl"); + "lsr", INFIXOP4("lsr"); + "asr", INFIXOP4("asr") +] + +(* To buffer string literals *) + +let initial_string_buffer = String.create 256 +let string_buff = ref initial_string_buffer +let string_index = ref 0 + +let reset_string_buffer () = + string_buff := initial_string_buffer; + string_index := 0 + +let store_string_char c = + if !string_index >= String.length (!string_buff) then begin + let new_buff = String.create (String.length (!string_buff) * 2) in + String.blit (!string_buff) 0 new_buff 0 (String.length (!string_buff)); + string_buff := new_buff + end; + String.unsafe_set (!string_buff) (!string_index) c; + incr string_index + +let get_stored_string () = + let s = String.sub (!string_buff) 0 (!string_index) in + string_buff := initial_string_buffer; + s + +(* To translate escape sequences *) + +let char_for_backslash = + match Sys.os_type with + | "Unix" | "Win32" | "Cygwin" -> + begin function + | 'n' -> '\010' + | 'r' -> '\013' + | 'b' -> '\008' + | 't' -> '\009' + | c -> c + end + | "MacOS" -> + begin function + | 'n' -> '\013' + | 'r' -> '\010' + | 'b' -> '\008' + | 't' -> '\009' + | c -> c + end + | x -> fatal_error "Lexer: unknown system type" + +let char_for_decimal_code lexbuf i = + let c = 100 * (Char.code(Lexing.lexeme_char lexbuf i) - 48) + + 10 * (Char.code(Lexing.lexeme_char lexbuf (i+1)) - 48) + + (Char.code(Lexing.lexeme_char lexbuf (i+2)) - 48) in + Char.chr(c land 0xFF) + +(* To store the position of the beginning of a string and comment *) +let string_start_pos = ref 0;; +let comment_start_pos = ref [];; +let in_comment () = !comment_start_pos <> [];; + +(* Error report *) + +open Format + +let report_error ppf = function + | Illegal_character c -> + fprintf ppf "Illegal character (%s)" (Char.escaped c) + | Unterminated_comment -> + fprintf ppf "Comment not terminated" + | Unterminated_string -> + fprintf ppf "String literal not terminated" + | Unterminated_string_in_comment -> + fprintf ppf "This comment contains an unterminated string literal" + | Keyword_as_label kwd -> + fprintf ppf "`%s' is a keyword, it cannot be used as label name" kwd +;; + +} + +let blank = [' ' '\010' '\013' '\009' '\012'] +let lowercase = ['a'-'z' '\223'-'\246' '\248'-'\255' '_'] +let uppercase = ['A'-'Z' '\192'-'\214' '\216'-'\222'] +let identchar = + ['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9'] +let symbolchar = + ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~'] +let decimal_literal = ['0'-'9']+ +let hex_literal = '0' ['x' 'X'] ['0'-'9' 'A'-'F' 'a'-'f']+ +let oct_literal = '0' ['o' 'O'] ['0'-'7']+ +let bin_literal = '0' ['b' 'B'] ['0'-'1']+ +let float_literal = + ['0'-'9']+ ('.' ['0'-'9']* )? (['e' 'E'] ['+' '-']? ['0'-'9']+)? + +rule token = parse + blank + + { token lexbuf } + | "_" + { UNDERSCORE } + | "~" { TILDE } + | "~" lowercase identchar * ':' + { let s = Lexing.lexeme lexbuf in + let name = String.sub s 1 (String.length s - 2) in + if Hashtbl.mem keyword_table name then + raise (Error(Keyword_as_label name, Lexing.lexeme_start lexbuf, + Lexing.lexeme_end lexbuf)); + LABEL name } + | "?" { QUESTION } + | "?" lowercase identchar * ':' + { let s = Lexing.lexeme lexbuf in + let name = String.sub s 1 (String.length s - 2) in + if Hashtbl.mem keyword_table name then + raise (Error(Keyword_as_label name, Lexing.lexeme_start lexbuf, + Lexing.lexeme_end lexbuf)); + OPTLABEL name } + | lowercase identchar * + { let s = Lexing.lexeme lexbuf in + try + Hashtbl.find keyword_table s + with Not_found -> + LIDENT s } + | uppercase identchar * + { UIDENT(Lexing.lexeme lexbuf) } (* No capitalized keywords *) + | decimal_literal | hex_literal | oct_literal | bin_literal + { INT (int_of_string(Lexing.lexeme lexbuf)) } + | float_literal + { FLOAT (Lexing.lexeme lexbuf) } + | "\"" + { reset_string_buffer(); + let string_start = Lexing.lexeme_start lexbuf in + string_start_pos := string_start; + string lexbuf; + lexbuf.Lexing.lex_start_pos <- + string_start - lexbuf.Lexing.lex_abs_pos; + STRING (get_stored_string()) } + | "'" [^ '\\' '\''] "'" + { CHAR(Lexing.lexeme_char lexbuf 1) } + | "'" '\\' ['\\' '\'' 'n' 't' 'b' 'r'] "'" + { CHAR(char_for_backslash (Lexing.lexeme_char lexbuf 2)) } + | "'" '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] "'" + { CHAR(char_for_decimal_code lexbuf 2) } + | "(*" + { comment_start_pos := [Lexing.lexeme_start lexbuf]; + comment lexbuf; + token lexbuf } + | "(*)" + { let loc = { Location.loc_start = Lexing.lexeme_start lexbuf; + Location.loc_end = Lexing.lexeme_end lexbuf - 1; + Location.loc_ghost = false } + and warn = Warnings.Comment "the start of a comment" + in + Location.prerr_warning loc warn; + comment_start_pos := [Lexing.lexeme_start lexbuf]; + comment lexbuf; + token lexbuf + } + | "*)" + { let loc = { Location.loc_start = Lexing.lexeme_start lexbuf; + Location.loc_end = Lexing.lexeme_end lexbuf; + Location.loc_ghost = false } + and warn = Warnings.Comment "not the end of a comment" + in + Location.prerr_warning loc warn; + lexbuf.Lexing.lex_curr_pos <- lexbuf.Lexing.lex_curr_pos - 1; + STAR + } + | "#" [' ' '\t']* ['0'-'9']+ [^ '\n' '\r'] * ('\n' | '\r' | "\r\n") + (* # linenum ... *) + { token lexbuf } + | "#" { SHARP } + | "&" { AMPERSAND } + | "&&" { AMPERAMPER } + | "`" { BACKQUOTE } + | "'" { QUOTE } + | "(" { LPAREN } + | ")" { RPAREN } + | "*" { STAR } + | "," { COMMA } + | "??" { QUESTION2 } + | "->" { MINUSGREATER } + | "." { DOT } + | ".." { DOTDOT } + | ":" { COLON } + | "::" { COLONCOLON } + | ":=" { COLONEQUAL } + | ":>" { COLONGREATER } + | ";" { SEMI } + | ";;" { SEMISEMI } + | "<" { LESS } + | "<-" { LESSMINUS } + | "=" { EQUAL } + | "[" { LBRACKET } + | "[|" { LBRACKETBAR } + | "[<" { LBRACKETLESS } + | "]" { RBRACKET } + | "{" { LBRACE } + | "{<" { LBRACELESS } + | "|" { BAR } + | "||" { BARBAR } + | "|]" { BARRBRACKET } + | ">" { GREATER } + | ">]" { GREATERRBRACKET } + | "}" { RBRACE } + | ">}" { GREATERRBRACE } + + | "!=" { INFIXOP0 "!=" } + | "+" { PLUS } + | "-" { MINUS } + | "-." { MINUSDOT } + + | "!" symbolchar * + { PREFIXOP(Lexing.lexeme lexbuf) } + | ['~' '?'] symbolchar + + { PREFIXOP(Lexing.lexeme lexbuf) } + | ['=' '<' '>' '|' '&' '$'] symbolchar * + { INFIXOP0(Lexing.lexeme lexbuf) } + | ['@' '^'] symbolchar * + { INFIXOP1(Lexing.lexeme lexbuf) } + | ['+' '-'] symbolchar * + { INFIXOP2(Lexing.lexeme lexbuf) } + | "**" symbolchar * + { INFIXOP4(Lexing.lexeme lexbuf) } + | ['*' '/' '%'] symbolchar * + { INFIXOP3(Lexing.lexeme lexbuf) } + | eof { EOF } + | _ + { raise (Error(Illegal_character ((Lexing.lexeme lexbuf).[0]), + Lexing.lexeme_start lexbuf, Lexing.lexeme_end lexbuf)) } + +and comment = parse + "(*" + { comment_start_pos := Lexing.lexeme_start lexbuf :: !comment_start_pos; + comment lexbuf; + } + | "*)" + { match !comment_start_pos with + | [] -> assert false + | [x] -> comment_start_pos := []; + | _ :: l -> comment_start_pos := l; + comment lexbuf; + } + | "\"" + { reset_string_buffer(); + string_start_pos := Lexing.lexeme_start lexbuf; + begin try string lexbuf + with Error (Unterminated_string, _, _) -> + let st = List.hd !comment_start_pos in + raise (Error (Unterminated_string_in_comment, st, st + 2)) + end; + string_buff := initial_string_buffer; + comment lexbuf } + | "''" + { comment lexbuf } + | "'" [^ '\\' '\''] "'" + { comment lexbuf } + | "'\\" ['\\' '\'' 'n' 't' 'b' 'r'] "'" + { comment lexbuf } + | "'\\" ['0'-'9'] ['0'-'9'] ['0'-'9'] "'" + { comment lexbuf } + | eof + { let st = List.hd !comment_start_pos in + raise (Error (Unterminated_comment, st, st + 2)); + } + | _ + { comment lexbuf } + +and string = parse + '"' + { () } + | '\\' ("\010" | "\013" | "\013\010") [' ' '\009'] * + { string lexbuf } + | '\\' ['\\' '"' 'n' 't' 'b' 'r'] + { store_string_char(char_for_backslash(Lexing.lexeme_char lexbuf 1)); + string lexbuf } + | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] + { store_string_char(char_for_decimal_code lexbuf 1); + string lexbuf } + | eof + { raise (Error (Unterminated_string, + !string_start_pos, !string_start_pos+1)) } + | _ + { store_string_char(Lexing.lexeme_char lexbuf 0); + string lexbuf } |