summaryrefslogtreecommitdiff
path: root/tools/lexer301.mll
diff options
context:
space:
mode:
Diffstat (limited to 'tools/lexer301.mll')
-rw-r--r--tools/lexer301.mll478
1 files changed, 478 insertions, 0 deletions
diff --git a/tools/lexer301.mll b/tools/lexer301.mll
new file mode 100644
index 0000000000..991bd24020
--- /dev/null
+++ b/tools/lexer301.mll
@@ -0,0 +1,478 @@
+(***********************************************************************)
+(* *)
+(* Objective Caml *)
+(* *)
+(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *)
+(* *)
+(* Copyright 1996 Institut National de Recherche en Informatique et *)
+(* en Automatique. All rights reserved. This file is distributed *)
+(* under the terms of the Q Public License version 1.0. *)
+(* *)
+(***********************************************************************)
+
+(* $Id$ *)
+
+(* The lexer definition *)
+
+{
+open Misc
+
+type token =
+ AMPERAMPER
+ | AMPERSAND
+ | AND
+ | AS
+ | ASSERT
+ | BACKQUOTE
+ | BAR
+ | BARBAR
+ | BARRBRACKET
+ | BEGIN
+ | CHAR of (char)
+ | CLASS
+ | COLON
+ | COLONCOLON
+ | COLONEQUAL
+ | COLONGREATER
+ | COMMA
+ | CONSTRAINT
+ | DO
+ | DONE
+ | DOT
+ | DOTDOT
+ | DOWNTO
+ | ELSE
+ | END
+ | EOF
+ | EQUAL
+ | EXCEPTION
+ | EXTERNAL
+ | FALSE
+ | FLOAT of (string)
+ | FOR
+ | FUN
+ | FUNCTION
+ | FUNCTOR
+ | GREATER
+ | GREATERRBRACE
+ | GREATERRBRACKET
+ | IF
+ | IN
+ | INCLUDE
+ | INFIXOP0 of (string)
+ | INFIXOP1 of (string)
+ | INFIXOP2 of (string)
+ | INFIXOP3 of (string)
+ | INFIXOP4 of (string)
+ | INHERIT
+ | INITIALIZER
+ | INT of (int)
+ | LABEL of (string)
+ | LAZY
+ | LBRACE
+ | LBRACELESS
+ | LBRACKET
+ | LBRACKETBAR
+ | LBRACKETLESS
+ | LESS
+ | LESSMINUS
+ | LET
+ | LIDENT of (string)
+ | LPAREN
+ | MATCH
+ | METHOD
+ | MINUS
+ | MINUSDOT
+ | MINUSGREATER
+ | MODULE
+ | MUTABLE
+ | NEW
+ | OBJECT
+ | OF
+ | OPEN
+ | OPTLABEL of (string)
+ | OR
+ | PARSER
+ | PLUS
+ | PREFIXOP of (string)
+ | PRIVATE
+ | QUESTION
+ | QUESTION2
+ | QUOTE
+ | RBRACE
+ | RBRACKET
+ | REC
+ | RPAREN
+ | SEMI
+ | SEMISEMI
+ | SHARP
+ | SIG
+ | STAR
+ | STRING of (string)
+ | STRUCT
+ | THEN
+ | TILDE
+ | TO
+ | TRUE
+ | TRY
+ | TYPE
+ | UIDENT of (string)
+ | UNDERSCORE
+ | VAL
+ | VIRTUAL
+ | WHEN
+ | WHILE
+ | WITH
+
+type error =
+ | Illegal_character of char
+ | Unterminated_comment
+ | Unterminated_string
+ | Unterminated_string_in_comment
+ | Keyword_as_label of string
+;;
+
+exception Error of error * int * int
+
+(* The table of keywords *)
+
+let keyword_table =
+ create_hashtable 149 [
+ "and", AND;
+ "as", AS;
+ "assert", ASSERT;
+ "begin", BEGIN;
+ "class", CLASS;
+ "constraint", CONSTRAINT;
+ "do", DO;
+ "done", DONE;
+ "downto", DOWNTO;
+ "else", ELSE;
+ "end", END;
+ "exception", EXCEPTION;
+ "external", EXTERNAL;
+ "false", FALSE;
+ "for", FOR;
+ "fun", FUN;
+ "function", FUNCTION;
+ "functor", FUNCTOR;
+ "if", IF;
+ "in", IN;
+ "include", INCLUDE;
+ "inherit", INHERIT;
+ "initializer", INITIALIZER;
+ "lazy", LAZY;
+ "let", LET;
+ "match", MATCH;
+ "method", METHOD;
+ "module", MODULE;
+ "mutable", MUTABLE;
+ "new", NEW;
+ "object", OBJECT;
+ "of", OF;
+ "open", OPEN;
+ "or", OR;
+ "parser", PARSER;
+ "private", PRIVATE;
+ "rec", REC;
+ "sig", SIG;
+ "struct", STRUCT;
+ "then", THEN;
+ "to", TO;
+ "true", TRUE;
+ "try", TRY;
+ "type", TYPE;
+ "val", VAL;
+ "virtual", VIRTUAL;
+ "when", WHEN;
+ "while", WHILE;
+ "with", WITH;
+
+ "mod", INFIXOP3("mod");
+ "land", INFIXOP3("land");
+ "lor", INFIXOP3("lor");
+ "lxor", INFIXOP3("lxor");
+ "lsl", INFIXOP4("lsl");
+ "lsr", INFIXOP4("lsr");
+ "asr", INFIXOP4("asr")
+]
+
+(* To buffer string literals *)
+
+let initial_string_buffer = String.create 256
+let string_buff = ref initial_string_buffer
+let string_index = ref 0
+
+let reset_string_buffer () =
+ string_buff := initial_string_buffer;
+ string_index := 0
+
+let store_string_char c =
+ if !string_index >= String.length (!string_buff) then begin
+ let new_buff = String.create (String.length (!string_buff) * 2) in
+ String.blit (!string_buff) 0 new_buff 0 (String.length (!string_buff));
+ string_buff := new_buff
+ end;
+ String.unsafe_set (!string_buff) (!string_index) c;
+ incr string_index
+
+let get_stored_string () =
+ let s = String.sub (!string_buff) 0 (!string_index) in
+ string_buff := initial_string_buffer;
+ s
+
+(* To translate escape sequences *)
+
+let char_for_backslash =
+ match Sys.os_type with
+ | "Unix" | "Win32" | "Cygwin" ->
+ begin function
+ | 'n' -> '\010'
+ | 'r' -> '\013'
+ | 'b' -> '\008'
+ | 't' -> '\009'
+ | c -> c
+ end
+ | "MacOS" ->
+ begin function
+ | 'n' -> '\013'
+ | 'r' -> '\010'
+ | 'b' -> '\008'
+ | 't' -> '\009'
+ | c -> c
+ end
+ | x -> fatal_error "Lexer: unknown system type"
+
+let char_for_decimal_code lexbuf i =
+ let c = 100 * (Char.code(Lexing.lexeme_char lexbuf i) - 48) +
+ 10 * (Char.code(Lexing.lexeme_char lexbuf (i+1)) - 48) +
+ (Char.code(Lexing.lexeme_char lexbuf (i+2)) - 48) in
+ Char.chr(c land 0xFF)
+
+(* To store the position of the beginning of a string and comment *)
+let string_start_pos = ref 0;;
+let comment_start_pos = ref [];;
+let in_comment () = !comment_start_pos <> [];;
+
+(* Error report *)
+
+open Format
+
+let report_error ppf = function
+ | Illegal_character c ->
+ fprintf ppf "Illegal character (%s)" (Char.escaped c)
+ | Unterminated_comment ->
+ fprintf ppf "Comment not terminated"
+ | Unterminated_string ->
+ fprintf ppf "String literal not terminated"
+ | Unterminated_string_in_comment ->
+ fprintf ppf "This comment contains an unterminated string literal"
+ | Keyword_as_label kwd ->
+ fprintf ppf "`%s' is a keyword, it cannot be used as label name" kwd
+;;
+
+}
+
+let blank = [' ' '\010' '\013' '\009' '\012']
+let lowercase = ['a'-'z' '\223'-'\246' '\248'-'\255' '_']
+let uppercase = ['A'-'Z' '\192'-'\214' '\216'-'\222']
+let identchar =
+ ['A'-'Z' 'a'-'z' '_' '\192'-'\214' '\216'-'\246' '\248'-'\255' '\'' '0'-'9']
+let symbolchar =
+ ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~']
+let decimal_literal = ['0'-'9']+
+let hex_literal = '0' ['x' 'X'] ['0'-'9' 'A'-'F' 'a'-'f']+
+let oct_literal = '0' ['o' 'O'] ['0'-'7']+
+let bin_literal = '0' ['b' 'B'] ['0'-'1']+
+let float_literal =
+ ['0'-'9']+ ('.' ['0'-'9']* )? (['e' 'E'] ['+' '-']? ['0'-'9']+)?
+
+rule token = parse
+ blank +
+ { token lexbuf }
+ | "_"
+ { UNDERSCORE }
+ | "~" { TILDE }
+ | "~" lowercase identchar * ':'
+ { let s = Lexing.lexeme lexbuf in
+ let name = String.sub s 1 (String.length s - 2) in
+ if Hashtbl.mem keyword_table name then
+ raise (Error(Keyword_as_label name, Lexing.lexeme_start lexbuf,
+ Lexing.lexeme_end lexbuf));
+ LABEL name }
+ | "?" { QUESTION }
+ | "?" lowercase identchar * ':'
+ { let s = Lexing.lexeme lexbuf in
+ let name = String.sub s 1 (String.length s - 2) in
+ if Hashtbl.mem keyword_table name then
+ raise (Error(Keyword_as_label name, Lexing.lexeme_start lexbuf,
+ Lexing.lexeme_end lexbuf));
+ OPTLABEL name }
+ | lowercase identchar *
+ { let s = Lexing.lexeme lexbuf in
+ try
+ Hashtbl.find keyword_table s
+ with Not_found ->
+ LIDENT s }
+ | uppercase identchar *
+ { UIDENT(Lexing.lexeme lexbuf) } (* No capitalized keywords *)
+ | decimal_literal | hex_literal | oct_literal | bin_literal
+ { INT (int_of_string(Lexing.lexeme lexbuf)) }
+ | float_literal
+ { FLOAT (Lexing.lexeme lexbuf) }
+ | "\""
+ { reset_string_buffer();
+ let string_start = Lexing.lexeme_start lexbuf in
+ string_start_pos := string_start;
+ string lexbuf;
+ lexbuf.Lexing.lex_start_pos <-
+ string_start - lexbuf.Lexing.lex_abs_pos;
+ STRING (get_stored_string()) }
+ | "'" [^ '\\' '\''] "'"
+ { CHAR(Lexing.lexeme_char lexbuf 1) }
+ | "'" '\\' ['\\' '\'' 'n' 't' 'b' 'r'] "'"
+ { CHAR(char_for_backslash (Lexing.lexeme_char lexbuf 2)) }
+ | "'" '\\' ['0'-'9'] ['0'-'9'] ['0'-'9'] "'"
+ { CHAR(char_for_decimal_code lexbuf 2) }
+ | "(*"
+ { comment_start_pos := [Lexing.lexeme_start lexbuf];
+ comment lexbuf;
+ token lexbuf }
+ | "(*)"
+ { let loc = { Location.loc_start = Lexing.lexeme_start lexbuf;
+ Location.loc_end = Lexing.lexeme_end lexbuf - 1;
+ Location.loc_ghost = false }
+ and warn = Warnings.Comment "the start of a comment"
+ in
+ Location.prerr_warning loc warn;
+ comment_start_pos := [Lexing.lexeme_start lexbuf];
+ comment lexbuf;
+ token lexbuf
+ }
+ | "*)"
+ { let loc = { Location.loc_start = Lexing.lexeme_start lexbuf;
+ Location.loc_end = Lexing.lexeme_end lexbuf;
+ Location.loc_ghost = false }
+ and warn = Warnings.Comment "not the end of a comment"
+ in
+ Location.prerr_warning loc warn;
+ lexbuf.Lexing.lex_curr_pos <- lexbuf.Lexing.lex_curr_pos - 1;
+ STAR
+ }
+ | "#" [' ' '\t']* ['0'-'9']+ [^ '\n' '\r'] * ('\n' | '\r' | "\r\n")
+ (* # linenum ... *)
+ { token lexbuf }
+ | "#" { SHARP }
+ | "&" { AMPERSAND }
+ | "&&" { AMPERAMPER }
+ | "`" { BACKQUOTE }
+ | "'" { QUOTE }
+ | "(" { LPAREN }
+ | ")" { RPAREN }
+ | "*" { STAR }
+ | "," { COMMA }
+ | "??" { QUESTION2 }
+ | "->" { MINUSGREATER }
+ | "." { DOT }
+ | ".." { DOTDOT }
+ | ":" { COLON }
+ | "::" { COLONCOLON }
+ | ":=" { COLONEQUAL }
+ | ":>" { COLONGREATER }
+ | ";" { SEMI }
+ | ";;" { SEMISEMI }
+ | "<" { LESS }
+ | "<-" { LESSMINUS }
+ | "=" { EQUAL }
+ | "[" { LBRACKET }
+ | "[|" { LBRACKETBAR }
+ | "[<" { LBRACKETLESS }
+ | "]" { RBRACKET }
+ | "{" { LBRACE }
+ | "{<" { LBRACELESS }
+ | "|" { BAR }
+ | "||" { BARBAR }
+ | "|]" { BARRBRACKET }
+ | ">" { GREATER }
+ | ">]" { GREATERRBRACKET }
+ | "}" { RBRACE }
+ | ">}" { GREATERRBRACE }
+
+ | "!=" { INFIXOP0 "!=" }
+ | "+" { PLUS }
+ | "-" { MINUS }
+ | "-." { MINUSDOT }
+
+ | "!" symbolchar *
+ { PREFIXOP(Lexing.lexeme lexbuf) }
+ | ['~' '?'] symbolchar +
+ { PREFIXOP(Lexing.lexeme lexbuf) }
+ | ['=' '<' '>' '|' '&' '$'] symbolchar *
+ { INFIXOP0(Lexing.lexeme lexbuf) }
+ | ['@' '^'] symbolchar *
+ { INFIXOP1(Lexing.lexeme lexbuf) }
+ | ['+' '-'] symbolchar *
+ { INFIXOP2(Lexing.lexeme lexbuf) }
+ | "**" symbolchar *
+ { INFIXOP4(Lexing.lexeme lexbuf) }
+ | ['*' '/' '%'] symbolchar *
+ { INFIXOP3(Lexing.lexeme lexbuf) }
+ | eof { EOF }
+ | _
+ { raise (Error(Illegal_character ((Lexing.lexeme lexbuf).[0]),
+ Lexing.lexeme_start lexbuf, Lexing.lexeme_end lexbuf)) }
+
+and comment = parse
+ "(*"
+ { comment_start_pos := Lexing.lexeme_start lexbuf :: !comment_start_pos;
+ comment lexbuf;
+ }
+ | "*)"
+ { match !comment_start_pos with
+ | [] -> assert false
+ | [x] -> comment_start_pos := [];
+ | _ :: l -> comment_start_pos := l;
+ comment lexbuf;
+ }
+ | "\""
+ { reset_string_buffer();
+ string_start_pos := Lexing.lexeme_start lexbuf;
+ begin try string lexbuf
+ with Error (Unterminated_string, _, _) ->
+ let st = List.hd !comment_start_pos in
+ raise (Error (Unterminated_string_in_comment, st, st + 2))
+ end;
+ string_buff := initial_string_buffer;
+ comment lexbuf }
+ | "''"
+ { comment lexbuf }
+ | "'" [^ '\\' '\''] "'"
+ { comment lexbuf }
+ | "'\\" ['\\' '\'' 'n' 't' 'b' 'r'] "'"
+ { comment lexbuf }
+ | "'\\" ['0'-'9'] ['0'-'9'] ['0'-'9'] "'"
+ { comment lexbuf }
+ | eof
+ { let st = List.hd !comment_start_pos in
+ raise (Error (Unterminated_comment, st, st + 2));
+ }
+ | _
+ { comment lexbuf }
+
+and string = parse
+ '"'
+ { () }
+ | '\\' ("\010" | "\013" | "\013\010") [' ' '\009'] *
+ { string lexbuf }
+ | '\\' ['\\' '"' 'n' 't' 'b' 'r']
+ { store_string_char(char_for_backslash(Lexing.lexeme_char lexbuf 1));
+ string lexbuf }
+ | '\\' ['0'-'9'] ['0'-'9'] ['0'-'9']
+ { store_string_char(char_for_decimal_code lexbuf 1);
+ string lexbuf }
+ | eof
+ { raise (Error (Unterminated_string,
+ !string_start_pos, !string_start_pos+1)) }
+ | _
+ { store_string_char(Lexing.lexeme_char lexbuf 0);
+ string lexbuf }