diff options
author | Alain Frisch <alain@frisch.fr> | 2017-07-20 15:24:51 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-07-20 15:24:51 +0200 |
commit | 9c1927b0cbcd34c4ce766799721f6cde20f0ba30 (patch) | |
tree | 3154f6c7b58595c567ba79d9112e604165a0d98d | |
parent | fd47ba9649f784c7d0772bdf282b8e7c8f1fe5aa (diff) | |
parent | d6d1739c0b66c0f7cc662295d49804ae382d72b0 (diff) | |
download | ocaml-9c1927b0cbcd34c4ce766799721f6cde20f0ba30.tar.gz |
Merge pull request #1232 from dbuenzli/string-unicode-escapes
Add Unicode character escape \u{H} to OCaml string literals.
-rw-r--r-- | Changes | 6 | ||||
-rw-r--r-- | manual/manual/refman/lex.etex | 9 | ||||
-rw-r--r-- | parsing/lexer.mll | 87 | ||||
-rw-r--r-- | testsuite/tests/lexing/Makefile | 3 | ||||
-rw-r--r-- | testsuite/tests/lexing/uchar_esc.ml | 34 | ||||
-rw-r--r-- | testsuite/tests/lexing/uchar_esc.ml.reference | 36 |
6 files changed, 135 insertions, 40 deletions
@@ -9,6 +9,12 @@ Working version can be used as a placeholder for a polymorphic function. (Stephen Dolan) +* GPR#1232: Support Unicode character escape sequences in string + literals via the \u{X+} syntax. These escapes are substituted by the + UTF-8 encoding of the Unicode character. + (Daniel Bünzli, review by Damien Doligez, Alain Frisch, Xavier + Leroy and Leo White) + - GPR#1249, MPR#6271, MPR#7529: Support "let open M in ..." in class expressions and class type expressions. (Alain Frisch, reviews by Thomas Refis and Jacques Garrigue) diff --git a/manual/manual/refman/lex.etex b/manual/manual/refman/lex.etex index 9ddce69b4d..fa24d1e468 100644 --- a/manual/manual/refman/lex.etex +++ b/manual/manual/refman/lex.etex @@ -151,13 +151,20 @@ string-literal: string-character: regular-string-char | escape-sequence + | "\u{" ("0"\ldots"9"||"A"\ldots"F"||"a"\ldots"f")+ "}" | '\' newline { space || tab } \end{syntax} String literals are delimited by @'"'@ (double quote) characters. The two double quotes enclose a sequence of either characters different from @'"'@ and @'\'@, or escape sequences from the -table given above for character literals. +table given above for character literals, or a Unicode character +escape sequence. + +A Unicode character escape sequence is substituted by the UTF-8 +encoding of the specified Unicode scalar value. The Unicode scalar +value, an integer in the ranges 0x0000...0xD7FF or 0xE000...0x10FFFF, +is defined using 1 to 6 hexadecimal digits; leading zeros are allowed. To allow splitting long string literals across lines, the sequence "\\"\var{newline}~\var{spaces-or-tabs} (a backslash at the end of a line diff --git a/parsing/lexer.mll b/parsing/lexer.mll index 6936c8265d..178f3be852 100644 --- a/parsing/lexer.mll +++ b/parsing/lexer.mll @@ -99,35 +99,14 @@ let keyword_table = (* To buffer string literals *) -let initial_string_buffer = Bytes.create 256 -let string_buff = ref initial_string_buffer -let string_index = ref 0 - -let reset_string_buffer () = - string_buff := initial_string_buffer; - string_index := 0 - -let store_string_char c = - if !string_index >= Bytes.length !string_buff then begin - let new_buff = Bytes.create (Bytes.length (!string_buff) * 2) in - Bytes.blit !string_buff 0 new_buff 0 (Bytes.length !string_buff); - string_buff := new_buff - end; - Bytes.unsafe_set !string_buff !string_index c; - incr string_index - -let store_string s = - for i = 0 to String.length s - 1 do - store_string_char s.[i]; - done - -let store_lexeme lexbuf = - store_string (Lexing.lexeme lexbuf) - -let get_stored_string () = - let s = Bytes.sub_string !string_buff 0 !string_index in - string_buff := initial_string_buffer; - s +let string_buffer = Buffer.create 256 +let reset_string_buffer () = Buffer.reset string_buffer +let get_stored_string () = Buffer.contents string_buffer + +let store_string_char c = Buffer.add_char string_buffer c +let store_string_utf_8_uchar u = Buffer.add_utf_8_uchar string_buffer u +let store_string s = Buffer.add_string string_buffer s +let store_lexeme lexbuf = store_string (Lexing.lexeme lexbuf) (* To store the position of the beginning of a string and comment *) let string_start_loc = ref Location.none;; @@ -141,6 +120,9 @@ let print_warnings = ref true let store_escaped_char lexbuf c = if in_comment () then store_lexeme lexbuf else store_string_char c +let store_escaped_uchar lexbuf u = + if in_comment () then store_lexeme lexbuf else store_string_utf_8_uchar u + let with_comment_buffer comment lexbuf = let start_loc = Location.curr lexbuf in comment_start_loc := [start_loc]; @@ -153,6 +135,21 @@ let with_comment_buffer comment lexbuf = (* To translate escape sequences *) +let hex_digit_value d = (* assert (d in '0'..'9' 'a'..'f' 'A'..'F') *) + let d = Char.code d in + if d >= 97 then d - 87 else + if d >= 65 then d - 55 else + d - 48 + +let hex_num_value lexbuf ~first ~last = + let rec loop acc i = match i > last with + | true -> acc + | false -> + let value = hex_digit_value (Lexing.lexeme_char lexbuf i) in + loop (16 * acc + value) (i + 1) + in + loop 0 first + let char_for_backslash = function | 'n' -> '\010' | 'r' -> '\013' @@ -178,17 +175,24 @@ let char_for_octal_code lexbuf i = Char.chr c let char_for_hexadecimal_code lexbuf i = - let d1 = Char.code (Lexing.lexeme_char lexbuf i) in - let val1 = if d1 >= 97 then d1 - 87 - else if d1 >= 65 then d1 - 55 - else d1 - 48 - in - let d2 = Char.code (Lexing.lexeme_char lexbuf (i+1)) in - let val2 = if d2 >= 97 then d2 - 87 - else if d2 >= 65 then d2 - 55 - else d2 - 48 + let byte = hex_num_value lexbuf ~first:i ~last:(i+1) in + Char.chr byte + +let uchar_for_uchar_escape lexbuf = + let err e = + raise + (Error (Illegal_escape (Lexing.lexeme lexbuf ^ e), Location.curr lexbuf)) in - Char.chr (val1 * 16 + val2) + let len = Lexing.lexeme_end lexbuf - Lexing.lexeme_start lexbuf in + let first = 3 (* skip opening \u{ *) in + let last = len - 2 (* skip closing } *) in + let digit_count = last - first + 1 in + match digit_count > 6 with + | true -> err ", too many digits, expected 1 to 6 hexadecimal digits" + | false -> + let cp = hex_num_value lexbuf ~first ~last in + if Uchar.is_valid cp then Uchar.unsafe_of_int cp else + err (", " ^ Printf.sprintf "%X" cp ^ " is not a Unicode scalar value") (* recover the name from a LABEL or OPTLABEL token *) @@ -290,6 +294,8 @@ let symbolchar = ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~'] let decimal_literal = ['0'-'9'] ['0'-'9' '_']* +let hex_digit = + ['0'-'9' 'A'-'F' 'a'-'f'] let hex_literal = '0' ['x' 'X'] ['0'-'9' 'A'-'F' 'a'-'f']['0'-'9' 'A'-'F' 'a'-'f' '_']* let oct_literal = @@ -627,6 +633,9 @@ and string = parse | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F'] ['0'-'9' 'a'-'f' 'A'-'F'] { store_escaped_char lexbuf (char_for_hexadecimal_code lexbuf 2); string lexbuf } + | '\\' 'u' '{' hex_digit+ '}' + { store_escaped_uchar lexbuf (uchar_for_uchar_escape lexbuf); + string lexbuf } | '\\' _ { if not (in_comment ()) then begin (* Should be an error, but we are very lax. diff --git a/testsuite/tests/lexing/Makefile b/testsuite/tests/lexing/Makefile new file mode 100644 index 0000000000..9625a3fbc3 --- /dev/null +++ b/testsuite/tests/lexing/Makefile @@ -0,0 +1,3 @@ +BASEDIR=../.. +include $(BASEDIR)/makefiles/Makefile.toplevel +include $(BASEDIR)/makefiles/Makefile.common diff --git a/testsuite/tests/lexing/uchar_esc.ml b/testsuite/tests/lexing/uchar_esc.ml new file mode 100644 index 0000000000..b288ac2561 --- /dev/null +++ b/testsuite/tests/lexing/uchar_esc.ml @@ -0,0 +1,34 @@ + +(* Correct escapes and their encoding *) + +let () = + assert ("\xF0\x9F\x90\xAB" = "\u{1F42B}"); + assert ("\xF0\x9F\x90\xAB" = "\u{01F42B}"); + assert ("\x00" = "\u{0}"); + assert ("\x00" = "\u{00}"); + assert ("\x00" = "\u{000}"); + assert ("\x00" = "\u{0000}"); + assert ("\x00" = "\u{00000}"); + assert ("\x00" = "\u{000000}"); + assert ("\xC3\xA9" = "\u{E9}"); + assert ("\xC3\xA9" = "\u{0E9}"); + assert ("\xC3\xA9" = "\u{00E9}"); + assert ("\xC3\xA9" = "\u{000E9}"); + assert ("\xC3\xA9" = "\u{0000E9}"); + assert ("\xC3\xA9" = "\u{0000E9}"); + assert ("\xF4\x8F\xBF\xBF" = "\u{10FFFF}"); + () +;; + + +(* Errors *) + +let invalid_sv = "\u{0D800}" ;; +let invalid_sv = "\u{D800}" ;; +let invalid_sv = "\u{D900}" ;; +let invalid_sv = "\u{DFFF}" ;; +let invalid_sv = "\u{110000} ;; + +let too_many_digits = "\u{01234567}" ;; +let no_hex_digits = "\u{}" ;; +let illegal_hex_digit = "\u{u}" ;; diff --git a/testsuite/tests/lexing/uchar_esc.ml.reference b/testsuite/tests/lexing/uchar_esc.ml.reference new file mode 100644 index 0000000000..8730059c99 --- /dev/null +++ b/testsuite/tests/lexing/uchar_esc.ml.reference @@ -0,0 +1,36 @@ + +# # Characters 34-43: + let invalid_sv = "\u{0D800}" ;; + ^^^^^^^^^ +Error: Illegal backslash escape in string or character (\u{0D800}, D800 is not a Unicode scalar value) +# Characters 18-26: + let invalid_sv = "\u{D800}" ;; + ^^^^^^^^ +Error: Illegal backslash escape in string or character (\u{D800}, D800 is not a Unicode scalar value) +# Characters 18-26: + let invalid_sv = "\u{D900}" ;; + ^^^^^^^^ +Error: Illegal backslash escape in string or character (\u{D900}, D900 is not a Unicode scalar value) +# Characters 18-26: + let invalid_sv = "\u{DFFF}" ;; + ^^^^^^^^ +Error: Illegal backslash escape in string or character (\u{DFFF}, DFFF is not a Unicode scalar value) +# Characters 18-28: + let invalid_sv = "\u{110000} ;; + ^^^^^^^^^^ +Error: Illegal backslash escape in string or character (\u{110000}, 110000 is not a Unicode scalar value) +# Characters 24-36: + let too_many_digits = "\u{01234567}" ;; + ^^^^^^^^^^^^ +Error: Illegal backslash escape in string or character (\u{01234567}, too many digits, expected 1 to 6 hexadecimal digits) +# Characters 21-23: + let no_hex_digits = "\u{}" ;; + ^^ +Warning 14: illegal backslash escape in string. +val no_hex_digits : string = "\\u{}" +# Characters 25-27: + let illegal_hex_digit = "\u{u}" ;; + ^^ +Warning 14: illegal backslash escape in string. +val illegal_hex_digit : string = "\\u{u}" +# |