summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlain Frisch <alain@frisch.fr>2017-07-20 15:24:51 +0200
committerGitHub <noreply@github.com>2017-07-20 15:24:51 +0200
commit9c1927b0cbcd34c4ce766799721f6cde20f0ba30 (patch)
tree3154f6c7b58595c567ba79d9112e604165a0d98d
parentfd47ba9649f784c7d0772bdf282b8e7c8f1fe5aa (diff)
parentd6d1739c0b66c0f7cc662295d49804ae382d72b0 (diff)
downloadocaml-9c1927b0cbcd34c4ce766799721f6cde20f0ba30.tar.gz
Merge pull request #1232 from dbuenzli/string-unicode-escapes
Add Unicode character escape \u{H} to OCaml string literals.
-rw-r--r--Changes6
-rw-r--r--manual/manual/refman/lex.etex9
-rw-r--r--parsing/lexer.mll87
-rw-r--r--testsuite/tests/lexing/Makefile3
-rw-r--r--testsuite/tests/lexing/uchar_esc.ml34
-rw-r--r--testsuite/tests/lexing/uchar_esc.ml.reference36
6 files changed, 135 insertions, 40 deletions
diff --git a/Changes b/Changes
index 8376357080..136acee02c 100644
--- a/Changes
+++ b/Changes
@@ -9,6 +9,12 @@ Working version
can be used as a placeholder for a polymorphic function.
(Stephen Dolan)
+* GPR#1232: Support Unicode character escape sequences in string
+ literals via the \u{X+} syntax. These escapes are substituted by the
+ UTF-8 encoding of the Unicode character.
+ (Daniel Bünzli, review by Damien Doligez, Alain Frisch, Xavier
+ Leroy and Leo White)
+
- GPR#1249, MPR#6271, MPR#7529: Support "let open M in ..."
in class expressions and class type expressions.
(Alain Frisch, reviews by Thomas Refis and Jacques Garrigue)
diff --git a/manual/manual/refman/lex.etex b/manual/manual/refman/lex.etex
index 9ddce69b4d..fa24d1e468 100644
--- a/manual/manual/refman/lex.etex
+++ b/manual/manual/refman/lex.etex
@@ -151,13 +151,20 @@ string-literal:
string-character:
regular-string-char
| escape-sequence
+ | "\u{" ("0"\ldots"9"||"A"\ldots"F"||"a"\ldots"f")+ "}"
| '\' newline { space || tab }
\end{syntax}
String literals are delimited by @'"'@ (double quote) characters.
The two double quotes enclose a sequence of either characters
different from @'"'@ and @'\'@, or escape sequences from the
-table given above for character literals.
+table given above for character literals, or a Unicode character
+escape sequence.
+
+A Unicode character escape sequence is substituted by the UTF-8
+encoding of the specified Unicode scalar value. The Unicode scalar
+value, an integer in the ranges 0x0000...0xD7FF or 0xE000...0x10FFFF,
+is defined using 1 to 6 hexadecimal digits; leading zeros are allowed.
To allow splitting long string literals across lines, the sequence
"\\"\var{newline}~\var{spaces-or-tabs} (a backslash at the end of a line
diff --git a/parsing/lexer.mll b/parsing/lexer.mll
index 6936c8265d..178f3be852 100644
--- a/parsing/lexer.mll
+++ b/parsing/lexer.mll
@@ -99,35 +99,14 @@ let keyword_table =
(* To buffer string literals *)
-let initial_string_buffer = Bytes.create 256
-let string_buff = ref initial_string_buffer
-let string_index = ref 0
-
-let reset_string_buffer () =
- string_buff := initial_string_buffer;
- string_index := 0
-
-let store_string_char c =
- if !string_index >= Bytes.length !string_buff then begin
- let new_buff = Bytes.create (Bytes.length (!string_buff) * 2) in
- Bytes.blit !string_buff 0 new_buff 0 (Bytes.length !string_buff);
- string_buff := new_buff
- end;
- Bytes.unsafe_set !string_buff !string_index c;
- incr string_index
-
-let store_string s =
- for i = 0 to String.length s - 1 do
- store_string_char s.[i];
- done
-
-let store_lexeme lexbuf =
- store_string (Lexing.lexeme lexbuf)
-
-let get_stored_string () =
- let s = Bytes.sub_string !string_buff 0 !string_index in
- string_buff := initial_string_buffer;
- s
+let string_buffer = Buffer.create 256
+let reset_string_buffer () = Buffer.reset string_buffer
+let get_stored_string () = Buffer.contents string_buffer
+
+let store_string_char c = Buffer.add_char string_buffer c
+let store_string_utf_8_uchar u = Buffer.add_utf_8_uchar string_buffer u
+let store_string s = Buffer.add_string string_buffer s
+let store_lexeme lexbuf = store_string (Lexing.lexeme lexbuf)
(* To store the position of the beginning of a string and comment *)
let string_start_loc = ref Location.none;;
@@ -141,6 +120,9 @@ let print_warnings = ref true
let store_escaped_char lexbuf c =
if in_comment () then store_lexeme lexbuf else store_string_char c
+let store_escaped_uchar lexbuf u =
+ if in_comment () then store_lexeme lexbuf else store_string_utf_8_uchar u
+
let with_comment_buffer comment lexbuf =
let start_loc = Location.curr lexbuf in
comment_start_loc := [start_loc];
@@ -153,6 +135,21 @@ let with_comment_buffer comment lexbuf =
(* To translate escape sequences *)
+let hex_digit_value d = (* assert (d in '0'..'9' 'a'..'f' 'A'..'F') *)
+ let d = Char.code d in
+ if d >= 97 then d - 87 else
+ if d >= 65 then d - 55 else
+ d - 48
+
+let hex_num_value lexbuf ~first ~last =
+ let rec loop acc i = match i > last with
+ | true -> acc
+ | false ->
+ let value = hex_digit_value (Lexing.lexeme_char lexbuf i) in
+ loop (16 * acc + value) (i + 1)
+ in
+ loop 0 first
+
let char_for_backslash = function
| 'n' -> '\010'
| 'r' -> '\013'
@@ -178,17 +175,24 @@ let char_for_octal_code lexbuf i =
Char.chr c
let char_for_hexadecimal_code lexbuf i =
- let d1 = Char.code (Lexing.lexeme_char lexbuf i) in
- let val1 = if d1 >= 97 then d1 - 87
- else if d1 >= 65 then d1 - 55
- else d1 - 48
- in
- let d2 = Char.code (Lexing.lexeme_char lexbuf (i+1)) in
- let val2 = if d2 >= 97 then d2 - 87
- else if d2 >= 65 then d2 - 55
- else d2 - 48
+ let byte = hex_num_value lexbuf ~first:i ~last:(i+1) in
+ Char.chr byte
+
+let uchar_for_uchar_escape lexbuf =
+ let err e =
+ raise
+ (Error (Illegal_escape (Lexing.lexeme lexbuf ^ e), Location.curr lexbuf))
in
- Char.chr (val1 * 16 + val2)
+ let len = Lexing.lexeme_end lexbuf - Lexing.lexeme_start lexbuf in
+ let first = 3 (* skip opening \u{ *) in
+ let last = len - 2 (* skip closing } *) in
+ let digit_count = last - first + 1 in
+ match digit_count > 6 with
+ | true -> err ", too many digits, expected 1 to 6 hexadecimal digits"
+ | false ->
+ let cp = hex_num_value lexbuf ~first ~last in
+ if Uchar.is_valid cp then Uchar.unsafe_of_int cp else
+ err (", " ^ Printf.sprintf "%X" cp ^ " is not a Unicode scalar value")
(* recover the name from a LABEL or OPTLABEL token *)
@@ -290,6 +294,8 @@ let symbolchar =
['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~']
let decimal_literal =
['0'-'9'] ['0'-'9' '_']*
+let hex_digit =
+ ['0'-'9' 'A'-'F' 'a'-'f']
let hex_literal =
'0' ['x' 'X'] ['0'-'9' 'A'-'F' 'a'-'f']['0'-'9' 'A'-'F' 'a'-'f' '_']*
let oct_literal =
@@ -627,6 +633,9 @@ and string = parse
| '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F'] ['0'-'9' 'a'-'f' 'A'-'F']
{ store_escaped_char lexbuf (char_for_hexadecimal_code lexbuf 2);
string lexbuf }
+ | '\\' 'u' '{' hex_digit+ '}'
+ { store_escaped_uchar lexbuf (uchar_for_uchar_escape lexbuf);
+ string lexbuf }
| '\\' _
{ if not (in_comment ()) then begin
(* Should be an error, but we are very lax.
diff --git a/testsuite/tests/lexing/Makefile b/testsuite/tests/lexing/Makefile
new file mode 100644
index 0000000000..9625a3fbc3
--- /dev/null
+++ b/testsuite/tests/lexing/Makefile
@@ -0,0 +1,3 @@
+BASEDIR=../..
+include $(BASEDIR)/makefiles/Makefile.toplevel
+include $(BASEDIR)/makefiles/Makefile.common
diff --git a/testsuite/tests/lexing/uchar_esc.ml b/testsuite/tests/lexing/uchar_esc.ml
new file mode 100644
index 0000000000..b288ac2561
--- /dev/null
+++ b/testsuite/tests/lexing/uchar_esc.ml
@@ -0,0 +1,34 @@
+
+(* Correct escapes and their encoding *)
+
+let () =
+ assert ("\xF0\x9F\x90\xAB" = "\u{1F42B}");
+ assert ("\xF0\x9F\x90\xAB" = "\u{01F42B}");
+ assert ("\x00" = "\u{0}");
+ assert ("\x00" = "\u{00}");
+ assert ("\x00" = "\u{000}");
+ assert ("\x00" = "\u{0000}");
+ assert ("\x00" = "\u{00000}");
+ assert ("\x00" = "\u{000000}");
+ assert ("\xC3\xA9" = "\u{E9}");
+ assert ("\xC3\xA9" = "\u{0E9}");
+ assert ("\xC3\xA9" = "\u{00E9}");
+ assert ("\xC3\xA9" = "\u{000E9}");
+ assert ("\xC3\xA9" = "\u{0000E9}");
+ assert ("\xC3\xA9" = "\u{0000E9}");
+ assert ("\xF4\x8F\xBF\xBF" = "\u{10FFFF}");
+ ()
+;;
+
+
+(* Errors *)
+
+let invalid_sv = "\u{0D800}" ;;
+let invalid_sv = "\u{D800}" ;;
+let invalid_sv = "\u{D900}" ;;
+let invalid_sv = "\u{DFFF}" ;;
+let invalid_sv = "\u{110000} ;;
+
+let too_many_digits = "\u{01234567}" ;;
+let no_hex_digits = "\u{}" ;;
+let illegal_hex_digit = "\u{u}" ;;
diff --git a/testsuite/tests/lexing/uchar_esc.ml.reference b/testsuite/tests/lexing/uchar_esc.ml.reference
new file mode 100644
index 0000000000..8730059c99
--- /dev/null
+++ b/testsuite/tests/lexing/uchar_esc.ml.reference
@@ -0,0 +1,36 @@
+
+# # Characters 34-43:
+ let invalid_sv = "\u{0D800}" ;;
+ ^^^^^^^^^
+Error: Illegal backslash escape in string or character (\u{0D800}, D800 is not a Unicode scalar value)
+# Characters 18-26:
+ let invalid_sv = "\u{D800}" ;;
+ ^^^^^^^^
+Error: Illegal backslash escape in string or character (\u{D800}, D800 is not a Unicode scalar value)
+# Characters 18-26:
+ let invalid_sv = "\u{D900}" ;;
+ ^^^^^^^^
+Error: Illegal backslash escape in string or character (\u{D900}, D900 is not a Unicode scalar value)
+# Characters 18-26:
+ let invalid_sv = "\u{DFFF}" ;;
+ ^^^^^^^^
+Error: Illegal backslash escape in string or character (\u{DFFF}, DFFF is not a Unicode scalar value)
+# Characters 18-28:
+ let invalid_sv = "\u{110000} ;;
+ ^^^^^^^^^^
+Error: Illegal backslash escape in string or character (\u{110000}, 110000 is not a Unicode scalar value)
+# Characters 24-36:
+ let too_many_digits = "\u{01234567}" ;;
+ ^^^^^^^^^^^^
+Error: Illegal backslash escape in string or character (\u{01234567}, too many digits, expected 1 to 6 hexadecimal digits)
+# Characters 21-23:
+ let no_hex_digits = "\u{}" ;;
+ ^^
+Warning 14: illegal backslash escape in string.
+val no_hex_digits : string = "\\u{}"
+# Characters 25-27:
+ let illegal_hex_digit = "\u{u}" ;;
+ ^^
+Warning 14: illegal backslash escape in string.
+val illegal_hex_digit : string = "\\u{u}"
+#