Merge pull request #1232 from dbuenzli/string-unicode-escapes

Add Unicode character escape \u{H} to OCaml string literals.
author: Alain Frisch <alain@frisch.fr> 2017-07-20 15:24:51 +0200
committer: GitHub <noreply@github.com> 2017-07-20 15:24:51 +0200
commit: 9c1927b0cbcd34c4ce766799721f6cde20f0ba30 (patch)
tree: 3154f6c7b58595c567ba79d9112e604165a0d98d
parent: fd47ba9649f784c7d0772bdf282b8e7c8f1fe5aa (diff)
parent: d6d1739c0b66c0f7cc662295d49804ae382d72b0 (diff)
download: ocaml-9c1927b0cbcd34c4ce766799721f6cde20f0ba30.tar.gz
6 files changed, 135 insertions, 40 deletions
diff --git a/Changes b/Changes
index 8376357080..136acee02c 100644
--- a/Changes
+++ b/Changes
@@ -9,6 +9,12 @@ Working version
   can be used as a placeholder for a polymorphic function.
   (Stephen Dolan)
 
+* GPR#1232: Support Unicode character escape sequences in string
+  literals via the \u{X+} syntax. These escapes are substituted by the
+  UTF-8 encoding of the Unicode character.
+  (Daniel Bünzli, review by Damien Doligez, Alain Frisch, Xavier
+  Leroy and Leo White)
+
 - GPR#1249, MPR#6271, MPR#7529: Support "let open M in ..."
   in class expressions and class type expressions.
   (Alain Frisch, reviews by Thomas Refis and Jacques Garrigue)
diff --git a/manual/manual/refman/lex.etex b/manual/manual/refman/lex.etex
index 9ddce69b4d..fa24d1e468 100644
--- a/manual/manual/refman/lex.etex
+++ b/manual/manual/refman/lex.etex
@@ -151,13 +151,20 @@ string-literal:
 string-character:
           regular-string-char
         | escape-sequence
+        | "\u{" ("0"\ldots"9"||"A"\ldots"F"||"a"\ldots"f")+ "}"
         | '\' newline { space || tab }
 \end{syntax}
 
 String literals are delimited by @'"'@ (double quote) characters.
 The two double quotes enclose a sequence of either characters
 different from @'"'@ and @'\'@, or escape sequences from the
-table given above for character literals.
+table given above for character literals, or a Unicode character
+escape sequence.
+
+A Unicode character escape sequence is substituted by the UTF-8
+encoding of the specified Unicode scalar value. The Unicode scalar
+value, an integer in the ranges 0x0000...0xD7FF or 0xE000...0x10FFFF,
+is defined using 1 to 6 hexadecimal digits; leading zeros are allowed.
 
 To allow splitting long string literals across lines, the sequence
 "\\"\var{newline}~\var{spaces-or-tabs} (a backslash at the end of a line
diff --git a/parsing/lexer.mll b/parsing/lexer.mll
index 6936c8265d..178f3be852 100644
--- a/parsing/lexer.mll
+++ b/parsing/lexer.mll
@@ -99,35 +99,14 @@ let keyword_table =
 
 (* To buffer string literals *)
 
-let initial_string_buffer = Bytes.create 256
-let string_buff = ref initial_string_buffer
-let string_index = ref 0
-
-let reset_string_buffer () =
-  string_buff := initial_string_buffer;
-  string_index := 0
-
-let store_string_char c =
-  if !string_index >= Bytes.length !string_buff then begin
-    let new_buff = Bytes.create (Bytes.length (!string_buff) * 2) in
-    Bytes.blit !string_buff 0 new_buff 0 (Bytes.length !string_buff);
-    string_buff := new_buff
-  end;
-  Bytes.unsafe_set !string_buff !string_index c;
-  incr string_index
-
-let store_string s =
-  for i = 0 to String.length s - 1 do
-    store_string_char s.[i];
-  done
-
-let store_lexeme lexbuf =
-  store_string (Lexing.lexeme lexbuf)
-
-let get_stored_string () =
-  let s = Bytes.sub_string !string_buff 0 !string_index in
-  string_buff := initial_string_buffer;
-  s
+let string_buffer = Buffer.create 256
+let reset_string_buffer () = Buffer.reset string_buffer
+let get_stored_string () = Buffer.contents string_buffer
+
+let store_string_char c = Buffer.add_char string_buffer c
+let store_string_utf_8_uchar u = Buffer.add_utf_8_uchar string_buffer u
+let store_string s = Buffer.add_string string_buffer s
+let store_lexeme lexbuf = store_string (Lexing.lexeme lexbuf)
 
 (* To store the position of the beginning of a string and comment *)
 let string_start_loc = ref Location.none;;
@@ -141,6 +120,9 @@ let print_warnings = ref true
 let store_escaped_char lexbuf c =
   if in_comment () then store_lexeme lexbuf else store_string_char c
 
+let store_escaped_uchar lexbuf u =
+  if in_comment () then store_lexeme lexbuf else store_string_utf_8_uchar u
+
 let with_comment_buffer comment lexbuf =
   let start_loc = Location.curr lexbuf  in
   comment_start_loc := [start_loc];
@@ -153,6 +135,21 @@ let with_comment_buffer comment lexbuf =
 
 (* To translate escape sequences *)
 
+let hex_digit_value d = (* assert (d in '0'..'9' 'a'..'f' 'A'..'F') *)
+  let d = Char.code d in
+  if d >= 97 then d - 87 else
+  if d >= 65 then d - 55 else
+  d - 48
+
+let hex_num_value lexbuf ~first ~last =
+  let rec loop acc i = match i > last with
+  | true -> acc
+  | false ->
+      let value = hex_digit_value (Lexing.lexeme_char lexbuf i) in
+      loop (16 * acc + value) (i + 1)
+  in
+  loop 0 first
+
 let char_for_backslash = function
   | 'n' -> '\010'
   | 'r' -> '\013'
@@ -178,17 +175,24 @@ let char_for_octal_code lexbuf i =
   Char.chr c
 
 let char_for_hexadecimal_code lexbuf i =
-  let d1 = Char.code (Lexing.lexeme_char lexbuf i) in
-  let val1 = if d1 >= 97 then d1 - 87
-             else if d1 >= 65 then d1 - 55
-             else d1 - 48
-  in
-  let d2 = Char.code (Lexing.lexeme_char lexbuf (i+1)) in
-  let val2 = if d2 >= 97 then d2 - 87
-             else if d2 >= 65 then d2 - 55
-             else d2 - 48
+  let byte = hex_num_value lexbuf ~first:i ~last:(i+1) in
+  Char.chr byte
+
+let uchar_for_uchar_escape lexbuf =
+  let err e =
+    raise
+      (Error (Illegal_escape (Lexing.lexeme lexbuf ^ e), Location.curr lexbuf))
   in
-  Char.chr (val1 * 16 + val2)
+  let len = Lexing.lexeme_end lexbuf - Lexing.lexeme_start lexbuf in
+  let first = 3 (* skip opening \u{ *) in
+  let last = len - 2 (* skip closing } *) in
+  let digit_count = last - first + 1 in
+  match digit_count > 6 with
+  | true -> err ", too many digits, expected 1 to 6 hexadecimal digits"
+  | false ->
+      let cp = hex_num_value lexbuf ~first ~last in
+      if Uchar.is_valid cp then Uchar.unsafe_of_int cp else
+      err (", " ^ Printf.sprintf "%X" cp ^ " is not a Unicode scalar value")
 
 (* recover the name from a LABEL or OPTLABEL token *)
 
@@ -290,6 +294,8 @@ let symbolchar =
   ['!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~']
 let decimal_literal =
   ['0'-'9'] ['0'-'9' '_']*
+let hex_digit =
+  ['0'-'9' 'A'-'F' 'a'-'f']
 let hex_literal =
   '0' ['x' 'X'] ['0'-'9' 'A'-'F' 'a'-'f']['0'-'9' 'A'-'F' 'a'-'f' '_']*
 let oct_literal =
@@ -627,6 +633,9 @@ and string = parse
   | '\\' 'x' ['0'-'9' 'a'-'f' 'A'-'F'] ['0'-'9' 'a'-'f' 'A'-'F']
       { store_escaped_char lexbuf (char_for_hexadecimal_code lexbuf 2);
          string lexbuf }
+  | '\\' 'u' '{' hex_digit+ '}'
+        { store_escaped_uchar lexbuf (uchar_for_uchar_escape lexbuf);
+          string lexbuf }
   | '\\' _
       { if not (in_comment ()) then begin
 (*  Should be an error, but we are very lax.
diff --git a/testsuite/tests/lexing/Makefile b/testsuite/tests/lexing/Makefile
new file mode 100644
index 0000000000..9625a3fbc3
--- /dev/null
+++ b/testsuite/tests/lexing/Makefile
@@ -0,0 +1,3 @@
+BASEDIR=../..
+include $(BASEDIR)/makefiles/Makefile.toplevel
+include $(BASEDIR)/makefiles/Makefile.common
diff --git a/testsuite/tests/lexing/uchar_esc.ml b/testsuite/tests/lexing/uchar_esc.ml
new file mode 100644
index 0000000000..b288ac2561
--- /dev/null
+++ b/testsuite/tests/lexing/uchar_esc.ml
@@ -0,0 +1,34 @@
+
+(* Correct escapes and their encoding *)
+
+let () =
+  assert ("\xF0\x9F\x90\xAB" = "\u{1F42B}");
+  assert ("\xF0\x9F\x90\xAB" = "\u{01F42B}");
+  assert ("\x00" = "\u{0}");
+  assert ("\x00" = "\u{00}");
+  assert ("\x00" = "\u{000}");
+  assert ("\x00" = "\u{0000}");
+  assert ("\x00" = "\u{00000}");
+  assert ("\x00" = "\u{000000}");
+  assert ("\xC3\xA9" = "\u{E9}");
+  assert ("\xC3\xA9" = "\u{0E9}");
+  assert ("\xC3\xA9" = "\u{00E9}");
+  assert ("\xC3\xA9" = "\u{000E9}");
+  assert ("\xC3\xA9" = "\u{0000E9}");
+  assert ("\xC3\xA9" = "\u{0000E9}");
+  assert ("\xF4\x8F\xBF\xBF" = "\u{10FFFF}");
+  ()
+;;
+
+
+(* Errors *)
+
+let invalid_sv = "\u{0D800}" ;;
+let invalid_sv = "\u{D800}" ;;
+let invalid_sv = "\u{D900}" ;;
+let invalid_sv = "\u{DFFF}" ;;
+let invalid_sv = "\u{110000} ;;
+
+let too_many_digits = "\u{01234567}" ;;
+let no_hex_digits = "\u{}" ;;
+let illegal_hex_digit = "\u{u}" ;;
diff --git a/testsuite/tests/lexing/uchar_esc.ml.reference b/testsuite/tests/lexing/uchar_esc.ml.reference
new file mode 100644
index 0000000000..8730059c99
--- /dev/null
+++ b/testsuite/tests/lexing/uchar_esc.ml.reference
@@ -0,0 +1,36 @@
+
+#                                         #         Characters 34-43:
+  let invalid_sv = "\u{0D800}" ;;
+                    ^^^^^^^^^
+Error: Illegal backslash escape in string or character (\u{0D800}, D800 is not a Unicode scalar value)
+# Characters 18-26:
+  let invalid_sv = "\u{D800}" ;;
+                    ^^^^^^^^
+Error: Illegal backslash escape in string or character (\u{D800}, D800 is not a Unicode scalar value)
+# Characters 18-26:
+  let invalid_sv = "\u{D900}" ;;
+                    ^^^^^^^^
+Error: Illegal backslash escape in string or character (\u{D900}, D900 is not a Unicode scalar value)
+# Characters 18-26:
+  let invalid_sv = "\u{DFFF}" ;;
+                    ^^^^^^^^
+Error: Illegal backslash escape in string or character (\u{DFFF}, DFFF is not a Unicode scalar value)
+# Characters 18-28:
+  let invalid_sv = "\u{110000} ;;
+                    ^^^^^^^^^^
+Error: Illegal backslash escape in string or character (\u{110000}, 110000 is not a Unicode scalar value)
+#   Characters 24-36:
+  let too_many_digits = "\u{01234567}" ;;
+                         ^^^^^^^^^^^^
+Error: Illegal backslash escape in string or character (\u{01234567}, too many digits, expected 1 to 6 hexadecimal digits)
+# Characters 21-23:
+  let no_hex_digits = "\u{}" ;;
+                       ^^
+Warning 14: illegal backslash escape in string.
+val no_hex_digits : string = "\\u{}"
+# Characters 25-27:
+  let illegal_hex_digit = "\u{u}" ;;
+                           ^^
+Warning 14: illegal backslash escape in string.
+val illegal_hex_digit : string = "\\u{u}"
+#
author	Alain Frisch <alain@frisch.fr>	2017-07-20 15:24:51 +0200
committer	GitHub <noreply@github.com>	2017-07-20 15:24:51 +0200
commit	9c1927b0cbcd34c4ce766799721f6cde20f0ba30 (patch)
tree	3154f6c7b58595c567ba79d9112e604165a0d98d
parent	fd47ba9649f784c7d0772bdf282b8e7c8f1fe5aa (diff)
parent	d6d1739c0b66c0f7cc662295d49804ae382d72b0 (diff)
download	ocaml-9c1927b0cbcd34c4ce766799721f6cde20f0ba30.tar.gz