diff options
author | Bruno Haible <bruno@clisp.org> | 2023-03-14 12:59:22 +0100 |
---|---|---|
committer | Bruno Haible <bruno@clisp.org> | 2023-03-14 13:10:10 +0100 |
commit | 0458d3efda00e9fd940675a8f64deda1ee5e688a (patch) | |
tree | e0fd4f2816af039f747a3f870be3eab210a805fa | |
parent | fa2fbdbcbf359dfb024467d698f2d8732527f97c (diff) | |
download | gettext-0458d3efda00e9fd940675a8f64deda1ee5e688a.tar.gz |
xgettext: In language Tcl, support \x, \u, \U escapes as specified in Tcl 8.6.
* gettext-tools/src/x-tcl.c: Update comments.
(phase1_pushback): Increase size to 5.
(do_getc_escaped): For \x, parse only up to 2 hexadecimal characters.
Handle '\U'.
(do_getc_escaped_low_surrogate): New function.
(accumulate_word): After reading a high surrogate, see if it is followed by a
low surrogate.
* gettext-tools/tests/xgettext-tcl-4: Change expected outcome for \x. Add test
cases for \u with surrogates and for \U.
* gettext-tools/tests/xgettext-tcl-5: Add more test cases.
* NEWS: Mention the change.
-rw-r--r-- | NEWS | 2 | ||||
-rw-r--r-- | gettext-tools/src/x-tcl.c | 151 | ||||
-rwxr-xr-x | gettext-tools/tests/xgettext-tcl-4 | 16 | ||||
-rwxr-xr-x | gettext-tools/tests/xgettext-tcl-5 | 50 |
4 files changed, 189 insertions, 30 deletions
@@ -15,6 +15,8 @@ Version 0.21.2 - February 2023 - C, C++: xgettext now supports gettext-like functions that take wide strings (of type 'const wchar_t *', 'const char16_t *', or 'const char32_t *') as arguments. + - Tcl: xgettext now supports the \x, \u, and \U escapes as defined in + Tcl 8.6. * xgettext: - The xgettext option '--sorted-output' is now deprecated. diff --git a/gettext-tools/src/x-tcl.c b/gettext-tools/src/x-tcl.c index 5fcfe4f85..182ece6e8 100644 --- a/gettext-tools/src/x-tcl.c +++ b/gettext-tools/src/x-tcl.c @@ -55,7 +55,8 @@ #define SIZEOF(a) (sizeof(a) / sizeof(a[0])) -/* The Tcl syntax is defined in the Tcl.n manual page. +/* The Tcl syntax is defined in the Tcl.n manual page, see + https://www.tcl-lang.org/man/tcl8.6/TclCmd/Tcl.htm . Summary of Tcl syntax: Like sh syntax, except that `...` is replaced with [...]. In detail: - In a preprocessing pass, backslash-newline-anywhitespace is replaced @@ -69,7 +70,7 @@ - The list of resulting words is split into commands by semicolon and newline. - '#' at the beginning of a command introduces a comment until end of line. - The parser is implemented in tcl8.3.3/generic/tclParse.c. */ + The parser is implemented in tcl8.6/generic/tclParse.c. */ /* ====================== Keyword set customization. ====================== */ @@ -174,7 +175,7 @@ do_ungetc (int c) /* An int that becomes a space when casted to 'unsigned char'. */ #define BS_NL (UCHAR_MAX + 1 + ' ') -static int phase1_pushback[1]; +static int phase1_pushback[5]; static int phase1_pushback_length; static int @@ -472,7 +473,7 @@ static int brace_nesting_depth; /* Read an escape sequence. The value is an ISO-8859-1 character (in the - range 0x00..0xff) or a Unicode character (in the range 0x0000..0xffff). */ + range 0x00..0xff) or a Unicode character (in the range 0x0000..0x10FFFF). */ static int do_getc_escaped () { @@ -499,14 +500,17 @@ do_getc_escaped () return '\v'; case 'x': { - int n = 0; + unsigned int n = 0; unsigned int i; - for (i = 0;; i++) + for (i = 0; i < 2; i++) { c = phase1_getc (); if (c == EOF || !c_isxdigit ((unsigned char) c)) - break; + { + phase1_ungetc (c); + break; + } if (c >= '0' && c <= '9') n = (n << 4) + (c - '0'); @@ -515,12 +519,11 @@ do_getc_escaped () else if (c >= 'a' && c <= 'f') n = (n << 4) + (c - 'a' + 10); } - phase1_ungetc (c); return (i > 0 ? (unsigned char) n : 'x'); } case 'u': { - int n = 0; + unsigned int n = 0; unsigned int i; for (i = 0; i < 4; i++) @@ -541,6 +544,29 @@ do_getc_escaped () } return (i > 0 ? n : 'u'); } + case 'U': + { + unsigned int n = 0; + unsigned int i; + + for (i = 0; i < 8; i++) + { + c = phase1_getc (); + if (c == EOF || !c_isxdigit ((unsigned char) c) || n >= 0x11000) + { + phase1_ungetc (c); + break; + } + + if (c >= '0' && c <= '9') + n = (n << 4) + (c - '0'); + else if (c >= 'A' && c <= 'F') + n = (n << 4) + (c - 'A' + 10); + else if (c >= 'a' && c <= 'f') + n = (n << 4) + (c - 'a' + 10); + } + return (i > 0 ? n : 'u'); + } case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { @@ -572,6 +598,58 @@ do_getc_escaped () } } +/* Read an escape sequence for a low surrogate Unicode character. + The value is in the range 0xDC00..0xDFFF. + Return -1 when none was seen. */ +static int +do_getc_escaped_low_surrogate () +{ + int c; + + c = phase1_getc (); + switch (c) + { + case 'u': + { + unsigned char buf[4]; + unsigned int n = 0; + unsigned int i; + + for (i = 0; i < 4; i++) + { + c = phase1_getc (); + if (c == EOF || !c_isxdigit ((unsigned char) c)) + { + phase1_ungetc (c); + while (i > 0) + phase1_ungetc (buf[--i]); + phase1_ungetc ('u'); + return -1; + } + + if (c >= '0' && c <= '9') + n = (n << 4) + (c - '0'); + else if (c >= 'A' && c <= 'F') + n = (n << 4) + (c - 'A' + 10); + else if (c >= 'a' && c <= 'f') + n = (n << 4) + (c - 'a' + 10); + } + if (n >= 0xdc00 && n <= 0xdfff) + return n; + else + { + while (i > 0) + phase1_ungetc (buf[--i]); + phase1_ungetc ('u'); + return -1; + } + } + default: + phase1_ungetc (c); + return -1; + } +} + enum terminator { @@ -699,31 +777,48 @@ accumulate_word (struct word *wp, enum terminator looking_for, } else if (c == '\\') { - unsigned int uc; - unsigned char utf8buf[6]; - int count; - int i; - - uc = do_getc_escaped (); - assert (uc < 0x10000); - count = u8_uctomb (utf8buf, uc, 6); - if (count < 0) + unsigned int uc = do_getc_escaped (); + assert (uc < 0x110000); + if (uc >= 0xd800 && uc <= 0xdfff) { + if (uc < 0xdc00) + { + /* Saw a high surrogate Unicode character. + Is it followed by a low surrogate Unicode character? */ + c = phase2_getc (); + if (c == '\\') + { + int uc2 = do_getc_escaped_low_surrogate (); + if (uc2 >= 0) + { + /* Saw a low surrogate Unicode character. */ + assert (uc2 >= 0xdc00 && uc2 <= 0xdfff); + uc = 0x10000 + ((uc - 0xd800) << 10) + (uc2 - 0xdc00); + goto saw_unicode_escape; + } + } + phase2_ungetc (c); + } error_with_progname = false; error (0, 0, _("%s:%d: warning: invalid Unicode character"), logical_file_name, line_number); error_with_progname = true; + goto done_escape; } - else - { - assert (count > 0); - if (wp->type == t_string) - for (i = 0; i < count; i++) - { - grow_token (wp->token); - wp->token->chars[wp->token->charcount++] = utf8buf[i]; - } - } + saw_unicode_escape: + { + unsigned char utf8buf[6]; + int count = u8_uctomb (utf8buf, uc, 6); + int i; + assert (count > 0); + if (wp->type == t_string) + for (i = 0; i < count; i++) + { + grow_token (wp->token); + wp->token->chars[wp->token->charcount++] = utf8buf[i]; + } + } + done_escape: ; } else { diff --git a/gettext-tools/tests/xgettext-tcl-4 b/gettext-tools/tests/xgettext-tcl-4 index d1582f6f9..2419def60 100755 --- a/gettext-tools/tests/xgettext-tcl-4 +++ b/gettext-tools/tests/xgettext-tcl-4 @@ -7,11 +7,17 @@ cat <<\EOF > xg-t-4.tcl puts [_ "Hello\u200e\u201cWorld\u201d"] puts [_ "x\u20y\x20z"] puts [_ "\xFF20"] +puts [_ "\UFF20"] +puts [_ "\uD83D\udc1c"] +# Does not work yet in Tcl 8.6: +# puts [_ "\U0001F41C"] EOF : ${XGETTEXT=xgettext} ${XGETTEXT} --add-comments --no-location -k_ -o xg-t-4.tmp xg-t-4.tcl 2>xg-t-4.err -test $? = 0 || { cat xg-t-4.err; Exit 1; } +result=$? +cat xg-t-4.err +test $result = 0 || Exit 1 func_filter_POT_Creation_Date xg-t-4.tmp xg-t-4.pot cat <<\EOF > xg-t-4.ok @@ -39,7 +45,13 @@ msgstr "" msgid "x y z" msgstr "" -msgid " " +msgid "ÿ20" +msgstr "" + +msgid "@" +msgstr "" + +msgid "🐜" msgstr "" EOF diff --git a/gettext-tools/tests/xgettext-tcl-5 b/gettext-tools/tests/xgettext-tcl-5 index 1c0ee9fdd..612f5cf21 100755 --- a/gettext-tools/tests/xgettext-tcl-5 +++ b/gettext-tools/tests/xgettext-tcl-5 @@ -11,6 +11,26 @@ cat <<\EOF > xg-t-5b.tcl puts [_ "\udc1c"] EOF +cat <<\EOF > xg-t-5c.tcl +puts [_ "\uD83D\n"] +EOF + +cat <<\EOF > xg-t-5d.tcl +puts [_ "\uD83D\u"] +EOF + +cat <<\EOF > xg-t-5e.tcl +puts [_ "\uD83D\u9843"] +EOF + +cat <<\EOF > xg-t-5f.tcl +puts [_ "\uD83D\ud913"] +EOF + +cat <<\EOF > xg-t-5g.tcl +puts [_ "\udc1c\ud83d"] +EOF + : ${XGETTEXT=xgettext} LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -k_ -d xg-t-5.tmp xg-t-5a.tcl 2>xg-t-5.err result=$? @@ -23,4 +43,34 @@ result=$? cat xg-t-5.err test $result = 0 || Exit 1 +: ${XGETTEXT=xgettext} +LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -k_ -d xg-t-5.tmp xg-t-5c.tcl 2>xg-t-5.err +result=$? +cat xg-t-5.err +test $result = 0 || Exit 1 + +: ${XGETTEXT=xgettext} +LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -k_ -d xg-t-5.tmp xg-t-5d.tcl 2>xg-t-5.err +result=$? +cat xg-t-5.err +test $result = 0 || Exit 1 + +: ${XGETTEXT=xgettext} +LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -k_ -d xg-t-5.tmp xg-t-5e.tcl 2>xg-t-5.err +result=$? +cat xg-t-5.err +test $result = 0 || Exit 1 + +: ${XGETTEXT=xgettext} +LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -k_ -d xg-t-5.tmp xg-t-5f.tcl 2>xg-t-5.err +result=$? +cat xg-t-5.err +test $result = 0 || Exit 1 + +: ${XGETTEXT=xgettext} +LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -k_ -d xg-t-5.tmp xg-t-5g.tcl 2>xg-t-5.err +result=$? +cat xg-t-5.err +test $result = 0 || Exit 1 + exit 0 |