From e23e8bc1957a5981b8a507b62471ae38ec06c661 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 12 Sep 2016 13:38:22 -0600 Subject: Add macro for determining if UTF-8 is Unicode-strict --- regen/regcharclass.pl | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'regen/regcharclass.pl') diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index bd677acd15..abc4942354 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -1660,6 +1660,49 @@ SURROGATE: Surrogate code points #=> UTF8 :no_length_checks only_ebcdic_platform #0xA0 - 0x1FFFFF +#STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code points, no surrogates nor non-character code points +#=> UTF8 :no_length_checks only_ascii_platform +#0x0080 - 0xD7FF +#0xE000 - 0xFDCF +#0xFDF0 - 0xFFFD +#0x10000 - 0x1FFFD +#0x20000 - 0x2FFFD +#0x30000 - 0x3FFFD +#0x40000 - 0x4FFFD +#0x50000 - 0x5FFFD +#0x60000 - 0x6FFFD +#0x70000 - 0x7FFFD +#0x80000 - 0x8FFFD +#0x90000 - 0x9FFFD +#0xA0000 - 0xAFFFD +#0xB0000 - 0xBFFFD +#0xC0000 - 0xCFFFD +#0xD0000 - 0xDFFFD +#0xE0000 - 0xEFFFD +#0xF0000 - 0xFFFFD +#0x100000 - 0x10FFFD +# +#STRICT_UTF8_CHAR: Matches legal Unicode UTF-8 variant code points, no surrogates nor non-character code points +#=> UTF8 :no_length_checks only_ebcdic_platform +#0x00A0 - 0xD7FF +#0xE000 - 0xFDCF +#0xFDF0 - 0xFFFD +#0x10000 - 0x1FFFD +#0x20000 - 0x2FFFD +#0x30000 - 0x3FFFD +#0x40000 - 0x4FFFD +#0x50000 - 0x5FFFD +#0x60000 - 0x6FFFD +#0x70000 - 0x7FFFD +#0x80000 - 0x8FFFD +#0x90000 - 0x9FFFD +#0xA0000 - 0xAFFFD +#0xB0000 - 0xBFFFD +#0xC0000 - 0xCFFFD +#0xD0000 - 0xDFFFD +#0xE0000 - 0xEFFFD +#0xF0000 - 0xFFFFD +#0x100000 - 0x10FFFD QUOTEMETA: Meta-characters that \Q should quote => high :fast -- cgit v1.2.1