summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--regcharclass.h21
-rwxr-xr-xregen/regcharclass.pl12
-rw-r--r--utf8.h60
3 files changed, 42 insertions, 51 deletions
diff --git a/regcharclass.h b/regcharclass.h
index f4a7e083dc..ebda2f7a11 100644
--- a/regcharclass.h
+++ b/regcharclass.h
@@ -1009,14 +1009,13 @@
: ( ( 0x72 == ((U8*)s)[2] ) && ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x47 ) ) ? 4 : 0 ) : 0 )
/*
- UTF8_CHAR: Matches legal UTF-EBCDIC encoded characters from 1 through 3 bytes
+ UTF8_CHAR: Matches legal UTF-EBCDIC encoded characters from 2 through 3 bytes
- 0x0 - 0x3FFF
+ 0xA0 - 0x3FFF
*/
/*** GENERATED CODE ***/
#define is_UTF8_CHAR_utf8_no_length_checks(s) \
-( ( ( ( ((U8*)s)[0] & 0xC0 ) == 0x00 ) || ( ( ((U8*)s)[0] & 0xEF ) == 0x40 ) || ( ( ((U8*)s)[0] & 0xDF ) == 0x4B ) || ( ( ((U8*)s)[0] & 0xCC ) == 0x4C ) || ( ( ((U8*)s)[0] & 0xDE ) == 0x5A ) || ( ( ((U8*)s)[0] & 0xFE ) == 0x60 ) || ((U8*)s)[0] == 0x79 || ( ( ((U8*)s)[0] & 0xEF ) == 0x81 ) || ( ( ((U8*)s)[0] & 0xEE ) == 0x82 ) || ( ( ((U8*)s)[0] & 0xEC ) == 0x84 ) || ( ( ((U8*)s)[0] & 0xEE ) == 0x88 ) || ((U8*)s)[0] == 0xA1 || ( ( ((U8*)s)[0] & 0xBE ) == 0xA2 ) || ( ( ((U8*)s)[0] & 0xBC ) == 0xA4 ) || ( ( ((U8*)s)[0] & 0xFE ) == 0xA8 ) || ( ( ((U8*)s)[0] & 0xEF ) == 0xAD ) || ( ( ((U8*)s)[0] & 0xE8 ) == 0xC0 ) || ( ( ((U8*)s)[0] & 0xCE ) == 0xC8 ) || ((U8*)s)[0] == 0xE0 || ( ( ((U8*)s)[0] & 0xF8 ) == 0xF0 ) || ((U8*)s)[0] == 0xFF ) ? 1\
-: ( 0x80 == ((U8*)s)[0] || ( 0x8A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x90 ) || ( 0x9A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xA0 ) || ( 0xAA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xAC ) || ( 0xAE <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xB6 ) ) ?\
+( ( 0x80 == ((U8*)s)[0] || ( 0x8A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x90 ) || ( 0x9A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xA0 ) || ( 0xAA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xAC ) || ( 0xAE <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xB6 ) ) ?\
( ( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( ((U8*)s)[1] & 0xFC ) == 0x70 ) ? 2 : 0 )\
: ( ( ( ( ( ((U8*)s)[0] & 0xFC ) == 0xB8 ) || ((U8*)s)[0] == 0xBC || ( ( ((U8*)s)[0] & 0xFE ) == 0xBE ) || ( ( ((U8*)s)[0] & 0xEE ) == 0xCA ) || ( ( ((U8*)s)[0] & 0xFC ) == 0xCC ) ) && ( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( ((U8*)s)[1] & 0xFC ) == 0x70 ) ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( ((U8*)s)[2] & 0xFC ) == 0x70 ) ) ? 3 : 0 )
@@ -1727,14 +1726,13 @@
: ( ( 0x71 == ((U8*)s)[2] ) && ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x47 ) ) ? 4 : 0 ) : 0 )
/*
- UTF8_CHAR: Matches legal UTF-EBCDIC encoded characters from 1 through 3 bytes
+ UTF8_CHAR: Matches legal UTF-EBCDIC encoded characters from 2 through 3 bytes
- 0x0 - 0x3FFF
+ 0xA0 - 0x3FFF
*/
/*** GENERATED CODE ***/
#define is_UTF8_CHAR_utf8_no_length_checks(s) \
-( ( ( ( ((U8*)s)[0] & 0xC0 ) == 0x00 ) || ( ( ((U8*)s)[0] & 0xEF ) == 0x40 ) || ( ( ((U8*)s)[0] & 0xDF ) == 0x4B ) || ( ( ((U8*)s)[0] & 0xFC ) == 0x4C ) || ( ( ((U8*)s)[0] & 0xDE ) == 0x5A ) || ( ( ((U8*)s)[0] & 0xFE ) == 0x5C ) || ((U8*)s)[0] == 0x5E || ( ( ((U8*)s)[0] & 0xFE ) == 0x60 ) || ( ( ((U8*)s)[0] & 0xEC ) == 0x6C ) || ((U8*)s)[0] == 0x79 || ( ( ((U8*)s)[0] & 0xEF ) == 0x81 ) || ( ( ((U8*)s)[0] & 0xEE ) == 0x82 ) || ( ( ((U8*)s)[0] & 0xEC ) == 0x84 ) || ( ( ((U8*)s)[0] & 0xEE ) == 0x88 ) || ((U8*)s)[0] == 0xA1 || ( ( ((U8*)s)[0] & 0xBE ) == 0xA2 ) || ( ( ((U8*)s)[0] & 0xBC ) == 0xA4 ) || ( ( ((U8*)s)[0] & 0xFE ) == 0xA8 ) || ((U8*)s)[0] == 0xB0 || ( ( ((U8*)s)[0] & 0xFE ) == 0xBA ) || ( ( ((U8*)s)[0] & 0xE8 ) == 0xC0 ) || ( ( ((U8*)s)[0] & 0xCE ) == 0xC8 ) || ((U8*)s)[0] == 0xE0 || ( ( ((U8*)s)[0] & 0xF8 ) == 0xF0 ) || ((U8*)s)[0] == 0xFF ) ? 1\
-: ( 0x78 == ((U8*)s)[0] || 0x80 == ((U8*)s)[0] || ( 0x8A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x90 ) || ( 0x9A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xA0 ) || ( 0xAA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xAF ) || ( 0xB1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xB5 ) ) ?\
+( ( 0x78 == ((U8*)s)[0] || 0x80 == ((U8*)s)[0] || ( 0x8A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x90 ) || ( 0x9A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xA0 ) || ( 0xAA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xAF ) || ( 0xB1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xB5 ) ) ?\
( ( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || 0x5F == ((U8*)s)[1] || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( 0x70 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x72 ) ) ? 2 : 0 )\
: ( ( ( ((U8*)s)[0] == 0xB7 || ( ( ((U8*)s)[0] & 0xFE ) == 0xB8 ) || ( ( ((U8*)s)[0] & 0xFC ) == 0xBC ) || ( ( ((U8*)s)[0] & 0xEE ) == 0xCA ) || ( ( ((U8*)s)[0] & 0xFC ) == 0xCC ) ) && ( ( 0x41 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x4A ) || ( 0x51 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x59 ) || 0x5F == ((U8*)s)[1] || ( 0x62 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x6A ) || ( 0x70 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0x72 ) ) ) && ( ( 0x41 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x4A ) || ( 0x51 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x59 ) || 0x5F == ((U8*)s)[2] || ( 0x62 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x6A ) || ( 0x70 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0x72 ) ) ) ? 3 : 0 )
@@ -2453,14 +2451,13 @@
: ( ( 0x74 == ((U8*)s)[2] ) && ( 0x41 <= ((U8*)s)[3] && ((U8*)s)[3] <= 0x47 ) ) ? 4 : 0 ) : 0 )
/*
- UTF8_CHAR: Matches legal UTF-EBCDIC encoded characters from 1 through 3 bytes
+ UTF8_CHAR: Matches legal UTF-EBCDIC encoded characters from 2 through 3 bytes
- 0x0 - 0x3FFF
+ 0xA0 - 0x3FFF
*/
/*** GENERATED CODE ***/
#define is_UTF8_CHAR_utf8_no_length_checks(s) \
-( ( ( ( ((U8*)s)[0] & 0xC0 ) == 0x00 ) || ( ( ((U8*)s)[0] & 0xEF ) == 0x40 ) || ( ( ((U8*)s)[0] & 0xCE ) == 0x4A ) || ( ( ((U8*)s)[0] & 0xCC ) == 0x4C ) || ( ( ((U8*)s)[0] & 0xFE ) == 0x60 ) || ( ( ((U8*)s)[0] & 0xAF ) == 0x81 ) || ( ( ((U8*)s)[0] & 0xEE ) == 0x82 ) || ( ( ((U8*)s)[0] & 0xEC ) == 0x84 ) || ( ( ((U8*)s)[0] & 0xEE ) == 0x88 ) || ( ( ((U8*)s)[0] & 0xFE ) == 0xA2 ) || ( ( ((U8*)s)[0] & 0xFC ) == 0xA4 ) || ( ( ((U8*)s)[0] & 0xFE ) == 0xA8 ) || ( ( ((U8*)s)[0] & 0xBF ) == 0xBB ) || ( ( ((U8*)s)[0] & 0xFE ) == 0xBC ) || ( ( ((U8*)s)[0] & 0xCE ) == 0xC2 ) || ( ( ((U8*)s)[0] & 0xCC ) == 0xC4 ) || ( ( ((U8*)s)[0] & 0xCE ) == 0xC8 ) || ( ( ((U8*)s)[0] & 0xFE ) == 0xF0 ) || ( ( ((U8*)s)[0] & 0xFD ) == 0xFD ) ) ? 1\
-: ( ( 0x8A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x90 ) || ( 0x9A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xA1 ) || ( 0xAA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xB5 ) ) ?\
+( ( ( 0x8A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x90 ) || ( 0x9A <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xA1 ) || ( 0xAA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xB5 ) ) ?\
( ( ( ( ((U8*)s)[1] & 0xEF ) == 0x41 ) || ( ( ((U8*)s)[1] & 0xCE ) == 0x42 ) || ( ( ((U8*)s)[1] & 0xEC ) == 0x44 ) || ( ( ((U8*)s)[1] & 0xEE ) == 0x48 ) || ( ( ((U8*)s)[1] & 0xFC ) == 0x64 ) || ( ( ((U8*)s)[1] & 0xFE ) == 0x68 ) || ( ( ((U8*)s)[1] & 0xFA ) == 0x70 ) ) ? 2 : 0 )\
: ( ( ( ( 0xB7 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xBA ) || ( 0xBE <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xC0 ) || ( 0xCA <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xD0 ) || 0xDA == ((U8*)s)[0] ) && ( ( ( ((U8*)s)[1] & 0xEF ) == 0x41 ) || ( ( ((U8*)s)[1] & 0xCE ) == 0x42 ) || ( ( ((U8*)s)[1] & 0xEC ) == 0x44 ) || ( ( ((U8*)s)[1] & 0xEE ) == 0x48 ) || ( ( ((U8*)s)[1] & 0xFC ) == 0x64 ) || ( ( ((U8*)s)[1] & 0xFE ) == 0x68 ) || ( ( ((U8*)s)[1] & 0xFA ) == 0x70 ) ) ) && ( ( ( ((U8*)s)[2] & 0xEF ) == 0x41 ) || ( ( ((U8*)s)[2] & 0xCE ) == 0x42 ) || ( ( ((U8*)s)[2] & 0xEC ) == 0x44 ) || ( ( ((U8*)s)[2] & 0xEE ) == 0x48 ) || ( ( ((U8*)s)[2] & 0xFC ) == 0x64 ) || ( ( ((U8*)s)[2] & 0xFE ) == 0x68 ) || ( ( ((U8*)s)[2] & 0xFA ) == 0x70 ) ) ) ? 3 : 0 )
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl
index d37b8633d4..1f453e8369 100755
--- a/regen/regcharclass.pl
+++ b/regen/regcharclass.pl
@@ -1646,12 +1646,16 @@ GCB_V: Grapheme_Cluster_Break=V
# wants a maximum other than 4 bytes, or this program creates better
# optimizations. Trying with 5 bytes used too much memory to calculate.
#
+# We don't generate code for invariants here because the EBCDIC form is too
+# complicated and would slow things down; instead the user should test for
+# invariants first.
+#
# NOTE: The number of bytes generated here must match the value in
# IS_UTF8_CHAR_FAST in utf8.h
#
-#UTF8_CHAR: Matches legal UTF-8 encoded characters from 1 through 4 bytes
+#UTF8_CHAR: Matches legal UTF-8 encoded characters from 2 through 4 bytes
#=> UTF8 :no_length_checks only_ascii_platform
-#0x0 - 0x1FFFFF
+#0x80 - 0x1FFFFF
# This hasn't been commented out, but the number of bytes it works on has been
# cut down to 3, so it doesn't cover the full legal Unicode range. Making it
@@ -1662,9 +1666,9 @@ GCB_V: Grapheme_Cluster_Break=V
# NOTE: The number of bytes generated here must match the value in
# IS_UTF8_CHAR_FAST in utf8.h
#
-UTF8_CHAR: Matches legal UTF-EBCDIC encoded characters from 1 through 3 bytes
+UTF8_CHAR: Matches legal UTF-EBCDIC encoded characters from 2 through 3 bytes
=> UTF8 :no_length_checks only_ebcdic_platform
-0x0 - 0x3FFF
+0xA0 - 0x3FFF
QUOTEMETA: Meta-characters that \Q should quote
=> high :fast
diff --git a/utf8.h b/utf8.h
index a18faa2e35..924380db85 100644
--- a/utf8.h
+++ b/utf8.h
@@ -606,48 +606,38 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
* don't take too long to generate, and there is a separate one for each code
* page, so they are in regcharclass.h instead of here */
/*
- UTF8_CHAR: Matches utf8 from 1 to 4 bytes
+ UTF8_CHAR: Matches legal UTF-8 encoded characters from 2 through 4 bytes
- 0x0 - 0x1FFFFF
+ 0x80 - 0x1FFFFF
*/
/*** GENERATED CODE ***/
-#define is_UTF8_CHAR_utf8_safe(s,e) \
-( ((e)-(s) > 3) ? \
- ( ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) ? 1 \
- : ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ? \
- ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 ) \
- : ( 0xE0 == ((U8*)s)[0] ) ? \
- ( ( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
- : ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEF ) ? \
- ( ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
- : ( 0xF0 == ((U8*)s)[0] ) ? \
- ( ( ( ( 0x90 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
- : ( ( ( ( 0xF1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xF7 ) && ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
-: ((e)-(s) > 2) ? \
- ( ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) ? 1 \
- : ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ? \
- ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 ) \
- : ( 0xE0 == ((U8*)s)[0] ) ? \
- ( ( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
- : ( ( ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEF ) && ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
-: ((e)-(s) > 1) ? \
- ( ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) ? 1 \
- : ( ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) && ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ) ? 2 : 0 )\
-: ((e)-(s) > 0) ? \
- ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) \
-: 0 )
+#define is_UTF8_CHAR_utf8_no_length_checks(s) \
+( ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ? \
+ ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 ) \
+: ( 0xE0 == ((U8*)s)[0] ) ? \
+ ( ( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+: ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEF ) ? \
+ ( ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+: ( 0xF0 == ((U8*)s)[0] ) ? \
+ ( ( ( ( 0x90 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
+: ( ( ( ( 0xF1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xF7 ) && ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )
#endif
/*
- * =for apidoc isUTF8_CHAR
- *
- * Returns the number of bytes beginning at C<s> which form a legal UTF-8 (or
- * UTF-EBCDIC) encoded character, looking no further than C<e - s> bytes into
- * C<s>. Returns 0 if the sequence starting at C<s> through C<e - 1> is not
- * well-formed UTF-8
+=head1 Unicode Support
+
+=for apidoc Am|STRLEN|isUTF8_CHAR|const U8 *s|const U8 *e
+
+Returns the number of bytes beginning at C<s> which form a legal UTF-8 (or
+UTF-EBCDIC) encoded character, looking no further than C<e - s> bytes into
+C<s>. Returns 0 if the sequence starting at C<s> through C<e - 1> is not
+well-formed UTF-8
Note that an INVARIANT character (i.e. ASCII on non-EBCDIC
-machines) is a valid UTF-8 character. */
+machines) is a valid UTF-8 character.
+
+=cut
+*/
#define isUTF8_CHAR(s, e) (((e) <= (s)) \
? 0 \
@@ -656,7 +646,7 @@ machines) is a valid UTF-8 character. */
: (((e) - (s)) < UTF8SKIP(s)) \
? 0 \
: (IS_UTF8_CHAR_FAST(UTF8SKIP(s))) \
- ? is_UTF8_CHAR_utf8_safe(s,e) \
+ ? is_UTF8_CHAR_utf8_no_length_checks(s) \
: _is_utf8_char_slow(s, e))
/* Do not use; should be deprecated. Use isUTF8_CHAR() instead; this is