diff options
author | Karl Williamson <khw@cpan.org> | 2021-06-30 13:01:49 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2021-08-07 05:59:15 -0600 |
commit | e1a9b7adc32d702958dc07123a3e3ee55af05ad1 (patch) | |
tree | 1667773e9b957193a642d30b6fa46712061b9107 | |
parent | 42b360b2e07dd1c42764c476a72cc282a4400ce9 (diff) | |
download | perl-e1a9b7adc32d702958dc07123a3e3ee55af05ad1.tar.gz |
regcharclass.pl: Add fast surrogate UTF-8 trie
This will be used in the next commit. It requires only the first two
bytes to determine if a UTF-8 or UTF-EBCDIC sequence is for a surrogate.
-rw-r--r-- | regcharclass.h | 14 | ||||
-rwxr-xr-x | regen/regcharclass.pl | 2 |
2 files changed, 14 insertions, 2 deletions
diff --git a/regcharclass.h b/regcharclass.h index df0a654b4a..ce290b7975 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -220,6 +220,10 @@ \p{_Perl_Surrogate} */ /*** GENERATED CODE ***/ +#define is_SURROGATE_utf8(s) \ +( ( ( 0xED == ((const U8*)s)[0] ) && ( inRANGE_helper_(U8, ((const U8*)s)[1], 0xA0, 0xBF) ) ) ? 3 : 0 ) + +/*** GENERATED CODE ***/ #define is_SURROGATE_utf8_safe(s,e) \ ( ( ( ( ( ((e) - (s)) >= 3 ) && ( 0xED == ((const U8*)s)[0] ) ) && ( inRANGE_helper_(U8, ((const U8*)s)[1], 0xA0, 0xBF) ) ) && ( inRANGE_helper_(U8, ((const U8*)s)[2], 0x80, 0xBF) ) ) ? 3 : 0 ) @@ -1451,6 +1455,10 @@ \p{_Perl_Surrogate} */ /*** GENERATED CODE ***/ +#define is_SURROGATE_utf8(s) \ +( ( ( 0xDD == ((const U8*)s)[0] ) && ( inRANGE_helper_(U8, ((const U8*)s)[1], 0x65, 0x66) ) ) ? 4 : 0 ) + +/*** GENERATED CODE ***/ #define is_SURROGATE_utf8_safe(s,e) \ ( ( ( ( ( ( ((e) - (s)) >= 4 ) && ( 0xDD == ((const U8*)s)[0] ) ) && ( inRANGE_helper_(U8, ((const U8*)s)[1], 0x65, 0x66) ) ) && ( inRANGE_helper_(U8, NATIVE_UTF8_TO_I8(((const U8*)s)[2]), 0xA0, 0xBF) ) ) && ( inRANGE_helper_(U8, NATIVE_UTF8_TO_I8(((const U8*)s)[3]), 0xA0, 0xBF) ) ) ? 4 : 0 ) @@ -2676,6 +2684,10 @@ \p{_Perl_Surrogate} */ /*** GENERATED CODE ***/ +#define is_SURROGATE_utf8(s) \ +( ( ( 0xDD == ((const U8*)s)[0] ) && ( inRANGE_helper_(U8, ((const U8*)s)[1], 0x64, 0x65) ) ) ? 4 : 0 ) + +/*** GENERATED CODE ***/ #define is_SURROGATE_utf8_safe(s,e) \ ( ( ( ( ( ( ((e) - (s)) >= 4 ) && ( 0xDD == ((const U8*)s)[0] ) ) && ( inRANGE_helper_(U8, ((const U8*)s)[1], 0x64, 0x65) ) ) && ( inRANGE_helper_(U8, NATIVE_UTF8_TO_I8(((const U8*)s)[2]), 0xA0, 0xBF) ) ) && ( inRANGE_helper_(U8, NATIVE_UTF8_TO_I8(((const U8*)s)[3]), 0xA0, 0xBF) ) ) ? 
4 : 0 ) @@ -3753,6 +3765,6 @@ * 696e706fddd3ce8cd48c7ea91caf4c9edf5c296432d320aa7b78631f69aa9eac lib/unicore/mktables * 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version * 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl - * ca1cae2ae68045dcfa7761a0b8d27399269f3dc395da5735ec4efbf4077c4dd0 regen/regcharclass.pl + * 3fb6bafb4c830dd501868e34f550cdad3bf8d2c9eed44756488f36c484969417 regen/regcharclass.pl * b2f896452d2b30da3e04800f478c60c1fd0b03d6b668689b020f1e3cf1f1cdd9 regen/regcharclass_multi_char_folds.pl * ex: set ro: */ diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index 4f8a5ee042..890dd23208 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -1806,7 +1806,7 @@ LARGER_NON_CHARS: # 5 bytes 0x10FFFE - 0x10FFFF SURROGATE: Surrogate code points -=> UTF8 :safe +=> UTF8 :safe fast \p{_Perl_Surrogate} QUOTEMETA: Meta-characters that \Q should quote |