summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2019-11-16 11:14:15 -0700
committerKarl Williamson <khw@cpan.org>2019-11-16 11:14:15 -0700
commitcc288b7a2732c37504039083ebb98241954636be (patch)
treed583b36015e879134747b2e49215145e07621dac
parent556a074941f5c3a4ba6613f10be6c56138989a11 (diff)
parent4e4df0522939c9283b72776a4779e87429296b52 (diff)
downloadperl-cc288b7a2732c37504039083ebb98241954636be.tar.gz
Merge branch 'multi-fold' into blead
These few commits fix the code that avoids splitting a multi-character fold across EXACTFish nodes in regex patterns
-rw-r--r--regcharclass.h360
-rw-r--r--regcomp.c395
-rwxr-xr-xregen/regcharclass.pl30
-rw-r--r--regen/regcharclass_multi_char_folds.pl23
-rw-r--r--t/re/pat_advanced.t2
5 files changed, 632 insertions, 178 deletions
diff --git a/regcharclass.h b/regcharclass.h
index a0f7b1b941..0f51ade47e 100644
--- a/regcharclass.h
+++ b/regcharclass.h
@@ -240,7 +240,7 @@
/*
MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
- &regcharclass_multi_char_folds::multi_char_folds(1)
+ &regcharclass_multi_char_folds::multi_char_folds('u', 'a')
*/
/*** GENERATED CODE ***/
#define is_MULTI_CHAR_FOLD_utf8_safe_part0(s,e) \
@@ -459,7 +459,7 @@
/*
MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
- &regcharclass_multi_char_folds::multi_char_folds(0)
+ &regcharclass_multi_char_folds::multi_char_folds('l', 'a')
*/
/*** GENERATED CODE ***/
#define is_MULTI_CHAR_FOLD_latin1_safe(s,e) \
@@ -476,6 +476,122 @@
: 0 )
/*
+ THREE_CHAR_FOLD: A three-character multi-char fold
+
+ &regcharclass_multi_char_folds::multi_char_folds('u', '3')
+*/
+/*** GENERATED CODE ***/
+#define is_THREE_CHAR_FOLD_utf8_safe(s,e) \
+( ((e)-(s) > 5) ? \
+ ( ( 0x66 == ((const U8*)s)[0] ) ? \
+ ( ( ( 0x66 == ((const U8*)s)[1] ) && ( 0x69 == ((const U8*)s)[2] || 0x6C == ((const U8*)s)[2] ) ) ? 3 : 0 )\
+ : ( 0xCE == ((const U8*)s)[0] ) ? \
+ ( ( 0xB1 == ((const U8*)s)[1] || 0xB7 == ((const U8*)s)[1] ) ? \
+ ( ( ( ( ( 0xCD == ((const U8*)s)[2] ) && ( 0x82 == ((const U8*)s)[3] ) ) && ( 0xCE == ((const U8*)s)[4] ) ) && ( 0xB9 == ((const U8*)s)[5] ) ) ? 6 : 0 )\
+ : ( ( ( 0xB9 == ((const U8*)s)[1] ) && ( 0xCC == ((const U8*)s)[2] ) ) && ( 0x88 == ((const U8*)s)[3] ) ) ? ( ( 0xCC == ((const U8*)s)[4] ) ?\
+ ( ( inRANGE(((const U8*)s)[5], 0x80, 0x81 ) ) ? 6 : 0 )\
+ : ( ( 0xCD == ((const U8*)s)[4] ) && ( 0x82 == ((const U8*)s)[5] ) ) ? 6 : 0 ) : 0 )\
+ : ( 0xCF == ((const U8*)s)[0] ) ? \
+ ( ( 0x85 == ((const U8*)s)[1] ) ? \
+ ( ( ( 0xCC == ((const U8*)s)[2] ) && ( 0x88 == ((const U8*)s)[3] || 0x93 == ((const U8*)s)[3] ) ) ? ( ( 0xCC == ((const U8*)s)[4] ) ?\
+ ( ( inRANGE(((const U8*)s)[5], 0x80, 0x81 ) ) ? 6 : 0 )\
+ : ( ( 0xCD == ((const U8*)s)[4] ) && ( 0x82 == ((const U8*)s)[5] ) ) ? 6 : 0 ) : 0 )\
+ : ( ( ( ( ( 0x89 == ((const U8*)s)[1] ) && ( 0xCD == ((const U8*)s)[2] ) ) && ( 0x82 == ((const U8*)s)[3] ) ) && ( 0xCE == ((const U8*)s)[4] ) ) && ( 0xB9 == ((const U8*)s)[5] ) ) ? 6 : 0 )\
+ : 0 ) \
+: ( ( ( ((e)-(s) > 2) && ( 0x66 == ((const U8*)s)[0] ) ) && ( 0x66 == ((const U8*)s)[1] ) ) && ( 0x69 == ((const U8*)s)[2] || 0x6C == ((const U8*)s)[2] ) ) ? 3 : 0 )
+
+/*
+ THREE_CHAR_FOLD: A three-character multi-char fold
+
+ &regcharclass_multi_char_folds::multi_char_folds('l', '3')
+*/
+/*** GENERATED CODE ***/
+#define is_THREE_CHAR_FOLD_latin1_safe(s,e) \
+( ( ( ( ( ((e) - (s)) >= 3 ) && ( ( ((const U8*)s)[0] & 0xDF ) == 0x46 ) ) && ( ( ((const U8*)s)[1] & 0xDF ) == 0x46 ) ) && ( ( ( ((const U8*)s)[2] & 0xDF ) == 0x49 ) || ( ( ((const U8*)s)[2] & 0xDF ) == 0x4C ) ) ) ? 3 : 0 )
+
+/*
+ THREE_CHAR_FOLD_HEAD: The first two of three-character multi-char folds
+
+ &regcharclass_multi_char_folds::multi_char_folds('u', 'h')
+*/
+/*** GENERATED CODE ***/
+#define is_THREE_CHAR_FOLD_HEAD_utf8_safe(s,e) \
+( ((e)-(s) > 3) ? \
+ ( ( 0x61 == ((const U8*)s)[0] || inRANGE(((const U8*)s)[0], 0x68, 0x6A ) || inRANGE(((const U8*)s)[0], 0x73, 0x74 ) || 0x77 == ((const U8*)s)[0] || 0x79 == ((const U8*)s)[0] ) ? 1\
+ : ( 0x66 == ((const U8*)s)[0] ) ? \
+ ( ( 0x66 == ((const U8*)s)[1] ) ? 2 : 1 ) \
+ : ( 0xCA == ((const U8*)s)[0] ) ? \
+ ( ( 0xBC == ((const U8*)s)[1] ) ? 2 : 0 ) \
+ : ( 0xCE == ((const U8*)s)[0] ) ? \
+ ( ( ( ((const U8*)s)[1] & 0xFD ) == 0xAC ) ? 2 \
+ : ( 0xB1 == ((const U8*)s)[1] || 0xB7 == ((const U8*)s)[1] ) ? \
+ ( ( ( 0xCD == ((const U8*)s)[2] ) && ( 0x82 == ((const U8*)s)[3] ) ) ? 4 : 2 )\
+ : ( 0xB9 == ((const U8*)s)[1] ) ? \
+ ( ( ( 0xCC == ((const U8*)s)[2] ) && ( 0x88 == ((const U8*)s)[3] ) ) ? 4 : 2 )\
+ : 0 ) \
+ : ( 0xCF == ((const U8*)s)[0] ) ? \
+ ( ( 0x81 == ((const U8*)s)[1] || 0x8E == ((const U8*)s)[1] ) ? 2 \
+ : ( 0x85 == ((const U8*)s)[1] ) ? \
+ ( ( ( 0xCC == ((const U8*)s)[2] ) && ( 0x88 == ((const U8*)s)[3] || 0x93 == ((const U8*)s)[3] ) ) ? 4 : 2 )\
+ : ( 0x89 == ((const U8*)s)[1] ) ? \
+ ( ( ( 0xCD == ((const U8*)s)[2] ) && ( 0x82 == ((const U8*)s)[3] ) ) ? 4 : 2 )\
+ : 0 ) \
+ : ( 0xD5 == ((const U8*)s)[0] ) ? \
+ ( ( 0xA5 == ((const U8*)s)[1] || 0xB4 == ((const U8*)s)[1] || 0xBE == ((const U8*)s)[1] ) ? 2 : 0 )\
+ : ( 0xE1 == ((const U8*)s)[0] ) ? \
+ ( ( 0xBC == ((const U8*)s)[1] ) ? \
+ ( ( ( ((const U8*)s)[2] & 0xD8 ) == 0x80 ) ? 3 : 0 ) \
+ : ( ( 0xBD == ((const U8*)s)[1] ) && ( ( ( ((const U8*)s)[2] & 0xF8 ) == 0xA0 ) || ( ( ((const U8*)s)[2] & 0xFB ) == 0xB0 ) || ((const U8*)s)[2] == 0xBC ) ) ? 3 : 0 )\
+ : 0 ) \
+: ((e)-(s) > 2) ? \
+ ( ( 0x61 == ((const U8*)s)[0] || inRANGE(((const U8*)s)[0], 0x68, 0x6A ) || inRANGE(((const U8*)s)[0], 0x73, 0x74 ) || 0x77 == ((const U8*)s)[0] || 0x79 == ((const U8*)s)[0] ) ? 1\
+ : ( 0x66 == ((const U8*)s)[0] ) ? \
+ ( ( 0x66 == ((const U8*)s)[1] ) ? 2 : 1 ) \
+ : ( 0xCA == ((const U8*)s)[0] ) ? \
+ ( ( 0xBC == ((const U8*)s)[1] ) ? 2 : 0 ) \
+ : ( 0xCE == ((const U8*)s)[0] ) ? \
+ ( ( ( ( ((const U8*)s)[1] & 0xFD ) == 0xAC ) || ( ( ((const U8*)s)[1] & 0xF7 ) == 0xB1 ) || ((const U8*)s)[1] == 0xB7 ) ? 2 : 0 )\
+ : ( 0xCF == ((const U8*)s)[0] ) ? \
+ ( ( ( ( ((const U8*)s)[1] & 0xFB ) == 0x81 ) || ((const U8*)s)[1] == 0x89 || ((const U8*)s)[1] == 0x8E ) ? 2 : 0 )\
+ : ( 0xD5 == ((const U8*)s)[0] ) ? \
+ ( ( 0xA5 == ((const U8*)s)[1] || 0xB4 == ((const U8*)s)[1] || 0xBE == ((const U8*)s)[1] ) ? 2 : 0 )\
+ : ( 0xE1 == ((const U8*)s)[0] ) ? \
+ ( ( 0xBC == ((const U8*)s)[1] ) ? \
+ ( ( ( ((const U8*)s)[2] & 0xD8 ) == 0x80 ) ? 3 : 0 ) \
+ : ( ( 0xBD == ((const U8*)s)[1] ) && ( ( ( ((const U8*)s)[2] & 0xF8 ) == 0xA0 ) || ( ( ((const U8*)s)[2] & 0xFB ) == 0xB0 ) || ((const U8*)s)[2] == 0xBC ) ) ? 3 : 0 )\
+ : 0 ) \
+: ((e)-(s) > 1) ? \
+ ( ( 0x61 == ((const U8*)s)[0] || inRANGE(((const U8*)s)[0], 0x68, 0x6A ) || inRANGE(((const U8*)s)[0], 0x73, 0x74 ) || 0x77 == ((const U8*)s)[0] || 0x79 == ((const U8*)s)[0] ) ? 1\
+ : ( 0x66 == ((const U8*)s)[0] ) ? \
+ ( ( 0x66 == ((const U8*)s)[1] ) ? 2 : 1 ) \
+ : ( 0xCA == ((const U8*)s)[0] ) ? \
+ ( ( 0xBC == ((const U8*)s)[1] ) ? 2 : 0 ) \
+ : ( 0xCE == ((const U8*)s)[0] ) ? \
+ ( ( ( ( ((const U8*)s)[1] & 0xFD ) == 0xAC ) || ( ( ((const U8*)s)[1] & 0xF7 ) == 0xB1 ) || ((const U8*)s)[1] == 0xB7 ) ? 2 : 0 )\
+ : ( 0xCF == ((const U8*)s)[0] ) ? \
+ ( ( ( ( ((const U8*)s)[1] & 0xFB ) == 0x81 ) || ((const U8*)s)[1] == 0x89 || ((const U8*)s)[1] == 0x8E ) ? 2 : 0 )\
+ : ( ( 0xD5 == ((const U8*)s)[0] ) && ( 0xA5 == ((const U8*)s)[1] || 0xB4 == ((const U8*)s)[1] || 0xBE == ((const U8*)s)[1] ) ) ? 2 : 0 )\
+: ((e)-(s) > 0) ? \
+ ( 0x61 == ((const U8*)s)[0] || 0x66 == ((const U8*)s)[0] || inRANGE(((const U8*)s)[0], 0x68, 0x6A ) || inRANGE(((const U8*)s)[0], 0x73, 0x74 ) || 0x77 == ((const U8*)s)[0] || 0x79 == ((const U8*)s)[0] )\
+: 0 )
+
+/*
+ THREE_CHAR_FOLD_HEAD: The first two of three-character multi-char folds
+
+ &regcharclass_multi_char_folds::multi_char_folds('l', 'h')
+*/
+/*** GENERATED CODE ***/
+#define is_THREE_CHAR_FOLD_HEAD_latin1_safe(s,e) \
+( ((e)-(s) > 1) ? \
+ ( ( ( ( ((const U8*)s)[0] & 0xDF ) == 0x41 ) || ( ( ((const U8*)s)[0] & 0xDE ) == 0x48 ) || ( ( ((const U8*)s)[0] & 0xDF ) == 0x4A ) || ( ( ((const U8*)s)[0] & 0xDB ) == 0x53 ) || ( ( ((const U8*)s)[0] & 0xDF ) == 0x54 ) || ( ( ((const U8*)s)[0] & 0xDF ) == 0x59 ) ) ? 1\
+ : ( ( ((const U8*)s)[0] & 0xDF ) == 0x46 ) ? \
+ ( ( ( ((const U8*)s)[1] & 0xDF ) == 0x46 ) ? 2 : 1 ) \
+ : 0 ) \
+: ((e)-(s) > 0) ? \
+ ( ( ( ((const U8*)s)[0] & 0xDF ) == 0x41 ) || ( ( ((const U8*)s)[0] & 0xDF ) == 0x46 ) || ( ( ((const U8*)s)[0] & 0xDE ) == 0x48 ) || ( ( ((const U8*)s)[0] & 0xDF ) == 0x4A ) || ( ( ((const U8*)s)[0] & 0xDB ) == 0x53 ) || ( ( ((const U8*)s)[0] & 0xDF ) == 0x54 ) || ( ( ((const U8*)s)[0] & 0xDF ) == 0x59 ) )\
+: 0 )
+
+/*
FOLDS_TO_MULTI: characters that fold to multi-char strings
\p{_Perl_Folds_To_Multi_Char}
@@ -858,7 +974,7 @@
/*
MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
- &regcharclass_multi_char_folds::multi_char_folds(1)
+ &regcharclass_multi_char_folds::multi_char_folds('u', 'a')
*/
/*** GENERATED CODE ***/
#define is_MULTI_CHAR_FOLD_utf8_safe_part0(s,e) \
@@ -1075,7 +1191,7 @@
/*
MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
- &regcharclass_multi_char_folds::multi_char_folds(0)
+ &regcharclass_multi_char_folds::multi_char_folds('l', 'a')
*/
/*** GENERATED CODE ***/
#define is_MULTI_CHAR_FOLD_latin1_safe(s,e) \
@@ -1092,6 +1208,120 @@
: 0 )
/*
+ THREE_CHAR_FOLD: A three-character multi-char fold
+
+ &regcharclass_multi_char_folds::multi_char_folds('u', '3')
+*/
+/*** GENERATED CODE ***/
+#define is_THREE_CHAR_FOLD_utf8_safe(s,e) \
+( ((e)-(s) > 5) ? \
+ ( ( 0x86 == ((const U8*)s)[0] ) ? \
+ ( ( ( 0x86 == ((const U8*)s)[1] ) && ( 0x89 == ((const U8*)s)[2] || 0x93 == ((const U8*)s)[2] ) ) ? 3 : 0 )\
+ : ( 0xB4 == ((const U8*)s)[0] ) ? \
+ ( ( 0x58 == ((const U8*)s)[1] || 0x66 == ((const U8*)s)[1] ) ? \
+ ( ( ( ( ( 0xB1 == ((const U8*)s)[2] ) && ( 0x43 == ((const U8*)s)[3] ) ) && ( 0xB4 == ((const U8*)s)[4] ) ) && ( 0x68 == ((const U8*)s)[5] ) ) ? 6 : 0 )\
+ : ( ( ( 0x68 == ((const U8*)s)[1] ) && ( 0xAF == ((const U8*)s)[2] ) ) && ( 0x49 == ((const U8*)s)[3] ) ) ? ( ( 0xAF == ((const U8*)s)[4] ) ?\
+ ( ( inRANGE(((const U8*)s)[5], 0x41, 0x42 ) ) ? 6 : 0 )\
+ : ( ( 0xB1 == ((const U8*)s)[4] ) && ( 0x43 == ((const U8*)s)[5] ) ) ? 6 : 0 ) : 0 )\
+ : ( 0xB5 == ((const U8*)s)[0] ) ? \
+ ( ( 0x46 == ((const U8*)s)[1] ) ? \
+ ( ( ( 0xAF == ((const U8*)s)[2] ) && ( 0x49 == ((const U8*)s)[3] || 0x62 == ((const U8*)s)[3] ) ) ? ( ( 0xAF == ((const U8*)s)[4] ) ?\
+ ( ( inRANGE(((const U8*)s)[5], 0x41, 0x42 ) ) ? 6 : 0 )\
+ : ( ( 0xB1 == ((const U8*)s)[4] ) && ( 0x43 == ((const U8*)s)[5] ) ) ? 6 : 0 ) : 0 )\
+ : ( ( ( ( ( 0x4A == ((const U8*)s)[1] ) && ( 0xB1 == ((const U8*)s)[2] ) ) && ( 0x43 == ((const U8*)s)[3] ) ) && ( 0xB4 == ((const U8*)s)[4] ) ) && ( 0x68 == ((const U8*)s)[5] ) ) ? 6 : 0 )\
+ : 0 ) \
+: ( ( ( ((e)-(s) > 2) && ( 0x86 == ((const U8*)s)[0] ) ) && ( 0x86 == ((const U8*)s)[1] ) ) && ( 0x89 == ((const U8*)s)[2] || 0x93 == ((const U8*)s)[2] ) ) ? 3 : 0 )
+
+/*
+ THREE_CHAR_FOLD: A three-character multi-char fold
+
+ &regcharclass_multi_char_folds::multi_char_folds('l', '3')
+*/
+/*** GENERATED CODE ***/
+#define is_THREE_CHAR_FOLD_latin1_safe(s,e) \
+( ( ( ( ( ((e) - (s)) >= 3 ) && ( ( ((const U8*)s)[0] & 0xBF ) == 0x86 ) ) && ( ( ((const U8*)s)[1] & 0xBF ) == 0x86 ) ) && ( ( ( ((const U8*)s)[2] & 0xBF ) == 0x89 ) || ( ( ((const U8*)s)[2] & 0xBF ) == 0x93 ) ) ) ? 3 : 0 )
+
+/*
+ THREE_CHAR_FOLD_HEAD: The first two of three-character multi-char folds
+
+ &regcharclass_multi_char_folds::multi_char_folds('u', 'h')
+*/
+/*** GENERATED CODE ***/
+#define is_THREE_CHAR_FOLD_HEAD_utf8_safe(s,e) \
+( ((e)-(s) > 3) ? \
+ ( ( ( ( ((const U8*)s)[0] & 0xEF ) == 0x81 ) || ( ( ((const U8*)s)[0] & 0xFE ) == 0x88 ) || ( ( ((const U8*)s)[0] & 0xFE ) == 0xA2 ) || ((const U8*)s)[0] == 0xA6 || ((const U8*)s)[0] == 0xA8 ) ? 1\
+ : ( 0x86 == ((const U8*)s)[0] ) ? \
+ ( ( 0x86 == ((const U8*)s)[1] ) ? 2 : 1 ) \
+ : ( 0xAB == ((const U8*)s)[0] ) ? \
+ ( ( 0x70 == ((const U8*)s)[1] ) ? 2 : 0 ) \
+ : ( 0xB4 == ((const U8*)s)[0] ) ? \
+ ( ( 0x53 == ((const U8*)s)[1] || 0x55 == ((const U8*)s)[1] ) ? 2 \
+ : ( 0x58 == ((const U8*)s)[1] || 0x66 == ((const U8*)s)[1] ) ? \
+ ( ( ( 0xB1 == ((const U8*)s)[2] ) && ( 0x43 == ((const U8*)s)[3] ) ) ? 4 : 2 )\
+ : ( 0x68 == ((const U8*)s)[1] ) ? \
+ ( ( ( 0xAF == ((const U8*)s)[2] ) && ( 0x49 == ((const U8*)s)[3] ) ) ? 4 : 2 )\
+ : 0 ) \
+ : ( 0xB5 == ((const U8*)s)[0] ) ? \
+ ( ( 0x42 == ((const U8*)s)[1] || 0x55 == ((const U8*)s)[1] ) ? 2 \
+ : ( 0x46 == ((const U8*)s)[1] ) ? \
+ ( ( ( 0xAF == ((const U8*)s)[2] ) && ( 0x49 == ((const U8*)s)[3] || 0x62 == ((const U8*)s)[3] ) ) ? 4 : 2 )\
+ : ( 0x4A == ((const U8*)s)[1] ) ? \
+ ( ( ( 0xB1 == ((const U8*)s)[2] ) && ( 0x43 == ((const U8*)s)[3] ) ) ? 4 : 2 )\
+ : 0 ) \
+ : ( 0xB8 == ((const U8*)s)[0] ) ? \
+ ( ( ( 0x52 == ((const U8*)s)[1] ) && ( 0x46 == ((const U8*)s)[2] || 0x63 == ((const U8*)s)[2] || 0x72 == ((const U8*)s)[2] ) ) ? 3 : 0 )\
+ : ( 0xBF == ((const U8*)s)[0] ) ? \
+ ( ( inRANGE(((const U8*)s)[1], 0x67, 0x68 ) ) ? \
+ ( ( inRANGE(((const U8*)s)[2], 0x41, 0x48 ) ) ? 3 : 0 ) \
+ : ( ( 0x6A == ((const U8*)s)[1] ) && ( inRANGE(((const U8*)s)[2], 0x41, 0x48 ) || 0x57 == ((const U8*)s)[2] || 0x63 == ((const U8*)s)[2] || 0x70 == ((const U8*)s)[2] ) ) ? 3 : 0 )\
+ : 0 ) \
+: ((e)-(s) > 2) ? \
+ ( ( ( ( ((const U8*)s)[0] & 0xEF ) == 0x81 ) || ( ( ((const U8*)s)[0] & 0xFE ) == 0x88 ) || ( ( ((const U8*)s)[0] & 0xFE ) == 0xA2 ) || ((const U8*)s)[0] == 0xA6 || ((const U8*)s)[0] == 0xA8 ) ? 1\
+ : ( 0x86 == ((const U8*)s)[0] ) ? \
+ ( ( 0x86 == ((const U8*)s)[1] ) ? 2 : 1 ) \
+ : ( 0xAB == ((const U8*)s)[0] ) ? \
+ ( ( 0x70 == ((const U8*)s)[1] ) ? 2 : 0 ) \
+ : ( 0xB4 == ((const U8*)s)[0] ) ? \
+ ( ( 0x53 == ((const U8*)s)[1] || 0x55 == ((const U8*)s)[1] || 0x58 == ((const U8*)s)[1] || 0x66 == ((const U8*)s)[1] || 0x68 == ((const U8*)s)[1] ) ? 2 : 0 )\
+ : ( 0xB5 == ((const U8*)s)[0] ) ? \
+ ( ( ( ( ((const U8*)s)[1] & 0xFB ) == 0x42 ) || ((const U8*)s)[1] == 0x4A || ((const U8*)s)[1] == 0x55 ) ? 2 : 0 )\
+ : ( 0xB8 == ((const U8*)s)[0] ) ? \
+ ( ( ( 0x52 == ((const U8*)s)[1] ) && ( 0x46 == ((const U8*)s)[2] || 0x63 == ((const U8*)s)[2] || 0x72 == ((const U8*)s)[2] ) ) ? 3 : 0 )\
+ : ( 0xBF == ((const U8*)s)[0] ) ? \
+ ( ( inRANGE(((const U8*)s)[1], 0x67, 0x68 ) ) ? \
+ ( ( inRANGE(((const U8*)s)[2], 0x41, 0x48 ) ) ? 3 : 0 ) \
+ : ( ( 0x6A == ((const U8*)s)[1] ) && ( inRANGE(((const U8*)s)[2], 0x41, 0x48 ) || 0x57 == ((const U8*)s)[2] || 0x63 == ((const U8*)s)[2] || 0x70 == ((const U8*)s)[2] ) ) ? 3 : 0 )\
+ : 0 ) \
+: ((e)-(s) > 1) ? \
+ ( ( ( ( ((const U8*)s)[0] & 0xEF ) == 0x81 ) || ( ( ((const U8*)s)[0] & 0xFE ) == 0x88 ) || ( ( ((const U8*)s)[0] & 0xFE ) == 0xA2 ) || ((const U8*)s)[0] == 0xA6 || ((const U8*)s)[0] == 0xA8 ) ? 1\
+ : ( 0x86 == ((const U8*)s)[0] ) ? \
+ ( ( 0x86 == ((const U8*)s)[1] ) ? 2 : 1 ) \
+ : ( 0xAB == ((const U8*)s)[0] ) ? \
+ ( ( 0x70 == ((const U8*)s)[1] ) ? 2 : 0 ) \
+ : ( 0xB4 == ((const U8*)s)[0] ) ? \
+ ( ( 0x53 == ((const U8*)s)[1] || 0x55 == ((const U8*)s)[1] || 0x58 == ((const U8*)s)[1] || 0x66 == ((const U8*)s)[1] || 0x68 == ((const U8*)s)[1] ) ? 2 : 0 )\
+ : ( ( 0xB5 == ((const U8*)s)[0] ) && ( ( ( ((const U8*)s)[1] & 0xFB ) == 0x42 ) || ((const U8*)s)[1] == 0x4A || ((const U8*)s)[1] == 0x55 ) ) ? 2 : 0 )\
+: ((e)-(s) > 0) ? \
+ ( ( ( ((const U8*)s)[0] & 0xEF ) == 0x81 ) || ( ( ((const U8*)s)[0] & 0xDF ) == 0x86 ) || ( ( ((const U8*)s)[0] & 0xFE ) == 0x88 ) || ( ( ((const U8*)s)[0] & 0xFE ) == 0xA2 ) || ((const U8*)s)[0] == 0xA8 )\
+: 0 )
+
+/*
+ THREE_CHAR_FOLD_HEAD: The first two of three-character multi-char folds
+
+ &regcharclass_multi_char_folds::multi_char_folds('l', 'h')
+*/
+/*** GENERATED CODE ***/
+#define is_THREE_CHAR_FOLD_HEAD_latin1_safe(s,e) \
+( ((e)-(s) > 1) ? \
+ ( ( ( ( ((const U8*)s)[0] & 0xAF ) == 0x81 ) || ( ( ((const U8*)s)[0] & 0xBE ) == 0x88 ) || ( ( ((const U8*)s)[0] & 0xBE ) == 0xA2 ) || ( ( ((const U8*)s)[0] & 0xBF ) == 0xA6 ) || ( ( ((const U8*)s)[0] & 0xBF ) == 0xA8 ) ) ? 1\
+ : ( ( ((const U8*)s)[0] & 0xBF ) == 0x86 ) ? \
+ ( ( ( ((const U8*)s)[1] & 0xBF ) == 0x86 ) ? 2 : 1 ) \
+ : 0 ) \
+: ((e)-(s) > 0) ? \
+ ( ( ( ((const U8*)s)[0] & 0xAF ) == 0x81 ) || ( ( ((const U8*)s)[0] & 0x9F ) == 0x86 ) || ( ( ((const U8*)s)[0] & 0xBE ) == 0x88 ) || ( ( ((const U8*)s)[0] & 0xBE ) == 0xA2 ) || ( ( ((const U8*)s)[0] & 0xBF ) == 0xA8 ) )\
+: 0 )
+
+/*
FOLDS_TO_MULTI: characters that fold to multi-char strings
\p{_Perl_Folds_To_Multi_Char}
@@ -1475,7 +1705,7 @@
/*
MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
- &regcharclass_multi_char_folds::multi_char_folds(1)
+ &regcharclass_multi_char_folds::multi_char_folds('u', 'a')
*/
/*** GENERATED CODE ***/
#define is_MULTI_CHAR_FOLD_utf8_safe_part0(s,e) \
@@ -1692,7 +1922,7 @@
/*
MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
- &regcharclass_multi_char_folds::multi_char_folds(0)
+ &regcharclass_multi_char_folds::multi_char_folds('l', 'a')
*/
/*** GENERATED CODE ***/
#define is_MULTI_CHAR_FOLD_latin1_safe(s,e) \
@@ -1709,6 +1939,120 @@
: 0 )
/*
+ THREE_CHAR_FOLD: A three-character multi-char fold
+
+ &regcharclass_multi_char_folds::multi_char_folds('u', '3')
+*/
+/*** GENERATED CODE ***/
+#define is_THREE_CHAR_FOLD_utf8_safe(s,e) \
+( ((e)-(s) > 5) ? \
+ ( ( 0x86 == ((const U8*)s)[0] ) ? \
+ ( ( ( 0x86 == ((const U8*)s)[1] ) && ( 0x89 == ((const U8*)s)[2] || 0x93 == ((const U8*)s)[2] ) ) ? 3 : 0 )\
+ : ( 0xB3 == ((const U8*)s)[0] ) ? \
+ ( ( 0x58 == ((const U8*)s)[1] || 0x65 == ((const U8*)s)[1] ) ? \
+ ( ( ( ( ( 0xAF == ((const U8*)s)[2] ) && ( 0x43 == ((const U8*)s)[3] ) ) && ( 0xB3 == ((const U8*)s)[4] ) ) && ( 0x67 == ((const U8*)s)[5] ) ) ? 6 : 0 )\
+ : ( ( ( 0x67 == ((const U8*)s)[1] ) && ( 0xAD == ((const U8*)s)[2] ) ) && ( 0x49 == ((const U8*)s)[3] ) ) ? ( ( 0xAD == ((const U8*)s)[4] ) ?\
+ ( ( inRANGE(((const U8*)s)[5], 0x41, 0x42 ) ) ? 6 : 0 )\
+ : ( ( 0xAF == ((const U8*)s)[4] ) && ( 0x43 == ((const U8*)s)[5] ) ) ? 6 : 0 ) : 0 )\
+ : ( 0xB4 == ((const U8*)s)[0] ) ? \
+ ( ( 0x46 == ((const U8*)s)[1] ) ? \
+ ( ( ( 0xAD == ((const U8*)s)[2] ) && ( 0x49 == ((const U8*)s)[3] || 0x5F == ((const U8*)s)[3] ) ) ? ( ( 0xAD == ((const U8*)s)[4] ) ?\
+ ( ( inRANGE(((const U8*)s)[5], 0x41, 0x42 ) ) ? 6 : 0 )\
+ : ( ( 0xAF == ((const U8*)s)[4] ) && ( 0x43 == ((const U8*)s)[5] ) ) ? 6 : 0 ) : 0 )\
+ : ( ( ( ( ( 0x4A == ((const U8*)s)[1] ) && ( 0xAF == ((const U8*)s)[2] ) ) && ( 0x43 == ((const U8*)s)[3] ) ) && ( 0xB3 == ((const U8*)s)[4] ) ) && ( 0x67 == ((const U8*)s)[5] ) ) ? 6 : 0 )\
+ : 0 ) \
+: ( ( ( ((e)-(s) > 2) && ( 0x86 == ((const U8*)s)[0] ) ) && ( 0x86 == ((const U8*)s)[1] ) ) && ( 0x89 == ((const U8*)s)[2] || 0x93 == ((const U8*)s)[2] ) ) ? 3 : 0 )
+
+/*
+ THREE_CHAR_FOLD: A three-character multi-char fold
+
+ &regcharclass_multi_char_folds::multi_char_folds('l', '3')
+*/
+/*** GENERATED CODE ***/
+#define is_THREE_CHAR_FOLD_latin1_safe(s,e) \
+( ( ( ( ( ((e) - (s)) >= 3 ) && ( ( ((const U8*)s)[0] & 0xBF ) == 0x86 ) ) && ( ( ((const U8*)s)[1] & 0xBF ) == 0x86 ) ) && ( ( ( ((const U8*)s)[2] & 0xBF ) == 0x89 ) || ( ( ((const U8*)s)[2] & 0xBF ) == 0x93 ) ) ) ? 3 : 0 )
+
+/*
+ THREE_CHAR_FOLD_HEAD: The first two of three-character multi-char folds
+
+ &regcharclass_multi_char_folds::multi_char_folds('u', 'h')
+*/
+/*** GENERATED CODE ***/
+#define is_THREE_CHAR_FOLD_HEAD_utf8_safe(s,e) \
+( ((e)-(s) > 3) ? \
+ ( ( ( ( ((const U8*)s)[0] & 0xEF ) == 0x81 ) || ( ( ((const U8*)s)[0] & 0xFE ) == 0x88 ) || ( ( ((const U8*)s)[0] & 0xFE ) == 0xA2 ) || ((const U8*)s)[0] == 0xA6 || ((const U8*)s)[0] == 0xA8 ) ? 1\
+ : ( 0x86 == ((const U8*)s)[0] ) ? \
+ ( ( 0x86 == ((const U8*)s)[1] ) ? 2 : 1 ) \
+ : ( 0xAA == ((const U8*)s)[0] ) ? \
+ ( ( 0x6A == ((const U8*)s)[1] ) ? 2 : 0 ) \
+ : ( 0xB3 == ((const U8*)s)[0] ) ? \
+ ( ( 0x53 == ((const U8*)s)[1] || 0x55 == ((const U8*)s)[1] ) ? 2 \
+ : ( 0x58 == ((const U8*)s)[1] || 0x65 == ((const U8*)s)[1] ) ? \
+ ( ( ( 0xAF == ((const U8*)s)[2] ) && ( 0x43 == ((const U8*)s)[3] ) ) ? 4 : 2 )\
+ : ( 0x67 == ((const U8*)s)[1] ) ? \
+ ( ( ( 0xAD == ((const U8*)s)[2] ) && ( 0x49 == ((const U8*)s)[3] ) ) ? 4 : 2 )\
+ : 0 ) \
+ : ( 0xB4 == ((const U8*)s)[0] ) ? \
+ ( ( 0x42 == ((const U8*)s)[1] || 0x55 == ((const U8*)s)[1] ) ? 2 \
+ : ( 0x46 == ((const U8*)s)[1] ) ? \
+ ( ( ( 0xAD == ((const U8*)s)[2] ) && ( 0x49 == ((const U8*)s)[3] || 0x5F == ((const U8*)s)[3] ) ) ? 4 : 2 )\
+ : ( 0x4A == ((const U8*)s)[1] ) ? \
+ ( ( ( 0xAF == ((const U8*)s)[2] ) && ( 0x43 == ((const U8*)s)[3] ) ) ? 4 : 2 )\
+ : 0 ) \
+ : ( 0xB7 == ((const U8*)s)[0] ) ? \
+ ( ( ( 0x52 == ((const U8*)s)[1] ) && ( 0x46 == ((const U8*)s)[2] || 0x62 == ((const U8*)s)[2] || 0x71 == ((const U8*)s)[2] ) ) ? 3 : 0 )\
+ : ( 0xBF == ((const U8*)s)[0] ) ? \
+ ( ( inRANGE(((const U8*)s)[1], 0x66, 0x67 ) ) ? \
+ ( ( inRANGE(((const U8*)s)[2], 0x41, 0x48 ) ) ? 3 : 0 ) \
+ : ( ( 0x69 == ((const U8*)s)[1] ) && ( inRANGE(((const U8*)s)[2], 0x41, 0x48 ) || 0x57 == ((const U8*)s)[2] || 0x62 == ((const U8*)s)[2] || 0x6A == ((const U8*)s)[2] ) ) ? 3 : 0 )\
+ : 0 ) \
+: ((e)-(s) > 2) ? \
+ ( ( ( ( ((const U8*)s)[0] & 0xEF ) == 0x81 ) || ( ( ((const U8*)s)[0] & 0xFE ) == 0x88 ) || ( ( ((const U8*)s)[0] & 0xFE ) == 0xA2 ) || ((const U8*)s)[0] == 0xA6 || ((const U8*)s)[0] == 0xA8 ) ? 1\
+ : ( 0x86 == ((const U8*)s)[0] ) ? \
+ ( ( 0x86 == ((const U8*)s)[1] ) ? 2 : 1 ) \
+ : ( 0xAA == ((const U8*)s)[0] ) ? \
+ ( ( 0x6A == ((const U8*)s)[1] ) ? 2 : 0 ) \
+ : ( 0xB3 == ((const U8*)s)[0] ) ? \
+ ( ( ((const U8*)s)[1] == 0x53 || ((const U8*)s)[1] == 0x55 || ((const U8*)s)[1] == 0x58 || ( ( ((const U8*)s)[1] & 0xFD ) == 0x65 ) ) ? 2 : 0 )\
+ : ( 0xB4 == ((const U8*)s)[0] ) ? \
+ ( ( ( ( ((const U8*)s)[1] & 0xFB ) == 0x42 ) || ((const U8*)s)[1] == 0x4A || ((const U8*)s)[1] == 0x55 ) ? 2 : 0 )\
+ : ( 0xB7 == ((const U8*)s)[0] ) ? \
+ ( ( ( 0x52 == ((const U8*)s)[1] ) && ( 0x46 == ((const U8*)s)[2] || 0x62 == ((const U8*)s)[2] || 0x71 == ((const U8*)s)[2] ) ) ? 3 : 0 )\
+ : ( 0xBF == ((const U8*)s)[0] ) ? \
+ ( ( inRANGE(((const U8*)s)[1], 0x66, 0x67 ) ) ? \
+ ( ( inRANGE(((const U8*)s)[2], 0x41, 0x48 ) ) ? 3 : 0 ) \
+ : ( ( 0x69 == ((const U8*)s)[1] ) && ( inRANGE(((const U8*)s)[2], 0x41, 0x48 ) || 0x57 == ((const U8*)s)[2] || 0x62 == ((const U8*)s)[2] || 0x6A == ((const U8*)s)[2] ) ) ? 3 : 0 )\
+ : 0 ) \
+: ((e)-(s) > 1) ? \
+ ( ( ( ( ((const U8*)s)[0] & 0xEF ) == 0x81 ) || ( ( ((const U8*)s)[0] & 0xFE ) == 0x88 ) || ( ( ((const U8*)s)[0] & 0xFE ) == 0xA2 ) || ((const U8*)s)[0] == 0xA6 || ((const U8*)s)[0] == 0xA8 ) ? 1\
+ : ( 0x86 == ((const U8*)s)[0] ) ? \
+ ( ( 0x86 == ((const U8*)s)[1] ) ? 2 : 1 ) \
+ : ( 0xAA == ((const U8*)s)[0] ) ? \
+ ( ( 0x6A == ((const U8*)s)[1] ) ? 2 : 0 ) \
+ : ( 0xB3 == ((const U8*)s)[0] ) ? \
+ ( ( ((const U8*)s)[1] == 0x53 || ((const U8*)s)[1] == 0x55 || ((const U8*)s)[1] == 0x58 || ( ( ((const U8*)s)[1] & 0xFD ) == 0x65 ) ) ? 2 : 0 )\
+ : ( ( 0xB4 == ((const U8*)s)[0] ) && ( ( ( ((const U8*)s)[1] & 0xFB ) == 0x42 ) || ((const U8*)s)[1] == 0x4A || ((const U8*)s)[1] == 0x55 ) ) ? 2 : 0 )\
+: ((e)-(s) > 0) ? \
+ ( ( ( ((const U8*)s)[0] & 0xEF ) == 0x81 ) || ( ( ((const U8*)s)[0] & 0xDF ) == 0x86 ) || ( ( ((const U8*)s)[0] & 0xFE ) == 0x88 ) || ( ( ((const U8*)s)[0] & 0xFE ) == 0xA2 ) || ((const U8*)s)[0] == 0xA8 )\
+: 0 )
+
+/*
+ THREE_CHAR_FOLD_HEAD: The first two of three-character multi-char folds
+
+ &regcharclass_multi_char_folds::multi_char_folds('l', 'h')
+*/
+/*** GENERATED CODE ***/
+#define is_THREE_CHAR_FOLD_HEAD_latin1_safe(s,e) \
+( ((e)-(s) > 1) ? \
+ ( ( ( ( ((const U8*)s)[0] & 0xAF ) == 0x81 ) || ( ( ((const U8*)s)[0] & 0xBE ) == 0x88 ) || ( ( ((const U8*)s)[0] & 0xBE ) == 0xA2 ) || ( ( ((const U8*)s)[0] & 0xBF ) == 0xA6 ) || ( ( ((const U8*)s)[0] & 0xBF ) == 0xA8 ) ) ? 1\
+ : ( ( ((const U8*)s)[0] & 0xBF ) == 0x86 ) ? \
+ ( ( ( ((const U8*)s)[1] & 0xBF ) == 0x86 ) ? 2 : 1 ) \
+ : 0 ) \
+: ((e)-(s) > 0) ? \
+ ( ( ( ((const U8*)s)[0] & 0xAF ) == 0x81 ) || ( ( ((const U8*)s)[0] & 0x9F ) == 0x86 ) || ( ( ((const U8*)s)[0] & 0xBE ) == 0x88 ) || ( ( ((const U8*)s)[0] & 0xBE ) == 0xA2 ) || ( ( ((const U8*)s)[0] & 0xBF ) == 0xA8 ) )\
+: 0 )
+
+/*
FOLDS_TO_MULTI: characters that fold to multi-char strings
\p{_Perl_Folds_To_Multi_Char}
@@ -1904,6 +2248,6 @@
* 5214f368c189077a2a748b7ef0a5300abd0d012be568d18c1bbd8bede55818ae lib/unicore/mktables
* a712c758275b460d18fa77a26ed3589689bb3f69dcc1ea99b913e32db92a5cd2 lib/unicore/version
* 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
- * 8cffbf838b6e8ea5310e4ad2e0498ad9c1d87d4babead678081859473591317c regen/regcharclass.pl
- * eb505a90982944b6053f0b7141210f201b972cd9a57be66fcfeb7506227f6fbe regen/regcharclass_multi_char_folds.pl
+ * f9a393e7add8c7c2728356473ce5b52246d51295b2da0c48fb6f0aa21799e2bb regen/regcharclass.pl
+ * b549b9989c6987563dad8f8ad6b984c8026cdc283d60ea34457959c5d4b4ade0 regen/regcharclass_multi_char_folds.pl
* ex: set ro: */
diff --git a/regcomp.c b/regcomp.c
index 6ce4996839..9338531148 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -13774,7 +13774,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
STRLEN len = 0;
UV ender = 0;
char *p;
- char *s;
+ char *s, *old_s = NULL, *old_old_s = NULL;
char *s0;
U32 max_string_len = 255;
@@ -13796,20 +13796,20 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
U8 node_type = EXACT;
/* Assume the node will be fully used; the excess is given back at
- * the end. Under /i, leave enough extra room so that we won't
- * overflow the buffer when we fold a character which would end up
- * overflowing the node. We can't make any other length
- * assumptions, as a byte input sequence could shrink down. */
+ * the end. Under /i, we may need to temporarily add the fold of
+ * an extra character or two at the end to check for splitting
+ * multi-char folds, so allocate extra space for that. We can't
+ * make any other length assumptions, as a byte input sequence
+ * could shrink down. */
Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
+ ((! FOLD)
? 0
- : 1 * ((UTF)
+ : 2 * ((UTF)
? UTF8_MAXBYTES_CASE
/* Max non-UTF-8 expansion is 2 */ : 2)));
bool next_is_quantifier;
char * oldp = NULL;
- char * old_oldp = NULL;
/* We can convert EXACTF nodes to EXACTFU if they contain only
* characters that match identically regardless of the target
@@ -13895,8 +13895,9 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
* The exceptions override this */
Size_t added_len = 1;
- old_oldp = oldp;
oldp = p;
+ old_old_s = old_s;
+ old_s = s;
/* White space has already been ignored */
assert( (RExC_flags & RXf_PMf_EXTENDED) == 0
@@ -14527,6 +14528,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
}
else if (! LOC) { /* XXX shouldn't /l assume could be a UTF-8
locale, and prepare for that? */
+ bool splittable = FALSE;
+ bool backed_up = FALSE;
+ char * e = s;
+
+ assert(FOLD);
/* Here is /i. Running out of room creates a problem if we are
* folding, and the split happens in the middle of a
@@ -14539,188 +14545,261 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
* things that fold to them) as 'ff' and 'ss' are
* multi-character folds.
*
+ * The Unicode standard says that multi character folds consist
+ * of either two or three characters. That means we would be
+ * splitting one if the final character in the node is at the
+ * beginning of either type, or is the second of a three
+ * character fold.
+ *
* At this point:
- * old_oldp points to the beginning in the input of the
- * penultimate character in the node.
- * oldp points to the beginning in the input of the
- * final character in the node.
- * p points to the beginning in the input of the
- * next character in the input, the one that won't
- * fit in the node.
+ * ender is the code point of the character that won't fit
+ * in the node
+ * s points to just beyond the final byte in the node.
+ * It's where we would place ender if there were
+ * room, and where in fact we do place ender's fold
+ * in the code below, as we've over-allocated space
+ * for s0 (hence s) to allow for this
+ * e starts at 's' and advances as we append things.
+ * old_s is the same as 's'. (If ender had fit, 's' would
+ * have been advanced to beyond it).
+ * old_old_s points to the beginning byte of the final
+ * character in the node
+ * p points to the beginning byte in the input of the
+ * character beyond 'ender'.
+ * oldp points to the beginning byte in the input of
+ * 'ender'.
*
- * We aren't in the middle of a multi-char fold unless the
- * final character in the node can appear in a non-final
- * position in such a fold. Very few characters actually
- * participate in multi-character folds, and fewer still can be
- * in the non-final position. But it's complicated to know
- * here if that final character is folded or not, so skip this
- * check */
-
- /* Make sure enough space for final char of node,
- * first char of following node, and the fold of the
- * following char (so we don't have to worry about
- * that fold running off the end */
- U8 foldbuf[UTF8_MAXBYTES_CASE * 5 + 1];
- STRLEN fold_len;
- UV folded;
- char * const sav_oldp = oldp;
-
- assert(FOLD);
-
- /* The Unicode standard says that multi character folds consist
- * of either two or three characters. So we create a buffer
- * containing a window of three. The first is the final
- * character in the node (folded), and then the two that begin
- * the following node. But if the first character of the
- * following node can't be in a non-final fold position, there
- * is no need to look at its successor character. The macros
- * used below to check for multi character folds require folded
- * inputs, so we have to fold these. (The fold of p was likely
- * calculated in the loop above, but it hasn't beeen saved, and
- * khw thinks it would be too entangled to change to do so) */
-
- if (UTF || LIKELY(UCHARAT(p) != MICRO_SIGN)) {
- folded = _to_uni_fold_flags(ender,
- foldbuf,
- &fold_len,
- FOLD_FLAGS_FULL);
- }
- else {
- foldbuf[0] = folded = MICRO_SIGN;
- fold_len = 1;
- }
-
- /* Here, foldbuf contains the fold of the first character in
- * the next node. We may also need the next one (if there is
- * one) to get our third, but if the first character folded to
- * more than one, those extra one(s) will serve as the third.
- * Also, we don't need a third unless the previous one can
- * appear in a non-final position in a fold */
- if ( ((RExC_end - p) > ((UTF) ? UVCHR_SKIP(ender) : 1))
- && (fold_len == 1 || ( UTF
- && UVCHR_SKIP(folded) == fold_len))
- && UNLIKELY(_invlist_contains_cp(PL_NonFinalFold, folded)))
- {
- if (UTF) {
- STRLEN next_fold_len;
+ * If the final character of the node and the fold of ender
+ * form the first two characters of a three character fold, we
+ * need to peek ahead at the next (unparsed) character in the
+ * input to determine if the three actually do form such a
+ * fold. Just looking at that character is not generally
+ * sufficient, as it could be, for example, an escape sequence
+ * that evaluates to something else, and it needs to be folded.
+ *
+ * khw originally thought to just go through the parse loop one
+ * extra time, but that doesn't work easily as that iteration
+ * could cause things to think that the parse is over and to
+ * goto loopdone. The character could be a '$' for example, or
+ * the character beyond could be a quantifier, and other
+ * glitches as well.
+ *
+ * The solution used here for peeking ahead is to look at that
+ * next character. If it isn't ASCII punctuation, then it will
+ * be something that continues in an EXACTish node if there
+ * were space. We append the fold of it to s, having reserved
+ * enough room in s0 for the purpose. If we can't reasonably
+ * peek ahead, we instead assume the worst case: that it is
+ * something that would form the completion of a multi-char
+ * fold.
+ *
+ * If we can't split between s and ender, we work backwards
+ * character-by-character down to s0. At each current point
+ * see if we are at the beginning of a multi-char fold. If so,
+ * that means we would be splitting the fold across nodes, and
+ * so we back up one and try again.
+ *
+ * If we're not at the beginning, we still could be at the
+ * final two characters of a (rare) three character fold. We
+ * check if the sequence starting at the character before the
+ * current position (and including the current and next
+ * characters) is a three character fold. If not, the node can
+ * be split here. If it is, we have to backup two characters
+ * and try again.
+ *
+ * Otherwise, the node can be split at the current position.
+ */
+ s = old_old_s; /* Point to the beginning of the final char
+ that fits in the node */
- toFOLD_utf8_safe((U8*) p + UTF8SKIP(p),
- (U8*) RExC_end, foldbuf + fold_len,
- &next_fold_len);
- fold_len += next_fold_len;
- }
- else {
- if (UNLIKELY(p[1] == LATIN_SMALL_LETTER_SHARP_S)) {
- foldbuf[fold_len] = 's';
+ /* The same logic is used for UTF-8 patterns and not */
+ if (UTF) {
+ Size_t added_len;
+
+ /* Append the fold of ender */
+ (void) _to_uni_fold_flags(
+ ender,
+ (U8 *) e,
+ &added_len,
+ FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0));
+ e += added_len;
+
+ /* 's' and the character folded to by ender may be the
+ * first two of a three-character fold, in which case the
+ * node should not be split here. That may mean examining
+ * the so-far unparsed character starting at 'p'. But if
+ * ender folded to more than one character, we already have
+ * three characters to look at. Also, we first check if
+ * the sequence consisting of s and the next character form
+ * the first two of some three character fold. If not,
+ * there's no need to peek ahead. */
+ if ( added_len <= UTF8SKIP(e - added_len)
+ && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e)))
+ {
+ /* Here, the two do form the beginning of a potential
+ * three character fold. The unexamined character may
+ * or may not complete it. Peek at it. It might be
+ * something that ends the node or an escape sequence,
+ * in which case we don't know without a lot of work
+ * what it evaluates to, so we have to assume the worst
+ * case: that it does complete the fold, and so we
+ * can't split here. All such instances will have
+ * that character be an ASCII punctuation character,
+ * like a backslash. So, for that case, backup one and
+ * drop down to try at that position */
+ if (isPUNCT(*p)) {
+ s = (char *) utf8_hop_back((U8 *) s, -1,
+ (U8 *) s0);
+ backed_up = TRUE;
}
else {
- foldbuf[fold_len] = toLOWER_L1(p[1]);
+ /* Here, since it's not punctuation, it must be a
+ * real character, and we can append its fold to
+ * 'e' (having deliberately reserved enough space
+ * for this eventuality) and drop down to check if
+ * the three actually do form a folded sequence */
+ (void) _to_utf8_fold_flags(
+ (U8 *) p, (U8 *) RExC_end,
+ (U8 *) e,
+ &added_len,
+ FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0));
+ e += added_len;
}
- fold_len++;
}
- }
-
- /* Here foldbuf contains the the fold of p, and if appropriate
- * that of the character following p in the input. */
- /* Search backwards until find a place that doesn't split a
- * multi-char fold */
- while (1) {
- STRLEN s_len;
- char s_fold_buf[UTF8_MAXBYTES_CASE];
- char * s_fold = s_fold_buf;
+ /* Here, we either have three characters available in
+ * sequence starting at 's', or we have two characters and
+ * know that the following one can't possibly be part of a
+ * three character fold. We go through the node backwards
+ * until we find a place where we can split it without
+ * breaking apart a multi-character fold. At any given
+ * point we have to worry about if such a fold begins at
+ * the current 's', and also if a three-character fold
+ * begins at s-1, (containing s and s+1). Splitting in
+ * either case would break apart a fold */
+ do {
+ char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
+ (U8 *) s0);
+
+ /* If is a multi-char fold, can't split here. Backup
+ * one char and try again */
+ if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) {
+ s = prev_s;
+ backed_up = TRUE;
+ continue;
+ }
- if (s <= s0) {
+ /* If the two characters beginning at 's' are part of a
+ * three character fold starting at the character
+ * before s, we can't split either before or after s.
+ * Backup two chars and try again */
+ if ( LIKELY(s > s0)
+ && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
+ {
+ s = prev_s;
+ s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
+ backed_up = TRUE;
+ continue;
+ }
- /* There's no safe place in the node to split. Quit so
- * will take the whole node */
- oldp = sav_oldp;
+ /* Here there's no multi-char fold between s and the
+ * next character following it. We can split */
+ splittable = TRUE;
break;
- }
- /* Backup 1 character. The first time through this moves s
- * to point to the final character in the node */
- if (UTF) {
- s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
+ } while (s > s0); /* End of loops backing up through the node */
+
+ /* Here we either couldn't find a place to split the node,
+ * or else we broke out of the loop setting 'splittable' to
+ * true. In the latter case, the place to split is between
+ * the first and second characters in the sequence starting
+ * at 's' */
+ if (splittable) {
+ s += UTF8SKIP(s);
+ }
+ }
+ else { /* Pattern not UTF-8 */
+ if ( ender != LATIN_SMALL_LETTER_SHARP_S
+ || ASCII_FOLD_RESTRICTED)
+ {
+ *e++ = toLOWER_L1(ender);
}
else {
- s--;
+ *e++ = 's';
+ *e++ = 's';
}
- /* 's' may or may not be folded; so make sure it is, and
- * use just the final character in its fold (should there
- * be more than one */
- if (UTF) {
- toFOLD_utf8_safe((U8*) s,
- (U8*) s + UTF8SKIP(s),
- (U8 *) s_fold_buf, &s_len);
- while (s_fold + UTF8SKIP(s_fold) < s_fold_buf + s_len)
- {
- s_fold += UTF8SKIP(s_fold);
- }
- s_len = UTF8SKIP(s_fold);
- }
- else {
- if (UNLIKELY(UCHARAT(s) == LATIN_SMALL_LETTER_SHARP_S))
- {
- s_fold_buf[0] = 's';
+ if ( e - s <= 1
+ && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e)))
+ {
+ if (isPUNCT(*p)) {
+ s--;
+ backed_up = TRUE;
}
- else { /* This works for all other non-UTF-8 folds
- */
- s_fold_buf[0] = toLOWER_L1(UCHARAT(s));
+ else {
+ if ( UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
+ || ASCII_FOLD_RESTRICTED)
+ {
+ *e++ = toLOWER_L1(ender);
+ }
+ else {
+ *e++ = 's';
+ *e++ = 's';
+ }
}
- s_len = 1;
}
- /* Unshift this character to the beginning of the buffer,
- * No longer needed trailing characters are overwritten.
- * */
- Move(foldbuf, foldbuf + s_len, sizeof(foldbuf) - s_len, U8);
- Copy(s_fold, foldbuf, s_len, U8);
-
- /* If this isn't a multi-character fold, we have found a
- * splittable place. If this is the final character in the
- * node, that means the node is valid as-is, and can quit.
- * Otherwise, we note how much we can fill the node before
- * coming to a non-splittable position, and go parse it
- * again, stopping there. This is done because we know
- * where in the output to stop, but we don't have a map to
- * where that is in the input. One could be created, but
- * it seems like overkill for such a rare event as we are
- * dealing with here */
- if (UTF) {
- if (! is_MULTI_CHAR_FOLD_utf8_safe(foldbuf,
- foldbuf + UTF8_MAXBYTES_CASE))
+ do {
+ if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) {
+ s--;
+ backed_up = TRUE;
+ continue;
+ }
+
+ if ( LIKELY(s > s0)
+ && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
{
- upper_fill = s + UTF8SKIP(s) - s0;
- if (LIKELY(oldp)) {
- break;
- }
- goto reparse;
+ s -= 2;
+ backed_up = TRUE;
+ continue;
}
+
+ splittable = TRUE;
+ break;
+
+ } while (s > s0);
+
+ if (splittable) {
+ s++;
}
- else if (! is_MULTI_CHAR_FOLD_latin1_safe(foldbuf,
- foldbuf + UTF8_MAXBYTES_CASE))
- {
- upper_fill = s + 1 - s0;
- if (LIKELY(oldp)) {
- break;
- }
+ }
+
+ /* Here, we are done backing up. If we didn't backup at all
+ * (the likely case), just proceed */
+ if (backed_up) {
+
+ /* If we did find a place to split, reparse the entire node
+ * stopping where we have calculated. */
+ if (splittable) {
+ upper_fill = s - s0;
goto reparse;
}
- oldp = old_oldp;
- old_oldp = NULL;
-
- } /* End of loop backing up through the node */
/* Here the node consists entirely of non-final multi-char
* folds. (Likely it is all 'f's or all 's's.) There's no
* decent place to split it, so give up and just take the
* whole thing */
-
+ len = old_s - s0;
+ }
} /* End of verifying node ends with an appropriate char */
- p = oldp;
+ /* We need to start the next node at the character that didn't fit
+ * in this one */
+ p = oldp;
loopdone: /* Jumped to when encounters something that shouldn't be
in the node */
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl
index 8e3f06df41..fd3e2d6fa9 100755
--- a/regen/regcharclass.pl
+++ b/regen/regcharclass.pl
@@ -1581,15 +1581,35 @@ QUOTEMETA: Meta-characters that \Q should quote
MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
=> UTF8 :safe
-
-# 1 => All folds
-&regcharclass_multi_char_folds::multi_char_folds(1)
+&regcharclass_multi_char_folds::multi_char_folds('u', 'a')
MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
=> LATIN1 : safe
+&regcharclass_multi_char_folds::multi_char_folds('l', 'a')
+
+THREE_CHAR_FOLD: A three-character multi-char fold
+=> UTF8 :safe
+&regcharclass_multi_char_folds::multi_char_folds('u', '3')
+
+THREE_CHAR_FOLD: A three-character multi-char fold
+=> LATIN1 :safe
+&regcharclass_multi_char_folds::multi_char_folds('l', '3')
-&regcharclass_multi_char_folds::multi_char_folds(0)
-# 0 => Latin1-only
+THREE_CHAR_FOLD_HEAD: The first two of three-character multi-char folds
+=> UTF8 :safe
+&regcharclass_multi_char_folds::multi_char_folds('u', 'h')
+
+THREE_CHAR_FOLD_HEAD: The first two of three-character multi-char folds
+=> LATIN1 :safe
+&regcharclass_multi_char_folds::multi_char_folds('l', 'h')
+#
+#THREE_CHAR_FOLD_NON_FINAL: The first or middle character of multi-char folds
+#=> UTF8 :safe
+#&regcharclass_multi_char_folds::multi_char_folds('u', 'fm')
+#
+#THREE_CHAR_FOLD_NON_FINAL: The first or middle character of multi-char folds
+#=> LATIN1 :safe
+#&regcharclass_multi_char_folds::multi_char_folds('l', 'fm')
FOLDS_TO_MULTI: characters that fold to multi-char strings
=> UTF8 :fast
diff --git a/regen/regcharclass_multi_char_folds.pl b/regen/regcharclass_multi_char_folds.pl
index 7900a18782..a54b05cb19 100644
--- a/regen/regcharclass_multi_char_folds.pl
+++ b/regen/regcharclass_multi_char_folds.pl
@@ -60,10 +60,11 @@ sub gen_combinations ($;) {
return @ret;
}
-sub multi_char_folds ($) {
- my $all_folds = shift; # The single parameter is true if wants all
- # multi-char folds; false if just the ones that
- # are all ascii
+sub multi_char_folds ($$) {
+ my $type = shift; # 'u' for UTF-8; 'l' for latin1
+ my $range = shift; # 'a' for all; 'h' for starting 2 bytes; 'm' for ending 2
+ die "[lu] only valid values for first parameter" if $type !~ /[lu]/;
+ die "[aht3] only valid values for 2nd parameter" if $range !~ /[aht3]/;
return () if pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) lt v3.0.1;
@@ -87,6 +88,16 @@ sub multi_char_folds ($) {
die sprintf("regcomp.c can't cope with a latin1 multi-char fold (found in the fold of 0x%X", $cp_ref->[$i]) if grep { $_ < 256 && chr($_) !~ /[[:ascii:]]/ } @{$folds_ref->[$i]};
@folds = @{$folds_ref->[$i]};
+ if ($range eq '3') {
+ next if @folds < 3;
+ }
+ elsif ($range eq 'h') {
+ pop @folds;
+ }
+ elsif ($range eq 't') {
+ next if @folds < 3;
+ shift @folds;
+ }
# Create a line that looks like "\x{foo}\x{bar}\x{baz}" of the code
# points that make up the fold (use the actual character if
@@ -100,7 +111,7 @@ sub multi_char_folds ($) {
# Skip if something else already has this fold
next if grep { $_ eq $fold } @output_folds;
- if ($all_folds) {
+ if ($type eq 'u') {
push @output_folds, $fold;
} # Skip if wants only all-ascii folds, and there is a non-ascii
elsif (! grep { chr($_) =~ /[^[:ascii:]]/ } @folds) {
@@ -143,7 +154,7 @@ sub multi_char_folds ($) {
#
# No combinations of this with 's' need be added, as any of these
# containing 's' are prohibited under /iaa.
- push @output_folds, '"\x{17F}\x{17F}"' if $all_folds;
+ push @output_folds, '"\x{17F}\x{17F}"' if $type eq 'u' && $range eq 'a';
return @output_folds;
}
diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t
index b4f32eec04..59f2987015 100644
--- a/t/re/pat_advanced.t
+++ b/t/re/pat_advanced.t
@@ -2152,7 +2152,7 @@ EOP
"Check TRIE does not overwrite EXACT following NOTHING at start - RT #111842";
{
- my $single = ":";
+ my $single = "z";
my $upper = "\x{390}"; # Fold is 3 chars.
my $multi = CORE::fc($upper);