summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Karl Williamson <public@khwilliamson.com> 2013-05-08 23:06:17 -0600
committer: Karl Williamson <public@khwilliamson.com> 2013-05-09 10:15:13 -0600
commit: 3345a47950127cf99a197eba4dce9c91f7bc9139 (patch)
tree: b584b8866960a66c9d80ac8dc993144c99cb9d10
parent: fe02ddb7a070fc75fab3f7c2ed77f31b0dc5fc23 (diff)
download: perl-3345a47950127cf99a197eba4dce9c91f7bc9139.tar.gz
Fix regex /il and /iaa failures for single element [] class
This was a regression introduced in the v5.17 series, and it affected only UTF-8 encoded patterns. The code here handles the case of a single element [bracketed] character class node; it should have mirrored, but did not, the similar logic located after the defchar: label in this file, which is executed for the general case (one not stemming from such a node). We don't fold code points 0-255 under locale, because their folds aren't known until run time. Similarly, we don't allow folds that cross the 255/256 boundary, as those aren't well-defined; and under /aa we don't allow folds that cross the 127/128 boundary.
-rw-r--r-- regcomp.c 16
-rw-r--r-- t/re/fold_grind.t 8
2 files changed, 18 insertions, 6 deletions
diff --git a/regcomp.c b/regcomp.c
index de17958146..bc0c0efd86 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -10131,8 +10131,9 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32
* additionally will populate the node's STRING with <code_point>, if <len>
* is 0. In both cases <*flagp> is appropriately set
*
- * It knows that under FOLD, UTF characters and the Latin Sharp S must be
- * folded (the latter only when the rules indicate it can match 'ss') */
+ * It knows that under FOLD, the Latin Sharp S and UTF characters above
+ * 255, must be folded (the former only when the rules indicate it can
+ * match 'ss') */
bool len_passed_in = cBOOL(len != 0);
U8 character[UTF8_MAXBYTES_CASE+1];
@@ -10141,8 +10142,15 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32
if (! len_passed_in) {
if (UTF) {
- if (FOLD) {
- to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
+ if (FOLD && (! LOC || code_point > 255)) {
+ _to_uni_fold_flags(NATIVE_TO_UNI(code_point),
+ character,
+ &len,
+ FOLD_FLAGS_FULL | ((LOC)
+ ? FOLD_FLAGS_LOCALE
+ : (ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0));
}
else {
uvchr_to_utf8( character, code_point);
diff --git a/t/re/fold_grind.t b/t/re/fold_grind.t
index 3267336d84..bb45a699ad 100644
--- a/t/re/fold_grind.t
+++ b/t/re/fold_grind.t
@@ -666,6 +666,8 @@ foreach my $test (sort { numerically } keys %tests) {
foreach my $bracketed (0, 1) { # Put rhs in [...], or not
next if $bracketed && @pattern != 1; # bracketed makes these
# or's instead of a sequence
+ foreach my $optimize_bracketed (0, 1) {
+ next if $optimize_bracketed && ! $bracketed;
foreach my $inverted (0,1) {
next if $inverted && ! $bracketed; # inversion only valid in [^...]
next if $inverted && @target != 1; # [perl #89750] multi-char
@@ -687,8 +689,9 @@ foreach my $test (sort { numerically } keys %tests) {
$rhs .= $rhs_char;
# Add a character to the class, so class doesn't get
- # optimized out
- $rhs .= '_]' if $bracketed;
+ # optimized out, unless we are testing that optimization
+ $rhs .= '_' if $optimize_bracketed;
+ $rhs .= ']' if $bracketed;
}
# Add one of: no capturing parens
@@ -812,6 +815,7 @@ foreach my $test (sort { numerically } keys %tests) {
}
}
}
+ }
}
}
unless($list_all_tests) {