diff options
author | Karl Williamson <public@khwilliamson.com> | 2013-05-08 23:06:17 -0600 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2013-05-09 10:15:13 -0600 |
commit | 3345a47950127cf99a197eba4dce9c91f7bc9139 (patch) | |
tree | b584b8866960a66c9d80ac8dc993144c99cb9d10 | |
parent | fe02ddb7a070fc75fab3f7c2ed77f31b0dc5fc23 (diff) | |
download | perl-3345a47950127cf99a197eba4dce9c91f7bc9139.tar.gz |
Fix regex /il and /iaa failures for single element [] class
This was a regression introduced in the v5.17 series. It only affected
UTF-8 encoded patterns. The code here should have mirrored, but did not,
the similar logic located after the defchar: label in regcomp.c, which is
executed for the general case (a node that does not stem from a
single-element [bracketed] character class).
We don't fold code points 0-255 under locale, as their folds aren't known
until run time. Similarly, we don't allow folds that cross the 255/256
boundary, as those aren't well-defined; and under /aa we don't allow
folds that cross the 127/128 boundary.
-rw-r--r-- | regcomp.c | 16 | ||||
-rw-r--r-- | t/re/fold_grind.t | 8 |
2 files changed, 18 insertions, 6 deletions
@@ -10131,8 +10131,9 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32
  * additionally will populate the node's STRING with <code_point>, if <len>
  * is 0. In both cases <*flagp> is appropriately set
  *
- * It knows that under FOLD, UTF characters and the Latin Sharp S must be
- * folded (the latter only when the rules indicate it can match 'ss') */
+ * It knows that under FOLD, the Latin Sharp S and UTF characters above
+ * 255, must be folded (the former only when the rules indicate it can
+ * match 'ss') */
 
  bool len_passed_in = cBOOL(len != 0);
  U8 character[UTF8_MAXBYTES_CASE+1];
@@ -10141,8 +10142,15 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32
 
  if (! len_passed_in) {
  if (UTF) {
- if (FOLD) {
- to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
+ if (FOLD && (! LOC || code_point > 255)) {
+ _to_uni_fold_flags(NATIVE_TO_UNI(code_point),
+ character,
+ &len,
+ FOLD_FLAGS_FULL | ((LOC)
+ ? FOLD_FLAGS_LOCALE
+ : (ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0));
  }
  else {
  uvchr_to_utf8( character, code_point);
diff --git a/t/re/fold_grind.t b/t/re/fold_grind.t
index 3267336d84..bb45a699ad 100644
--- a/t/re/fold_grind.t
+++ b/t/re/fold_grind.t
@@ -666,6 +666,8 @@ foreach my $test (sort { numerically } keys %tests) {
  foreach my $bracketed (0, 1) { # Put rhs in [...], or not
  next if $bracketed && @pattern != 1; # bracketed makes these
  # or's instead of a sequence
+ foreach my $optimize_bracketed (0, 1) {
+ next if $optimize_bracketed && ! $bracketed;
  foreach my $inverted (0,1) {
  next if $inverted && ! $bracketed; # inversion only valid in [^...]
  next if $inverted && @target != 1; # [perl #89750] multi-char
@@ -687,8 +689,9 @@ foreach my $test (sort { numerically } keys %tests) {
  $rhs .= $rhs_char;
 
  # Add a character to the class, so class doesn't get
- # optimized out
- $rhs .= '_]' if $bracketed;
+ # optimized out, unless we are testing that optimization
+ $rhs .= '_' if $optimize_bracketed;
+ $rhs .= ']' if $bracketed;
  }
 
  # Add one of: no capturing parens
@@ -812,6 +815,7 @@ foreach my $test (sort { numerically } keys %tests) {
  }
  }
  }
+ }
  }
  }
  unless($list_all_tests) {