summary refs log tree commit diff
path: root/regcomp.c
diff options
context:
space:
mode:
author Karl Williamson <khw@cpan.org> 2020-04-04 16:06:52 -0600
committer Karl Williamson <khw@cpan.org> 2020-10-16 07:01:41 -0600
commit b1826163632422d276c89895546bd113c8f2cfe6 (patch)
tree 3bdfe941cff2457ae6ecef09d0b8e97170ba9f1f /regcomp.c
parent 4414955b8d69f301cec98246b177ffcc2eb9b061 (diff)
download perl-b1826163632422d276c89895546bd113c8f2cfe6.tar.gz
regcomp.c: Do some extra folding
Generally we have to wait until runtime to do folding for regnodes that are locale-dependent, because we don't know what the locale at runtime will be, and hence what the folds will be. But UTF-8 locales all have the same folding behavior, no matter what the locale is, with the exception of two fold pairs in Turkish. (Lithuanian too, but Perl doesn't support that language's special folding rules.) UTF-8 is the only locale type that Perl supports that can represent code points above 255. Therefore we do know at compile time what the above-255 folds are (again excepting the two in Turkish), and so we can do the folding then, but only if both components of the fold are above 255. There are a few folds that cross the 255/256 boundary, and they must be deferred. However, there are two instances where three characters fold together, two of which are above 255 and the third of which isn't. That the two high ones are equivalent under /i is known at compile time, and so that equivalence can be stated then.
Diffstat (limited to 'regcomp.c')
-rw-r--r-- regcomp.c 23
1 files changed, 19 insertions, 4 deletions
diff --git a/regcomp.c b/regcomp.c
index 8b90579ddc..72d788f975 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -14547,6 +14547,16 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
* things */
maybe_exactfu = FALSE;
+ /* Although these two characters have folds that are
+ * locale-problematic, they also have folds to above Latin1
+ * that aren't a problem. Doing these now helps at
+ * runtime. */
+ if (UNLIKELY( ender == GREEK_CAPITAL_LETTER_MU
+ || ender == LATIN_CAPITAL_LETTER_SHARP_S))
+ {
+ goto fold_anyway;
+ }
+
/* Here, we are adding a problematic fold character.
* "Problematic" in this context means that its fold isn't
* known until runtime. (The non-problematic code points
@@ -14600,15 +14610,20 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
*(s)++ = (U8) toFOLD(ender);
}
else {
- UV folded = _to_uni_fold_flags(
+ UV folded;
+
+ fold_anyway:
+ folded = _to_uni_fold_flags(
ender,
(U8 *) s, /* We have allocated extra space
in 's' so can't run off the
end */
&added_len,
- FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
- ? FOLD_FLAGS_NOMIX_ASCII
- : 0));
+ FOLD_FLAGS_FULL
+ | (( ASCII_FOLD_RESTRICTED
+ || node_type == EXACTFL)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0));
if (UNLIKELY(len + added_len > max_string_len)) {
overflowed = TRUE;
break;