summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/unicore/To/Fold.pl12
-rw-r--r--lib/unicore/mktables4
-rw-r--r--regcomp.c11
-rw-r--r--regexec.c19
-rw-r--r--utf8.c2
5 files changed, 36 insertions, 12 deletions
diff --git a/lib/unicore/To/Fold.pl b/lib/unicore/To/Fold.pl
index 1502690246..6b0c2e9a71 100644
--- a/lib/unicore/To/Fold.pl
+++ b/lib/unicore/To/Fold.pl
@@ -266,9 +266,12 @@ return <<'END';
01B7 0292
01B8 01B9
01BC 01BD
-01C4 01C5 01C6
-01C7 01C8 01C9
-01CA 01CB 01CC
+01C4 01C6
+01C5 01C6
+01C7 01C9
+01C8 01C9
+01CA 01CC
+01CB 01CC
01CD 01CE
01CF 01D0
01D1 01D2
@@ -286,7 +289,8 @@ return <<'END';
01EA 01EB
01EC 01ED
01EE 01EF
-01F1 01F2 01F3
+01F1 01F3
+01F2 01F3
01F4 01F5
01F6 0195
01F7 01BF
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index 546b3cf8f4..34d138826b 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -833,12 +833,14 @@ if (open(my $CaseFold, "CaseFold.txt")) {
my %Fold;
while (<$CaseFold>) {
+ # Skip status 'S', simple case folding
next unless /^([0-9A-Fa-f]+)\s*;\s*([CFI])\s*;\s*([0-9A-Fa-f]+(?: [0-9A-Fa-f]+)*)\s*;/;
my ($code, $status, $fold) = ($1, $2, $3);
if ($status eq 'C') { # Common: one-to-one folding
- append(\@Fold, $code, $fold);
+ # No append() since several codes may fold into one.
+ push @Fold, [ $code, $code, $fold ];
} else { # F: full, or I: dotted uppercase I -> dotless lowercase I
$Fold{hex($code)} = $fold;
}
diff --git a/regcomp.c b/regcomp.c
index b5d9860631..c537eaadbd 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -3985,6 +3985,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
(UV)value);
if (FOLD) {
+ U8 tmpbuf [UTF8_MAXLEN+1];
+ U8 foldbuf[UTF8_MAXLEN_FOLD+1];
+ STRLEN foldlen;
+ UV f;
+
+ uvchr_to_utf8(tmpbuf, value);
+ f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
+
+ if (f != value)
+ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", f);
+
if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
(UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
diff --git a/regexec.c b/regexec.c
index 7b459e2d45..3f1449dbb8 100644
--- a/regexec.c
+++ b/regexec.c
@@ -979,8 +979,8 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
while (s <= e) {
if ( utf8_to_uvchr((U8*)s, &len) == c1
&& (ln == len ||
- ibcmp_utf8(s, do_utf8, strend - s,
- m, UTF, ln))
+ ibcmp_utf8(s, do_utf8, (I32)(strend - s),
+ m, UTF, (I32)ln))
&& (norun || regtry(prog, s)) )
goto got_it;
s += len;
@@ -988,14 +988,21 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
}
else {
while (s <= e) {
+ U8 tmpbuf [UTF8_MAXLEN+1];
+ U8 foldbuf[UTF8_MAXLEN_FOLD+1];
+ STRLEN foldlen;
UV c = utf8_to_uvchr((U8*)s, &len);
+ UV f;
+
+ uvchr_to_utf8(tmpbuf, c);
+ f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
+
if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
- if ( (c == c1 || c == c2)
- && (ln == len ||
- ibcmp_utf8(s, do_utf8, strend - s,
- m, UTF, ln))
+ if ( (c == c1 || c == c2 || f == c1 || f == c2)
+ && ibcmp_utf8(s, do_utf8, (I32)(strend - s),
+ m, UTF, (I32)ln)
&& (norun || regtry(prog, s)) )
goto got_it;
s += len;
diff --git a/utf8.c b/utf8.c
index debfb9ceac..27f86b60c8 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1651,7 +1651,7 @@ Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
}
/*
-=for apidoc A|I32|ibcmp_utf8|const char *s1|bool u1|const char *s2|bool u2|register I32 len
+=for apidoc A|I32|ibcmp_utf8|const char *s1|bool u1|register I32 len1|const char *s2|bool u2|register I32 len2
Return true if the strings s1 and s2 differ case-insensitively, false
if not (if they are equal case-insensitively). If u1 is true, the