summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJarkko Hietaniemi <jhi@iki.fi>2001-12-31 16:33:08 +0000
committerJarkko Hietaniemi <jhi@iki.fi>2001-12-31 16:33:08 +0000
commit60a8b682cede796bc3c248d2778db979d6f9b9ff (patch)
treea0f7e39d5d66439494d6d23331b7f8051fcf49dd
parent8a4852225e5d282a2188e5d193a7e81b9749812e (diff)
downloadperl-60a8b682cede796bc3c248d2778db979d6f9b9ff.tar.gz
Add some comments to the recent Unicode case-folding saga.
p4raw-id: //depot/perl@13985
-rw-r--r--regcomp.c23
-rw-r--r--regexec.c21
2 files changed, 42 insertions, 2 deletions
diff --git a/regcomp.c b/regcomp.c
index cac14bf8e6..b442f2c246 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2987,7 +2987,7 @@ tryagain:
STRLEN numlen;
STRLEN ulen;
STRLEN foldlen;
- U8 tmpbuf[UTF8_MAXLEN_UCLC+1], *foldbuf;
+ U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
parse_start = RExC_parse - 1;
@@ -3131,13 +3131,18 @@ tryagain:
}
if (RExC_flags16 & PMf_EXTENDED)
p = regwhite(p, RExC_end);
- if (UTF && FOLD)
+ if (UTF && FOLD) {
+ /* Prime the casefolded buffer. */
toFOLD_uni(ender, tmpbuf, &foldlen);
+ /* Need to peek at the first character. */
+ ender = utf8_to_uvchr(tmpbuf, 0);
+ }
if (ISMULT2(p)) { /* Back off on ?+*. */
if (len)
p = oldp;
else if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
if (FOLD) {
+ /* Emit all the Unicode characters. */
for (foldbuf = tmpbuf;
foldlen;
foldlen -= numlen) {
@@ -3162,6 +3167,7 @@ tryagain:
}
if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(ender)) && UTF) {
if (FOLD) {
+ /* Emit all the Unicode characters. */
for (foldbuf = tmpbuf;
foldlen;
foldlen -= numlen) {
@@ -3206,6 +3212,8 @@ tryagain:
break;
}
+ /* If the encoding pragma is in effect recode the text of
+ * any EXACT-kind nodes. */
if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
STRLEN oldlen = STR_LEN(ret);
SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
@@ -4020,9 +4028,20 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
to_utf8_fold(tmpbuf, foldbuf, &foldlen);
f = utf8_to_uvchr(foldbuf, 0);
+ /* If folding and foldable, insert also
+ * the folded version to the charclass. */
if (f != value)
Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", f);
+ /* If folding and the value is one of the Greek
+ * sigmas insert a few more sigmas to make the
+ * folding rules of the sigmas work right.
+ * Note that not all the possible combinations
+ * are handled here: some of them are handled
+ * by the standard folding rules, and
+ * some of them (literal or EXACTF cases) are
+ * handled during runtime in
+ * regexec.c:S_find_byclass(). */
if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
(UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
diff --git a/regexec.c b/regexec.c
index e67774dc22..cf33abb51d 100644
--- a/regexec.c
+++ b/regexec.c
@@ -965,6 +965,16 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
if (norun && e < s)
e = s; /* Due to minlen logic of intuit() */
+ /* The idea in the EXACTF* cases is to first find the
+ * first character of the EXACTF* node and then, if
+ * necessary, case-insensitively compare the full
+ * text of the node. The c1 and c2 are the first
+ * characters (though in Unicode it gets a bit
+ * more complicated because there are more cases
+ * than just upper and lower: one is really supposed
+ * to use the so-called folding case for case-insensitive
+ * matching (called "loose matching" in Unicode)). */
+
if (do_utf8) {
UV c, f;
U8 tmpbuf [UTF8_MAXLEN+1];
@@ -1009,6 +1019,13 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
while (s <= e) {
c = utf8_to_uvchr((U8*)s, &len);
+ /* Handle some of the three Greek sigmas cases.
+ * Note that not all the possible combinations
+ * are handled here: some of them are handled
+ * by the standard folding rules, and
+ * some of them (the character class or ANYOF
+ * cases) are handled during compiletime in
+ * regcomp.c:S_regclass(). */
if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
@@ -2396,12 +2413,16 @@ S_regmatch(pTHX_ regnode *prog)
U8 lfoldbuf[UTF8_MAXLEN_FOLD+1];
STRLEN lfoldlen;
+ /* Try one of them folded. */
+
to_utf8_fold((U8*)l, lfoldbuf, &lfoldlen);
if (UTF8SKIP(s) != lfoldlen ||
memNE(s, (char*)lfoldbuf, lfoldlen)) {
U8 sfoldbuf[UTF8_MAXLEN_FOLD+1];
STRLEN sfoldlen;
+ /* Try both of them folded. */
+
to_utf8_fold((U8*)s, sfoldbuf, &sfoldlen);
if (sfoldlen != lfoldlen ||
memNE((char*)sfoldbuf,