summary | refs | log | tree | commit | diff
path: root/regexec.c
diff options
context:
space:
mode:
author Nick Ing-Simmons <nik@tiuk.ti.com> 2003-08-09 00:05:49 +0100
committer Jarkko Hietaniemi <jhi@iki.fi> 2003-08-08 21:05:24 +0000
commit 5dab1207de1f24b66daa019a365ada0ee1faa6f9 (patch)
tree 33595e6d06e53418570b6587e1a989626f5b8002 /regexec.c
parent c1e0e3d213c8ff11e0eeef6aecef3894e8ec6e96 (diff)
download perl-5dab1207de1f24b66daa019a365ada0ee1faa6f9.tar.gz
Re: UNICODE regexp bug
Message-Id: <20030808220549.5109.4@llama.ni-s.u-net.com>

Better patch (than #20566) from NI-S, one more test (introduce another variable rather than reuse the tmp, though)

p4raw-id: //depot/perl@20568
Diffstat (limited to 'regexec.c')
-rw-r--r-- regexec.c 23
1 files changed, 16 insertions, 7 deletions
diff --git a/regexec.c b/regexec.c
index d4cf4ed7ea..1f36027c5c 100644
--- a/regexec.c
+++ b/regexec.c
@@ -953,6 +953,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
char *m;
STRLEN ln;
+ STRLEN lnc;
unsigned int c1;
unsigned int c2;
char *e;
@@ -1008,10 +1009,12 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
}
break;
case EXACTF:
- m = STRING(c);
- ln = STR_LEN(c);
+ m = STRING(c);
+ ln = STR_LEN(c); /* length to match in octets/bytes */
+ lnc = (I32) ln; /* length to match in characters */
if (UTF) {
STRLEN ulen1, ulen2;
+ U8 *sm = (U8 *) m;
U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
@@ -1022,6 +1025,11 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+ lnc = 0;
+ while (sm < ((U8 *) m + ln)) {
+ lnc++;
+ sm += UTF8SKIP(sm);
+ }
}
else {
c1 = *(U8*)m;
@@ -1029,14 +1037,13 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
}
goto do_exactf;
case EXACTFL:
- m = STRING(c);
- ln = STR_LEN(c);
+ m = STRING(c);
+ ln = STR_LEN(c);
+ lnc = (I32) ln;
c1 = *(U8*)m;
c2 = PL_fold_locale[c1];
do_exactf:
- /* The last byte to try is ln-1 characters before strend
- * since the strend points one byte past the string. */
- e = HOP3c(strend, (I32)1 - (I32)ln, s);
+ e = HOP3c(strend, -lnc, s);
if (norun && e < s)
e = s; /* Due to minlen logic of intuit() */
@@ -1059,6 +1066,8 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
STRLEN len, foldlen;
if (c1 == c2) {
+ /* Upper and lower of 1st char are equal -
+ * probably not a "letter". */
while (s <= e) {
c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
ckWARN(WARN_UTF8) ?