summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorInaba Hiroto <inaba@st.rim.or.jp>2000-12-30 23:27:10 +0900
committerJarkko Hietaniemi <jhi@iki.fi>2000-12-30 17:14:19 +0000
commit1aa99e6b6d14c469ac825dde483d9c9f913a3ee2 (patch)
tree76ac8dd0d9473e84a19f6184baa4eddb6337e7b1
parent13e8c8e316d3839d0834fb8b851566b00d81e876 (diff)
downloadperl-1aa99e6b6d14c469ac825dde483d9c9f913a3ee2.tar.gz
more UTF8 test suites and an UTF8 patch
Message-ID: <3A4D722D.243AFD88@st.rim.or.jp> Just the patch part for now, and the pragma renamed as unicode::distinct. p4raw-id: //depot/perl@8267
-rw-r--r--MANIFEST1
-rw-r--r--doop.c333
-rw-r--r--embed.h8
-rwxr-xr-xembed.pl2
-rw-r--r--lib/unicode/distinct.pm35
-rw-r--r--mg.c14
-rw-r--r--op.c4
-rw-r--r--op.h3
-rw-r--r--perl.h1
-rw-r--r--pod/perlapi.pod11
-rw-r--r--pp.c14
-rw-r--r--pp_ctl.c17
-rw-r--r--pp_hot.c54
-rw-r--r--proto.h2
-rw-r--r--regcomp.c129
-rw-r--r--regcomp.h1
-rw-r--r--regexec.c519
-rw-r--r--sv.c28
-rw-r--r--toke.c36
-rw-r--r--utf8.c6
20 files changed, 798 insertions, 420 deletions
diff --git a/MANIFEST b/MANIFEST
index 7445fd785d..18ae760ecb 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -1072,6 +1072,7 @@ lib/unicode/To/Upper.pl Unicode character database
lib/unicode/UCD301.html Unicode character database
lib/unicode/UCDFF301.html Unicode character database
lib/unicode/Unicode.301 Unicode character database
+lib/unicode/distinct.pm Perl pragma to strictly distinguish UTF8 data and non-UTF data
lib/unicode/mktables.PL Unicode character database generator
lib/unicode/syllables.txt Unicode character database
lib/utf8.pm Pragma to control Unicode support
diff --git a/doop.c b/doop.c
index 3548556a89..8256b93d53 100644
--- a/doop.c
+++ b/doop.c
@@ -21,18 +21,6 @@
#endif
#endif
-#define HALF_UTF8_UPGRADE(start,end) \
- STMT_START { \
- if ((start)<(end)) { \
- U8* NeWsTr; \
- STRLEN LeN = (end) - (start); \
- NeWsTr = bytes_to_utf8(start, &LeN); \
- Safefree(start); \
- (start) = NeWsTr; \
- (end) = (start) + LeN; \
- } \
- } STMT_END
-
STATIC I32
S_do_trans_simple(pTHX_ SV *sv)
{
@@ -41,7 +29,6 @@ S_do_trans_simple(pTHX_ SV *sv)
U8 *send;
U8 *dstart;
I32 matches = 0;
- I32 sutf = SvUTF8(sv);
STRLEN len;
short *tbl;
I32 ch;
@@ -54,7 +41,7 @@ S_do_trans_simple(pTHX_ SV *sv)
send = s + len;
/* First, take care of non-UTF8 input strings, because they're easy */
- if (!sutf) {
+ if (!SvUTF8(sv)) {
while (s < send) {
if ((ch = tbl[*s]) >= 0) {
matches++;
@@ -79,7 +66,10 @@ S_do_trans_simple(pTHX_ SV *sv)
c = utf8_to_uv(s, send - s, &ulen, 0);
if (c < 0x100 && (ch = tbl[(short)c]) >= 0) {
matches++;
- d = uv_to_utf8(d,ch);
+ if (ch < 0x80)
+ *d++ = ch;
+ else
+ d = uv_to_utf8(d,ch);
s += ulen;
}
else { /* No match -> copy */
@@ -88,8 +78,7 @@ S_do_trans_simple(pTHX_ SV *sv)
}
}
*d = '\0';
- sv_setpvn(sv, (const char*)dstart, d - dstart);
- Safefree(dstart);
+ sv_setpvn(sv, (char*)dstart, d - dstart);
SvUTF8_on(sv);
SvSETMAGIC(sv);
return matches;
@@ -101,7 +90,6 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */
U8 *s;
U8 *send;
I32 matches = 0;
- I32 hasutf = SvUTF8(sv);
STRLEN len;
short *tbl;
@@ -112,22 +100,20 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */
s = (U8*)SvPV(sv, len);
send = s + len;
- while (s < send) {
- if (hasutf && *s & 0x80)
- s += UTF8SKIP(s);
- else {
- UV c;
- STRLEN ulen;
- ulen = 1;
- if (hasutf)
- c = utf8_to_uv(s, send - s, &ulen, 0);
- else
- c = *s;
- if (c < 0x100 && tbl[c] >= 0)
+ if (!SvUTF8(sv))
+ while (s < send) {
+ if (tbl[*s++] >= 0)
matches++;
- s += ulen;
- }
- }
+ }
+ else
+ while (s < send) {
+ UV c;
+ STRLEN ulen;
+ c = utf8_to_uv(s, send - s, &ulen, 0);
+ if (c < 0x100 && tbl[c] >= 0)
+ matches++;
+ s += ulen;
+ }
return matches;
}
@@ -139,7 +125,7 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */
U8 *send;
U8 *d;
U8 *dstart;
- I32 hasutf = SvUTF8(sv);
+ I32 isutf8;
I32 matches = 0;
STRLEN len;
short *tbl;
@@ -150,64 +136,96 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */
Perl_croak(aTHX_ "panic: do_trans_complex");
s = (U8*)SvPV(sv, len);
+ isutf8 = SvUTF8(sv);
send = s + len;
- Newz(0, d, len*2+1, U8);
- dstart = d;
-
- if (PL_op->op_private & OPpTRANS_SQUASH) {
- U8* p = send;
-
- while (s < send) {
- if (hasutf && *s & 0x80)
- s += UTF8SKIP(s);
- else {
- if ((ch = tbl[*s]) >= 0) {
+ if (!isutf8) {
+ dstart = d = s;
+ if (PL_op->op_private & OPpTRANS_SQUASH) {
+ U8* p = send;
+ while (s < send) {
+ if ((ch = tbl[*s]) >= 0) {
*d = ch;
matches++;
- if (p != d - 1 || *p != *d)
- p = d++;
- }
- else if (ch == -1) /* -1 is unmapped character */
- *d++ = *s; /* -2 is delete character */
- s++;
- }
+ if (p != d - 1 || *p != *d)
+ p = d++;
+ }
+ else if (ch == -1) /* -1 is unmapped character */
+ *d++ = *s;
+ else if (ch == -2) /* -2 is delete character */
+ matches++;
+ s++;
+ }
}
+ else {
+ while (s < send) {
+ if ((ch = tbl[*s]) >= 0) {
+ matches++;
+ *d++ = ch;
+ }
+ else if (ch == -1) /* -1 is unmapped character */
+ *d++ = *s;
+ else if (ch == -2) /* -2 is delete character */
+ matches++;
+ s++;
+ }
+ }
+ SvCUR_set(sv, d - dstart);
}
- else {
- while (s < send) {
- UV comp;
- if (hasutf && *s & 0x80)
- comp = utf8_to_uv_simple(s, NULL);
- else
- comp = *s;
-
- ch = tbl[comp];
-
- if (ch == -1) { /* -1 is unmapped character */
- ch = comp;
- matches--;
- }
-
- if (ch >= 0)
- d = uv_to_utf8(d, ch);
-
- matches++;
-
- s += hasutf && *s & 0x80 ? UNISKIP(*s) : 1;
-
+ else { /* isutf8 */
+ Newz(0, d, len*2+1, U8);
+ dstart = d;
+
+ if (PL_op->op_private & OPpTRANS_SQUASH) {
+ U8* p = send;
+ UV pch = 0xfeedface;
+ while (s < send) {
+ STRLEN len;
+ UV comp = utf8_to_uv_simple(s, &len);
+
+ if (comp > 0xff)
+ d = uv_to_utf8(d, comp); /* always unmapped */
+ else if ((ch = tbl[comp]) >= 0) {
+ matches++;
+ if (ch != pch) {
+ d = uv_to_utf8(d, ch);
+ pch = ch;
+ }
+ s += len;
+ continue;
+ }
+ else if (ch == -1) /* -1 is unmapped character */
+ d = uv_to_utf8(d, comp);
+ else if (ch == -2) /* -2 is delete character */
+ matches++;
+ s += len;
+ pch = 0xfeedface;
+ }
}
+ else {
+ while (s < send) {
+ STRLEN len;
+ UV comp = utf8_to_uv_simple(s, &len);
+ if (comp > 0xff)
+ d = uv_to_utf8(d, comp); /* always unmapped */
+ else if ((ch = tbl[comp]) >= 0) {
+ d = uv_to_utf8(d, ch);
+ matches++;
+ }
+ else if (ch == -1) { /* -1 is unmapped character */
+ d = uv_to_utf8(d, comp);
+ }
+ else if (ch == -2) /* -2 is delete character */
+ matches++;
+ s += len;
+ }
+ }
+ *d = '\0';
+ sv_setpvn(sv, (char*)dstart, d - dstart);
+ SvUTF8_on(sv);
}
-
- *d = '\0';
-
- sv_setpvn(sv, (const char*)dstart, d - dstart);
- Safefree(dstart);
- if (hasutf)
- SvUTF8_on(sv);
SvSETMAGIC(sv);
return matches;
-
}
STATIC I32
@@ -217,7 +235,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
U8 *send;
U8 *d;
U8 *start;
- U8 *dstart;
+ U8 *dstart, *dend;
I32 matches = 0;
STRLEN len;
@@ -228,11 +246,19 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
UV extra = none + 1;
UV final;
UV uv;
- I32 isutf;
- I32 howmany;
+ I32 isutf8;
+ U8 hibit = 0;
- isutf = SvUTF8(sv);
s = (U8*)SvPV(sv, len);
+ isutf8 = SvUTF8(sv);
+ if (!isutf8) {
+ U8 *t = s, *e = s + len;
+ while (t < e)
+ if ((hibit = *t++ & 0x80))
+ break;
+ if (hibit)
+ s = bytes_to_utf8(s, &len);
+ }
send = s + len;
start = s;
@@ -241,41 +267,46 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
final = SvUV(*svp);
/* d needs to be bigger than s, in case e.g. upgrading is required */
- Newz(0, d, len*2+1, U8);
+ New(0, d, len*3+UTF8_MAXLEN, U8);
+ dend = d + len * 3;
dstart = d;
+
while (s < send) {
if ((uv = swash_fetch(rv, s)) < none) {
s += UTF8SKIP(s);
matches++;
- if ((uv & 0x80) && !isutf++)
- HALF_UTF8_UPGRADE(dstart,d);
d = uv_to_utf8(d, uv);
}
else if (uv == none) {
- int i;
- i = UTF8SKIP(s);
- if (i > 1 && !isutf++)
- HALF_UTF8_UPGRADE(dstart,d);
+ int i = UTF8SKIP(s);
while(i--)
*d++ = *s++;
}
else if (uv == extra) {
- int i;
- i = UTF8SKIP(s);
+ int i = UTF8SKIP(s);
s += i;
matches++;
- if (i > 1 && !isutf++)
- HALF_UTF8_UPGRADE(dstart,d);
d = uv_to_utf8(d, final);
}
else
s += UTF8SKIP(s);
+
+ if (d >= dend) {
+ STRLEN clen = d - dstart;
+ STRLEN nlen = dend - dstart + len + UTF8_MAXLEN;
+ Renew(dstart, nlen+UTF8_MAXLEN, U8);
+ d = dstart + clen;
+ dend = dstart + nlen;
+ }
}
*d = '\0';
- sv_setpvn(sv, (const char*)dstart, d - dstart);
+ sv_setpvn(sv, (char*)dstart, d - dstart);
SvSETMAGIC(sv);
- if (isutf)
- SvUTF8_on(sv);
+ SvUTF8_on(sv);
+ if (hibit)
+ Safefree(start);
+ if (!isutf8 && !(PL_hints & HINT_UTF8))
+ sv_utf8_downgrade(sv, TRUE);
return matches;
}
@@ -284,7 +315,7 @@ STATIC I32
S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */
{
U8 *s;
- U8 *send;
+ U8 *start, *send;
I32 matches = 0;
STRLEN len;
@@ -293,10 +324,17 @@ S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */
SV** svp = hv_fetch(hv, "NONE", 4, FALSE);
UV none = svp ? SvUV(*svp) : 0x7fffffff;
UV uv;
+ U8 hibit = 0;
s = (U8*)SvPV(sv, len);
- if (!SvUTF8(sv))
- s = bytes_to_utf8(s, &len);
+ if (!SvUTF8(sv)) {
+ U8 *t = s, *e = s + len;
+ while (t < e)
+ if ((hibit = *t++ & 0x80))
+ break;
+ if (hibit)
+ start = s = bytes_to_utf8(s, &len);
+ }
send = s + len;
while (s < send) {
@@ -304,6 +342,8 @@ S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */
matches++;
s += UTF8SKIP(s);
}
+ if (hibit)
+ Safefree(start);
return matches;
}
@@ -312,7 +352,7 @@ STATIC I32
S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
{
U8 *s;
- U8 *send;
+ U8 *start, *send;
U8 *d;
I32 matches = 0;
I32 squash = PL_op->op_private & OPpTRANS_SQUASH;
@@ -325,41 +365,45 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
UV final;
UV uv;
STRLEN len;
- U8 *dst;
- I32 isutf = SvUTF8(sv);
+ U8 *dstart, *dend;
+ I32 isutf8;
+ U8 hibit = 0;
s = (U8*)SvPV(sv, len);
+ isutf8 = SvUTF8(sv);
+ if (!isutf8) {
+ U8 *t = s, *e = s + len;
+ while (t < e)
+ if ((hibit = *t++ & 0x80))
+ break;
+ if (hibit)
+ s = bytes_to_utf8(s, &len);
+ }
send = s + len;
+ start = s;
svp = hv_fetch(hv, "FINAL", 5, FALSE);
if (svp)
final = SvUV(*svp);
- Newz(0, d, len*2+1, U8);
- dst = d;
+ New(0, d, len*3+UTF8_MAXLEN, U8);
+ dend = d + len * 3;
+ dstart = d;
if (squash) {
UV puv = 0xfeedface;
while (s < send) {
- if (SvUTF8(sv))
- uv = swash_fetch(rv, s);
- else {
- U8 tmpbuf[2];
- uv = *s++;
- if (uv < 0x80)
- tmpbuf[0] = uv;
- else {
- tmpbuf[0] = (( uv >> 6) | 0xc0);
- tmpbuf[1] = (( uv & 0x3f) | 0x80);
- }
- uv = swash_fetch(rv, tmpbuf);
+ uv = swash_fetch(rv, s);
+
+ if (d >= dend) {
+ STRLEN clen = d - dstart, nlen = dend - dstart + len;
+ Renew(dstart, nlen+UTF8_MAXLEN, U8);
+ d = dstart + clen;
+ dend = dstart + nlen;
}
-
if (uv < none) {
matches++;
if (uv != puv) {
- if ((uv & 0x80) && !isutf++)
- HALF_UTF8_UPGRADE(dst,d);
d = uv_to_utf8(d, uv);
puv = uv;
}
@@ -367,9 +411,9 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
continue;
}
else if (uv == none) { /* "none" is unmapped character */
- STRLEN ulen;
- *d++ = (U8)utf8_to_uv(s, send - s, &ulen, 0);
- s += ulen;
+ int i = UTF8SKIP(s);
+ while(i--)
+ *d++ = *s++;
puv = 0xfeedface;
continue;
}
@@ -388,18 +432,12 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
}
else {
while (s < send) {
- if (SvUTF8(sv))
- uv = swash_fetch(rv, s);
- else {
- U8 tmpbuf[2];
- uv = *s++;
- if (uv < 0x80)
- tmpbuf[0] = uv;
- else {
- tmpbuf[0] = (( uv >> 6) | 0xc0);
- tmpbuf[1] = (( uv & 0x3f) | 0x80);
- }
- uv = swash_fetch(rv, tmpbuf);
+ uv = swash_fetch(rv, s);
+ if (d >= dend) {
+ STRLEN clen = d - dstart, nlen = dend - dstart + len;
+ Renew(dstart, nlen+UTF8_MAXLEN, U8);
+ d = dstart + clen;
+ dend = dstart + nlen;
}
if (uv < none) {
matches++;
@@ -408,9 +446,9 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
continue;
}
else if (uv == none) { /* "none" is unmapped character */
- STRLEN ulen;
- *d++ = (U8)utf8_to_uv(s, send - s, &ulen, 0);
- s += ulen;
+ int i = UTF8SKIP(s);
+ while(i--)
+ *d++ = *s++;
continue;
}
else if (uv == extra && !del) {
@@ -423,12 +461,13 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
s += UTF8SKIP(s);
}
}
- if (dst)
- sv_usepvn(sv, (char*)dst, d - dst);
- else {
- *d = '\0';
- SvCUR_set(sv, d - (U8*)SvPVX(sv));
- }
+ *d = '\0';
+ sv_setpvn(sv, (char*)dstart, d - dstart);
+ SvUTF8_on(sv);
+ if (hibit)
+ Safefree(start);
+ if (!isutf8 && !(PL_hints & HINT_UTF8))
+ sv_utf8_downgrade(sv, TRUE);
SvSETMAGIC(sv);
return matches;
diff --git a/embed.h b/embed.h
index 3b54154de1..414a642033 100644
--- a/embed.h
+++ b/embed.h
@@ -1030,7 +1030,9 @@
#define regcp_set_to S_regcp_set_to
#define cache_re S_cache_re
#define reghop S_reghop
+#define reghop3 S_reghop3
#define reghopmaybe S_reghopmaybe
+#define reghopmaybe3 S_reghopmaybe3
#define find_byclass S_find_byclass
#endif
#if defined(PERL_IN_RUN_C) || defined(PERL_DECL_PROT)
@@ -2492,7 +2494,9 @@
#define regcp_set_to(a) S_regcp_set_to(aTHX_ a)
#define cache_re(a) S_cache_re(aTHX_ a)
#define reghop(a,b) S_reghop(aTHX_ a,b)
+#define reghop3(a,b,c) S_reghop3(aTHX_ a,b,c)
#define reghopmaybe(a,b) S_reghopmaybe(aTHX_ a,b)
+#define reghopmaybe3(a,b,c) S_reghopmaybe3(aTHX_ a,b,c)
#define find_byclass(a,b,c,d,e,f) S_find_byclass(aTHX_ a,b,c,d,e,f)
#endif
#if defined(PERL_IN_RUN_C) || defined(PERL_DECL_PROT)
@@ -4853,8 +4857,12 @@
#define cache_re S_cache_re
#define S_reghop CPerlObj::S_reghop
#define reghop S_reghop
+#define S_reghop3 CPerlObj::S_reghop3
+#define reghop3 S_reghop3
#define S_reghopmaybe CPerlObj::S_reghopmaybe
#define reghopmaybe S_reghopmaybe
+#define S_reghopmaybe3 CPerlObj::S_reghopmaybe3
+#define reghopmaybe3 S_reghopmaybe3
#define S_find_byclass CPerlObj::S_find_byclass
#define find_byclass S_find_byclass
#endif
diff --git a/embed.pl b/embed.pl
index 32f3ddc329..7b83635804 100755
--- a/embed.pl
+++ b/embed.pl
@@ -2407,7 +2407,9 @@ s |char*|regcppop
s |char*|regcp_set_to |I32 ss
s |void |cache_re |regexp *prog
s |U8* |reghop |U8 *pos|I32 off
+s |U8* |reghop3 |U8 *pos|I32 off|U8 *lim
s |U8* |reghopmaybe |U8 *pos|I32 off
+s |U8* |reghopmaybe3 |U8 *pos|I32 off|U8 *lim
s |char* |find_byclass |regexp * prog|regnode *c|char *s|char *strend|char *startpos|I32 norun
#endif
diff --git a/lib/unicode/distinct.pm b/lib/unicode/distinct.pm
new file mode 100644
index 0000000000..6471ac8d84
--- /dev/null
+++ b/lib/unicode/distinct.pm
@@ -0,0 +1,35 @@
+package unicode:distinct;
+
+our $VERSION = '0.01';
+
+$unicode::distinct::hint_bits = 0x01000000;
+
+sub import {
+ $^H |= $unicode::distinct::hint_bits;
+}
+
+sub unimport {
+ $^H &= ~$unicode::distinct::hint_bits;
+}
+
+1;
+__END__
+
+=head1 NAME
+
+unicode::distinct - Perl pragma to strictly distinguish UTF8 data and non-UTF data.
+
+=head1 SYNOPSIS
+
+ use unicode::distinct;
+ no unicode::distinct;
+
+=head1 DESCRIPTION
+
+ *NOT YET*
+
+=head1 SEE ALSO
+
+L<perlunicode>, L<utf8>
+
+=cut
diff --git a/mg.c b/mg.c
index 821c32537d..340c1e80d1 100644
--- a/mg.c
+++ b/mg.c
@@ -366,6 +366,11 @@ Perl_magic_regdatum_get(pTHX_ SV *sv, MAGIC *mg)
i = t;
else /* @- */
i = s;
+
+ if (i > 0 && DO_UTF8(PL_reg_sv)) {
+ char *b = rx->subbeg;
+ i = Perl_utf8_length(aTHX_ (U8*)b, (U8*)(b+i));
+ }
sv_setiv(sv,i);
}
}
@@ -1410,7 +1415,14 @@ Perl_magic_setsubstr(pTHX_ SV *sv, MAGIC *mg)
{
STRLEN len;
char *tmps = SvPV(sv,len);
- sv_insert(LvTARG(sv),LvTARGOFF(sv),LvTARGLEN(sv), tmps, len);
+ if (DO_UTF8(sv)) {
+ sv_utf8_upgrade(LvTARG(sv));
+ sv_insert(LvTARG(sv),LvTARGOFF(sv),LvTARGLEN(sv), tmps, len);
+ SvUTF8_on(LvTARG(sv));
+ }
+ else
+ sv_insert(LvTARG(sv),LvTARGOFF(sv),LvTARGLEN(sv), tmps, len);
+
return 0;
}
diff --git a/op.c b/op.c
index 28e7e98606..8879854f31 100644
--- a/op.c
+++ b/op.c
@@ -6772,6 +6772,8 @@ Perl_peep(pTHX_ register OP *o)
if (!fields || !GvHV(*fields))
break;
key = SvPV(*svp, keylen);
+ if (SvUTF8(*svp))
+ keylen = -keylen;
indsvp = hv_fetch(GvHV(*fields), key, keylen, FALSE);
if (!indsvp) {
Perl_croak(aTHX_ "No such pseudo-hash field \"%s\" in variable %s of type %s",
@@ -6837,6 +6839,8 @@ Perl_peep(pTHX_ register OP *o)
key_op = (SVOP*)key_op->op_sibling) {
svp = cSVOPx_svp(key_op);
key = SvPV(*svp, keylen);
+ if (SvUTF8(*svp))
+ keylen = -keylen;
indsvp = hv_fetch(GvHV(*fields), key, keylen, FALSE);
if (!indsvp) {
Perl_croak(aTHX_ "No such pseudo-hash field \"%s\" "
diff --git a/op.h b/op.h
index 55b85a5494..e0fc3bc645 100644
--- a/op.h
+++ b/op.h
@@ -250,6 +250,9 @@ struct pmop {
#define PMdf_USED 0x01 /* pm has been used once already */
#define PMdf_TAINTED 0x02 /* pm compiled from tainted pattern */
#define PMdf_UTF8 0x04 /* pm compiled from utf8 data */
+#define PMdf_DYN_UTF8 0x08
+
+#define PMdf_CMP_UTF8 (PMdf_UTF8|PMdf_DYN_UTF8)
#define PMf_RETAINT 0x0001 /* taint $1 etc. if target tainted */
#define PMf_ONCE 0x0002 /* use pattern only once per reset */
diff --git a/perl.h b/perl.h
index cccf728439..77ef4c9817 100644
--- a/perl.h
+++ b/perl.h
@@ -2698,6 +2698,7 @@ enum { /* pass one of these to get_vtbl */
#define HINT_FILETEST_ACCESS 0x00400000
#define HINT_UTF8 0x00800000
+#define HINT_UTF8_DISTINCT 0x01000000
/* Various states of an input record separator SV (rs, nrs) */
#define RsSNARF(sv) (! SvOK(sv))
diff --git a/pod/perlapi.pod b/pod/perlapi.pod
index 642efe96ea..f5596e27c9 100644
--- a/pod/perlapi.pod
+++ b/pod/perlapi.pod
@@ -2582,8 +2582,9 @@ Found in file sv.c
=item sv_catsv
-Concatenates the string from SV C<ssv> onto the end of the string in SV
-C<dsv>. Handles 'get' magic, but not 'set' magic. See C<sv_catsv_mg>.
+Concatenates the string from SV C<ssv> onto the end of the string in
+SV C<dsv>. Modifies C<dsv> but not C<ssv>. Handles 'get' magic, but
+not 'set' magic. See C<sv_catsv_mg>.
void sv_catsv(SV* dsv, SV* ssv)
@@ -3283,8 +3284,7 @@ Found in file utf8.c
Returns the character value of the first character in the string C<s>
which is assumed to be in UTF8 encoding and no longer than C<curlen>;
-C<retlen> will be set to the length, in bytes, of that character,
-and the pointer C<s> will be advanced to the end of the character.
+C<retlen> will be set to the length, in bytes, of that character.
If C<s> does not point to a well-formed UTF8 character, the behaviour
is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
@@ -3306,8 +3306,7 @@ Found in file utf8.c
Returns the character value of the first character in the string C<s>
which is assumed to be in UTF8 encoding; C<retlen> will be set to the
-length, in bytes, of that character, and the pointer C<s> will be
-advanced to the end of the character.
+length, in bytes, of that character.
If C<s> does not point to a well-formed UTF8 character, zero is
returned and retlen is set, if possible, to -1.
diff --git a/pp.c b/pp.c
index 11506977b2..ff4508c24f 100644
--- a/pp.c
+++ b/pp.c
@@ -5739,9 +5739,9 @@ PP(pp_split)
AV *ary;
register IV limit = POPi; /* note, negative is forever */
SV *sv = POPs;
- bool do_utf8 = DO_UTF8(sv);
STRLEN len;
register char *s = SvPV(sv, len);
+ bool do_utf8 = DO_UTF8(sv);
char *strend = s + len;
register PMOP *pm;
register REGEXP *rx;
@@ -5878,7 +5878,7 @@ PP(pp_split)
SV *csv = CALLREG_INTUIT_STRING(aTHX_ rx);
len = rx->minlen;
- if (len == 1 && !tail) {
+ if (len == 1 && !(rx->reganch & ROPT_UTF8) && !tail) {
STRLEN n_a;
char c = *SvPV(csv, n_a);
while (--limit) {
@@ -5895,7 +5895,10 @@ PP(pp_split)
XPUSHs(dstr);
/* The rx->minlen is in characters but we want to step
* s ahead by bytes. */
- s = m + (do_utf8 ? SvCUR(csv) : len);
+ if (do_utf8)
+ s = (char*)utf8_hop((U8*)m, len);
+ else
+ s = m + len; /* Fake \n at the end */
}
}
else {
@@ -5914,7 +5917,10 @@ PP(pp_split)
XPUSHs(dstr);
/* The rx->minlen is in characters but we want to step
* s ahead by bytes. */
- s = m + (do_utf8 ? SvCUR(csv) : len); /* Fake \n at the end */
+ if (do_utf8)
+ s = (char*)utf8_hop((U8*)m, len);
+ else
+ s = m + len; /* Fake \n at the end */
}
}
}
diff --git a/pp_ctl.c b/pp_ctl.c
index aff58153ce..dd4bae9ecb 100644
--- a/pp_ctl.c
+++ b/pp_ctl.c
@@ -115,7 +115,9 @@ PP(pp_regcomp)
pm->op_pmflags = pm->op_pmpermflags; /* reset case sensitivity */
if (DO_UTF8(tmpstr))
- pm->op_pmdynflags |= PMdf_UTF8;
+ pm->op_pmdynflags |= PMdf_DYN_UTF8;
+ else
+ pm->op_pmdynflags &= ~PMdf_DYN_UTF8;
pm->op_pmregexp = CALLREGCOMP(aTHX_ t, t + len, pm);
PL_reginterp_cnt = 0; /* XXXX Be extra paranoid - needed
inside tie/overload accessors. */
@@ -185,12 +187,12 @@ PP(pp_substcont)
SvPVX(targ) = SvPVX(dstr);
SvCUR_set(targ, SvCUR(dstr));
SvLEN_set(targ, SvLEN(dstr));
+ if (DO_UTF8(dstr))
+ SvUTF8_on(targ);
SvPVX(dstr) = 0;
sv_free(dstr);
TAINT_IF(cx->sb_rxtainted & 1);
- if (pm->op_pmdynflags & PMdf_UTF8)
- SvUTF8_on(targ); /* could also copy SvUTF8(dstr)? */
PUSHs(sv_2mortal(newSViv((I32)cx->sb_iters - 1)));
(void)SvPOK_only_UTF8(targ);
@@ -1439,8 +1441,13 @@ Perl_die_where(pTHX_ char *message, STRLEN msglen)
}
}
}
- else
+ else {
sv_setpvn(ERRSV, message, msglen);
+ if (PL_hints & HINT_UTF8)
+ SvUTF8_on(ERRSV);
+ else
+ SvUTF8_off(ERRSV);
+ }
}
else
message = SvPVx(ERRSV, msglen);
@@ -2754,7 +2761,7 @@ Perl_sv_compile_2op(pTHX_ SV *sv, OP** startop, char *code, AV** avp)
#else
SAVEVPTR(PL_op);
#endif
- PL_hints = 0;
+ PL_hints &= HINT_UTF8;
PL_op = &dummy;
PL_op->op_type = OP_ENTEREVAL;
diff --git a/pp_hot.c b/pp_hot.c
index c7555c4762..f9c5960a63 100644
--- a/pp_hot.c
+++ b/pp_hot.c
@@ -140,30 +140,31 @@ PP(pp_concat)
djSP; dATARGET; tryAMAGICbin(concat,opASSIGN);
{
dPOPTOPssrl;
- STRLEN len;
- U8 *s;
+ STRLEN len, llen, rlen;
+ U8 *s, *l, *r;
bool left_utf8;
bool right_utf8;
- if (TARG == right && SvGMAGICAL(right))
- mg_get(right);
- if (SvGMAGICAL(left))
+ r = (U8*)SvPV(right,rlen);
+
+ if (TARG != left)
+ l = (U8*)SvPV(left,llen);
+ else if (SvGMAGICAL(left))
mg_get(left);
left_utf8 = DO_UTF8(left);
right_utf8 = DO_UTF8(right);
- if (left_utf8 != right_utf8) {
+ if (left_utf8 != right_utf8 && !IN_BYTE) {
if (TARG == right && !right_utf8) {
sv_utf8_upgrade(TARG); /* Now straight binary copy */
SvUTF8_on(TARG);
}
else {
/* Set TARG to PV(left), then add right */
- U8 *l, *c, *olds = NULL;
+ U8 *c, *olds = NULL;
STRLEN targlen;
- s = (U8*)SvPV(right,len);
- right_utf8 |= DO_UTF8(right);
+ s = r; len = rlen;
if (TARG == right) {
/* Take a copy since we're about to overwrite TARG */
olds = s = (U8*)savepvn((char*)s, len);
@@ -174,12 +175,12 @@ PP(pp_concat)
else
sv_setpv(left, ""); /* Suppress warning. */
}
- l = (U8*)SvPV(left, targlen);
- left_utf8 |= DO_UTF8(left);
if (TARG != left)
- sv_setpvn(TARG, (char*)l, targlen);
- if (!left_utf8)
- sv_utf8_upgrade(TARG);
+ sv_setpvn(TARG, (char*)l, llen);
+ if (!left_utf8) {
+ SvUTF8_off(TARG);
+ sv_utf8_upgrade(TARG);
+ }
/* Extend TARG to length of right (s) */
targlen = SvCUR(TARG) + len;
if (!right_utf8) {
@@ -207,17 +208,16 @@ PP(pp_concat)
}
if (TARG != left) {
- s = (U8*)SvPV(left,len);
if (TARG == right) {
- sv_insert(TARG, 0, 0, (char*)s, len);
+ sv_insert(TARG, 0, 0, (char*)l, llen);
SETs(TARG);
RETURN;
}
- sv_setpvn(TARG, (char *)s, len);
+ sv_setpvn(TARG, (char *)l, llen);
}
else if (!SvOK(TARG) && SvTYPE(TARG) <= SVt_PVMG)
sv_setpv(TARG, ""); /* Suppress warning. */
- s = (U8*)SvPV(right,len);
+ s = r; len = rlen;
if (SvOK(TARG)) {
#if defined(PERL_Y2KWARN)
if ((SvIOK(right) || SvNOK(right)) && ckWARN(WARN_Y2K)) {
@@ -235,7 +235,7 @@ PP(pp_concat)
}
else
sv_setpvn(TARG, (char *)s, len); /* suppress warning */
- if (left_utf8)
+ if (left_utf8 && !IN_BYTE)
SvUTF8_on(TARG);
SETTARG;
RETURN;
@@ -1240,7 +1240,8 @@ play_it_again:
if (update_minmatch++)
minmatch = had_zerolen;
}
- if (rx->reganch & RE_USE_INTUIT) {
+ if (rx->reganch & RE_USE_INTUIT &&
+ DO_UTF8(TARG) == ((rx->reganch & ROPT_UTF8) != 0)) {
s = CALLREG_INTUIT_START(aTHX_ rx, TARG, s, strend, r_flags, NULL);
if (!s)
@@ -1337,7 +1338,13 @@ yup: /* Confirmed by INTUIT */
if (global) {
rx->subbeg = truebase;
rx->startp[0] = s - truebase;
- rx->endp[0] = s - truebase + rx->minlen;
+ if (DO_UTF8(PL_reg_sv)) {
+ char *t = (char*)utf8_hop((U8*)s, rx->minlen);
+ rx->endp[0] = t - truebase;
+ }
+ else {
+ rx->endp[0] = s - truebase + rx->minlen;
+ }
rx->sublen = strend - truebase;
goto gotcha;
}
@@ -2000,6 +2007,8 @@ PP(pp_subst)
if (CALLREGEXEC(aTHX_ rx, s, strend, orig, 0, TARG, NULL,
r_flags | REXEC_CHECKED))
{
+ bool isutf8;
+
if (force_on_match) {
force_on_match = 0;
s = SvPV_force(TARG, len);
@@ -2045,6 +2054,7 @@ PP(pp_subst)
SvPVX(TARG) = SvPVX(dstr);
SvCUR_set(TARG, SvCUR(dstr));
SvLEN_set(TARG, SvLEN(dstr));
+ isutf8 = DO_UTF8(dstr);
SvPVX(dstr) = 0;
sv_free(dstr);
@@ -2053,6 +2063,8 @@ PP(pp_subst)
PUSHs(sv_2mortal(newSViv((I32)iters)));
(void)SvPOK_only(TARG);
+ if (isutf8)
+ SvUTF8_on(TARG);
TAINT_IF(rxtainted);
SvSETMAGIC(TARG);
SvTAINT(TARG);
diff --git a/proto.h b/proto.h
index 1bcb5cdd91..4c5499ec6b 100644
--- a/proto.h
+++ b/proto.h
@@ -1147,7 +1147,9 @@ STATIC char* S_regcppop(pTHX);
STATIC char* S_regcp_set_to(pTHX_ I32 ss);
STATIC void S_cache_re(pTHX_ regexp *prog);
STATIC U8* S_reghop(pTHX_ U8 *pos, I32 off);
+STATIC U8* S_reghop3(pTHX_ U8 *pos, I32 off, U8 *lim);
STATIC U8* S_reghopmaybe(pTHX_ U8 *pos, I32 off);
+STATIC U8* S_reghopmaybe3(pTHX_ U8 *pos, I32 off, U8 *lim);
STATIC char* S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun);
#endif
diff --git a/regcomp.c b/regcomp.c
index 8748271ec1..bbd91c69e5 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -127,6 +127,7 @@ typedef struct RExC_state_t {
I32 extralen;
I32 seen_zerolen;
I32 seen_evals;
+ I32 utf8;
#if ADD_TO_REGEXEC
char *starttry; /* -Dr: where regtry was called. */
#define RExC_starttry (pRExC_state->starttry)
@@ -148,6 +149,7 @@ typedef struct RExC_state_t {
#define RExC_extralen (pRExC_state->extralen)
#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
#define RExC_seen_evals (pRExC_state->seen_evals)
+#define RExC_utf8 (pRExC_state->utf8)
#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
@@ -229,8 +231,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
#define SCF_WHILEM_VISITED_POS 0x2000
-#define RF_utf8 8
-#define UTF (PL_reg_flags & RF_utf8)
+#define UTF RExC_utf8
#define LOC (RExC_flags16 & PMf_LOCALE)
#define FOLD (RExC_flags16 & PMf_FOLD)
@@ -469,7 +470,7 @@ S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *c
ANYOF_CLASS_ZERO(cl);
for (value = 0; value < 256; ++value)
ANYOF_BITMAP_SET(cl, value);
- cl->flags = ANYOF_EOS;
+ cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
if (LOC)
cl->flags |= ANYOF_LOCALE;
}
@@ -483,6 +484,8 @@ S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
for (value = 0; value <= ANYOF_MAX; value += 2)
if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
return 1;
+ if (!(cl->flags & ANYOF_UNICODE_ALL))
+ return 0;
for (value = 0; value < 256; ++value)
if (!ANYOF_BITMAP_TEST(cl, value))
return 0;
@@ -530,6 +533,16 @@ S_cl_and(pTHX_ struct regnode_charclass_class *cl,
} /* XXXX: logic is complicated otherwise, leave it along for a moment. */
if (!(and_with->flags & ANYOF_EOS))
cl->flags &= ~ANYOF_EOS;
+
+ if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
+ cl->flags &= ~ANYOF_UNICODE_ALL;
+ cl->flags |= ANYOF_UNICODE;
+ ARG_SET(cl, ARG(and_with));
+ }
+ if (!(and_with->flags & ANYOF_UNICODE_ALL))
+ cl->flags &= ~ANYOF_UNICODE_ALL;
+ if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
+ cl->flags &= ~ANYOF_UNICODE;
}
/* 'OR' a given class with another one. Can create false positives */
@@ -580,6 +593,16 @@ S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, str
}
if (or_with->flags & ANYOF_EOS)
cl->flags |= ANYOF_EOS;
+
+ if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
+ ARG(cl) != ARG(or_with)) {
+ cl->flags |= ANYOF_UNICODE_ALL;
+ cl->flags &= ~ANYOF_UNICODE;
+ }
+ if (or_with->flags & ANYOF_UNICODE_ALL) {
+ cl->flags |= ANYOF_UNICODE_ALL;
+ cl->flags &= ~ANYOF_UNICODE;
+ }
}
/* REx optimizer. Converts nodes into quickier variants "in place".
@@ -787,15 +810,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg
}
else if (OP(scan) == EXACT) {
I32 l = STR_LEN(scan);
+ UV uc = *((U8*)STRING(scan));
if (UTF) {
- unsigned char *s = (unsigned char *)STRING(scan);
- unsigned char *e = s + l;
- I32 newl = 0;
- while (s < e) {
- newl++;
- s += UTF8SKIP(s);
- }
- l = newl;
+ U8 *s = (U8*)STRING(scan);
+ l = utf8_length(s, s + l);
+ uc = utf8_to_uv_simple(s, NULL);
}
min += l;
if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
@@ -815,21 +834,22 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg
/* Check whether it is compatible with what we know already! */
int compat = 1;
- if (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
- && !ANYOF_BITMAP_TEST(data->start_class, *STRING(scan))
+ if (uc >= 0x100 ||
+ !(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
+ && !ANYOF_BITMAP_TEST(data->start_class, uc)
&& (!(data->start_class->flags & ANYOF_FOLD)
- || !ANYOF_BITMAP_TEST(data->start_class,
- PL_fold[*(U8*)STRING(scan)])))
+ || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
compat = 0;
ANYOF_CLASS_ZERO(data->start_class);
ANYOF_BITMAP_ZERO(data->start_class);
if (compat)
- ANYOF_BITMAP_SET(data->start_class, *STRING(scan));
+ ANYOF_BITMAP_SET(data->start_class, uc);
data->start_class->flags &= ~ANYOF_EOS;
}
else if (flags & SCF_DO_STCLASS_OR) {
/* false positive possible if the class is case-folded */
- ANYOF_BITMAP_SET(data->start_class, *STRING(scan));
+ if (uc < 0x100)
+ ANYOF_BITMAP_SET(data->start_class, uc);
data->start_class->flags &= ~ANYOF_EOS;
cl_and(data->start_class, &and_with);
}
@@ -837,19 +857,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg
}
else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
I32 l = STR_LEN(scan);
+ UV uc = *((U8*)STRING(scan));
/* Search for fixed substrings supports EXACT only. */
if (flags & SCF_DO_SUBSTR)
scan_commit(pRExC_state, data);
if (UTF) {
- unsigned char *s = (unsigned char *)STRING(scan);
- unsigned char *e = s + l;
- I32 newl = 0;
- while (s < e) {
- newl++;
- s += UTF8SKIP(s);
- }
- l = newl;
+ U8 *s = (U8 *)STRING(scan);
+ l = utf8_length(s, s + l);
+ uc = utf8_to_uv_simple(s, NULL);
}
min += l;
if (data && (flags & SCF_DO_SUBSTR))
@@ -858,15 +874,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg
/* Check whether it is compatible with what we know already! */
int compat = 1;
- if (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
- && !ANYOF_BITMAP_TEST(data->start_class, *STRING(scan))
- && !ANYOF_BITMAP_TEST(data->start_class,
- PL_fold[*(U8*)STRING(scan)]))
+ if (uc >= 0x100 ||
+ !(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
+ && !ANYOF_BITMAP_TEST(data->start_class, uc)
+ && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc]))
compat = 0;
ANYOF_CLASS_ZERO(data->start_class);
ANYOF_BITMAP_ZERO(data->start_class);
if (compat) {
- ANYOF_BITMAP_SET(data->start_class, *STRING(scan));
+ ANYOF_BITMAP_SET(data->start_class, uc);
data->start_class->flags &= ~ANYOF_EOS;
data->start_class->flags |= ANYOF_FOLD;
if (OP(scan) == EXACTFL)
@@ -877,7 +893,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg
if (data->start_class->flags & ANYOF_FOLD) {
/* false positive possible if the class is case-folded.
Assume that the locale settings are the same... */
- ANYOF_BITMAP_SET(data->start_class, *STRING(scan));
+ if (uc < 0x100)
+ ANYOF_BITMAP_SET(data->start_class, uc);
data->start_class->flags &= ~ANYOF_EOS;
}
cl_and(data->start_class, &and_with);
@@ -1580,11 +1597,10 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
FAIL("NULL regexp argument");
/* XXXX This looks very suspicious... */
- if (pm->op_pmdynflags & PMdf_UTF8) {
- PL_reg_flags |= RF_utf8;
- }
+ if (pm->op_pmdynflags & PMdf_CMP_UTF8)
+ RExC_utf8 = 1;
else
- PL_reg_flags = 0;
+ RExC_utf8 = 0;
RExC_precomp = savepvn(exp, xend - exp);
DEBUG_r(if (!PL_colorset) reginitcolors());
@@ -1705,9 +1721,9 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
/* Starting-point info. */
again:
if (PL_regkind[(U8)OP(first)] == EXACT) {
- if (OP(first) == EXACT); /* Empty, get anchored substr later. */
- else if ((OP(first) == EXACTF || OP(first) == EXACTFL)
- && !UTF)
+ if (OP(first) == EXACT)
+ ; /* Empty, get anchored substr later. */
+ else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
r->regstclass = first;
}
else if (strchr((char*)PL_simple,OP(first)))
@@ -3164,6 +3180,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
SV *listsv;
register char *e;
UV n;
+ bool dont_optimize_invert = FALSE;
ret = reganode(pRExC_state, ANYOF, 0);
@@ -3350,6 +3367,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isALNUM(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
break;
case ANYOF_NALNUM:
@@ -3360,6 +3378,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isALNUM(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
break;
case ANYOF_ALNUMC:
@@ -3370,6 +3389,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isALNUMC(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
break;
case ANYOF_NALNUMC:
@@ -3380,6 +3400,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isALNUMC(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
break;
case ANYOF_ALPHA:
@@ -3390,6 +3411,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isALPHA(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
break;
case ANYOF_NALPHA:
@@ -3400,6 +3422,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isALPHA(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
break;
case ANYOF_ASCII:
@@ -3415,6 +3438,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
ANYOF_BITMAP_SET(ret, value);
#endif /* EBCDIC */
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
break;
case ANYOF_NASCII:
@@ -3430,6 +3454,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
ANYOF_BITMAP_SET(ret, value);
#endif /* EBCDIC */
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
break;
case ANYOF_BLANK:
@@ -3440,6 +3465,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isBLANK(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
break;
case ANYOF_NBLANK:
@@ -3450,6 +3476,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isBLANK(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
break;
case ANYOF_CNTRL:
@@ -3460,6 +3487,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isCNTRL(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
break;
case ANYOF_NCNTRL:
@@ -3470,6 +3498,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isCNTRL(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
break;
case ANYOF_DIGIT:
@@ -3480,6 +3509,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
for (value = '0'; value <= '9'; value++)
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
break;
case ANYOF_NDIGIT:
@@ -3492,6 +3522,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
for (value = '9' + 1; value < 256; value++)
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
break;
case ANYOF_GRAPH:
@@ -3502,6 +3533,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isGRAPH(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
break;
case ANYOF_NGRAPH:
@@ -3512,6 +3544,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isGRAPH(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
break;
case ANYOF_LOWER:
@@ -3522,6 +3555,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isLOWER(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
break;
case ANYOF_NLOWER:
@@ -3532,6 +3566,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isLOWER(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
break;
case ANYOF_PRINT:
@@ -3542,6 +3577,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isPRINT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
break;
case ANYOF_NPRINT:
@@ -3552,6 +3588,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isPRINT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
break;
case ANYOF_PSXSPC:
@@ -3562,6 +3599,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isPSXSPC(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
break;
case ANYOF_NPSXSPC:
@@ -3572,6 +3610,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isPSXSPC(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
break;
case ANYOF_PUNCT:
@@ -3582,6 +3621,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isPUNCT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
break;
case ANYOF_NPUNCT:
@@ -3592,6 +3632,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isPUNCT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
break;
case ANYOF_SPACE:
@@ -3602,6 +3643,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isSPACE(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
break;
case ANYOF_NSPACE:
@@ -3612,6 +3654,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isSPACE(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
break;
case ANYOF_UPPER:
@@ -3622,6 +3665,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isUPPER(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
break;
case ANYOF_NUPPER:
@@ -3632,6 +3676,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isUPPER(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
break;
case ANYOF_XDIGIT:
@@ -3642,6 +3687,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (isXDIGIT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
break;
case ANYOF_NXDIGIT:
@@ -3652,6 +3698,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!isXDIGIT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
break;
default:
@@ -3755,12 +3802,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
}
/* optimize inverted simple patterns (e.g. [^a-z]) */
- if (!SIZE_ONLY &&
+ if (!SIZE_ONLY && !dont_optimize_invert &&
/* If the only flag is inversion. */
(ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
- ANYOF_FLAGS(ret) = 0;
+ ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
}
if (!SIZE_ONLY) {
@@ -4218,6 +4265,8 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
if (flags & ANYOF_UNICODE)
sv_catpv(sv, "{unicode}");
+ else if (flags & ANYOF_UNICODE_ALL)
+ sv_catpv(sv, "{all-unicode}");
{
SV *lv;
diff --git a/regcomp.h b/regcomp.h
index c8094e14a4..066e31f01d 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -192,6 +192,7 @@ struct regnode_charclass_class {
/* There is a character or a range past 0xff */
#define ANYOF_UNICODE 0x20
+#define ANYOF_UNICODE_ALL 0x40 /* Can match any char past 0xff */
/* Are there any runtime flags on in this node? */
#define ANYOF_RUNTIME(s) (ANYOF_FLAGS(s) & 0x0f)
diff --git a/regexec.c b/regexec.c
index bdbdb5918c..be683a3b2f 100644
--- a/regexec.c
+++ b/regexec.c
@@ -107,15 +107,22 @@
*/
#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
-#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
+#define CHR_DIST(a,b) (DO_UTF8(PL_reg_sv) ? utf8_distance(a,b) : a - b)
#define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
#define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
-#define HOP(pos,off) (UTF ? reghop((U8*)pos, off) : (U8*)(pos + off))
-#define HOPMAYBE(pos,off) (UTF ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
+#define HOP(pos,off) (DO_UTF8(PL_reg_sv) ? reghop((U8*)pos, off) : (U8*)(pos + off))
+#define HOPMAYBE(pos,off) (DO_UTF8(PL_reg_sv) ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
#define HOPc(pos,off) ((char*)HOP(pos,off))
#define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
+#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
+#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
+#define HOP3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
+#define HOPMAYBE3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
+#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
+#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
+
static void restore_pos(pTHXo_ void *arg);
@@ -354,11 +361,15 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
(strend - strpos > 60 ? "..." : ""))
);
- if (prog->minlen > strend - strpos) {
+ if (prog->reganch & ROPT_UTF8)
+ PL_reg_flags |= RF_utf8;
+
+ if (prog->minlen > CHR_DIST((U8*)strend, (U8*)strpos)) {
DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n"));
goto fail;
}
strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
+ PL_regeol = strend;
check = prog->check_substr;
if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
@@ -377,8 +388,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
/* Substring at constant offset from beg-of-str... */
I32 slen;
- PL_regeol = strend; /* Used in HOP() */
- s = HOPc(strpos, prog->check_offset_min);
+ s = HOP3c(strpos, prog->check_offset_min, strend);
if (SvTAIL(check)) {
slen = SvCUR(check); /* >= 1 */
@@ -412,7 +422,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
if (!ml_anch) {
I32 end = prog->check_offset_max + CHR_SVLEN(check)
- (SvTAIL(check) != 0);
- I32 eshift = strend - s - end;
+ I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
if (end_shift < eshift)
end_shift = eshift;
@@ -451,8 +461,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
*data->scream_olds = s;
}
else
- s = fbm_instr((unsigned char*)s + start_shift,
- (unsigned char*)strend - end_shift,
+ s = fbm_instr(HOP3(s, start_shift, strend),
+ HOP3(strend, -end_shift, strbeg),
check, PL_multiline ? FBMrf_MULTILINE : 0);
/* Update the count-of-usability, remove useless subpatterns,
@@ -491,34 +501,33 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
if (check == prog->float_substr) {
do_other_anchored:
{
- char *last = s - start_shift, *last1, *last2;
+ char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
char *s1 = s;
- tmp = PL_bostr;
t = s - prog->check_offset_max;
if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
&& (!(prog->reganch & ROPT_UTF8)
- || (PL_bostr = strpos, /* Used in regcopmaybe() */
- (t = reghopmaybe_c(s, -(prog->check_offset_max)))
+ || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
&& t > strpos)))
/* EMPTY */;
else
t = strpos;
- t += prog->anchored_offset;
+ t = HOP3c(t, prog->anchored_offset, strend);
if (t < other_last) /* These positions already checked */
t = other_last;
- PL_bostr = tmp;
- last2 = last1 = strend - prog->minlen;
+ last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
if (last < last1)
last1 = last;
/* XXXX It is not documented what units *_offsets are in. Assume bytes. */
/* On end-of-str: see comment below. */
s = fbm_instr((unsigned char*)t,
- (unsigned char*)last1 + prog->anchored_offset
- + SvCUR(prog->anchored_substr)
- - (SvTAIL(prog->anchored_substr)!=0),
- prog->anchored_substr, PL_multiline ? FBMrf_MULTILINE : 0);
- DEBUG_r(PerlIO_printf(Perl_debug_log, "%s anchored substr `%s%.*s%s'%s",
+ HOP3(HOP3(last1, prog->anchored_offset, strend)
+ + SvCUR(prog->anchored_substr),
+ -(SvTAIL(prog->anchored_substr)!=0), strbeg),
+ prog->anchored_substr,
+ PL_multiline ? FBMrf_MULTILINE : 0);
+ DEBUG_r(PerlIO_printf(Perl_debug_log,
+ "%s anchored substr `%s%.*s%s'%s",
(s ? "Found" : "Contradicts"),
PL_colors[0],
(int)(SvCUR(prog->anchored_substr)
@@ -533,17 +542,16 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
}
DEBUG_r(PerlIO_printf(Perl_debug_log,
", trying floating at offset %ld...\n",
- (long)(s1 + 1 - i_strpos)));
- PL_regeol = strend; /* Used in HOP() */
- other_last = last1 + prog->anchored_offset + 1;
- s = HOPc(last, 1);
+ (long)(HOP3c(s1, 1, strend) - i_strpos)));
+ other_last = HOP3c(last1, prog->anchored_offset+1, strend);
+ s = HOP3c(last, 1, strend);
goto restart;
}
else {
DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
(long)(s - i_strpos)));
- t = s - prog->anchored_offset;
- other_last = s + 1;
+ t = HOP3c(s, -prog->anchored_offset, strbeg);
+ other_last = HOP3c(s, 1, strend);
s = s1;
if (t == strpos)
goto try_at_start;
@@ -555,11 +563,12 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
char *last, *last1;
char *s1 = s;
- t = s - start_shift;
- last1 = last = strend - prog->minlen + prog->float_min_offset;
- if (last - t > prog->float_max_offset)
- last = t + prog->float_max_offset;
- s = t + prog->float_min_offset;
+ t = HOP3c(s, -start_shift, strbeg);
+ last1 = last =
+ HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
+ if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
+ last = HOP3c(t, prog->float_max_offset, strend);
+ s = HOP3c(t, prog->float_min_offset, strend);
if (s < other_last)
s = other_last;
/* XXXX It is not documented what units *_offsets are in. Assume bytes. */
@@ -587,8 +596,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
", trying anchored starting at offset %ld...\n",
(long)(s1 + 1 - i_strpos)));
other_last = last + 1;
- PL_regeol = strend; /* Used in HOP() */
- s = HOPc(t, 1);
+ s = HOP3c(t, 1, strend);
goto restart;
}
else {
@@ -604,13 +612,10 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
}
t = s - prog->check_offset_max;
- tmp = PL_bostr;
if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
&& (!(prog->reganch & ROPT_UTF8)
- || (PL_bostr = strpos, /* Used in regcopmaybe() */
- ((t = reghopmaybe_c(s, -(prog->check_offset_max)))
- && t > strpos)))) {
- PL_bostr = tmp;
+ || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
+ && t > strpos))) {
/* Fixed substring is found far enough so that the match
cannot start at strpos. */
try_at_offset:
@@ -668,7 +673,6 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
++BmUSEFUL(prog->check_substr); /* hooray/5 */
}
else {
- PL_bostr = tmp;
/* The found string does not prohibit matching at strpos,
- no optimization of calling REx engine can be performed,
unless it was an MBOL and we are not after MBOL,
@@ -721,13 +725,16 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
regstclass does not come from lookahead... */
/* If regstclass takes bytelength more than 1: If charlength==1, OK.
This leaves EXACTF only, which is dealt with in find_byclass(). */
+ U8* str = (U8*)STRING(prog->regstclass);
int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
- ? STR_LEN(prog->regstclass)
+ ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
: 1);
char *endpos = (prog->anchored_substr || ml_anch)
- ? s + (prog->minlen? cl_l : 0)
- : (prog->float_substr ? check_at - start_shift + cl_l
- : strend) ;
+ ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
+ : (prog->float_substr
+ ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
+ cl_l, strend)
+ : strend);
char *startpos = strbeg;
t = s;
@@ -754,8 +761,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
if (prog->anchored_substr == check) {
DEBUG_r( what = "anchored" );
hop_and_restart:
- PL_regeol = strend; /* Used in HOP() */
- s = HOPc(t, 1);
+ s = HOP3c(t, 1, strend);
if (s + start_shift + end_shift > strend) {
/* XXXX Should be taken into account earlier? */
DEBUG_r( PerlIO_printf(Perl_debug_log,
@@ -854,8 +860,14 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
case EXACTF:
m = STRING(c);
ln = STR_LEN(c);
- c1 = *(U8*)m;
- c2 = PL_fold[c1];
+ if (UTF) {
+ c1 = to_utf8_lower((U8*)m);
+ c2 = to_utf8_upper((U8*)m);
+ }
+ else {
+ c1 = *(U8*)m;
+ c2 = PL_fold[c1];
+ }
goto do_exactf;
case EXACTFL:
m = STRING(c);
@@ -867,27 +879,45 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
if (norun && e < s)
e = s; /* Due to minlen logic of intuit() */
- /* Here it is NOT UTF! */
- if (c1 == c2) {
- while (s <= e) {
- if ( *(U8*)s == c1
- && (ln == 1 || !(OP(c) == EXACTF
- ? ibcmp(s, m, ln)
- : ibcmp_locale(s, m, ln)))
- && (norun || regtry(prog, s)) )
- goto got_it;
- s++;
- }
- } else {
- while (s <= e) {
- if ( (*(U8*)s == c1 || *(U8*)s == c2)
- && (ln == 1 || !(OP(c) == EXACTF
- ? ibcmp(s, m, ln)
- : ibcmp_locale(s, m, ln)))
- && (norun || regtry(prog, s)) )
- goto got_it;
- s++;
- }
+
+ if (do_utf8) {
+ STRLEN len;
+ if (c1 == c2)
+ while (s <= e) {
+ if ( utf8_to_uv_simple((U8*)s, &len) == c1
+ && regtry(prog, s) )
+ goto got_it;
+ s += len;
+ }
+ else
+ while (s <= e) {
+ UV c = utf8_to_uv_simple((U8*)s, &len);
+ if ( (c == c1 || c == c2) && regtry(prog, s) )
+ goto got_it;
+ s += len;
+ }
+ }
+ else {
+ if (c1 == c2)
+ while (s <= e) {
+ if ( *(U8*)s == c1
+ && (ln == 1 || !(OP(c) == EXACTF
+ ? ibcmp(s, m, ln)
+ : ibcmp_locale(s, m, ln)))
+ && (norun || regtry(prog, s)) )
+ goto got_it;
+ s++;
+ }
+ else
+ while (s <= e) {
+ if ( (*(U8*)s == c1 || *(U8*)s == c2)
+ && (ln == 1 || !(OP(c) == EXACTF
+ ? ibcmp(s, m, ln)
+ : ibcmp_locale(s, m, ln)))
+ && (norun || regtry(prog, s)) )
+ goto got_it;
+ s++;
+ }
}
break;
case BOUNDL:
@@ -898,7 +928,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
if (s == startpos)
tmp = '\n';
else {
- U8 *r = reghop((U8*)s, -1);
+ U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
}
@@ -940,7 +970,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
if (s == startpos)
tmp = '\n';
else {
- U8 *r = reghop((U8*)s, -1);
+ U8 *r = reghop3((U8*)s, -1, (U8*)startpos);
tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0);
}
@@ -1346,6 +1376,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
I32 scream_pos = -1; /* Internal iterator of scream. */
char *scream_olds;
SV* oreplsv = GvSV(PL_replgv);
+ bool do_utf8 = DO_UTF8(sv);
PL_regcc = 0;
@@ -1361,12 +1392,22 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
}
minlen = prog->minlen;
- if (strend - startpos < minlen) goto phooey;
+ if (do_utf8) {
+ if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey;
+ }
+ else {
+ if (strend - startpos < minlen) goto phooey;
+ }
if (startpos == strbeg) /* is ^ valid at stringarg? */
PL_regprev = '\n';
else {
- PL_regprev = (U32)stringarg[-1];
+ if (prog->reganch & ROPT_UTF8 && do_utf8) {
+ U8 *s = reghop3((U8*)stringarg, -1, (U8*)strbeg);
+ PL_regprev = utf8_to_uv(s, (U8*)stringarg - s, NULL, 0);
+ }
+ else
+ PL_regprev = (U32)stringarg[-1];
if (!PL_multiline && PL_regprev == '\n')
PL_regprev = '\0'; /* force ^ to NOT match */
}
@@ -1454,7 +1495,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
if (minlen)
dontbother = minlen - 1;
- end = HOPc(strend, -dontbother) - 1;
+ end = HOP3c(strend, -dontbother, strbeg) - 1;
/* for multiline we only have to try after newlines */
if (prog->check_substr) {
if (s == startpos)
@@ -1500,7 +1541,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
int did_match = 0;
#endif
- if (UTF) {
+ if (do_utf8) {
while (s < strend) {
if (*s == ch) {
DEBUG_r( did_match = 1 );
@@ -1529,18 +1570,19 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
"Did not find anchored character...\n"));
}
/*SUPPRESS 560*/
- else if (prog->anchored_substr != Nullsv
- || (prog->float_substr != Nullsv
- && prog->float_max_offset < strend - s)) {
+ else if (do_utf8 == (UTF!=0) &&
+ (prog->anchored_substr != Nullsv
+ || (prog->float_substr != Nullsv
+ && prog->float_max_offset < strend - s))) {
SV *must = prog->anchored_substr
? prog->anchored_substr : prog->float_substr;
I32 back_max =
prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset;
I32 back_min =
prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset;
- char *last = HOPc(strend, /* Cannot start after this */
+ char *last = HOP3c(strend, /* Cannot start after this */
-(I32)(CHR_SVLEN(must)
- - (SvTAIL(must) != 0) + back_min));
+ - (SvTAIL(must) != 0) + back_min), strbeg);
char *last1; /* Last position checked before */
#ifdef DEBUGGING
int did_match = 0;
@@ -1558,9 +1600,9 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
strend = HOPc(strend, -dontbother);
while ( (s <= last) &&
((flags & REXEC_SCREAM)
- ? (s = screaminstr(sv, must, HOPc(s, back_min) - strbeg,
+ ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
end_shift, &scream_pos, 0))
- : (s = fbm_instr((unsigned char*)HOP(s, back_min),
+ : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
(unsigned char*)strend, must,
PL_multiline ? FBMrf_MULTILINE : 0))) ) {
DEBUG_r( did_match = 1 );
@@ -1574,7 +1616,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
last1 = HOPc(s, -back_min);
s = t;
}
- if (UTF) {
+ if (do_utf8) {
while (s <= last1) {
if (regtry(prog, s))
goto got_it;
@@ -1655,7 +1697,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *
dontbother = minlen - 1;
strend -= dontbother; /* this one's always in bytes! */
/* We don't know much -- general case. */
- if (UTF) {
+ if (do_utf8) {
for (;;) {
if (regtry(prog, s))
goto got_it;
@@ -1926,20 +1968,25 @@ S_regmatch(pTHX_ regnode *prog)
SV *prop = sv_newmortal();
int docolor = *PL_colors[0];
int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
- int l = (PL_regeol - locinput > taill ? taill : PL_regeol - locinput);
+ int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
/* The part of the string before starttry has one color
(pref0_len chars), between starttry and current
position another one (pref_len - pref0_len chars),
after the current position the third one.
We assume that pref0_len <= pref_len, otherwise we
decrease pref0_len. */
- int pref_len = (locinput - PL_bostr > (5 + taill) - l
- ? (5 + taill) - l : locinput - PL_bostr);
- int pref0_len = pref_len - (locinput - PL_reg_starttry);
+ int pref_len = (locinput - PL_bostr) > (5 + taill) - l
+ ? (5 + taill) - l : locinput - PL_bostr;
+ int pref0_len;
+ while (UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
+ pref_len++;
+ pref0_len = pref_len - (locinput - PL_reg_starttry);
if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
l = ( PL_regeol - locinput > (5 + taill) - pref_len
? (5 + taill) - pref_len : PL_regeol - locinput);
+ while (UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
+ l--;
if (pref0_len < 0)
pref0_len = 0;
if (pref0_len > pref_len)
@@ -2013,7 +2060,7 @@ S_regmatch(pTHX_ regnode *prog)
sayNO;
break;
case SANY:
- if (DO_UTF8(PL_reg_sv)) {
+ if (do_utf8) {
locinput += PL_utf8skip[nextchr];
if (locinput > PL_regeol)
sayNO;
@@ -2025,20 +2072,46 @@ S_regmatch(pTHX_ regnode *prog)
nextchr = UCHARAT(++locinput);
break;
case REG_ANY:
- if (DO_UTF8(PL_reg_sv)) {
+ if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
+ sayNO;
+ if (do_utf8) {
locinput += PL_utf8skip[nextchr];
if (locinput > PL_regeol)
sayNO;
nextchr = UCHARAT(locinput);
- break;
}
- if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
- sayNO;
- nextchr = UCHARAT(++locinput);
+ else
+ nextchr = UCHARAT(++locinput);
break;
case EXACT:
s = STRING(scan);
ln = STR_LEN(scan);
+ if (do_utf8 != (UTF!=0)) {
+ char *l = locinput;
+ char *e = s + ln;
+ STRLEN len;
+ if (do_utf8)
+ while (s < e) {
+ if (l >= PL_regeol)
+ sayNO;
+ if (*((U8*)s) != utf8_to_uv_simple((U8*)l, &len))
+ sayNO;
+ s++;
+ l += len;
+ }
+ else
+ while (s < e) {
+ if (l >= PL_regeol)
+ sayNO;
+ if (*((U8*)l) != utf8_to_uv_simple((U8*)s, &len))
+ sayNO;
+ s += len;
+ l++;
+ }
+ locinput = l;
+ nextchr = UCHARAT(locinput);
+ break;
+ }
/* Inline the first character, for speed. */
if (UCHARAT(s) != nextchr)
sayNO;
@@ -2056,21 +2129,19 @@ S_regmatch(pTHX_ regnode *prog)
s = STRING(scan);
ln = STR_LEN(scan);
- if (UTF) {
+ if (do_utf8) {
char *l = locinput;
- char *e = s + ln;
+ char *e;
+ e = s + ln;
c1 = OP(scan) == EXACTF;
while (s < e) {
- if (l >= PL_regeol)
- sayNO;
- if (utf8_to_uv((U8*)s, e - s, 0, 0) !=
- (c1 ?
- toLOWER_utf8((U8*)l) :
- toLOWER_LC_utf8((U8*)l)))
- {
+ if (l >= PL_regeol) {
sayNO;
}
- s += UTF8SKIP(s);
+ if ((UTF ? utf8_to_uv((U8*)s, e - s, 0, 0) : *((U8*)s)) !=
+ (c1 ? toLOWER_utf8((U8*)l) : toLOWER_LC_utf8((U8*)l)))
+ sayNO;
+ s += UTF ? UTF8SKIP(s) : 1;
l += UTF8SKIP(l);
}
locinput = l;
@@ -2201,7 +2272,7 @@ S_regmatch(pTHX_ regnode *prog)
case SPACE:
if (!nextchr)
sayNO;
- if (DO_UTF8(PL_reg_sv)) {
+ if (do_utf8) {
if (nextchr & 0x80) {
if (!(OP(scan) == SPACE
? swash_fetch(PL_utf8_space, (U8*)locinput)
@@ -2231,7 +2302,7 @@ S_regmatch(pTHX_ regnode *prog)
case NSPACE:
if (!nextchr && locinput >= PL_regeol)
sayNO;
- if (DO_UTF8(PL_reg_sv)) {
+ if (do_utf8) {
if (OP(scan) == NSPACE
? swash_fetch(PL_utf8_space, (U8*)locinput)
: isSPACE_LC_utf8((U8*)locinput))
@@ -2253,7 +2324,7 @@ S_regmatch(pTHX_ regnode *prog)
case DIGIT:
if (!nextchr)
sayNO;
- if (DO_UTF8(PL_reg_sv)) {
+ if (do_utf8) {
if (!(OP(scan) == DIGIT
? swash_fetch(PL_utf8_digit, (U8*)locinput)
: isDIGIT_LC_utf8((U8*)locinput)))
@@ -2275,7 +2346,7 @@ S_regmatch(pTHX_ regnode *prog)
case NDIGIT:
if (!nextchr && locinput >= PL_regeol)
sayNO;
- if (DO_UTF8(PL_reg_sv)) {
+ if (do_utf8) {
if (OP(scan) == NDIGIT
? swash_fetch(PL_utf8_digit, (U8*)locinput)
: isDIGIT_LC_utf8((U8*)locinput))
@@ -2315,7 +2386,7 @@ S_regmatch(pTHX_ regnode *prog)
break;
s = PL_bostr + ln;
- if (UTF && OP(scan) != REF) { /* REF can do byte comparison */
+ if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
char *l = locinput;
char *e = PL_bostr + PL_regendp[n];
/*
@@ -2420,7 +2491,6 @@ S_regmatch(pTHX_ regnode *prog)
I32 onpar = PL_regnpar;
pm.op_pmflags = 0;
- pm.op_pmdynflags = (UTF||DO_UTF8(ret) ? PMdf_UTF8 : 0);
re = CALLREGCOMP(aTHX_ t, t + len, &pm);
if (!(SvFLAGS(ret)
& (SVs_TEMP | SVs_PADTMP | SVf_READONLY)))
@@ -3035,13 +3105,23 @@ S_regmatch(pTHX_ regnode *prog)
* when we know what character comes next.
*/
if (PL_regkind[(U8)OP(next)] == EXACT) {
- c1 = (U8)*STRING(next);
- if (OP(next) == EXACTF)
- c2 = PL_fold[c1];
- else if (OP(next) == EXACTFL)
- c2 = PL_fold_locale[c1];
- else
- c2 = c1;
+ U8 *s = (U8*)STRING(next);
+ if (!UTF) {
+ c2 = c1 = *s;
+ if (OP(next) == EXACTF)
+ c2 = PL_fold[c1];
+ else if (OP(next) == EXACTFL)
+ c2 = PL_fold_locale[c1];
+ }
+ else { /* UTF */
+ if (OP(next) == EXACTF) {
+ c1 = to_utf8_lower(s);
+ c2 = to_utf8_upper(s);
+ }
+ else {
+ c2 = c1 = utf8_to_uv_simple(s, NULL);
+ }
+ }
}
else
c1 = c2 = -1000;
@@ -3054,29 +3134,65 @@ S_regmatch(pTHX_ regnode *prog)
locinput = PL_reginput;
REGCP_SET(lastcp);
if (c1 != -1000) {
- char *e = locinput + n - ln; /* Should not check after this */
+ char *e; /* Should not check after this */
char *old = locinput;
- if (e >= PL_regeol || (n == REG_INFTY))
+ if (n == REG_INFTY) {
e = PL_regeol - 1;
+ if (do_utf8)
+ while (UTF8_IS_CONTINUATION(*(U8*)e))
+ e--;
+ }
+ else if (do_utf8) {
+ int m = n - ln;
+ for (e = locinput;
+ m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
+ e += UTF8SKIP(e);
+ }
+ else {
+ e = locinput + n - ln;
+ if (e >= PL_regeol)
+ e = PL_regeol - 1;
+ }
while (1) {
+ int count;
/* Find place 'next' could work */
- if (c1 == c2) {
- while (locinput <= e && *locinput != c1)
- locinput++;
- } else {
- while (locinput <= e
- && *locinput != c1
- && *locinput != c2)
- locinput++;
+ if (!do_utf8) {
+ if (c1 == c2) {
+ while (locinput <= e && *locinput != c1)
+ locinput++;
+ } else {
+ while (locinput <= e
+ && *locinput != c1
+ && *locinput != c2)
+ locinput++;
+ }
+ count = locinput - old;
+ }
+ else {
+ STRLEN len;
+ if (c1 == c2) {
+ for (count = 0;
+ locinput <= e &&
+ utf8_to_uv_simple((U8*)locinput, &len) != c1;
+ count++)
+ locinput += len;
+
+ } else {
+ for (count = 0; locinput <= e; count++) {
+ UV c = utf8_to_uv_simple((U8*)locinput, &len);
+ if (c == c1 || c == c2)
+ break;
+ locinput += len;
+ }
+ }
}
if (locinput > e)
sayNO;
/* PL_reginput == old now */
if (locinput != old) {
ln = 1; /* Did some */
- if (regrepeat(scan, locinput - old) <
- locinput - old)
+ if (regrepeat(scan, count) < count)
sayNO;
}
/* PL_reginput == locinput now */
@@ -3084,15 +3200,24 @@ S_regmatch(pTHX_ regnode *prog)
PL_reginput = locinput; /* Could be reset... */
REGCP_UNWIND(lastcp);
/* Couldn't or didn't -- move forward. */
- old = locinput++;
+ old = locinput;
+ if (do_utf8)
+ locinput += UTF8SKIP(locinput);
+ else
+ locinput++;
}
}
else
while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
+ UV c;
+ if (c1 != -1000) {
+ if (do_utf8)
+ c = utf8_to_uv_simple((U8*)PL_reginput, NULL);
+ else
+ c = UCHARAT(PL_reginput);
+ }
/* If it could work, try it. */
- if (c1 == -1000 ||
- UCHARAT(PL_reginput) == c1 ||
- UCHARAT(PL_reginput) == c2)
+ if (c1 == -1000 || c == c1 || c == c2)
{
TRYPAREN(paren, n, PL_reginput);
REGCP_UNWIND(lastcp);
@@ -3122,11 +3247,16 @@ S_regmatch(pTHX_ regnode *prog)
}
REGCP_SET(lastcp);
if (paren) {
+ UV c;
while (n >= ln) {
+ if (c1 != -1000) {
+ if (do_utf8)
+ c = utf8_to_uv_simple((U8*)PL_reginput, NULL);
+ else
+ c = UCHARAT(PL_reginput);
+ }
/* If it could work, try it. */
- if (c1 == -1000 ||
- UCHARAT(PL_reginput) == c1 ||
- UCHARAT(PL_reginput) == c2)
+ if (c1 == -1000 || c == c1 || c == c2)
{
TRYPAREN(paren, n, PL_reginput);
REGCP_UNWIND(lastcp);
@@ -3137,11 +3267,16 @@ S_regmatch(pTHX_ regnode *prog)
}
}
else {
+ UV c;
while (n >= ln) {
+ if (c1 != -1000) {
+ if (do_utf8)
+ c = utf8_to_uv_simple((U8*)PL_reginput, NULL);
+ else
+ c = UCHARAT(PL_reginput);
+ }
/* If it could work, try it. */
- if (c1 == -1000 ||
- UCHARAT(PL_reginput) == c1 ||
- UCHARAT(PL_reginput) == c2)
+ if (c1 == -1000 || c == c1 || c == c2)
{
TRYPAREN(paren, n, PL_reginput);
REGCP_UNWIND(lastcp);
@@ -3401,9 +3536,9 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
loceol = scan + max;
switch (OP(p)) {
case REG_ANY:
- if (DO_UTF8(PL_reg_sv)) {
+ if (do_utf8) {
loceol = PL_regeol;
- while (scan < loceol && *scan != '\n') {
+ while (scan < loceol && hardcount < max && *scan != '\n') {
scan += UTF8SKIP(scan);
hardcount++;
}
@@ -3413,9 +3548,9 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
}
break;
case SANY:
- if (DO_UTF8(PL_reg_sv)) {
+ if (do_utf8) {
loceol = PL_regeol;
- while (scan < loceol) {
+ while (hardcount < max && scan < loceol) {
scan += UTF8SKIP(scan);
hardcount++;
}
@@ -3444,7 +3579,8 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
case ANYOF:
if (do_utf8) {
loceol = PL_regeol;
- while (scan < loceol && reginclass(p, (U8*)scan, do_utf8)) {
+ while (hardcount < max && scan < loceol &&
+ reginclass(p, (U8*)scan, do_utf8)) {
scan += UTF8SKIP(scan);
hardcount++;
}
@@ -3454,9 +3590,10 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
}
break;
case ALNUM:
- if (DO_UTF8(PL_reg_sv)) {
+ if (do_utf8) {
loceol = PL_regeol;
- while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) {
+ while (hardcount < max && scan < loceol &&
+ swash_fetch(PL_utf8_alnum, (U8*)scan)) {
scan += UTF8SKIP(scan);
hardcount++;
}
@@ -3467,9 +3604,10 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
break;
case ALNUML:
PL_reg_flags |= RF_tainted;
- if (DO_UTF8(PL_reg_sv)) {
+ if (do_utf8) {
loceol = PL_regeol;
- while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) {
+ while (hardcount < max && scan < loceol &&
+ isALNUM_LC_utf8((U8*)scan)) {
scan += UTF8SKIP(scan);
hardcount++;
}
@@ -3479,9 +3617,10 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
}
break;
case NALNUM:
- if (DO_UTF8(PL_reg_sv)) {
+ if (do_utf8) {
loceol = PL_regeol;
- while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
+ while (hardcount < max && scan < loceol &&
+ !swash_fetch(PL_utf8_alnum, (U8*)scan)) {
scan += UTF8SKIP(scan);
hardcount++;
}
@@ -3492,9 +3631,10 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
break;
case NALNUML:
PL_reg_flags |= RF_tainted;
- if (DO_UTF8(PL_reg_sv)) {
+ if (do_utf8) {
loceol = PL_regeol;
- while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) {
+ while (hardcount < max && scan < loceol &&
+ !isALNUM_LC_utf8((U8*)scan)) {
scan += UTF8SKIP(scan);
hardcount++;
}
@@ -3504,9 +3644,9 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
}
break;
case SPACE:
- if (DO_UTF8(PL_reg_sv)) {
+ if (do_utf8) {
loceol = PL_regeol;
- while (scan < loceol &&
+ while (hardcount < max && scan < loceol &&
(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
scan += UTF8SKIP(scan);
hardcount++;
@@ -3518,9 +3658,9 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
break;
case SPACEL:
PL_reg_flags |= RF_tainted;
- if (DO_UTF8(PL_reg_sv)) {
+ if (do_utf8) {
loceol = PL_regeol;
- while (scan < loceol &&
+ while (hardcount < max && scan < loceol &&
(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
scan += UTF8SKIP(scan);
hardcount++;
@@ -3531,9 +3671,9 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
}
break;
case NSPACE:
- if (DO_UTF8(PL_reg_sv)) {
+ if (do_utf8) {
loceol = PL_regeol;
- while (scan < loceol &&
+ while (hardcount < max && scan < loceol &&
!(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) {
scan += UTF8SKIP(scan);
hardcount++;
@@ -3545,9 +3685,9 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
}
case NSPACEL:
PL_reg_flags |= RF_tainted;
- if (DO_UTF8(PL_reg_sv)) {
+ if (do_utf8) {
loceol = PL_regeol;
- while (scan < loceol &&
+ while (hardcount < max && scan < loceol &&
!(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
scan += UTF8SKIP(scan);
hardcount++;
@@ -3558,9 +3698,10 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
}
break;
case DIGIT:
- if (DO_UTF8(PL_reg_sv)) {
+ if (do_utf8) {
loceol = PL_regeol;
- while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) {
+ while (hardcount < max && scan < loceol &&
+ swash_fetch(PL_utf8_digit,(U8*)scan)) {
scan += UTF8SKIP(scan);
hardcount++;
}
@@ -3570,9 +3711,10 @@ S_regrepeat(pTHX_ regnode *p, I32 max)
}
break;
case NDIGIT:
- if (DO_UTF8(PL_reg_sv)) {
+ if (do_utf8) {
loceol = PL_regeol;
- while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) {
+ while (hardcount < max && scan < loceol &&
+ !swash_fetch(PL_utf8_digit,(U8*)scan)) {
scan += UTF8SKIP(scan);
hardcount++;
}
@@ -3623,7 +3765,7 @@ S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
return 0;
start = PL_reginput;
- if (UTF) {
+ if (DO_UTF8(PL_reg_sv)) {
while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
if (!count++) {
l = 0;
@@ -3701,16 +3843,21 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
{
char flags = ANYOF_FLAGS(n);
bool match = FALSE;
+ UV c;
+ STRLEN len;
+
+ if (do_utf8)
+ c = utf8_to_uv_simple(p, &len);
+ else
+ c = *p;
if (do_utf8 || (flags & ANYOF_UNICODE)) {
if (do_utf8 && !ANYOF_RUNTIME(n)) {
- STRLEN len;
- UV c = utf8_to_uv_simple(p, &len);
-
if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
match = TRUE;
}
-
+ if (do_utf8 && flags & ANYOF_UNICODE_ALL && c >= 256)
+ match = TRUE;
if (!match) {
SV *sw = regclass_swash(n, TRUE, 0);
@@ -3724,17 +3871,15 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
PL_reg_flags |= RF_tainted;
uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p));
}
- else
- uv_to_utf8(tmpbuf, toLOWER_utf8(p));
+ else
+ uv_to_utf8(tmpbuf, toLOWER_utf8(p));
if (swash_fetch(sw, tmpbuf))
match = TRUE;
}
}
}
}
- else {
- U8 c = *p;
-
+ if (!match && c < 256) {
if (ANYOF_BITMAP_TEST(n, c))
match = TRUE;
else if (flags & ANYOF_FOLD) {
@@ -3796,18 +3941,24 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8)
STATIC U8 *
S_reghop(pTHX_ U8 *s, I32 off)
{
+ return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
+}
+
+STATIC U8 *
+S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
+{
if (off >= 0) {
- while (off-- && s < (U8*)PL_regeol) {
+ while (off-- && s < lim) {
/* XXX could check well-formedness here */
s += UTF8SKIP(s);
}
}
else {
while (off++) {
- if (s > (U8*)PL_bostr) {
+ if (s > lim) {
s--;
if (UTF8_IS_CONTINUED(*s)) {
- while (s > (U8*)PL_bostr && UTF8_IS_CONTINUATION(*s))
+ while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
s--;
}
/* XXX could check well-formedness here */
@@ -3818,10 +3969,16 @@ S_reghop(pTHX_ U8 *s, I32 off)
}
STATIC U8 *
-S_reghopmaybe(pTHX_ U8* s, I32 off)
+S_reghopmaybe(pTHX_ U8 *s, I32 off)
+{
+ return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
+}
+
+STATIC U8 *
+S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
{
if (off >= 0) {
- while (off-- && s < (U8*)PL_regeol) {
+ while (off-- && s < lim) {
/* XXX could check well-formedness here */
s += UTF8SKIP(s);
}
@@ -3830,10 +3987,10 @@ S_reghopmaybe(pTHX_ U8* s, I32 off)
}
else {
while (off++) {
- if (s > (U8*)PL_bostr) {
+ if (s > lim) {
s--;
if (UTF8_IS_CONTINUED(*s)) {
- while (s > (U8*)PL_bostr && UTF8_IS_CONTINUATION(*s))
+ while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
s--;
}
/* XXX could check well-formedness here */
diff --git a/sv.c b/sv.c
index 662b9747c7..3a32525c54 100644
--- a/sv.c
+++ b/sv.c
@@ -3517,7 +3517,7 @@ Perl_sv_setpvn(pTHX_ register SV *sv, register const char *ptr, register STRLEN
Move(ptr,dptr,len,char);
dptr[len] = '\0';
SvCUR_set(sv, len);
- (void)SvPOK_only(sv); /* validate pointer */
+ (void)SvPOK_only_UTF8(sv); /* validate pointer */
SvTAINT(sv);
}
@@ -3561,7 +3561,7 @@ Perl_sv_setpv(pTHX_ register SV *sv, register const char *ptr)
SvGROW(sv, len + 1);
Move(ptr,SvPVX(sv),len+1,char);
SvCUR_set(sv, len);
- (void)SvPOK_only(sv); /* validate pointer */
+ (void)SvPOK_only_UTF8(sv); /* validate pointer */
SvTAINT(sv);
}
@@ -3611,7 +3611,7 @@ Perl_sv_usepvn(pTHX_ register SV *sv, register char *ptr, register STRLEN len)
SvCUR_set(sv, len);
SvLEN_set(sv, len+1);
*SvEND(sv) = '\0';
- (void)SvPOK_only(sv); /* validate pointer */
+ (void)SvPOK_only_UTF8(sv); /* validate pointer */
SvTAINT(sv);
}
@@ -4658,13 +4658,24 @@ Perl_sv_eq(pTHX_ register SV *sv1, register SV *sv2)
/* do not utf8ize the comparands as a side-effect */
if (cur1 && cur2 && SvUTF8(sv1) != SvUTF8(sv2) && !IN_BYTE) {
+ if (PL_hints & HINT_UTF8_DISTINCT)
+ return FALSE;
+
if (SvUTF8(sv1)) {
- pv2 = (char*)bytes_to_utf8((U8*)pv2, &cur2);
- pv2tmp = TRUE;
+ (void)utf8_to_bytes((U8*)(pv1 = savepvn(pv1, cur1)), &cur1);
+ if (cur1 < 0) {
+ Safefree(pv1);
+ return 0;
+ }
+ pv1tmp = TRUE;
}
else {
- pv1 = (char*)bytes_to_utf8((U8*)pv1, &cur1);
- pv1tmp = TRUE;
+ (void)utf8_to_bytes((U8*)(pv2 = savepvn(pv2, cur2)), &cur2);
+ if (cur2 < 0) {
+ Safefree(pv2);
+ return 0;
+ }
+ pv2tmp = TRUE;
}
}
@@ -4714,6 +4725,9 @@ Perl_sv_cmp(pTHX_ register SV *sv1, register SV *sv2)
/* do not utf8ize the comparands as a side-effect */
if (cur1 && cur2 && SvUTF8(sv1) != SvUTF8(sv2) && !IN_BYTE) {
+ if (PL_hints & HINT_UTF8_DISTINCT)
+ return SvUTF8(sv1) ? 1 : -1;
+
if (SvUTF8(sv1)) {
pv2 = (char*)bytes_to_utf8((U8*)pv2, &cur2);
pv2tmp = TRUE;
diff --git a/toke.c b/toke.c
index 46278e8be4..07c86b8b05 100644
--- a/toke.c
+++ b/toke.c
@@ -1336,6 +1336,8 @@ S_scan_const(pTHX_ char *start)
/* backslashes */
if (*s == '\\' && s+1 < send) {
+ bool to_be_utf8 = FALSE;
+
s++;
/* some backslashes we leave behind */
@@ -1404,7 +1406,7 @@ S_scan_const(pTHX_ char *start)
else {
STRLEN len = 1; /* allow underscores */
uv = (UV)scan_hex(s + 1, e - s - 1, &len);
- has_utf8 = TRUE;
+ to_be_utf8 = TRUE;
}
s = e + 1;
}
@@ -1421,8 +1423,8 @@ S_scan_const(pTHX_ char *start)
* There will always enough room in sv since such escapes will
* be longer than any utf8 sequence they can end up as
*/
- if (uv > 127 || has_utf8) {
- if (!this_utf8 && !has_utf8 && uv > 255) {
+ if (uv > 127) {
+ if (!has_utf8 && (to_be_utf8 || uv > 255)) {
/* might need to recode whatever we have accumulated so far
* if it contains any hibit chars
*/
@@ -1454,7 +1456,7 @@ S_scan_const(pTHX_ char *start)
}
}
- if (has_utf8 || uv > 255) {
+ if (to_be_utf8 || uv > 255) {
d = (char*)uv_to_utf8((U8*)d, uv);
has_utf8 = TRUE;
}
@@ -3120,6 +3122,9 @@ Perl_yylex(pTHX)
if (*d == '}') {
char minus = (PL_tokenbuf[0] == '-');
s = force_word(s + minus, WORD, FALSE, TRUE, FALSE);
+ if (UTF && !IN_BYTE && is_utf8_string((U8*)PL_tokenbuf, 0) &&
+ PL_nextval[PL_nexttoke-1].opval)
+ SvUTF8_on(((SVOP*)PL_nextval[PL_nexttoke-1].opval)->op_sv);
if (minus)
force_next('-');
}
@@ -3774,6 +3779,8 @@ Perl_yylex(pTHX)
CLINE;
yylval.opval = (OP*)newSVOP(OP_CONST, 0, newSVpv(PL_tokenbuf,0));
yylval.opval->op_private = OPpCONST_BARE;
+ if (UTF && !IN_BYTE && is_utf8_string((U8*)PL_tokenbuf, len))
+ SvUTF8_on(((SVOP*)yylval.opval)->op_sv);
TERM(WORD);
}
@@ -3933,6 +3940,8 @@ Perl_yylex(pTHX)
if (*s == '=' && s[1] == '>') {
CLINE;
sv_setpv(((SVOP*)yylval.opval)->op_sv, PL_tokenbuf);
+ if (UTF && !IN_BYTE && is_utf8_string((U8*)PL_tokenbuf, len))
+ SvUTF8_on(((SVOP*)yylval.opval)->op_sv);
TERM(WORD);
}
@@ -4104,6 +4113,10 @@ Perl_yylex(pTHX)
}
}
#endif
+#ifdef PERLIO_LAYERS
+ if (UTF && !IN_BYTE)
+ PerlIO_apply_layers(PL_rsfp, NULL, ":utf8");
+#endif
PL_rsfp = Nullfp;
}
goto fake_eof;
@@ -4634,6 +4647,7 @@ Perl_yylex(pTHX)
int warned = 0;
d = SvPV_force(PL_lex_stuff, len);
while (len) {
+ SV *sv;
for (; isSPACE(*d) && len; --len, ++d) ;
if (len) {
char *b = d;
@@ -4654,8 +4668,11 @@ Perl_yylex(pTHX)
else {
for (; !isSPACE(*d) && len; --len, ++d) ;
}
+ sv = newSVpvn(b, d-b);
+ if (DO_UTF8(PL_lex_stuff))
+ SvUTF8_on(sv);
words = append_elem(OP_LIST, words,
- newSVOP(OP_CONST, 0, tokeq(newSVpvn(b, d-b))));
+ newSVOP(OP_CONST, 0, tokeq(sv)));
}
}
if (words) {
@@ -6248,7 +6265,9 @@ S_scan_trans(pTHX_ char *start)
squash = OPpTRANS_SQUASH;
s++;
}
- o->op_private = del|squash|complement;
+ o->op_private = del|squash|complement|
+ (DO_UTF8(PL_lex_stuff)? OPpTRANS_FROM_UTF : 0)|
+ (DO_UTF8(PL_lex_repl) ? OPpTRANS_TO_UTF : 0);
PL_lex_op = o;
yylval.ival = OP_TRANS;
@@ -6442,6 +6461,8 @@ retval:
Renew(SvPVX(tmpstr), SvLEN(tmpstr), char);
}
SvREFCNT_dec(herewas);
+ if (UTF && !IN_BYTE && is_utf8_string((U8*)SvPVX(tmpstr), SvCUR(tmpstr)))
+ SvUTF8_on(tmpstr);
PL_lex_stuff = tmpstr;
yylval.ival = op_type;
return s;
@@ -7205,7 +7226,8 @@ vstring:
SvREADONLY_on(sv);
if (utf8) {
SvUTF8_on(sv);
- sv_utf8_downgrade(sv, TRUE);
+ if (!UTF||IN_BYTE)
+ sv_utf8_downgrade(sv, TRUE);
}
}
}
diff --git a/utf8.c b/utf8.c
index 24dc69297b..f65c94fbc7 100644
--- a/utf8.c
+++ b/utf8.c
@@ -163,9 +163,13 @@ bool
Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
{
U8* x = s;
- U8* send = s + len;
+ U8* send;
STRLEN c;
+ if (!len)
+ len = strlen(s);
+ send = s + len;
+
while (x < send) {
c = is_utf8_char(x);
if (!c)