diff options
author | Inaba Hiroto <inaba@st.rim.or.jp> | 2000-12-30 23:27:10 +0900 |
---|---|---|
committer | Jarkko Hietaniemi <jhi@iki.fi> | 2000-12-30 17:14:19 +0000 |
commit | 1aa99e6b6d14c469ac825dde483d9c9f913a3ee2 (patch) | |
tree | 76ac8dd0d9473e84a19f6184baa4eddb6337e7b1 | |
parent | 13e8c8e316d3839d0834fb8b851566b00d81e876 (diff) | |
download | perl-1aa99e6b6d14c469ac825dde483d9c9f913a3ee2.tar.gz |
more UTF8 test suites and an UTF8 patch
Message-ID: <3A4D722D.243AFD88@st.rim.or.jp>
Just the patch part for now, and the pragma renamed
as unicode::distinct.
p4raw-id: //depot/perl@8267
-rw-r--r-- | MANIFEST | 1 | ||||
-rw-r--r-- | doop.c | 333 | ||||
-rw-r--r-- | embed.h | 8 | ||||
-rwxr-xr-x | embed.pl | 2 | ||||
-rw-r--r-- | lib/unicode/distinct.pm | 35 | ||||
-rw-r--r-- | mg.c | 14 | ||||
-rw-r--r-- | op.c | 4 | ||||
-rw-r--r-- | op.h | 3 | ||||
-rw-r--r-- | perl.h | 1 | ||||
-rw-r--r-- | pod/perlapi.pod | 11 | ||||
-rw-r--r-- | pp.c | 14 | ||||
-rw-r--r-- | pp_ctl.c | 17 | ||||
-rw-r--r-- | pp_hot.c | 54 | ||||
-rw-r--r-- | proto.h | 2 | ||||
-rw-r--r-- | regcomp.c | 129 | ||||
-rw-r--r-- | regcomp.h | 1 | ||||
-rw-r--r-- | regexec.c | 519 | ||||
-rw-r--r-- | sv.c | 28 | ||||
-rw-r--r-- | toke.c | 36 | ||||
-rw-r--r-- | utf8.c | 6 |
20 files changed, 798 insertions, 420 deletions
@@ -1072,6 +1072,7 @@ lib/unicode/To/Upper.pl Unicode character database lib/unicode/UCD301.html Unicode character database lib/unicode/UCDFF301.html Unicode character database lib/unicode/Unicode.301 Unicode character database +lib/unicode/distinct.pm Perl pragma to strictly distinguish UTF8 data and non-UTF data lib/unicode/mktables.PL Unicode character database generator lib/unicode/syllables.txt Unicode character database lib/utf8.pm Pragma to control Unicode support @@ -21,18 +21,6 @@ #endif #endif -#define HALF_UTF8_UPGRADE(start,end) \ - STMT_START { \ - if ((start)<(end)) { \ - U8* NeWsTr; \ - STRLEN LeN = (end) - (start); \ - NeWsTr = bytes_to_utf8(start, &LeN); \ - Safefree(start); \ - (start) = NeWsTr; \ - (end) = (start) + LeN; \ - } \ - } STMT_END - STATIC I32 S_do_trans_simple(pTHX_ SV *sv) { @@ -41,7 +29,6 @@ S_do_trans_simple(pTHX_ SV *sv) U8 *send; U8 *dstart; I32 matches = 0; - I32 sutf = SvUTF8(sv); STRLEN len; short *tbl; I32 ch; @@ -54,7 +41,7 @@ S_do_trans_simple(pTHX_ SV *sv) send = s + len; /* First, take care of non-UTF8 input strings, because they're easy */ - if (!sutf) { + if (!SvUTF8(sv)) { while (s < send) { if ((ch = tbl[*s]) >= 0) { matches++; @@ -79,7 +66,10 @@ S_do_trans_simple(pTHX_ SV *sv) c = utf8_to_uv(s, send - s, &ulen, 0); if (c < 0x100 && (ch = tbl[(short)c]) >= 0) { matches++; - d = uv_to_utf8(d,ch); + if (ch < 0x80) + *d++ = ch; + else + d = uv_to_utf8(d,ch); s += ulen; } else { /* No match -> copy */ @@ -88,8 +78,7 @@ S_do_trans_simple(pTHX_ SV *sv) } } *d = '\0'; - sv_setpvn(sv, (const char*)dstart, d - dstart); - Safefree(dstart); + sv_setpvn(sv, (char*)dstart, d - dstart); SvUTF8_on(sv); SvSETMAGIC(sv); return matches; @@ -101,7 +90,6 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */ U8 *s; U8 *send; I32 matches = 0; - I32 hasutf = SvUTF8(sv); STRLEN len; short *tbl; @@ -112,22 +100,20 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */ s = (U8*)SvPV(sv, len); send = s + len; - while (s < send) { - if (hasutf && *s & 0x80) - s += UTF8SKIP(s); - else { - UV c; - STRLEN ulen; - ulen = 1; - if (hasutf) - c = utf8_to_uv(s, send - s, &ulen, 0); - else - c = *s; - if (c < 0x100 && tbl[c] >= 0) + if (!SvUTF8(sv)) + while (s < send) { + if (tbl[*s++] >= 0) matches++; - s += ulen; - } - } + } + else + while (s < send) { + UV c; + STRLEN ulen; + c = utf8_to_uv(s, send - s, &ulen, 0); + if (c < 0x100 && tbl[c] >= 0) + matches++; + s += ulen; + } return matches; } @@ -139,7 +125,7 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */ U8 *send; U8 *d; U8 *dstart; - I32 hasutf = SvUTF8(sv); + I32 isutf8; I32 matches = 0; STRLEN len; short *tbl; @@ -150,64 +136,96 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */ Perl_croak(aTHX_ "panic: do_trans_complex"); s = (U8*)SvPV(sv, len); + isutf8 = SvUTF8(sv); send = s + len; - Newz(0, d, len*2+1, U8); - dstart = d; - - if (PL_op->op_private & OPpTRANS_SQUASH) { - U8* p = send; - - while (s < send) { - if (hasutf && *s & 0x80) - s += UTF8SKIP(s); - else { - if ((ch = tbl[*s]) >= 0) { + if (!isutf8) { + dstart = d = s; + if (PL_op->op_private & OPpTRANS_SQUASH) { + U8* p = send; + while (s < send) { + if ((ch = tbl[*s]) >= 0) { *d = ch; matches++; - if (p != d - 1 || *p != *d) - p = d++; - } - else if (ch == -1) /* -1 is unmapped character */ - *d++ = *s; /* -2 is delete character */ - s++; - } + if (p != d - 1 || *p != *d) + p = d++; + } + else if (ch == -1) /* -1 is unmapped character */ + *d++ = *s; + else if (ch == -2) /* -2 is delete character */ + matches++; + s++; + } } + else { + while (s < send) { + if ((ch = tbl[*s]) >= 0) { + matches++; + *d++ = ch; + } + else if (ch == -1) /* -1 is unmapped character */ + *d++ = *s; + else if (ch == -2) /* -2 is delete character */ + matches++; + s++; + } + } + SvCUR_set(sv, d - dstart); } - else { - while (s < send) { - UV comp; - if (hasutf && *s & 0x80) - comp = utf8_to_uv_simple(s, NULL); - else - comp = *s; - - ch = tbl[comp]; - - if (ch == -1) { /* -1 is unmapped character */ - ch = comp; - matches--; - } - - if (ch >= 0) - d = uv_to_utf8(d, ch); - - matches++; - - s += hasutf && *s & 0x80 ? UNISKIP(*s) : 1; - + else { /* isutf8 */ + Newz(0, d, len*2+1, U8); + dstart = d; + + if (PL_op->op_private & OPpTRANS_SQUASH) { + U8* p = send; + UV pch = 0xfeedface; + while (s < send) { + STRLEN len; + UV comp = utf8_to_uv_simple(s, &len); + + if (comp > 0xff) + d = uv_to_utf8(d, comp); /* always unmapped */ + else if ((ch = tbl[comp]) >= 0) { + matches++; + if (ch != pch) { + d = uv_to_utf8(d, ch); + pch = ch; + } + s += len; + continue; + } + else if (ch == -1) /* -1 is unmapped character */ + d = uv_to_utf8(d, comp); + else if (ch == -2) /* -2 is delete character */ + matches++; + s += len; + pch = 0xfeedface; + } } + else { + while (s < send) { + STRLEN len; + UV comp = utf8_to_uv_simple(s, &len); + if (comp > 0xff) + d = uv_to_utf8(d, comp); /* always unmapped */ + else if ((ch = tbl[comp]) >= 0) { + d = uv_to_utf8(d, ch); + matches++; + } + else if (ch == -1) { /* -1 is unmapped character */ + d = uv_to_utf8(d, comp); + } + else if (ch == -2) /* -2 is delete character */ + matches++; + s += len; + } + } + *d = '\0'; + sv_setpvn(sv, (char*)dstart, d - dstart); + SvUTF8_on(sv); } - - *d = '\0'; - - sv_setpvn(sv, (const char*)dstart, d - dstart); - Safefree(dstart); - if (hasutf) - SvUTF8_on(sv); SvSETMAGIC(sv); return matches; - } STATIC I32 @@ -217,7 +235,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */ U8 *send; U8 *d; U8 *start; - U8 *dstart; + U8 *dstart, *dend; I32 matches = 0; STRLEN len; @@ -228,11 +246,19 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */ UV extra = none + 1; UV final; UV uv; - I32 isutf; - I32 howmany; + I32 isutf8; + U8 hibit = 0; - isutf = SvUTF8(sv); s = (U8*)SvPV(sv, len); + isutf8 = SvUTF8(sv); + if (!isutf8) { + U8 *t = s, *e = s + len; + while (t < e) + if ((hibit = *t++ & 0x80)) + break; + if (hibit) + s = bytes_to_utf8(s, &len); + } send = s + len; start = s; @@ -241,41 +267,46 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */ final = SvUV(*svp); /* d needs to be bigger than s, in case e.g. upgrading is required */ - Newz(0, d, len*2+1, U8); + New(0, d, len*3+UTF8_MAXLEN, U8); + dend = d + len * 3; dstart = d; + while (s < send) { if ((uv = swash_fetch(rv, s)) < none) { s += UTF8SKIP(s); matches++; - if ((uv & 0x80) && !isutf++) - HALF_UTF8_UPGRADE(dstart,d); d = uv_to_utf8(d, uv); } else if (uv == none) { - int i; - i = UTF8SKIP(s); - if (i > 1 && !isutf++) - HALF_UTF8_UPGRADE(dstart,d); + int i = UTF8SKIP(s); while(i--) *d++ = *s++; } else if (uv == extra) { - int i; - i = UTF8SKIP(s); + int i = UTF8SKIP(s); s += i; matches++; - if (i > 1 && !isutf++) - HALF_UTF8_UPGRADE(dstart,d); d = uv_to_utf8(d, final); } else s += UTF8SKIP(s); + + if (d >= dend) { + STRLEN clen = d - dstart; + STRLEN nlen = dend - dstart + len + UTF8_MAXLEN; + Renew(dstart, nlen+UTF8_MAXLEN, U8); + d = dstart + clen; + dend = dstart + nlen; + } } *d = '\0'; - sv_setpvn(sv, (const char*)dstart, d - dstart); + sv_setpvn(sv, (char*)dstart, d - dstart); SvSETMAGIC(sv); - if (isutf) - SvUTF8_on(sv); + SvUTF8_on(sv); + if (hibit) + Safefree(start); + if (!isutf8 && !(PL_hints & HINT_UTF8)) + sv_utf8_downgrade(sv, TRUE); return matches; } @@ -284,7 +315,7 @@ STATIC I32 S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */ { U8 *s; - U8 *send; + U8 *start, *send; I32 matches = 0; STRLEN len; @@ -293,10 +324,17 @@ S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */ SV** svp = hv_fetch(hv, "NONE", 4, FALSE); UV none = svp ? SvUV(*svp) : 0x7fffffff; UV uv; + U8 hibit = 0; s = (U8*)SvPV(sv, len); - if (!SvUTF8(sv)) - s = bytes_to_utf8(s, &len); + if (!SvUTF8(sv)) { + U8 *t = s, *e = s + len; + while (t < e) + if ((hibit = *t++ & 0x80)) + break; + if (hibit) + start = s = bytes_to_utf8(s, &len); + } send = s + len; while (s < send) { @@ -304,6 +342,8 @@ S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */ matches++; s += UTF8SKIP(s); } + if (hibit) + Safefree(start); return matches; } @@ -312,7 +352,7 @@ STATIC I32 S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ { U8 *s; - U8 *send; + U8 *start, *send; U8 *d; I32 matches = 0; I32 squash = PL_op->op_private & OPpTRANS_SQUASH; @@ -325,41 +365,45 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ UV final; UV uv; STRLEN len; - U8 *dst; - I32 isutf = SvUTF8(sv); + U8 *dstart, *dend; + I32 isutf8; + U8 hibit = 0; s = (U8*)SvPV(sv, len); + isutf8 = SvUTF8(sv); + if (!isutf8) { + U8 *t = s, *e = s + len; + while (t < e) + if ((hibit = *t++ & 0x80)) + break; + if (hibit) + s = bytes_to_utf8(s, &len); + } send = s + len; + start = s; svp = hv_fetch(hv, "FINAL", 5, FALSE); if (svp) final = SvUV(*svp); - Newz(0, d, len*2+1, U8); - dst = d; + New(0, d, len*3+UTF8_MAXLEN, U8); + dend = d + len * 3; + dstart = d; if (squash) { UV puv = 0xfeedface; while (s < send) { - if (SvUTF8(sv)) - uv = swash_fetch(rv, s); - else { - U8 tmpbuf[2]; - uv = *s++; - if (uv < 0x80) - tmpbuf[0] = uv; - else { - tmpbuf[0] = (( uv >> 6) | 0xc0); - tmpbuf[1] = (( uv & 0x3f) | 0x80); - } - uv = swash_fetch(rv, tmpbuf); + uv = swash_fetch(rv, s); + + if (d >= dend) { + STRLEN clen = d - dstart, nlen = dend - dstart + len; + Renew(dstart, nlen+UTF8_MAXLEN, U8); + d = dstart + clen; + dend = dstart + nlen; } - if (uv < none) { matches++; if (uv != puv) { - if ((uv & 0x80) && !isutf++) - HALF_UTF8_UPGRADE(dst,d); d = uv_to_utf8(d, uv); puv = uv; } @@ -367,9 +411,9 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ continue; } else if (uv == none) { /* "none" is unmapped character */ - STRLEN ulen; - *d++ = (U8)utf8_to_uv(s, send - s, &ulen, 0); - s += ulen; + int i = UTF8SKIP(s); + while(i--) + *d++ = *s++; puv = 0xfeedface; continue; } @@ -388,18 +432,12 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ } else { while (s < send) { - if (SvUTF8(sv)) - uv = swash_fetch(rv, s); - else { - U8 tmpbuf[2]; - uv = *s++; - if (uv < 0x80) - tmpbuf[0] = uv; - else { - tmpbuf[0] = (( uv >> 6) | 0xc0); - tmpbuf[1] = (( uv & 0x3f) | 0x80); - } - uv = swash_fetch(rv, tmpbuf); + uv = swash_fetch(rv, s); + if (d >= dend) { + STRLEN clen = d - dstart, nlen = dend - dstart + len; + Renew(dstart, nlen+UTF8_MAXLEN, U8); + d = dstart + clen; + dend = dstart + nlen; } if (uv < none) { matches++; @@ -408,9 +446,9 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ continue; } else if (uv == none) { /* "none" is unmapped character */ - STRLEN ulen; - *d++ = (U8)utf8_to_uv(s, send - s, &ulen, 0); - s += ulen; + int i = UTF8SKIP(s); + while(i--) + *d++ = *s++; continue; } else if (uv == extra && !del) { @@ -423,12 +461,13 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ s += UTF8SKIP(s); } } - if (dst) - sv_usepvn(sv, (char*)dst, d - dst); - else { - *d = '\0'; - SvCUR_set(sv, d - (U8*)SvPVX(sv)); - } + *d = '\0'; + sv_setpvn(sv, (char*)dstart, d - dstart); + SvUTF8_on(sv); + if (hibit) + Safefree(start); + if (!isutf8 && !(PL_hints & HINT_UTF8)) + sv_utf8_downgrade(sv, TRUE); SvSETMAGIC(sv); return matches; @@ -1030,7 +1030,9 @@ #define regcp_set_to S_regcp_set_to #define cache_re S_cache_re #define reghop S_reghop +#define reghop3 S_reghop3 #define reghopmaybe S_reghopmaybe +#define reghopmaybe3 S_reghopmaybe3 #define find_byclass S_find_byclass #endif #if defined(PERL_IN_RUN_C) || defined(PERL_DECL_PROT) @@ -2492,7 +2494,9 @@ #define regcp_set_to(a) S_regcp_set_to(aTHX_ a) #define cache_re(a) S_cache_re(aTHX_ a) #define reghop(a,b) S_reghop(aTHX_ a,b) +#define reghop3(a,b,c) S_reghop3(aTHX_ a,b,c) #define reghopmaybe(a,b) S_reghopmaybe(aTHX_ a,b) +#define reghopmaybe3(a,b,c) S_reghopmaybe3(aTHX_ a,b,c) #define find_byclass(a,b,c,d,e,f) S_find_byclass(aTHX_ a,b,c,d,e,f) #endif #if defined(PERL_IN_RUN_C) || defined(PERL_DECL_PROT) @@ -4853,8 +4857,12 @@ #define cache_re S_cache_re #define S_reghop CPerlObj::S_reghop #define reghop S_reghop +#define S_reghop3 CPerlObj::S_reghop3 +#define reghop3 S_reghop3 #define S_reghopmaybe CPerlObj::S_reghopmaybe #define reghopmaybe S_reghopmaybe +#define S_reghopmaybe3 CPerlObj::S_reghopmaybe3 +#define reghopmaybe3 S_reghopmaybe3 #define S_find_byclass CPerlObj::S_find_byclass #define find_byclass S_find_byclass #endif @@ -2407,7 +2407,9 @@ s |char*|regcppop s |char*|regcp_set_to |I32 ss s |void |cache_re |regexp *prog s |U8* |reghop |U8 *pos|I32 off +s |U8* |reghop3 |U8 *pos|I32 off|U8 *lim s |U8* |reghopmaybe |U8 *pos|I32 off +s |U8* |reghopmaybe3 |U8 *pos|I32 off|U8 *lim s |char* |find_byclass |regexp * prog|regnode *c|char *s|char *strend|char *startpos|I32 norun #endif diff --git a/lib/unicode/distinct.pm b/lib/unicode/distinct.pm new file mode 100644 index 0000000000..6471ac8d84 --- /dev/null +++ b/lib/unicode/distinct.pm @@ -0,0 +1,35 @@ +package unicode:distinct; + +our $VERSION = '0.01'; + +$unicode::distinct::hint_bits = 0x01000000; + +sub import { + $^H |= $unicode::distinct::hint_bits; +} + +sub unimport { + $^H &= ~$unicode::distinct::hint_bits; +} + +1; +__END__ + +=head1 NAME + +unicode::distinct - Perl pragma to strictly distinguish UTF8 data and non-UTF data. + +=head1 SYNOPSIS + + use unicode::distinct; + no unicode::distinct; + +=head1 DESCRIPTION + + *NOT YET* + +=head1 SEE ALSO + +L<perlunicode>, L<utf8> + +=cut @@ -366,6 +366,11 @@ Perl_magic_regdatum_get(pTHX_ SV *sv, MAGIC *mg) i = t; else /* @- */ i = s; + + if (i > 0 && DO_UTF8(PL_reg_sv)) { + char *b = rx->subbeg; + i = Perl_utf8_length(aTHX_ (U8*)b, (U8*)(b+i)); + } sv_setiv(sv,i); } } @@ -1410,7 +1415,14 @@ Perl_magic_setsubstr(pTHX_ SV *sv, MAGIC *mg) { STRLEN len; char *tmps = SvPV(sv,len); - sv_insert(LvTARG(sv),LvTARGOFF(sv),LvTARGLEN(sv), tmps, len); + if (DO_UTF8(sv)) { + sv_utf8_upgrade(LvTARG(sv)); + sv_insert(LvTARG(sv),LvTARGOFF(sv),LvTARGLEN(sv), tmps, len); + SvUTF8_on(LvTARG(sv)); + } + else + sv_insert(LvTARG(sv),LvTARGOFF(sv),LvTARGLEN(sv), tmps, len); + return 0; } @@ -6772,6 +6772,8 @@ Perl_peep(pTHX_ register OP *o) if (!fields || !GvHV(*fields)) break; key = SvPV(*svp, keylen); + if (SvUTF8(*svp)) + keylen = -keylen; indsvp = hv_fetch(GvHV(*fields), key, keylen, FALSE); if (!indsvp) { Perl_croak(aTHX_ "No such pseudo-hash field \"%s\" in variable %s of type %s", @@ -6837,6 +6839,8 @@ Perl_peep(pTHX_ register OP *o) key_op = (SVOP*)key_op->op_sibling) { svp = cSVOPx_svp(key_op); key = SvPV(*svp, keylen); + if (SvUTF8(*svp)) + keylen = -keylen; indsvp = hv_fetch(GvHV(*fields), key, keylen, FALSE); if (!indsvp) { Perl_croak(aTHX_ "No such pseudo-hash field \"%s\" " @@ -250,6 +250,9 @@ struct pmop { #define PMdf_USED 0x01 /* pm has been used once already */ #define PMdf_TAINTED 0x02 /* pm compiled from tainted pattern */ #define PMdf_UTF8 0x04 /* pm compiled from utf8 data */ +#define PMdf_DYN_UTF8 0x08 + +#define PMdf_CMP_UTF8 (PMdf_UTF8|PMdf_DYN_UTF8) #define PMf_RETAINT 0x0001 /* taint $1 etc. if target tainted */ #define PMf_ONCE 0x0002 /* use pattern only once per reset */ @@ -2698,6 +2698,7 @@ enum { /* pass one of these to get_vtbl */ #define HINT_FILETEST_ACCESS 0x00400000 #define HINT_UTF8 0x00800000 +#define HINT_UTF8_DISTINCT 0x01000000 /* Various states of an input record separator SV (rs, nrs) */ #define RsSNARF(sv) (! SvOK(sv)) diff --git a/pod/perlapi.pod b/pod/perlapi.pod index 642efe96ea..f5596e27c9 100644 --- a/pod/perlapi.pod +++ b/pod/perlapi.pod @@ -2582,8 +2582,9 @@ Found in file sv.c =item sv_catsv -Concatenates the string from SV C<ssv> onto the end of the string in SV -C<dsv>. Handles 'get' magic, but not 'set' magic. See C<sv_catsv_mg>. +Concatenates the string from SV C<ssv> onto the end of the string in +SV C<dsv>. Modifies C<dsv> but not C<ssv>. Handles 'get' magic, but +not 'set' magic. See C<sv_catsv_mg>. void sv_catsv(SV* dsv, SV* ssv) @@ -3283,8 +3284,7 @@ Found in file utf8.c Returns the character value of the first character in the string C<s> which is assumed to be in UTF8 encoding and no longer than C<curlen>; -C<retlen> will be set to the length, in bytes, of that character, -and the pointer C<s> will be advanced to the end of the character. +C<retlen> will be set to the length, in bytes, of that character. If C<s> does not point to a well-formed UTF8 character, the behaviour is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY, @@ -3306,8 +3306,7 @@ Found in file utf8.c Returns the character value of the first character in the string C<s> which is assumed to be in UTF8 encoding; C<retlen> will be set to the -length, in bytes, of that character, and the pointer C<s> will be -advanced to the end of the character. +length, in bytes, of that character. If C<s> does not point to a well-formed UTF8 character, zero is returned and retlen is set, if possible, to -1. @@ -5739,9 +5739,9 @@ PP(pp_split) AV *ary; register IV limit = POPi; /* note, negative is forever */ SV *sv = POPs; - bool do_utf8 = DO_UTF8(sv); STRLEN len; register char *s = SvPV(sv, len); + bool do_utf8 = DO_UTF8(sv); char *strend = s + len; register PMOP *pm; register REGEXP *rx; @@ -5878,7 +5878,7 @@ PP(pp_split) SV *csv = CALLREG_INTUIT_STRING(aTHX_ rx); len = rx->minlen; - if (len == 1 && !tail) { + if (len == 1 && !(rx->reganch & ROPT_UTF8) && !tail) { STRLEN n_a; char c = *SvPV(csv, n_a); while (--limit) { @@ -5895,7 +5895,10 @@ PP(pp_split) XPUSHs(dstr); /* The rx->minlen is in characters but we want to step * s ahead by bytes. */ - s = m + (do_utf8 ? SvCUR(csv) : len); + if (do_utf8) + s = (char*)utf8_hop((U8*)m, len); + else + s = m + len; /* Fake \n at the end */ } } else { @@ -5914,7 +5917,10 @@ PP(pp_split) XPUSHs(dstr); /* The rx->minlen is in characters but we want to step * s ahead by bytes. */ - s = m + (do_utf8 ? SvCUR(csv) : len); /* Fake \n at the end */ + if (do_utf8) + s = (char*)utf8_hop((U8*)m, len); + else + s = m + len; /* Fake \n at the end */ } } } @@ -115,7 +115,9 @@ PP(pp_regcomp) pm->op_pmflags = pm->op_pmpermflags; /* reset case sensitivity */ if (DO_UTF8(tmpstr)) - pm->op_pmdynflags |= PMdf_UTF8; + pm->op_pmdynflags |= PMdf_DYN_UTF8; + else + pm->op_pmdynflags &= ~PMdf_DYN_UTF8; pm->op_pmregexp = CALLREGCOMP(aTHX_ t, t + len, pm); PL_reginterp_cnt = 0; /* XXXX Be extra paranoid - needed inside tie/overload accessors. */ @@ -185,12 +187,12 @@ PP(pp_substcont) SvPVX(targ) = SvPVX(dstr); SvCUR_set(targ, SvCUR(dstr)); SvLEN_set(targ, SvLEN(dstr)); + if (DO_UTF8(dstr)) + SvUTF8_on(targ); SvPVX(dstr) = 0; sv_free(dstr); TAINT_IF(cx->sb_rxtainted & 1); - if (pm->op_pmdynflags & PMdf_UTF8) - SvUTF8_on(targ); /* could also copy SvUTF8(dstr)? */ PUSHs(sv_2mortal(newSViv((I32)cx->sb_iters - 1))); (void)SvPOK_only_UTF8(targ); @@ -1439,8 +1441,13 @@ Perl_die_where(pTHX_ char *message, STRLEN msglen) } } } - else + else { sv_setpvn(ERRSV, message, msglen); + if (PL_hints & HINT_UTF8) + SvUTF8_on(ERRSV); + else + SvUTF8_off(ERRSV); + } } else message = SvPVx(ERRSV, msglen); @@ -2754,7 +2761,7 @@ Perl_sv_compile_2op(pTHX_ SV *sv, OP** startop, char *code, AV** avp) #else SAVEVPTR(PL_op); #endif - PL_hints = 0; + PL_hints &= HINT_UTF8; PL_op = &dummy; PL_op->op_type = OP_ENTEREVAL; @@ -140,30 +140,31 @@ PP(pp_concat) djSP; dATARGET; tryAMAGICbin(concat,opASSIGN); { dPOPTOPssrl; - STRLEN len; - U8 *s; + STRLEN len, llen, rlen; + U8 *s, *l, *r; bool left_utf8; bool right_utf8; - if (TARG == right && SvGMAGICAL(right)) - mg_get(right); - if (SvGMAGICAL(left)) + r = (U8*)SvPV(right,rlen); + + if (TARG != left) + l = (U8*)SvPV(left,llen); + else if (SvGMAGICAL(left)) mg_get(left); left_utf8 = DO_UTF8(left); right_utf8 = DO_UTF8(right); - if (left_utf8 != right_utf8) { + if (left_utf8 != right_utf8 && !IN_BYTE) { if (TARG == right && !right_utf8) { sv_utf8_upgrade(TARG); /* Now straight binary copy */ SvUTF8_on(TARG); } else { /* Set TARG to PV(left), then add right */ - U8 *l, *c, *olds = NULL; + U8 *c, *olds = NULL; STRLEN targlen; - s = (U8*)SvPV(right,len); - right_utf8 |= DO_UTF8(right); + s = r; len = rlen; if (TARG == right) { /* Take a copy since we're about to overwrite TARG */ olds = s = (U8*)savepvn((char*)s, len); @@ -174,12 +175,12 @@ PP(pp_concat) else sv_setpv(left, ""); /* Suppress warning. */ } - l = (U8*)SvPV(left, targlen); - left_utf8 |= DO_UTF8(left); if (TARG != left) - sv_setpvn(TARG, (char*)l, targlen); - if (!left_utf8) - sv_utf8_upgrade(TARG); + sv_setpvn(TARG, (char*)l, llen); + if (!left_utf8) { + SvUTF8_off(TARG); + sv_utf8_upgrade(TARG); + } /* Extend TARG to length of right (s) */ targlen = SvCUR(TARG) + len; if (!right_utf8) { @@ -207,17 +208,16 @@ PP(pp_concat) } if (TARG != left) { - s = (U8*)SvPV(left,len); if (TARG == right) { - sv_insert(TARG, 0, 0, (char*)s, len); + sv_insert(TARG, 0, 0, (char*)l, llen); SETs(TARG); RETURN; } - sv_setpvn(TARG, (char *)s, len); + sv_setpvn(TARG, (char *)l, llen); } else if (!SvOK(TARG) && SvTYPE(TARG) <= SVt_PVMG) sv_setpv(TARG, ""); /* Suppress warning. */ - s = (U8*)SvPV(right,len); + s = r; len = rlen; if (SvOK(TARG)) { #if defined(PERL_Y2KWARN) if ((SvIOK(right) || SvNOK(right)) && ckWARN(WARN_Y2K)) { @@ -235,7 +235,7 @@ PP(pp_concat) } else sv_setpvn(TARG, (char *)s, len); /* suppress warning */ - if (left_utf8) + if (left_utf8 && !IN_BYTE) SvUTF8_on(TARG); SETTARG; RETURN; @@ -1240,7 +1240,8 @@ play_it_again: if (update_minmatch++) minmatch = had_zerolen; } - if (rx->reganch & RE_USE_INTUIT) { + if (rx->reganch & RE_USE_INTUIT && + DO_UTF8(TARG) == ((rx->reganch & ROPT_UTF8) != 0)) { s = CALLREG_INTUIT_START(aTHX_ rx, TARG, s, strend, r_flags, NULL); if (!s) @@ -1337,7 +1338,13 @@ yup: /* Confirmed by INTUIT */ if (global) { rx->subbeg = truebase; rx->startp[0] = s - truebase; - rx->endp[0] = s - truebase + rx->minlen; + if (DO_UTF8(PL_reg_sv)) { + char *t = (char*)utf8_hop((U8*)s, rx->minlen); + rx->endp[0] = t - truebase; + } + else { + rx->endp[0] = s - truebase + rx->minlen; + } rx->sublen = strend - truebase; goto gotcha; } @@ -2000,6 +2007,8 @@ PP(pp_subst) if (CALLREGEXEC(aTHX_ rx, s, strend, orig, 0, TARG, NULL, r_flags | REXEC_CHECKED)) { + bool isutf8; + if (force_on_match) { force_on_match = 0; s = SvPV_force(TARG, len); @@ -2045,6 +2054,7 @@ PP(pp_subst) SvPVX(TARG) = SvPVX(dstr); SvCUR_set(TARG, SvCUR(dstr)); SvLEN_set(TARG, SvLEN(dstr)); + isutf8 = DO_UTF8(dstr); SvPVX(dstr) = 0; sv_free(dstr); @@ -2053,6 +2063,8 @@ PP(pp_subst) PUSHs(sv_2mortal(newSViv((I32)iters))); (void)SvPOK_only(TARG); + if (isutf8) + SvUTF8_on(TARG); TAINT_IF(rxtainted); SvSETMAGIC(TARG); SvTAINT(TARG); @@ -1147,7 +1147,9 @@ STATIC char* S_regcppop(pTHX); STATIC char* S_regcp_set_to(pTHX_ I32 ss); STATIC void S_cache_re(pTHX_ regexp *prog); STATIC U8* S_reghop(pTHX_ U8 *pos, I32 off); +STATIC U8* S_reghop3(pTHX_ U8 *pos, I32 off, U8 *lim); STATIC U8* S_reghopmaybe(pTHX_ U8 *pos, I32 off); +STATIC U8* S_reghopmaybe3(pTHX_ U8 *pos, I32 off, U8 *lim); STATIC char* S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun); #endif @@ -127,6 +127,7 @@ typedef struct RExC_state_t { I32 extralen; I32 seen_zerolen; I32 seen_evals; + I32 utf8; #if ADD_TO_REGEXEC char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) @@ -148,6 +149,7 @@ typedef struct RExC_state_t { #define RExC_extralen (pRExC_state->extralen) #define RExC_seen_zerolen (pRExC_state->seen_zerolen) #define RExC_seen_evals (pRExC_state->seen_evals) +#define RExC_utf8 (pRExC_state->utf8) #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ @@ -229,8 +231,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) #define SCF_WHILEM_VISITED_POS 0x2000 -#define RF_utf8 8 -#define UTF (PL_reg_flags & RF_utf8) +#define UTF RExC_utf8 #define LOC (RExC_flags16 & PMf_LOCALE) #define FOLD (RExC_flags16 & PMf_FOLD) @@ -469,7 +470,7 @@ S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *c ANYOF_CLASS_ZERO(cl); for (value = 0; value < 256; ++value) ANYOF_BITMAP_SET(cl, value); - cl->flags = ANYOF_EOS; + cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL; if (LOC) cl->flags |= ANYOF_LOCALE; } @@ -483,6 +484,8 @@ S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl) for (value = 0; value <= ANYOF_MAX; value += 2) if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1)) return 1; + if (!(cl->flags & ANYOF_UNICODE_ALL)) + return 0; for (value = 0; value < 256; ++value) if (!ANYOF_BITMAP_TEST(cl, value)) return 0; @@ -530,6 +533,16 @@ S_cl_and(pTHX_ struct regnode_charclass_class *cl, } /* XXXX: logic is complicated otherwise, leave it along for a moment. */ if (!(and_with->flags & ANYOF_EOS)) cl->flags &= ~ANYOF_EOS; + + if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) { + cl->flags &= ~ANYOF_UNICODE_ALL; + cl->flags |= ANYOF_UNICODE; + ARG_SET(cl, ARG(and_with)); + } + if (!(and_with->flags & ANYOF_UNICODE_ALL)) + cl->flags &= ~ANYOF_UNICODE_ALL; + if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL))) + cl->flags &= ~ANYOF_UNICODE; } /* 'OR' a given class with another one. Can create false positives */ @@ -580,6 +593,16 @@ S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, str } if (or_with->flags & ANYOF_EOS) cl->flags |= ANYOF_EOS; + + if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE && + ARG(cl) != ARG(or_with)) { + cl->flags |= ANYOF_UNICODE_ALL; + cl->flags &= ~ANYOF_UNICODE; + } + if (or_with->flags & ANYOF_UNICODE_ALL) { + cl->flags |= ANYOF_UNICODE_ALL; + cl->flags &= ~ANYOF_UNICODE; + } } /* REx optimizer. Converts nodes into quickier variants "in place". @@ -787,15 +810,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg } else if (OP(scan) == EXACT) { I32 l = STR_LEN(scan); + UV uc = *((U8*)STRING(scan)); if (UTF) { - unsigned char *s = (unsigned char *)STRING(scan); - unsigned char *e = s + l; - I32 newl = 0; - while (s < e) { - newl++; - s += UTF8SKIP(s); - } - l = newl; + U8 *s = (U8*)STRING(scan); + l = utf8_length(s, s + l); + uc = utf8_to_uv_simple(s, NULL); } min += l; if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */ @@ -815,21 +834,22 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg /* Check whether it is compatible with what we know already! */ int compat = 1; - if (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) - && !ANYOF_BITMAP_TEST(data->start_class, *STRING(scan)) + if (uc >= 0x100 || + !(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) + && !ANYOF_BITMAP_TEST(data->start_class, uc) && (!(data->start_class->flags & ANYOF_FOLD) - || !ANYOF_BITMAP_TEST(data->start_class, - PL_fold[*(U8*)STRING(scan)]))) + || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc]))) compat = 0; ANYOF_CLASS_ZERO(data->start_class); ANYOF_BITMAP_ZERO(data->start_class); if (compat) - ANYOF_BITMAP_SET(data->start_class, *STRING(scan)); + ANYOF_BITMAP_SET(data->start_class, uc); data->start_class->flags &= ~ANYOF_EOS; } else if (flags & SCF_DO_STCLASS_OR) { /* false positive possible if the class is case-folded */ - ANYOF_BITMAP_SET(data->start_class, *STRING(scan)); + if (uc < 0x100) + ANYOF_BITMAP_SET(data->start_class, uc); data->start_class->flags &= ~ANYOF_EOS; cl_and(data->start_class, &and_with); } @@ -837,19 +857,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg } else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */ I32 l = STR_LEN(scan); + UV uc = *((U8*)STRING(scan)); /* Search for fixed substrings supports EXACT only. */ if (flags & SCF_DO_SUBSTR) scan_commit(pRExC_state, data); if (UTF) { - unsigned char *s = (unsigned char *)STRING(scan); - unsigned char *e = s + l; - I32 newl = 0; - while (s < e) { - newl++; - s += UTF8SKIP(s); - } - l = newl; + U8 *s = (U8 *)STRING(scan); + l = utf8_length(s, s + l); + uc = utf8_to_uv_simple(s, NULL); } min += l; if (data && (flags & SCF_DO_SUBSTR)) @@ -858,15 +874,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg /* Check whether it is compatible with what we know already! */ int compat = 1; - if (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) - && !ANYOF_BITMAP_TEST(data->start_class, *STRING(scan)) - && !ANYOF_BITMAP_TEST(data->start_class, - PL_fold[*(U8*)STRING(scan)])) + if (uc >= 0x100 || + !(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) + && !ANYOF_BITMAP_TEST(data->start_class, uc) + && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])) compat = 0; ANYOF_CLASS_ZERO(data->start_class); ANYOF_BITMAP_ZERO(data->start_class); if (compat) { - ANYOF_BITMAP_SET(data->start_class, *STRING(scan)); + ANYOF_BITMAP_SET(data->start_class, uc); data->start_class->flags &= ~ANYOF_EOS; data->start_class->flags |= ANYOF_FOLD; if (OP(scan) == EXACTFL) @@ -877,7 +893,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg if (data->start_class->flags & ANYOF_FOLD) { /* false positive possible if the class is case-folded. Assume that the locale settings are the same... */ - ANYOF_BITMAP_SET(data->start_class, *STRING(scan)); + if (uc < 0x100) + ANYOF_BITMAP_SET(data->start_class, uc); data->start_class->flags &= ~ANYOF_EOS; } cl_and(data->start_class, &and_with); @@ -1580,11 +1597,10 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) FAIL("NULL regexp argument"); /* XXXX This looks very suspicious... */ - if (pm->op_pmdynflags & PMdf_UTF8) { - PL_reg_flags |= RF_utf8; - } + if (pm->op_pmdynflags & PMdf_CMP_UTF8) + RExC_utf8 = 1; else - PL_reg_flags = 0; + RExC_utf8 = 0; RExC_precomp = savepvn(exp, xend - exp); DEBUG_r(if (!PL_colorset) reginitcolors()); @@ -1705,9 +1721,9 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) /* Starting-point info. */ again: if (PL_regkind[(U8)OP(first)] == EXACT) { - if (OP(first) == EXACT); /* Empty, get anchored substr later. */ - else if ((OP(first) == EXACTF || OP(first) == EXACTFL) - && !UTF) + if (OP(first) == EXACT) + ; /* Empty, get anchored substr later. */ + else if ((OP(first) == EXACTF || OP(first) == EXACTFL)) r->regstclass = first; } else if (strchr((char*)PL_simple,OP(first))) @@ -3164,6 +3180,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) SV *listsv; register char *e; UV n; + bool dont_optimize_invert = FALSE; ret = reganode(pRExC_state, ANYOF, 0); @@ -3350,6 +3367,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isALNUM(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n"); break; case ANYOF_NALNUM: @@ -3360,6 +3378,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isALNUM(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n"); break; case ANYOF_ALNUMC: @@ -3370,6 +3389,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isALNUMC(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n"); break; case ANYOF_NALNUMC: @@ -3380,6 +3400,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isALNUMC(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n"); break; case ANYOF_ALPHA: @@ -3390,6 +3411,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isALPHA(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n"); break; case ANYOF_NALPHA: @@ -3400,6 +3422,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isALPHA(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n"); break; case ANYOF_ASCII: @@ -3415,6 +3438,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) ANYOF_BITMAP_SET(ret, value); #endif /* EBCDIC */ } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n"); break; case ANYOF_NASCII: @@ -3430,6 +3454,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) ANYOF_BITMAP_SET(ret, value); #endif /* EBCDIC */ } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n"); break; case ANYOF_BLANK: @@ -3440,6 +3465,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isBLANK(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n"); break; case ANYOF_NBLANK: @@ -3450,6 +3476,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isBLANK(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n"); break; case ANYOF_CNTRL: @@ -3460,6 +3487,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isCNTRL(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n"); break; case ANYOF_NCNTRL: @@ -3470,6 +3498,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isCNTRL(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n"); break; case ANYOF_DIGIT: @@ -3480,6 +3509,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) for (value = '0'; value <= '9'; value++) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n"); break; case ANYOF_NDIGIT: @@ -3492,6 +3522,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) for (value = '9' + 1; value < 256; value++) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n"); break; case ANYOF_GRAPH: @@ -3502,6 +3533,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isGRAPH(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n"); break; case ANYOF_NGRAPH: @@ -3512,6 +3544,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isGRAPH(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n"); break; case ANYOF_LOWER: @@ -3522,6 +3555,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isLOWER(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n"); break; case ANYOF_NLOWER: @@ -3532,6 +3566,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isLOWER(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n"); break; case ANYOF_PRINT: @@ -3542,6 +3577,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isPRINT(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n"); break; case ANYOF_NPRINT: @@ -3552,6 +3588,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isPRINT(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n"); break; case ANYOF_PSXSPC: @@ -3562,6 +3599,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isPSXSPC(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n"); break; case ANYOF_NPSXSPC: @@ -3572,6 +3610,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isPSXSPC(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n"); break; case ANYOF_PUNCT: @@ -3582,6 +3621,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isPUNCT(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n"); break; case ANYOF_NPUNCT: @@ -3592,6 +3632,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isPUNCT(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n"); break; case ANYOF_SPACE: @@ -3602,6 +3643,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isSPACE(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n"); break; case ANYOF_NSPACE: @@ -3612,6 +3654,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isSPACE(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n"); break; case ANYOF_UPPER: @@ -3622,6 +3665,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isUPPER(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n"); break; case ANYOF_NUPPER: @@ -3632,6 +3676,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isUPPER(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n"); break; case ANYOF_XDIGIT: @@ -3642,6 +3687,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (isXDIGIT(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n"); break; case ANYOF_NXDIGIT: @@ -3652,6 +3698,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!isXDIGIT(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n"); break; default: @@ -3755,12 +3802,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) } /* optimize inverted simple patterns (e.g. [^a-z]) */ - if (!SIZE_ONLY && + if (!SIZE_ONLY && !dont_optimize_invert && /* If the only flag is inversion. */ (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) { for (value = 0; value < ANYOF_BITMAP_SIZE; ++value) ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL; - ANYOF_FLAGS(ret) = 0; + ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL; } if (!SIZE_ONLY) { @@ -4218,6 +4265,8 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) if (flags & ANYOF_UNICODE) sv_catpv(sv, "{unicode}"); + else if (flags & ANYOF_UNICODE_ALL) + sv_catpv(sv, "{all-unicode}"); { SV *lv; @@ -192,6 +192,7 @@ struct regnode_charclass_class { /* There is a character or a range past 0xff */ #define ANYOF_UNICODE 0x20 +#define ANYOF_UNICODE_ALL 0x40 /* Can match any char past 0xff */ /* Are there any runtime flags on in this node? */ #define ANYOF_RUNTIME(s) (ANYOF_FLAGS(s) & 0x0f) @@ -107,15 +107,22 @@ */ #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) -#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b) +#define CHR_DIST(a,b) (DO_UTF8(PL_reg_sv) ? utf8_distance(a,b) : a - b) #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off)) #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off)) -#define HOP(pos,off) (UTF ? reghop((U8*)pos, off) : (U8*)(pos + off)) -#define HOPMAYBE(pos,off) (UTF ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off)) +#define HOP(pos,off) (DO_UTF8(PL_reg_sv) ? reghop((U8*)pos, off) : (U8*)(pos + off)) +#define HOPMAYBE(pos,off) (DO_UTF8(PL_reg_sv) ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off)) #define HOPc(pos,off) ((char*)HOP(pos,off)) #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off)) +#define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim)) +#define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim)) +#define HOP3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off)) +#define HOPMAYBE3(pos,off,lim) (DO_UTF8(PL_reg_sv) ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off)) +#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) +#define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim)) + static void restore_pos(pTHXo_ void *arg); @@ -354,11 +361,15 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, (strend - strpos > 60 ? "..." : "")) ); - if (prog->minlen > strend - strpos) { + if (prog->reganch & ROPT_UTF8) + PL_reg_flags |= RF_utf8; + + if (prog->minlen > CHR_DIST((U8*)strend, (U8*)strpos)) { DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short...\n")); goto fail; } strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos; + PL_regeol = strend; check = prog->check_substr; if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */ ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE) @@ -377,8 +388,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, /* Substring at constant offset from beg-of-str... */ I32 slen; - PL_regeol = strend; /* Used in HOP() */ - s = HOPc(strpos, prog->check_offset_min); + s = HOP3c(strpos, prog->check_offset_min, strend); if (SvTAIL(check)) { slen = SvCUR(check); /* >= 1 */ @@ -412,7 +422,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, if (!ml_anch) { I32 end = prog->check_offset_max + CHR_SVLEN(check) - (SvTAIL(check) != 0); - I32 eshift = strend - s - end; + I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end; if (end_shift < eshift) end_shift = eshift; @@ -451,8 +461,8 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, *data->scream_olds = s; } else - s = fbm_instr((unsigned char*)s + start_shift, - (unsigned char*)strend - end_shift, + s = fbm_instr(HOP3(s, start_shift, strend), + HOP3(strend, -end_shift, strbeg), check, PL_multiline ? FBMrf_MULTILINE : 0); /* Update the count-of-usability, remove useless subpatterns, @@ -491,34 +501,33 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, if (check == prog->float_substr) { do_other_anchored: { - char *last = s - start_shift, *last1, *last2; + char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2; char *s1 = s; - tmp = PL_bostr; t = s - prog->check_offset_max; if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ && (!(prog->reganch & ROPT_UTF8) - || (PL_bostr = strpos, /* Used in regcopmaybe() */ - (t = reghopmaybe_c(s, -(prog->check_offset_max))) + || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos)) && t > strpos))) /* EMPTY */; else t = strpos; - t += prog->anchored_offset; + t = HOP3c(t, prog->anchored_offset, strend); if (t < other_last) /* These positions already checked */ t = other_last; - PL_bostr = tmp; - last2 = last1 = strend - prog->minlen; + last2 = last1 = HOP3c(strend, -prog->minlen, strbeg); if (last < last1) last1 = last; /* XXXX It is not documented what units *_offsets are in. Assume bytes. */ /* On end-of-str: see comment below. */ s = fbm_instr((unsigned char*)t, - (unsigned char*)last1 + prog->anchored_offset - + SvCUR(prog->anchored_substr) - - (SvTAIL(prog->anchored_substr)!=0), - prog->anchored_substr, PL_multiline ? FBMrf_MULTILINE : 0); - DEBUG_r(PerlIO_printf(Perl_debug_log, "%s anchored substr `%s%.*s%s'%s", + HOP3(HOP3(last1, prog->anchored_offset, strend) + + SvCUR(prog->anchored_substr), + -(SvTAIL(prog->anchored_substr)!=0), strbeg), + prog->anchored_substr, + PL_multiline ? FBMrf_MULTILINE : 0); + DEBUG_r(PerlIO_printf(Perl_debug_log, + "%s anchored substr `%s%.*s%s'%s", (s ? "Found" : "Contradicts"), PL_colors[0], (int)(SvCUR(prog->anchored_substr) @@ -533,17 +542,16 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, } DEBUG_r(PerlIO_printf(Perl_debug_log, ", trying floating at offset %ld...\n", - (long)(s1 + 1 - i_strpos))); - PL_regeol = strend; /* Used in HOP() */ - other_last = last1 + prog->anchored_offset + 1; - s = HOPc(last, 1); + (long)(HOP3c(s1, 1, strend) - i_strpos))); + other_last = HOP3c(last1, prog->anchored_offset+1, strend); + s = HOP3c(last, 1, strend); goto restart; } else { DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n", (long)(s - i_strpos))); - t = s - prog->anchored_offset; - other_last = s + 1; + t = HOP3c(s, -prog->anchored_offset, strbeg); + other_last = HOP3c(s, 1, strend); s = s1; if (t == strpos) goto try_at_start; @@ -555,11 +563,12 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, char *last, *last1; char *s1 = s; - t = s - start_shift; - last1 = last = strend - prog->minlen + prog->float_min_offset; - if (last - t > prog->float_max_offset) - last = t + prog->float_max_offset; - s = t + prog->float_min_offset; + t = HOP3c(s, -start_shift, strbeg); + last1 = last = + HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg); + if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset) + last = HOP3c(t, prog->float_max_offset, strend); + s = HOP3c(t, prog->float_min_offset, strend); if (s < other_last) s = other_last; /* XXXX It is not documented what units *_offsets are in. Assume bytes. */ @@ -587,8 +596,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, ", trying anchored starting at offset %ld...\n", (long)(s1 + 1 - i_strpos))); other_last = last + 1; - PL_regeol = strend; /* Used in HOP() */ - s = HOPc(t, 1); + s = HOP3c(t, 1, strend); goto restart; } else { @@ -604,13 +612,10 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, } t = s - prog->check_offset_max; - tmp = PL_bostr; if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ && (!(prog->reganch & ROPT_UTF8) - || (PL_bostr = strpos, /* Used in regcopmaybe() */ - ((t = reghopmaybe_c(s, -(prog->check_offset_max))) - && t > strpos)))) { - PL_bostr = tmp; + || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos)) + && t > strpos))) { /* Fixed substring is found far enough so that the match cannot start at strpos. */ try_at_offset: @@ -668,7 +673,6 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, ++BmUSEFUL(prog->check_substr); /* hooray/5 */ } else { - PL_bostr = tmp; /* The found string does not prohibit matching at strpos, - no optimization of calling REx engine can be performed, unless it was an MBOL and we are not after MBOL, @@ -721,13 +725,16 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, regstclass does not come from lookahead... */ /* If regstclass takes bytelength more than 1: If charlength==1, OK. This leaves EXACTF only, which is dealt with in find_byclass(). */ + U8* str = (U8*)STRING(prog->regstclass); int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT - ? STR_LEN(prog->regstclass) + ? CHR_DIST(str+STR_LEN(prog->regstclass), str) : 1); char *endpos = (prog->anchored_substr || ml_anch) - ? s + (prog->minlen? cl_l : 0) - : (prog->float_substr ? check_at - start_shift + cl_l - : strend) ; + ? HOP3c(s, (prog->minlen ? cl_l : 0), strend) + : (prog->float_substr + ? HOP3c(HOP3c(check_at, -start_shift, strbeg), + cl_l, strend) + : strend); char *startpos = strbeg; t = s; @@ -754,8 +761,7 @@ Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos, if (prog->anchored_substr == check) { DEBUG_r( what = "anchored" ); hop_and_restart: - PL_regeol = strend; /* Used in HOP() */ - s = HOPc(t, 1); + s = HOP3c(t, 1, strend); if (s + start_shift + end_shift > strend) { /* XXXX Should be taken into account earlier? */ DEBUG_r( PerlIO_printf(Perl_debug_log, @@ -854,8 +860,14 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta case EXACTF: m = STRING(c); ln = STR_LEN(c); - c1 = *(U8*)m; - c2 = PL_fold[c1]; + if (UTF) { + c1 = to_utf8_lower((U8*)m); + c2 = to_utf8_upper((U8*)m); + } + else { + c1 = *(U8*)m; + c2 = PL_fold[c1]; + } goto do_exactf; case EXACTFL: m = STRING(c); @@ -867,27 +879,45 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta if (norun && e < s) e = s; /* Due to minlen logic of intuit() */ - /* Here it is NOT UTF! */ - if (c1 == c2) { - while (s <= e) { - if ( *(U8*)s == c1 - && (ln == 1 || !(OP(c) == EXACTF - ? ibcmp(s, m, ln) - : ibcmp_locale(s, m, ln))) - && (norun || regtry(prog, s)) ) - goto got_it; - s++; - } - } else { - while (s <= e) { - if ( (*(U8*)s == c1 || *(U8*)s == c2) - && (ln == 1 || !(OP(c) == EXACTF - ? ibcmp(s, m, ln) - : ibcmp_locale(s, m, ln))) - && (norun || regtry(prog, s)) ) - goto got_it; - s++; - } + + if (do_utf8) { + STRLEN len; + if (c1 == c2) + while (s <= e) { + if ( utf8_to_uv_simple((U8*)s, &len) == c1 + && regtry(prog, s) ) + goto got_it; + s += len; + } + else + while (s <= e) { + UV c = utf8_to_uv_simple((U8*)s, &len); + if ( (c == c1 || c == c2) && regtry(prog, s) ) + goto got_it; + s += len; + } + } + else { + if (c1 == c2) + while (s <= e) { + if ( *(U8*)s == c1 + && (ln == 1 || !(OP(c) == EXACTF + ? ibcmp(s, m, ln) + : ibcmp_locale(s, m, ln))) + && (norun || regtry(prog, s)) ) + goto got_it; + s++; + } + else + while (s <= e) { + if ( (*(U8*)s == c1 || *(U8*)s == c2) + && (ln == 1 || !(OP(c) == EXACTF + ? ibcmp(s, m, ln) + : ibcmp_locale(s, m, ln))) + && (norun || regtry(prog, s)) ) + goto got_it; + s++; + } } break; case BOUNDL: @@ -898,7 +928,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta if (s == startpos) tmp = '\n'; else { - U8 *r = reghop((U8*)s, -1); + U8 *r = reghop3((U8*)s, -1, (U8*)startpos); tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0); } @@ -940,7 +970,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta if (s == startpos) tmp = '\n'; else { - U8 *r = reghop((U8*)s, -1); + U8 *r = reghop3((U8*)s, -1, (U8*)startpos); tmp = (I32)utf8_to_uv(r, s - (char*)r, 0, 0); } @@ -1346,6 +1376,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * I32 scream_pos = -1; /* Internal iterator of scream. */ char *scream_olds; SV* oreplsv = GvSV(PL_replgv); + bool do_utf8 = DO_UTF8(sv); PL_regcc = 0; @@ -1361,12 +1392,22 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * } minlen = prog->minlen; - if (strend - startpos < minlen) goto phooey; + if (do_utf8) { + if (utf8_distance((U8*)strend, (U8*)startpos) < minlen) goto phooey; + } + else { + if (strend - startpos < minlen) goto phooey; + } if (startpos == strbeg) /* is ^ valid at stringarg? */ PL_regprev = '\n'; else { - PL_regprev = (U32)stringarg[-1]; + if (prog->reganch & ROPT_UTF8 && do_utf8) { + U8 *s = reghop3((U8*)stringarg, -1, (U8*)strbeg); + PL_regprev = utf8_to_uv(s, (U8*)stringarg - s, NULL, 0); + } + else + PL_regprev = (U32)stringarg[-1]; if (!PL_multiline && PL_regprev == '\n') PL_regprev = '\0'; /* force ^ to NOT match */ } @@ -1454,7 +1495,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * if (minlen) dontbother = minlen - 1; - end = HOPc(strend, -dontbother) - 1; + end = HOP3c(strend, -dontbother, strbeg) - 1; /* for multiline we only have to try after newlines */ if (prog->check_substr) { if (s == startpos) @@ -1500,7 +1541,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * int did_match = 0; #endif - if (UTF) { + if (do_utf8) { while (s < strend) { if (*s == ch) { DEBUG_r( did_match = 1 ); @@ -1529,18 +1570,19 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * "Did not find anchored character...\n")); } /*SUPPRESS 560*/ - else if (prog->anchored_substr != Nullsv - || (prog->float_substr != Nullsv - && prog->float_max_offset < strend - s)) { + else if (do_utf8 == (UTF!=0) && + (prog->anchored_substr != Nullsv + || (prog->float_substr != Nullsv + && prog->float_max_offset < strend - s))) { SV *must = prog->anchored_substr ? prog->anchored_substr : prog->float_substr; I32 back_max = prog->anchored_substr ? prog->anchored_offset : prog->float_max_offset; I32 back_min = prog->anchored_substr ? prog->anchored_offset : prog->float_min_offset; - char *last = HOPc(strend, /* Cannot start after this */ + char *last = HOP3c(strend, /* Cannot start after this */ -(I32)(CHR_SVLEN(must) - - (SvTAIL(must) != 0) + back_min)); + - (SvTAIL(must) != 0) + back_min), strbeg); char *last1; /* Last position checked before */ #ifdef DEBUGGING int did_match = 0; @@ -1558,9 +1600,9 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * strend = HOPc(strend, -dontbother); while ( (s <= last) && ((flags & REXEC_SCREAM) - ? (s = screaminstr(sv, must, HOPc(s, back_min) - strbeg, + ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg, end_shift, &scream_pos, 0)) - : (s = fbm_instr((unsigned char*)HOP(s, back_min), + : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend), (unsigned char*)strend, must, PL_multiline ? FBMrf_MULTILINE : 0))) ) { DEBUG_r( did_match = 1 ); @@ -1574,7 +1616,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * last1 = HOPc(s, -back_min); s = t; } - if (UTF) { + if (do_utf8) { while (s <= last1) { if (regtry(prog, s)) goto got_it; @@ -1655,7 +1697,7 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * dontbother = minlen - 1; strend -= dontbother; /* this one's always in bytes! */ /* We don't know much -- general case. */ - if (UTF) { + if (do_utf8) { for (;;) { if (regtry(prog, s)) goto got_it; @@ -1926,20 +1968,25 @@ S_regmatch(pTHX_ regnode *prog) SV *prop = sv_newmortal(); int docolor = *PL_colors[0]; int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */ - int l = (PL_regeol - locinput > taill ? taill : PL_regeol - locinput); + int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput); /* The part of the string before starttry has one color (pref0_len chars), between starttry and current position another one (pref_len - pref0_len chars), after the current position the third one. We assume that pref0_len <= pref_len, otherwise we decrease pref0_len. */ - int pref_len = (locinput - PL_bostr > (5 + taill) - l - ? (5 + taill) - l : locinput - PL_bostr); - int pref0_len = pref_len - (locinput - PL_reg_starttry); + int pref_len = (locinput - PL_bostr) > (5 + taill) - l + ? (5 + taill) - l : locinput - PL_bostr; + int pref0_len; + while (UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len))) + pref_len++; + pref0_len = pref_len - (locinput - PL_reg_starttry); if (l + pref_len < (5 + taill) && l < PL_regeol - locinput) l = ( PL_regeol - locinput > (5 + taill) - pref_len ? (5 + taill) - pref_len : PL_regeol - locinput); + while (UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) + l--; if (pref0_len < 0) pref0_len = 0; if (pref0_len > pref_len) @@ -2013,7 +2060,7 @@ S_regmatch(pTHX_ regnode *prog) sayNO; break; case SANY: - if (DO_UTF8(PL_reg_sv)) { + if (do_utf8) { locinput += PL_utf8skip[nextchr]; if (locinput > PL_regeol) sayNO; @@ -2025,20 +2072,46 @@ S_regmatch(pTHX_ regnode *prog) nextchr = UCHARAT(++locinput); break; case REG_ANY: - if (DO_UTF8(PL_reg_sv)) { + if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n') + sayNO; + if (do_utf8) { locinput += PL_utf8skip[nextchr]; if (locinput > PL_regeol) sayNO; nextchr = UCHARAT(locinput); - break; } - if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n') - sayNO; - nextchr = UCHARAT(++locinput); + else + nextchr = UCHARAT(++locinput); break; case EXACT: s = STRING(scan); ln = STR_LEN(scan); + if (do_utf8 != (UTF!=0)) { + char *l = locinput; + char *e = s + ln; + STRLEN len; + if (do_utf8) + while (s < e) { + if (l >= PL_regeol) + sayNO; + if (*((U8*)s) != utf8_to_uv_simple((U8*)l, &len)) + sayNO; + s++; + l += len; + } + else + while (s < e) { + if (l >= PL_regeol) + sayNO; + if (*((U8*)l) != utf8_to_uv_simple((U8*)s, &len)) + sayNO; + s += len; + l++; + } + locinput = l; + nextchr = UCHARAT(locinput); + break; + } /* Inline the first character, for speed. */ if (UCHARAT(s) != nextchr) sayNO; @@ -2056,21 +2129,19 @@ S_regmatch(pTHX_ regnode *prog) s = STRING(scan); ln = STR_LEN(scan); - if (UTF) { + if (do_utf8) { char *l = locinput; - char *e = s + ln; + char *e; + e = s + ln; c1 = OP(scan) == EXACTF; while (s < e) { - if (l >= PL_regeol) - sayNO; - if (utf8_to_uv((U8*)s, e - s, 0, 0) != - (c1 ? - toLOWER_utf8((U8*)l) : - toLOWER_LC_utf8((U8*)l))) - { + if (l >= PL_regeol) { sayNO; } - s += UTF8SKIP(s); + if ((UTF ? utf8_to_uv((U8*)s, e - s, 0, 0) : *((U8*)s)) != + (c1 ? toLOWER_utf8((U8*)l) : toLOWER_LC_utf8((U8*)l))) + sayNO; + s += UTF ? UTF8SKIP(s) : 1; l += UTF8SKIP(l); } locinput = l; @@ -2201,7 +2272,7 @@ S_regmatch(pTHX_ regnode *prog) case SPACE: if (!nextchr) sayNO; - if (DO_UTF8(PL_reg_sv)) { + if (do_utf8) { if (nextchr & 0x80) { if (!(OP(scan) == SPACE ? swash_fetch(PL_utf8_space, (U8*)locinput) @@ -2231,7 +2302,7 @@ S_regmatch(pTHX_ regnode *prog) case NSPACE: if (!nextchr && locinput >= PL_regeol) sayNO; - if (DO_UTF8(PL_reg_sv)) { + if (do_utf8) { if (OP(scan) == NSPACE ? swash_fetch(PL_utf8_space, (U8*)locinput) : isSPACE_LC_utf8((U8*)locinput)) @@ -2253,7 +2324,7 @@ S_regmatch(pTHX_ regnode *prog) case DIGIT: if (!nextchr) sayNO; - if (DO_UTF8(PL_reg_sv)) { + if (do_utf8) { if (!(OP(scan) == DIGIT ? swash_fetch(PL_utf8_digit, (U8*)locinput) : isDIGIT_LC_utf8((U8*)locinput))) @@ -2275,7 +2346,7 @@ S_regmatch(pTHX_ regnode *prog) case NDIGIT: if (!nextchr && locinput >= PL_regeol) sayNO; - if (DO_UTF8(PL_reg_sv)) { + if (do_utf8) { if (OP(scan) == NDIGIT ? swash_fetch(PL_utf8_digit, (U8*)locinput) : isDIGIT_LC_utf8((U8*)locinput)) @@ -2315,7 +2386,7 @@ S_regmatch(pTHX_ regnode *prog) break; s = PL_bostr + ln; - if (UTF && OP(scan) != REF) { /* REF can do byte comparison */ + if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */ char *l = locinput; char *e = PL_bostr + PL_regendp[n]; /* @@ -2420,7 +2491,6 @@ S_regmatch(pTHX_ regnode *prog) I32 onpar = PL_regnpar; pm.op_pmflags = 0; - pm.op_pmdynflags = (UTF||DO_UTF8(ret) ? PMdf_UTF8 : 0); re = CALLREGCOMP(aTHX_ t, t + len, &pm); if (!(SvFLAGS(ret) & (SVs_TEMP | SVs_PADTMP | SVf_READONLY))) @@ -3035,13 +3105,23 @@ S_regmatch(pTHX_ regnode *prog) * when we know what character comes next. */ if (PL_regkind[(U8)OP(next)] == EXACT) { - c1 = (U8)*STRING(next); - if (OP(next) == EXACTF) - c2 = PL_fold[c1]; - else if (OP(next) == EXACTFL) - c2 = PL_fold_locale[c1]; - else - c2 = c1; + U8 *s = (U8*)STRING(next); + if (!UTF) { + c2 = c1 = *s; + if (OP(next) == EXACTF) + c2 = PL_fold[c1]; + else if (OP(next) == EXACTFL) + c2 = PL_fold_locale[c1]; + } + else { /* UTF */ + if (OP(next) == EXACTF) { + c1 = to_utf8_lower(s); + c2 = to_utf8_upper(s); + } + else { + c2 = c1 = utf8_to_uv_simple(s, NULL); + } + } } else c1 = c2 = -1000; @@ -3054,29 +3134,65 @@ S_regmatch(pTHX_ regnode *prog) locinput = PL_reginput; REGCP_SET(lastcp); if (c1 != -1000) { - char *e = locinput + n - ln; /* Should not check after this */ + char *e; /* Should not check after this */ char *old = locinput; - if (e >= PL_regeol || (n == REG_INFTY)) + if (n == REG_INFTY) { e = PL_regeol - 1; + if (do_utf8) + while (UTF8_IS_CONTINUATION(*(U8*)e)) + e--; + } + else if (do_utf8) { + int m = n - ln; + for (e = locinput; + m >0 && e + UTF8SKIP(e) <= PL_regeol; m--) + e += UTF8SKIP(e); + } + else { + e = locinput + n - ln; + if (e >= PL_regeol) + e = PL_regeol - 1; + } while (1) { + int count; /* Find place 'next' could work */ - if (c1 == c2) { - while (locinput <= e && *locinput != c1) - locinput++; - } else { - while (locinput <= e - && *locinput != c1 - && *locinput != c2) - locinput++; + if (!do_utf8) { + if (c1 == c2) { + while (locinput <= e && *locinput != c1) + locinput++; + } else { + while (locinput <= e + && *locinput != c1 + && *locinput != c2) + locinput++; + } + count = locinput - old; + } + else { + STRLEN len; + if (c1 == c2) { + for (count = 0; + locinput <= e && + utf8_to_uv_simple((U8*)locinput, &len) != c1; + count++) + locinput += len; + + } else { + for (count = 0; locinput <= e; count++) { + UV c = utf8_to_uv_simple((U8*)locinput, &len); + if (c == c1 || c == c2) + break; + locinput += len; + } + } } if (locinput > e) sayNO; /* PL_reginput == old now */ if (locinput != old) { ln = 1; /* Did some */ - if (regrepeat(scan, locinput - old) < - locinput - old) + if (regrepeat(scan, count) < count) sayNO; } /* PL_reginput == locinput now */ @@ -3084,15 +3200,24 @@ S_regmatch(pTHX_ regnode *prog) PL_reginput = locinput; /* Could be reset... */ REGCP_UNWIND(lastcp); /* Couldn't or didn't -- move forward. */ - old = locinput++; + old = locinput; + if (do_utf8) + locinput += UTF8SKIP(locinput); + else + locinput++; } } else while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */ + UV c; + if (c1 != -1000) { + if (do_utf8) + c = utf8_to_uv_simple((U8*)PL_reginput, NULL); + else + c = UCHARAT(PL_reginput); + } /* If it could work, try it. */ - if (c1 == -1000 || - UCHARAT(PL_reginput) == c1 || - UCHARAT(PL_reginput) == c2) + if (c1 == -1000 || c == c1 || c == c2) { TRYPAREN(paren, n, PL_reginput); REGCP_UNWIND(lastcp); @@ -3122,11 +3247,16 @@ S_regmatch(pTHX_ regnode *prog) } REGCP_SET(lastcp); if (paren) { + UV c; while (n >= ln) { + if (c1 != -1000) { + if (do_utf8) + c = utf8_to_uv_simple((U8*)PL_reginput, NULL); + else + c = UCHARAT(PL_reginput); + } /* If it could work, try it. */ - if (c1 == -1000 || - UCHARAT(PL_reginput) == c1 || - UCHARAT(PL_reginput) == c2) + if (c1 == -1000 || c == c1 || c == c2) { TRYPAREN(paren, n, PL_reginput); REGCP_UNWIND(lastcp); @@ -3137,11 +3267,16 @@ S_regmatch(pTHX_ regnode *prog) } } else { + UV c; while (n >= ln) { + if (c1 != -1000) { + if (do_utf8) + c = utf8_to_uv_simple((U8*)PL_reginput, NULL); + else + c = UCHARAT(PL_reginput); + } /* If it could work, try it. */ - if (c1 == -1000 || - UCHARAT(PL_reginput) == c1 || - UCHARAT(PL_reginput) == c2) + if (c1 == -1000 || c == c1 || c == c2) { TRYPAREN(paren, n, PL_reginput); REGCP_UNWIND(lastcp); @@ -3401,9 +3536,9 @@ S_regrepeat(pTHX_ regnode *p, I32 max) loceol = scan + max; switch (OP(p)) { case REG_ANY: - if (DO_UTF8(PL_reg_sv)) { + if (do_utf8) { loceol = PL_regeol; - while (scan < loceol && *scan != '\n') { + while (scan < loceol && hardcount < max && *scan != '\n') { scan += UTF8SKIP(scan); hardcount++; } @@ -3413,9 +3548,9 @@ S_regrepeat(pTHX_ regnode *p, I32 max) } break; case SANY: - if (DO_UTF8(PL_reg_sv)) { + if (do_utf8) { loceol = PL_regeol; - while (scan < loceol) { + while (hardcount < max && scan < loceol) { scan += UTF8SKIP(scan); hardcount++; } @@ -3444,7 +3579,8 @@ S_regrepeat(pTHX_ regnode *p, I32 max) case ANYOF: if (do_utf8) { loceol = PL_regeol; - while (scan < loceol && reginclass(p, (U8*)scan, do_utf8)) { + while (hardcount < max && scan < loceol && + reginclass(p, (U8*)scan, do_utf8)) { scan += UTF8SKIP(scan); hardcount++; } @@ -3454,9 +3590,10 @@ S_regrepeat(pTHX_ regnode *p, I32 max) } break; case ALNUM: - if (DO_UTF8(PL_reg_sv)) { + if (do_utf8) { loceol = PL_regeol; - while (scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan)) { + while (hardcount < max && scan < loceol && + swash_fetch(PL_utf8_alnum, (U8*)scan)) { scan += UTF8SKIP(scan); hardcount++; } @@ -3467,9 +3604,10 @@ S_regrepeat(pTHX_ regnode *p, I32 max) break; case ALNUML: PL_reg_flags |= RF_tainted; - if (DO_UTF8(PL_reg_sv)) { + if (do_utf8) { loceol = PL_regeol; - while (scan < loceol && isALNUM_LC_utf8((U8*)scan)) { + while (hardcount < max && scan < loceol && + isALNUM_LC_utf8((U8*)scan)) { scan += UTF8SKIP(scan); hardcount++; } @@ -3479,9 +3617,10 @@ S_regrepeat(pTHX_ regnode *p, I32 max) } break; case NALNUM: - if (DO_UTF8(PL_reg_sv)) { + if (do_utf8) { loceol = PL_regeol; - while (scan < loceol && !swash_fetch(PL_utf8_alnum, (U8*)scan)) { + while (hardcount < max && scan < loceol && + !swash_fetch(PL_utf8_alnum, (U8*)scan)) { scan += UTF8SKIP(scan); hardcount++; } @@ -3492,9 +3631,10 @@ S_regrepeat(pTHX_ regnode *p, I32 max) break; case NALNUML: PL_reg_flags |= RF_tainted; - if (DO_UTF8(PL_reg_sv)) { + if (do_utf8) { loceol = PL_regeol; - while (scan < loceol && !isALNUM_LC_utf8((U8*)scan)) { + while (hardcount < max && scan < loceol && + !isALNUM_LC_utf8((U8*)scan)) { scan += UTF8SKIP(scan); hardcount++; } @@ -3504,9 +3644,9 @@ S_regrepeat(pTHX_ regnode *p, I32 max) } break; case SPACE: - if (DO_UTF8(PL_reg_sv)) { + if (do_utf8) { loceol = PL_regeol; - while (scan < loceol && + while (hardcount < max && scan < loceol && (*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) { scan += UTF8SKIP(scan); hardcount++; @@ -3518,9 +3658,9 @@ S_regrepeat(pTHX_ regnode *p, I32 max) break; case SPACEL: PL_reg_flags |= RF_tainted; - if (DO_UTF8(PL_reg_sv)) { + if (do_utf8) { loceol = PL_regeol; - while (scan < loceol && + while (hardcount < max && scan < loceol && (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) { scan += UTF8SKIP(scan); hardcount++; @@ -3531,9 +3671,9 @@ S_regrepeat(pTHX_ regnode *p, I32 max) } break; case NSPACE: - if (DO_UTF8(PL_reg_sv)) { + if (do_utf8) { loceol = PL_regeol; - while (scan < loceol && + while (hardcount < max && scan < loceol && !(*scan == ' ' || swash_fetch(PL_utf8_space,(U8*)scan))) { scan += UTF8SKIP(scan); hardcount++; @@ -3545,9 +3685,9 @@ S_regrepeat(pTHX_ regnode *p, I32 max) } case NSPACEL: PL_reg_flags |= RF_tainted; - if (DO_UTF8(PL_reg_sv)) { + if (do_utf8) { loceol = PL_regeol; - while (scan < loceol && + while (hardcount < max && scan < loceol && !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) { scan += UTF8SKIP(scan); hardcount++; @@ -3558,9 +3698,10 @@ S_regrepeat(pTHX_ regnode *p, I32 max) } break; case DIGIT: - if (DO_UTF8(PL_reg_sv)) { + if (do_utf8) { loceol = PL_regeol; - while (scan < loceol && swash_fetch(PL_utf8_digit,(U8*)scan)) { + while (hardcount < max && scan < loceol && + swash_fetch(PL_utf8_digit,(U8*)scan)) { scan += UTF8SKIP(scan); hardcount++; } @@ -3570,9 +3711,10 @@ S_regrepeat(pTHX_ regnode *p, I32 max) } break; case NDIGIT: - if (DO_UTF8(PL_reg_sv)) { + if (do_utf8) { loceol = PL_regeol; - while (scan < loceol && !swash_fetch(PL_utf8_digit,(U8*)scan)) { + while (hardcount < max && scan < loceol && + !swash_fetch(PL_utf8_digit,(U8*)scan)) { scan += UTF8SKIP(scan); hardcount++; } @@ -3623,7 +3765,7 @@ S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp) return 0; start = PL_reginput; - if (UTF) { + if (DO_UTF8(PL_reg_sv)) { while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) { if (!count++) { l = 0; @@ -3701,16 +3843,21 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8) { char flags = ANYOF_FLAGS(n); bool match = FALSE; + UV c; + STRLEN len; + + if (do_utf8) + c = utf8_to_uv_simple(p, &len); + else + c = *p; if (do_utf8 || (flags & ANYOF_UNICODE)) { if (do_utf8 && !ANYOF_RUNTIME(n)) { - STRLEN len; - UV c = utf8_to_uv_simple(p, &len); - if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c)) match = TRUE; } - + if (do_utf8 && flags & ANYOF_UNICODE_ALL && c >= 256) + match = TRUE; if (!match) { SV *sw = regclass_swash(n, TRUE, 0); @@ -3724,17 +3871,15 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8) PL_reg_flags |= RF_tainted; uv_to_utf8(tmpbuf, toLOWER_LC_utf8(p)); } - else - uv_to_utf8(tmpbuf, toLOWER_utf8(p)); + else + uv_to_utf8(tmpbuf, toLOWER_utf8(p)); if (swash_fetch(sw, tmpbuf)) match = TRUE; } } } } - else { - U8 c = *p; - + if (!match && c < 256) { if (ANYOF_BITMAP_TEST(n, c)) match = TRUE; else if (flags & ANYOF_FOLD) { @@ -3796,18 +3941,24 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8) STATIC U8 * S_reghop(pTHX_ U8 *s, I32 off) { + return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)); +} + +STATIC U8 * +S_reghop3(pTHX_ U8 *s, I32 off, U8* lim) +{ if (off >= 0) { - while (off-- && s < (U8*)PL_regeol) { + while (off-- && s < lim) { /* XXX could check well-formedness here */ s += UTF8SKIP(s); } } else { while (off++) { - if (s > (U8*)PL_bostr) { + if (s > lim) { s--; if (UTF8_IS_CONTINUED(*s)) { - while (s > (U8*)PL_bostr && UTF8_IS_CONTINUATION(*s)) + while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s)) s--; } /* XXX could check well-formedness here */ @@ -3818,10 +3969,16 @@ S_reghop(pTHX_ U8 *s, I32 off) } STATIC U8 * -S_reghopmaybe(pTHX_ U8* s, I32 off) +S_reghopmaybe(pTHX_ U8 *s, I32 off) +{ + return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)); +} + +STATIC U8 * +S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim) { if (off >= 0) { - while (off-- && s < (U8*)PL_regeol) { + while (off-- && s < lim) { /* XXX could check well-formedness here */ s += UTF8SKIP(s); } @@ -3830,10 +3987,10 @@ S_reghopmaybe(pTHX_ U8* s, I32 off) } else { while (off++) { - if (s > (U8*)PL_bostr) { + if (s > lim) { s--; if (UTF8_IS_CONTINUED(*s)) { - while (s > (U8*)PL_bostr && UTF8_IS_CONTINUATION(*s)) + while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s)) s--; } /* XXX could check well-formedness here */ @@ -3517,7 +3517,7 @@ Perl_sv_setpvn(pTHX_ register SV *sv, register const char *ptr, register STRLEN Move(ptr,dptr,len,char); dptr[len] = '\0'; SvCUR_set(sv, len); - (void)SvPOK_only(sv); /* validate pointer */ + (void)SvPOK_only_UTF8(sv); /* validate pointer */ SvTAINT(sv); } @@ -3561,7 +3561,7 @@ Perl_sv_setpv(pTHX_ register SV *sv, register const char *ptr) SvGROW(sv, len + 1); Move(ptr,SvPVX(sv),len+1,char); SvCUR_set(sv, len); - (void)SvPOK_only(sv); /* validate pointer */ + (void)SvPOK_only_UTF8(sv); /* validate pointer */ SvTAINT(sv); } @@ -3611,7 +3611,7 @@ Perl_sv_usepvn(pTHX_ register SV *sv, register char *ptr, register STRLEN len) SvCUR_set(sv, len); SvLEN_set(sv, len+1); *SvEND(sv) = '\0'; - (void)SvPOK_only(sv); /* validate pointer */ + (void)SvPOK_only_UTF8(sv); /* validate pointer */ SvTAINT(sv); } @@ -4658,13 +4658,24 @@ Perl_sv_eq(pTHX_ register SV *sv1, register SV *sv2) /* do not utf8ize the comparands as a side-effect */ if (cur1 && cur2 && SvUTF8(sv1) != SvUTF8(sv2) && !IN_BYTE) { + if (PL_hints & HINT_UTF8_DISTINCT) + return FALSE; + if (SvUTF8(sv1)) { - pv2 = (char*)bytes_to_utf8((U8*)pv2, &cur2); - pv2tmp = TRUE; + (void)utf8_to_bytes((U8*)(pv1 = savepvn(pv1, cur1)), &cur1); + if (cur1 < 0) { + Safefree(pv1); + return 0; + } + pv1tmp = TRUE; } else { - pv1 = (char*)bytes_to_utf8((U8*)pv1, &cur1); - pv1tmp = TRUE; + (void)utf8_to_bytes((U8*)(pv2 = savepvn(pv2, cur2)), &cur2); + if (cur2 < 0) { + Safefree(pv2); + return 0; + } + pv2tmp = TRUE; } } @@ -4714,6 +4725,9 @@ Perl_sv_cmp(pTHX_ register SV *sv1, register SV *sv2) /* do not utf8ize the comparands as a side-effect */ if (cur1 && cur2 && SvUTF8(sv1) != SvUTF8(sv2) && !IN_BYTE) { + if (PL_hints & HINT_UTF8_DISTINCT) + return SvUTF8(sv1) ? 1 : -1; + if (SvUTF8(sv1)) { pv2 = (char*)bytes_to_utf8((U8*)pv2, &cur2); pv2tmp = TRUE; @@ -1336,6 +1336,8 @@ S_scan_const(pTHX_ char *start) /* backslashes */ if (*s == '\\' && s+1 < send) { + bool to_be_utf8 = FALSE; + s++; /* some backslashes we leave behind */ @@ -1404,7 +1406,7 @@ S_scan_const(pTHX_ char *start) else { STRLEN len = 1; /* allow underscores */ uv = (UV)scan_hex(s + 1, e - s - 1, &len); - has_utf8 = TRUE; + to_be_utf8 = TRUE; } s = e + 1; } @@ -1421,8 +1423,8 @@ S_scan_const(pTHX_ char *start) * There will always enough room in sv since such escapes will * be longer than any utf8 sequence they can end up as */ - if (uv > 127 || has_utf8) { - if (!this_utf8 && !has_utf8 && uv > 255) { + if (uv > 127) { + if (!has_utf8 && (to_be_utf8 || uv > 255)) { /* might need to recode whatever we have accumulated so far * if it contains any hibit chars */ @@ -1454,7 +1456,7 @@ S_scan_const(pTHX_ char *start) } } - if (has_utf8 || uv > 255) { + if (to_be_utf8 || uv > 255) { d = (char*)uv_to_utf8((U8*)d, uv); has_utf8 = TRUE; } @@ -3120,6 +3122,9 @@ Perl_yylex(pTHX) if (*d == '}') { char minus = (PL_tokenbuf[0] == '-'); s = force_word(s + minus, WORD, FALSE, TRUE, FALSE); + if (UTF && !IN_BYTE && is_utf8_string((U8*)PL_tokenbuf, 0) && + PL_nextval[PL_nexttoke-1].opval) + SvUTF8_on(((SVOP*)PL_nextval[PL_nexttoke-1].opval)->op_sv); if (minus) force_next('-'); } @@ -3774,6 +3779,8 @@ Perl_yylex(pTHX) CLINE; yylval.opval = (OP*)newSVOP(OP_CONST, 0, newSVpv(PL_tokenbuf,0)); yylval.opval->op_private = OPpCONST_BARE; + if (UTF && !IN_BYTE && is_utf8_string((U8*)PL_tokenbuf, len)) + SvUTF8_on(((SVOP*)yylval.opval)->op_sv); TERM(WORD); } @@ -3933,6 +3940,8 @@ Perl_yylex(pTHX) if (*s == '=' && s[1] == '>') { CLINE; sv_setpv(((SVOP*)yylval.opval)->op_sv, PL_tokenbuf); + if (UTF && !IN_BYTE && is_utf8_string((U8*)PL_tokenbuf, len)) + SvUTF8_on(((SVOP*)yylval.opval)->op_sv); TERM(WORD); } @@ -4104,6 +4113,10 @@ Perl_yylex(pTHX) } } #endif +#ifdef PERLIO_LAYERS + if (UTF && !IN_BYTE) + PerlIO_apply_layers(PL_rsfp, NULL, ":utf8"); +#endif PL_rsfp = Nullfp; } goto fake_eof; @@ -4634,6 +4647,7 @@ Perl_yylex(pTHX) int warned = 0; d = SvPV_force(PL_lex_stuff, len); while (len) { + SV *sv; for (; isSPACE(*d) && len; --len, ++d) ; if (len) { char *b = d; @@ -4654,8 +4668,11 @@ Perl_yylex(pTHX) else { for (; !isSPACE(*d) && len; --len, ++d) ; } + sv = newSVpvn(b, d-b); + if (DO_UTF8(PL_lex_stuff)) + SvUTF8_on(sv); words = append_elem(OP_LIST, words, - newSVOP(OP_CONST, 0, tokeq(newSVpvn(b, d-b)))); + newSVOP(OP_CONST, 0, tokeq(sv))); } } if (words) { @@ -6248,7 +6265,9 @@ S_scan_trans(pTHX_ char *start) squash = OPpTRANS_SQUASH; s++; } - o->op_private = del|squash|complement; + o->op_private = del|squash|complement| + (DO_UTF8(PL_lex_stuff)? OPpTRANS_FROM_UTF : 0)| + (DO_UTF8(PL_lex_repl) ? OPpTRANS_TO_UTF : 0); PL_lex_op = o; yylval.ival = OP_TRANS; @@ -6442,6 +6461,8 @@ retval: Renew(SvPVX(tmpstr), SvLEN(tmpstr), char); } SvREFCNT_dec(herewas); + if (UTF && !IN_BYTE && is_utf8_string((U8*)SvPVX(tmpstr), SvCUR(tmpstr))) + SvUTF8_on(tmpstr); PL_lex_stuff = tmpstr; yylval.ival = op_type; return s; @@ -7205,7 +7226,8 @@ vstring: SvREADONLY_on(sv); if (utf8) { SvUTF8_on(sv); - sv_utf8_downgrade(sv, TRUE); + if (!UTF||IN_BYTE) + sv_utf8_downgrade(sv, TRUE); } } } @@ -163,9 +163,13 @@ bool Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len) { U8* x = s; - U8* send = s + len; + U8* send; STRLEN c; + if (!len) + len = strlen(s); + send = s + len; + while (x < send) { c = is_utf8_char(x); if (!c) |