summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-xkeywords.pl1
-rw-r--r--mg.c35
-rw-r--r--pp.c17
-rw-r--r--pp_ctl.c7
-rw-r--r--pp_hot.c2
-rw-r--r--sv.c2
-rw-r--r--t/lib/charnames.t2
-rw-r--r--t/op/length.t3
-rwxr-xr-xt/op/substr.t121
-rwxr-xr-xt/pragma/utf8.t2
-rw-r--r--toke.c30
11 files changed, 175 insertions, 47 deletions
diff --git a/keywords.pl b/keywords.pl
index 46dd53d70e..06ee8f3efc 100755
--- a/keywords.pl
+++ b/keywords.pl
@@ -181,6 +181,7 @@ q
qq
qr
quotemeta
+qu
qw
qx
rand
diff --git a/mg.c b/mg.c
index b5cae86de6..4f183b02c5 100644
--- a/mg.c
+++ b/mg.c
@@ -1404,12 +1404,14 @@ Perl_magic_getsubstr(pTHX_ SV *sv, MAGIC *mg)
I32 offs = LvTARGOFF(sv);
I32 rem = LvTARGLEN(sv);
+ if (SvUTF8(lsv))
+ sv_pos_u2b(lsv, &offs, &rem);
if (offs > len)
offs = len;
if (rem + offs > len)
rem = len - offs;
sv_setpvn(sv, tmps + offs, (STRLEN)rem);
- if (DO_UTF8(lsv))
+ if (SvUTF8(lsv))
SvUTF8_on(sv);
return 0;
}
@@ -1417,25 +1419,26 @@ Perl_magic_getsubstr(pTHX_ SV *sv, MAGIC *mg)
int
Perl_magic_setsubstr(pTHX_ SV *sv, MAGIC *mg)
{
- STRLEN littlelen;
- char *tmps = SvPV(sv, littlelen);
+ STRLEN len;
+ char *tmps = SvPV(sv, len);
+ SV *lsv = LvTARG(sv);
+ I32 lvoff = LvTARGOFF(sv);
+ I32 lvlen = LvTARGLEN(sv);
if (DO_UTF8(sv)) {
- I32 bigoff = LvTARGOFF(sv);
- I32 biglen = LvTARGLEN(sv);
- U8 *s, *a, *b;
-
- sv_utf8_upgrade(LvTARG(sv));
- /* sv_utf8_upgrade() might have moved and/or resized
- * the string to be replaced, we must rediscover it. --jhi */
- s = (U8*)SvPVX(LvTARG(sv));
- a = utf8_hop(s, bigoff);
- b = utf8_hop(a, biglen);
- sv_insert(LvTARG(sv), a - s, b - a, tmps, littlelen);
- SvUTF8_on(LvTARG(sv));
+ sv_utf8_upgrade(lsv);
+ sv_pos_u2b(lsv, &lvoff, &lvlen);
+ sv_insert(lsv, lvoff, lvlen, tmps, len);
+ SvUTF8_on(lsv);
+ }
+ else if (SvUTF8(lsv)) {
+ sv_pos_u2b(lsv, &lvoff, &lvlen);
+ tmps = bytes_to_utf8(tmps, &len);
+ sv_insert(lsv, lvoff, lvlen, tmps, len);
+ Safefree(tmps);
}
else
- sv_insert(LvTARG(sv), LvTARGOFF(sv), LvTARGLEN(sv), tmps, littlelen);
+ sv_insert(lsv, lvoff, lvlen, tmps, len);
return 0;
}
diff --git a/pp.c b/pp.c
index ba6c17a773..87e459e169 100644
--- a/pp.c
+++ b/pp.c
@@ -2792,6 +2792,8 @@ PP(pp_substr)
RETPUSHUNDEF;
}
else {
+ I32 upos = pos;
+ I32 urem = rem;
if (utfcurlen)
sv_pos_u2b(sv, &pos, &rem);
tmps += pos;
@@ -2826,8 +2828,8 @@ PP(pp_substr)
SvREFCNT_dec(LvTARG(TARG));
LvTARG(TARG) = SvREFCNT_inc(sv);
}
- LvTARGOFF(TARG) = pos;
- LvTARGLEN(TARG) = rem;
+ LvTARGOFF(TARG) = upos;
+ LvTARGLEN(TARG) = urem;
}
}
SPAGAIN;
@@ -2970,11 +2972,9 @@ PP(pp_chr)
(void)SvUPGRADE(TARG,SVt_PV);
- if ((value > 255 && !IN_BYTE) ||
- (UTF8_IS_CONTINUED(value) && (PL_hints & HINT_UTF8)) ) {
- SvGROW(TARG, UTF8_MAXLEN+1);
- tmps = SvPVX(TARG);
- tmps = (char*)uv_to_utf8((U8*)tmps, (UV)value);
+ if (value > 255 && !IN_BYTE) {
+ SvGROW(TARG, UNISKIP(value)+1);
+ tmps = (char*)uv_to_utf8((U8*)SvPVX(TARG), value);
SvCUR_set(TARG, tmps - SvPVX(TARG));
*tmps = '\0';
(void)SvPOK_only(TARG);
@@ -2982,9 +2982,6 @@ PP(pp_chr)
XPUSHs(TARG);
RETURN;
}
- else {
- SvUTF8_off(TARG);
- }
SvGROW(TARG,2);
SvCUR_set(TARG, 1);
diff --git a/pp_ctl.c b/pp_ctl.c
index 07545dc28a..5490221d0b 100644
--- a/pp_ctl.c
+++ b/pp_ctl.c
@@ -116,9 +116,14 @@ PP(pp_regcomp)
pm->op_pmflags = pm->op_pmpermflags; /* reset case sensitivity */
if (DO_UTF8(tmpstr))
pm->op_pmdynflags |= PMdf_DYN_UTF8;
- else
+ else {
pm->op_pmdynflags &= ~PMdf_DYN_UTF8;
+ if (pm->op_pmdynflags & PMdf_UTF8)
+ t = bytes_to_utf8(t, &len);
+ }
pm->op_pmregexp = CALLREGCOMP(aTHX_ t, t + len, pm);
+ if (!DO_UTF8(tmpstr) && (pm->op_pmdynflags & PMdf_UTF8))
+ Safefree(t);
PL_reginterp_cnt = 0; /* XXXX Be extra paranoid - needed
inside tie/overload accessors. */
}
diff --git a/pp_hot.c b/pp_hot.c
index 3a1e08daaf..0f1fee980a 100644
--- a/pp_hot.c
+++ b/pp_hot.c
@@ -76,7 +76,7 @@ PP(pp_stringify)
char *s;
s = SvPV(TOPs,len);
sv_setpvn(TARG,s,len);
- if (SvUTF8(TOPs) && !IN_BYTE)
+ if (SvUTF8(TOPs))
SvUTF8_on(TARG);
else
SvUTF8_off(TARG);
diff --git a/sv.c b/sv.c
index 58c6434ad6..341792412b 100644
--- a/sv.c
+++ b/sv.c
@@ -3440,7 +3440,7 @@ Perl_sv_setsv(pTHX_ SV *dstr, register SV *sstr)
*SvEND(dstr) = '\0';
(void)SvPOK_only(dstr);
}
- if ((sflags & SVf_UTF8) && !IN_BYTE)
+ if (sflags & SVf_UTF8)
SvUTF8_on(dstr);
/*SUPPRESS 560*/
if (sflags & SVp_NOK) {
diff --git a/t/lib/charnames.t b/t/lib/charnames.t
index 1d08ad0880..14da2e0f7b 100644
--- a/t/lib/charnames.t
+++ b/t/lib/charnames.t
@@ -45,7 +45,7 @@ $encoded_bet = "\327\221";
sub to_bytes {
use bytes;
- my $bytes = shift;
+ "".shift;
}
{
diff --git a/t/op/length.t b/t/op/length.t
index aec6a52871..46f0c59698 100644
--- a/t/op/length.t
+++ b/t/op/length.t
@@ -33,8 +33,7 @@ print "ok 3\n";
}
{
- use utf8; # make "\x{80}" to produce UTF-8
- my $a = "\x{80}";
+ my $a = qu"\x{80}"; # make "\x{80}" to produce UTF-8
print "not " unless length($a) == 1;
print "ok 6\n";
diff --git a/t/op/substr.t b/t/op/substr.t
index d3937fb107..12bcd00b33 100755
--- a/t/op/substr.t
+++ b/t/op/substr.t
@@ -1,6 +1,6 @@
#!./perl
-print "1..149\n";
+print "1..162\n";
#P = start of string Q = start of substr R = end of substr S = end of string
@@ -429,3 +429,122 @@ ok 149, length($x) == 5 &&
substr($x, 3, 1) eq "\x{FF}" &&
substr($x, 4, 1) eq "\x{F3}";
+# And tests for already-UTF8 one
+
+$x = "\x{101}\x{F2}\x{F3}";
+substr($x, 0, 1) = "\x{100}";
+ok 150, length($x) == 3 &&
+ $x eq "\x{100}\xF2\xF3" &&
+ substr($x, 0, 1) eq "\x{100}" &&
+ substr($x, 1, 1) eq "\x{F2}" &&
+ substr($x, 2, 1) eq "\x{F3}";
+
+$x = "\x{101}\x{F2}\x{F3}";
+substr($x, 0, 1) = "\x{100}\x{FF}";
+ok 151, length($x) == 4 &&
+ $x eq "\x{100}\x{FF}\xF2\xF3" &&
+ substr($x, 0, 1) eq "\x{100}" &&
+ substr($x, 1, 1) eq "\x{FF}" &&
+ substr($x, 2, 1) eq "\x{F2}" &&
+ substr($x, 3, 1) eq "\x{F3}";
+
+$x = "\x{101}\x{F2}\x{F3}";
+substr($x, 0, 2) = "\x{100}\xFF";
+ok 152, length($x) == 3 &&
+ $x eq "\x{100}\xFF\xF3" &&
+ substr($x, 0, 1) eq "\x{100}" &&
+ substr($x, 1, 1) eq "\x{FF}" &&
+ substr($x, 2, 1) eq "\x{F3}";
+
+$x = "\x{101}\x{F2}\x{F3}";
+substr($x, 1, 1) = "\x{100}\xFF";
+ok 153, length($x) == 4 &&
+ $x eq "\x{101}\x{100}\xFF\xF3" &&
+ substr($x, 0, 1) eq "\x{101}" &&
+ substr($x, 1, 1) eq "\x{100}" &&
+ substr($x, 2, 1) eq "\x{FF}" &&
+ substr($x, 3, 1) eq "\x{F3}";
+
+$x = "\x{101}\x{F2}\x{F3}";
+substr($x, 2, 1) = "\x{100}\xFF";
+ok 154, length($x) == 4 &&
+ $x eq "\x{101}\xF2\x{100}\xFF" &&
+ substr($x, 0, 1) eq "\x{101}" &&
+ substr($x, 1, 1) eq "\x{F2}" &&
+ substr($x, 2, 1) eq "\x{100}" &&
+ substr($x, 3, 1) eq "\x{FF}";
+
+$x = "\x{101}\x{F2}\x{F3}";
+substr($x, 3, 1) = "\x{100}\xFF";
+ok 155, length($x) == 5 &&
+ $x eq "\x{101}\x{F2}\x{F3}\x{100}\xFF" &&
+ substr($x, 0, 1) eq "\x{101}" &&
+ substr($x, 1, 1) eq "\x{F2}" &&
+ substr($x, 2, 1) eq "\x{F3}" &&
+ substr($x, 3, 1) eq "\x{100}" &&
+ substr($x, 4, 1) eq "\x{FF}";
+
+$x = "\x{101}\x{F2}\x{F3}";
+substr($x, -1, 1) = "\x{100}\xFF";
+ok 156, length($x) == 4 &&
+ $x eq "\x{101}\xF2\x{100}\xFF" &&
+ substr($x, 0, 1) eq "\x{101}" &&
+ substr($x, 1, 1) eq "\x{F2}" &&
+ substr($x, 2, 1) eq "\x{100}" &&
+ substr($x, 3, 1) eq "\x{FF}";
+
+$x = "\x{101}\x{F2}\x{F3}";
+substr($x, -1, 0) = "\x{100}\xFF";
+ok 157, length($x) == 5 &&
+ $x eq "\x{101}\xF2\x{100}\xFF\xF3" &&
+ substr($x, 0, 1) eq "\x{101}" &&
+ substr($x, 1, 1) eq "\x{F2}" &&
+ substr($x, 2, 1) eq "\x{100}" &&
+ substr($x, 3, 1) eq "\x{FF}" &&
+ substr($x, 4, 1) eq "\x{F3}";
+
+$x = "\x{101}\x{F2}\x{F3}";
+substr($x, 0, -1) = "\x{100}\xFF";
+ok 158, length($x) == 3 &&
+ $x eq "\x{100}\xFF\xF3" &&
+ substr($x, 0, 1) eq "\x{100}" &&
+ substr($x, 1, 1) eq "\x{FF}" &&
+ substr($x, 2, 1) eq "\x{F3}";
+
+$x = "\x{101}\x{F2}\x{F3}";
+substr($x, 0, -2) = "\x{100}\xFF";
+ok 159, length($x) == 4 &&
+ $x eq "\x{100}\xFF\xF2\xF3" &&
+ substr($x, 0, 1) eq "\x{100}" &&
+ substr($x, 1, 1) eq "\x{FF}" &&
+ substr($x, 2, 1) eq "\x{F2}" &&
+ substr($x, 3, 1) eq "\x{F3}";
+
+$x = "\x{101}\x{F2}\x{F3}";
+substr($x, 0, -3) = "\x{100}\xFF";
+ok 160, length($x) == 5 &&
+ $x eq "\x{100}\xFF\x{101}\x{F2}\x{F3}" &&
+ substr($x, 0, 1) eq "\x{100}" &&
+ substr($x, 1, 1) eq "\x{FF}" &&
+ substr($x, 2, 1) eq "\x{101}" &&
+ substr($x, 3, 1) eq "\x{F2}" &&
+ substr($x, 4, 1) eq "\x{F3}";
+
+$x = "\x{101}\x{F2}\x{F3}";
+substr($x, 1, -1) = "\x{100}\xFF";
+ok 161, length($x) == 4 &&
+ $x eq "\x{101}\x{100}\xFF\xF3" &&
+ substr($x, 0, 1) eq "\x{101}" &&
+ substr($x, 1, 1) eq "\x{100}" &&
+ substr($x, 2, 1) eq "\x{FF}" &&
+ substr($x, 3, 1) eq "\x{F3}";
+
+$x = "\x{101}\x{F2}\x{F3}";
+substr($x, -1, -1) = "\x{100}\xFF";
+ok 162, length($x) == 5 &&
+ $x eq "\x{101}\xF2\x{100}\xFF\xF3" &&
+ substr($x, 0, 1) eq "\x{101}" &&
+ substr($x, 1, 1) eq "\x{F2}" &&
+ substr($x, 2, 1) eq "\x{100}" &&
+ substr($x, 3, 1) eq "\x{FF}" &&
+ substr($x, 4, 1) eq "\x{F3}";
diff --git a/t/pragma/utf8.t b/t/pragma/utf8.t
index e3699794bb..546b217f27 100755
--- a/t/pragma/utf8.t
+++ b/t/pragma/utf8.t
@@ -283,7 +283,7 @@ sub nok_bytes {
{
use utf8;
- ok_bytes chr(0xe2), pack("C*", 0xc3, 0xa2);
+ ok_bytes chr(0x1e2), pack("C*", 0xc7, 0xa2);
$test++; # 65
}
diff --git a/toke.c b/toke.c
index ea32115abd..398253c8a5 100644
--- a/toke.c
+++ b/toke.c
@@ -1045,8 +1045,11 @@ STATIC I32
S_sublex_done(pTHX)
{
if (!PL_lex_starts++) {
+ SV *sv = newSVpvn("",0);
+ if (SvUTF8(PL_linestr))
+ SvUTF8_on(sv);
PL_expect = XOPERATOR;
- yylval.opval = (OP*)newSVOP(OP_CONST, 0, newSVpvn("",0));
+ yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
return THING;
}
@@ -1173,7 +1176,8 @@ S_scan_const(pTHX_ char *start)
register char *d = SvPVX(sv); /* destination for copies */
bool dorange = FALSE; /* are we in a translit range? */
bool didrange = FALSE; /* did we just finish a range? */
- bool has_utf8 = FALSE; /* embedded \x{} */
+ bool has_utf8 = (PL_linestr && SvUTF8(PL_linestr));
+ /* the constant is UTF8 */
UV uv;
I32 utf = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op)
@@ -1313,8 +1317,6 @@ S_scan_const(pTHX_ char *start)
/* backslashes */
if (*s == '\\' && s+1 < send) {
- bool to_be_utf8 = FALSE;
-
s++;
/* some backslashes we leave behind */
@@ -1383,8 +1385,6 @@ S_scan_const(pTHX_ char *start)
else {
STRLEN len = 1; /* allow underscores */
uv = (UV)scan_hex(s + 1, e - s - 1, &len);
- if (PL_hints & HINT_UTF8)
- to_be_utf8 = TRUE;
}
s = e + 1;
}
@@ -1408,7 +1408,7 @@ S_scan_const(pTHX_ char *start)
* repertoire. --jhi */
if (uv > 127) {
- if (!has_utf8 && (to_be_utf8 || uv > 255)) {
+ if (!has_utf8 && uv > 255) {
/* Might need to recode whatever we have
* accumulated so far if it contains any
* hibit chars.
@@ -1447,7 +1447,7 @@ S_scan_const(pTHX_ char *start)
}
}
- if (to_be_utf8 || has_utf8 || uv > 255) {
+ if (has_utf8 || uv > 255) {
d = (char*)uv_to_utf8((U8*)d, uv);
has_utf8 = TRUE;
}
@@ -4711,7 +4711,10 @@ Perl_yylex(pTHX)
TOKEN('(');
case KEY_qq:
+ case KEY_qu:
s = scan_str(s,FALSE,FALSE);
+ if (tmp == KEY_qu && is_utf8_string((U8*)s, SvCUR(PL_lex_stuff)))
+ SvUTF8_on(PL_lex_stuff);
if (!s)
missingterm((char*)0);
yylval.ival = OP_STRINGIFY;
@@ -5548,6 +5551,7 @@ Perl_keyword(pTHX_ register char *d, I32 len)
if (strEQ(d,"q")) return KEY_q;
if (strEQ(d,"qr")) return KEY_qr;
if (strEQ(d,"qq")) return KEY_qq;
+ if (strEQ(d,"qu")) return KEY_qu;
if (strEQ(d,"qw")) return KEY_qw;
if (strEQ(d,"qx")) return KEY_qx;
}
@@ -7204,10 +7208,9 @@ vstring:
while (isDIGIT(*pos) || *pos == '_')
pos++;
if (!isALPHA(*pos)) {
- UV rev;
+ UV rev, revmax = 0;
U8 tmpbuf[UTF8_MAXLEN+1];
U8 *tmpend;
- bool utf8 = FALSE;
s++; /* get past 'v' */
sv = NEWSV(92,5);
@@ -7234,7 +7237,8 @@ vstring:
}
}
tmpend = uv_to_utf8(tmpbuf, rev);
- utf8 = utf8 || rev > 127;
+ if (rev > revmax)
+ revmax = rev;
sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf);
if (*pos == '.' && isDIGIT(pos[1]))
s = ++pos;
@@ -7248,9 +7252,9 @@ vstring:
SvPOK_on(sv);
SvREADONLY_on(sv);
- if (utf8) {
+ if (revmax > 127) {
SvUTF8_on(sv);
- if (!UTF||IN_BYTE)
+ if (revmax < 256)
sv_utf8_downgrade(sv, TRUE);
}
}