summaryrefslogtreecommitdiff
path: root/toke.c
diff options
context:
space:
mode:
authorJarkko Hietaniemi <jhi@iki.fi>2001-01-15 05:02:24 +0000
committerJarkko Hietaniemi <jhi@iki.fi>2001-01-15 05:02:24 +0000
commit9aa983d27b0af31badfcbbb76567f6e557076b41 (patch)
treea3290ebe9e4a9773e967a8beb2895428f7e717c7 /toke.c
parent78d8f6e05211de1a60b4bb9b795b8ff72f179ebe (diff)
downloadperl-9aa983d27b0af31badfcbbb76567f6e557076b41.tar.gz
More UTF-8 patches from Inaba Hiroto.
- The substr lval was still not okay. - Now pp_stringify and sv_setsv copies source's UTF8 flag even if IN_BYTE. pp_stringify is called from fold_constants at optimization phase and "\x{100}" was made SvUTF8_off under use bytes (the bytes pragma is for "byte semantics" and not for "do not produce UTF8 data") - New `qu' operator to generate UTF8 string explicitly. Though I agree with the policy "0x00-0xff always produce bytes", sometimes want to such a string to be coded in UTF8. I can use pack"U0a*" but it requires more typing and has runtime overhead. - Fix pp_regcomp bug uncovered by "0x00-0xff always produce bytes" change, the bug appears if a pm has PMdf_UTF8 flag but interpolated string is not UTF8_on and has char 0x80-0xff. TODO: document and test qu. p4raw-id: //depot/perl@8439
Diffstat (limited to 'toke.c')
-rw-r--r--toke.c30
1 files changed, 17 insertions, 13 deletions
diff --git a/toke.c b/toke.c
index ea32115abd..398253c8a5 100644
--- a/toke.c
+++ b/toke.c
@@ -1045,8 +1045,11 @@ STATIC I32
S_sublex_done(pTHX)
{
if (!PL_lex_starts++) {
+ SV *sv = newSVpvn("",0);
+ if (SvUTF8(PL_linestr))
+ SvUTF8_on(sv);
PL_expect = XOPERATOR;
- yylval.opval = (OP*)newSVOP(OP_CONST, 0, newSVpvn("",0));
+ yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
return THING;
}
@@ -1173,7 +1176,8 @@ S_scan_const(pTHX_ char *start)
register char *d = SvPVX(sv); /* destination for copies */
bool dorange = FALSE; /* are we in a translit range? */
bool didrange = FALSE; /* did we just finish a range? */
- bool has_utf8 = FALSE; /* embedded \x{} */
+ bool has_utf8 = (PL_linestr && SvUTF8(PL_linestr));
+ /* the constant is UTF8 */
UV uv;
I32 utf = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op)
@@ -1313,8 +1317,6 @@ S_scan_const(pTHX_ char *start)
/* backslashes */
if (*s == '\\' && s+1 < send) {
- bool to_be_utf8 = FALSE;
-
s++;
/* some backslashes we leave behind */
@@ -1383,8 +1385,6 @@ S_scan_const(pTHX_ char *start)
else {
STRLEN len = 1; /* allow underscores */
uv = (UV)scan_hex(s + 1, e - s - 1, &len);
- if (PL_hints & HINT_UTF8)
- to_be_utf8 = TRUE;
}
s = e + 1;
}
@@ -1408,7 +1408,7 @@ S_scan_const(pTHX_ char *start)
* repertoire. --jhi */
if (uv > 127) {
- if (!has_utf8 && (to_be_utf8 || uv > 255)) {
+ if (!has_utf8 && uv > 255) {
/* Might need to recode whatever we have
* accumulated so far if it contains any
* hibit chars.
@@ -1447,7 +1447,7 @@ S_scan_const(pTHX_ char *start)
}
}
- if (to_be_utf8 || has_utf8 || uv > 255) {
+ if (has_utf8 || uv > 255) {
d = (char*)uv_to_utf8((U8*)d, uv);
has_utf8 = TRUE;
}
@@ -4711,7 +4711,10 @@ Perl_yylex(pTHX)
TOKEN('(');
case KEY_qq:
+ case KEY_qu:
s = scan_str(s,FALSE,FALSE);
+ if (tmp == KEY_qu && is_utf8_string((U8*)s, SvCUR(PL_lex_stuff)))
+ SvUTF8_on(PL_lex_stuff);
if (!s)
missingterm((char*)0);
yylval.ival = OP_STRINGIFY;
@@ -5548,6 +5551,7 @@ Perl_keyword(pTHX_ register char *d, I32 len)
if (strEQ(d,"q")) return KEY_q;
if (strEQ(d,"qr")) return KEY_qr;
if (strEQ(d,"qq")) return KEY_qq;
+ if (strEQ(d,"qu")) return KEY_qu;
if (strEQ(d,"qw")) return KEY_qw;
if (strEQ(d,"qx")) return KEY_qx;
}
@@ -7204,10 +7208,9 @@ vstring:
while (isDIGIT(*pos) || *pos == '_')
pos++;
if (!isALPHA(*pos)) {
- UV rev;
+ UV rev, revmax = 0;
U8 tmpbuf[UTF8_MAXLEN+1];
U8 *tmpend;
- bool utf8 = FALSE;
s++; /* get past 'v' */
sv = NEWSV(92,5);
@@ -7234,7 +7237,8 @@ vstring:
}
}
tmpend = uv_to_utf8(tmpbuf, rev);
- utf8 = utf8 || rev > 127;
+ if (rev > revmax)
+ revmax = rev;
sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf);
if (*pos == '.' && isDIGIT(pos[1]))
s = ++pos;
@@ -7248,9 +7252,9 @@ vstring:
SvPOK_on(sv);
SvREADONLY_on(sv);
- if (utf8) {
+ if (revmax > 127) {
SvUTF8_on(sv);
- if (!UTF||IN_BYTE)
+ if (revmax < 256)
sv_utf8_downgrade(sv, TRUE);
}
}