summaryrefslogtreecommitdiff
path: root/toke.c
diff options
context:
space:
mode:
authorGurusamy Sarathy <gsar@cpan.org>2000-01-31 04:57:42 +0000
committerGurusamy Sarathy <gsar@cpan.org>2000-01-31 04:57:42 +0000
commit7e2040f0b7c6fc88ec07b6e169aa2f75fc0130a4 (patch)
treede43e349e9f70e27ef30b2a0de9de2df628cc1c3 /toke.c
parent8004f2ac219abdd8660c02a4a46ed97695dc379d (diff)
downloadperl-7e2040f0b7c6fc88ec07b6e169aa2f75fc0130a4.tar.gz
runtime now looks at the SVf_UTF8 bit on the SV to decide
whether to use widechar semantics; lexer and RE engine continue to need "use utf8" to enable unicode awareness in literals and patterns (TODO: this needs to be fixed); $1 et al are marked SvUTF8 if the pattern was compiled for utf8 (TODO: propagating it from the data is probably better) p4raw-id: //depot/perl@4930
Diffstat (limited to 'toke.c')
-rw-r--r--toke.c100
1 files changed, 49 insertions, 51 deletions
diff --git a/toke.c b/toke.c
index e7e217473e..cc370bc245 100644
--- a/toke.c
+++ b/toke.c
@@ -32,19 +32,8 @@ static void restore_rsfp(pTHXo_ void *f);
#define XFAKEBRACK 128
#define XENUMMASK 127
+/*#define UTF (SvUTF8(PL_linestr) && !(PL_hints & HINT_BYTE))*/
#define UTF (PL_hints & HINT_UTF8)
-/*
- * Note: we try to be careful never to call the isXXX_utf8() functions
- * unless we're pretty sure we've seen the beginning of a UTF-8 character
- * (that is, the two high bits are set). Otherwise we risk loading in the
- * heavy-duty SWASHINIT and SWASHGET routines unnecessarily.
- */
-#define isIDFIRST_lazy(p) ((!UTF || (*((U8*)p) < 0xc0)) \
- ? isIDFIRST(*(p)) \
- : isIDFIRST_utf8((U8*)p))
-#define isALNUM_lazy(p) ((!UTF || (*((U8*)p) < 0xc0)) \
- ? isALNUM(*(p)) \
- : isALNUM_utf8((U8*)p))
/* In variables name $^X, these are the legal values for X.
* 1999-02-27 mjd-perl-patch@plover.com */
@@ -223,9 +212,9 @@ S_no_op(pTHX_ char *what, char *s)
yywarn(Perl_form(aTHX_ "%s found where operator expected", what));
if (is_first)
Perl_warn(aTHX_ "\t(Missing semicolon on previous line?)\n");
- else if (PL_oldoldbufptr && isIDFIRST_lazy(PL_oldoldbufptr)) {
+ else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
char *t;
- for (t = PL_oldoldbufptr; *t && (isALNUM_lazy(t) || *t == ':'); t++) ;
+ for (t = PL_oldoldbufptr; *t && (isALNUM_lazy_if(t,UTF) || *t == ':'); t++) ;
if (t < PL_bufptr && isSPACE(*t))
Perl_warn(aTHX_ "\t(Do you need to predeclare %.*s?)\n",
t - PL_oldoldbufptr, PL_oldoldbufptr);
@@ -633,7 +622,7 @@ S_check_uni(pTHX)
return;
while (isSPACE(*PL_last_uni))
PL_last_uni++;
- for (s = PL_last_uni; isALNUM_lazy(s) || *s == '-'; s++) ;
+ for (s = PL_last_uni; isALNUM_lazy_if(s,UTF) || *s == '-'; s++) ;
if ((t = strchr(s, '(')) && t < PL_bufptr)
return;
if (ckWARN_d(WARN_AMBIGUOUS)){
@@ -756,7 +745,7 @@ S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow
start = skipspace(start);
s = start;
- if (isIDFIRST_lazy(s) ||
+ if (isIDFIRST_lazy_if(s,UTF) ||
(allow_pack && *s == ':') ||
(allow_initial_tick && *s == '\'') )
{
@@ -1159,6 +1148,7 @@ S_scan_const(pTHX_ char *start)
register char *s = start; /* start of the constant */
register char *d = SvPVX(sv); /* destination for copies */
bool dorange = FALSE; /* are we in a translit range? */
+ bool has_utf = FALSE; /* embedded \x{} */
I32 len; /* ? */
I32 utf = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op)
? (PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF))
@@ -1264,7 +1254,8 @@ S_scan_const(pTHX_ char *start)
}
/* check for embedded arrays (@foo, @:foo, @'foo, @{foo}, @$foo) */
- else if (*s == '@' && s[1] && (isALNUM_lazy(s+1) || strchr(":'{$", s[1])))
+ else if (*s == '@' && s[1]
+ && (isALNUM_lazy_if(s+1,UTF) || strchr(":'{$", s[1])))
break;
/* check for embedded scalars. only stop if we're sure it's a
@@ -1368,6 +1359,7 @@ S_scan_const(pTHX_ char *start)
d = (char*)uv_to_utf8((U8*)d,
(UV)scan_hex(s + 1, e - s - 1, &len));
s = e + 1;
+ has_utf = TRUE;
}
else {
UV uv = (UV)scan_hex(s, 2, &len);
@@ -1375,6 +1367,7 @@ S_scan_const(pTHX_ char *start)
utf != (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF))
{
d = (char*)uv_to_utf8((U8*)d, uv); /* doing a CU or UC */
+ has_utf = TRUE;
}
else {
if (uv >= 127 && UTF) {
@@ -1485,6 +1478,8 @@ S_scan_const(pTHX_ char *start)
*d = '\0';
SvCUR_set(sv, d - SvPVX(sv));
SvPOK_on(sv);
+ if (has_utf)
+ SvUTF8_on(sv);
/* shrink the sv if we allocated more than we used */
if (SvCUR(sv) + 5 < SvLEN(sv)) {
@@ -1593,7 +1588,7 @@ S_intuit_more(pTHX_ register char *s)
case '&':
case '$':
weight -= seen[un_char] * 10;
- if (isALNUM_lazy(s+1)) {
+ if (isALNUM_lazy_if(s+1,UTF)) {
scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE);
if ((int)strlen(tmpbuf) > 1 && gv_fetchpv(tmpbuf,FALSE, SVt_PV))
weight -= 100;
@@ -2285,7 +2280,7 @@ Perl_yylex(pTHX)
retry:
switch (*s) {
default:
- if (isIDFIRST_lazy(s))
+ if (isIDFIRST_lazy_if(s,UTF))
goto keylookup;
Perl_croak(aTHX_ "Unrecognized character \\x%02X", *s & 255);
case 4:
@@ -2653,7 +2648,7 @@ Perl_yylex(pTHX)
else if (*s == '>') {
s++;
s = skipspace(s);
- if (isIDFIRST_lazy(s)) {
+ if (isIDFIRST_lazy_if(s,UTF)) {
s = force_word(s,METHOD,FALSE,TRUE,FALSE);
TOKEN(ARROW);
}
@@ -2749,7 +2744,7 @@ Perl_yylex(pTHX)
grabattrs:
s = skipspace(s);
attrs = Nullop;
- while (isIDFIRST_lazy(s)) {
+ while (isIDFIRST_lazy_if(s,UTF)) {
d = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE, &len);
if (isLOWER(*s) && (tmp = keyword(PL_tokenbuf, len))) {
if (tmp < 0) tmp = -tmp;
@@ -2894,7 +2889,7 @@ Perl_yylex(pTHX)
while (d < PL_bufend && (*d == ' ' || *d == '\t'))
d++;
}
- if (d < PL_bufend && isIDFIRST_lazy(d)) {
+ if (d < PL_bufend && isIDFIRST_lazy_if(d,UTF)) {
d = scan_word(d, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1,
FALSE, &len);
while (d < PL_bufend && (*d == ' ' || *d == '\t'))
@@ -2985,9 +2980,9 @@ Perl_yylex(pTHX)
}
t++;
}
- else if (isALNUM_lazy(t)) {
+ else if (isALNUM_lazy_if(t,UTF)) {
t += UTF8SKIP(t);
- while (t < PL_bufend && isALNUM_lazy(t))
+ while (t < PL_bufend && isALNUM_lazy_if(t,UTF))
t += UTF8SKIP(t);
}
while (t < PL_bufend && isSPACE(*t))
@@ -3047,7 +3042,9 @@ Perl_yylex(pTHX)
AOPERATOR(ANDAND);
s--;
if (PL_expect == XOPERATOR) {
- if (ckWARN(WARN_SEMICOLON) && isIDFIRST_lazy(s) && PL_bufptr == PL_linestart) {
+ if (ckWARN(WARN_SEMICOLON)
+ && isIDFIRST_lazy_if(s,UTF) && PL_bufptr == PL_linestart)
+ {
CopLINE_dec(PL_curcop);
Perl_warner(aTHX_ WARN_SEMICOLON, PL_warn_nosemi);
CopLINE_inc(PL_curcop);
@@ -3177,7 +3174,7 @@ Perl_yylex(pTHX)
}
}
- if (s[1] == '#' && (isIDFIRST_lazy(s+2) || strchr("{$:+-", s[2]))) {
+ if (s[1] == '#' && (isIDFIRST_lazy_if(s+2,UTF) || strchr("{$:+-", s[2]))) {
PL_tokenbuf[0] = '@';
s = scan_ident(s + 1, PL_bufend, PL_tokenbuf + 1,
sizeof PL_tokenbuf - 1, FALSE);
@@ -3220,7 +3217,7 @@ Perl_yylex(pTHX)
PL_tokenbuf[0] = '@';
if (ckWARN(WARN_SYNTAX)) {
for(t = s + 1;
- isSPACE(*t) || isALNUM_lazy(t) || *t == '$';
+ isSPACE(*t) || isALNUM_lazy_if(t,UTF) || *t == '$';
t++) ;
if (*t++ == ',') {
PL_bufptr = skipspace(PL_bufptr);
@@ -3240,7 +3237,7 @@ Perl_yylex(pTHX)
char tmpbuf[sizeof PL_tokenbuf];
STRLEN len;
for (t++; isSPACE(*t); t++) ;
- if (isIDFIRST_lazy(t)) {
+ if (isIDFIRST_lazy_if(t,UTF)) {
t = scan_word(t, tmpbuf, sizeof tmpbuf, TRUE, &len);
for (; isSPACE(*t); t++) ;
if (*t == ';' && get_cv(tmpbuf, FALSE))
@@ -3258,9 +3255,9 @@ Perl_yylex(pTHX)
PL_expect = XOPERATOR;
else if (strchr("$@\"'`q", *s))
PL_expect = XTERM; /* e.g. print $fh "foo" */
- else if (strchr("&*<%", *s) && isIDFIRST_lazy(s+1))
+ else if (strchr("&*<%", *s) && isIDFIRST_lazy_if(s+1,UTF))
PL_expect = XTERM; /* e.g. print $fh &sub */
- else if (isIDFIRST_lazy(s)) {
+ else if (isIDFIRST_lazy_if(s,UTF)) {
char tmpbuf[sizeof PL_tokenbuf];
scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
if (tmp = keyword(tmpbuf, len)) {
@@ -3318,7 +3315,7 @@ Perl_yylex(pTHX)
if (ckWARN(WARN_SYNTAX)) {
if (*s == '[' || *s == '{') {
char *t = s + 1;
- while (*t && (isALNUM_lazy(t) || strchr(" \t$#+-'\"", *t)))
+ while (*t && (isALNUM_lazy_if(t,UTF) || strchr(" \t$#+-'\"", *t)))
t++;
if (*t == '}' || *t == ']') {
t++;
@@ -3339,7 +3336,8 @@ Perl_yylex(pTHX)
/* Disable warning on "study /blah/" */
if (PL_oldoldbufptr == PL_last_uni
&& (*PL_last_uni != 's' || s - PL_last_uni < 5
- || memNE(PL_last_uni, "study", 5) || isALNUM_lazy(PL_last_uni+5)))
+ || memNE(PL_last_uni, "study", 5)
+ || isALNUM_lazy_if(PL_last_uni+5,UTF)))
check_uni();
s = scan_pat(s,OP_MATCH);
TERM(sublex_start());
@@ -3665,7 +3663,7 @@ Perl_yylex(pTHX)
/* Two barewords in a row may indicate method call. */
- if ((isIDFIRST_lazy(s) || *s == '$') && (tmp=intuit_method(s,gv)))
+ if ((isIDFIRST_lazy_if(s,UTF) || *s == '$') && (tmp=intuit_method(s,gv)))
return tmp;
/* If not a declared subroutine, it's an indirect object. */
@@ -3711,7 +3709,7 @@ Perl_yylex(pTHX)
/* If followed by a bareword, see if it looks like indir obj. */
- if ((isIDFIRST_lazy(s) || *s == '$') && (tmp = intuit_method(s,gv)))
+ if ((isIDFIRST_lazy_if(s,UTF) || *s == '$') && (tmp = intuit_method(s,gv)))
return tmp;
/* Not a method, so call it a subroutine (if defined) */
@@ -4045,7 +4043,7 @@ Perl_yylex(pTHX)
case KEY_foreach:
yylval.ival = CopLINE(PL_curcop);
s = skipspace(s);
- if (PL_expect == XSTATE && isIDFIRST_lazy(s)) {
+ if (PL_expect == XSTATE && isIDFIRST_lazy_if(s,UTF)) {
char *p = s;
if ((PL_bufend - p) >= 3 &&
strnEQ(p, "my", 2) && isSPACE(*(p + 2)))
@@ -4054,7 +4052,7 @@ Perl_yylex(pTHX)
strnEQ(p, "our", 3) && isSPACE(*(p + 3)))
p += 3;
p = skipspace(p);
- if (isIDFIRST_lazy(p)) {
+ if (isIDFIRST_lazy_if(p,UTF)) {
p = scan_ident(p, PL_bufend,
PL_tokenbuf, sizeof PL_tokenbuf, TRUE);
p = skipspace(p);
@@ -4269,7 +4267,7 @@ Perl_yylex(pTHX)
case KEY_my:
PL_in_my = tmp;
s = skipspace(s);
- if (isIDFIRST_lazy(s)) {
+ if (isIDFIRST_lazy_if(s,UTF)) {
s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, TRUE, &len);
if (len == 3 && strnEQ(PL_tokenbuf, "sub", 3))
goto really_sub;
@@ -4307,9 +4305,9 @@ Perl_yylex(pTHX)
case KEY_open:
s = skipspace(s);
- if (isIDFIRST_lazy(s)) {
+ if (isIDFIRST_lazy_if(s,UTF)) {
char *t;
- for (d = s; isALNUM_lazy(d); d++) ;
+ for (d = s; isALNUM_lazy_if(d,UTF); d++) ;
t = skipspace(d);
if (strchr("|&*+-=!?:.", *t) && ckWARN_d(WARN_AMBIGUOUS))
Perl_warner(aTHX_ WARN_AMBIGUOUS,
@@ -4448,7 +4446,7 @@ Perl_yylex(pTHX)
else {
*PL_tokenbuf = '\0';
s = force_word(s,WORD,TRUE,TRUE,FALSE);
- if (isIDFIRST_lazy(PL_tokenbuf))
+ if (isIDFIRST_lazy_if(PL_tokenbuf,UTF))
gv_stashpvn(PL_tokenbuf, strlen(PL_tokenbuf), TRUE);
else if (*s == '<')
yyerror("<> should be quotes");
@@ -4639,7 +4637,7 @@ Perl_yylex(pTHX)
s = skipspace(s);
- if (isIDFIRST_lazy(s) || *s == '\'' ||
+ if (isIDFIRST_lazy_if(s,UTF) || *s == '\'' ||
(*s == ':' && s[1] == ':'))
{
PL_expect = XBLOCK;
@@ -5529,9 +5527,9 @@ S_checkcomma(pTHX_ register char *s, char *name, char *what)
s++;
while (s < PL_bufend && isSPACE(*s))
s++;
- if (isIDFIRST_lazy(s)) {
+ if (isIDFIRST_lazy_if(s,UTF)) {
w = s++;
- while (isALNUM_lazy(s))
+ while (isALNUM_lazy_if(s,UTF))
s++;
while (s < PL_bufend && isSPACE(*s))
s++;
@@ -5653,7 +5651,7 @@ S_scan_word(pTHX_ register char *s, char *dest, STRLEN destlen, int allow_packag
Perl_croak(aTHX_ ident_too_long);
if (isALNUM(*s)) /* UTF handled below */
*d++ = *s++;
- else if (*s == '\'' && allow_package && isIDFIRST_lazy(s+1)) {
+ else if (*s == '\'' && allow_package && isIDFIRST_lazy_if(s+1,UTF)) {
*d++ = ':';
*d++ = ':';
s++;
@@ -5705,7 +5703,7 @@ S_scan_ident(pTHX_ register char *s, register char *send, char *dest, STRLEN des
Perl_croak(aTHX_ ident_too_long);
if (isALNUM(*s)) /* UTF handled below */
*d++ = *s++;
- else if (*s == '\'' && isIDFIRST_lazy(s+1)) {
+ else if (*s == '\'' && isIDFIRST_lazy_if(s+1,UTF)) {
*d++ = ':';
*d++ = ':';
s++;
@@ -5736,7 +5734,7 @@ S_scan_ident(pTHX_ register char *s, register char *send, char *dest, STRLEN des
return s;
}
if (*s == '$' && s[1] &&
- (isALNUM_lazy(s+1) || strchr("${", s[1]) || strnEQ(s+1,"::",2)) )
+ (isALNUM_lazy_if(s+1,UTF) || strchr("${", s[1]) || strnEQ(s+1,"::",2)) )
{
return s;
}
@@ -5763,11 +5761,11 @@ S_scan_ident(pTHX_ register char *s, register char *send, char *dest, STRLEN des
}
}
}
- if (isIDFIRST_lazy(d)) {
+ if (isIDFIRST_lazy_if(d,UTF)) {
d++;
if (UTF) {
e = s;
- while (e < send && isALNUM_lazy(e) || *e == ':') {
+ while (e < send && isALNUM_lazy_if(e,UTF) || *e == ':') {
e += UTF8SKIP(e);
while (e < send && *e & 0x80 && is_utf8_mark((U8*)e))
e += UTF8SKIP(e);
@@ -6071,9 +6069,9 @@ S_scan_heredoc(pTHX_ register char *s)
s++, term = '\'';
else
term = '"';
- if (!isALNUM_lazy(s))
+ if (!isALNUM_lazy_if(s,UTF))
deprecate("bare << to mean <<\"\"");
- for (; isALNUM_lazy(s); s++) {
+ for (; isALNUM_lazy_if(s,UTF); s++) {
if (d < e)
*d++ = *s;
}
@@ -6284,7 +6282,7 @@ S_scan_inputsymbol(pTHX_ char *start)
if (*d == '$' && d[1]) d++;
/* allow <Pkg'VALUE> or <Pkg::VALUE> */
- while (*d && (isALNUM_lazy(d) || *d == '\'' || *d == ':'))
+ while (*d && (isALNUM_lazy_if(d,UTF) || *d == '\'' || *d == ':'))
d++;
/* If we've tried to read what we allow filehandles to look like, and