summaryrefslogtreecommitdiff
path: root/toke.c
diff options
context:
space:
mode:
authorLarry Wall <larry@wall.org>1998-07-24 05:44:33 +0000
committerLarry Wall <larry@wall.org>1998-07-24 05:44:33 +0000
commita0ed51b321531af4b47cce24205ab9656f043f0f (patch)
tree610356407b37a4041ea8bcaf44571579b2da5613 /toke.c
parent9332a1c1d80ded85a2b1f32b1c8968a35e3b0fbb (diff)
downloadperl-a0ed51b321531af4b47cce24205ab9656f043f0f.tar.gz
Here are the long-expected Unicode/UTF-8 modifications.
p4raw-id: //depot/utfperl@1651
Diffstat (limited to 'toke.c')
-rw-r--r--toke.c186
1 files changed, 168 insertions, 18 deletions
diff --git a/toke.c b/toke.c
index 64c69813b9..9f9631990b 100644
--- a/toke.c
+++ b/toke.c
@@ -59,6 +59,8 @@ static void restore_lex_expect _((void *e));
static char ident_too_long[] = "Identifier too long";
+#define UTF (PL_hints & HINT_UTF8)
+
/* The following are arranged oddly so that the guard on the switch statement
* can get by with a single comparison (if the compiler is smart enough).
*/
@@ -228,6 +230,35 @@ win32_textfilter(int idx, SV *sv, int maxlen)
}
#endif
+STATIC I32
+utf16_textfilter(int idx, SV *sv, int maxlen)
+{
+ I32 count = FILTER_READ(idx+1, sv, maxlen);
+ if (count) {
+ char* tmps;
+ char* tend;
+ New(898, tmps, SvCUR(sv) * 3 / 2 + 1, char);
+ tend = utf16_to_utf8((U16*)SvPVX(sv), tmps, SvCUR(sv));
+ sv_usepvn(sv, tmps, tend - tmps);
+
+ }
+ return count;
+}
+
+STATIC I32
+utf16rev_textfilter(int idx, SV *sv, int maxlen)
+{
+ I32 count = FILTER_READ(idx+1, sv, maxlen);
+ if (count) {
+ char* tmps;
+ char* tend;
+ New(898, tmps, SvCUR(sv) * 3 / 2 + 1, char);
+ tend = utf16_to_utf8_reversed((U16*)SvPVX(sv), tmps, SvCUR(sv));
+ sv_usepvn(sv, tmps, tend - tmps);
+
+ }
+ return count;
+}
void
lex_start(SV *line)
@@ -845,11 +876,17 @@ scan_const(char *start)
register char *d = SvPVX(sv); /* destination for copies */
bool dorange = FALSE; /* are we in a translit range? */
I32 len; /* ? */
+ I32 utf = PL_lex_inwhat == OP_TRANS
+ ? (PL_sublex_info.sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF))
+ : UTF;
+ I32 thisutf = PL_lex_inwhat == OP_TRANS
+ ? (PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF))
+ : UTF;
/* leaveit is the set of acceptably-backslashed characters */
char *leaveit =
PL_lex_inpat
- ? "\\.^$@AGZdDwWsSbB+*?|()-nrtfeaxcz0123456789[{]} \t\n\r\f\v#"
+ ? "\\.^$@AGZdDwWsSbBpPXC+*?|()-nrtfeaxcz0123456789[{]} \t\n\r\f\v#"
: "";
while (s < send || dorange) {
@@ -877,6 +914,11 @@ scan_const(char *start)
/* range begins (ignore - as first or last char) */
else if (*s == '-' && s+1 < send && s != start) {
+ if (utf) {
+ *d++ = 0xff; /* use illegal utf8 byte--see pmtrans */
+ s++;
+ continue;
+ }
dorange = TRUE;
s++;
}
@@ -933,6 +975,17 @@ scan_const(char *start)
break; /* in regexp, $ might be tail anchor */
}
+ /* (now in tr/// code again) */
+
+ if (*s & 0x80 && dowarn && thisutf) {
+ (void)utf8_to_uv(s, &len); /* could cvt latin-1 to utf8 here... */
+ if (len) {
+ while (len--)
+ *d++ = *s++;
+ continue;
+ }
+ }
+
/* backslashes */
if (*s == '\\' && s+1 < send) {
s++;
@@ -984,8 +1037,35 @@ scan_const(char *start)
/* \x24 indicates a hex constant */
case 'x':
- *d++ = scan_hex(++s, 2, &len);
- s += len;
+ ++s;
+ if (*s == '{') {
+ char* e = strchr(s, '}');
+
+ if (!e)
+ yyerror("Missing right brace on \\x{}");
+ if (dowarn && !utf)
+ warn("Use of \\x{} without utf8 declaration");
+ /* note: utf always shorter than hex */
+ d = uv_to_utf8(d, scan_hex(s + 1, e - s, &len));
+ s = e + 1;
+
+ }
+ else {
+ UV uv = (UV)scan_hex(s, 2, &len);
+ if (utf && PL_lex_inwhat == OP_TRANS &&
+ utf != (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF))
+ {
+ d = uv_to_utf8(d, uv); /* doing a CU or UC */
+ }
+ else {
+ if (dowarn && uv >= 127 && UTF)
+ warn(
+ "\\x%.*s will produce malformed UTF-8 character; use \\x{%.*s} for that",
+ len,s,len,s);
+ *d++ = (char)uv;
+ }
+ s += len;
+ }
continue;
/* \c is a control character */
@@ -1726,7 +1806,17 @@ yylex(void)
retry:
switch (*s) {
default:
- croak("Unrecognized character \\%03o", *s & 255);
+ /*
+ * Note: we try to be careful never to call the isXXX_utf8() functions unless we're
+ * pretty sure we've seen the beginning of a UTF-8 character (that is, the two high
+ * bits are set). Otherwise we risk loading in the heavy-duty SWASHINIT and SWASHGET
+ * routines unnecessarily. You will see this not just here but throughout this file.
+ */
+ if (UTF && (*s & 0xc0) == 0x80) {
+ if (isIDFIRST_utf8(s))
+ goto keylookup;
+ }
+ croak("Unrecognized character \\x%02X", *s & 255);
case 4:
case 26:
goto fake_eof; /* emulate EOF on ^D or ^Z */
@@ -2721,7 +2811,7 @@ yylex(void)
missingterm((char*)0);
yylval.ival = OP_CONST;
for (d = SvPV(PL_lex_stuff, len); len; len--, d++) {
- if (*d == '$' || *d == '@' || *d == '\\') {
+ if (*d == '$' || *d == '@' || *d == '\\' || *d & 0x80) {
yylval.ival = OP_STRINGIFY;
break;
}
@@ -4828,6 +4918,16 @@ scan_word(register char *s, char *dest, STRLEN destlen, int allow_package, STRLE
*d++ = *s++;
*d++ = *s++;
}
+ else if (UTF && (*s & 0xc0) == 0x80 && isALNUM_utf8(s)) {
+ char *t = s + UTF8SKIP(s);
+ while (*t & 0x80 && is_utf8_mark(t))
+ t += UTF8SKIP(t);
+ if (d + (t - s) > e)
+ croak(ident_too_long);
+ Copy(s, d, t - s, char);
+ d += t - s;
+ s = t;
+ }
else {
*d = '\0';
*slp = d - dest;
@@ -4872,6 +4972,16 @@ scan_ident(register char *s, register char *send, char *dest, STRLEN destlen, I3
*d++ = *s++;
*d++ = *s++;
}
+ else if (UTF && (*s & 0xc0) == 0x80 && isALNUM_utf8(s)) {
+ char *t = s + UTF8SKIP(s);
+ while (*t & 0x80 && is_utf8_mark(t))
+ t += UTF8SKIP(t);
+ if (d + (t - s) > e)
+ croak(ident_too_long);
+ Copy(s, d, t - s, char);
+ d += t - s;
+ s = t;
+ }
else
break;
}
@@ -4914,10 +5024,23 @@ scan_ident(register char *s, register char *send, char *dest, STRLEN destlen, I3
}
}
}
- if (isIDFIRST(*d)) {
+ if (isIDFIRST(*d) || (UTF && (*d & 0xc0) == 0x80 && isIDFIRST_utf8(d))) {
d++;
- while (isALNUM(*s) || *s == ':')
- *d++ = *s++;
+ if (UTF) {
+ e = s;
+ while (e < send && (isALNUM(*e) || ((*e & 0xc0) == 0x80 && isALNUM_utf8((U8*)e)) || *e == ':')) {
+ e += UTF8SKIP(e);
+ while (e < send && *e & 0x80 && is_utf8_mark(e))
+ e += UTF8SKIP(e);
+ }
+ Copy(s, d, e - s, char);
+ d += e - s;
+ s = e;
+ }
+ else {
+ while (isALNUM(*s) || *s == ':')
+ *d++ = *s++;
+ }
*d = '\0';
while (s < send && (*s == ' ' || *s == '\t')) s++;
if ((*s == '[' || (*s == '{' && strNE(dest, "sub")))) {
@@ -5077,8 +5200,10 @@ scan_trans(char *start)
OP *o;
short *tbl;
I32 squash;
- I32 Delete;
+ I32 del;
I32 complement;
+ I32 utf8;
+ I32 count = 0;
yylval.ival = OP_NULL;
@@ -5103,20 +5228,45 @@ scan_trans(char *start)
croak("Transliteration replacement not terminated");
}
- New(803,tbl,256,short);
- o = newPVOP(OP_TRANS, 0, (char*)tbl);
+ if (UTF) {
+ o = newSVOP(OP_TRANS, 0, 0);
+ utf8 = OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF;
+ }
+ else {
+ New(803,tbl,256,short);
+ o = newPVOP(OP_TRANS, 0, (char*)tbl);
+ utf8 = 0;
+ }
- complement = Delete = squash = 0;
- while (*s == 'c' || *s == 'd' || *s == 's') {
+ complement = del = squash = 0;
+ while (strchr("cdsCU", *s)) {
if (*s == 'c')
complement = OPpTRANS_COMPLEMENT;
else if (*s == 'd')
- Delete = OPpTRANS_DELETE;
- else
+ del = OPpTRANS_DELETE;
+ else if (*s == 's')
squash = OPpTRANS_SQUASH;
+ else {
+ switch (count++) {
+ case 0:
+ if (*s == 'C')
+ utf8 &= ~OPpTRANS_FROM_UTF;
+ else
+ utf8 |= OPpTRANS_FROM_UTF;
+ break;
+ case 1:
+ if (*s == 'C')
+ utf8 &= ~OPpTRANS_TO_UTF;
+ else
+ utf8 |= OPpTRANS_TO_UTF;
+ break;
+ default:
+ croak("Too many /C and /U options");
+ }
+ }
s++;
}
- o->op_private = Delete|squash|complement;
+ o->op_private = del|squash|complement|utf8;
PL_lex_op = o;
yylval.ival = OP_TRANS;
@@ -5570,7 +5720,7 @@ scan_str(char *start)
}
/* we read a line, so increment our line counter */
PL_curcop->cop_line++;
-
+
/* update debugger info */
if (PERLDB_LINE && PL_curstash != PL_debstash) {
SV *sv = NEWSV(88,0);
@@ -5580,7 +5730,7 @@ scan_str(char *start)
av_store(GvAV(PL_curcop->cop_filegv),
(I32)PL_curcop->cop_line, sv);
}
-
+
/* having changed the buffer, we must update PL_bufend */
PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
}