summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJarkko Hietaniemi <jhi@iki.fi>2000-10-24 02:55:33 +0000
committerJarkko Hietaniemi <jhi@iki.fi>2000-10-24 02:55:33 +0000
commitba210ebec161cde003bc967e8e460c72f71fb70c (patch)
tree7eefd78e8e365cbf64ddf49314681d17b83c3025
parent177b92d2814bfc842f28f277e0a2f353c652a5e3 (diff)
downloadperl-ba210ebec161cde003bc967e8e460c72f71fb70c.tar.gz
Make the UTF-8 decoding stricter and more verbose when
malformation happens. This involved adding an argument to utf8_to_uv_chk(), which involved changing its prototype, and prefer STRLEN over I32 for the UTF-8 length, which as a domino effect necessitated changing the prototypes of scan_bin(), scan_oct(), scan_hex(), and reg_uni(). The stricter UTF-8 decoding checking uses Markus Kuhn's UTF-8 Decode Stress Tester from http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt p4raw-id: //depot/perl@7416
-rw-r--r--doop.c30
-rw-r--r--embed.h2
-rwxr-xr-xembed.pl12
-rw-r--r--handy.h32
-rw-r--r--op.c18
-rw-r--r--perl.c2
-rw-r--r--perlapi.c12
-rw-r--r--pod/perlapi.pod10
-rw-r--r--pod/perldiag.pod4
-rw-r--r--pod/perlunicode.pod5
-rw-r--r--pp.c36
-rw-r--r--pp_ctl.c8
-rw-r--r--proto.h12
-rw-r--r--regcomp.c21
-rw-r--r--regexec.c13
-rw-r--r--sv.c8
-rwxr-xr-xt/pragma/utf8.t169
-rw-r--r--t/pragma/warn/utf84
-rw-r--r--toke.c66
-rw-r--r--utf8.c167
-rw-r--r--utf8.h4
-rw-r--r--util.c6
22 files changed, 458 insertions, 183 deletions
diff --git a/doop.c b/doop.c
index b75ffaa69f..3cd8f07b20 100644
--- a/doop.c
+++ b/doop.c
@@ -72,12 +72,12 @@ S_do_trans_simple(pTHX_ SV *sv)
Newz(0, d, len*2+1, U8);
dstart = d;
while (s < send) {
- I32 ulen;
+ STRLEN ulen;
short c;
ulen = 1;
/* Need to check this, otherwise 128..255 won't match */
- c = utf8_to_uv_chk(s, &ulen, 0);
+ c = utf8_to_uv_chk(s, send - s, &ulen, 0);
if (c < 0x100 && (ch = tbl[(short)c]) >= 0) {
matches++;
if (ch < 0x80)
@@ -122,10 +122,10 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */
s += UTF8SKIP(s);
else {
UV c;
- I32 ulen;
+ STRLEN ulen;
ulen = 1;
if (hasutf)
- c = utf8_to_uv_chk(s,&ulen, 0);
+ c = utf8_to_uv_chk(s, send - s, &ulen, 0);
else
c = *s;
if (c < 0x100 && tbl[c] >= 0)
@@ -363,8 +363,8 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
continue;
}
else if (uv == none) { /* "none" is unmapped character */
- I32 ulen;
- *d++ = (U8)utf8_to_uv_chk(s, &ulen, 0);
+ STRLEN ulen;
+ *d++ = (U8)utf8_to_uv_chk(s, send - s, &ulen, 0);
s += ulen;
puv = 0xfeedface;
continue;
@@ -404,8 +404,8 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
continue;
}
else if (uv == none) { /* "none" is unmapped character */
- I32 ulen;
- *d++ = (U8)utf8_to_uv_chk(s, &ulen, 0);
+ STRLEN ulen;
+ *d++ = (U8)utf8_to_uv_chk(s, send - s, &ulen, 0);
s += ulen;
continue;
}
@@ -964,15 +964,15 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
char *dcsave = dc;
STRLEN lulen = leftlen;
STRLEN rulen = rightlen;
- I32 ulen;
+ STRLEN ulen;
switch (optype) {
case OP_BIT_AND:
while (lulen && rulen) {
- luc = utf8_to_uv_chk((U8*)lc, &ulen, 0);
+ luc = utf8_to_uv_chk((U8*)lc, lulen, &ulen, 0);
lc += ulen;
lulen -= ulen;
- ruc = utf8_to_uv_chk((U8*)rc, &ulen, 0);
+ ruc = utf8_to_uv_chk((U8*)rc, rulen, &ulen, 0);
rc += ulen;
rulen -= ulen;
duc = luc & ruc;
@@ -984,10 +984,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
break;
case OP_BIT_XOR:
while (lulen && rulen) {
- luc = utf8_to_uv_chk((U8*)lc, &ulen, 0);
+ luc = utf8_to_uv_chk((U8*)lc, lulen, &ulen, 0);
lc += ulen;
lulen -= ulen;
- ruc = utf8_to_uv_chk((U8*)rc, &ulen, 0);
+ ruc = utf8_to_uv_chk((U8*)rc, rulen, &ulen, 0);
rc += ulen;
rulen -= ulen;
duc = luc ^ ruc;
@@ -996,10 +996,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
goto mop_up_utf;
case OP_BIT_OR:
while (lulen && rulen) {
- luc = utf8_to_uv_chk((U8*)lc, &ulen, 0);
+ luc = utf8_to_uv_chk((U8*)lc, lulen, &ulen, 0);
lc += ulen;
lulen -= ulen;
- ruc = utf8_to_uv_chk((U8*)rc, &ulen, 0);
+ ruc = utf8_to_uv_chk((U8*)rc, rulen, &ulen, 0);
rc += ulen;
rulen -= ulen;
duc = luc | ruc;
diff --git a/embed.h b/embed.h
index b4c8f6ab35..eab037fccf 100644
--- a/embed.h
+++ b/embed.h
@@ -2190,7 +2190,7 @@
#define utf8_to_bytes(a,b) Perl_utf8_to_bytes(aTHX_ a,b)
#define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX_ a,b)
#define utf8_to_uv(a,b) Perl_utf8_to_uv(aTHX_ a,b)
-#define utf8_to_uv_chk(a,b,c) Perl_utf8_to_uv_chk(aTHX_ a,b,c)
+#define utf8_to_uv_chk(a,b,c,d) Perl_utf8_to_uv_chk(aTHX_ a,b,c,d)
#define uv_to_utf8(a,b) Perl_uv_to_utf8(aTHX_ a,b)
#define vivify_defelem(a) Perl_vivify_defelem(aTHX_ a)
#define vivify_ref(a,b) Perl_vivify_ref(aTHX_ a,b)
diff --git a/embed.pl b/embed.pl
index f685042567..6adb2751e6 100755
--- a/embed.pl
+++ b/embed.pl
@@ -1941,10 +1941,10 @@ p |OP* |scalar |OP* o
p |OP* |scalarkids |OP* o
p |OP* |scalarseq |OP* o
p |OP* |scalarvoid |OP* o
-Ap |NV |scan_bin |char* start|I32 len|I32* retlen
-Ap |NV |scan_hex |char* start|I32 len|I32* retlen
+Ap |NV |scan_bin |char* start|STRLEN len|STRLEN* retlen
+Ap |NV |scan_hex |char* start|STRLEN len|STRLEN* retlen
Ap |char* |scan_num |char* s|YYSTYPE *lvalp
-Ap |NV |scan_oct |char* start|I32 len|I32* retlen
+Ap |NV |scan_oct |char* start|STRLEN len|STRLEN* retlen
p |OP* |scope |OP* o
Ap |char* |screaminstr |SV* bigsv|SV* littlesv|I32 start_shift \
|I32 end_shift|I32 *state|I32 last
@@ -2074,8 +2074,8 @@ Ap |I32 |utf8_distance |U8 *a|U8 *b
Ap |U8* |utf8_hop |U8 *s|I32 off
ApM |U8* |utf8_to_bytes |U8 *s|STRLEN *len
ApM |U8* |bytes_to_utf8 |U8 *s|STRLEN *len
-Ap |UV |utf8_to_uv |U8 *s|I32* retlen
-Ap |UV |utf8_to_uv_chk |U8 *s|I32* retlen|bool checking
+Ap |UV |utf8_to_uv |U8 *s|STRLEN* retlen
+Ap |UV |utf8_to_uv_chk |U8 *s|STRLEN curlen|STRLEN* retlen|bool checking
Ap |U8* |uv_to_utf8 |U8 *d|UV uv
p |void |vivify_defelem |SV* sv
p |void |vivify_ref |SV* sv|U32 to_what
@@ -2358,7 +2358,7 @@ s |regnode*|reg |I32|I32 *
s |regnode*|reganode |U8|U32
s |regnode*|regatom |I32 *
s |regnode*|regbranch |I32 *|I32
-s |void |reguni |UV|char *|I32*
+s |void |reguni |UV|char *|STRLEN*
s |regnode*|regclass
s |regnode*|regclassutf8
s |I32 |regcurly |char *
diff --git a/handy.h b/handy.h
index f0e39af51d..73410123fb 100644
--- a/handy.h
+++ b/handy.h
@@ -448,21 +448,23 @@ Converts the specified character to lowercase.
#define isPSXSPC_utf8(c) (isSPACE_utf8(c) ||(c) == '\f')
#define isBLANK_utf8(c) isBLANK(c) /* could be wrong */
-#define isALNUM_LC_utf8(p) isALNUM_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isIDFIRST_LC_utf8(p) isIDFIRST_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isALPHA_LC_utf8(p) isALPHA_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isSPACE_LC_utf8(p) isSPACE_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isDIGIT_LC_utf8(p) isDIGIT_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isUPPER_LC_utf8(p) isUPPER_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isLOWER_LC_utf8(p) isLOWER_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isALNUMC_LC_utf8(p) isALNUMC_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isCNTRL_LC_utf8(p) isCNTRL_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isGRAPH_LC_utf8(p) isGRAPH_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isPRINT_LC_utf8(p) isPRINT_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define isPUNCT_LC_utf8(p) isPUNCT_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define toUPPER_LC_utf8(p) toUPPER_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define toTITLE_LC_utf8(p) toTITLE_LC_uni(utf8_to_uv_chk(p, 0, 0))
-#define toLOWER_LC_utf8(p) toLOWER_LC_uni(utf8_to_uv_chk(p, 0, 0))
+#define STRLEN_MAX ((STRLEN)-1)
+
+#define isALNUM_LC_utf8(p) isALNUM_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isIDFIRST_LC_utf8(p) isIDFIRST_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isALPHA_LC_utf8(p) isALPHA_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isSPACE_LC_utf8(p) isSPACE_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isDIGIT_LC_utf8(p) isDIGIT_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isUPPER_LC_utf8(p) isUPPER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isLOWER_LC_utf8(p) isLOWER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isALNUMC_LC_utf8(p) isALNUMC_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isCNTRL_LC_utf8(p) isCNTRL_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isGRAPH_LC_utf8(p) isGRAPH_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isPRINT_LC_utf8(p) isPRINT_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define isPUNCT_LC_utf8(p) isPUNCT_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define toUPPER_LC_utf8(p) toUPPER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define toTITLE_LC_utf8(p) toTITLE_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
+#define toLOWER_LC_utf8(p) toLOWER_LC_uni(utf8_to_uv_chk(p, STRLEN_MAX, 0, 0))
#define isPSXSPC_LC_utf8(c) (isSPACE_LC_utf8(c) ||(c) == '\f')
#define isBLANK_LC_utf8(c) isBLANK(c) /* could be wrong */
diff --git a/op.c b/op.c
index 6ef4bfe777..9e256a3537 100644
--- a/op.c
+++ b/op.c
@@ -2621,7 +2621,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
SV* transv = 0;
U8* tend = t + tlen;
U8* rend = r + rlen;
- I32 ulen;
+ STRLEN ulen;
U32 tfirst = 1;
U32 tlast = 0;
I32 tdiff;
@@ -2641,6 +2641,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
if (complement) {
U8 tmpbuf[UTF8_MAXLEN];
U8** cp;
+ I32* cl;
UV nextmin = 0;
New(1109, cp, tlen, U8*);
i = 0;
@@ -2656,7 +2657,8 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
qsort(cp, i, sizeof(U8*), utf8compare);
for (j = 0; j < i; j++) {
U8 *s = cp[j];
- UV val = utf8_to_uv_chk(s, &ulen, 0);
+ I32 cur = j < i ? cp[j+1] - s : tend - s;
+ UV val = utf8_to_uv_chk(s, cur, &ulen, 0);
s += ulen;
diff = val - nextmin;
if (diff > 0) {
@@ -2669,7 +2671,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
}
}
if (*s == 0xff)
- val = utf8_to_uv_chk(s+1, &ulen, 0);
+ val = utf8_to_uv_chk(s+1, cur - 1, &ulen, 0);
if (val >= nextmin)
nextmin = val + 1;
}
@@ -2696,10 +2698,11 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
while (t < tend || tfirst <= tlast) {
/* see if we need more "t" chars */
if (tfirst > tlast) {
- tfirst = (I32)utf8_to_uv_chk(t, &ulen, 0);
+ tfirst = (I32)utf8_to_uv_chk(t, tend - t, &ulen, 0);
t += ulen;
if (t < tend && *t == 0xff) { /* illegal utf8 val indicates range */
- tlast = (I32)utf8_to_uv_chk(++t, &ulen, 0);
+ t++;
+ tlast = (I32)utf8_to_uv_chk(t, tend - t, &ulen, 0);
t += ulen;
}
else
@@ -2709,10 +2712,11 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
/* now see if we need more "r" chars */
if (rfirst > rlast) {
if (r < rend) {
- rfirst = (I32)utf8_to_uv_chk(r, &ulen, 0);
+ rfirst = (I32)utf8_to_uv_chk(r, rend - r, &ulen, 0);
r += ulen;
if (r < rend && *r == 0xff) { /* illegal utf8 val indicates range */
- rlast = (I32)utf8_to_uv_chk(++r, &ulen, 0);
+ r++;
+ rlast = (I32)utf8_to_uv_chk(r, rend - r, &ulen, 0);
r += ulen;
}
else
diff --git a/perl.c b/perl.c
index cb2cb14db5..3d874ca9bd 100644
--- a/perl.c
+++ b/perl.c
@@ -2025,7 +2025,7 @@ NULL
char *
Perl_moreswitches(pTHX_ char *s)
{
- I32 numlen;
+ STRLEN numlen;
U32 rschar;
switch (*s) {
diff --git a/perlapi.c b/perlapi.c
index 3cfe4e0baf..1f1343db47 100644
--- a/perlapi.c
+++ b/perlapi.c
@@ -2638,14 +2638,14 @@ Perl_save_threadsv(pTHXo_ PADOFFSET i)
#undef Perl_scan_bin
NV
-Perl_scan_bin(pTHXo_ char* start, I32 len, I32* retlen)
+Perl_scan_bin(pTHXo_ char* start, STRLEN len, STRLEN* retlen)
{
return ((CPerlObj*)pPerl)->Perl_scan_bin(start, len, retlen);
}
#undef Perl_scan_hex
NV
-Perl_scan_hex(pTHXo_ char* start, I32 len, I32* retlen)
+Perl_scan_hex(pTHXo_ char* start, STRLEN len, STRLEN* retlen)
{
return ((CPerlObj*)pPerl)->Perl_scan_hex(start, len, retlen);
}
@@ -2659,7 +2659,7 @@ Perl_scan_num(pTHXo_ char* s, YYSTYPE *lvalp)
#undef Perl_scan_oct
NV
-Perl_scan_oct(pTHXo_ char* start, I32 len, I32* retlen)
+Perl_scan_oct(pTHXo_ char* start, STRLEN len, STRLEN* retlen)
{
return ((CPerlObj*)pPerl)->Perl_scan_oct(start, len, retlen);
}
@@ -3380,16 +3380,16 @@ Perl_bytes_to_utf8(pTHXo_ U8 *s, STRLEN *len)
#undef Perl_utf8_to_uv
UV
-Perl_utf8_to_uv(pTHXo_ U8 *s, I32* retlen)
+Perl_utf8_to_uv(pTHXo_ U8 *s, STRLEN* retlen)
{
return ((CPerlObj*)pPerl)->Perl_utf8_to_uv(s, retlen);
}
#undef Perl_utf8_to_uv_chk
UV
-Perl_utf8_to_uv_chk(pTHXo_ U8 *s, I32* retlen, bool checking)
+Perl_utf8_to_uv_chk(pTHXo_ U8 *s, STRLEN curlen, STRLEN* retlen, bool checking)
{
- return ((CPerlObj*)pPerl)->Perl_utf8_to_uv_chk(s, retlen, checking);
+ return ((CPerlObj*)pPerl)->Perl_utf8_to_uv_chk(s, curlen, retlen, checking);
}
#undef Perl_uv_to_utf8
diff --git a/pod/perlapi.pod b/pod/perlapi.pod
index a5178e8d61..730d89f896 100644
--- a/pod/perlapi.pod
+++ b/pod/perlapi.pod
@@ -3225,7 +3225,7 @@ advanced to the end of the character.
If C<s> does not point to a well-formed UTF8 character, an optional UTF8
warning is produced.
- U8* s utf8_to_uv(I32 *retlen)
+ U8* s utf8_to_uv(STRLEN *retlen)
=for hackers
Found in file utf8.c
@@ -3233,9 +3233,9 @@ Found in file utf8.c
=item utf8_to_uv_chk
Returns the character value of the first character in the string C<s>
-which is assumed to be in UTF8 encoding; C<retlen> will be set to the
-length, in bytes, of that character, and the pointer C<s> will be
-advanced to the end of the character.
+which is assumed to be in UTF8 encoding and no longer than C<curlen>;
+C<retlen> will be set to the length, in bytes, of that character,
+and the pointer C<s> will be advanced to the end of the character.
If C<s> does not point to a well-formed UTF8 character, the behaviour
is dependent on the value of C<checking>: if this is true, it is
@@ -3243,7 +3243,7 @@ assumed that the caller will raise a warning, and this function will
set C<retlen> to C<-1> and return. If C<checking> is not true, an optional UTF8
warning is produced.
- U8* s utf8_to_uv_chk(I32 *retlen, I32 checking)
+ U8* s utf8_to_uv_chk(STRLEN curlen, I32 *retlen, I32 checking)
=for hackers
Found in file utf8.c
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index 480ab8492d..139bab98d5 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -1789,6 +1789,10 @@ a builtin library search path, prefix2 is substituted. The error may
appear if components are not found, or are too long. See
"PERLLIB_PREFIX" in L<perlos2>.
+=item Malformed UTF-8 character (%s)
+
+Perl detected something that didn't comply with UTF-8 encoding rules.
+
=item Malformed UTF-16 surrogate
Perl thought it was reading UTF-16 encoded character data but while
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
index c9954d8e96..145c953099 100644
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -71,6 +71,11 @@ on Windows.
Regardless of the above, the C<bytes> pragma can always be used to force
byte semantics in a particular lexical scope. See L<bytes>.
+One effect of the C<utf8> pragma is that the internal UTF-8 decoding
+becomes stricter so that the character 0xFFFF (UTF-8 bytes 0xEF 0xBF
+0xBF), and the bytes 0xFE and 0xFF, start to cause warnings if they
+appear in the data.
+
The C<utf8> pragma is primarily a compatibility device that enables
recognition of UTF-8 in literals encountered by the parser. It may also
be used for enabling some of the more experimental Unicode support features.
diff --git a/pp.c b/pp.c
index 98d31cb05e..35f5956829 100644
--- a/pp.c
+++ b/pp.c
@@ -1480,7 +1480,7 @@ PP(pp_complement)
STRLEN targlen = 0;
U8 *result;
U8 *send;
- I32 l;
+ STRLEN l;
send = tmps + len;
while (tmps < send) {
@@ -1944,7 +1944,7 @@ PP(pp_hex)
{
djSP; dTARGET;
char *tmps;
- I32 argtype;
+ STRLEN argtype;
STRLEN n_a;
tmps = POPpx;
@@ -1957,7 +1957,7 @@ PP(pp_oct)
{
djSP; dTARGET;
NV value;
- I32 argtype;
+ STRLEN argtype;
char *tmps;
STRLEN n_a;
@@ -2234,13 +2234,13 @@ PP(pp_ord)
{
djSP; dTARGET;
UV value;
- STRLEN n_a;
SV *tmpsv = POPs;
- U8 *tmps = (U8*)SvPVx(tmpsv,n_a);
- I32 retlen;
+ STRLEN len;
+ U8 *tmps = (U8*)SvPVx(tmpsv, len);
+ STRLEN retlen;
if ((*tmps & 0x80) && DO_UTF8(tmpsv))
- value = utf8_to_uv_chk(tmps, &retlen, 0);
+ value = utf8_to_uv_chk(tmps, len, &retlen, 0);
else
value = (UV)(*tmps & 255);
XPUSHu(value);
@@ -2304,10 +2304,10 @@ PP(pp_ucfirst)
STRLEN slen;
if (DO_UTF8(sv) && (s = (U8*)SvPV(sv, slen)) && slen && (*s & 0xc0) == 0xc0) {
- I32 ulen;
+ STRLEN ulen;
U8 tmpbuf[UTF8_MAXLEN];
U8 *tend;
- UV uv = utf8_to_uv_chk(s, &ulen, 0);
+ UV uv = utf8_to_uv_chk(s, slen, &ulen, 0);
if (PL_op->op_private & OPpLOCALE) {
TAINT;
@@ -2363,10 +2363,10 @@ PP(pp_lcfirst)
STRLEN slen;
if (DO_UTF8(sv) && (s = (U8*)SvPV(sv, slen)) && slen && (*s & 0xc0) == 0xc0) {
- I32 ulen;
+ STRLEN ulen;
U8 tmpbuf[UTF8_MAXLEN];
U8 *tend;
- UV uv = utf8_to_uv_chk(s, &ulen, 0);
+ UV uv = utf8_to_uv_chk(s, slen, &ulen, 0);
if (PL_op->op_private & OPpLOCALE) {
TAINT;
@@ -2423,7 +2423,7 @@ PP(pp_uc)
if (DO_UTF8(sv)) {
dTARGET;
- I32 ulen;
+ STRLEN ulen;
register U8 *d;
U8 *send;
@@ -2443,7 +2443,7 @@ PP(pp_uc)
TAINT;
SvTAINTED_on(TARG);
while (s < send) {
- d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv_chk(s, &ulen, 0)));
+ d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv_chk(s, len, &ulen, 0)));
s += ulen;
}
}
@@ -2497,7 +2497,7 @@ PP(pp_lc)
if (DO_UTF8(sv)) {
dTARGET;
- I32 ulen;
+ STRLEN ulen;
register U8 *d;
U8 *send;
@@ -2517,7 +2517,7 @@ PP(pp_lc)
TAINT;
SvTAINTED_on(TARG);
while (s < send) {
- d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv_chk(s, &ulen, 0)));
+ d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv_chk(s, len, &ulen, 0)));
s += ulen;
}
}
@@ -3363,7 +3363,7 @@ PP(pp_unpack)
/* These must not be in registers: */
I16 ashort;
int aint;
- I32 along;
+ STRLEN along;
#ifdef HAS_QUAD
Quad_t aquad;
#endif
@@ -3659,7 +3659,7 @@ PP(pp_unpack)
len = strend - s;
if (checksum) {
while (len-- > 0 && s < strend) {
- auint = utf8_to_uv_chk((U8*)s, &along, 0);
+ auint = utf8_to_uv_chk((U8*)s, strend - s, &along, 0);
s += along;
if (checksum > 32)
cdouble += (NV)auint;
@@ -3671,7 +3671,7 @@ PP(pp_unpack)
EXTEND(SP, len);
EXTEND_MORTAL(len);
while (len-- > 0 && s < strend) {
- auint = utf8_to_uv_chk((U8*)s, &along, 0);
+ auint = utf8_to_uv_chk((U8*)s, strend - s, &along, 0);
s += along;
sv = NEWSV(37, 0);
sv_setuv(sv, (UV)auint);
diff --git a/pp_ctl.c b/pp_ctl.c
index cf2000e5c6..33f91eef26 100644
--- a/pp_ctl.c
+++ b/pp_ctl.c
@@ -2971,17 +2971,17 @@ PP(pp_require)
if (SvNIOKp(sv)) {
if (SvPOK(sv) && SvNOK(sv)) { /* require v5.6.1 */
UV rev = 0, ver = 0, sver = 0;
- I32 len;
+ STRLEN len;
U8 *s = (U8*)SvPVX(sv);
U8 *end = (U8*)SvPVX(sv) + SvCUR(sv);
if (s < end) {
- rev = utf8_to_uv_chk(s, &len, 0);
+ rev = utf8_to_uv_chk(s, end - s, &len, 0);
s += len;
if (s < end) {
- ver = utf8_to_uv_chk(s, &len, 0);
+ ver = utf8_to_uv_chk(s, end - s, &len, 0);
s += len;
if (s < end)
- sver = utf8_to_uv_chk(s, &len, 0);
+ sver = utf8_to_uv_chk(s, end - s, &len, 0);
}
}
if (PERL_REVISION < rev
diff --git a/proto.h b/proto.h
index 27139161a9..7624255b4a 100644
--- a/proto.h
+++ b/proto.h
@@ -677,10 +677,10 @@ PERL_CALLCONV OP* Perl_scalar(pTHX_ OP* o);
PERL_CALLCONV OP* Perl_scalarkids(pTHX_ OP* o);
PERL_CALLCONV OP* Perl_scalarseq(pTHX_ OP* o);
PERL_CALLCONV OP* Perl_scalarvoid(pTHX_ OP* o);
-PERL_CALLCONV NV Perl_scan_bin(pTHX_ char* start, I32 len, I32* retlen);
-PERL_CALLCONV NV Perl_scan_hex(pTHX_ char* start, I32 len, I32* retlen);
+PERL_CALLCONV NV Perl_scan_bin(pTHX_ char* start, STRLEN len, STRLEN* retlen);
+PERL_CALLCONV NV Perl_scan_hex(pTHX_ char* start, STRLEN len, STRLEN* retlen);
PERL_CALLCONV char* Perl_scan_num(pTHX_ char* s, YYSTYPE *lvalp);
-PERL_CALLCONV NV Perl_scan_oct(pTHX_ char* start, I32 len, I32* retlen);
+PERL_CALLCONV NV Perl_scan_oct(pTHX_ char* start, STRLEN len, STRLEN* retlen);
PERL_CALLCONV OP* Perl_scope(pTHX_ OP* o);
PERL_CALLCONV char* Perl_screaminstr(pTHX_ SV* bigsv, SV* littlesv, I32 start_shift, I32 end_shift, I32 *state, I32 last);
#if !defined(VMS)
@@ -809,8 +809,8 @@ PERL_CALLCONV I32 Perl_utf8_distance(pTHX_ U8 *a, U8 *b);
PERL_CALLCONV U8* Perl_utf8_hop(pTHX_ U8 *s, I32 off);
PERL_CALLCONV U8* Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len);
PERL_CALLCONV U8* Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len);
-PERL_CALLCONV UV Perl_utf8_to_uv(pTHX_ U8 *s, I32* retlen);
-PERL_CALLCONV UV Perl_utf8_to_uv_chk(pTHX_ U8 *s, I32* retlen, bool checking);
+PERL_CALLCONV UV Perl_utf8_to_uv(pTHX_ U8 *s, STRLEN* retlen);
+PERL_CALLCONV UV Perl_utf8_to_uv_chk(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, bool checking);
PERL_CALLCONV U8* Perl_uv_to_utf8(pTHX_ U8 *d, UV uv);
PERL_CALLCONV void Perl_vivify_defelem(pTHX_ SV* sv);
PERL_CALLCONV void Perl_vivify_ref(pTHX_ SV* sv, U32 to_what);
@@ -1103,7 +1103,7 @@ STATIC regnode* S_reg(pTHX_ I32, I32 *);
STATIC regnode* S_reganode(pTHX_ U8, U32);
STATIC regnode* S_regatom(pTHX_ I32 *);
STATIC regnode* S_regbranch(pTHX_ I32 *, I32);
-STATIC void S_reguni(pTHX_ UV, char *, I32*);
+STATIC void S_reguni(pTHX_ UV, char *, STRLEN*);
STATIC regnode* S_regclass(pTHX);
STATIC regnode* S_regclassutf8(pTHX);
STATIC I32 S_regcurly(pTHX_ char *);
diff --git a/regcomp.c b/regcomp.c
index e7042eaa4e..3f2b10c8fe 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2742,11 +2742,11 @@ tryagain:
/* FALL THROUGH */
default: {
- register I32 len;
+ register STRLEN len;
register UV ender;
register char *p;
char *oldp, *s;
- I32 numlen;
+ STRLEN numlen;
PL_regcomp_parse++;
@@ -2884,7 +2884,8 @@ tryagain:
default:
normal_default:
if ((*p & 0xc0) == 0xc0 && UTF) {
- ender = utf8_to_uv_chk((U8*)p, &numlen, 0);
+ ender = utf8_to_uv_chk((U8*)p, PL_regxend - p,
+ &numlen, 0);
p += numlen;
}
else
@@ -3128,7 +3129,7 @@ S_regclass(pTHX)
register I32 lastvalue = OOB_CHAR8;
register I32 range = 0;
register regnode *ret;
- I32 numlen;
+ STRLEN numlen;
I32 namedclass;
char *rangebegin;
bool need_class = 0;
@@ -3606,7 +3607,7 @@ S_regclassutf8(pTHX)
register U32 lastvalue = OOB_UTF8;
register I32 range = 0;
register regnode *ret;
- I32 numlen;
+ STRLEN numlen;
I32 n;
SV *listsv;
U8 flags = 0;
@@ -3638,12 +3639,16 @@ S_regclassutf8(pTHX)
namedclass = OOB_NAMEDCLASS;
if (!range)
rangebegin = PL_regcomp_parse;
- value = utf8_to_uv_chk((U8*)PL_regcomp_parse, &numlen, 0);
+ value = utf8_to_uv_chk((U8*)PL_regcomp_parse,
+ PL_regxend - PL_regcomp_parse,
+ &numlen, 0);
PL_regcomp_parse += numlen;
if (value == '[')
namedclass = regpposixcc(value);
else if (value == '\\') {
- value = (U32)utf8_to_uv_chk((U8*)PL_regcomp_parse, &numlen, 0);
+ value = (U32)utf8_to_uv_chk((U8*)PL_regcomp_parse,
+ PL_regxend - PL_regcomp_parse,
+ &numlen, 0);
PL_regcomp_parse += numlen;
/* Some compilers cannot handle switching on 64-bit integer
* values, therefore value cannot be an UV. Yes, this will
@@ -3937,7 +3942,7 @@ S_reganode(pTHX_ U8 op, U32 arg)
- reguni - emit (if appropriate) a Unicode character
*/
STATIC void
-S_reguni(pTHX_ UV uv, char* s, I32* lenp)
+S_reguni(pTHX_ UV uv, char* s, STRLEN* lenp)
{
dTHR;
if (SIZE_ONLY) {
diff --git a/regexec.c b/regexec.c
index 6e046f3abc..350f432145 100644
--- a/regexec.c
+++ b/regexec.c
@@ -917,7 +917,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
PL_reg_flags |= RF_tainted;
/* FALL THROUGH */
case BOUNDUTF8:
- tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1), 0, 0) : '\n';
+ tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1),
+ strend - s,
+ 0, 0) : '\n';
tmp = ((OP(c) == BOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
while (s < strend) {
if (tmp == !(OP(c) == BOUNDUTF8 ?
@@ -953,7 +955,9 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
PL_reg_flags |= RF_tainted;
/* FALL THROUGH */
case NBOUNDUTF8:
- tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1), 0, 0) : '\n';
+ tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1),
+ strend - s,
+ 0, 0) : '\n';
tmp = ((OP(c) == NBOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
while (s < strend) {
if (tmp == !(OP(c) == NBOUNDUTF8 ?
@@ -1998,7 +2002,7 @@ S_regmatch(pTHX_ regnode *prog)
while (s < e) {
if (l >= PL_regeol)
sayNO;
- if (utf8_to_uv_chk((U8*)s, 0, 0) != (c1 ?
+ if (utf8_to_uv_chk((U8*)s, e - s, 0, 0) != (c1 ?
toLOWER_utf8((U8*)l) :
toLOWER_LC_utf8((U8*)l)))
{
@@ -2136,7 +2140,8 @@ S_regmatch(pTHX_ regnode *prog)
case NBOUNDUTF8:
/* was last char in word? */
ln = (locinput != PL_regbol)
- ? utf8_to_uv_chk(reghop((U8*)locinput, -1), 0, 0) : PL_regprev;
+ ? utf8_to_uv_chk(reghop((U8*)locinput, -1),
+ PL_regeol - locinput, 0, 0) : PL_regprev;
if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
ln = isALNUM_uni(ln);
n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
diff --git a/sv.c b/sv.c
index 1fac1620aa..2790cfd859 100644
--- a/sv.c
+++ b/sv.c
@@ -6358,13 +6358,13 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
case 'd':
case 'i':
if (vectorize) {
- I32 ulen;
+ STRLEN ulen;
if (!veclen) {
vectorize = FALSE;
break;
}
if (utf)
- iv = (IV)utf8_to_uv_chk(vecstr, &ulen, 0);
+ iv = (IV)utf8_to_uv_chk(vecstr, veclen, &ulen, 0);
else {
iv = *vecstr;
ulen = 1;
@@ -6440,14 +6440,14 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
uns_integer:
if (vectorize) {
- I32 ulen;
+ STRLEN ulen;
vector:
if (!veclen) {
vectorize = FALSE;
break;
}
if (utf)
- uv = utf8_to_uv_chk(vecstr, &ulen, 0);
+ uv = utf8_to_uv_chk(vecstr, veclen, &ulen, 0);
else {
uv = *vecstr;
ulen = 1;
diff --git a/t/pragma/utf8.t b/t/pragma/utf8.t
index 7224a7497a..e61baad587 100755
--- a/t/pragma/utf8.t
+++ b/t/pragma/utf8.t
@@ -10,7 +10,7 @@ BEGIN {
}
}
-print "1..103\n";
+print "1..181\n";
my $test = 1;
@@ -559,3 +559,170 @@ sub nok_bytes {
print "ok $test\n";
$test++;
}
+
+# This table is based on Markus Kuhn's UTF-8 Decode Stress Tester,
+# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt,
+# version dated 2000-09-02.
+
+my @MK = split(/\n/, <<__EOMK__);
+1 Correct UTF-8
+1.1.1 y "κόσμε" - 11 ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5 5
+2 Boundary conditions
+2.1 First possible sequence of certain length
+2.1.1 y "
+2.1.2 y "€" 80 2 c2:80 1
+2.1.3 y "ࠀ" 800 3 e0:a0:80 1
+2.1.4 y "𐀀" 10000 4 f0:90:80:80 1
+2.1.5 y "" 200000 5 f8:88:80:80:80 1
+2.1.6 y "" 4000000 6 fc:84:80:80:80:80 1
+2.2 Last possible sequence of certain length
+2.2.1 y "" 7f 1 7f 1
+2.2.2 y "߿" 7ff 2 df:bf 1
+# The ffff is legal unless under use utf8
+2.2.3 y "￿" ffff 3 ef:bf:bf 1
+2.2.4 y "" 1fffff 4 f7:bf:bf:bf 1
+2.2.5 y "" 3ffffff 5 fb:bf:bf:bf:bf 1
+2.2.6 y "" 7fffffff 6 fd:bf:bf:bf:bf:bf 1
+2.3 Other boundary conditions
+2.3.1 y "퟿" d7ff 3 ed:9f:bf 1
+2.3.2 y "" e000 3 ee:80:80 1
+2.3.3 y "�" fffd 3 ef:bf:bd 1
+2.3.4 y "􏿿" 10ffff 4 f4:8f:bf:bf 1
+2.3.5 y "" 110000 4 f4:90:80:80 1
+3 Malformed sequences
+3.1 Unexpected continuation bytes
+3.1.1 n "" - 1 80
+3.1.2 n "" - 1 bf
+3.1.3 n "" - 2 80:bf
+3.1.4 n "" - 3 80:bf:80
+3.1.5 n "" - 4 80:bf:80:bf
+3.1.6 n "" - 5 80:bf:80:bf:80
+3.1.7 n "" - 6 80:bf:80:bf:80:bf
+3.1.8 n "" - 7 80:bf:80:bf:80:bf:80
+3.1.9 n "" - 64 80:81:82:83:84:85:86:87:88:89:8a:8b:8c:8d:8e:8f:90:91:92:93:94:95:96:97:98:99:9a:9b:9c:9d:9e:9f:a0:a1:a2:a3:a4:a5:a6:a7:a8:a9:aa:ab:ac:ad:ae:af:b0:b1:b2:b3:b4:b5:b6:b7:b8:b9:ba:bb:bc:bd:be:bf
+3.2 Lonely start characters
+3.2.1 n " " - 64 c0:20:c1:20:c2:20:c3:20:c4:20:c5:20:c6:20:c7:20:c8:20:c9:20:ca:20:cb:20:cc:20:cd:20:ce:20:cf:20:d0:20:d1:20:d2:20:d3:20:d4:20:d5:20:d6:20:d7:20:d8:20:d9:20:da:20:db:20:dc:20:dd:20:de:20:df:20
+3.2.2 n " " - 32 e0:20:e1:20:e2:20:e3:20:e4:20:e5:20:e6:20:e7:20:e8:20:e9:20:ea:20:eb:20:ec:20:ed:20:ee:20:ef:20
+3.2.3 n " " - 16 f0:20:f1:20:f2:20:f3:20:f4:20:f5:20:f6:20:f7:20
+3.2.4 n " " - 8 f8:20:f9:20:fa:20:fb:20
+3.2.5 n " " - 4 fc:20:fd:20
+3.3 Sequences with last continuation byte missing
+3.3.1 n "" - 1 c0
+3.3.2 n "" - 2 e0:80
+3.3.3 n "" - 3 f0:80:80
+3.3.4 n "" - 4 f8:80:80:80
+3.3.5 n "" - 5 fc:80:80:80:80
+3.3.6 n "" - 1 df
+3.3.7 n "" - 2 ef:bf
+3.3.8 n "" - 3 f7:bf:bf
+3.3.9 n "" - 4 fb:bf:bf:bf
+3.3.10 n "" - 5 fd:bf:bf:bf:bf
+3.4 Concatenation of incomplete sequences
+3.4.1 n "" - 30 c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf
+3.5 Impossible bytes
+3.5.1 n "" - 1 fe
+3.5.2 n "" - 1 ff
+3.5.3 n "" - 4 fe:fe:ff:ff
+4 Overlong sequences
+4.1 Examples of an overlong ASCII character
+4.1.1 n "" - 2 c0:af
+4.1.2 n "" - 3 e0:80:af
+4.1.3 n "" - 4 f0:80:80:af
+4.1.4 n "" - 5 f8:80:80:80:af
+4.1.5 n "" - 6 fc:80:80:80:80:af
+4.2 Maximum overlong sequences
+4.2.1 n "" - 2 c1:bf
+4.2.2 n "" - 3 e0:9f:bf
+4.2.3 n "" - 4 f0:8f:bf:bf
+4.2.4 n "" - 5 f8:87:bf:bf:bf
+4.2.5 n "" - 6 fc:83:bf:bf:bf:bf
+4.3 Overlong representation of the NUL character
+4.3.1 n "" - 2 c0:80
+4.3.2 n "" - 3 e0:80:80
+4.3.3 n "" - 4 f0:80:80:80
+4.3.4 n "" - 5 f8:80:80:80:80
+4.3.5 n "" - 6 fc:80:80:80:80:80
+5 Illegal code positions
+5.1 Single UTF-16 surrogates
+5.1.1 n "" - 3 ed:a0:80
+5.1.2 n "" - 3 ed:ad:bf
+5.1.3 n "" - 3 ed:ae:80
+5.1.4 n "" - 3 ed:af:bf
+5.1.5 n "" - 3 ed:b0:80
+5.1.6 n "" - 3 ed:be:80
+5.1.7 n "" - 3 ed:bf:bf
+5.2 Paired UTF-16 surrogates
+5.2.1 n "" - 6 ed:a0:80:ed:b0:80
+5.2.2 n "" - 6 ed:a0:80:ed:bf:bf
+5.2.3 n "" - 6 ed:ad:bf:ed:b0:80
+5.2.4 n "" - 6 ed:ad:bf:ed:bf:bf
+5.2.5 n "" - 6 ed:ae:80:ed:b0:80
+5.2.6 n "" - 6 ed:ae:80:ed:bf:bf
+5.2.7 n "" - 6 ed:af:bf:ed:b0:80
+5.2.8 n "" - 6 ed:af:bf:ed:bf:bf
+5.3 Other illegal code positions
+5.3.1 n "￾" - 3 ef:bf:be
+# The ffff is legal unless under use utf8
+5.3.2 y "￿" - 3 ef:bf:bf
+__EOMK__
+
+# 104..181
+{
+ my $WARN;
+ my $id;
+
+ local $SIG{__WARN__} =
+ sub {
+ # print "# $id: @_";
+ $WARN++;
+ };
+
+ sub moan {
+ print "$id: @_";
+ }
+
+ sub test_unpack_U {
+ $WARN = 0;
+ unpack('U*', $_[0]);
+ }
+
+ for (@MK) {
+ if (/^(?:\d+(?:\.\d+)?)\s/ || /^#/) {
+ # print "# $_\n";
+ } elsif (/^(\d+\.\d+\.\d+[bu]?)\s+([yn])\s+"(.+)"\s+([0-9a-f]{1,8}|-)\s+(\d+)\s+([0-9a-f]{2}(?::[0-9a-f]{2})*)(?:\s+(\d+))?$/) {
+ $id = $1;
+ my ($okay, $bytes, $Unicode, $byteslen, $hex, $charslen) =
+ ($2, $3, $4, $5, $6, $7);
+ my @hex = split(/:/, $hex);
+ unless (@hex == $byteslen) {
+ my $nhex = @hex;
+ moan "amount of hex ($nhex) not equal to byteslen ($byteslen)\n";
+ }
+ {
+ use bytes;
+ my $bytesbyteslen = length($bytes);
+ unless ($bytesbyteslen == $byteslen) {
+ moan "bytes length() ($bytesbyteslen) not equal to $byteslen\n";
+ }
+ }
+ if ($okay eq 'y') {
+ test_unpack_U($bytes);
+ unless ($WARN == 0) {
+ moan "unpack('U*') false negative\n";
+ print "not ";
+ }
+ } elsif ($okay eq 'n') {
+ test_unpack_U($bytes);
+ unless ($WARN) {
+ moan "unpack('U*') false positive\n";
+ print "not ";
+ }
+ }
+ print "ok $test\n";
+ $test++;
+ } else {
+ moan "unknown format\n";
+ }
+ }
+}
+
diff --git a/t/pragma/warn/utf8 b/t/pragma/warn/utf8
index 6a2fe5446c..012c65529e 100644
--- a/t/pragma/warn/utf8
+++ b/t/pragma/warn/utf8
@@ -24,6 +24,6 @@ my $a = "snstorm" ;
my $a = "snstorm";
}
EXPECT
-Malformed UTF-8 character at - line 3.
-Malformed UTF-8 character at - line 8.
+Malformed UTF-8 character (unexpected non-continuation byte 0x73 after byte 0xf8) at - line 3.
+Malformed UTF-8 character (unexpected non-continuation byte 0x73 after byte 0xf8) at - line 8.
########
diff --git a/toke.c b/toke.c
index 2ec1f8cb30..32073a5842 100644
--- a/toke.c
+++ b/toke.c
@@ -813,10 +813,10 @@ Perl_str_to_version(pTHX_ SV *sv)
bool utf = SvUTF8(sv) ? TRUE : FALSE;
char *end = start + len;
while (start < end) {
- I32 skip;
+ STRLEN skip;
UV n;
if (utf)
- n = utf8_to_uv_chk((U8*)start, &skip, 0);
+ n = utf8_to_uv_chk((U8*)start, len, &skip, 0);
else {
n = *(U8*)start;
skip = 1;
@@ -1188,7 +1188,6 @@ S_scan_const(pTHX_ char *start)
bool dorange = FALSE; /* are we in a translit range? */
bool didrange = FALSE; /* did we just finish a range? */
bool has_utf = FALSE; /* embedded \x{} */
- I32 len; /* ? */
UV uv;
I32 utf = (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op)
@@ -1329,20 +1328,23 @@ S_scan_const(pTHX_ char *start)
/* (now in tr/// code again) */
if (*s & 0x80 && thisutf) {
- (void)utf8_to_uv_chk((U8*)s, &len, 0);
- if (len == 1) {
- /* illegal UTF8, make it valid */
- char *old_pvx = SvPVX(sv);
- /* need space for one extra char (NOTE: SvCUR() not set here) */
- d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx);
- d = (char*)uv_to_utf8((U8*)d, (U8)*s++);
- }
- else {
- while (len--)
- *d++ = *s++;
- }
- has_utf = TRUE;
- continue;
+ STRLEN len;
+ UV uv;
+
+ uv = utf8_to_uv_chk((U8*)s, send - s, &len, 1);
+ if (len == 1) {
+ /* illegal UTF8, make it valid */
+ char *old_pvx = SvPVX(sv);
+ /* need space for one extra char (NOTE: SvCUR() not set here) */
+ d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx);
+ d = (char*)uv_to_utf8((U8*)d, (U8)*s++);
+ }
+ else {
+ while (len--)
+ *d++ = *s++;
+ }
+ has_utf = TRUE;
+ continue;
}
/* backslashes */
@@ -1398,9 +1400,11 @@ S_scan_const(pTHX_ char *start)
/* \132 indicates an octal constant */
case '0': case '1': case '2': case '3':
case '4': case '5': case '6': case '7':
- len = 0; /* disallow underscores */
- uv = (UV)scan_oct(s, 3, &len);
- s += len;
+ {
+ STRLEN len = 0; /* disallow underscores */
+ uv = (UV)scan_oct(s, 3, &len);
+ s += len;
+ }
goto NUM_ESCAPE_INSERT;
/* \x24 indicates a hex constant */
@@ -1412,14 +1416,18 @@ S_scan_const(pTHX_ char *start)
yyerror("Missing right brace on \\x{}");
e = s;
}
- len = 1; /* allow underscores */
- uv = (UV)scan_hex(s + 1, e - s - 1, &len);
- s = e + 1;
+ {
+ STRLEN len = 1; /* allow underscores */
+ uv = (UV)scan_hex(s + 1, e - s - 1, &len);
+ }
+ s = e + 1;
}
else {
- len = 0; /* disallow underscores */
- uv = (UV)scan_hex(s, 2, &len);
- s += len;
+ {
+ STRLEN len = 0; /* disallow underscores */
+ uv = (UV)scan_hex(s, 2, &len);
+ s += len;
+ }
}
NUM_ESCAPE_INSERT:
@@ -1528,8 +1536,10 @@ S_scan_const(pTHX_ char *start)
*d = toCTRL(*d);
d++;
#else
- len = *s++;
- *d++ = toCTRL(len);
+ {
+ U8 c = *s++;
+ *d++ = toCTRL(c);
+ }
#endif
continue;
diff --git a/utf8.c b/utf8.c
index a713ea198b..98236ed170 100644
--- a/utf8.c
+++ b/utf8.c
@@ -153,12 +153,12 @@ Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
}
/*
-=for apidoc Am|U8* s|utf8_to_uv_chk|I32 *retlen|I32 checking
+=for apidoc Am|U8* s|utf8_to_uv_chk|STRLEN curlen|I32 *retlen|I32 checking
Returns the character value of the first character in the string C<s>
-which is assumed to be in UTF8 encoding; C<retlen> will be set to the
-length, in bytes, of that character, and the pointer C<s> will be
-advanced to the end of the character.
+which is assumed to be in UTF8 encoding and no longer than C<curlen>;
+C<retlen> will be set to the length, in bytes, of that character,
+and the pointer C<s> will be advanced to the end of the character.
If C<s> does not point to a well-formed UTF8 character, the behaviour
is dependent on the value of C<checking>: if this is true, it is
@@ -170,79 +170,150 @@ warning is produced.
*/
UV
-Perl_utf8_to_uv_chk(pTHX_ U8* s, I32* retlen, bool checking)
+Perl_utf8_to_uv_chk(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, bool checking)
{
- UV uv = *s;
- int len;
- if (!(uv & 0x80)) {
+ dTHR;
+ UV uv = *s, ouv;
+ STRLEN len = 1;
+ bool dowarn = ckWARN_d(WARN_UTF8);
+ STRLEN expectlen = 0;
+
+ if (uv <= 0x7f) { /* Pure ASCII. */
if (retlen)
*retlen = 1;
return *s;
}
- if (!(uv & 0x40)) {
- dTHR;
- if (checking && retlen) {
- *retlen = -1;
- return 0;
- }
- if (ckWARN_d(WARN_UTF8))
- Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character");
- if (retlen)
- *retlen = 1;
- return *s;
+ if (uv >= 0x80 && uv <= 0xbf) {
+ if (dowarn)
+ Perl_warner(aTHX_ WARN_UTF8,
+ "Malformed UTF-8 character (unexpected continuation byte 0x%02x)",
+ uv);
+ goto malformed;
+ }
+
+ if (uv >= 0xc0 && uv <= 0xfd && curlen > 1 && s[1] < 0x80) {
+ if (dowarn)
+ Perl_warner(aTHX_ WARN_UTF8,
+ "Malformed UTF-8 character (unexpected non-continuation byte 0x%02x after byte 0x%02x)",
+ s[1], uv);
+ goto malformed;
+ }
+
+ if ((uv == 0xfe || uv == 0xff) && IN_UTF8){
+ if (dowarn)
+ Perl_warner(aTHX_ WARN_UTF8,
+ "Malformed UTF-8 character (impossible byte 0x%02x)",
+ uv);
+ goto malformed;
}
- if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
- else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
- else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
- else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
- else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
- else if (!(uv & 0x01)) { len = 7; uv = 0; }
+ if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
+ else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
+ else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
+ else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
+ else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
+ else if (!(uv & 0x01)) { len = 7; uv = 0; }
else { len = 13; uv = 0; } /* whoa! */
if (retlen)
*retlen = len;
- --len;
+
+ expectlen = len;
+
+ if (curlen < expectlen) {
+ if (dowarn)
+ Perl_warner(aTHX_ WARN_UTF8,
+ "Malformed UTF-8 character (%d byte%s, need %d)",
+ curlen, curlen > 1 ? "s" : "", expectlen);
+ goto malformed;
+ }
+
+ len--;
s++;
+ ouv = uv;
+
while (len--) {
if ((*s & 0xc0) != 0x80) {
- dTHR;
- if (checking && retlen) {
- *retlen = -1;
- return 0;
- }
-
- if (ckWARN_d(WARN_UTF8))
- Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character");
- if (retlen)
- *retlen -= len + 1;
- return 0xfffd;
+ if (dowarn)
+ Perl_warner(aTHX_ WARN_UTF8,
+ "Malformed UTF-8 character (unexpected continuation byte 0x%02x)",
+ *s);
+ goto malformed;
}
else
- uv = (uv << 6) | (*s++ & 0x3f);
+ uv = (uv << 6) | (*s & 0x3f);
+ if (uv < ouv) {
+ if (dowarn)
+ Perl_warner(aTHX_ WARN_UTF8,
+ "Malformed UTF-8 character (overflow at 0x%"UVxf", byte 0x%02x)",
+ ouv, *s);
+ goto malformed;
+ }
+ s++;
+ ouv = uv;
+ }
+
+ if (uv >= 0xd800 && uv <= 0xdfff) {
+ if (dowarn)
+ Perl_warner(aTHX_ WARN_UTF8,
+ "Malformed UTF-8 character (UTF-16 surrogate 0x%04"UVxf")",
+ uv);
+ goto malformed;
+ } else if (uv == 0xfffe) {
+ if (dowarn)
+ Perl_warner(aTHX_ WARN_UTF8,
+ "Malformed UTF-8 character (byte order mark 0x%04"UVxf")",
+ uv);
+ goto malformed;
+ } else if (uv == 0xffff && IN_UTF8) {
+ if (dowarn)
+ Perl_warner(aTHX_ WARN_UTF8,
+ "Malformed UTF-8 character (impossible character 0x%04"UVxf")",
+ uv);
+ goto malformed;
+ } else if (expectlen > UTF8LEN(uv)) {
+ if (dowarn)
+ Perl_warner(aTHX_ WARN_UTF8,
+ "Malformed UTF-8 character (%d byte%s, need %d)",
+ expectlen, expectlen > 1 ? "s": "", UTF8LEN(uv));
+ goto malformed;
}
+
return uv;
+
+malformed:
+
+ if (checking) {
+ if (retlen)
+ *retlen = len;
+ return 0;
+ }
+
+ if (retlen)
+ *retlen = -1;
+
+ return UNICODE_REPLACEMENT_CHARACTER;
}
/*
-=for apidoc Am|U8* s|utf8_to_uv|I32 *retlen
+=for apidoc Am|U8* s|utf8_to_uv|STRLEN *retlen
Returns the character value of the first character in the string C<s>
which is assumed to be in UTF8 encoding; C<retlen> will be set to the
length, in bytes, of that character, and the pointer C<s> will be
advanced to the end of the character.
-If C<s> does not point to a well-formed UTF8 character, an optional UTF8
-warning is produced.
+If C<s> does not point to a well-formed UTF8 character, zero is
+returned and retlen is set, if possible, to -1.
=cut
*/
UV
-Perl_utf8_to_uv(pTHX_ U8* s, I32* retlen)
+Perl_utf8_to_uv(pTHX_ U8* s, STRLEN* retlen)
{
- return Perl_utf8_to_uv_chk(aTHX_ s, retlen, 0);
+ return Perl_utf8_to_uv_chk(aTHX_ s, (STRLEN)-1, retlen, 0);
}
/* utf8_distance(a,b) returns the number of UTF8 characters between
@@ -324,7 +395,7 @@ Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
if (*s < 0x80)
*d++ = *s++;
else {
- I32 ulen;
+ STRLEN ulen;
*d++ = (U8)utf8_to_uv(s, &ulen);
s += ulen;
}
@@ -853,7 +924,7 @@ Perl_to_utf8_upper(pTHX_ U8 *p)
if (!PL_utf8_toupper)
PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
uv = swash_fetch(PL_utf8_toupper, p);
- return uv ? uv : utf8_to_uv_chk(p,0,0);
+ return uv ? uv : utf8_to_uv_chk(p,STRLEN_MAX,0,0);
}
UV
@@ -864,7 +935,7 @@ Perl_to_utf8_title(pTHX_ U8 *p)
if (!PL_utf8_totitle)
PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
uv = swash_fetch(PL_utf8_totitle, p);
- return uv ? uv : utf8_to_uv_chk(p,0,0);
+ return uv ? uv : utf8_to_uv_chk(p,STRLEN_MAX,0,0);
}
UV
@@ -875,7 +946,7 @@ Perl_to_utf8_lower(pTHX_ U8 *p)
if (!PL_utf8_tolower)
PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
uv = swash_fetch(PL_utf8_tolower, p);
- return uv ? uv : utf8_to_uv_chk(p,0,0);
+ return uv ? uv : utf8_to_uv_chk(p,STRLEN_MAX,0,0);
}
/* a "swash" is a swatch hash */
@@ -965,7 +1036,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
PUSHMARK(SP);
EXTEND(SP,3);
PUSHs((SV*)sv);
- PUSHs(sv_2mortal(newSViv(utf8_to_uv_chk(ptr, 0, 0) & ~(needents - 1))));
+ PUSHs(sv_2mortal(newSViv(utf8_to_uv_chk(ptr, STRLEN_MAX, 0, 0) & ~(needents - 1))));
PUSHs(sv_2mortal(newSViv(needents)));
PUTBACK;
if (call_method("SWASHGET", G_SCALAR))
diff --git a/utf8.h b/utf8.h
index 7407335806..bb494ab1ef 100644
--- a/utf8.h
+++ b/utf8.h
@@ -29,7 +29,7 @@ END_EXTERN_C
#define UTF8_MAXLEN 13 /* how wide can a single UTF8 encoded character become */
-/*#define IN_UTF8 (PL_curcop->op_private & HINT_UTF8)*/
+#define IN_UTF8 (PL_curcop->op_private & HINT_UTF8)
#define IN_BYTE (PL_curcop->op_private & HINT_BYTE)
#define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTE)
@@ -53,6 +53,8 @@ END_EXTERN_C
(uv) < 0x80000000 ? 6 : 7 )
#endif
+#define UNICODE_REPLACEMENT_CHARACTER 0xfffd
+
/*
* Note: we try to be careful never to call the isXXX_utf8() functions
* unless we're pretty sure we've seen the beginning of a UTF-8 character
diff --git a/util.c b/util.c
index 6c949c7da0..2122d4ea17 100644
--- a/util.c
+++ b/util.c
@@ -2933,7 +2933,7 @@ Perl_same_dirent(pTHX_ char *a, char *b)
#endif /* !HAS_RENAME */
NV
-Perl_scan_bin(pTHX_ char *start, I32 len, I32 *retlen)
+Perl_scan_bin(pTHX_ char *start, STRLEN len, STRLEN *retlen)
{
register char *s = start;
register NV rnv = 0.0;
@@ -3004,7 +3004,7 @@ Perl_scan_bin(pTHX_ char *start, I32 len, I32 *retlen)
}
NV
-Perl_scan_oct(pTHX_ char *start, I32 len, I32 *retlen)
+Perl_scan_oct(pTHX_ char *start, STRLEN len, STRLEN *retlen)
{
register char *s = start;
register NV rnv = 0.0;
@@ -3074,7 +3074,7 @@ Perl_scan_oct(pTHX_ char *start, I32 len, I32 *retlen)
}
NV
-Perl_scan_hex(pTHX_ char *start, I32 len, I32 *retlen)
+Perl_scan_hex(pTHX_ char *start, STRLEN len, STRLEN *retlen)
{
register char *s = start;
register NV rnv = 0.0;