summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJarkko Hietaniemi <jhi@iki.fi>2000-09-14 14:40:40 +0000
committerJarkko Hietaniemi <jhi@iki.fi>2000-09-14 14:40:40 +0000
commit67e989fb549091286d76fd8d29f1ec03b9da175d (patch)
treeb435bb5d55ee1fd063a1afe459e143ab597037ba
parentde6193504aa249326a30bbe962866c18d77ea85d (diff)
downloadperl-67e989fb549091286d76fd8d29f1ec03b9da175d.tar.gz
Batch of UTF-8 patches from Simon Cozens.
p4raw-id: //depot/perl@7075
-rw-r--r--doop.c20
-rw-r--r--embed.h2
-rwxr-xr-xembed.pl2
-rw-r--r--ext/Encode/Encode.xs103
-rw-r--r--handy.h30
-rw-r--r--op.c12
-rw-r--r--pod/perlapi.pod16
-rw-r--r--pp.c14
-rw-r--r--pp_ctl.c6
-rw-r--r--proto.h2
-rw-r--r--regcomp.c6
-rw-r--r--regexec.c8
-rw-r--r--sv.c100
-rw-r--r--toke.c4
-rw-r--r--utf8.c45
15 files changed, 214 insertions, 156 deletions
diff --git a/doop.c b/doop.c
index 77c7324e31..80cc0f6337 100644
--- a/doop.c
+++ b/doop.c
@@ -77,7 +77,7 @@ S_do_trans_simple(pTHX_ SV *sv)
ulen = 1;
/* Need to check this, otherwise 128..255 won't match */
- c = utf8_to_uv(s, &ulen);
+ c = utf8_to_uv(s, &ulen, 0);
if (c < 0x100 && (ch = tbl[(short)c]) >= 0) {
matches++;
if (ch < 0x80)
@@ -125,7 +125,7 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */
I32 ulen;
ulen = 1;
if (hasutf)
- c = utf8_to_uv(s,&ulen);
+ c = utf8_to_uv(s,&ulen, 0);
else
c = *s;
if (c < 0x100 && tbl[c] >= 0)
@@ -364,7 +364,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
}
else if (uv == none) { /* "none" is unmapped character */
I32 ulen;
- *d++ = (U8)utf8_to_uv(s, &ulen);
+ *d++ = (U8)utf8_to_uv(s, &ulen, 0);
s += ulen;
puv = 0xfeedface;
continue;
@@ -405,7 +405,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
}
else if (uv == none) { /* "none" is unmapped character */
I32 ulen;
- *d++ = (U8)utf8_to_uv(s, &ulen);
+ *d++ = (U8)utf8_to_uv(s, &ulen, 0);
s += ulen;
continue;
}
@@ -969,10 +969,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
switch (optype) {
case OP_BIT_AND:
while (lulen && rulen) {
- luc = utf8_to_uv((U8*)lc, &ulen);
+ luc = utf8_to_uv((U8*)lc, &ulen, 0);
lc += ulen;
lulen -= ulen;
- ruc = utf8_to_uv((U8*)rc, &ulen);
+ ruc = utf8_to_uv((U8*)rc, &ulen, 0);
rc += ulen;
rulen -= ulen;
duc = luc & ruc;
@@ -984,10 +984,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
break;
case OP_BIT_XOR:
while (lulen && rulen) {
- luc = utf8_to_uv((U8*)lc, &ulen);
+ luc = utf8_to_uv((U8*)lc, &ulen, 0);
lc += ulen;
lulen -= ulen;
- ruc = utf8_to_uv((U8*)rc, &ulen);
+ ruc = utf8_to_uv((U8*)rc, &ulen, 0);
rc += ulen;
rulen -= ulen;
duc = luc ^ ruc;
@@ -996,10 +996,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
goto mop_up_utf;
case OP_BIT_OR:
while (lulen && rulen) {
- luc = utf8_to_uv((U8*)lc, &ulen);
+ luc = utf8_to_uv((U8*)lc, &ulen, 0);
lc += ulen;
lulen -= ulen;
- ruc = utf8_to_uv((U8*)rc, &ulen);
+ ruc = utf8_to_uv((U8*)rc, &ulen, 0);
rc += ulen;
rulen -= ulen;
duc = luc | ruc;
diff --git a/embed.h b/embed.h
index 9bd72ad9b8..7e030a9199 100644
--- a/embed.h
+++ b/embed.h
@@ -2186,7 +2186,7 @@
#define utf8_hop(a,b) Perl_utf8_hop(aTHX_ a,b)
#define utf8_to_bytes(a,b) Perl_utf8_to_bytes(aTHX_ a,b)
#define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX_ a,b)
-#define utf8_to_uv(a,b) Perl_utf8_to_uv(aTHX_ a,b)
+#define utf8_to_uv(a,b,c) Perl_utf8_to_uv(aTHX_ a,b,c)
#define uv_to_utf8(a,b) Perl_uv_to_utf8(aTHX_ a,b)
#define vivify_defelem(a) Perl_vivify_defelem(aTHX_ a)
#define vivify_ref(a,b) Perl_vivify_ref(aTHX_ a,b)
diff --git a/embed.pl b/embed.pl
index c8e83f85ce..23214a3f26 100755
--- a/embed.pl
+++ b/embed.pl
@@ -2074,7 +2074,7 @@ Ap |I32 |utf8_distance |U8 *a|U8 *b
Ap |U8* |utf8_hop |U8 *s|I32 off
ApM |U8* |utf8_to_bytes |U8 *s|STRLEN *len
ApM |U8* |bytes_to_utf8 |U8 *s|STRLEN *len
-Ap |UV |utf8_to_uv |U8 *s|I32* retlen
+Ap |UV |utf8_to_uv |U8 *s|I32* retlen|bool checking
Ap |U8* |uv_to_utf8 |U8 *d|UV uv
p |void |vivify_defelem |SV* sv
p |void |vivify_ref |SV* sv|U32 to_what
diff --git a/ext/Encode/Encode.xs b/ext/Encode/Encode.xs
index cc0a86a117..5f4a77e6af 100644
--- a/ext/Encode/Encode.xs
+++ b/ext/Encode/Encode.xs
@@ -2,29 +2,104 @@
#include "perl.h"
#include "XSUB.h"
-MODULE = Encode PACKAGE = Encode
+#define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) { \
+ Perl_croak("panic_unimplemented"); \
+ }
+UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
+UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
+
+void call_failure (SV *routine, U8* done, U8* dest, U8* orig);
+
+MODULE = Encode PACKAGE = Encode
PROTOTYPES: ENABLE
-SV *
+I32
_bytes_to_utf8(sv, ...)
- SV * sv
+ SV * sv
CODE:
- {
- SV * encoding = 2 ? ST(1) : Nullsv;
- RETVAL = &PL_sv_undef;
- }
+ {
+ SV * encoding = items == 2 ? ST(1) : Nullsv;
+
+ if (encoding)
+ RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
+ else {
+ STRLEN len;
+ U8* s = SvPV(sv, len);
+ U8* converted;
+
+ converted = bytes_to_utf8(s, &len); /* This allocs */
+ sv_setpvn(sv, converted, len);
+ SvUTF8_on(sv); /* XXX Should we? */
+ Safefree(converted); /* ... so free it */
+ RETVAL = len;
+ }
+ }
OUTPUT:
- RETVAL
+ RETVAL
-SV *
+I32
_utf8_to_bytes(sv, ...)
- SV * sv
+ SV * sv
CODE:
- {
- SV * to = items > 1 ? ST(1) : Nullsv;
- SV * check = items > 2 ? ST(2) : Nullsv;
- RETVAL = &PL_sv_undef;
+ {
+ SV * to = items > 1 ? ST(1) : Nullsv;
+ SV * check = items > 2 ? ST(2) : Nullsv;
+
+ if (to)
+ RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
+ else {
+ U8 *s;
+ STRLEN len;
+ s = SvPV(sv, len);
+
+ if (SvTRUE(check)) {
+ /* Must do things the slow way */
+ U8 *dest;
+ U8 *src = savepv(s); /* We need a copy to pass to check() */
+ U8 *send = s + len;
+
+ New(83, dest, len, U8); /* I think */
+
+ while (s < send) {
+ if (*s < 0x80)
+ *dest++ = *s++;
+ else {
+ I32 ulen;
+ I32 byte;
+ I32 uv = *s++;
+
+ /* Have to do it all ourselves because of error routine,
+ aargh. */
+ if (!(uv & 0x40))
+ goto failure;
+ if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; }
+ else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; }
+ else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; }
+ else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; }
+ else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; }
+ else if (!(uv & 0x01)) { ulen = 7; uv = 0; }
+ else { ulen = 13; uv = 0; }
+
+ /* Note change to utf8.c variable naming, for variety */
+ while (ulen--) {
+ if ((*s & 0xc0) != 0x80)
+ goto failure;
+
+ else
+ uv = (uv << 6) | (*s++ & 0x3f);
+ }
+ if (uv > 256) {
+ failure:
+ call_failure(check, s, dest, src);
+ /* Now what happens? */
+ }
+ *dest++ = (U8)uv;
+ }
+ }
+ } else
+ RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
+ }
}
OUTPUT:
RETVAL
diff --git a/handy.h b/handy.h
index d82b1c609b..c240c4220c 100644
--- a/handy.h
+++ b/handy.h
@@ -448,21 +448,21 @@ Converts the specified character to lowercase.
#define isPSXSPC_utf8(c) (isSPACE_utf8(c) ||(c) == '\f')
#define isBLANK_utf8(c) isBLANK(c) /* could be wrong */
-#define isALNUM_LC_utf8(p) isALNUM_LC_uni(utf8_to_uv(p, 0))
-#define isIDFIRST_LC_utf8(p) isIDFIRST_LC_uni(utf8_to_uv(p, 0))
-#define isALPHA_LC_utf8(p) isALPHA_LC_uni(utf8_to_uv(p, 0))
-#define isSPACE_LC_utf8(p) isSPACE_LC_uni(utf8_to_uv(p, 0))
-#define isDIGIT_LC_utf8(p) isDIGIT_LC_uni(utf8_to_uv(p, 0))
-#define isUPPER_LC_utf8(p) isUPPER_LC_uni(utf8_to_uv(p, 0))
-#define isLOWER_LC_utf8(p) isLOWER_LC_uni(utf8_to_uv(p, 0))
-#define isALNUMC_LC_utf8(p) isALNUMC_LC_uni(utf8_to_uv(p, 0))
-#define isCNTRL_LC_utf8(p) isCNTRL_LC_uni(utf8_to_uv(p, 0))
-#define isGRAPH_LC_utf8(p) isGRAPH_LC_uni(utf8_to_uv(p, 0))
-#define isPRINT_LC_utf8(p) isPRINT_LC_uni(utf8_to_uv(p, 0))
-#define isPUNCT_LC_utf8(p) isPUNCT_LC_uni(utf8_to_uv(p, 0))
-#define toUPPER_LC_utf8(p) toUPPER_LC_uni(utf8_to_uv(p, 0))
-#define toTITLE_LC_utf8(p) toTITLE_LC_uni(utf8_to_uv(p, 0))
-#define toLOWER_LC_utf8(p) toLOWER_LC_uni(utf8_to_uv(p, 0))
+#define isALNUM_LC_utf8(p) isALNUM_LC_uni(utf8_to_uv(p, 0, 0))
+#define isIDFIRST_LC_utf8(p) isIDFIRST_LC_uni(utf8_to_uv(p, 0, 0))
+#define isALPHA_LC_utf8(p) isALPHA_LC_uni(utf8_to_uv(p, 0, 0))
+#define isSPACE_LC_utf8(p) isSPACE_LC_uni(utf8_to_uv(p, 0, 0))
+#define isDIGIT_LC_utf8(p) isDIGIT_LC_uni(utf8_to_uv(p, 0, 0))
+#define isUPPER_LC_utf8(p) isUPPER_LC_uni(utf8_to_uv(p, 0, 0))
+#define isLOWER_LC_utf8(p) isLOWER_LC_uni(utf8_to_uv(p, 0, 0))
+#define isALNUMC_LC_utf8(p) isALNUMC_LC_uni(utf8_to_uv(p, 0, 0))
+#define isCNTRL_LC_utf8(p) isCNTRL_LC_uni(utf8_to_uv(p, 0, 0))
+#define isGRAPH_LC_utf8(p) isGRAPH_LC_uni(utf8_to_uv(p, 0, 0))
+#define isPRINT_LC_utf8(p) isPRINT_LC_uni(utf8_to_uv(p, 0, 0))
+#define isPUNCT_LC_utf8(p) isPUNCT_LC_uni(utf8_to_uv(p, 0, 0))
+#define toUPPER_LC_utf8(p) toUPPER_LC_uni(utf8_to_uv(p, 0, 0))
+#define toTITLE_LC_utf8(p) toTITLE_LC_uni(utf8_to_uv(p, 0, 0))
+#define toLOWER_LC_utf8(p) toLOWER_LC_uni(utf8_to_uv(p, 0, 0))
#define isPSXSPC_LC_utf8(c) (isSPACE_LC_utf8(c) ||(c) == '\f')
#define isBLANK_LC_utf8(c) isBLANK(c) /* could be wrong */
diff --git a/op.c b/op.c
index 74d67e38f0..d24396a8d3 100644
--- a/op.c
+++ b/op.c
@@ -2656,7 +2656,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
qsort(cp, i, sizeof(U8*), utf8compare);
for (j = 0; j < i; j++) {
U8 *s = cp[j];
- UV val = utf8_to_uv(s, &ulen);
+ UV val = utf8_to_uv(s, &ulen, 0);
s += ulen;
diff = val - nextmin;
if (diff > 0) {
@@ -2669,7 +2669,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
}
}
if (*s == 0xff)
- val = utf8_to_uv(s+1, &ulen);
+ val = utf8_to_uv(s+1, &ulen, 0);
if (val >= nextmin)
nextmin = val + 1;
}
@@ -2696,10 +2696,10 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
while (t < tend || tfirst <= tlast) {
/* see if we need more "t" chars */
if (tfirst > tlast) {
- tfirst = (I32)utf8_to_uv(t, &ulen);
+ tfirst = (I32)utf8_to_uv(t, &ulen, 0);
t += ulen;
if (t < tend && *t == 0xff) { /* illegal utf8 val indicates range */
- tlast = (I32)utf8_to_uv(++t, &ulen);
+ tlast = (I32)utf8_to_uv(++t, &ulen, 0);
t += ulen;
}
else
@@ -2709,10 +2709,10 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
/* now see if we need more "r" chars */
if (rfirst > rlast) {
if (r < rend) {
- rfirst = (I32)utf8_to_uv(r, &ulen);
+ rfirst = (I32)utf8_to_uv(r, &ulen, 0);
r += ulen;
if (r < rend && *r == 0xff) { /* illegal utf8 val indicates range */
- rlast = (I32)utf8_to_uv(++r, &ulen);
+ rlast = (I32)utf8_to_uv(++r, &ulen, 0);
r += ulen;
}
else
diff --git a/pod/perlapi.pod b/pod/perlapi.pod
index b1feed3197..ca2ba7c834 100644
--- a/pod/perlapi.pod
+++ b/pod/perlapi.pod
@@ -3182,10 +3182,18 @@ Found in file handy.h
=item U8 *s
-Returns true if first C<len> bytes of the given string form valid a UTF8
-string, false otherwise.
+Returns the character value of the first character in the string C<s>
+which is assumed to be in UTF8 encoding; C<retlen> will be set to the
+length, in bytes, of that character, and the pointer C<s> will be
+advanced to the end of the character.
- is_utf8_string U8 *s(STRLEN len)
+If C<s> does not point to a well-formed UTF8 character, the behaviour
+is dependent on the value of C<checking>: if this is true, it is
+assumed that the caller will raise a warning, and this function will
+set C<retlen> to C<-1> and return. If C<checking> is not true, an optional UTF8
+warning is produced.
+
+ utf8_to_uv U8 *s(I32 *retlen, I32 checking)
=for hackers
Found in file utf8.c
@@ -3195,7 +3203,7 @@ Found in file utf8.c
Converts a string C<s> of length C<len> from UTF8 into byte encoding.
Unlike C<bytes_to_utf8>, this over-writes the original string, and
updates len to contain the new length.
-Returns zero on failure leaving the string and len unchanged
+Returns zero on failure, setting C<len> to -1.
U8 * utf8_to_bytes(U8 *s, STRLEN *len)
diff --git a/pp.c b/pp.c
index d5d5dd88b0..1c5a9638e9 100644
--- a/pp.c
+++ b/pp.c
@@ -2195,7 +2195,7 @@ PP(pp_ord)
I32 retlen;
if ((*tmps & 0x80) && DO_UTF8(tmpsv))
- value = utf8_to_uv(tmps, &retlen);
+ value = utf8_to_uv(tmps, &retlen, 0);
else
value = (UV)(*tmps & 255);
XPUSHu(value);
@@ -2262,7 +2262,7 @@ PP(pp_ucfirst)
I32 ulen;
U8 tmpbuf[UTF8_MAXLEN];
U8 *tend;
- UV uv = utf8_to_uv(s, &ulen);
+ UV uv = utf8_to_uv(s, &ulen, 0);
if (PL_op->op_private & OPpLOCALE) {
TAINT;
@@ -2321,7 +2321,7 @@ PP(pp_lcfirst)
I32 ulen;
U8 tmpbuf[UTF8_MAXLEN];
U8 *tend;
- UV uv = utf8_to_uv(s, &ulen);
+ UV uv = utf8_to_uv(s, &ulen, 0);
if (PL_op->op_private & OPpLOCALE) {
TAINT;
@@ -2398,7 +2398,7 @@ PP(pp_uc)
TAINT;
SvTAINTED_on(TARG);
while (s < send) {
- d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv(s, &ulen)));
+ d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv(s, &ulen, 0)));
s += ulen;
}
}
@@ -2472,7 +2472,7 @@ PP(pp_lc)
TAINT;
SvTAINTED_on(TARG);
while (s < send) {
- d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv(s, &ulen)));
+ d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv(s, &ulen, 0)));
s += ulen;
}
}
@@ -3614,7 +3614,7 @@ PP(pp_unpack)
len = strend - s;
if (checksum) {
while (len-- > 0 && s < strend) {
- auint = utf8_to_uv((U8*)s, &along);
+ auint = utf8_to_uv((U8*)s, &along, 0);
s += along;
if (checksum > 32)
cdouble += (NV)auint;
@@ -3626,7 +3626,7 @@ PP(pp_unpack)
EXTEND(SP, len);
EXTEND_MORTAL(len);
while (len-- > 0 && s < strend) {
- auint = utf8_to_uv((U8*)s, &along);
+ auint = utf8_to_uv((U8*)s, &along, 0);
s += along;
sv = NEWSV(37, 0);
sv_setuv(sv, (UV)auint);
diff --git a/pp_ctl.c b/pp_ctl.c
index 8981bb88c2..3cc74e5281 100644
--- a/pp_ctl.c
+++ b/pp_ctl.c
@@ -2959,13 +2959,13 @@ PP(pp_require)
U8 *s = (U8*)SvPVX(sv);
U8 *end = (U8*)SvPVX(sv) + SvCUR(sv);
if (s < end) {
- rev = utf8_to_uv(s, &len);
+ rev = utf8_to_uv(s, &len, 0);
s += len;
if (s < end) {
- ver = utf8_to_uv(s, &len);
+ ver = utf8_to_uv(s, &len, 0);
s += len;
if (s < end)
- sver = utf8_to_uv(s, &len);
+ sver = utf8_to_uv(s, &len, 0);
}
}
if (PERL_REVISION < rev
diff --git a/proto.h b/proto.h
index ed08b20c8e..6a0229a7fc 100644
--- a/proto.h
+++ b/proto.h
@@ -817,7 +817,7 @@ PERL_CALLCONV I32 Perl_utf8_distance(pTHX_ U8 *a, U8 *b);
PERL_CALLCONV U8* Perl_utf8_hop(pTHX_ U8 *s, I32 off);
PERL_CALLCONV U8* Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len);
PERL_CALLCONV U8* Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len);
-PERL_CALLCONV UV Perl_utf8_to_uv(pTHX_ U8 *s, I32* retlen);
+PERL_CALLCONV UV Perl_utf8_to_uv(pTHX_ U8 *s, I32* retlen, bool checking);
PERL_CALLCONV U8* Perl_uv_to_utf8(pTHX_ U8 *d, UV uv);
PERL_CALLCONV void Perl_vivify_defelem(pTHX_ SV* sv);
PERL_CALLCONV void Perl_vivify_ref(pTHX_ SV* sv, U32 to_what);
diff --git a/regcomp.c b/regcomp.c
index 723cbbe298..c60ab84409 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2881,7 +2881,7 @@ tryagain:
default:
normal_default:
if ((*p & 0xc0) == 0xc0 && UTF) {
- ender = utf8_to_uv((U8*)p, &numlen);
+ ender = utf8_to_uv((U8*)p, &numlen, 0);
p += numlen;
}
else
@@ -3635,12 +3635,12 @@ S_regclassutf8(pTHX)
namedclass = OOB_NAMEDCLASS;
if (!range)
rangebegin = PL_regcomp_parse;
- value = utf8_to_uv((U8*)PL_regcomp_parse, &numlen);
+ value = utf8_to_uv((U8*)PL_regcomp_parse, &numlen, 0);
PL_regcomp_parse += numlen;
if (value == '[')
namedclass = regpposixcc(value);
else if (value == '\\') {
- value = (U32)utf8_to_uv((U8*)PL_regcomp_parse, &numlen);
+ value = (U32)utf8_to_uv((U8*)PL_regcomp_parse, &numlen, 0);
PL_regcomp_parse += numlen;
/* Some compilers cannot handle switching on 64-bit integer
* values, therefore value cannot be an UV. Yes, this will
diff --git a/regexec.c b/regexec.c
index 6401710b99..990791bd6b 100644
--- a/regexec.c
+++ b/regexec.c
@@ -914,7 +914,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
PL_reg_flags |= RF_tainted;
/* FALL THROUGH */
case BOUNDUTF8:
- tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
+ tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0, 0) : '\n';
tmp = ((OP(c) == BOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
while (s < strend) {
if (tmp == !(OP(c) == BOUNDUTF8 ?
@@ -950,7 +950,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
PL_reg_flags |= RF_tainted;
/* FALL THROUGH */
case NBOUNDUTF8:
- tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0) : '\n';
+ tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1), 0, 0) : '\n';
tmp = ((OP(c) == NBOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
while (s < strend) {
if (tmp == !(OP(c) == NBOUNDUTF8 ?
@@ -1995,7 +1995,7 @@ S_regmatch(pTHX_ regnode *prog)
while (s < e) {
if (l >= PL_regeol)
sayNO;
- if (utf8_to_uv((U8*)s, 0) != (c1 ?
+ if (utf8_to_uv((U8*)s, 0, 0) != (c1 ?
toLOWER_utf8((U8*)l) :
toLOWER_LC_utf8((U8*)l)))
{
@@ -2133,7 +2133,7 @@ S_regmatch(pTHX_ regnode *prog)
case NBOUNDUTF8:
/* was last char in word? */
ln = (locinput != PL_regbol)
- ? utf8_to_uv(reghop((U8*)locinput, -1), 0) : PL_regprev;
+ ? utf8_to_uv(reghop((U8*)locinput, -1), 0, 0) : PL_regprev;
if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
ln = isALNUM_uni(ln);
n = swash_fetch(PL_utf8_alnum, (U8*)locinput);
diff --git a/sv.c b/sv.c
index 4da49cc86d..2c45cae0bd 100644
--- a/sv.c
+++ b/sv.c
@@ -2398,6 +2398,7 @@ Perl_sv_utf8_upgrade(pTHX_ register SV *sv)
{
int hicount;
char *c;
+ char *s;
if (!sv || !SvPOK(sv) || SvUTF8(sv))
return;
@@ -2406,30 +2407,16 @@ Perl_sv_utf8_upgrade(pTHX_ register SV *sv)
* to signal if there are any hibit chars in the string
*/
hicount = 0;
- for (c = SvPVX(sv); c < SvEND(sv); c++) {
+ for (c = s = SvPVX(sv); c < SvEND(sv); c++) {
if (*c & 0x80)
hicount++;
}
if (hicount) {
- char *src, *dst;
- SvGROW(sv, SvCUR(sv) + hicount + 1);
-
- src = SvEND(sv) - 1;
- SvCUR_set(sv, SvCUR(sv) + hicount);
- dst = SvEND(sv) - 1;
-
- while (src < dst) {
- if (*src & 0x80) {
- dst--;
- uv_to_utf8((U8*)dst, (U8)*src--);
- dst--;
- }
- else {
- *dst-- = *src--;
- }
- }
-
+ STRLEN len = SvCUR(sv) + 1; /* Plus the \0 */
+ SvPVX(sv) = bytes_to_utf8(s, &len);
+ SvCUR(sv) = len - 1;
+ Safefree(s); /* No longer using what was there before */
SvUTF8_on(sv);
}
}
@@ -2450,46 +2437,14 @@ Perl_sv_utf8_downgrade(pTHX_ register SV* sv, bool fail_ok)
{
if (SvPOK(sv) && SvUTF8(sv)) {
char *c = SvPVX(sv);
- char *first_hi = 0;
- /* need to figure out if this is possible at all first */
- while (c < SvEND(sv)) {
- if (*c & 0x80) {
- I32 len;
- UV uv = utf8_to_uv((U8*)c, &len);
- if (uv >= 256) {
- if (fail_ok)
- return FALSE;
- else {
- /* XXX might want to make a callback here instead */
- Perl_croak(aTHX_ "Big byte");
- }
- }
- if (!first_hi)
- first_hi = c;
- c += len;
- }
- else {
- c++;
- }
- }
-
- if (first_hi) {
- char *src = first_hi;
- char *dst = first_hi;
- while (src < SvEND(sv)) {
- if (*src & 0x80) {
- I32 len;
- U8 u = (U8)utf8_to_uv((U8*)src, &len);
- *dst++ = u;
- src += len;
- }
- else {
- *dst++ = *src++;
- }
- }
- SvCUR_set(sv, dst - SvPVX(sv));
- }
- SvUTF8_off(sv);
+ STRLEN len = SvCUR(sv);
+ if (!utf8_to_bytes(c, &len)) {
+ if (fail_ok)
+ return FALSE;
+ else
+ Perl_croak("big byte");
+ }
+ SvCUR(sv) = len - 1;
}
return TRUE;
}
@@ -2523,24 +2478,15 @@ Perl_sv_utf8_decode(pTHX_ register SV *sv)
* we want to make sure everything inside is valid utf8 first.
*/
c = SvPVX(sv);
+ if (!is_utf8_string(c,SvCUR(c)+1))
+ return FALSE;
+
while (c < SvEND(sv)) {
- if (*c & 0x80) {
- I32 len;
- (void)utf8_to_uv((U8*)c, &len);
- if (len == 1) {
- /* bad utf8 */
- return FALSE;
- }
- c += len;
- has_utf = TRUE;
- }
- else {
- c++;
- }
+ if (*c++ & 0x80) {
+ SvUTF8_on(sv);
+ break;
+ }
}
-
- if (has_utf)
- SvUTF8_on(sv);
}
return TRUE;
}
@@ -6373,7 +6319,7 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
break;
}
if (utf)
- iv = (IV)utf8_to_uv(vecstr, &ulen);
+ iv = (IV)utf8_to_uv(vecstr, &ulen, 0);
else {
iv = *vecstr;
ulen = 1;
@@ -6455,7 +6401,7 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
break;
}
if (utf)
- uv = utf8_to_uv(vecstr, &ulen);
+ uv = utf8_to_uv(vecstr, &ulen, 0);
else {
uv = *vecstr;
ulen = 1;
diff --git a/toke.c b/toke.c
index e5f737b8c7..31f5f0a903 100644
--- a/toke.c
+++ b/toke.c
@@ -812,7 +812,7 @@ Perl_str_to_version(pTHX_ SV *sv)
I32 skip;
UV n;
if (utf)
- n = utf8_to_uv((U8*)start, &skip);
+ n = utf8_to_uv((U8*)start, &skip, 0);
else {
n = *(U8*)start;
skip = 1;
@@ -1323,7 +1323,7 @@ S_scan_const(pTHX_ char *start)
/* (now in tr/// code again) */
if (*s & 0x80 && thisutf) {
- (void)utf8_to_uv((U8*)s, &len);
+ (void)utf8_to_uv((U8*)s, &len, 0);
if (len == 1) {
/* illegal UTF8, make it valid */
char *old_pvx = SvPVX(sv);
diff --git a/utf8.c b/utf8.c
index a9600e95b9..d97a8b003d 100644
--- a/utf8.c
+++ b/utf8.c
@@ -158,8 +158,25 @@ Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
return 1;
}
+/*
+=for apidoc Am|utf8_to_uv|U8 *s|I32 *retlen|I32 checking
+
+Returns the character value of the first character in the string C<s>
+which is assumed to be in UTF8 encoding; C<retlen> will be set to the
+length, in bytes, of that character, and the pointer C<s> will be
+advanced to the end of the character.
+
+If C<s> does not point to a well-formed UTF8 character, the behaviour
+is dependent on the value of C<checking>: if this is true, it is
+assumed that the caller will raise a warning, and this function will
+set C<retlen> to C<-1> and return. If C<checking> is not true, an optional UTF8
+warning is produced.
+
+=cut
+*/
+
UV
-Perl_utf8_to_uv(pTHX_ U8* s, I32* retlen)
+Perl_utf8_to_uv(pTHX_ U8* s, I32* retlen, bool checking)
{
UV uv = *s;
int len;
@@ -170,6 +187,11 @@ Perl_utf8_to_uv(pTHX_ U8* s, I32* retlen)
}
if (!(uv & 0x40)) {
dTHR;
+ if (checking && retlen) {
+ *retlen = -1;
+ return 0;
+ }
+
if (ckWARN_d(WARN_UTF8))
Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character");
if (retlen)
@@ -192,6 +214,11 @@ Perl_utf8_to_uv(pTHX_ U8* s, I32* retlen)
while (len--) {
if ((*s & 0xc0) != 0x80) {
dTHR;
+ if (checking && retlen) {
+ *retlen = -1;
+ return 0;
+ }
+
if (ckWARN_d(WARN_UTF8))
Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character");
if (retlen)
@@ -253,7 +280,7 @@ Perl_utf8_hop(pTHX_ U8 *s, I32 off)
Converts a string C<s> of length C<len> from UTF8 into byte encoding.
Unlike C<bytes_to_utf8>, this over-writes the original string, and
updates len to contain the new length.
-Returns zero on failure leaving the string and len unchanged
+Returns zero on failure, setting C<len> to -1.
=cut
*/
@@ -273,8 +300,10 @@ Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
while (s < send) {
U8 c = *s++;
if (c >= 0x80 &&
- ( (s >= send) || ((*s++ & 0xc0) != 0x80) || ((c & 0xfe) != 0xc2)))
+ ( (s >= send) || ((*s++ & 0xc0) != 0x80) || ((c & 0xfe) != 0xc2))) {
+ *len = -1;
return 0;
+ }
}
s = save;
while (s < send) {
@@ -282,7 +311,7 @@ Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
*d++ = *s++;
else {
I32 ulen;
- *d++ = (U8)utf8_to_uv(s, &ulen);
+ *d++ = (U8)utf8_to_uv(s, &ulen, 0);
s += ulen;
}
}
@@ -810,7 +839,7 @@ Perl_to_utf8_upper(pTHX_ U8 *p)
if (!PL_utf8_toupper)
PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
uv = swash_fetch(PL_utf8_toupper, p);
- return uv ? uv : utf8_to_uv(p,0);
+ return uv ? uv : utf8_to_uv(p,0,0);
}
UV
@@ -821,7 +850,7 @@ Perl_to_utf8_title(pTHX_ U8 *p)
if (!PL_utf8_totitle)
PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
uv = swash_fetch(PL_utf8_totitle, p);
- return uv ? uv : utf8_to_uv(p,0);
+ return uv ? uv : utf8_to_uv(p,0,0);
}
UV
@@ -832,7 +861,7 @@ Perl_to_utf8_lower(pTHX_ U8 *p)
if (!PL_utf8_tolower)
PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
uv = swash_fetch(PL_utf8_tolower, p);
- return uv ? uv : utf8_to_uv(p,0);
+ return uv ? uv : utf8_to_uv(p,0,0);
}
/* a "swash" is a swatch hash */
@@ -922,7 +951,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
PUSHMARK(SP);
EXTEND(SP,3);
PUSHs((SV*)sv);
- PUSHs(sv_2mortal(newSViv(utf8_to_uv(ptr, 0) & ~(needents - 1))));
+ PUSHs(sv_2mortal(newSViv(utf8_to_uv(ptr, 0, 0) & ~(needents - 1))));
PUSHs(sv_2mortal(newSViv(needents)));
PUTBACK;
if (call_method("SWASHGET", G_SCALAR))