summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJarkko Hietaniemi <jhi@iki.fi>2000-10-25 20:00:48 +0000
committerJarkko Hietaniemi <jhi@iki.fi>2000-10-25 20:00:48 +0000
commitdcad28805702d580064bc39a267d63c58bbb3b3f (patch)
treec59311ffbadd7bc18b3b7bcc1b2158652051f134
parentfcc8fcf67e5ea5f08178c9ac86509bc972ef38ff (diff)
downloadperl-dcad28805702d580064bc39a267d63c58bbb3b3f.tar.gz
Continue the internal UTF-8 API tweaking.
Rename utf8_to_uv_chk() back to utf8_to_uv() because it's used much more than the simpler API, now called utf8_to_uv_simple(). Still not quite happy with API, too much partial duplication of functionality. p4raw-id: //depot/perl@7439
-rw-r--r--doop.c23
-rw-r--r--embed.h10
-rwxr-xr-xembed.pl6
-rw-r--r--objXSUB.h8
-rw-r--r--op.c12
-rw-r--r--perlapi.c14
-rw-r--r--pod/perlapi.pod44
-rw-r--r--pod/perlunicode.pod5
-rw-r--r--pp.c18
-rw-r--r--pp_ctl.c6
-rw-r--r--proto.h4
-rw-r--r--regcomp.c6
-rw-r--r--regexec.c8
-rw-r--r--sv.c4
-rw-r--r--toke.c4
-rw-r--r--utf8.c59
16 files changed, 112 insertions, 119 deletions
diff --git a/doop.c b/doop.c
index 3cd8f07b20..fa927c22b1 100644
--- a/doop.c
+++ b/doop.c
@@ -77,7 +77,7 @@ S_do_trans_simple(pTHX_ SV *sv)
ulen = 1;
/* Need to check this, otherwise 128..255 won't match */
- c = utf8_to_uv_chk(s, send - s, &ulen, 0);
+ c = utf8_to_uv(s, send - s, &ulen, 0);
if (c < 0x100 && (ch = tbl[(short)c]) >= 0) {
matches++;
if (ch < 0x80)
@@ -125,7 +125,7 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */
STRLEN ulen;
ulen = 1;
if (hasutf)
- c = utf8_to_uv_chk(s, send - s, &ulen, 0);
+ c = utf8_to_uv(s, send - s, &ulen, 0);
else
c = *s;
if (c < 0x100 && tbl[c] >= 0)
@@ -364,7 +364,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
}
else if (uv == none) { /* "none" is unmapped character */
STRLEN ulen;
- *d++ = (U8)utf8_to_uv_chk(s, send - s, &ulen, 0);
+ *d++ = (U8)utf8_to_uv(s, send - s, &ulen, 0);
s += ulen;
puv = 0xfeedface;
continue;
@@ -405,7 +405,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
}
else if (uv == none) { /* "none" is unmapped character */
STRLEN ulen;
- *d++ = (U8)utf8_to_uv_chk(s, send - s, &ulen, 0);
+ *d++ = (U8)utf8_to_uv(s, send - s, &ulen, 0);
s += ulen;
continue;
}
@@ -550,9 +550,8 @@ Perl_do_vecget(pTHX_ SV *sv, I32 offset, I32 size)
if (size < 1 || (size & (size-1))) /* size < 1 or not a power of two */
Perl_croak(aTHX_ "Illegal number of bits in vec");
- if (SvUTF8(sv)) {
+ if (SvUTF8(sv))
(void) Perl_sv_utf8_downgrade(aTHX_ sv, TRUE);
- }
offset *= size; /* turn into bit offset */
len = (offset + size + 7) / 8; /* required number of bytes */
@@ -969,10 +968,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
switch (optype) {
case OP_BIT_AND:
while (lulen && rulen) {
- luc = utf8_to_uv_chk((U8*)lc, lulen, &ulen, 0);
+ luc = utf8_to_uv((U8*)lc, lulen, &ulen, 0);
lc += ulen;
lulen -= ulen;
- ruc = utf8_to_uv_chk((U8*)rc, rulen, &ulen, 0);
+ ruc = utf8_to_uv((U8*)rc, rulen, &ulen, 0);
rc += ulen;
rulen -= ulen;
duc = luc & ruc;
@@ -984,10 +983,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
break;
case OP_BIT_XOR:
while (lulen && rulen) {
- luc = utf8_to_uv_chk((U8*)lc, lulen, &ulen, 0);
+ luc = utf8_to_uv((U8*)lc, lulen, &ulen, 0);
lc += ulen;
lulen -= ulen;
- ruc = utf8_to_uv_chk((U8*)rc, rulen, &ulen, 0);
+ ruc = utf8_to_uv((U8*)rc, rulen, &ulen, 0);
rc += ulen;
rulen -= ulen;
duc = luc ^ ruc;
@@ -996,10 +995,10 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
goto mop_up_utf;
case OP_BIT_OR:
while (lulen && rulen) {
- luc = utf8_to_uv_chk((U8*)lc, lulen, &ulen, 0);
+ luc = utf8_to_uv((U8*)lc, lulen, &ulen, 0);
lc += ulen;
lulen -= ulen;
- ruc = utf8_to_uv_chk((U8*)rc, rulen, &ulen, 0);
+ ruc = utf8_to_uv((U8*)rc, rulen, &ulen, 0);
rc += ulen;
rulen -= ulen;
duc = luc | ruc;
diff --git a/embed.h b/embed.h
index eab037fccf..a5888193d3 100644
--- a/embed.h
+++ b/embed.h
@@ -729,8 +729,8 @@
#define utf8_hop Perl_utf8_hop
#define utf8_to_bytes Perl_utf8_to_bytes
#define bytes_to_utf8 Perl_bytes_to_utf8
+#define utf8_to_uv_simple Perl_utf8_to_uv_simple
#define utf8_to_uv Perl_utf8_to_uv
-#define utf8_to_uv_chk Perl_utf8_to_uv_chk
#define uv_to_utf8 Perl_uv_to_utf8
#define vivify_defelem Perl_vivify_defelem
#define vivify_ref Perl_vivify_ref
@@ -2189,8 +2189,8 @@
#define utf8_hop(a,b) Perl_utf8_hop(aTHX_ a,b)
#define utf8_to_bytes(a,b) Perl_utf8_to_bytes(aTHX_ a,b)
#define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX_ a,b)
-#define utf8_to_uv(a,b) Perl_utf8_to_uv(aTHX_ a,b)
-#define utf8_to_uv_chk(a,b,c,d) Perl_utf8_to_uv_chk(aTHX_ a,b,c,d)
+#define utf8_to_uv_simple(a,b) Perl_utf8_to_uv_simple(aTHX_ a,b)
+#define utf8_to_uv(a,b,c,d) Perl_utf8_to_uv(aTHX_ a,b,c,d)
#define uv_to_utf8(a,b) Perl_uv_to_utf8(aTHX_ a,b)
#define vivify_defelem(a) Perl_vivify_defelem(aTHX_ a)
#define vivify_ref(a,b) Perl_vivify_ref(aTHX_ a,b)
@@ -4290,10 +4290,10 @@
#define utf8_to_bytes Perl_utf8_to_bytes
#define Perl_bytes_to_utf8 CPerlObj::Perl_bytes_to_utf8
#define bytes_to_utf8 Perl_bytes_to_utf8
+#define Perl_utf8_to_uv_simple CPerlObj::Perl_utf8_to_uv_simple
+#define utf8_to_uv_simple Perl_utf8_to_uv_simple
#define Perl_utf8_to_uv CPerlObj::Perl_utf8_to_uv
#define utf8_to_uv Perl_utf8_to_uv
-#define Perl_utf8_to_uv_chk CPerlObj::Perl_utf8_to_uv_chk
-#define utf8_to_uv_chk Perl_utf8_to_uv_chk
#define Perl_uv_to_utf8 CPerlObj::Perl_uv_to_utf8
#define uv_to_utf8 Perl_uv_to_utf8
#define Perl_vivify_defelem CPerlObj::Perl_vivify_defelem
diff --git a/embed.pl b/embed.pl
index 8f80bbfcaf..62135fccb1 100755
--- a/embed.pl
+++ b/embed.pl
@@ -2074,9 +2074,9 @@ Ap |I32 |utf8_distance |U8 *a|U8 *b
Ap |U8* |utf8_hop |U8 *s|I32 off
ApM |U8* |utf8_to_bytes |U8 *s|STRLEN *len
ApM |U8* |bytes_to_utf8 |U8 *s|STRLEN *len
-Ap |UV |utf8_to_uv |U8 *s|STRLEN* retlen
-Ap |UV |utf8_to_uv_chk |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags
-Ap |U8* |uv_to_utf8 |U8 *d|UV uv
+Ap |UV |utf8_to_uv_simple|U8 *s|STRLEN* retlen
+Ap |UV |utf8_to_uv |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags
+Ap |U8* |uv_to_utf8|U8 *d|UV uv
p |void |vivify_defelem |SV* sv
p |void |vivify_ref |SV* sv|U32 to_what
p |I32 |wait4pid |Pid_t pid|int* statusp|int flags
diff --git a/objXSUB.h b/objXSUB.h
index bc04f03ceb..4d5ff6b8a5 100644
--- a/objXSUB.h
+++ b/objXSUB.h
@@ -1869,14 +1869,14 @@
#define Perl_bytes_to_utf8 pPerl->Perl_bytes_to_utf8
#undef bytes_to_utf8
#define bytes_to_utf8 Perl_bytes_to_utf8
+#undef Perl_utf8_to_uv_simple
+#define Perl_utf8_to_uv_simple pPerl->Perl_utf8_to_uv_simple
+#undef utf8_to_uv_simple
+#define utf8_to_uv_simple Perl_utf8_to_uv_simple
#undef Perl_utf8_to_uv
#define Perl_utf8_to_uv pPerl->Perl_utf8_to_uv
#undef utf8_to_uv
#define utf8_to_uv Perl_utf8_to_uv
-#undef Perl_utf8_to_uv_chk
-#define Perl_utf8_to_uv_chk pPerl->Perl_utf8_to_uv_chk
-#undef utf8_to_uv_chk
-#define utf8_to_uv_chk Perl_utf8_to_uv_chk
#undef Perl_uv_to_utf8
#define Perl_uv_to_utf8 pPerl->Perl_uv_to_utf8
#undef uv_to_utf8
diff --git a/op.c b/op.c
index 9e256a3537..1aa558ee65 100644
--- a/op.c
+++ b/op.c
@@ -2658,7 +2658,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
for (j = 0; j < i; j++) {
U8 *s = cp[j];
I32 cur = j < i ? cp[j+1] - s : tend - s;
- UV val = utf8_to_uv_chk(s, cur, &ulen, 0);
+ UV val = utf8_to_uv(s, cur, &ulen, 0);
s += ulen;
diff = val - nextmin;
if (diff > 0) {
@@ -2671,7 +2671,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
}
}
if (*s == 0xff)
- val = utf8_to_uv_chk(s+1, cur - 1, &ulen, 0);
+ val = utf8_to_uv(s+1, cur - 1, &ulen, 0);
if (val >= nextmin)
nextmin = val + 1;
}
@@ -2698,11 +2698,11 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
while (t < tend || tfirst <= tlast) {
/* see if we need more "t" chars */
if (tfirst > tlast) {
- tfirst = (I32)utf8_to_uv_chk(t, tend - t, &ulen, 0);
+ tfirst = (I32)utf8_to_uv(t, tend - t, &ulen, 0);
t += ulen;
if (t < tend && *t == 0xff) { /* illegal utf8 val indicates range */
t++;
- tlast = (I32)utf8_to_uv_chk(t, tend - t, &ulen, 0);
+ tlast = (I32)utf8_to_uv(t, tend - t, &ulen, 0);
t += ulen;
}
else
@@ -2712,11 +2712,11 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
/* now see if we need more "r" chars */
if (rfirst > rlast) {
if (r < rend) {
- rfirst = (I32)utf8_to_uv_chk(r, rend - r, &ulen, 0);
+ rfirst = (I32)utf8_to_uv(r, rend - r, &ulen, 0);
r += ulen;
if (r < rend && *r == 0xff) { /* illegal utf8 val indicates range */
r++;
- rlast = (I32)utf8_to_uv_chk(r, rend - r, &ulen, 0);
+ rlast = (I32)utf8_to_uv(r, rend - r, &ulen, 0);
r += ulen;
}
else
diff --git a/perlapi.c b/perlapi.c
index 1f1343db47..efa716410c 100644
--- a/perlapi.c
+++ b/perlapi.c
@@ -1327,7 +1327,7 @@ Perl_to_uni_lower_lc(pTHXo_ U32 c)
}
#undef Perl_is_utf8_char
-int
+STRLEN
Perl_is_utf8_char(pTHXo_ U8 *p)
{
return ((CPerlObj*)pPerl)->Perl_is_utf8_char(p);
@@ -3378,18 +3378,18 @@ Perl_bytes_to_utf8(pTHXo_ U8 *s, STRLEN *len)
return ((CPerlObj*)pPerl)->Perl_bytes_to_utf8(s, len);
}
-#undef Perl_utf8_to_uv
+#undef Perl_utf8_to_uv_simple
UV
-Perl_utf8_to_uv(pTHXo_ U8 *s, STRLEN* retlen)
+Perl_utf8_to_uv_simple(pTHXo_ U8 *s, STRLEN* retlen)
{
- return ((CPerlObj*)pPerl)->Perl_utf8_to_uv(s, retlen);
+ return ((CPerlObj*)pPerl)->Perl_utf8_to_uv_simple(s, retlen);
}
-#undef Perl_utf8_to_uv_chk
+#undef Perl_utf8_to_uv
UV
-Perl_utf8_to_uv_chk(pTHXo_ U8 *s, STRLEN curlen, STRLEN* retlen, bool checking)
+Perl_utf8_to_uv(pTHXo_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags)
{
- return ((CPerlObj*)pPerl)->Perl_utf8_to_uv_chk(s, curlen, retlen, checking);
+ return ((CPerlObj*)pPerl)->Perl_utf8_to_uv(s, curlen, retlen, flags);
}
#undef Perl_uv_to_utf8
diff --git a/pod/perlapi.pod b/pod/perlapi.pod
index 730d89f896..634180f7ef 100644
--- a/pod/perlapi.pod
+++ b/pod/perlapi.pod
@@ -2368,19 +2368,19 @@ false, defined or undefined. Does not handle 'get' magic.
=for hackers
Found in file sv.h
-=item svtype
+=item SvTYPE
-An enum of flags for Perl types. These are found in the file B<sv.h>
-in the C<svtype> enum. Test these flags with the C<SvTYPE> macro.
+Returns the type of the SV. See C<svtype>.
+
+ svtype SvTYPE(SV* sv)
=for hackers
Found in file sv.h
-=item SvTYPE
-
-Returns the type of the SV. See C<svtype>.
+=item svtype
- svtype SvTYPE(SV* sv)
+An enum of flags for Perl types. These are found in the file B<sv.h>
+in the C<svtype> enum. Test these flags with the C<SvTYPE> macro.
=for hackers
Found in file sv.h
@@ -3218,32 +3218,32 @@ Found in file utf8.c
=item utf8_to_uv
Returns the character value of the first character in the string C<s>
-which is assumed to be in UTF8 encoding; C<retlen> will be set to the
-length, in bytes, of that character, and the pointer C<s> will be
-advanced to the end of the character.
+which is assumed to be in UTF8 encoding and no longer than C<curlen>;
+C<retlen> will be set to the length, in bytes, of that character,
+and the pointer C<s> will be advanced to the end of the character.
-If C<s> does not point to a well-formed UTF8 character, an optional UTF8
+If C<s> does not point to a well-formed UTF8 character, the behaviour
+is dependent on the value of C<checking>: if this is true, it is
+assumed that the caller will raise a warning, and this function will
+set C<retlen> to C<-1> and return. If C<checking> is not true, an optional UTF8
warning is produced.
- U8* s utf8_to_uv(STRLEN *retlen)
+ U8* s utf8_to_uv(STRLEN curlen, I32 *retlen, U32 flags)
=for hackers
Found in file utf8.c
-=item utf8_to_uv_chk
+=item utf8_to_uv_simple
Returns the character value of the first character in the string C<s>
-which is assumed to be in UTF8 encoding and no longer than C<curlen>;
-C<retlen> will be set to the length, in bytes, of that character,
-and the pointer C<s> will be advanced to the end of the character.
+which is assumed to be in UTF8 encoding; C<retlen> will be set to the
+length, in bytes, of that character, and the pointer C<s> will be
+advanced to the end of the character.
-If C<s> does not point to a well-formed UTF8 character, the behaviour
-is dependent on the value of C<checking>: if this is true, it is
-assumed that the caller will raise a warning, and this function will
-set C<retlen> to C<-1> and return. If C<checking> is not true, an optional UTF8
-warning is produced.
+If C<s> does not point to a well-formed UTF8 character, zero is
+returned and retlen is set, if possible, to -1.
- U8* s utf8_to_uv_chk(STRLEN curlen, I32 *retlen, I32 checking)
+ U8* s utf8_to_uv_simple(STRLEN *retlen)
=for hackers
Found in file utf8.c
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod
index 145c953099..c9954d8e96 100644
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -71,11 +71,6 @@ on Windows.
Regardless of the above, the C<bytes> pragma can always be used to force
byte semantics in a particular lexical scope. See L<bytes>.
-One effect of the C<utf8> pragma is that the internal UTF-8 decoding
-becomes stricter so that the character 0xFFFF (UTF-8 bytes 0xEF 0xBF
-0xBF), and the bytes 0xFE and 0xFF, start to cause warnings if they
-appear in the data.
-
The C<utf8> pragma is primarily a compatibility device that enables
recognition of UTF-8 in literals encountered by the parser. It may also
be used for enabling some of the more experimental Unicode support features.
diff --git a/pp.c b/pp.c
index ba5062704a..6d77ca1bfa 100644
--- a/pp.c
+++ b/pp.c
@@ -1484,7 +1484,7 @@ PP(pp_complement)
send = tmps + len;
while (tmps < send) {
- UV c = utf8_to_uv_chk(tmps, 0, &l, UTF8_ALLOW_ANY);
+ UV c = utf8_to_uv(tmps, 0, &l, UTF8_ALLOW_ANY);
tmps += UTF8SKIP(tmps);
targlen += UNISKIP(~c);
}
@@ -1493,7 +1493,7 @@ PP(pp_complement)
tmps -= len;
Newz(0, result, targlen + 1, U8);
while (tmps < send) {
- UV c = utf8_to_uv_chk(tmps, 0, &l, UTF8_ALLOW_ANY);
+ UV c = utf8_to_uv(tmps, 0, &l, UTF8_ALLOW_ANY);
tmps += UTF8SKIP(tmps);
result = uv_to_utf8(result,(UV)~c);
}
@@ -2240,7 +2240,7 @@ PP(pp_ord)
STRLEN retlen;
if ((*tmps & 0x80) && DO_UTF8(tmpsv))
- value = utf8_to_uv_chk(tmps, len, &retlen, 0);
+ value = utf8_to_uv(tmps, len, &retlen, 0);
else
value = (UV)(*tmps & 255);
XPUSHu(value);
@@ -2307,7 +2307,7 @@ PP(pp_ucfirst)
STRLEN ulen;
U8 tmpbuf[UTF8_MAXLEN];
U8 *tend;
- UV uv = utf8_to_uv_chk(s, slen, &ulen, 0);
+ UV uv = utf8_to_uv(s, slen, &ulen, 0);
if (PL_op->op_private & OPpLOCALE) {
TAINT;
@@ -2366,7 +2366,7 @@ PP(pp_lcfirst)
STRLEN ulen;
U8 tmpbuf[UTF8_MAXLEN];
U8 *tend;
- UV uv = utf8_to_uv_chk(s, slen, &ulen, 0);
+ UV uv = utf8_to_uv(s, slen, &ulen, 0);
if (PL_op->op_private & OPpLOCALE) {
TAINT;
@@ -2443,7 +2443,7 @@ PP(pp_uc)
TAINT;
SvTAINTED_on(TARG);
while (s < send) {
- d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv_chk(s, len, &ulen, 0)));
+ d = uv_to_utf8(d, toUPPER_LC_uni( utf8_to_uv(s, len, &ulen, 0)));
s += ulen;
}
}
@@ -2517,7 +2517,7 @@ PP(pp_lc)
TAINT;
SvTAINTED_on(TARG);
while (s < send) {
- d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv_chk(s, len, &ulen, 0)));
+ d = uv_to_utf8(d, toLOWER_LC_uni( utf8_to_uv(s, len, &ulen, 0)));
s += ulen;
}
}
@@ -3660,7 +3660,7 @@ PP(pp_unpack)
if (checksum) {
while (len-- > 0 && s < strend) {
STRLEN alen;
- auint = utf8_to_uv_chk((U8*)s, strend - s, &alen, 0);
+ auint = utf8_to_uv((U8*)s, strend - s, &alen, 0);
along = alen;
s += along;
if (checksum > 32)
@@ -3674,7 +3674,7 @@ PP(pp_unpack)
EXTEND_MORTAL(len);
while (len-- > 0 && s < strend) {
STRLEN alen;
- auint = utf8_to_uv_chk((U8*)s, strend - s, &alen, 0);
+ auint = utf8_to_uv((U8*)s, strend - s, &alen, 0);
along = alen;
s += along;
sv = NEWSV(37, 0);
diff --git a/pp_ctl.c b/pp_ctl.c
index 33f91eef26..a65cb1b4b6 100644
--- a/pp_ctl.c
+++ b/pp_ctl.c
@@ -2975,13 +2975,13 @@ PP(pp_require)
U8 *s = (U8*)SvPVX(sv);
U8 *end = (U8*)SvPVX(sv) + SvCUR(sv);
if (s < end) {
- rev = utf8_to_uv_chk(s, end - s, &len, 0);
+ rev = utf8_to_uv(s, end - s, &len, 0);
s += len;
if (s < end) {
- ver = utf8_to_uv_chk(s, end - s, &len, 0);
+ ver = utf8_to_uv(s, end - s, &len, 0);
s += len;
if (s < end)
- sver = utf8_to_uv_chk(s, end - s, &len, 0);
+ sver = utf8_to_uv(s, end - s, &len, 0);
}
}
if (PERL_REVISION < rev
diff --git a/proto.h b/proto.h
index 14a6e47cb9..06ef1b1fcc 100644
--- a/proto.h
+++ b/proto.h
@@ -809,8 +809,8 @@ PERL_CALLCONV I32 Perl_utf8_distance(pTHX_ U8 *a, U8 *b);
PERL_CALLCONV U8* Perl_utf8_hop(pTHX_ U8 *s, I32 off);
PERL_CALLCONV U8* Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len);
PERL_CALLCONV U8* Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len);
-PERL_CALLCONV UV Perl_utf8_to_uv(pTHX_ U8 *s, STRLEN* retlen);
-PERL_CALLCONV UV Perl_utf8_to_uv_chk(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags);
+PERL_CALLCONV UV Perl_utf8_to_uv_simple(pTHX_ U8 *s, STRLEN* retlen);
+PERL_CALLCONV UV Perl_utf8_to_uv(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags);
PERL_CALLCONV U8* Perl_uv_to_utf8(pTHX_ U8 *d, UV uv);
PERL_CALLCONV void Perl_vivify_defelem(pTHX_ SV* sv);
PERL_CALLCONV void Perl_vivify_ref(pTHX_ SV* sv, U32 to_what);
diff --git a/regcomp.c b/regcomp.c
index 3f2b10c8fe..19467202f6 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2884,7 +2884,7 @@ tryagain:
default:
normal_default:
if ((*p & 0xc0) == 0xc0 && UTF) {
- ender = utf8_to_uv_chk((U8*)p, PL_regxend - p,
+ ender = utf8_to_uv((U8*)p, PL_regxend - p,
&numlen, 0);
p += numlen;
}
@@ -3639,14 +3639,14 @@ S_regclassutf8(pTHX)
namedclass = OOB_NAMEDCLASS;
if (!range)
rangebegin = PL_regcomp_parse;
- value = utf8_to_uv_chk((U8*)PL_regcomp_parse,
+ value = utf8_to_uv((U8*)PL_regcomp_parse,
PL_regxend - PL_regcomp_parse,
&numlen, 0);
PL_regcomp_parse += numlen;
if (value == '[')
namedclass = regpposixcc(value);
else if (value == '\\') {
- value = (U32)utf8_to_uv_chk((U8*)PL_regcomp_parse,
+ value = (U32)utf8_to_uv((U8*)PL_regcomp_parse,
PL_regxend - PL_regcomp_parse,
&numlen, 0);
PL_regcomp_parse += numlen;
diff --git a/regexec.c b/regexec.c
index 350f432145..a71f1d8601 100644
--- a/regexec.c
+++ b/regexec.c
@@ -917,7 +917,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
PL_reg_flags |= RF_tainted;
/* FALL THROUGH */
case BOUNDUTF8:
- tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1),
+ tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1),
strend - s,
0, 0) : '\n';
tmp = ((OP(c) == BOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
@@ -955,7 +955,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
PL_reg_flags |= RF_tainted;
/* FALL THROUGH */
case NBOUNDUTF8:
- tmp = (I32)(s != startpos) ? utf8_to_uv_chk(reghop((U8*)s, -1),
+ tmp = (I32)(s != startpos) ? utf8_to_uv(reghop((U8*)s, -1),
strend - s,
0, 0) : '\n';
tmp = ((OP(c) == NBOUNDUTF8 ? isALNUM_uni(tmp) : isALNUM_LC_uni(tmp)) != 0);
@@ -2002,7 +2002,7 @@ S_regmatch(pTHX_ regnode *prog)
while (s < e) {
if (l >= PL_regeol)
sayNO;
- if (utf8_to_uv_chk((U8*)s, e - s, 0, 0) != (c1 ?
+ if (utf8_to_uv((U8*)s, e - s, 0, 0) != (c1 ?
toLOWER_utf8((U8*)l) :
toLOWER_LC_utf8((U8*)l)))
{
@@ -2140,7 +2140,7 @@ S_regmatch(pTHX_ regnode *prog)
case NBOUNDUTF8:
/* was last char in word? */
ln = (locinput != PL_regbol)
- ? utf8_to_uv_chk(reghop((U8*)locinput, -1),
+ ? utf8_to_uv(reghop((U8*)locinput, -1),
PL_regeol - locinput, 0, 0) : PL_regprev;
if (OP(scan) == BOUNDUTF8 || OP(scan) == NBOUNDUTF8) {
ln = isALNUM_uni(ln);
diff --git a/sv.c b/sv.c
index 2790cfd859..726f853f08 100644
--- a/sv.c
+++ b/sv.c
@@ -6364,7 +6364,7 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
break;
}
if (utf)
- iv = (IV)utf8_to_uv_chk(vecstr, veclen, &ulen, 0);
+ iv = (IV)utf8_to_uv(vecstr, veclen, &ulen, 0);
else {
iv = *vecstr;
ulen = 1;
@@ -6447,7 +6447,7 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
break;
}
if (utf)
- uv = utf8_to_uv_chk(vecstr, veclen, &ulen, 0);
+ uv = utf8_to_uv(vecstr, veclen, &ulen, 0);
else {
uv = *vecstr;
ulen = 1;
diff --git a/toke.c b/toke.c
index 78ed359890..3572b0edcd 100644
--- a/toke.c
+++ b/toke.c
@@ -816,7 +816,7 @@ Perl_str_to_version(pTHX_ SV *sv)
STRLEN skip;
UV n;
if (utf)
- n = utf8_to_uv_chk((U8*)start, len, &skip, 0);
+ n = utf8_to_uv((U8*)start, len, &skip, 0);
else {
n = *(U8*)start;
skip = 1;
@@ -1331,7 +1331,7 @@ S_scan_const(pTHX_ char *start)
STRLEN len;
UV uv;
- uv = utf8_to_uv_chk((U8*)s, send - s, &len, UTF8_CHECK_ONLY);
+ uv = utf8_to_uv((U8*)s, send - s, &len, UTF8_CHECK_ONLY);
if (len == 1) {
/* Illegal UTF8 (a high-bit byte), make it valid. */
char *old_pvx = SvPVX(sv);
diff --git a/utf8.c b/utf8.c
index 7bb34b764e..80f88466e6 100644
--- a/utf8.c
+++ b/utf8.c
@@ -171,7 +171,7 @@ Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
}
/*
-=for apidoc Am|U8* s|utf8_to_uv_chk|STRLEN curlen|I32 *retlen|U32 flags
+=for apidoc Am|U8* s|utf8_to_uv|STRLEN curlen|I32 *retlen|U32 flags
Returns the character value of the first character in the string C<s>
which is assumed to be in UTF8 encoding and no longer than C<curlen>;
@@ -179,16 +179,15 @@ C<retlen> will be set to the length, in bytes, of that character,
and the pointer C<s> will be advanced to the end of the character.
If C<s> does not point to a well-formed UTF8 character, the behaviour
-is dependent on the value of C<checking>: if this is true, it is
-assumed that the caller will raise a warning, and this function will
-set C<retlen> to C<-1> and return. If C<checking> is not true, an optional UTF8
-warning is produced.
+is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
+it is assumed that the caller will raise a warning, and this function
+will set C<retlen> to C<-1> and return. The C<flags> can also contain
+various flags to allow deviations from the strict UTF-8 encoding.
-=cut
-*/
+=cut */
UV
-Perl_utf8_to_uv_chk(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
+Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
{
dTHR;
UV uv = *s, ouv;
@@ -324,7 +323,7 @@ malformed:
}
/*
-=for apidoc Am|U8* s|utf8_to_uv|STRLEN *retlen
+=for apidoc Am|U8* s|utf8_to_uv_simple|STRLEN *retlen
Returns the character value of the first character in the string C<s>
which is assumed to be in UTF8 encoding; C<retlen> will be set to the
@@ -338,9 +337,9 @@ returned and retlen is set, if possible, to -1.
*/
UV
-Perl_utf8_to_uv(pTHX_ U8* s, STRLEN* retlen)
+Perl_utf8_to_uv_simple(pTHX_ U8* s, STRLEN* retlen)
{
- return Perl_utf8_to_uv_chk(aTHX_ s, (STRLEN)-1, retlen, 0);
+ return Perl_utf8_to_uv(aTHX_ s, (STRLEN)-1, retlen, 0);
}
/* utf8_distance(a,b) returns the number of UTF8 characters between
@@ -400,30 +399,30 @@ Returns zero on failure, setting C<len> to -1.
U8 *
Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
{
- dTHR;
U8 *send;
U8 *d;
- U8 *save;
-
- send = s + *len;
- d = save = s;
+ U8 *save = s;
/* ensure valid UTF8 and chars < 256 before updating string */
- while (s < send) {
- U8 c = *s++;
+ for (send = s + *len; s < send; ) {
+ U8 c = *s++;
+
if (c >= 0x80 &&
- ( (s >= send) || ((*s++ & 0xc0) != 0x80) || ((c & 0xfe) != 0xc2))) {
- *len = -1;
- return 0;
- }
+ ((s >= send) ||
+ ((*s++ & 0xc0) != 0x80) || ((c & 0xfe) != 0xc2))) {
+ *len = -1;
+ return 0;
+ }
}
- s = save;
+
+ d = s = save;
while (s < send) {
- if (*s < 0x80)
- *d++ = *s++;
+ if (*s < 0x80) {
+ *d++ = *s++;
+ }
else {
STRLEN ulen;
- *d++ = (U8)utf8_to_uv(s, &ulen);
+ *d++ = (U8)utf8_to_uv_simple(s, &ulen);
s += ulen;
}
}
@@ -951,7 +950,7 @@ Perl_to_utf8_upper(pTHX_ U8 *p)
if (!PL_utf8_toupper)
PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
uv = swash_fetch(PL_utf8_toupper, p);
- return uv ? uv : utf8_to_uv_chk(p,STRLEN_MAX,0,0);
+ return uv ? uv : utf8_to_uv(p,STRLEN_MAX,0,0);
}
UV
@@ -962,7 +961,7 @@ Perl_to_utf8_title(pTHX_ U8 *p)
if (!PL_utf8_totitle)
PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
uv = swash_fetch(PL_utf8_totitle, p);
- return uv ? uv : utf8_to_uv_chk(p,STRLEN_MAX,0,0);
+ return uv ? uv : utf8_to_uv(p,STRLEN_MAX,0,0);
}
UV
@@ -973,7 +972,7 @@ Perl_to_utf8_lower(pTHX_ U8 *p)
if (!PL_utf8_tolower)
PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
uv = swash_fetch(PL_utf8_tolower, p);
- return uv ? uv : utf8_to_uv_chk(p,STRLEN_MAX,0,0);
+ return uv ? uv : utf8_to_uv(p,STRLEN_MAX,0,0);
}
/* a "swash" is a swatch hash */
@@ -1063,7 +1062,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
PUSHMARK(SP);
EXTEND(SP,3);
PUSHs((SV*)sv);
- PUSHs(sv_2mortal(newSViv(utf8_to_uv_chk(ptr, STRLEN_MAX, 0, 0) & ~(needents - 1))));
+ PUSHs(sv_2mortal(newSViv(utf8_to_uv(ptr, STRLEN_MAX, 0, 0) & ~(needents - 1))));
PUSHs(sv_2mortal(newSViv(needents)));
PUTBACK;
if (call_method("SWASHGET", G_SCALAR))