summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--doop.c25
-rwxr-xr-xembed.pl2
-rw-r--r--perlapi.c2
-rw-r--r--pod/perlapi.pod9
-rw-r--r--pod/perldiag.pod5
-rw-r--r--pod/perlfunc.pod3
-rw-r--r--proto.h2
-rwxr-xr-xt/op/vec.t31
-rw-r--r--utf8.c33
9 files changed, 89 insertions, 23 deletions
diff --git a/doop.c b/doop.c
index 3c86690bfb..46ffc1b284 100644
--- a/doop.c
+++ b/doop.c
@@ -537,7 +537,8 @@ Perl_do_sprintf(pTHX_ SV *sv, I32 len, SV **sarg)
SvTAINTED_on(sv);
}
-/* XXX SvUTF8 support missing! */
+/* currently converts input to bytes if needed and croaks if a character
+ > 255 is encountered */
UV
Perl_do_vecget(pTHX_ SV *sv, I32 offset, I32 size)
{
@@ -549,6 +550,16 @@ Perl_do_vecget(pTHX_ SV *sv, I32 offset, I32 size)
return retnum;
if (size < 1 || (size & (size-1))) /* size < 1 or not a power of two */
Perl_croak(aTHX_ "Illegal number of bits in vec");
+
+ if (SvUTF8(sv)) {
+ if (Perl_utf8_to_bytes(aTHX_ (U8*) s, &srclen)) {
+ SvUTF8_off(sv);
+ SvCUR_set(sv, srclen);
+ }
+ else
+ Perl_croak(aTHX_ "Character > 255 in vec()");
+ }
+
offset *= size; /* turn into bit offset */
len = (offset + size + 7) / 8; /* required number of bytes */
if (len > srclen) {
@@ -670,7 +681,8 @@ Perl_do_vecget(pTHX_ SV *sv, I32 offset, I32 size)
return retnum;
}
-/* XXX SvUTF8 support missing! */
+/* currently converts input to bytes if needed and croaks if a character
+ > 255 is encountered */
void
Perl_do_vecset(pTHX_ SV *sv)
{
@@ -686,6 +698,15 @@ Perl_do_vecset(pTHX_ SV *sv)
if (!targ)
return;
s = (unsigned char*)SvPV_force(targ, targlen);
+ if (SvUTF8(targ)) {
+ if (Perl_utf8_to_bytes(aTHX_ (U8*) s, &targlen)) {
+ /* SvUTF8_off(targ); SvPOK_only below ensures this */
+ SvCUR_set(targ, targlen);
+ }
+ else
+ Perl_croak(aTHX_ "Character > 255 in vec()");
+ }
+
(void)SvPOK_only(targ);
lval = SvUV(sv);
offset = LvTARGOFF(sv);
diff --git a/embed.pl b/embed.pl
index 93534359b3..b99a59f3cd 100755
--- a/embed.pl
+++ b/embed.pl
@@ -2071,7 +2071,7 @@ Ap |U8* |utf16_to_utf8 |U8* p|U8 *d|I32 bytelen|I32 *newlen
Ap |U8* |utf16_to_utf8_reversed|U8* p|U8 *d|I32 bytelen|I32 *newlen
Ap |I32 |utf8_distance |U8 *a|U8 *b
Ap |U8* |utf8_hop |U8 *s|I32 off
-ApM |U8* |utf8_to_bytes |U8 *s|STRLEN len
+ApM |U8* |utf8_to_bytes |U8 *s|STRLEN *len
ApM |U8* |bytes_to_utf8 |U8 *s|STRLEN *len
Ap |UV |utf8_to_uv |U8 *s|I32* retlen
Ap |U8* |uv_to_utf8 |U8 *d|UV uv
diff --git a/perlapi.c b/perlapi.c
index 2fca6bc02d..3257fecabb 100644
--- a/perlapi.c
+++ b/perlapi.c
@@ -3359,7 +3359,7 @@ Perl_utf8_hop(pTHXo_ U8 *s, I32 off)
#undef Perl_utf8_to_bytes
U8*
-Perl_utf8_to_bytes(pTHXo_ U8 *s, STRLEN len)
+Perl_utf8_to_bytes(pTHXo_ U8 *s, STRLEN *len)
{
return ((CPerlObj*)pPerl)->Perl_utf8_to_bytes(s, len);
}
diff --git a/pod/perlapi.pod b/pod/perlapi.pod
index 3eda765382..b6dab89350 100644
--- a/pod/perlapi.pod
+++ b/pod/perlapi.pod
@@ -3115,11 +3115,12 @@ Found in file utf8.c
=item utf8_to_bytes
-Converts a string C<s> of length C<len> from UTF8 into ASCII encoding.
-Unlike C<bytes_to_utf8>, this over-writes the original string.
-Returns zero on failure after converting as much as possible.
+Converts a string C<s> of length C<len> from UTF8 into byte encoding.
+Unlike C<bytes_to_utf8>, this over-writes the original string, and
+updates len to contain the new length.
+Returns zero on failure leaving the string and len unchanged
- U8 * utf8_to_bytes(U8 *s, STRLEN len)
+ U8 * utf8_to_bytes(U8 *s, STRLEN *len)
=for hackers
Found in file utf8.c
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index 27711f5416..960170d81f 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -1043,6 +1043,11 @@ references can be weakened.
with an assignment operator, which implies modifying the value itself.
Perhaps you need to copy the value to a temporary, and repeat that.
+=item Character > 255 in vec()
+
+(F) You applied the vec() function to a UTF8 string which contained
+a character > 255. vec() currently only operates on characters < 256.
+
=item chmod() mode argument is missing initial 0
(W chmod) A novice will sometimes say
diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod
index a1381f1d3a..c3ba7366da 100644
--- a/pod/perlfunc.pod
+++ b/pod/perlfunc.pod
@@ -5510,6 +5510,9 @@ If an element off the end of the string is written to, Perl will first
extend the string with sufficiently many zero bytes. It is an error
to try to write off the beginning of the string (i.e. negative OFFSET).
+The string must not contain any character with value > 255 (which
+can only happen if you're using UTF8 encoding).
+
Strings created with C<vec> can also be manipulated with the logical
operators C<|>, C<&>, C<^>, and C<~>. These operators will assume a bit
vector operation is desired when both operands are strings.
diff --git a/proto.h b/proto.h
index 841e32a32e..931997cf96 100644
--- a/proto.h
+++ b/proto.h
@@ -814,7 +814,7 @@ PERL_CALLCONV U8* Perl_utf16_to_utf8(pTHX_ U8* p, U8 *d, I32 bytelen, I32 *newle
PERL_CALLCONV U8* Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8 *d, I32 bytelen, I32 *newlen);
PERL_CALLCONV I32 Perl_utf8_distance(pTHX_ U8 *a, U8 *b);
PERL_CALLCONV U8* Perl_utf8_hop(pTHX_ U8 *s, I32 off);
-PERL_CALLCONV U8* Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN len);
+PERL_CALLCONV U8* Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len);
PERL_CALLCONV U8* Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len);
PERL_CALLCONV UV Perl_utf8_to_uv(pTHX_ U8 *s, I32* retlen);
PERL_CALLCONV U8* Perl_uv_to_utf8(pTHX_ U8 *d, UV uv);
diff --git a/t/op/vec.t b/t/op/vec.t
index 52b20cd7fa..b75bebfade 100755
--- a/t/op/vec.t
+++ b/t/op/vec.t
@@ -1,6 +1,6 @@
#!./perl
-print "1..23\n";
+print "1..30\n";
print vec($foo,0,1) == 0 ? "ok 1\n" : "not ok 1\n";
print length($foo) == 0 ? "ok 2\n" : "not ok 2\n";
@@ -48,3 +48,32 @@ print "not " if defined $x or $@ !~ /^Assigning to negative offset in vec/;
print "ok 22\n";
print "not " if vec('abcd', 7, 8);
print "ok 23\n";
+
+# UTF8
+# N.B. currently curiously coded to circumvent bugs elswhere in UTF8 handling
+
+$foo = "\x{100}" . "\xff\xfe";
+$x = substr $foo, 1;
+print "not " if vec($x, 0, 8) != 255;
+print "ok 24\n";
+eval { vec($foo, 1, 8) };
+print "not " unless $@ =~ /^Character > 255 in vec\(\) /;
+print "ok 25\n";
+eval { vec($foo, 1, 8) = 13 };
+print "not " unless $@ =~ /^Character > 255 in vec\(\) /;
+print "ok 26\n";
+print "not " if $foo ne "\x{100}" . "\xff\xfe";
+print "ok 27\n";
+$x = substr $foo, 1;
+vec($x, 2, 4) = 7;
+print "not " if $x ne "\xff\xf7";
+print "ok 28\n";
+
+# mixed magic
+
+$foo = "\x61\x62\x63\x64\x65\x66";
+print "not " if vec(substr($foo, 2, 2), 0, 16) != 25444;
+print "ok 29\n";
+vec(substr($foo, 1,3), 5, 4) = 3;
+print "not " if $foo ne "\x61\x62\x63\x34\x65\x66";
+print "ok 30\n";
diff --git a/utf8.c b/utf8.c
index e86c49fdcf..c109f6529f 100644
--- a/utf8.c
+++ b/utf8.c
@@ -204,7 +204,8 @@ Perl_utf8_to_uv(pTHX_ U8* s, I32* retlen)
return uv;
}
-/* utf8_distance(a,b) is intended to be a - b in pointer arithmetic */
+/* utf8_distance(a,b) returns the number of UTF8 characters between
+ the pointers a and b */
I32
Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
@@ -247,40 +248,46 @@ Perl_utf8_hop(pTHX_ U8 *s, I32 off)
}
/*
-=for apidoc Am|U8 *|utf8_to_bytes|U8 *s|STRLEN len
+=for apidoc Am|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
-Converts a string C<s> of length C<len> from UTF8 into ASCII encoding.
-Unlike C<bytes_to_utf8>, this over-writes the original string.
-Returns zero on failure after converting as much as possible.
+Converts a string C<s> of length C<len> from UTF8 into byte encoding.
+Unlike C<bytes_to_utf8>, this over-writes the original string, and
+updates len to contain the new length.
+Returns zero on failure leaving the string and len unchanged
=cut
*/
U8 *
-Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN len)
+Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
{
dTHR;
U8 *send;
U8 *d;
U8 *save;
- send = s + len;
+ send = s + *len;
d = save = s;
+
+ /* ensure valid UTF8 and chars < 256 before updating string */
+ while (s < send) {
+ U8 c = *s++;
+ if (c >= 0x80 &&
+ ( (s >= send) || ((*s++ & 0xc0) != 0x80) || ((c & 0xfe) != 0xc2)))
+ return 0;
+ }
+ s = save;
while (s < send) {
if (*s < 0x80)
*d++ = *s++;
else {
I32 ulen;
- UV uv = utf8_to_uv(s, &ulen);
- if (uv > 255) {
- *d = '\0';
- return 0;
- }
+ *d++ = (U8)utf8_to_uv(s, &ulen);
s += ulen;
- *d++ = (U8)uv;
}
}
*d = '\0';
+ *len = d - save;
return save;
}