summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNick Ing-Simmons <nik@tiuk.ti.com>2000-12-09 19:47:30 +0000
committerNick Ing-Simmons <nik@tiuk.ti.com>2000-12-09 19:47:30 +0000
commit7d59b7e40bca518078f3e97c802950b76d52efa2 (patch)
tree53c65c30afe57d62fc8ebfa20197f1c74c0c9fd4
parentbbc28b27949817e8e7461c0a92c6108632259a4b (diff)
downloadperl-7d59b7e40bca518078f3e97c802950b76d52efa2.tar.gz
Make print, syswrite, send, readline, getc honour utf8-ness of PerlIO.
(sysread, recv and write i.e. formats still to do...) Allow :utf8 or :bytes in PerlIO_apply_layers() so that open($fh,">:utf8","name") etc. work. - "applying" those just sets/clears the UTF8 bit of the top layer, so no extra overhead is involved. Tweak t/comp/require.t to add a 'use bytes' to permit its dubious writing of BOM to a non-utf8 stream. Add initial io/utf8.t Fix SvPVutf8() - sv_2pv() was not expecting to be called with something that was already SvPOK() - (we just fossiked with SvUTF8 bit). Fix that and also just use the SvPV macro in sv_2pvutf8() to avoid the issue/overhead. p4raw-id: //depot/perlio@8054
-rw-r--r--MANIFEST1
-rw-r--r--doio.c14
-rw-r--r--perlio.c10
-rw-r--r--pp_sys.c64
-rw-r--r--sv.c34
-rwxr-xr-xt/comp/require.t3
-rwxr-xr-xt/io/utf8.t51
7 files changed, 152 insertions, 25 deletions
diff --git a/MANIFEST b/MANIFEST
index 2f334275be..eaa74253e1 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -1340,6 +1340,7 @@ t/io/pipe.t See if secure pipes work
t/io/print.t See if print commands work
t/io/read.t See if read works
t/io/tell.t See if file seeking works
+t/io/utf8.t See if file seeking works
t/lib/abbrev.t See if Text::Abbrev works
t/lib/ansicolor.t See if Term::ANSIColor works
t/lib/anydbm.t See if AnyDBM_File works
diff --git a/doio.c b/doio.c
index 901ca718d0..d8168e1636 100644
--- a/doio.c
+++ b/doio.c
@@ -1148,12 +1148,14 @@ Perl_do_print(pTHX_ register SV *sv, PerlIO *fp)
}
/* FALL THROUGH */
default:
-#if 0
- /* XXX Fix this when the I/O disciplines arrive. XXX */
- if (DO_UTF8(sv))
- sv_utf8_downgrade(sv, FALSE);
-#endif
- tmps = SvPV(sv, len);
+ if (PerlIO_isutf8(fp)) {
+ tmps = SvPVutf8(sv, len);
+ }
+ else {
+ if (DO_UTF8(sv))
+ sv_utf8_downgrade(sv, FALSE);
+ tmps = SvPV(sv, len);
+ }
break;
}
/* To detect whether the process is about to overstep its
diff --git a/perlio.c b/perlio.c
index 874dece319..278dde1991 100644
--- a/perlio.c
+++ b/perlio.c
@@ -572,6 +572,14 @@ PerlIO_apply_layers(pTHX_ PerlIO *f, const char *mode, const char *names)
}
}
}
+ else if ((e - s) == 4 && strncmp(s,"utf8",4) == 0)
+ {
+ PerlIOBase(f)->flags |= PERLIO_F_UTF8;
+ }
+ else if ((e - s) == 5 && strncmp(s,"bytes",5) == 0)
+ {
+ PerlIOBase(f)->flags &= ~PERLIO_F_UTF8;
+ }
else
{
SV *layer = PerlIO_find_layer(s,e-s);
@@ -606,7 +614,7 @@ PerlIO_binmode(pTHX_ PerlIO *f, int iotype, int mode, const char *names)
{
PerlIO_debug("PerlIO_binmode f=%p %s %c %x %s\n",
f,PerlIOBase(f)->tab->name,iotype,mode, (names) ? names : "(Null)");
- if (!names || (O_TEXT != O_BINARY && mode & O_BINARY))
+ if (!names || (O_TEXT != O_BINARY && (mode & O_BINARY)))
{
PerlIO *top = f;
PerlIOl *l;
diff --git a/pp_sys.c b/pp_sys.c
index 314b8851fd..e4640bef3b 100644
--- a/pp_sys.c
+++ b/pp_sys.c
@@ -1129,6 +1129,16 @@ PP(pp_getc)
TAINT;
sv_setpv(TARG, " ");
*SvPVX(TARG) = PerlIO_getc(IoIFP(GvIOp(gv))); /* should never be EOF */
+ if (PerlIO_isutf8(IoIFP(GvIOp(gv)))) {
+ /* Find out how many bytes the char needs */
+ Size_t len = UTF8SKIP(SvPVX(TARG));
+ if (len > 1) {
+ SvGROW(TARG,len+1);
+ len = PerlIO_read(IoIFP(GvIOp(gv)),SvPVX(TARG)+1,len-1);
+ SvCUR_set(TARG,1+len);
+ }
+ SvUTF8_on(TARG);
+ }
PUSHTARG;
RETURN;
}
@@ -1490,10 +1500,7 @@ PP(pp_sysread)
bufsv = *++MARK;
if (! SvOK(bufsv))
sv_setpvn(bufsv, "", 0);
- buffer = SvPV_force(bufsv, blen);
length = SvIVx(*++MARK);
- if (length < 0)
- DIE(aTHX_ "Negative length");
SETERRNO(0,0);
if (MARK < SP)
offset = SvIVx(*++MARK);
@@ -1502,6 +1509,15 @@ PP(pp_sysread)
io = GvIO(gv);
if (!io || !IoIFP(io))
goto say_undef;
+ if (PerlIO_isutf8(IoIFP(io))) {
+ buffer = SvPVutf8_force(bufsv, blen);
+ }
+ else {
+ buffer = SvPV_force(bufsv, blen);
+ }
+ if (length < 0)
+ DIE(aTHX_ "Negative length");
+
#ifdef HAS_SOCKET
if (PL_op->op_type == OP_RECV) {
char namebuf[MAXPATHLEN];
@@ -1514,10 +1530,6 @@ PP(pp_sysread)
if (bufsize >= 256)
bufsize = 255;
#endif
-#ifdef OS2 /* At least Warp3+IAK: only the first byte of bufsize set */
- if (bufsize >= 256)
- bufsize = 255;
-#endif
buffer = SvGROW(bufsv, length+1);
/* 'offset' means 'flags' here */
length = PerlSock_recvfrom(PerlIO_fileno(IoIFP(io)), buffer, length, offset,
@@ -1540,6 +1552,10 @@ PP(pp_sysread)
if (PL_op->op_type == OP_RECV)
DIE(aTHX_ PL_no_sock_func, "recv");
#endif
+ if (SvUTF8(bufsv) && offset) {
+ /* FIXME ! */
+ Perl_croak(aTHX_ "Non zero offset not supported yet for utf8");
+ }
if (offset < 0) {
if (-offset > blen)
DIE(aTHX_ "Offset outside string");
@@ -1642,7 +1658,6 @@ PP(pp_send)
char *buffer;
Size_t length;
SSize_t retval;
- IV offset;
STRLEN blen;
MAGIC *mg;
@@ -1664,7 +1679,6 @@ PP(pp_send)
if (!gv)
goto say_undef;
bufsv = *++MARK;
- buffer = SvPV(bufsv, blen);
#if Size_t_size > IVSIZE
length = (Size_t)SvNVx(*++MARK);
#else
@@ -1678,8 +1692,24 @@ PP(pp_send)
retval = -1;
if (ckWARN(WARN_CLOSED))
report_evil_fh(gv, io, PL_op->op_type);
+ goto say_undef;
+ }
+
+ if (PerlIO_isutf8(IoIFP(io))) {
+ buffer = SvPVutf8(bufsv, blen);
}
- else if (PL_op->op_type == OP_SYSWRITE) {
+ else {
+ if (DO_UTF8(bufsv))
+ sv_utf8_downgrade(bufsv, FALSE);
+ buffer = SvPV(bufsv, blen);
+ }
+
+ if (PL_op->op_type == OP_SYSWRITE) {
+ IV offset;
+ if (DO_UTF8(bufsv)) {
+ /* length and offset are in chars */
+ blen = sv_len_utf8(bufsv);
+ }
if (MARK < SP) {
offset = SvIVx(*++MARK);
if (offset < 0) {
@@ -1692,17 +1722,24 @@ PP(pp_send)
offset = 0;
if (length > blen - offset)
length = blen - offset;
+ if (DO_UTF8(bufsv)) {
+ buffer = utf8_hop((U8 *)buffer, offset);
+ length = utf8_hop((U8 *)buffer, length) - (U8 *)buffer;
+ }
+ else {
+ buffer = buffer+offset;
+ }
#ifdef PERL_SOCK_SYSWRITE_IS_SEND
if (IoTYPE(io) == IoTYPE_SOCKET) {
retval = PerlSock_send(PerlIO_fileno(IoIFP(io)),
- buffer+offset, length, 0);
+ buffer, length, 0);
}
else
#endif
{
/* See the note at doio.c:do_print about filesize limits. --jhi */
retval = PerlLIO_write(PerlIO_fileno(IoIFP(io)),
- buffer+offset, length);
+ buffer, length);
}
}
#ifdef HAS_SOCKET
@@ -1710,12 +1747,13 @@ PP(pp_send)
char *sockbuf;
STRLEN mlen;
sockbuf = SvPVx(*++MARK, mlen);
+ /* length is really flags */
retval = PerlSock_sendto(PerlIO_fileno(IoIFP(io)), buffer, blen,
length, (struct sockaddr *)sockbuf, mlen);
}
else
+ /* length is really flags */
retval = PerlSock_send(PerlIO_fileno(IoIFP(io)), buffer, blen, length);
-
#else
else
DIE(aTHX_ PL_no_sock_func, "send");
diff --git a/sv.c b/sv.c
index 87da8f7a3c..3d25a2eede 100644
--- a/sv.c
+++ b/sv.c
@@ -2192,7 +2192,11 @@ Perl_sv_2pv(pTHX_ register SV *sv, STRLEN *lp)
return "";
}
}
- if (SvNOKp(sv)) { /* See note in sv_2uv() */
+ if (SvPOK(sv)) {
+ *lp = SvCUR(sv);
+ return SvPVX(sv);
+ }
+ else if (SvNOKp(sv)) { /* See note in sv_2uv() */
/* XXXX 64-bit? IV may have better precision... */
/* I tried changing this to be 64-bit-aware and
* the t/op/numconvert.t became very, very, angry.
@@ -2328,7 +2332,7 @@ char *
Perl_sv_2pvutf8(pTHX_ register SV *sv, STRLEN *lp)
{
sv_utf8_upgrade(sv);
- return sv_2pv(sv,lp);
+ return SvPV(sv,*lp);
}
/* This function is only called on magical items */
@@ -4330,14 +4334,31 @@ Perl_sv_gets(pTHX_ register SV *sv, register PerlIO *fp, I32 append)
#endif
SvCUR_set(sv, bytesread);
buffer[bytesread] = '\0';
+ if (PerlIO_isutf8(fp))
+ SvUTF8_on(sv);
+ else
+ SvUTF8_off(sv);
return(SvCUR(sv) ? SvPVX(sv) : Nullch);
}
else if (RsPARA(PL_rs)) {
rsptr = "\n\n";
rslen = 2;
}
- else
- rsptr = SvPV(PL_rs, rslen);
+ else {
+ /* Get $/ i.e. PL_rs into same encoding as stream wants */
+ if (PerlIO_isutf8(fp)) {
+ rsptr = SvPVutf8(PL_rs, rslen);
+ }
+ else {
+ if (SvUTF8(PL_rs)) {
+ if (!sv_utf8_downgrade(PL_rs, TRUE)) {
+ Perl_croak(aTHX_ "Wide character in $/");
+ }
+ }
+ rsptr = SvPV(PL_rs, rslen);
+ }
+ }
+
rslast = rslen ? rsptr[rslen - 1] : '\0';
if (RsPARA(PL_rs)) { /* have to do this both before and after */
@@ -4556,6 +4577,11 @@ screamer2:
}
}
+ if (PerlIO_isutf8(fp))
+ SvUTF8_on(sv);
+ else
+ SvUTF8_off(sv);
+
return (SvCUR(sv) - append) ? SvPVX(sv) : Nullch;
}
diff --git a/t/comp/require.t b/t/comp/require.t
index eaea3ad5f6..e634532275 100755
--- a/t/comp/require.t
+++ b/t/comp/require.t
@@ -21,6 +21,7 @@ sub write_file {
my $f = shift;
open(REQ,">$f") or die "Can't write '$f': $!";
binmode REQ;
+ use bytes;
print REQ @_;
close REQ;
}
@@ -132,7 +133,7 @@ $i++; do_require(qq(${utf8}print "ok $i\n"; 1;\n));
sub bytes_to_utf16 {
my $utf16 = pack("$_[0]*", unpack("C*", $_[1]));
- return @_ == 3 && $_[2] ? pack("$_[0]", 0xFEFF) . $utf16 : $utf16;
+ return @_ == 3 && $_[2] ? pack("$_[0]", 0xFEFF) . $utf16 : $utf16;
}
$i++; do_require(bytes_to_utf16('n', qq(print "ok $i\\n"; 1;\n), 1)); # BE
diff --git a/t/io/utf8.t b/t/io/utf8.t
new file mode 100755
index 0000000000..1e47c33d3f
--- /dev/null
+++ b/t/io/utf8.t
@@ -0,0 +1,51 @@
+#!./perl
+
+BEGIN {
+ chdir 't' if -d 't';
+ @INC = '../lib';
+ require Config; import Config;
+ unless ($Config{'useperlio'}) {
+ print "1..0 # Skip: not perlio\n";
+ exit 0;
+ }
+}
+
+$| = 1;
+print "1..11\n";
+
+open(F,"+>:utf8",'a');
+print F chr(0x100).'£';
+print '#'.tell(F)."\n";
+print "not " unless tell(F) == 4;
+print "ok 1\n";
+print F "\n";
+print '#'.tell(F)."\n";
+print "not " unless tell(F) >= 5;
+print "ok 2\n";
+seek(F,0,0);
+print "not " unless getc(F) eq chr(0x100);
+print "ok 3\n";
+print "not " unless getc(F) eq "£";
+print "ok 4\n";
+print "not " unless getc(F) eq "\n";
+print "ok 5\n";
+seek(F,0,0);
+binmode(F,":bytes");
+print "not " unless getc(F) eq chr(0xc4);
+print "ok 6\n";
+print "not " unless getc(F) eq chr(0x80);
+print "ok 7\n";
+print "not " unless getc(F) eq chr(0xc2);
+print "ok 8\n";
+print "not " unless getc(F) eq chr(0xa3);
+print "ok 9\n";
+print "not " unless getc(F) eq "\n";
+print "ok 10\n";
+seek(F,0,0);
+binmode(F,":utf8");
+print "not " unless scalar(<F>) eq "\x{100}£\n";
+print "ok 11\n";
+close(F);
+
+# unlink('a');
+