summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJarkko Hietaniemi <jhi@iki.fi>2000-12-29 07:08:32 +0000
committerJarkko Hietaniemi <jhi@iki.fi>2000-12-29 07:08:32 +0000
commitc512ce4f7f4a9bd0f491f91cb5a15fcb65ee37d9 (patch)
treefaecae30ec45ee9fe9180f727c600de58a3a50d2
parenta7514e1ec900a5b60cda6ed25728476973d26ae0 (diff)
downloadperl-c512ce4f7f4a9bd0f491f91cb5a15fcb65ee37d9.tar.gz
(Retracted by #8264) Externally: join() was still quite UTF-8-unaware.
Internally: sv_catsv() wasn't quite okay on UTF-8, it assumed that the only cases to care about are byte+byte and byte+character. TODO: See how well pp_concat() could be implemented in terms of sv_catsv(). p4raw-id: //depot/perl@8248
-rw-r--r--doop.c7
-rw-r--r--sv.c80
-rwxr-xr-xt/op/join.t22
-rw-r--r--utf8.h13
4 files changed, 88 insertions, 34 deletions
diff --git a/doop.c b/doop.c
index ea65a68eab..3548556a89 100644
--- a/doop.c
+++ b/doop.c
@@ -504,8 +504,6 @@ Perl_do_join(pTHX_ register SV *sv, SV *del, register SV **mark, register SV **s
}
if (items-- > 0) {
- char *s;
-
sv_setpv(sv, "");
if (*mark)
sv_catsv(sv, *mark);
@@ -513,10 +511,9 @@ Perl_do_join(pTHX_ register SV *sv, SV *del, register SV **mark, register SV **s
}
else
sv_setpv(sv,"");
- len = delimlen;
- if (len) {
+ if (delimlen) {
for (; items > 0; items--,mark++) {
- sv_catpvn(sv,delim,len);
+ sv_catsv(sv,del);
sv_catsv(sv,*mark);
}
}
diff --git a/sv.c b/sv.c
index 4794596d38..97ee2ada53 100644
--- a/sv.c
+++ b/sv.c
@@ -2934,7 +2934,7 @@ Perl_sv_utf8_upgrade(pTHX_ register SV *sv)
char *s, *t, *e;
int hibit = 0;
- if (!sv || !SvPOK(sv) || SvUTF8(sv))
+ if (!sv || !SvPOK(sv) || !SvCUR(sv) || SvUTF8(sv))
return;
/* This function could be much more efficient if we had a FLAG in SVs
@@ -3755,20 +3755,54 @@ C<dsv>. Handles 'get' magic, but not 'set' magic. See C<sv_catsv_mg>.
*/
void
-Perl_sv_catsv(pTHX_ SV *dstr, register SV *sstr)
+Perl_sv_catsv(pTHX_ SV *dsv, register SV *ssv)
{
- char *s;
- STRLEN len;
- if (!sstr)
+ if (!ssv)
return;
- if ((s = SvPV(sstr, len))) {
- if (DO_UTF8(sstr)) {
- sv_utf8_upgrade(dstr);
- sv_catpvn(dstr,s,len);
- SvUTF8_on(dstr);
+ else {
+ STRLEN slen;
+ char *spv;
+
+ if ((spv = SvPV(ssv, slen))) {
+ bool dutf8 = DO_UTF8(dsv);
+ bool sutf8 = DO_UTF8(ssv);
+
+ if (dutf8 != sutf8) {
+ char *s = spv;
+ char *send = s + slen;
+ STRLEN dlen;
+ char *dpv;
+ char *d;
+
+ /* We may modify dsv but not ssv. */
+
+ if (!dutf8)
+ sv_utf8_upgrade(dsv);
+ dpv = SvPV(dsv, dlen);
+ /* Overguestimate on the slen. */
+ SvGROW(dsv, dlen + (sutf8 ? 2 * slen : slen) + 1);
+ d = dpv + dlen;
+ if (dutf8) /* && !sutf8 */ {
+ while (s < send) {
+ if (UTF8_IS_ASCII(*s))
+ *d++ = *s++;
+ else {
+ *d++ = UTF8_EIGHT_BIT_HI(*s);
+ *d++ = UTF8_EIGHT_BIT_LO(*s);
+ s += 2;
+ }
+ }
+ SvCUR(dsv) += s - spv;
+ *SvEND(dsv) = 0;
+ }
+ else /* !dutf8 (was) && sutf8 */ {
+ sv_catpvn(dsv, spv, slen);
+ SvUTF8_on(dsv);
+ }
+ }
+ else
+ sv_catpvn(dsv, spv, slen);
}
- else
- sv_catpvn(dstr,s,len);
}
}
@@ -3781,10 +3815,10 @@ Like C<sv_catsv>, but also handles 'set' magic.
*/
void
-Perl_sv_catsv_mg(pTHX_ SV *dstr, register SV *sstr)
+Perl_sv_catsv_mg(pTHX_ SV *dsv, register SV *ssv)
{
- sv_catsv(dstr,sstr);
- SvSETMAGIC(dstr);
+ sv_catsv(dsv,ssv);
+ SvSETMAGIC(dsv);
}
/*
@@ -3797,20 +3831,20 @@ Handles 'get' magic, but not 'set' magic. See C<sv_catpv_mg>.
*/
void
-Perl_sv_catpv(pTHX_ register SV *sv, register const char *ptr)
+Perl_sv_catpv(pTHX_ register SV *sv, register const char *pv)
{
register STRLEN len;
STRLEN tlen;
char *junk;
- if (!ptr)
+ if (!pv)
return;
junk = SvPV_force(sv, tlen);
- len = strlen(ptr);
+ len = strlen(pv);
SvGROW(sv, tlen + len + 1);
- if (ptr == junk)
- ptr = SvPVX(sv);
- Move(ptr,SvPVX(sv)+tlen,len+1,char);
+ if (pv == junk)
+ pv = SvPVX(sv);
+ Move(pv,SvPVX(sv)+tlen,len+1,char);
SvCUR(sv) += len;
(void)SvPOK_only_UTF8(sv); /* validate pointer */
SvTAINT(sv);
@@ -3825,9 +3859,9 @@ Like C<sv_catpv>, but also handles 'set' magic.
*/
void
-Perl_sv_catpv_mg(pTHX_ register SV *sv, register const char *ptr)
+Perl_sv_catpv_mg(pTHX_ register SV *sv, register const char *pv)
{
- sv_catpv(sv,ptr);
+ sv_catpv(sv,pv);
SvSETMAGIC(sv);
}
diff --git a/t/op/join.t b/t/op/join.t
index b50878e735..eea9add850 100755
--- a/t/op/join.t
+++ b/t/op/join.t
@@ -1,6 +1,6 @@
#!./perl
-print "1..10\n";
+print "1..14\n";
@x = (1, 2, 3);
if (join(':',@x) eq '1:2:3') {print "ok 1\n";} else {print "not ok 1\n";}
@@ -44,3 +44,23 @@ if ($f eq 'baeak') {print "ok 6\n";} else {print "# '$f'\nnot ok 6\n";}
print "# expected 'a17b21c' got '$r'\nnot " if $r ne 'a17b21c';
print "ok 10\n";
};
+
+{ my $s = join("", chr(1234),chr(255));
+ print "not " unless length($s) == 2;
+ print "ok 11\n";
+}
+
+{ my $s = join(chr(2345), chr(1234),chr(255));
+ print "not " unless length($s) == 3;
+ print "ok 12\n";
+}
+
+{ my $s = join(chr(2345), chr(1234),chr(3456));
+ print "not " unless length($s) == 3;
+ print "ok 13\n";
+}
+
+{ my $s = join(chr(255), chr(1234),chr(2345));
+ print "not " unless length($s) == 3;
+ print "ok 14\n";
+}
diff --git a/utf8.h b/utf8.h
index 26ef7236ee..e9598b8759 100644
--- a/utf8.h
+++ b/utf8.h
@@ -62,15 +62,18 @@ END_EXTERN_C
#define UTF8_QUAD_MAX UINT64_C(0x1000000000)
-#define UTF8_IS_ASCII(c) ((c) < 0x80)
-#define UTF8_IS_START(c) ((c) >= 0xc0 && ((c) <= 0xfd))
-#define UTF8_IS_CONTINUATION(c) ((c) >= 0x80 && ((c) <= 0xbf))
-#define UTF8_IS_CONTINUED(c) ((c) & 0x80)
+#define UTF8_IS_ASCII(c) (((U8)c) < 0x80)
+#define UTF8_IS_START(c) (((U8)c) >= 0xc0 && (((U8)c) <= 0xfd))
+#define UTF8_IS_CONTINUATION(c) (((U8)c) >= 0x80 && (((U8)c) <= 0xbf))
+#define UTF8_IS_CONTINUED(c) (((U8)c) & 0x80)
-#define UTF8_CONTINUATION_MASK 0x3f
+#define UTF8_CONTINUATION_MASK ((U8)0x3f)
#define UTF8_ACCUMULATION_SHIFT 6
#define UTF8_ACCUMULATE(old, new) ((old) << UTF8_ACCUMULATION_SHIFT | ((new) & UTF8_CONTINUATION_MASK))
+#define UTF8_EIGHT_BIT_HI(c) ( (((U8)c)>>6) |0xc0)
+#define UTF8_EIGHT_BIT_LO(c) (((((U8)c)>>6)&0x3f)|0x80)
+
#ifdef HAS_QUAD
#define UNISKIP(uv) ( (uv) < 0x80 ? 1 : \
(uv) < 0x800 ? 2 : \