summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--doop.c18
-rw-r--r--sv.c14
-rw-r--r--t/camel-III/vstring.t4
-rwxr-xr-xt/op/each.t17
-rw-r--r--t/op/length.t56
-rwxr-xr-xt/op/pack.t23
-rwxr-xr-xt/op/ver.t2
-rw-r--r--toke.c21
-rw-r--r--utf8.c35
-rw-r--r--utf8.h4
-rw-r--r--utfebcdic.h18
11 files changed, 121 insertions, 91 deletions
diff --git a/doop.c b/doop.c
index e4a516a1a3..f2bda8b08d 100644
--- a/doop.c
+++ b/doop.c
@@ -316,9 +316,11 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
isutf8 = SvUTF8(sv);
if (!isutf8) {
U8 *t = s, *e = s + len;
- while (t < e)
- if ((hibit = !UTF8_IS_INVARIANT(*t++)))
+ while (t < e) {
+ U8 ch = *t++;
+ if ((hibit = !NATIVE_IS_INVARIANT(ch)))
break;
+ }
if (hibit)
s = bytes_to_utf8(s, &len);
}
@@ -408,9 +410,11 @@ S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */
s = (U8*)SvPV(sv, len);
if (!SvUTF8(sv)) {
U8 *t = s, *e = s + len;
- while (t < e)
- if ((hibit = !UTF8_IS_INVARIANT(*t++)))
+ while (t < e) {
+ U8 ch = *t++;
+ if ((hibit = !NATIVE_IS_INVARIANT(ch)))
break;
+ }
if (hibit)
start = s = bytes_to_utf8(s, &len);
}
@@ -453,9 +457,11 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
isutf8 = SvUTF8(sv);
if (!isutf8) {
U8 *t = s, *e = s + len;
- while (t < e)
- if ((hibit = !UTF8_IS_INVARIANT(*t++)))
+ while (t < e) {
+ U8 ch = *t++;
+ if ((hibit = !NATIVE_IS_INVARIANT(ch)))
break;
+ }
if (hibit)
s = bytes_to_utf8(s, &len);
}
diff --git a/sv.c b/sv.c
index 18c5ac95b9..1b367448a6 100644
--- a/sv.c
+++ b/sv.c
@@ -2978,7 +2978,8 @@ Perl_sv_utf8_upgrade(pTHX_ register SV *sv)
e = (U8 *) SvEND(sv);
t = s;
while (t < e) {
- if ((hibit = !UTF8_IS_INVARIANT(*t++)))
+ U8 ch = *t++;
+ if ((hibit = !NATIVE_IS_INVARIANT(ch)))
break;
}
if (hibit) {
@@ -2991,12 +2992,6 @@ Perl_sv_utf8_upgrade(pTHX_ register SV *sv)
Safefree(s); /* No longer using what was there before. */
SvLEN(sv) = len; /* No longer know the real size. */
}
-#ifdef EBCDIC
- else {
- for (t = s; t < e; t++)
- *t = NATIVE_TO_ASCII(*t);
- }
-#endif
/* Mark as UTF-8 even if no hibit - saves scanning loop */
SvUTF8_on(sv);
return SvCUR(sv);
@@ -3112,7 +3107,8 @@ Perl_sv_utf8_decode(pTHX_ register SV *sv)
return FALSE;
e = (U8 *) SvEND(sv);
while (c < e) {
- if (!UTF8_IS_INVARIANT(*c++)) {
+ U8 ch = *c++;
+ if (!UTF8_IS_INVARIANT(ch)) {
SvUTF8_on(sv);
break;
}
@@ -7127,7 +7123,7 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
case 'c':
uv = args ? va_arg(*args, int) : SvIVx(argsv);
- if ((uv > 255 || (!UTF8_IS_INVARIANT(uv) && SvUTF8(sv))) && !IN_BYTE) {
+ if ((uv > 255 || (!UNI_IS_INVARIANT(uv) || SvUTF8(sv))) && !IN_BYTE) {
eptr = (char*)utf8buf;
elen = uvchr_to_utf8((U8*)eptr, uv) - utf8buf;
is_utf = TRUE;
diff --git a/t/camel-III/vstring.t b/t/camel-III/vstring.t
index 7360ae7654..6dec4ddd69 100644
--- a/t/camel-III/vstring.t
+++ b/t/camel-III/vstring.t
@@ -5,14 +5,12 @@ BEGIN {
}
use Test;
plan test => 5;
-# Error messages may have wide chars, say that is okay - if we can.
-eval { binmode STDOUT,":utf8" };
# Chapter 2 pp67/68
my $vs = v1.20.300.4000;
ok($vs,"\x{1}\x{14}\x{12c}\x{fa0}","v-string ne \\x{}");
ok($vs,chr(1).chr(20).chr(300).chr(4000),"v-string ne chr()");
-ok('foo',v102.111.111,"v-string ne ''");
+ok('foo',((chr(193) eq 'A') ? v134.150.150 : v102.111.111),"v-string ne ''");
# Chapter 15, pp403
diff --git a/t/op/each.t b/t/op/each.t
index 2e80dcd009..daddc9c3c1 100755
--- a/t/op/each.t
+++ b/t/op/each.t
@@ -2,11 +2,11 @@
BEGIN {
chdir 't' if -d 't';
- @INC = '.';
+ @INC = '.';
push @INC, '../lib';
-}
+}
-print "1..26\n";
+print "1..27\n";
$h{'abc'} = 'ABC';
$h{'def'} = 'DEF';
@@ -163,15 +163,20 @@ print "ok 23\n";
print "#$u{$_}\n" for keys %u; # Used to core dump before change #8056.
print "ok 24\n";
+use bytes ();
+
$d = pack("U*", 0xe3, 0x81, 0x82);
+$ol = bytes::length($d);
+print "not " unless $ol > 3;
+print "ok 25\n";
%u = ($d => "downgrade");
for (keys %u) {
use bytes;
print "not " if length ne 3 or $_ ne "\xe3\x81\x82";
- print "ok 25\n";
+ print "ok 26\n";
}
{
use bytes;
- print "not " if length($d) ne 6;
- print "ok 26\n";
+ print "not " if length($d) != $ol;
+ print "ok 27\n";
}
diff --git a/t/op/length.t b/t/op/length.t
index df80fcd039..c4445e3e48 100644
--- a/t/op/length.t
+++ b/t/op/length.t
@@ -34,52 +34,84 @@ print "ok 3\n";
{
my $a = pack("U", 0x80);
-
+
print "not " unless length($a) == 1;
print "ok 6\n";
$test++;
-
+
use bytes;
- print "not " unless $a eq "\xc2\x80" && length($a) == 2;
+ if (ord('A') == 193)
+ {
+ printf "#%vx for 0x80\n",$a;
+ print "not " unless $a eq "\x8a\x67" && length($a) == 2;
+ }
+ else
+ {
+ print "not " unless $a eq "\xc2\x80" && length($a) == 2;
+ }
print "ok 7\n";
$test++;
}
{
my $a = "\x{100}";
-
+
print "not " unless length($a) == 1;
print "ok 8\n";
$test++;
-
+
use bytes;
- print "not " unless $a eq "\xc4\x80" && length($a) == 2;
+ if (ord('A') == 193)
+ {
+ printf "#%vx for 0x100\n",$a;
+ print "not " unless $a eq "\x8c\x41" && length($a) == 2;
+ }
+ else
+ {
+ print "not " unless $a eq "\xc4\x80" && length($a) == 2;
+ }
print "ok 9\n";
$test++;
}
{
my $a = "\x{100}\x{80}";
-
+
print "not " unless length($a) == 2;
print "ok 10\n";
$test++;
-
+
use bytes;
- print "not " unless $a eq "\xc4\x80\xc2\x80" && length($a) == 4;
+ if (ord('A') == 193)
+ {
+ printf "#%vx for 0x100 0x80\n",$a;
+ print "not " unless $a eq "\x8c\x41\x8a\x67" && length($a) == 4;
+ }
+ else
+ {
+ print "not " unless $a eq "\xc4\x80\xc2\x80" && length($a) == 4;
+ }
print "ok 11\n";
$test++;
}
{
my $a = "\x{80}\x{100}";
-
+
print "not " unless length($a) == 2;
print "ok 12\n";
$test++;
-
+
use bytes;
- print "not " unless $a eq "\xc2\x80\xc4\x80" && length($a) == 4;
+ if (ord('A') == 193)
+ {
+ printf "#%vx for 0x80 0x100\n",$a;
+ print "not " unless $a eq "\x8a\x67\x8c\x41" && length($a) == 4;
+ }
+ else
+ {
+ print "not " unless $a eq "\xc2\x80\xc4\x80" && length($a) == 4;
+ }
print "ok 13\n";
$test++;
}
diff --git a/t/op/pack.t b/t/op/pack.t
index 4c169917ae..5323bc34b8 100755
--- a/t/op/pack.t
+++ b/t/op/pack.t
@@ -43,7 +43,7 @@ $sum = 103 if ($Config{ebcdic} eq 'define');
print +($x = unpack("%32B*", "Now is the time for all good blurfl")) == $sum
? "ok 7\n" : "not ok 7 $x\n";
-open(BIN, "./perl") || open(BIN, "./perl.exe")
+open(BIN, "./perl") || open(BIN, "./perl.exe")
|| die "Can't open ../perl or ../perl.exe: $!\n";
sysread BIN, $foo, 8192;
close BIN;
@@ -119,10 +119,10 @@ print( ((unpack("i",pack("i",-1))) == -1 ? "ok " : "not ok "),$test++,"\n");
# 31..36: test the pack lengths of s S i I l L
print "not " unless length(pack("s", 0)) == 2;
print "ok ", $test++, "\n";
-
+
print "not " unless length(pack("S", 0)) == 2;
print "ok ", $test++, "\n";
-
+
print "not " unless length(pack("i", 0)) >= 4;
print "ok ", $test++, "\n";
@@ -171,7 +171,7 @@ foreach my $t (@templates) {
# binary values of the uuencoded version would not be portable between
# character sets. Uuencoding is meant for encoding binary data, not
# text data.
-
+
$in = pack 'C*', 0 .. 255;
# just to be anal, we do some random tr/`/ /
@@ -205,7 +205,7 @@ print "ok ", $test++, "\n";
$uu = <<'EOUU';
M'XL("%C<Q#4" TI!4%4 \RHM+E%(S,LOR4@M4@A(+<I1*"U-SD])+>(" &1F
-&8%P:
+&8%P:
EOUU
print "not " unless unpack('u', $uu) eq $in;
@@ -407,15 +407,16 @@ $z = pack <<EOP,'string','etc';
EOP
print 'not ' unless $z eq "\000\006string\003etc"; print "ok $test\n"; $test++;
-print 'not ' unless "1.20.300.4000" eq sprintf "%vd", pack("U*",1,20,300,4000);
+print 'not ' unless "1.20.300.4000" eq sprintf "%vd", pack("U*",1,20,300,4000);
print "ok $test\n"; $test++;
-print 'not ' unless "1.20.300.4000" eq
- sprintf "%vd", pack(" U*",1,20,300,4000);
+print 'not ' unless "1.20.300.4000" eq
+ sprintf "%vd", pack(" U*",1,20,300,4000);
print "ok $test\n"; $test++;
-print 'not ' unless v1.20.300.4000 ne
- sprintf "%vd", pack("C0U*",1,20,300,4000);
+print 'not ' unless v1.20.300.4000 ne
+ sprintf "%vd", pack("C0U*",1,20,300,4000);
print "ok $test\n"; $test++;
# 160
-print "not " unless join(" ", unpack("C*", chr(0x1e2))) eq "199 162";
+print "not " unless join(" ", unpack("C*", chr(0x1e2)))
+ eq ((ord(A) == 193) ? "156 67" : "199 162");
print "ok $test\n"; $test++;
diff --git a/t/op/ver.t b/t/op/ver.t
index b9ba5891f0..e248a48482 100755
--- a/t/op/ver.t
+++ b/t/op/ver.t
@@ -114,7 +114,7 @@ print "not " unless sprintf("%*vb", "##", v1.22.333.4444)
print "ok $test\n"; ++$test;
print "not " unless sprintf("%vd", join("", map { chr }
- unpack "U*", v2001.2002.2003))
+ unpack 'U*', pack('U*',2001,2002,2003)))
eq '2001.2002.2003';
print "ok $test\n"; ++$test;
diff --git a/toke.c b/toke.c
index 53159f3128..ea0f65097f 100644
--- a/toke.c
+++ b/toke.c
@@ -1457,7 +1457,7 @@ S_scan_const(pTHX_ char *start)
/* We need to map to chars to ASCII before doing the tests
to cover EBCDIC
*/
- if (!UTF8_IS_INVARIANT(uv)) {
+ if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(uv))) {
if (!has_utf8 && uv > 255) {
/* Might need to recode whatever we have
* accumulated so far if it contains any
@@ -1469,7 +1469,7 @@ S_scan_const(pTHX_ char *start)
int hicount = 0;
U8 *c;
for (c = (U8 *) SvPVX(sv); c < (U8 *)d; c++) {
- if (!UTF8_IS_INVARIANT(*c)) {
+ if (!NATIVE_IS_INVARIANT(*c)) {
hicount++;
}
}
@@ -1481,7 +1481,7 @@ S_scan_const(pTHX_ char *start)
dst = src+hicount;
d += hicount;
while (src >= (U8 *)SvPVX(sv)) {
- if (!UTF8_IS_INVARIANT(*src)) {
+ if (!NATIVE_IS_INVARIANT(*src)) {
U8 ch = NATIVE_TO_ASCII(*src);
*dst-- = UTF8_EIGHT_BIT_LO(ch);
*dst-- = UTF8_EIGHT_BIT_HI(ch);
@@ -1510,7 +1510,7 @@ S_scan_const(pTHX_ char *start)
}
}
else {
- *d++ = NATIVE_TO_NEED(has_utf8,uv);
+ *d++ = (char) uv;
}
continue;
@@ -1603,7 +1603,6 @@ S_scan_const(pTHX_ char *start)
} /* end if (backslash) */
default_action:
- /* The 'has_utf8' here is very dubious */
if (!UTF8_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) {
STRLEN len = (STRLEN) -1;
UV uv;
@@ -7230,7 +7229,7 @@ vstring:
while (isDIGIT(*pos) || *pos == '_')
pos++;
if (!isALPHA(*pos)) {
- UV rev, revmax = 0;
+ UV rev;
U8 tmpbuf[UTF8_MAXLEN+1];
U8 *tmpend;
s++; /* get past 'v' */
@@ -7260,9 +7259,9 @@ vstring:
}
/* Append native character for the rev point */
tmpend = uvchr_to_utf8(tmpbuf, rev);
- if (rev > revmax)
- revmax = rev;
sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf);
+ if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(rev)))
+ SvUTF8_on(sv);
if (*pos == '.' && isDIGIT(pos[1]))
s = ++pos;
else {
@@ -7272,14 +7271,8 @@ vstring:
while (isDIGIT(*pos) || *pos == '_')
pos++;
}
-
SvPOK_on(sv);
SvREADONLY_on(sv);
- /* if (revmax > 127) { */
- SvUTF8_on(sv); /*
- if (revmax < 256)
- sv_utf8_downgrade(sv, TRUE);
- } */
}
}
break;
diff --git a/utf8.c b/utf8.c
index 81fb44db35..01afa010be 100644
--- a/utf8.c
+++ b/utf8.c
@@ -46,8 +46,8 @@ is the recommended Unicode-aware way of saying
U8 *
Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
{
- if (UTF8_IS_INVARIANT(uv)) {
- *d++ = uv;
+ if (UNI_IS_INVARIANT(uv)) {
+ *d++ = UTF_TO_NATIVE(uv);
return d;
}
#if defined(EBCDIC) || 1 /* always for testing */
@@ -151,9 +151,7 @@ is the recommended wide native character-aware way of saying
U8 *
Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
{
- if (uv < 0x100)
- uv = NATIVE_TO_ASCII(uv);
- return Perl_uvuni_to_utf8(aTHX_ d, uv);
+ return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
}
@@ -293,7 +291,7 @@ Perl_utf8n_to_uvuni(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
if (UTF8_IS_INVARIANT(uv)) {
if (retlen)
*retlen = 1;
- return (UV) (*s);
+ return (UV) (NATIVE_TO_UTF(*s));
}
if (UTF8_IS_CONTINUATION(uv) &&
@@ -478,9 +476,7 @@ UV
Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
{
UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
- if (uv < 0x100)
- return (UV) ASCII_TO_NATIVE(uv);
- return uv;
+ return UNI_TO_NATIVE(uv);
}
/*
@@ -550,7 +546,7 @@ Perl_utf8_length(pTHX_ U8* s, U8* e)
U8 t = UTF8SKIP(s);
if (e - s < t)
- Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
+ Perl_croak(aTHX_ "panic: utf8_length: s=%p (%02X) e=%p l=%d - unaligned end",s,*s,e,t);
s += t;
len++;
}
@@ -713,19 +709,16 @@ Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8)
*is_utf8 = 0;
-#ifndef EBCDIC
- /* Can use as-is if no high chars */
- if (!count)
- return start;
-#endif
-
Newz(801, d, (*len) - count + 1, U8);
s = start; start = d;
while (s < send) {
U8 c = *s++;
- if (!UTF8_IS_INVARIANT(c))
- c = UTF8_ACCUMULATE(c, *s++);
- *d++ = ASCII_TO_NATIVE(c);
+ if (!UTF8_IS_INVARIANT(c)) {
+ /* Then it is two-byte encoded */
+ c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
+ c = ASCII_TO_NATIVE(c);
+ }
+ *d++ = c;
}
*d = '\0';
*len = d - start;
@@ -755,8 +748,8 @@ Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len)
while (s < send) {
UV uv = NATIVE_TO_ASCII(*s++);
- if (UTF8_IS_INVARIANT(uv))
- *d++ = uv;
+ if (UNI_IS_INVARIANT(uv))
+ *d++ = UTF_TO_NATIVE(uv);
else {
*d++ = UTF8_EIGHT_BIT_HI(uv);
*d++ = UTF8_EIGHT_BIT_LO(uv);
diff --git a/utf8.h b/utf8.h
index a60639720f..46bc8289e9 100644
--- a/utf8.h
+++ b/utf8.h
@@ -64,7 +64,9 @@ END_EXTERN_C
*/
-#define UTF8_IS_INVARIANT(c) (((UV)c) < 0x80)
+#define UNI_IS_INVARIANT(c) (((UV)c) < 0x80)
+#define UTF8_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_UTF(c))
+#define NATIVE_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_ASCII(c))
#define UTF8_IS_START(c) (((U8)c) >= 0xc0 && (((U8)c) <= 0xfd))
#define UTF8_IS_CONTINUATION(c) (((U8)c) >= 0x80 && (((U8)c) <= 0xbf))
#define UTF8_IS_CONTINUED(c) (((U8)c) & 0x80)
diff --git a/utfebcdic.h b/utfebcdic.h
index 0eef54b001..ef67cb2c35 100644
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -15,17 +15,18 @@ START_EXTERN_C
#ifdef DOINIT
/* Indexed by encoded byte this table gives the length of the sequence.
Adapted from the shadow flags table in tr16.
- The entries marked 9 are continuation bytes.
+ The entries marked 9 in tr6 are continuation bytes and are marked
+ as length 1 here so that we can recover.
*/
EXTCONST unsigned char PL_utf8skip[] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-1,9,9,9,9,9,9,9,9,9,9,1,1,1,1,1,
-1,9,9,9,9,9,9,9,9,9,1,1,1,1,1,1,
-1,1,9,9,9,9,9,9,9,9,9,1,1,1,1,1,
-9,9,9,9,2,2,2,2,2,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+1,1,1,1,2,2,2,2,2,1,1,1,1,1,1,1,
2,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,
2,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,
2,1,1,1,1,1,1,1,1,1,2,2,2,1,2,2,
@@ -221,7 +222,7 @@ END_EXTERN_C
#define UTF_TO_NATIVE(ch) PL_utf2e[(U8)(ch)]
/* Transform in wide UV char space */
#define NATIVE_TO_UNI(ch) (((ch) > 255) ? (ch) : NATIVE_TO_ASCII(ch))
-#define UNI_TO_NATIVE(ch) (((ch) > 255) ? (ch) : (UV) ASCII_TO_NATIVE(ch))
+#define UNI_TO_NATIVE(ch) (((ch) > 255) ? (ch) : ASCII_TO_NATIVE(ch))
/* Transform in invariant..byte space */
#define NATIVE_TO_NEED(enc,ch) ((enc) ? UTF_TO_NATIVE(NATIVE_TO_ASCII(ch)) : (ch))
#define ASCII_TO_NEED(enc,ch) ((enc) ? UTF_TO_NATIVE(ch) : ASCII_TO_NATIVE(ch))
@@ -267,8 +268,11 @@ END_EXTERN_C
(uv) < 0x400000 ? 5 : \
(uv) < 0x4000000 ? 6 : 7 )
+
+#define UNI_IS_INVARIANT(c) ((c) < 0xA0)
/* UTF-EBCDIC sematic macros - transform back into UTF-8-Mod and then compare */
-#define UTF8_IS_INVARIANT(c) (NATIVE_TO_UTF(c) < 0xA0)
+#define NATIVE_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_ASCII(c))
+#define UTF8_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_UTF(c))
#define UTF8_IS_START(c) (NATIVE_TO_UTF(c) >= 0xA0 && (NATIVE_TO_UTF(c) & 0xE0) != 0xA0)
#define UTF8_IS_CONTINUATION(c) (NATIVE_TO_UTF(c) >= 0xA0 && (NATIVE_TO_UTF(c) & 0xE0) == 0xA0)
#define UTF8_IS_CONTINUED(c) (NATIVE_TO_UTF(c) >= 0xA0)