diff options
-rw-r--r-- | doop.c | 18 | ||||
-rw-r--r-- | sv.c | 14 | ||||
-rw-r--r-- | t/camel-III/vstring.t | 4 | ||||
-rwxr-xr-x | t/op/each.t | 17 | ||||
-rw-r--r-- | t/op/length.t | 56 | ||||
-rwxr-xr-x | t/op/pack.t | 23 | ||||
-rwxr-xr-x | t/op/ver.t | 2 | ||||
-rw-r--r-- | toke.c | 21 | ||||
-rw-r--r-- | utf8.c | 35 | ||||
-rw-r--r-- | utf8.h | 4 | ||||
-rw-r--r-- | utfebcdic.h | 18 |
11 files changed, 121 insertions, 91 deletions
@@ -316,9 +316,11 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */ isutf8 = SvUTF8(sv); if (!isutf8) { U8 *t = s, *e = s + len; - while (t < e) - if ((hibit = !UTF8_IS_INVARIANT(*t++))) + while (t < e) { + U8 ch = *t++; + if ((hibit = !NATIVE_IS_INVARIANT(ch))) break; + } if (hibit) s = bytes_to_utf8(s, &len); } @@ -408,9 +410,11 @@ S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */ s = (U8*)SvPV(sv, len); if (!SvUTF8(sv)) { U8 *t = s, *e = s + len; - while (t < e) - if ((hibit = !UTF8_IS_INVARIANT(*t++))) + while (t < e) { + U8 ch = *t++; + if ((hibit = !NATIVE_IS_INVARIANT(ch))) break; + } if (hibit) start = s = bytes_to_utf8(s, &len); } @@ -453,9 +457,11 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ isutf8 = SvUTF8(sv); if (!isutf8) { U8 *t = s, *e = s + len; - while (t < e) - if ((hibit = !UTF8_IS_INVARIANT(*t++))) + while (t < e) { + U8 ch = *t++; + if ((hibit = !NATIVE_IS_INVARIANT(ch))) break; + } if (hibit) s = bytes_to_utf8(s, &len); } @@ -2978,7 +2978,8 @@ Perl_sv_utf8_upgrade(pTHX_ register SV *sv) e = (U8 *) SvEND(sv); t = s; while (t < e) { - if ((hibit = !UTF8_IS_INVARIANT(*t++))) + U8 ch = *t++; + if ((hibit = !NATIVE_IS_INVARIANT(ch))) break; } if (hibit) { @@ -2991,12 +2992,6 @@ Perl_sv_utf8_upgrade(pTHX_ register SV *sv) Safefree(s); /* No longer using what was there before. */ SvLEN(sv) = len; /* No longer know the real size. */ } -#ifdef EBCDIC - else { - for (t = s; t < e; t++) - *t = NATIVE_TO_ASCII(*t); - } -#endif /* Mark as UTF-8 even if no hibit - saves scanning loop */ SvUTF8_on(sv); return SvCUR(sv); @@ -3112,7 +3107,8 @@ Perl_sv_utf8_decode(pTHX_ register SV *sv) return FALSE; e = (U8 *) SvEND(sv); while (c < e) { - if (!UTF8_IS_INVARIANT(*c++)) { + U8 ch = *c++; + if (!UTF8_IS_INVARIANT(ch)) { SvUTF8_on(sv); break; } @@ -7127,7 +7123,7 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV case 'c': uv = args ? va_arg(*args, int) : SvIVx(argsv); - if ((uv > 255 || (!UTF8_IS_INVARIANT(uv) && SvUTF8(sv))) && !IN_BYTE) { + if ((uv > 255 || (!UNI_IS_INVARIANT(uv) || SvUTF8(sv))) && !IN_BYTE) { eptr = (char*)utf8buf; elen = uvchr_to_utf8((U8*)eptr, uv) - utf8buf; is_utf = TRUE; diff --git a/t/camel-III/vstring.t b/t/camel-III/vstring.t index 7360ae7654..6dec4ddd69 100644 --- a/t/camel-III/vstring.t +++ b/t/camel-III/vstring.t @@ -5,14 +5,12 @@ BEGIN { } use Test; plan test => 5; -# Error messages may have wide chars, say that is okay - if we can. -eval { binmode STDOUT,":utf8" }; # Chapter 2 pp67/68 my $vs = v1.20.300.4000; ok($vs,"\x{1}\x{14}\x{12c}\x{fa0}","v-string ne \\x{}"); ok($vs,chr(1).chr(20).chr(300).chr(4000),"v-string ne chr()"); -ok('foo',v102.111.111,"v-string ne ''"); +ok('foo',((chr(193) eq 'A') ? v134.150.150 : v102.111.111),"v-string ne ''"); # Chapter 15, pp403 diff --git a/t/op/each.t b/t/op/each.t index 2e80dcd009..daddc9c3c1 100755 --- a/t/op/each.t +++ b/t/op/each.t @@ -2,11 +2,11 @@ BEGIN { chdir 't' if -d 't'; - @INC = '.'; + @INC = '.'; push @INC, '../lib'; -} +} -print "1..26\n"; +print "1..27\n"; $h{'abc'} = 'ABC'; $h{'def'} = 'DEF'; @@ -163,15 +163,20 @@ print "ok 23\n"; print "#$u{$_}\n" for keys %u; # Used to core dump before change #8056. print "ok 24\n"; +use bytes (); + $d = pack("U*", 0xe3, 0x81, 0x82); +$ol = bytes::length($d); +print "not " unless $ol > 3; +print "ok 25\n"; %u = ($d => "downgrade"); for (keys %u) { use bytes; print "not " if length ne 3 or $_ ne "\xe3\x81\x82"; - print "ok 25\n"; + print "ok 26\n"; } { use bytes; - print "not " if length($d) ne 6; - print "ok 26\n"; + print "not " if length($d) != $ol; + print "ok 27\n"; } diff --git a/t/op/length.t b/t/op/length.t index df80fcd039..c4445e3e48 100644 --- a/t/op/length.t +++ b/t/op/length.t @@ -34,52 +34,84 @@ print "ok 3\n"; { my $a = pack("U", 0x80); - + print "not " unless length($a) == 1; print "ok 6\n"; $test++; - + use bytes; - print "not " unless $a eq "\xc2\x80" && length($a) == 2; + if (ord('A') == 193) + { + printf "#%vx for 0x80\n",$a; + print "not " unless $a eq "\x8a\x67" && length($a) == 2; + } + else + { + print "not " unless $a eq "\xc2\x80" && length($a) == 2; + } print "ok 7\n"; $test++; } { my $a = "\x{100}"; - + print "not " unless length($a) == 1; print "ok 8\n"; $test++; - + use bytes; - print "not " unless $a eq "\xc4\x80" && length($a) == 2; + if (ord('A') == 193) + { + printf "#%vx for 0x100\n",$a; + print "not " unless $a eq "\x8c\x41" && length($a) == 2; + } + else + { + print "not " unless $a eq "\xc4\x80" && length($a) == 2; + } print "ok 9\n"; $test++; } { my $a = "\x{100}\x{80}"; - + print "not " unless length($a) == 2; print "ok 10\n"; $test++; - + use bytes; - print "not " unless $a eq "\xc4\x80\xc2\x80" && length($a) == 4; + if (ord('A') == 193) + { + printf "#%vx for 0x100 0x80\n",$a; + print "not " unless $a eq "\x8c\x41\x8a\x67" && length($a) == 4; + } + else + { + print "not " unless $a eq "\xc4\x80\xc2\x80" && length($a) == 4; + } print "ok 11\n"; $test++; } { my $a = "\x{80}\x{100}"; - + print "not " unless length($a) == 2; print "ok 12\n"; $test++; - + use bytes; - print "not " unless $a eq "\xc2\x80\xc4\x80" && length($a) == 4; + if (ord('A') == 193) + { + printf "#%vx for 0x80 0x100\n",$a; + print "not " unless $a eq "\x8a\x67\x8c\x41" && length($a) == 4; + } + else + { + print "not " unless $a eq "\xc2\x80\xc4\x80" && length($a) == 4; + } print "ok 13\n"; $test++; } diff --git a/t/op/pack.t b/t/op/pack.t index 4c169917ae..5323bc34b8 100755 --- a/t/op/pack.t +++ b/t/op/pack.t @@ -43,7 +43,7 @@ $sum = 103 if ($Config{ebcdic} eq 'define'); print +($x = unpack("%32B*", "Now is the time for all good blurfl")) == $sum ? "ok 7\n" : "not ok 7 $x\n"; -open(BIN, "./perl") || open(BIN, "./perl.exe") +open(BIN, "./perl") || open(BIN, "./perl.exe") || die "Can't open ../perl or ../perl.exe: $!\n"; sysread BIN, $foo, 8192; close BIN; @@ -119,10 +119,10 @@ print( ((unpack("i",pack("i",-1))) == -1 ? "ok " : "not ok "),$test++,"\n"); # 31..36: test the pack lengths of s S i I l L print "not " unless length(pack("s", 0)) == 2; print "ok ", $test++, "\n"; - + print "not " unless length(pack("S", 0)) == 2; print "ok ", $test++, "\n"; - + print "not " unless length(pack("i", 0)) >= 4; print "ok ", $test++, "\n"; @@ -171,7 +171,7 @@ foreach my $t (@templates) { # binary values of the uuencoded version would not be portable between # character sets. Uuencoding is meant for encoding binary data, not # text data. - + $in = pack 'C*', 0 .. 255; # just to be anal, we do some random tr/`/ / @@ -205,7 +205,7 @@ print "ok ", $test++, "\n"; $uu = <<'EOUU'; M'XL("%C<Q#4" TI!4%4 \RHM+E%(S,LOR4@M4@A(+<I1*"U-SD])+>(" &1F -&8%P: +&8%P: EOUU print "not " unless unpack('u', $uu) eq $in; @@ -407,15 +407,16 @@ $z = pack <<EOP,'string','etc'; EOP print 'not ' unless $z eq "\000\006string\003etc"; print "ok $test\n"; $test++; -print 'not ' unless "1.20.300.4000" eq sprintf "%vd", pack("U*",1,20,300,4000); +print 'not ' unless "1.20.300.4000" eq sprintf "%vd", pack("U*",1,20,300,4000); print "ok $test\n"; $test++; -print 'not ' unless "1.20.300.4000" eq - sprintf "%vd", pack(" U*",1,20,300,4000); +print 'not ' unless "1.20.300.4000" eq + sprintf "%vd", pack(" U*",1,20,300,4000); print "ok $test\n"; $test++; -print 'not ' unless v1.20.300.4000 ne - sprintf "%vd", pack("C0U*",1,20,300,4000); +print 'not ' unless v1.20.300.4000 ne + sprintf "%vd", pack("C0U*",1,20,300,4000); print "ok $test\n"; $test++; # 160 -print "not " unless join(" ", unpack("C*", chr(0x1e2))) eq "199 162"; +print "not " unless join(" ", unpack("C*", chr(0x1e2))) + eq ((ord(A) == 193) ? "156 67" : "199 162"); print "ok $test\n"; $test++; diff --git a/t/op/ver.t b/t/op/ver.t index b9ba5891f0..e248a48482 100755 --- a/t/op/ver.t +++ b/t/op/ver.t @@ -114,7 +114,7 @@ print "not " unless sprintf("%*vb", "##", v1.22.333.4444) print "ok $test\n"; ++$test; print "not " unless sprintf("%vd", join("", map { chr } - unpack "U*", v2001.2002.2003)) + unpack 'U*', pack('U*',2001,2002,2003))) eq '2001.2002.2003'; print "ok $test\n"; ++$test; @@ -1457,7 +1457,7 @@ S_scan_const(pTHX_ char *start) /* We need to map to chars to ASCII before doing the tests to cover EBCDIC */ - if (!UTF8_IS_INVARIANT(uv)) { + if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(uv))) { if (!has_utf8 && uv > 255) { /* Might need to recode whatever we have * accumulated so far if it contains any @@ -1469,7 +1469,7 @@ S_scan_const(pTHX_ char *start) int hicount = 0; U8 *c; for (c = (U8 *) SvPVX(sv); c < (U8 *)d; c++) { - if (!UTF8_IS_INVARIANT(*c)) { + if (!NATIVE_IS_INVARIANT(*c)) { hicount++; } } @@ -1481,7 +1481,7 @@ S_scan_const(pTHX_ char *start) dst = src+hicount; d += hicount; while (src >= (U8 *)SvPVX(sv)) { - if (!UTF8_IS_INVARIANT(*src)) { + if (!NATIVE_IS_INVARIANT(*src)) { U8 ch = NATIVE_TO_ASCII(*src); *dst-- = UTF8_EIGHT_BIT_LO(ch); *dst-- = UTF8_EIGHT_BIT_HI(ch); @@ -1510,7 +1510,7 @@ S_scan_const(pTHX_ char *start) } } else { - *d++ = NATIVE_TO_NEED(has_utf8,uv); + *d++ = (char) uv; } continue; @@ -1603,7 +1603,6 @@ S_scan_const(pTHX_ char *start) } /* end if (backslash) */ default_action: - /* The 'has_utf8' here is very dubious */ if (!UTF8_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) { STRLEN len = (STRLEN) -1; UV uv; @@ -7230,7 +7229,7 @@ vstring: while (isDIGIT(*pos) || *pos == '_') pos++; if (!isALPHA(*pos)) { - UV rev, revmax = 0; + UV rev; U8 tmpbuf[UTF8_MAXLEN+1]; U8 *tmpend; s++; /* get past 'v' */ @@ -7260,9 +7259,9 @@ vstring: } /* Append native character for the rev point */ tmpend = uvchr_to_utf8(tmpbuf, rev); - if (rev > revmax) - revmax = rev; sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf); + if (!UNI_IS_INVARIANT(NATIVE_TO_UNI(rev))) + SvUTF8_on(sv); if (*pos == '.' && isDIGIT(pos[1])) s = ++pos; else { @@ -7272,14 +7271,8 @@ vstring: while (isDIGIT(*pos) || *pos == '_') pos++; } - SvPOK_on(sv); SvREADONLY_on(sv); - /* if (revmax > 127) { */ - SvUTF8_on(sv); /* - if (revmax < 256) - sv_utf8_downgrade(sv, TRUE); - } */ } } break; @@ -46,8 +46,8 @@ is the recommended Unicode-aware way of saying U8 * Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv) { - if (UTF8_IS_INVARIANT(uv)) { - *d++ = uv; + if (UNI_IS_INVARIANT(uv)) { + *d++ = UTF_TO_NATIVE(uv); return d; } #if defined(EBCDIC) || 1 /* always for testing */ @@ -151,9 +151,7 @@ is the recommended wide native character-aware way of saying U8 * Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv) { - if (uv < 0x100) - uv = NATIVE_TO_ASCII(uv); - return Perl_uvuni_to_utf8(aTHX_ d, uv); + return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv)); } @@ -293,7 +291,7 @@ Perl_utf8n_to_uvuni(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) if (UTF8_IS_INVARIANT(uv)) { if (retlen) *retlen = 1; - return (UV) (*s); + return (UV) (NATIVE_TO_UTF(*s)); } if (UTF8_IS_CONTINUATION(uv) && @@ -478,9 +476,7 @@ UV Perl_utf8n_to_uvchr(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) { UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags); - if (uv < 0x100) - return (UV) ASCII_TO_NATIVE(uv); - return uv; + return UNI_TO_NATIVE(uv); } /* @@ -550,7 +546,7 @@ Perl_utf8_length(pTHX_ U8* s, U8* e) U8 t = UTF8SKIP(s); if (e - s < t) - Perl_croak(aTHX_ "panic: utf8_length: unaligned end"); + Perl_croak(aTHX_ "panic: utf8_length: s=%p (%02X) e=%p l=%d - unaligned end",s,*s,e,t); s += t; len++; } @@ -713,19 +709,16 @@ Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8) *is_utf8 = 0; -#ifndef EBCDIC - /* Can use as-is if no high chars */ - if (!count) - return start; -#endif - Newz(801, d, (*len) - count + 1, U8); s = start; start = d; while (s < send) { U8 c = *s++; - if (!UTF8_IS_INVARIANT(c)) - c = UTF8_ACCUMULATE(c, *s++); - *d++ = ASCII_TO_NATIVE(c); + if (!UTF8_IS_INVARIANT(c)) { + /* Then it is two-byte encoded */ + c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++); + c = ASCII_TO_NATIVE(c); + } + *d++ = c; } *d = '\0'; *len = d - start; @@ -755,8 +748,8 @@ Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len) while (s < send) { UV uv = NATIVE_TO_ASCII(*s++); - if (UTF8_IS_INVARIANT(uv)) - *d++ = uv; + if (UNI_IS_INVARIANT(uv)) + *d++ = UTF_TO_NATIVE(uv); else { *d++ = UTF8_EIGHT_BIT_HI(uv); *d++ = UTF8_EIGHT_BIT_LO(uv); @@ -64,7 +64,9 @@ END_EXTERN_C */ -#define UTF8_IS_INVARIANT(c) (((UV)c) < 0x80) +#define UNI_IS_INVARIANT(c) (((UV)c) < 0x80) +#define UTF8_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_UTF(c)) +#define NATIVE_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_ASCII(c)) #define UTF8_IS_START(c) (((U8)c) >= 0xc0 && (((U8)c) <= 0xfd)) #define UTF8_IS_CONTINUATION(c) (((U8)c) >= 0x80 && (((U8)c) <= 0xbf)) #define UTF8_IS_CONTINUED(c) (((U8)c) & 0x80) diff --git a/utfebcdic.h b/utfebcdic.h index 0eef54b001..ef67cb2c35 100644 --- a/utfebcdic.h +++ b/utfebcdic.h @@ -15,17 +15,18 @@ START_EXTERN_C #ifdef DOINIT /* Indexed by encoded byte this table gives the length of the sequence. Adapted from the shadow flags table in tr16. - The entries marked 9 are continuation bytes. + The entries marked 9 in tr6 are continuation bytes and are marked + as length 1 here so that we can recover. */ EXTCONST unsigned char PL_utf8skip[] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, -1,9,9,9,9,9,9,9,9,9,9,1,1,1,1,1, -1,9,9,9,9,9,9,9,9,9,1,1,1,1,1,1, -1,1,9,9,9,9,9,9,9,9,9,1,1,1,1,1, -9,9,9,9,2,2,2,2,2,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,2,2,2,2,2,1,1,1,1,1,1,1, 2,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2, 2,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2, 2,1,1,1,1,1,1,1,1,1,2,2,2,1,2,2, @@ -221,7 +222,7 @@ END_EXTERN_C #define UTF_TO_NATIVE(ch) PL_utf2e[(U8)(ch)] /* Transform in wide UV char space */ #define NATIVE_TO_UNI(ch) (((ch) > 255) ? (ch) : NATIVE_TO_ASCII(ch)) -#define UNI_TO_NATIVE(ch) (((ch) > 255) ? (ch) : (UV) ASCII_TO_NATIVE(ch)) +#define UNI_TO_NATIVE(ch) (((ch) > 255) ? (ch) : ASCII_TO_NATIVE(ch)) /* Transform in invariant..byte space */ #define NATIVE_TO_NEED(enc,ch) ((enc) ? UTF_TO_NATIVE(NATIVE_TO_ASCII(ch)) : (ch)) #define ASCII_TO_NEED(enc,ch) ((enc) ? UTF_TO_NATIVE(ch) : ASCII_TO_NATIVE(ch)) @@ -267,8 +268,11 @@ END_EXTERN_C (uv) < 0x400000 ? 5 : \ (uv) < 0x4000000 ? 6 : 7 ) + +#define UNI_IS_INVARIANT(c) ((c) < 0xA0) /* UTF-EBCDIC sematic macros - transform back into UTF-8-Mod and then compare */ -#define UTF8_IS_INVARIANT(c) (NATIVE_TO_UTF(c) < 0xA0) +#define NATIVE_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_ASCII(c)) +#define UTF8_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_UTF(c)) #define UTF8_IS_START(c) (NATIVE_TO_UTF(c) >= 0xA0 && (NATIVE_TO_UTF(c) & 0xE0) != 0xA0) #define UTF8_IS_CONTINUATION(c) (NATIVE_TO_UTF(c) >= 0xA0 && (NATIVE_TO_UTF(c) & 0xE0) == 0xA0) #define UTF8_IS_CONTINUED(c) (NATIVE_TO_UTF(c) >= 0xA0) |