diff options
-rw-r--r-- | ext/B/B.pm | 8 | ||||
-rw-r--r-- | handy.h | 22 | ||||
-rw-r--r-- | pod/perlebcdic.pod | 12 | ||||
-rw-r--r-- | t/op/chars.t | 10 |
4 files changed, 41 insertions, 11 deletions
diff --git a/ext/B/B.pm b/ext/B/B.pm index ee4700f292..952475db2c 100644 --- a/ext/B/B.pm +++ b/ext/B/B.pm @@ -90,11 +90,13 @@ sub B::GV::SAFENAME { # The regex below corresponds to the isCONTROLVAR macro # from toke.c - $name =~ s/^([\cA-\cZ\c\\c[\c]\c?\c_\c^])/"^". - chr( utf8::unicode_to_native( 64 ^ ord($1) ))/e; + $name =~ s/^\c?/^?/ + or $name =~ s/^([\cA-\cZ\c\\c[\c]\c_\c^])/ + "^" . chr( utf8::unicode_to_native( 64 ^ ord($1) ))/e; # When we say unicode_to_native we really mean ascii_to_native, - # which matters iff this is a non-ASCII platform (EBCDIC). + # which matters iff this is a non-ASCII platform (EBCDIC). '\c?' would + # not have to be special cased, except for non-ASCII. return $name; } @@ -1633,11 +1633,23 @@ EXTCONST U32 PL_charclass[]; #define isALNUMC_utf8(p) isALPHANUMERIC_utf8(p) #define isALNUMC_LC_utf8(p) isALPHANUMERIC_LC_utf8(p) -/* This conversion works both ways, strangely enough. On EBCDIC platforms, - * CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII, except that they don't - * necessarily mean the same characters, e.g. CTRL-D is 4 on both systems, but - * that is EOT on ASCII; ST on EBCDIC */ -# define toCTRL(c) (toUPPER(NATIVE_TO_LATIN1(c)) ^ 64) +/* On EBCDIC platforms, CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII, + * except that they don't necessarily mean the same characters, e.g. CTRL-D is + * 4 on both systems, but that is EOT on ASCII; ST on EBCDIC. + * '?' is special-cased on EBCDIC to APC, which is the control there that is + * the outlier from the block that contains the other controls, just like + * toCTRL('?') on ASCII yields DEL, the control that is the outlier from the C0 + * block. If it weren't special cased, it would yield a non-control. + * The conversion works both ways, so CTRL('D') is 4, and CTRL(4) is D, etc. */ +#ifndef EBCDIC +# define toCTRL(c) (toUPPER(c) ^ 64) +#else +# define toCTRL(c) ((c) == '?' \ + ? LATIN1_TO_NATIVE(0x9F) \ + : (c) == LATIN1_TO_NATIVE(0x9F) \ + ? '?' \ + : (NATIVE_TO_LATIN1(toUPPER(c)) ^ 64)) +#endif /* Line numbers are unsigned, 32 bits. */ typedef U32 line_t; diff --git a/pod/perlebcdic.pod b/pod/perlebcdic.pod index af0e05d310..36f93e8d97 100644 --- a/pod/perlebcdic.pod +++ b/pod/perlebcdic.pod @@ -789,7 +789,7 @@ in the ASCII table is that they can "literally" be constructed as control characters in perl, e.g. C<(chr(0)> eq C<\c@>)> C<(chr(1)> eq C<\cA>)>, and so on. Perl on EBCDIC platforms has been ported to take C<\c@> to chr(0) and C<\cA> to chr(1), etc. as well, but the -thirty three characters that result depend on which code page you are +characters that result depend on which code page you are using. The table below uses the standard acronyms for the controls. The POSIX-BC and 1047 sets are identical throughout this range and differ from the 0037 set at only @@ -799,10 +799,12 @@ platforms and cannot be generated as a C<"\c.letter."> control character on 0037 platforms. Note also that C<\c\> cannot be the final element in a string or regex, as it will absorb the terminator. But C<\c\I<X>> is a C<FILE SEPARATOR> concatenated with I<X> for all I<X>. +The outlier C<\c?> on ASCII, which yields a non-C0 control C<DEL>, +yields the outlier control C<APC> on EBCDIC, the one that isn't in the +block of contiguous controls. chr ord 8859-1 0037 1047 && POSIX-BC ----------------------------------------------------------------------- - \c? 127 <DEL> " " \c@ 0 <NUL> <NUL> <NUL> \cA 1 <SOH> <SOH> <SOH> \cB 2 <STX> <STX> <STX> @@ -835,6 +837,12 @@ SEPARATOR> concatenated with I<X> for all I<X>. \c] 29 <GS> <GS> <GS> \c^ 30 <RS> <RS> <RS> \c_ 31 <US> <US> <US> + \c? * <DEL> <APC> <APC> + +C<*> Note: C<\c?> maps to ordinal 127 (C<DEL>) on ASCII platforms, but +since ordinal 127 is a not a control character on EBCDIC machines, +C<\c?> instead maps to C<APC>, which is 255 in 0037 and 1047, and 95 in +POSIX-BC. =head1 FUNCTION DIFFERENCES diff --git a/t/op/chars.t b/t/op/chars.t index d26a632b17..3fa9b8fb19 100644 --- a/t/op/chars.t +++ b/t/op/chars.t @@ -77,6 +77,14 @@ is (ord($c), 30, '\c^'); $c = "\c_"; is (ord($c), 31, '\c_'); $c = "\c?"; -is (ord($c), 127, '\c?'); + +# '\c?' is an outlier, and is treated differently on each platform. +# It's DEL on ASCII, and APC on EBCDIC +is (ord($c), ((ord('^') == 95 || ord('^') == 175) # 1047 or 0037 + ? 255 + : ord('^') == 106 # Posix-BC + ? 95 + : 127), + '\c?'); $c = ''; is (ord($c), 0, 'ord("") is 0'); |