diff options
author | Karl Williamson <public@khwilliamson.com> | 2013-04-13 11:41:04 -0600 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2014-02-05 15:47:04 -0700 |
commit | 2bd1cbf6ef490552a1de7d86d43c05162e3e5e91 (patch) | |
tree | 900d97fa1a7662c381d473bdcdf3493a363a7d4c | |
parent | 1a5eefe0dbec84fef092f54a1da3f267f6ac039d (diff) | |
download | perl-2bd1cbf6ef490552a1de7d86d43c05162e3e5e91.tar.gz |
handy.h Special case toCTRL('?') for EBCDIC
There is no change for ASCII platforms. For EBCDIC ones, toCTRL('?")
and its inverse are special cased to map to/from the APC control
character, which is the outlier control on these platforms. The reason
to special case this is that otherwise toCTRL('?') would map to a
graphic character, not a control. By outlier, I mean it is the one
control not in the single block where all the other controls are placed.
Further, it corresponds on two of the platforms with 0xFF, which is
would be an EBCDIC rub-out character corresponding to an ASCII rub-out
(or DEL) 0x7F, which is what toCTRL('?') maps to on ASCII. This is an
outlier control on ASCII not being a member of the C0 nor C1 controls.
Hence this make '?' mean the outlier control on both platforms.
-rw-r--r-- | ext/B/B.pm | 8 | ||||
-rw-r--r-- | handy.h | 22 | ||||
-rw-r--r-- | pod/perlebcdic.pod | 12 | ||||
-rw-r--r-- | t/op/chars.t | 10 |
4 files changed, 41 insertions, 11 deletions
diff --git a/ext/B/B.pm b/ext/B/B.pm index ee4700f292..952475db2c 100644 --- a/ext/B/B.pm +++ b/ext/B/B.pm @@ -90,11 +90,13 @@ sub B::GV::SAFENAME { # The regex below corresponds to the isCONTROLVAR macro # from toke.c - $name =~ s/^([\cA-\cZ\c\\c[\c]\c?\c_\c^])/"^". - chr( utf8::unicode_to_native( 64 ^ ord($1) ))/e; + $name =~ s/^\c?/^?/ + or $name =~ s/^([\cA-\cZ\c\\c[\c]\c_\c^])/ + "^" . chr( utf8::unicode_to_native( 64 ^ ord($1) ))/e; # When we say unicode_to_native we really mean ascii_to_native, - # which matters iff this is a non-ASCII platform (EBCDIC). + # which matters iff this is a non-ASCII platform (EBCDIC). '\c?' would + # not have to be special cased, except for non-ASCII. return $name; } @@ -1633,11 +1633,23 @@ EXTCONST U32 PL_charclass[]; #define isALNUMC_utf8(p) isALPHANUMERIC_utf8(p) #define isALNUMC_LC_utf8(p) isALPHANUMERIC_LC_utf8(p) -/* This conversion works both ways, strangely enough. On EBCDIC platforms, - * CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII, except that they don't - * necessarily mean the same characters, e.g. CTRL-D is 4 on both systems, but - * that is EOT on ASCII; ST on EBCDIC */ -# define toCTRL(c) (toUPPER(NATIVE_TO_LATIN1(c)) ^ 64) +/* On EBCDIC platforms, CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII, + * except that they don't necessarily mean the same characters, e.g. CTRL-D is + * 4 on both systems, but that is EOT on ASCII; ST on EBCDIC. + * '?' is special-cased on EBCDIC to APC, which is the control there that is + * the outlier from the block that contains the other controls, just like + * toCTRL('?') on ASCII yields DEL, the control that is the outlier from the C0 + * block. If it weren't special cased, it would yield a non-control. + * The conversion works both ways, so CTRL('D') is 4, and CTRL(4) is D, etc. */ +#ifndef EBCDIC +# define toCTRL(c) (toUPPER(c) ^ 64) +#else +# define toCTRL(c) ((c) == '?' \ + ? LATIN1_TO_NATIVE(0x9F) \ + : (c) == LATIN1_TO_NATIVE(0x9F) \ + ? '?' \ + : (NATIVE_TO_LATIN1(toUPPER(c)) ^ 64)) +#endif /* Line numbers are unsigned, 32 bits. */ typedef U32 line_t; diff --git a/pod/perlebcdic.pod b/pod/perlebcdic.pod index af0e05d310..36f93e8d97 100644 --- a/pod/perlebcdic.pod +++ b/pod/perlebcdic.pod @@ -789,7 +789,7 @@ in the ASCII table is that they can "literally" be constructed as control characters in perl, e.g. C<(chr(0)> eq C<\c@>)> C<(chr(1)> eq C<\cA>)>, and so on. Perl on EBCDIC platforms has been ported to take C<\c@> to chr(0) and C<\cA> to chr(1), etc. as well, but the -thirty three characters that result depend on which code page you are +characters that result depend on which code page you are using. The table below uses the standard acronyms for the controls. The POSIX-BC and 1047 sets are identical throughout this range and differ from the 0037 set at only @@ -799,10 +799,12 @@ platforms and cannot be generated as a C<"\c.letter."> control character on 0037 platforms. Note also that C<\c\> cannot be the final element in a string or regex, as it will absorb the terminator. But C<\c\I<X>> is a C<FILE SEPARATOR> concatenated with I<X> for all I<X>. +The outlier C<\c?> on ASCII, which yields a non-C0 control C<DEL>, +yields the outlier control C<APC> on EBCDIC, the one that isn't in the +block of contiguous controls. chr ord 8859-1 0037 1047 && POSIX-BC ----------------------------------------------------------------------- - \c? 127 <DEL> " " \c@ 0 <NUL> <NUL> <NUL> \cA 1 <SOH> <SOH> <SOH> \cB 2 <STX> <STX> <STX> @@ -835,6 +837,12 @@ SEPARATOR> concatenated with I<X> for all I<X>. \c] 29 <GS> <GS> <GS> \c^ 30 <RS> <RS> <RS> \c_ 31 <US> <US> <US> + \c? * <DEL> <APC> <APC> + +C<*> Note: C<\c?> maps to ordinal 127 (C<DEL>) on ASCII platforms, but +since ordinal 127 is a not a control character on EBCDIC machines, +C<\c?> instead maps to C<APC>, which is 255 in 0037 and 1047, and 95 in +POSIX-BC. =head1 FUNCTION DIFFERENCES diff --git a/t/op/chars.t b/t/op/chars.t index d26a632b17..3fa9b8fb19 100644 --- a/t/op/chars.t +++ b/t/op/chars.t @@ -77,6 +77,14 @@ is (ord($c), 30, '\c^'); $c = "\c_"; is (ord($c), 31, '\c_'); $c = "\c?"; -is (ord($c), 127, '\c?'); + +# '\c?' is an outlier, and is treated differently on each platform. +# It's DEL on ASCII, and APC on EBCDIC +is (ord($c), ((ord('^') == 95 || ord('^') == 175) # 1047 or 0037 + ? 255 + : ord('^') == 106 # Posix-BC + ? 95 + : 127), + '\c?'); $c = ''; is (ord($c), 0, 'ord("") is 0'); |