summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2013-04-13 11:41:04 -0600
committerKarl Williamson <public@khwilliamson.com>2014-02-05 15:47:04 -0700
commit2bd1cbf6ef490552a1de7d86d43c05162e3e5e91 (patch)
tree900d97fa1a7662c381d473bdcdf3493a363a7d4c
parent1a5eefe0dbec84fef092f54a1da3f267f6ac039d (diff)
downloadperl-2bd1cbf6ef490552a1de7d86d43c05162e3e5e91.tar.gz
handy.h Special case toCTRL('?') for EBCDIC
There is no change for ASCII platforms. For EBCDIC ones, toCTRL('?") and its inverse are special cased to map to/from the APC control character, which is the outlier control on these platforms. The reason to special case this is that otherwise toCTRL('?') would map to a graphic character, not a control. By outlier, I mean it is the one control not in the single block where all the other controls are placed. Further, it corresponds on two of the platforms with 0xFF, which is would be an EBCDIC rub-out character corresponding to an ASCII rub-out (or DEL) 0x7F, which is what toCTRL('?') maps to on ASCII. This is an outlier control on ASCII not being a member of the C0 nor C1 controls. Hence this make '?' mean the outlier control on both platforms.
-rw-r--r--ext/B/B.pm8
-rw-r--r--handy.h22
-rw-r--r--pod/perlebcdic.pod12
-rw-r--r--t/op/chars.t10
4 files changed, 41 insertions, 11 deletions
diff --git a/ext/B/B.pm b/ext/B/B.pm
index ee4700f292..952475db2c 100644
--- a/ext/B/B.pm
+++ b/ext/B/B.pm
@@ -90,11 +90,13 @@ sub B::GV::SAFENAME {
# The regex below corresponds to the isCONTROLVAR macro
# from toke.c
- $name =~ s/^([\cA-\cZ\c\\c[\c]\c?\c_\c^])/"^".
- chr( utf8::unicode_to_native( 64 ^ ord($1) ))/e;
+ $name =~ s/^\c?/^?/
+ or $name =~ s/^([\cA-\cZ\c\\c[\c]\c_\c^])/
+ "^" . chr( utf8::unicode_to_native( 64 ^ ord($1) ))/e;
# When we say unicode_to_native we really mean ascii_to_native,
- # which matters iff this is a non-ASCII platform (EBCDIC).
+ # which matters iff this is a non-ASCII platform (EBCDIC). '\c?' would
+ # not have to be special cased, except for non-ASCII.
return $name;
}
diff --git a/handy.h b/handy.h
index d695cf5183..2f0132f572 100644
--- a/handy.h
+++ b/handy.h
@@ -1633,11 +1633,23 @@ EXTCONST U32 PL_charclass[];
#define isALNUMC_utf8(p) isALPHANUMERIC_utf8(p)
#define isALNUMC_LC_utf8(p) isALPHANUMERIC_LC_utf8(p)
-/* This conversion works both ways, strangely enough. On EBCDIC platforms,
- * CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII, except that they don't
- * necessarily mean the same characters, e.g. CTRL-D is 4 on both systems, but
- * that is EOT on ASCII; ST on EBCDIC */
-# define toCTRL(c) (toUPPER(NATIVE_TO_LATIN1(c)) ^ 64)
+/* On EBCDIC platforms, CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII,
+ * except that they don't necessarily mean the same characters, e.g. CTRL-D is
+ * 4 on both systems, but that is EOT on ASCII; ST on EBCDIC.
+ * '?' is special-cased on EBCDIC to APC, which is the control there that is
+ * the outlier from the block that contains the other controls, just like
+ * toCTRL('?') on ASCII yields DEL, the control that is the outlier from the C0
+ * block. If it weren't special cased, it would yield a non-control.
+ * The conversion works both ways, so CTRL('D') is 4, and CTRL(4) is D, etc. */
+#ifndef EBCDIC
+# define toCTRL(c) (toUPPER(c) ^ 64)
+#else
+# define toCTRL(c) ((c) == '?' \
+ ? LATIN1_TO_NATIVE(0x9F) \
+ : (c) == LATIN1_TO_NATIVE(0x9F) \
+ ? '?' \
+ : (NATIVE_TO_LATIN1(toUPPER(c)) ^ 64))
+#endif
/* Line numbers are unsigned, 32 bits. */
typedef U32 line_t;
diff --git a/pod/perlebcdic.pod b/pod/perlebcdic.pod
index af0e05d310..36f93e8d97 100644
--- a/pod/perlebcdic.pod
+++ b/pod/perlebcdic.pod
@@ -789,7 +789,7 @@ in the ASCII table is that they can "literally" be constructed
as control characters in perl, e.g. C<(chr(0)> eq C<\c@>)>
C<(chr(1)> eq C<\cA>)>, and so on. Perl on EBCDIC platforms has been
ported to take C<\c@> to chr(0) and C<\cA> to chr(1), etc. as well, but the
-thirty three characters that result depend on which code page you are
+characters that result depend on which code page you are
using. The table below uses the standard acronyms for the controls.
The POSIX-BC and 1047 sets are
identical throughout this range and differ from the 0037 set at only
@@ -799,10 +799,12 @@ platforms and cannot be generated as a C<"\c.letter."> control character on
0037 platforms. Note also that C<\c\> cannot be the final element in a string
or regex, as it will absorb the terminator. But C<\c\I<X>> is a C<FILE
SEPARATOR> concatenated with I<X> for all I<X>.
+The outlier C<\c?> on ASCII, which yields a non-C0 control C<DEL>,
+yields the outlier control C<APC> on EBCDIC, the one that isn't in the
+block of contiguous controls.
chr ord 8859-1 0037 1047 && POSIX-BC
-----------------------------------------------------------------------
- \c? 127 <DEL> " "
\c@ 0 <NUL> <NUL> <NUL>
\cA 1 <SOH> <SOH> <SOH>
\cB 2 <STX> <STX> <STX>
@@ -835,6 +837,12 @@ SEPARATOR> concatenated with I<X> for all I<X>.
\c] 29 <GS> <GS> <GS>
\c^ 30 <RS> <RS> <RS>
\c_ 31 <US> <US> <US>
+ \c? * <DEL> <APC> <APC>
+
+C<*> Note: C<\c?> maps to ordinal 127 (C<DEL>) on ASCII platforms, but
+since ordinal 127 is a not a control character on EBCDIC machines,
+C<\c?> instead maps to C<APC>, which is 255 in 0037 and 1047, and 95 in
+POSIX-BC.
=head1 FUNCTION DIFFERENCES
diff --git a/t/op/chars.t b/t/op/chars.t
index d26a632b17..3fa9b8fb19 100644
--- a/t/op/chars.t
+++ b/t/op/chars.t
@@ -77,6 +77,14 @@ is (ord($c), 30, '\c^');
$c = "\c_";
is (ord($c), 31, '\c_');
$c = "\c?";
-is (ord($c), 127, '\c?');
+
+# '\c?' is an outlier, and is treated differently on each platform.
+# It's DEL on ASCII, and APC on EBCDIC
+is (ord($c), ((ord('^') == 95 || ord('^') == 175) # 1047 or 0037
+ ? 255
+ : ord('^') == 106 # Posix-BC
+ ? 95
+ : 127),
+ '\c?');
$c = '';
is (ord($c), 0, 'ord("") is 0');