summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ext/B/B.pm8
-rw-r--r--handy.h22
-rw-r--r--pod/perlebcdic.pod12
-rw-r--r--t/op/chars.t10
4 files changed, 41 insertions, 11 deletions
diff --git a/ext/B/B.pm b/ext/B/B.pm
index ee4700f292..952475db2c 100644
--- a/ext/B/B.pm
+++ b/ext/B/B.pm
@@ -90,11 +90,13 @@ sub B::GV::SAFENAME {
# The regex below corresponds to the isCONTROLVAR macro
# from toke.c
- $name =~ s/^([\cA-\cZ\c\\c[\c]\c?\c_\c^])/"^".
- chr( utf8::unicode_to_native( 64 ^ ord($1) ))/e;
+ $name =~ s/^\c?/^?/
+ or $name =~ s/^([\cA-\cZ\c\\c[\c]\c_\c^])/
+ "^" . chr( utf8::unicode_to_native( 64 ^ ord($1) ))/e;
# When we say unicode_to_native we really mean ascii_to_native,
- # which matters iff this is a non-ASCII platform (EBCDIC).
+ # which matters iff this is a non-ASCII platform (EBCDIC). '\c?' would
+ # not have to be special cased, except for non-ASCII.
return $name;
}
diff --git a/handy.h b/handy.h
index d695cf5183..2f0132f572 100644
--- a/handy.h
+++ b/handy.h
@@ -1633,11 +1633,23 @@ EXTCONST U32 PL_charclass[];
#define isALNUMC_utf8(p) isALPHANUMERIC_utf8(p)
#define isALNUMC_LC_utf8(p) isALPHANUMERIC_LC_utf8(p)
-/* This conversion works both ways, strangely enough. On EBCDIC platforms,
- * CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII, except that they don't
- * necessarily mean the same characters, e.g. CTRL-D is 4 on both systems, but
- * that is EOT on ASCII; ST on EBCDIC */
-# define toCTRL(c) (toUPPER(NATIVE_TO_LATIN1(c)) ^ 64)
+/* On EBCDIC platforms, CTRL-@ is 0, CTRL-A is 1, etc, just like on ASCII,
+ * except that they don't necessarily mean the same characters, e.g. CTRL-D is
+ * 4 on both systems, but that is EOT on ASCII; ST on EBCDIC.
+ * '?' is special-cased on EBCDIC to APC, which is the control there that is
+ * the outlier from the block that contains the other controls, just like
+ * toCTRL('?') on ASCII yields DEL, the control that is the outlier from the C0
+ * block. If it weren't special cased, it would yield a non-control.
+ * The conversion works both ways, so CTRL('D') is 4, and CTRL(4) is D, etc. */
+#ifndef EBCDIC
+# define toCTRL(c) (toUPPER(c) ^ 64)
+#else
+# define toCTRL(c) ((c) == '?' \
+ ? LATIN1_TO_NATIVE(0x9F) \
+ : (c) == LATIN1_TO_NATIVE(0x9F) \
+ ? '?' \
+ : (NATIVE_TO_LATIN1(toUPPER(c)) ^ 64))
+#endif
/* Line numbers are unsigned, 32 bits. */
typedef U32 line_t;
diff --git a/pod/perlebcdic.pod b/pod/perlebcdic.pod
index af0e05d310..36f93e8d97 100644
--- a/pod/perlebcdic.pod
+++ b/pod/perlebcdic.pod
@@ -789,7 +789,7 @@ in the ASCII table is that they can "literally" be constructed
as control characters in perl, e.g. C<(chr(0)> eq C<\c@>)>
C<(chr(1)> eq C<\cA>)>, and so on. Perl on EBCDIC platforms has been
ported to take C<\c@> to chr(0) and C<\cA> to chr(1), etc. as well, but the
-thirty three characters that result depend on which code page you are
+characters that result depend on which code page you are
using. The table below uses the standard acronyms for the controls.
The POSIX-BC and 1047 sets are
identical throughout this range and differ from the 0037 set at only
@@ -799,10 +799,12 @@ platforms and cannot be generated as a C<"\c.letter."> control character on
0037 platforms. Note also that C<\c\> cannot be the final element in a string
or regex, as it will absorb the terminator. But C<\c\I<X>> is a C<FILE
SEPARATOR> concatenated with I<X> for all I<X>.
+The outlier C<\c?> on ASCII, which yields a non-C0 control C<DEL>,
+yields the outlier control C<APC> on EBCDIC, the one that isn't in the
+block of contiguous controls.
chr ord 8859-1 0037 1047 && POSIX-BC
-----------------------------------------------------------------------
- \c? 127 <DEL> " "
\c@ 0 <NUL> <NUL> <NUL>
\cA 1 <SOH> <SOH> <SOH>
\cB 2 <STX> <STX> <STX>
@@ -835,6 +837,12 @@ SEPARATOR> concatenated with I<X> for all I<X>.
\c] 29 <GS> <GS> <GS>
\c^ 30 <RS> <RS> <RS>
\c_ 31 <US> <US> <US>
+ \c? * <DEL> <APC> <APC>
+
+C<*> Note: C<\c?> maps to ordinal 127 (C<DEL>) on ASCII platforms, but
+since ordinal 127 is a not a control character on EBCDIC machines,
+C<\c?> instead maps to C<APC>, which is 255 in 0037 and 1047, and 95 in
+POSIX-BC.
=head1 FUNCTION DIFFERENCES
diff --git a/t/op/chars.t b/t/op/chars.t
index d26a632b17..3fa9b8fb19 100644
--- a/t/op/chars.t
+++ b/t/op/chars.t
@@ -77,6 +77,14 @@ is (ord($c), 30, '\c^');
$c = "\c_";
is (ord($c), 31, '\c_');
$c = "\c?";
-is (ord($c), 127, '\c?');
+
+# '\c?' is an outlier, and is treated differently on each platform.
+# It's DEL on ASCII, and APC on EBCDIC
+is (ord($c), ((ord('^') == 95 || ord('^') == 175) # 1047 or 0037
+ ? 255
+ : ord('^') == 106 # Posix-BC
+ ? 95
+ : 127),
+ '\c?');
$c = '';
is (ord($c), 0, 'ord("") is 0');