summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJarkko Hietaniemi <jhi@iki.fi>2001-03-28 15:47:43 +0000
committerJarkko Hietaniemi <jhi@iki.fi>2001-03-28 15:47:43 +0000
commitffbc6a930f7d2050ba54ac6bb9f15db93c1fab59 (patch)
tree7e81ef584643250fd840d7fda63c1eb7f7a6ddb6
parented4a5f9986e389ab30cb5ae84d8409ef8d4dcaec (diff)
downloadperl-ffbc6a930f7d2050ba54ac6bb9f15db93c1fab59.tar.gz
Integrate perlio:
[ 9400] More EBCDIC tweaks: - one more swash issue &~(0xA0-1) did not do the right thing, for UTF-EBCDIC where &~(0x80-1) does for UTF-8. - add "use re 'asciirange'" to make [!-~] etc. work use it in MIME::QuotedPrint and t/op/regexp.t and t/op/pat.t - Choose a key for t/op/each.t test which gets encoded. - Skip utf8decode if this is UTF-EBCDIC. p4raw-link: @9400 on //depot/perlio: daf0f78e031c718c75590ef9ef573756f805776e p4raw-id: //depot/perl@9407
-rw-r--r--ext/MIME/Base64/QuotedPrint.pm1
-rw-r--r--ext/re/re.pm15
-rw-r--r--perl.h1
-rw-r--r--regcomp.c42
-rwxr-xr-xt/op/each.t6
-rwxr-xr-xt/op/pat.t3
-rwxr-xr-xt/op/regexp.t2
-rw-r--r--t/op/utf8decode.t21
-rw-r--r--utf8.c3
9 files changed, 73 insertions, 21 deletions
diff --git a/ext/MIME/Base64/QuotedPrint.pm b/ext/MIME/Base64/QuotedPrint.pm
index 069f3226e9..b72a4b905c 100644
--- a/ext/MIME/Base64/QuotedPrint.pm
+++ b/ext/MIME/Base64/QuotedPrint.pm
@@ -71,6 +71,7 @@ require Exporter;
$VERSION = sprintf("%d.%02d", q$Revision: 2.3 $ =~ /(\d+)\.(\d+)/);
+use re 'asciirange'; # ranges in regular expressions refer to ASCII
sub encode_qp ($)
{
diff --git a/ext/re/re.pm b/ext/re/re.pm
index 3f142d9de4..d66bda5800 100644
--- a/ext/re/re.pm
+++ b/ext/re/re.pm
@@ -42,21 +42,21 @@ other transformations.
When C<use re 'eval'> is in effect, a regex is allowed to contain
C<(?{ ... })> zero-width assertions even if regular expression contains
-variable interpolation. That is normally disallowed, since it is a
+variable interpolation. That is normally disallowed, since it is a
potential security risk. Note that this pragma is ignored when the regular
expression is obtained from tainted data, i.e. evaluation is always
disallowed with tainted regular expresssions. See L<perlre/(?{ code })>.
-For the purpose of this pragma, interpolation of precompiled regular
+For the purpose of this pragma, interpolation of precompiled regular
expressions (i.e., the result of C<qr//>) is I<not> considered variable
interpolation. Thus:
/foo${pat}bar/
-I<is> allowed if $pat is a precompiled regular expression, even
+I<is> allowed if $pat is a precompiled regular expression, even
if $pat contains C<(?{ ... })> assertions.
-When C<use re 'debug'> is in effect, perl emits debugging messages when
+When C<use re 'debug'> is in effect, perl emits debugging messages when
compiling and using regular expressions. The output is the same as that
obtained by running a C<-DDEBUGGING>-enabled perl interpreter with the
B<-Dr> switch. It may be quite voluminous depending on the complexity
@@ -64,7 +64,7 @@ of the match. Using C<debugcolor> instead of C<debug> enables a
form of output that can be used to get a colorful display on terminals
that understand termcap color sequences. Set C<$ENV{PERL_RE_TC}> to a
comma-separated list of C<termcap> properties to use for highlighting
-strings on/off, pre-point part on/off.
+strings on/off, pre-point part on/off.
See L<perldebug/"Debugging regular expressions"> for additional info.
The directive C<use re 'debug'> is I<not lexically scoped>, as the
@@ -77,8 +77,9 @@ See L<perlmodlib/Pragmatic Modules>.
# N.B. File::Basename contains a literal for 'taint' as a fallback. If
# taint is changed here, File::Basename must be updated as well.
my %bitmask = (
-taint => 0x00100000,
-eval => 0x00200000,
+taint => 0x00100000,
+eval => 0x00200000,
+asciirange => 0x02000000,
);
sub setcolor {
diff --git a/perl.h b/perl.h
index d1cb711809..7e5d994bd4 100644
--- a/perl.h
+++ b/perl.h
@@ -2807,6 +2807,7 @@ enum { /* pass one of these to get_vtbl */
#define HINT_FILETEST_ACCESS 0x00400000
#define HINT_UTF8 0x00800000
#define HINT_UTF8_DISTINCT 0x01000000
+#define HINT_RE_ASCIIR 0x02000000
/* Various states of an input record separator SV (rs, nrs) */
#define RsSNARF(sv) (! SvOK(sv))
diff --git a/regcomp.c b/regcomp.c
index 33765fff9d..85f0e4532e 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -3402,9 +3402,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
for (value = 0; value < 128; value++)
ANYOF_BITMAP_SET(ret, value);
#else /* EBCDIC */
- for (value = 0; value < 256; value++)
- if (isASCII(value))
- ANYOF_BITMAP_SET(ret, value);
+ for (value = 0; value < 256; value++) {
+ if (PL_hints & HINT_RE_ASCIIR) {
+ if (NATIVE_TO_ASCII(value) < 128)
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ else {
+ if (isASCII(value))
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ }
#endif /* EBCDIC */
}
dont_optimize_invert = TRUE;
@@ -3418,9 +3425,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
for (value = 128; value < 256; value++)
ANYOF_BITMAP_SET(ret, value);
#else /* EBCDIC */
- for (value = 0; value < 256; value++)
- if (!isASCII(value))
- ANYOF_BITMAP_SET(ret, value);
+ for (value = 0; value < 256; value++) {
+ if (PL_hints & HINT_RE_ASCIIR) {
+ if (NATIVE_TO_ASCII(value) >= 128)
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ else {
+ if (!isASCII(value))
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ }
#endif /* EBCDIC */
}
dont_optimize_invert = TRUE;
@@ -3681,7 +3695,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
} /* end of namedclass \blah */
if (range) {
- if (lastvalue > value) /* b-a */ {
+ if (((lastvalue > value) && !(PL_hints & HINT_RE_ASCIIR)) ||
+ ((NATIVE_TO_UNI(lastvalue) > NATIVE_TO_UNI(value)) && (PL_hints & HINT_RE_ASCIIR))) /* b-a */ {
Simple_vFAIL4("Invalid [] range \"%*.*s\"",
RExC_parse - rangebegin,
RExC_parse - rangebegin,
@@ -3715,7 +3730,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
if (!SIZE_ONLY) {
if (lastvalue < 256 && value < 256) {
#ifdef EBCDIC /* EBCDIC, for example. */
- if ((isLOWER(lastvalue) && isLOWER(value)) ||
+ if (PL_hints & HINT_RE_ASCIIR) {
+ IV i;
+ /* New style scheme for ranges:
+ * after :
+ * use re 'asciir';
+ * do ranges in ASCII/Unicode space
+ */
+ for (i = NATIVE_TO_ASCII(lastvalue) ; i <= NATIVE_TO_ASCII(value); i++)
+ ANYOF_BITMAP_SET(ret, ASCII_TO_NATIVE(i));
+ }
+ else if ((isLOWER(lastvalue) && isLOWER(value)) ||
(isUPPER(lastvalue) && isUPPER(value)))
{
IV i;
@@ -4519,3 +4544,4 @@ clear_re(pTHXo_ void *r)
{
ReREFCNT_dec((regexp *)r);
}
+
diff --git a/t/op/each.t b/t/op/each.t
index daddc9c3c1..6dd1ceae8c 100755
--- a/t/op/each.t
+++ b/t/op/each.t
@@ -165,14 +165,16 @@ print "ok 24\n";
use bytes ();
-$d = pack("U*", 0xe3, 0x81, 0x82);
+# on EBCDIC chars are mapped differently so pick something that needs encoding
+# there too.
+$d = pack("U*", 0xe3, 0x81, 0xAF);
$ol = bytes::length($d);
print "not " unless $ol > 3;
print "ok 25\n";
%u = ($d => "downgrade");
for (keys %u) {
use bytes;
- print "not " if length ne 3 or $_ ne "\xe3\x81\x82";
+ print "not " if length ne 3 or $_ ne "\xe3\x81\xAF";
print "ok 26\n";
}
{
diff --git a/t/op/pat.t b/t/op/pat.t
index 4c48c33d8e..c3024a2f2d 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -11,6 +11,9 @@ BEGIN {
chdir 't' if -d 't';
@INC = '../lib';
}
+
+use re 'asciirange'; # Compute ranges in ASCII space
+
eval 'use Config'; # Defaults assumed if this fails
$x = "abc\ndef\n";
diff --git a/t/op/regexp.t b/t/op/regexp.t
index 4a4d42fd98..0b81e714a9 100755
--- a/t/op/regexp.t
+++ b/t/op/regexp.t
@@ -36,6 +36,8 @@ BEGIN {
@INC = '../lib';
}
+use re 'asciirange'; # ranges are computed in ASCII
+
$iters = shift || 1; # Poor man performance suite, 10000 is OK.
open(TESTS,'op/re_tests') || open(TESTS,'t/op/re_tests') ||
diff --git a/t/op/utf8decode.t b/t/op/utf8decode.t
index 4d05a6b8d3..494aa8cfb8 100644
--- a/t/op/utf8decode.t
+++ b/t/op/utf8decode.t
@@ -3,6 +3,21 @@
BEGIN {
chdir 't' if -d 't';
@INC = '../lib';
+
+}
+
+{
+ my $wide = v256;
+ use bytes;
+ my $ordwide = ord($wide);
+ printf "# under use bytes ord(v256) = 0x%02x\n", $ordwide;
+ if ($ordwide == 140) {
+ print "1..0 # Skip: UTF-EBCDIC (not UTF-8) used here\n";
+ exit 0;
+ }
+ elsif ($ordwide != 196) {
+ printf "# v256 starts with 0x%02x\n", $ordwide;
+ }
}
no utf8;
@@ -13,7 +28,7 @@ my $test = 1;
# This table is based on Markus Kuhn's UTF-8 Decode Stress Tester,
# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt,
-# version dated 2000-09-02.
+# version dated 2000-09-02.
# We use the \x notation instead of raw binary bytes for \x00-\x1f\x7f-\xff
# because e.g. many patch programs have issues with binary data.
@@ -21,7 +36,7 @@ my $test = 1;
my @MK = split(/\n/, <<__EOMK__);
1 Correct UTF-8
1.1.1 y "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5" - 11 ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5 5
-2 Boundary conditions
+2 Boundary conditions
2.1 First possible sequence of certain length
2.1.1 y "\x00" 0 1 00 1
2.1.2 y "\xc2\x80" 80 2 c2:80 1
@@ -135,7 +150,7 @@ __EOMK__
sub moan {
print "$id: @_";
}
-
+
sub test_unpack_U {
$WARNCNT = 0;
$WARNMSG = "";
diff --git a/utf8.c b/utf8.c
index 66d3fec81c..25cd0fdc41 100644
--- a/utf8.c
+++ b/utf8.c
@@ -1342,7 +1342,8 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
PUSHMARK(SP);
EXTEND(SP,3);
PUSHs((SV*)sv);
- PUSHs(sv_2mortal(newSViv(code_point & ~(needents - 1))));
+ /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
+ PUSHs(sv_2mortal(newSViv((klen) ? (code_point & ~(needents - 1)) : 0)));
PUSHs(sv_2mortal(newSViv(needents)));
PUTBACK;
if (call_method("SWASHGET", G_SCALAR))