diff options
-rw-r--r-- | ext/MIME/Base64/QuotedPrint.pm | 1 | ||||
-rw-r--r-- | ext/re/re.pm | 15 | ||||
-rw-r--r-- | perl.h | 1 | ||||
-rw-r--r-- | regcomp.c | 42 | ||||
-rwxr-xr-x | t/op/each.t | 6 | ||||
-rwxr-xr-x | t/op/pat.t | 3 | ||||
-rwxr-xr-x | t/op/regexp.t | 2 | ||||
-rw-r--r-- | t/op/utf8decode.t | 21 | ||||
-rw-r--r-- | utf8.c | 3 |
9 files changed, 73 insertions, 21 deletions
diff --git a/ext/MIME/Base64/QuotedPrint.pm b/ext/MIME/Base64/QuotedPrint.pm index 069f3226e9..b72a4b905c 100644 --- a/ext/MIME/Base64/QuotedPrint.pm +++ b/ext/MIME/Base64/QuotedPrint.pm @@ -71,6 +71,7 @@ require Exporter; $VERSION = sprintf("%d.%02d", q$Revision: 2.3 $ =~ /(\d+)\.(\d+)/); +use re 'asciirange'; # ranges in regular expressions refer to ASCII sub encode_qp ($) { diff --git a/ext/re/re.pm b/ext/re/re.pm index 3f142d9de4..d66bda5800 100644 --- a/ext/re/re.pm +++ b/ext/re/re.pm @@ -42,21 +42,21 @@ other transformations. When C<use re 'eval'> is in effect, a regex is allowed to contain C<(?{ ... })> zero-width assertions even if regular expression contains -variable interpolation. That is normally disallowed, since it is a +variable interpolation. That is normally disallowed, since it is a potential security risk. Note that this pragma is ignored when the regular expression is obtained from tainted data, i.e. evaluation is always disallowed with tainted regular expresssions. See L<perlre/(?{ code })>. -For the purpose of this pragma, interpolation of precompiled regular +For the purpose of this pragma, interpolation of precompiled regular expressions (i.e., the result of C<qr//>) is I<not> considered variable interpolation. Thus: /foo${pat}bar/ -I<is> allowed if $pat is a precompiled regular expression, even +I<is> allowed if $pat is a precompiled regular expression, even if $pat contains C<(?{ ... })> assertions. -When C<use re 'debug'> is in effect, perl emits debugging messages when +When C<use re 'debug'> is in effect, perl emits debugging messages when compiling and using regular expressions. The output is the same as that obtained by running a C<-DDEBUGGING>-enabled perl interpreter with the B<-Dr> switch. It may be quite voluminous depending on the complexity @@ -64,7 +64,7 @@ of the match. Using C<debugcolor> instead of C<debug> enables a form of output that can be used to get a colorful display on terminals that understand termcap color sequences. Set C<$ENV{PERL_RE_TC}> to a comma-separated list of C<termcap> properties to use for highlighting -strings on/off, pre-point part on/off. +strings on/off, pre-point part on/off. See L<perldebug/"Debugging regular expressions"> for additional info. The directive C<use re 'debug'> is I<not lexically scoped>, as the @@ -77,8 +77,9 @@ See L<perlmodlib/Pragmatic Modules>. # N.B. File::Basename contains a literal for 'taint' as a fallback. If # taint is changed here, File::Basename must be updated as well. my %bitmask = ( -taint => 0x00100000, -eval => 0x00200000, +taint => 0x00100000, +eval => 0x00200000, +asciirange => 0x02000000, ); sub setcolor { @@ -2807,6 +2807,7 @@ enum { /* pass one of these to get_vtbl */ #define HINT_FILETEST_ACCESS 0x00400000 #define HINT_UTF8 0x00800000 #define HINT_UTF8_DISTINCT 0x01000000 +#define HINT_RE_ASCIIR 0x02000000 /* Various states of an input record separator SV (rs, nrs) */ #define RsSNARF(sv) (! SvOK(sv)) @@ -3402,9 +3402,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) for (value = 0; value < 128; value++) ANYOF_BITMAP_SET(ret, value); #else /* EBCDIC */ - for (value = 0; value < 256; value++) - if (isASCII(value)) - ANYOF_BITMAP_SET(ret, value); + for (value = 0; value < 256; value++) { + if (PL_hints & HINT_RE_ASCIIR) { + if (NATIVE_TO_ASCII(value) < 128) + ANYOF_BITMAP_SET(ret, value); + } + else { + if (isASCII(value)) + ANYOF_BITMAP_SET(ret, value); + } + } #endif /* EBCDIC */ } dont_optimize_invert = TRUE; @@ -3418,9 +3425,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) for (value = 128; value < 256; value++) ANYOF_BITMAP_SET(ret, value); #else /* EBCDIC */ - for (value = 0; value < 256; value++) - if (!isASCII(value)) - ANYOF_BITMAP_SET(ret, value); + for (value = 0; value < 256; value++) { + if (PL_hints & HINT_RE_ASCIIR) { + if (NATIVE_TO_ASCII(value) >= 128) + ANYOF_BITMAP_SET(ret, value); + } + else { + if (!isASCII(value)) + ANYOF_BITMAP_SET(ret, value); + } + } #endif /* EBCDIC */ } dont_optimize_invert = TRUE; @@ -3681,7 +3695,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) } /* end of namedclass \blah */ if (range) { - if (lastvalue > value) /* b-a */ { + if (((lastvalue > value) && !(PL_hints & HINT_RE_ASCIIR)) || + ((NATIVE_TO_UNI(lastvalue) > NATIVE_TO_UNI(value)) && (PL_hints & HINT_RE_ASCIIR))) /* b-a */ { Simple_vFAIL4("Invalid [] range \"%*.*s\"", RExC_parse - rangebegin, RExC_parse - rangebegin, @@ -3715,7 +3730,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) if (!SIZE_ONLY) { if (lastvalue < 256 && value < 256) { #ifdef EBCDIC /* EBCDIC, for example. */ - if ((isLOWER(lastvalue) && isLOWER(value)) || + if (PL_hints & HINT_RE_ASCIIR) { + IV i; + /* New style scheme for ranges: + * after : + * use re 'asciir'; + * do ranges in ASCII/Unicode space + */ + for (i = NATIVE_TO_ASCII(lastvalue) ; i <= NATIVE_TO_ASCII(value); i++) + ANYOF_BITMAP_SET(ret, ASCII_TO_NATIVE(i)); + } + else if ((isLOWER(lastvalue) && isLOWER(value)) || (isUPPER(lastvalue) && isUPPER(value))) { IV i; @@ -4519,3 +4544,4 @@ clear_re(pTHXo_ void *r) { ReREFCNT_dec((regexp *)r); } + diff --git a/t/op/each.t b/t/op/each.t index daddc9c3c1..6dd1ceae8c 100755 --- a/t/op/each.t +++ b/t/op/each.t @@ -165,14 +165,16 @@ print "ok 24\n"; use bytes (); -$d = pack("U*", 0xe3, 0x81, 0x82); +# on EBCDIC chars are mapped differently so pick something that needs encoding +# there too. +$d = pack("U*", 0xe3, 0x81, 0xAF); $ol = bytes::length($d); print "not " unless $ol > 3; print "ok 25\n"; %u = ($d => "downgrade"); for (keys %u) { use bytes; - print "not " if length ne 3 or $_ ne "\xe3\x81\x82"; + print "not " if length ne 3 or $_ ne "\xe3\x81\xAF"; print "ok 26\n"; } { diff --git a/t/op/pat.t b/t/op/pat.t index 4c48c33d8e..c3024a2f2d 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -11,6 +11,9 @@ BEGIN { chdir 't' if -d 't'; @INC = '../lib'; } + +use re 'asciirange'; # Compute ranges in ASCII space + eval 'use Config'; # Defaults assumed if this fails $x = "abc\ndef\n"; diff --git a/t/op/regexp.t b/t/op/regexp.t index 4a4d42fd98..0b81e714a9 100755 --- a/t/op/regexp.t +++ b/t/op/regexp.t @@ -36,6 +36,8 @@ BEGIN { @INC = '../lib'; } +use re 'asciirange'; # ranges are computed in ASCII + $iters = shift || 1; # Poor man performance suite, 10000 is OK. open(TESTS,'op/re_tests') || open(TESTS,'t/op/re_tests') || diff --git a/t/op/utf8decode.t b/t/op/utf8decode.t index 4d05a6b8d3..494aa8cfb8 100644 --- a/t/op/utf8decode.t +++ b/t/op/utf8decode.t @@ -3,6 +3,21 @@ BEGIN { chdir 't' if -d 't'; @INC = '../lib'; + +} + +{ + my $wide = v256; + use bytes; + my $ordwide = ord($wide); + printf "# under use bytes ord(v256) = 0x%02x\n", $ordwide; + if ($ordwide == 140) { + print "1..0 # Skip: UTF-EBCDIC (not UTF-8) used here\n"; + exit 0; + } + elsif ($ordwide != 196) { + printf "# v256 starts with 0x%02x\n", $ordwide; + } } no utf8; @@ -13,7 +28,7 @@ my $test = 1; # This table is based on Markus Kuhn's UTF-8 Decode Stress Tester, # http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt, -# version dated 2000-09-02. +# version dated 2000-09-02. # We use the \x notation instead of raw binary bytes for \x00-\x1f\x7f-\xff # because e.g. many patch programs have issues with binary data. @@ -21,7 +36,7 @@ my $test = 1; my @MK = split(/\n/, <<__EOMK__); 1 Correct UTF-8 1.1.1 y "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5" - 11 ce:ba:e1:bd:b9:cf:83:ce:bc:ce:b5 5 -2 Boundary conditions +2 Boundary conditions 2.1 First possible sequence of certain length 2.1.1 y "\x00" 0 1 00 1 2.1.2 y "\xc2\x80" 80 2 c2:80 1 @@ -135,7 +150,7 @@ __EOMK__ sub moan { print "$id: @_"; } - + sub test_unpack_U { $WARNCNT = 0; $WARNMSG = ""; @@ -1342,7 +1342,8 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr) PUSHMARK(SP); EXTEND(SP,3); PUSHs((SV*)sv); - PUSHs(sv_2mortal(newSViv(code_point & ~(needents - 1)))); + /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */ + PUSHs(sv_2mortal(newSViv((klen) ? (code_point & ~(needents - 1)) : 0))); PUSHs(sv_2mortal(newSViv(needents))); PUTBACK; if (call_method("SWASHGET", G_SCALAR)) |