diff options
author | Yves Orton <demerphq@gmail.com> | 2008-11-07 20:20:21 +0000 |
---|---|---|
committer | Yves Orton <demerphq@gmail.com> | 2008-11-07 20:20:21 +0000 |
commit | da7fcca4b8d6fb4dc88e0305bf9830bf24912ebd (patch) | |
tree | d05a14842c3d234ee9e4f5d1f692c20733133eb1 /lib | |
parent | 463559e728b65f7b60e46efa081b43ff1b4b6fa4 (diff) | |
download | perl-da7fcca4b8d6fb4dc88e0305bf9830bf24912ebd.tar.gz |
create new unicode props as defined in POSIX spec (optionally use them in the regex engine)
Perlbug #60156 and #49302 (and probably others) boil down to the problem
that the definitions of \s and \w and \d and the POSIX charclasses are different
for unicode strings and for non-unicode strings. This broke the character class
logic in the regex engine. The easiest fix to make the character class logic sane
again is to define new properties which do match.
This change creates new property classes that can be used instead of the
traditional ones (it does not change the previously defined ones). If the
define in regcomp.h:
#define PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS 1
is changed to 0, then the new mappings will be used. This will fix a bunch
of bugs that are reported as TODO items in the new reg_posixcc.t test file.
p4raw-id: //depot/perl@34769
Diffstat (limited to 'lib')
-rw-r--r-- | lib/unicore/mktables | 38 |
1 files changed, 37 insertions, 1 deletions
diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 64de8b1fee..242465d548 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -779,6 +779,31 @@ sub simple_dumper { ## ## Process UnicodeData.txt (Categories, etc.) ## +# These are the character mappings as defined in the POSIX standard +# and in the case of PerlSpace and PerlWord as is defined in the test macros +# for binary strings. IOW, PerlWord is [A-Za-z0-9_] and PerlSpace is [\f\r\n\t ] +# This differs from Word and the existing SpacePerl (note the prefix/suffix difference) +# which is basically the Unicode WhiteSpace without the vertical tab included +# +my %TRUE_POSIX_PERL_CC= ( + PosixAlnum => { map { $_ => 1 } ( 0x0030..0x0039, 0x0041..0x005a, 0x0061..0x007a )}, + PosixAlpha => { map { $_ => 1 } ( 0x0041..0x005a, 0x0061..0x007a )}, + # Not Needed: Ascii => { map { $_ => 1 } ( 0x0000..0x007f )}, + PosixBlank => { map { $_ => 1 } ( 0x0009, 0x0020 )}, + PosixCntrl => { map { $_ => 1 } ( 0x0000..0x001f, 0x007f )}, + PosixGraph => { map { $_ => 1 } ( 0x0021..0x007e )}, + PosixLower => { map { $_ => 1 } ( 0x0061..0x007a )}, + PosixPrint => { map { $_ => 1 } ( 0x0020..0x007e )}, + PosixPunct => { map { $_ => 1 } ( 0x0021..0x002f, 0x003a..0x0040, 0x005b..0x0060, 0x007b..0x007e )}, + PosixSpace => { map { $_ => 1 } ( 0x0009..0x000d, 0x0020 )}, + PosixUpper => { map { $_ => 1 } ( 0x0041..0x005a )}, + # Not needed: PosixXdigit => { map { $_ => 1 } ( 0x0030..0x0039, 0x0041..0x0046, 0x0061..0x0066 )}, + PosixDigit => { map { $_ => 1 } ( 0x0030..0x0039 )}, + + PerlSpace => { map { $_ => 1 } ( 0x0009..0x000a, 0x000c..0x000d, 0x0020 )}, + PerlWord => { map { $_ => 1 } ( 0x0030..0x0039, 0x0041..0x005a, 0x005f, 0x0061..0x007a )}, +); + sub UnicodeData_Txt() { my $Bidi = Table->New(); @@ -795,7 +820,7 @@ sub UnicodeData_Txt() $DC{can} = Table->New(); $DC{com} = Table->New(); - ## Initialize Perl-generated categories + ## Initialize Broken Perl-generated categories ## (Categories from UnicodeData.txt are 
auto-initialized in gencat) $Cat{Alnum} = Table->New(Is => 'Alnum', Desc => "[[:Alnum:]]", Fuzzy => 0); @@ -839,6 +864,10 @@ sub UnicodeData_Txt() $To{Title} = Table->New(); $To{Digit} = Table->New(); + foreach my $cat (keys %TRUE_POSIX_PERL_CC) { + $Cat{$cat} = Table->New(Is=>$cat, Fuzzy => 0); + } + sub gencat($$$$) { my ($name, ## Name ("LATIN CAPITAL LETTER A") @@ -920,6 +949,13 @@ sub UnicodeData_Txt() $Cat{XDigit}->$op($code) if ($code >= 0x30 && $code <= 0x39) ## 0..9 || ($code >= 0x41 && $code <= 0x46) ## A..F || ($code >= 0x61 && $code <= 0x66); ## a..f + if ($code<=0x7F) { + foreach my $cat (keys %TRUE_POSIX_PERL_CC) { + if ($TRUE_POSIX_PERL_CC{$cat}{$code}) { + $Cat{$cat}->$op($code); + } + } + } } ## open ane read file..... |