diff options
author | Karl Williamson <public@khwilliamson.com> | 2014-01-23 20:34:15 -0700 |
---|---|---|
committer | Karl Williamson <public@khwilliamson.com> | 2014-01-27 11:07:17 -0700 |
commit | 1a27eb967982a1b8fdc2ab7ae9af98f318c808ec (patch) | |
tree | 9fa41e716d7e18f8bd6b9a49925cc17b5a6b0af9 | |
parent | f0672d4d47b5a2683d20833c31c6a5238588be20 (diff) | |
download | perl-1a27eb967982a1b8fdc2ab7ae9af98f318c808ec.tar.gz |
Move an inversion list generation to mktables
Prior to this patch, this was in regen/mk_invlists.pl, but future
commits will want it to also be used by the header generated by
regen/regcharclass.pl, so use a common source so the logic doesn't have
to be duplicated.
-rw-r--r-- | charclass_invlists.h | 2 | ||||
-rw-r--r-- | lib/unicore/mktables | 22 | ||||
-rw-r--r-- | regcharclass.h | 27 | ||||
-rw-r--r-- | regcomp.c | 3 | ||||
-rw-r--r-- | regen/mk_invlists.pl | 7 | ||||
-rwxr-xr-x | regen/regcharclass.pl | 4 |
6 files changed, 52 insertions, 13 deletions
diff --git a/charclass_invlists.h b/charclass_invlists.h index 9f40681dda..7bc14d88d0 100644 --- a/charclass_invlists.h +++ b/charclass_invlists.h @@ -9171,7 +9171,7 @@ static const UV NonL1_Perl_Non_Final_Folds_invlist[] = { #ifndef PERL_IN_XSUB_RE -static const UV _Perl_Multi_Char_Folds_invlist[] = { +static const UV _Perl_Folds_To_Multi_Char_invlist[] = { 59, /* Number of elements */ 148565664, /* Version and data structure type */ 1, /* 0 if the list starts at 0; diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 4a58886eee..8ab0b4629e 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -13849,11 +13849,25 @@ sub compile_perl() { my $any_folds = $perl->add_match_table("_Perl_Any_Folds", Description => "Code points that particpate in some fold", ); - # + + my $folds_to_multi_char = $perl->add_match_table( + "_Perl_Folds_To_Multi_Char", + Description => + "Code points whose fold is a string of more than one character", + ); + foreach my $range (property_ref('Case_Folding')->ranges) { - $any_folds->add_range($range->start, $range->end); - foreach my $hex_code_point (split " ", $range->value) { - my $code_point = hex $hex_code_point; + my $start = $range->start; + my $end = $range->end; + $any_folds->add_range($start, $end); + + my @hex_code_points = split " ", $range->value; + if (@hex_code_points > 1) { + $folds_to_multi_char->add_range($start, $end); + } + + foreach my $i (0 .. @hex_code_points - 1) { + my $code_point = hex $hex_code_points[$i]; $any_folds->add_range($code_point, $code_point); } } diff --git a/regcharclass.h b/regcharclass.h index 8b175fd1b8..2c482ce013 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -854,6 +854,33 @@ : 0 ) /* + FOLDS_TO_MULTI: characters that fold to multi-char strings + + \p{_Perl_Folds_To_Multi_Char} +*/ +/*** GENERATED CODE ***/ +#define is_FOLDS_TO_MULTI_utf8(s) \ +( ( 0xC3 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ? \ + ( ( 0x9F == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ? 2 : 0 ) \ +: ( 0xC4 == NATIVE_TO_LATIN1(((U8*)s)[0]) || 0xC7 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ?\ + ( ( 0xB0 == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ? 2 : 0 ) \ +: ( 0xC5 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ? \ + ( ( 0x89 == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ? 2 : 0 ) \ +: ( 0xCE == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ? \ + ( ( ( NATIVE_TO_LATIN1(((U8*)s)[1]) & 0xDF ) == 0x90 ) ? 2 : 0 ) \ +: ( 0xD6 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ? \ + ( ( 0x87 == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ? 2 : 0 ) \ +: ( 0xE1 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ? \ + ( ( 0xBA == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ? \ + ( ( ( 0x96 <= NATIVE_TO_LATIN1(((U8*)s)[2]) && NATIVE_TO_LATIN1(((U8*)s)[2]) <= 0x9A ) || 0x9E == NATIVE_TO_LATIN1(((U8*)s)[2]) ) ? 3 : 0 )\ + : ( 0xBD == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ? \ + ( ( ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xF9 ) == 0x90 ) ? 3 : 0 ) \ + : ( 0xBE == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ? \ + ( ( ( ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xE0 ) == 0x80 ) || ( ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xF0 ) == 0xA0 ) || ( ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xFA ) == 0xB2 ) || ( ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xF7 ) == 0xB4 ) ) ? 3 : 0 )\ + : ( ( 0xBF == NATIVE_TO_LATIN1(((U8*)s)[1]) ) && ( ( ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xCA ) == 0x82 ) || ( ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xF7 ) == 0x84 ) || NATIVE_TO_LATIN1(((U8*)s)[2]) == 0xA4 || ( ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xF7 ) == 0xB4 ) ) ) ? 3 : 0 )\ +: ( ( ( 0xEF == NATIVE_TO_LATIN1(((U8*)s)[0]) ) && ( 0xAC == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ) && ( ( NATIVE_TO_LATIN1(((U8*)s)[2]) <= 0x86 ) || ( 0x93 <= NATIVE_TO_LATIN1(((U8*)s)[2]) && NATIVE_TO_LATIN1(((U8*)s)[2]) <= 0x97 ) ) ) ? 3 : 0 ) + +/* PATWS: pattern white space \p{PatWS} @@ -6080,9 +6080,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, PL_Latin1 = _new_invlist_C_array(Latin1_invlist); PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist); PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist); - PL_HasMultiCharFold = - _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist); + _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist); } #endif diff --git a/regen/mk_invlists.pl b/regen/mk_invlists.pl index d112b3059c..c9432a3d8d 100644 --- a/regen/mk_invlists.pl +++ b/regen/mk_invlists.pl @@ -111,11 +111,6 @@ for my $i (0 .. @$folds_ref - 1) { } } -sub _Perl_Multi_Char_Folds { - @has_multi_char_fold = sort { $a <=> $b } @has_multi_char_fold; - return mk_invlist_from_cp_list(\@has_multi_char_fold); -} - sub _Perl_Non_Final_Folds { @is_non_final_fold = sort { $a <=> $b } @is_non_final_fold; return mk_invlist_from_cp_list(\@is_non_final_fold); @@ -175,7 +170,7 @@ for my $prop (qw( XPosixXDigit _Perl_Any_Folds &NonL1_Perl_Non_Final_Folds - &_Perl_Multi_Char_Folds + _Perl_Folds_To_Multi_Char &UpperLatin1 _Perl_IDStart _Perl_IDCont diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index 61cd2109a1..959b7a5c47 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -1513,6 +1513,10 @@ MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character ®charclass_multi_char_folds::multi_char_folds(0) # 0 => Latin1-only +FOLDS_TO_MULTI: characters that fold to multi-char strings +=> UTF8 :fast +\p{_Perl_Folds_To_Multi_Char} + PATWS: pattern white space => generic generic_non_low cp : fast safe \p{PatWS} |