summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2014-01-23 20:34:15 -0700
committerKarl Williamson <public@khwilliamson.com>2014-01-27 11:07:17 -0700
commit1a27eb967982a1b8fdc2ab7ae9af98f318c808ec (patch)
tree9fa41e716d7e18f8bd6b9a49925cc17b5a6b0af9
parentf0672d4d47b5a2683d20833c31c6a5238588be20 (diff)
downloadperl-1a27eb967982a1b8fdc2ab7ae9af98f318c808ec.tar.gz
Move an inversion list generation to mktables
Prior to this patch, this was in regen/mk_invlists.pl, but future commits will want it to also be used by the header generated by regen/regcharclass.pl, so use a common source so the logic doesn't have to be duplicated.
-rw-r--r--charclass_invlists.h2
-rw-r--r--lib/unicore/mktables22
-rw-r--r--regcharclass.h27
-rw-r--r--regcomp.c3
-rw-r--r--regen/mk_invlists.pl7
-rwxr-xr-xregen/regcharclass.pl4
6 files changed, 52 insertions, 13 deletions
diff --git a/charclass_invlists.h b/charclass_invlists.h
index 9f40681dda..7bc14d88d0 100644
--- a/charclass_invlists.h
+++ b/charclass_invlists.h
@@ -9171,7 +9171,7 @@ static const UV NonL1_Perl_Non_Final_Folds_invlist[] = {
#ifndef PERL_IN_XSUB_RE
-static const UV _Perl_Multi_Char_Folds_invlist[] = {
+static const UV _Perl_Folds_To_Multi_Char_invlist[] = {
59, /* Number of elements */
148565664, /* Version and data structure type */
1, /* 0 if the list starts at 0;
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index 4a58886eee..8ab0b4629e 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -13849,11 +13849,25 @@ sub compile_perl() {
my $any_folds = $perl->add_match_table("_Perl_Any_Folds",
Description => "Code points that particpate in some fold",
);
- #
+
+ my $folds_to_multi_char = $perl->add_match_table(
+ "_Perl_Folds_To_Multi_Char",
+ Description =>
+ "Code points whose fold is a string of more than one character",
+ );
+
foreach my $range (property_ref('Case_Folding')->ranges) {
- $any_folds->add_range($range->start, $range->end);
- foreach my $hex_code_point (split " ", $range->value) {
- my $code_point = hex $hex_code_point;
+ my $start = $range->start;
+ my $end = $range->end;
+ $any_folds->add_range($start, $end);
+
+ my @hex_code_points = split " ", $range->value;
+ if (@hex_code_points > 1) {
+ $folds_to_multi_char->add_range($start, $end);
+ }
+
+ foreach my $i (0 .. @hex_code_points - 1) {
+ my $code_point = hex $hex_code_points[$i];
$any_folds->add_range($code_point, $code_point);
}
}
diff --git a/regcharclass.h b/regcharclass.h
index 8b175fd1b8..2c482ce013 100644
--- a/regcharclass.h
+++ b/regcharclass.h
@@ -854,6 +854,33 @@
: 0 )
/*
+ FOLDS_TO_MULTI: characters that fold to multi-char strings
+
+ \p{_Perl_Folds_To_Multi_Char}
+*/
+/*** GENERATED CODE ***/
+#define is_FOLDS_TO_MULTI_utf8(s) \
+( ( 0xC3 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ? \
+ ( ( 0x9F == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ? 2 : 0 ) \
+: ( 0xC4 == NATIVE_TO_LATIN1(((U8*)s)[0]) || 0xC7 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ?\
+ ( ( 0xB0 == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ? 2 : 0 ) \
+: ( 0xC5 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ? \
+ ( ( 0x89 == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ? 2 : 0 ) \
+: ( 0xCE == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ? \
+ ( ( ( NATIVE_TO_LATIN1(((U8*)s)[1]) & 0xDF ) == 0x90 ) ? 2 : 0 ) \
+: ( 0xD6 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ? \
+ ( ( 0x87 == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ? 2 : 0 ) \
+: ( 0xE1 == NATIVE_TO_LATIN1(((U8*)s)[0]) ) ? \
+ ( ( 0xBA == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ? \
+ ( ( ( 0x96 <= NATIVE_TO_LATIN1(((U8*)s)[2]) && NATIVE_TO_LATIN1(((U8*)s)[2]) <= 0x9A ) || 0x9E == NATIVE_TO_LATIN1(((U8*)s)[2]) ) ? 3 : 0 )\
+ : ( 0xBD == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ? \
+ ( ( ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xF9 ) == 0x90 ) ? 3 : 0 ) \
+ : ( 0xBE == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ? \
+ ( ( ( ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xE0 ) == 0x80 ) || ( ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xF0 ) == 0xA0 ) || ( ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xFA ) == 0xB2 ) || ( ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xF7 ) == 0xB4 ) ) ? 3 : 0 )\
+ : ( ( 0xBF == NATIVE_TO_LATIN1(((U8*)s)[1]) ) && ( ( ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xCA ) == 0x82 ) || ( ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xF7 ) == 0x84 ) || NATIVE_TO_LATIN1(((U8*)s)[2]) == 0xA4 || ( ( NATIVE_TO_LATIN1(((U8*)s)[2]) & 0xF7 ) == 0xB4 ) ) ) ? 3 : 0 )\
+: ( ( ( 0xEF == NATIVE_TO_LATIN1(((U8*)s)[0]) ) && ( 0xAC == NATIVE_TO_LATIN1(((U8*)s)[1]) ) ) && ( ( NATIVE_TO_LATIN1(((U8*)s)[2]) <= 0x86 ) || ( 0x93 <= NATIVE_TO_LATIN1(((U8*)s)[2]) && NATIVE_TO_LATIN1(((U8*)s)[2]) <= 0x97 ) ) ) ? 3 : 0 )
+
+/*
PATWS: pattern white space
\p{PatWS}
diff --git a/regcomp.c b/regcomp.c
index 147484cd31..152cac9df6 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -6080,9 +6080,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
-
PL_HasMultiCharFold =
- _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
+ _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
}
#endif
diff --git a/regen/mk_invlists.pl b/regen/mk_invlists.pl
index d112b3059c..c9432a3d8d 100644
--- a/regen/mk_invlists.pl
+++ b/regen/mk_invlists.pl
@@ -111,11 +111,6 @@ for my $i (0 .. @$folds_ref - 1) {
}
}
-sub _Perl_Multi_Char_Folds {
- @has_multi_char_fold = sort { $a <=> $b } @has_multi_char_fold;
- return mk_invlist_from_cp_list(\@has_multi_char_fold);
-}
-
sub _Perl_Non_Final_Folds {
@is_non_final_fold = sort { $a <=> $b } @is_non_final_fold;
return mk_invlist_from_cp_list(\@is_non_final_fold);
@@ -175,7 +170,7 @@ for my $prop (qw(
XPosixXDigit
_Perl_Any_Folds
&NonL1_Perl_Non_Final_Folds
- &_Perl_Multi_Char_Folds
+ _Perl_Folds_To_Multi_Char
&UpperLatin1
_Perl_IDStart
_Perl_IDCont
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl
index 61cd2109a1..959b7a5c47 100755
--- a/regen/regcharclass.pl
+++ b/regen/regcharclass.pl
@@ -1513,6 +1513,10 @@ MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
&regcharclass_multi_char_folds::multi_char_folds(0)
# 0 => Latin1-only
+FOLDS_TO_MULTI: characters that fold to multi-char strings
+=> UTF8 :fast
+\p{_Perl_Folds_To_Multi_Char}
+
PATWS: pattern white space
=> generic generic_non_low cp : fast safe
\p{PatWS}