diff options
-rw-r--r-- | charclass_invlists.h | 356 | ||||
-rw-r--r-- | embedvar.h | 2 | ||||
-rw-r--r-- | perlapi.h | 2 | ||||
-rw-r--r-- | perlvars.h | 1 | ||||
-rw-r--r-- | regcomp.c | 1 | ||||
-rw-r--r-- | regen/mk_invlists.pl | 30 | ||||
-rw-r--r-- | uni_keywords.h | 2 |
7 files changed, 392 insertions, 2 deletions
diff --git a/charclass_invlists.h b/charclass_invlists.h index eb40f5eaf7..b1dd4430e3 100644 --- a/charclass_invlists.h +++ b/charclass_invlists.h @@ -18043,6 +18043,360 @@ static const int Lowercase_Mapping_invmap[] = { /* for EBCDIC 037 */ #if (defined(PERL_IN_REGCOMP_C) && ! defined(PERL_IN_XSUB_RE)) +static const UV _Perl_CCC_non0_non230_invlist[] = { /* for all charsets */ + 347, /* Number of elements */ + 148565664, /* Version and data structure type */ + 1, /* 0 if the list starts at 0; + 1 if it starts at the element beyond 0 */ + 0x0, + 0x315, + 0x33D, + 0x345, + 0x346, + 0x347, + 0x34A, + 0x34D, + 0x34F, + 0x353, + 0x357, + 0x358, + 0x35B, + 0x35C, + 0x363, + 0x591, + 0x592, + 0x596, + 0x597, + 0x59A, + 0x59C, + 0x5A2, + 0x5A8, + 0x5AA, + 0x5AB, + 0x5AD, + 0x5AF, + 0x5B0, + 0x5BE, + 0x5BF, + 0x5C0, + 0x5C1, + 0x5C3, + 0x5C5, + 0x5C6, + 0x5C7, + 0x5C8, + 0x618, + 0x61B, + 0x64B, + 0x653, + 0x655, + 0x657, + 0x65C, + 0x65D, + 0x65F, + 0x660, + 0x670, + 0x671, + 0x6E3, + 0x6E4, + 0x6EA, + 0x6EB, + 0x6ED, + 0x6EE, + 0x711, + 0x712, + 0x731, + 0x732, + 0x734, + 0x735, + 0x737, + 0x73A, + 0x73B, + 0x73D, + 0x73E, + 0x73F, + 0x742, + 0x743, + 0x744, + 0x745, + 0x746, + 0x747, + 0x748, + 0x749, + 0x7F2, + 0x7F3, + 0x7FD, + 0x7FE, + 0x859, + 0x85C, + 0x8D3, + 0x8D4, + 0x8E3, + 0x8E4, + 0x8E6, + 0x8E7, + 0x8E9, + 0x8EA, + 0x8ED, + 0x8F3, + 0x8F6, + 0x8F7, + 0x8F9, + 0x8FB, + 0x93C, + 0x93D, + 0x94D, + 0x94E, + 0x952, + 0x953, + 0x9BC, + 0x9BD, + 0x9CD, + 0x9CE, + 0xA3C, + 0xA3D, + 0xA4D, + 0xA4E, + 0xABC, + 0xABD, + 0xACD, + 0xACE, + 0xB3C, + 0xB3D, + 0xB4D, + 0xB4E, + 0xBCD, + 0xBCE, + 0xC4D, + 0xC4E, + 0xC55, + 0xC57, + 0xCBC, + 0xCBD, + 0xCCD, + 0xCCE, + 0xD3B, + 0xD3D, + 0xD4D, + 0xD4E, + 0xDCA, + 0xDCB, + 0xE38, + 0xE3B, + 0xE48, + 0xE4C, + 0xEB8, + 0xEBA, + 0xEC8, + 0xECC, + 0xF18, + 0xF1A, + 0xF35, + 0xF36, + 0xF37, + 0xF38, + 0xF39, + 0xF3A, + 0xF71, + 0xF73, + 0xF74, + 0xF75, + 0xF7A, + 0xF7E, + 0xF80, + 0xF81, + 0xF84, + 0xF85, + 0xFC6, + 0xFC7, + 0x1037, + 0x1038, + 0x1039, + 0x103B, + 0x108D, + 0x108E, + 0x1714, + 0x1715, + 0x1734, + 0x1735, + 0x17D2, + 0x17D3, + 0x18A9, + 0x18AA, + 0x1939, + 0x193A, + 0x193B, + 0x193C, + 0x1A18, + 0x1A19, + 0x1A60, + 0x1A61, + 0x1A7F, + 0x1A80, + 0x1AB5, + 0x1ABB, + 0x1ABD, + 0x1ABE, + 0x1B34, + 0x1B35, + 0x1B44, + 0x1B45, + 0x1B6C, + 0x1B6D, + 0x1BAA, + 0x1BAC, + 0x1BE6, + 0x1BE7, + 0x1BF2, + 0x1BF4, + 0x1C37, + 0x1C38, + 0x1CD4, + 0x1CDA, + 0x1CDC, + 0x1CE0, + 0x1CE2, + 0x1CE9, + 0x1CED, + 0x1CEE, + 0x1DC2, + 0x1DC3, + 0x1DCA, + 0x1DCB, + 0x1DCD, + 0x1DD1, + 0x1DF6, + 0x1DFA, + 0x1DFC, + 0x1DFE, + 0x1DFF, + 0x1E00, + 0x20D2, + 0x20D4, + 0x20D8, + 0x20DB, + 0x20E5, + 0x20E7, + 0x20E8, + 0x20E9, + 0x20EA, + 0x20F0, + 0x2D7F, + 0x2D80, + 0x302A, + 0x3030, + 0x3099, + 0x309B, + 0xA806, + 0xA807, + 0xA8C4, + 0xA8C5, + 0xA92B, + 0xA92E, + 0xA953, + 0xA954, + 0xA9B3, + 0xA9B4, + 0xA9C0, + 0xA9C1, + 0xAAB4, + 0xAAB5, + 0xAAF6, + 0xAAF7, + 0xABED, + 0xABEE, + 0xFB1E, + 0xFB1F, + 0xFE27, + 0xFE2E, + 0x101FD, + 0x101FE, + 0x102E0, + 0x102E1, + 0x10A0D, + 0x10A0E, + 0x10A39, + 0x10A3B, + 0x10A3F, + 0x10A40, + 0x10AE6, + 0x10AE7, + 0x10F46, + 0x10F48, + 0x10F4B, + 0x10F4C, + 0x10F4D, + 0x10F51, + 0x11046, + 0x11047, + 0x1107F, + 0x11080, + 0x110B9, + 0x110BB, + 0x11133, + 0x11135, + 0x11173, + 0x11174, + 0x111C0, + 0x111C1, + 0x111CA, + 0x111CB, + 0x11235, + 0x11237, + 0x112E9, + 0x112EB, + 0x1133B, + 0x1133D, + 0x1134D, + 0x1134E, + 0x11442, + 0x11443, + 0x11446, + 0x11447, + 0x114C2, + 0x114C4, + 0x115BF, + 0x115C1, + 0x1163F, + 0x11640, + 0x116B6, + 0x116B8, + 0x1172B, + 0x1172C, + 0x11839, + 0x1183B, + 0x11A34, + 0x11A35, + 0x11A47, + 0x11A48, + 0x11A99, + 0x11A9A, + 0x11C3F, + 0x11C40, + 0x11D42, + 0x11D43, + 0x11D44, + 0x11D46, + 0x11D97, + 0x11D98, + 0x16AF0, + 0x16AF5, + 0x1BC9E, + 0x1BC9F, + 0x1D165, + 0x1D16A, + 0x1D16D, + 0x1D173, + 0x1D17B, + 0x1D183, + 0x1D18A, + 0x1D18C, + 0x1E8D0, + 0x1E8D7, + 0x1E94A, + 0x1E94B +}; + # if 'A' == 65 /* ASCII/Latin1 */ static const UV _Perl_GCB_invlist[] = { /* for ASCII/Latin1 */ @@ -383696,5 +384050,5 @@ static const U8 WB_table[23][23] = { * 7bd6bcbe3813e0cd55e0998053d182b7bc8c97dcfd0b85028e9f7f55af4ad61b lib/unicore/version * 4bb677187a1a64e39d48f2e341b5ecb6c99857e49d7a79cf503bd8a3c709999b regen/charset_translations.pl * 03e51b0f07beebd5da62ab943899aa4934eee1f792fa27c1fb638c33bf4ac6ea regen/mk_PL_charclass.pl - * 1fdcc4c0ed94008c13daeb934b40cbd9f5b2871201dce7a9f0530be4145ea026 regen/mk_invlists.pl + * 8ae37f2b5bbc7d215f63e8d1189754d83a16c6156fd353847f6fcced90c513d5 regen/mk_invlists.pl * ex: set ro: */ diff --git a/embedvar.h b/embedvar.h index 539fc5a32e..ad7aae8306 100644 --- a/embedvar.h +++ b/embedvar.h @@ -351,6 +351,8 @@ #define PL_GAboveLatin1 (my_vars->GAboveLatin1) #define PL_Assigned_invlist (my_vars->GAssigned_invlist) #define PL_GAssigned_invlist (my_vars->GAssigned_invlist) +#define PL_CCC_non0_non230 (my_vars->GCCC_non0_non230) +#define PL_GCCC_non0_non230 (my_vars->GCCC_non0_non230) #define PL_C_locale_obj (my_vars->GC_locale_obj) #define PL_GC_locale_obj (my_vars->GC_locale_obj) #define PL_GCB_invlist (my_vars->GGCB_invlist) @@ -103,6 +103,8 @@ END_EXTERN_C #define PL_AboveLatin1 (*Perl_GAboveLatin1_ptr(NULL)) #undef PL_Assigned_invlist #define PL_Assigned_invlist (*Perl_GAssigned_invlist_ptr(NULL)) +#undef PL_CCC_non0_non230 +#define PL_CCC_non0_non230 (*Perl_GCCC_non0_non230_ptr(NULL)) #undef PL_C_locale_obj #define PL_C_locale_obj (*Perl_GC_locale_obj_ptr(NULL)) #undef PL_GCB_invlist diff --git a/perlvars.h b/perlvars.h index e8064eb320..8a4ff6a47b 100644 --- a/perlvars.h +++ b/perlvars.h @@ -305,6 +305,7 @@ PERLVAR(G, utf8_charname_begin, SV *) PERLVAR(G, utf8_charname_continue, SV *) PERLVAR(G, utf8_mark, SV *) PERLVAR(G, InBitmap, SV *) +PERLVAR(G, CCC_non0_non230, SV *) /* Everything that folds to a given character, for case insensitivity regex * matching */ @@ -21947,6 +21947,7 @@ Perl_init_uniprops(pTHX) PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist); PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist); PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]); + PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist); #ifdef UNI_XIDC /* The below are used only by deprecated functions. They could be removed */ diff --git a/regen/mk_invlists.pl b/regen/mk_invlists.pl index dd6a0321e0..55c4afb279 100644 --- a/regen/mk_invlists.pl +++ b/regen/mk_invlists.pl @@ -1092,6 +1092,35 @@ sub UpperLatin1 { return \@return; } +sub _Perl_CCC_non0_non230 { + + # Create an inversion list of code points with non-zero canonical + # combining class that also don't have 230 as the class number. This is + # part of a Unicode Standard rule + + my @nonzeros = prop_invlist("ccc=0"); + shift @nonzeros; # Invert so is "ccc != 0" + + my @return; + + # Expand into list of code points, while excluding those with ccc == 230 + for (my $i = 0; $i < @nonzeros; $i += 2) { + my $upper = ($i + 1) < @nonzeros + ? $nonzeros[$i+1] - 1 # In range + : $Unicode::UCD::MAX_CP; # To infinity. + for my $j ($nonzeros[$i] .. $upper) { + my @ccc_names = prop_value_aliases("ccc", charprop($j, "ccc")); + + # Final element in @ccc_names will be all numeric + push @return, $j if $ccc_names[-1] != 230; + } + } + + @return = sort { $a <=> $b } @return; + @return = mk_invlist_from_sorted_cp_list(\@return); + return \@return; +} + sub output_table_common { # Common subroutine to actually output the generated rules table. @@ -2319,6 +2348,7 @@ push @props, sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw( Simple_Case_Folding Case_Folding &_Perl_IVCF + &_Perl_CCC_non0_non230 ); # NOTE that the convention is that extra enum values come # after the property name, separated by commas, with the enums diff --git a/uni_keywords.h b/uni_keywords.h index c2fceb5527..9d2b8816ae 100644 --- a/uni_keywords.h +++ b/uni_keywords.h @@ -6996,6 +6996,6 @@ MPH_VALt match_uniprop( const unsigned char * const key, const U16 key_len ) { * 7bd6bcbe3813e0cd55e0998053d182b7bc8c97dcfd0b85028e9f7f55af4ad61b lib/unicore/version * 4bb677187a1a64e39d48f2e341b5ecb6c99857e49d7a79cf503bd8a3c709999b regen/charset_translations.pl * 03e51b0f07beebd5da62ab943899aa4934eee1f792fa27c1fb638c33bf4ac6ea regen/mk_PL_charclass.pl - * 1fdcc4c0ed94008c13daeb934b40cbd9f5b2871201dce7a9f0530be4145ea026 regen/mk_invlists.pl + * 8ae37f2b5bbc7d215f63e8d1189754d83a16c6156fd353847f6fcced90c513d5 regen/mk_invlists.pl * c56b78df81e0f96632246052d71580b212546ca02ba4075158965e11d892f21e regen/mph.pl * ex: set ro: */ |