diff options
author | Karl Williamson <khw@cpan.org> | 2018-03-28 18:20:10 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2018-03-31 15:36:46 -0600 |
commit | d2aadf62cbb988487f162551df71b8a36cb54fb6 (patch) | |
tree | a6603a286c9e9e3f550d284b13471943c683549e /regen | |
parent | 341bb5b701cc0a72be8a81da44c67546e0a62436 (diff) | |
download | perl-d2aadf62cbb988487f162551df71b8a36cb54fb6.tar.gz |
regen/mk_invlists.pl: Generate tables for inverted case folds
This table will be used in the next commit
Diffstat (limited to 'regen')
-rw-r--r-- | regen/mk_invlists.pl | 190 |
1 files changed, 189 insertions, 1 deletions
diff --git a/regen/mk_invlists.pl b/regen/mk_invlists.pl index 525d44b0ea..dae78467f3 100644 --- a/regen/mk_invlists.pl +++ b/regen/mk_invlists.pl @@ -659,7 +659,7 @@ sub mk_invlist_from_sorted_cp_list { # Read in the Case Folding rules, and construct arrays of code points for the # properties we need. -my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding"); +my ($cp_ref, $folds_ref, $format, $default) = prop_invmap("Case_Folding"); die "Could not find inversion map for Case_Folding" unless defined $format; die "Incorrect format '$format' for Case_Folding inversion map" unless $format eq 'al' @@ -685,6 +685,186 @@ sub _Perl_Non_Final_Folds { return \@return; } +sub _Perl_IVCF { + + # This creates a map of the inversion of case folding. i.e., given a + # character, it gives all the other characters that fold to it. + # + # Inversion maps function kind of like a hash, with the inversion list + # specifying the buckets (keys) and the inversion maps specifying the + # contents of the corresponding bucket. Effectively this function just + # swaps the keys and values of the case fold hash. But there are + # complications. Most importantly, More than one character can each have + # the same fold. This is solved by having a list of characters that fold + # to a given one. + + my %new; + + # Go through the inversion list. + for (my $i = 0; $i < @$cp_ref; $i++) { + + # Skip if nothing folds to this + next if $folds_ref->[$i] == 0; + + # This entry which is valid from here to up (but not including) the + # next entry is for the next $count characters, so that, for example, + # A-Z is represented by one entry. + my $cur_list = $cp_ref->[$i]; + my $count = $cp_ref->[$i+1] - $cur_list; + + # The fold of [$i] can be not just a single character, but a sequence + # of multiple ones. We deal with those here by just creating a string + # consisting of them. Otherwise, we use the single code point [$i] + # folds to. + my $cur_map = (ref $folds_ref->[$i]) + ? join "", map { chr } $folds_ref->[$i]->@* + : $folds_ref->[$i]; + + # Expand out this range + while ($count > 0) { + push @{$new{$cur_map}}, $cur_list; + + # A multiple-character fold is a string, and shouldn't need + # incrementing anyway + if (ref $folds_ref->[$i]) { + die sprintf("Case fold for %x is multiple chars; should have" + . " a count of 1, but instead it was $count", $count) + unless $count == 1; + } + else { + $cur_map++; + $cur_list++; + } + $count--; + } + } + + # Now go through and make some adjustments. We add synthetic entries for + # two cases. + # 1) Two or more code points can fold to the same multiple character, + # sequence, as U+FB05 and U+FB06 both fold to 'st'. This code is only + # for single character folds, but FB05 and FB06 are single characters + # that are equivalent folded, so we add entries so that they are + # considered to fold to each other + # 2) If two or more above-Latin1 code points fold to the same Latin1 range + # one, we also add entries so that they are considered to fold to each + # other. This is so that under /aa or /l matching, where folding to + # their Latin1 range code point is illegal, they still can fold to each + # other. This situation happens in Unicode 3.0.1, but probably no + # other version. + foreach my $fold (keys %new) { + my $folds_to_string = $fold =~ /\D/a; + + # If the bucket contains only one element, convert from an array to a + # scalar + if (scalar $new{$fold}->@* == 1) { + $new{$fold} = $new{$fold}[0]; + } + else { + + # Otherwise, sort numerically. This places the highest code point + # in the list at the tail end. This is because Unicode keeps the + # lowercase code points as higher ordinals than the uppercase, at + # least for the ones that matter so far. These are synthetic + # entries, and we want to predictably have the lowercase (which is + # more likely to be what gets folded to) in the same corresponding + # position, so that other code can rely on that. If some new + # version of Unicode came along that violated this, we might have + # to change so that the sort is based on upper vs lower instead. + # (The lower-comes-after isn't true of native EBCDIC, but here we + # are dealing strictly with Unicode values). + @{$new{$fold}} = sort { $a <=> $b } $new{$fold}->@* + unless $folds_to_string; + # We will be working with a copy of this sorted entry. + my @source_list = $new{$fold}->@*; + if (! $folds_to_string) { + + # This handles situation 2) listed above, which only arises if + # what is being folded-to (the fold) is in the Latin1 range. + if ($fold > 255 ) { + undef @source_list; + } + else { + # And it only arises if there are two or more folders that + # fold to it above Latin1. We look at just those. + @source_list = grep { $_ > 255 } @source_list; + undef @source_list if @source_list == 1; + } + } + + # Here, we've found the items we want to set up synthetic folds + # for. Add entries so that each folds to each other. + foreach my $cp (@source_list) { + my @rest = grep { $cp != $_ } @source_list; + if (@rest == 1) { + $new{$cp} = $rest[0]; + } + else { + push @{$new{$cp}}, @rest; + } + } + } + + # We don't otherwise deal with multiple-character folds + delete $new{$fold} if $folds_to_string; + } + + + # Now we have a hash that is the inversion of the case fold property. + # Convert it to an inversion map. + + my @sorted_folds = sort { $a <=> $b } keys %new; + my (@invlist, @invmap); + + # We know that nothing folds to the controls (whose ordinals start at 0). + # And the first real entries are the lowest in the hash. + push @invlist, 0, $sorted_folds[0]; + push @invmap, 0, $new{$sorted_folds[0]}; + + # Go through the remainder of the hash keys (which are the folded code + # points) + for (my $i = 1; $i < @sorted_folds; $i++) { + + # Get the current one, and the one prior to it. + my $fold = $sorted_folds[$i]; + my $prev_fold = $sorted_folds[$i-1]; + + # If the current one is not just 1 away from the prior one, we close + # out the range containing the previous fold, and know that the gap + # doesn't have anything that folds. + if ($fold - 1 != $prev_fold) { + push @invlist, $prev_fold + 1; + push @invmap, 0; + + # And start a new range + push @invlist, $fold; + push @invmap, $new{$fold}; + } + elsif ($new{$fold} - 1 != $new{$prev_fold}) { + + # Here the current fold is just 1 greater than the previous, but + # the new map isn't correspondingly 1 greater than the previous, + # the old range is ended, but since there is no gap, we don't have + # to insert anything else. + push @invlist, $fold; + push @invmap, $new{$fold}; + + } # else { Otherwise, this new entry just extends the previous } + + die "In IVCF: $invlist[-1] <= $invlist[-2]" + if $invlist[-1] <= $invlist[-2]; + } + + # And add an entry that indicates that everything above this, to infinity, + # does not have a case fold. + push @invlist, $sorted_folds[-1] + 1; + push @invmap, 0; + + # All Unicode versions have some places where multiple code points map to + # the same one, so the format always has an 'l' + return \@invlist, \@invmap, 'al', $default; +} + sub prop_name_for_cmp ($) { # Sort helper my $name = shift; @@ -1920,6 +2100,7 @@ for my $charset (get_supported_code_pages()) { Uppercase_Mapping Simple_Case_Folding Case_Folding + &_Perl_IVCF ) # NOTE that the convention is that extra enum # values come after the property name, separated by @@ -1965,6 +2146,13 @@ for my $charset (get_supported_code_pages()) { die $@ if $@; my $invlist_ref = shift @return; @invlist = @$invlist_ref; + if (@return) { # If has other values returned , must be an + # inversion map + my $invmap_ref = shift @return; + @invmap = @$invmap_ref; + $map_format = shift @return; + $map_default = shift @return; + } } else { @invlist = prop_invlist($lookup_prop, '_perl_core_internal_ok'); |