summaryrefslogtreecommitdiff
path: root/regen
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2012-10-02 21:48:26 -0600
committerKarl Williamson <public@khwilliamson.com>2012-10-09 11:16:04 -0600
commit4b9dbf476b4dd332407df71ce9af089941a5fa0b (patch)
tree32bf3125529cb4f6051ba7e9b146d5588e5c8a4c /regen
parentfbd1cbddb39b0020d30bb614af1f816fd6366cba (diff)
downloadperl-4b9dbf476b4dd332407df71ce9af089941a5fa0b.tar.gz
Add regen/regcharclass_multi_char_folds.pl
This takes as input the current Unicode character data base, and outputs lists of the multi-character folds in it, in a form suitable for input to regen/regcharclass.pl
Diffstat (limited to 'regen')
-rw-r--r--regen/regcharclass_multi_char_folds.pl106
1 files changed, 106 insertions, 0 deletions
diff --git a/regen/regcharclass_multi_char_folds.pl b/regen/regcharclass_multi_char_folds.pl
new file mode 100644
index 0000000000..ce2d781af7
--- /dev/null
+++ b/regen/regcharclass_multi_char_folds.pl
@@ -0,0 +1,106 @@
+package regcharclass_multi_char_folds;
+use 5.015;
+use strict;
+use warnings;
+use Unicode::UCD "prop_invmap";
+
+# This returns an array of strings of the form
+# "\x{foo}\x{bar}\x{baz}"
+# of the sequences of code points that are multi-character folds in the
+# current Unicode version. If the parameter is 1, all such folds are
+# returned. If the parameters is 0, only the ones containing exclusively
+# ASCII characters are returned. In the latter case all combinations of ASCII
+# characters that can fold to the base one are returned. Thus for 'ss', it
+# would return in addition, 'Ss', 'sS', and 'SS'. This is because this code
+# is designed to help regcomp.c, and EXACTFish regnodes. For non-UTF-8
+# patterns, the strings are not folded, so we need to check for the upper and
+# lower case versions. For UTF-8 patterns, the strings are folded, so we only
+# need to worry about the fold version. There are no non-ASCII Latin1
+# multi-char folds currently, and none likely to be ever added, so this
+# doesn't worry about that case, except to croak should it happen.
+# This is designed for input to regen/regcharlass.pl.
+
+sub gen_combinations ($;) {
+ # Generate all combinations for the first parameter which is an array of
+ # arrays.
+
+ my ($fold_ref, $string, $i) = @_;
+ $string = "" unless $string;
+ $i = 0 unless $i;
+
+ my @ret;
+
+ # Look at each element in this level's array.
+ foreach my $j (0 .. @{$fold_ref->[$i]} - 1) {
+
+ # Append its representation to what we have currently
+ my $new_string = sprintf "$string\\x{%X}", $fold_ref->[$i][$j];
+
+ if ($i >= @$fold_ref - 1) { # Final level: just return it
+ push @ret, "\"$new_string\"";
+ }
+ else { # Generate the combinations for the next level with this one's
+ push @ret, &gen_combinations($fold_ref, $new_string, $i + 1);
+ }
+ }
+
+ return @ret;
+}
+
+sub multi_char_folds ($) {
+ my $all_folds = shift; # The single parameter is true if wants all
+ # multi-char folds; false if just the ones that
+ # are all ascii
+
+ my ($cp_ref, $folds_ref, $format) = prop_invmap("Case_Folding");
+ die "Could not find inversion map for Case_Folding" unless defined $format;
+ die "Incorrect format '$format' for Case_Folding inversion map"
+ unless $format eq 'al';
+ my @folds;
+
+ for my $i (0 .. @$folds_ref - 1) {
+ next unless ref $folds_ref->[$i]; # Skip single-char folds
+
+ # The code in regcomp.c currently assumes that no multi-char fold
+ # folds to the upper Latin1 range. It's not a big deal to add; we
+ # just have to forbid such a fold in EXACTFL nodes, like we do already
+ # for ascii chars in EXACTFA (and EXACTFL) nodes. But I (khw) doubt
+ # that there will ever be such a fold created by Unicode, so the code
+ # isn't there to occupy space and time; instead there is this check.
+ die sprintf("regcomp.c can't cope with a latin1 multi-char fold (found in the fold of U+%X", $cp_ref->[$i]) if grep { $_ < 256 && chr($_) !~ /[[:ascii:]]/ } @{$folds_ref->[$i]};
+
+ # Create a line that looks like "\x{foo}\x{bar}\x{baz}" of the code
+ # points that make up the fold.
+ my $fold = join "", map { sprintf "\\x{%X}", $_ } @{$folds_ref->[$i]};
+ $fold = "\"$fold\"";
+
+ # Skip if something else already has this fold
+ next if grep { $_ eq $fold } @folds;
+
+ if ($all_folds) {
+ push @folds, $fold
+ } # Skip if wants only all-ascii folds, and there is a non-ascii
+ elsif (! grep { chr($_) =~ /[^[:ascii:]]/ } @{$folds_ref->[$i]}) {
+
+ # If the fold is to a cased letter, replace the entry with an
+ # array which also includes its upper case.
+ my $this_fold_ref = $folds_ref->[$i];
+ for my $j (0 .. @$this_fold_ref - 1) {
+ my $this_ord = $this_fold_ref->[$j];
+ if (chr($this_ord) =~ /\p{Cased}/) {
+ my $uc = ord(uc(chr($this_ord)));
+ undef $this_fold_ref->[$j];
+ @{$this_fold_ref->[$j]} = ( $this_ord, $uc);
+ }
+ }
+
+ # Then generate all combinations of upper/lower case of the fold.
+ push @folds, gen_combinations($this_fold_ref);
+
+ }
+ }
+
+ return @folds;
+}
+
+1