summary refs log tree commit diff
diff options
context:
space:
mode:
author Karl Williamson <khw@cpan.org> 2022-09-16 06:24:39 -0600
committer Karl Williamson <khw@cpan.org> 2022-09-28 07:33:44 -0600
commit 18d9f713b2f9bb0dbbd8f62a5a821b2d7586a272 (patch)
tree 2b37f0fcbf41b41d0be62ee67ef6e7a895b6f1d5
parent 8727bfde8965fbfd0d7e5a95f654e3e1d851bcb6 (diff)
download perl-18d9f713b2f9bb0dbbd8f62a5a821b2d7586a272.tar.gz
mktables: Accept multiple @missing lines in input files
Unicode 15.0 now uses multiple @missing lines to deal with ranges of code points whose default for unassigned code points differs from that of the table at large. For example, a table may have one overall default, while all the Ideographic character ranges have a different one. Prior to this new mechanism, the files had entries for each unassigned code point whose default differed from the global one. So this saves some lines in the files that Unicode delivers that were otherwise useless. Not all files in 15.0 have been converted to use the new scheme, for whatever reason.
-rw-r--r-- charclass_invlists.h 2
-rw-r--r-- lib/unicore/mktables 87
-rw-r--r-- lib/unicore/uni_keywords.pl 2
-rw-r--r-- regcharclass.h 2
-rw-r--r-- uni_keywords.h 2
5 files changed, 81 insertions, 14 deletions
diff --git a/charclass_invlists.h b/charclass_invlists.h
index dde3391a7f..38799d0e28 100644
--- a/charclass_invlists.h
+++ b/charclass_invlists.h
@@ -430752,7 +430752,7 @@ static const U8 WB_table[23][23] = {
* 43f6df50e4878f501b417e366b0ee097ae5ccb2d4ce942026bed3d62d78e7887 lib/unicore/extracted/DLineBreak.txt
* a04502ebb36a45d83cbe48a7d8132ea8143edb7b3d34d0aa6afe4a9685049741 lib/unicore/extracted/DNumType.txt
* 11075771b112e8e7ccf6ffa637c4c91eadc3ef3db0517b24e605df8fd3624239 lib/unicore/extracted/DNumValues.txt
- * d97aeb4312c8fdc0f44654834108596ecdf5d03c8fca231d6def4338687a89c9 lib/unicore/mktables
+ * 3f7a81c6f40611d1e68f5e42699368ce95d36ca8a852ca5a252b41cee055391a lib/unicore/mktables
* c72bbdeda99714db1c8024d3311da4aef3c0db3b9b9f11455a7cfe10d5e9aba3 lib/unicore/version
* 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl
* c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index 4c9dc23bc0..af4ed02869 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -10945,12 +10945,19 @@ sub output_perl_charnames_line ($code_point, $name) {
# stored by the Input_file class until we access it here.
# It's possible that there is more than one such line
# waiting for us; collect them all, and parse
- my @missings_list = $file->get_missings
+ my @missings_list;
+ @missings_list = $file->get_missings
if $file->has_missings_defaults;
foreach my $default_ref (@missings_list) {
- my $default = $default_ref->{default};
- my $addr = pack 'J', refaddr property_ref($default_ref->{property});
+
+ # For now, we are only interested in the fallback
+ # default for the entire property; i.e., an @missing
+ # line that is for the whole Unicode range.
+ next if $default_ref->{start} != 0
+ || $default_ref->{end} != $MAX_UNICODE_CODEPOINT;
+
+ $default_map = $default_ref->{default};
# For string properties, the default is just what the
# file says, but non-string properties should already
@@ -10960,16 +10967,76 @@ sub output_perl_charnames_line ($code_point, $name) {
if ($property_type == $STRING
|| $property_type == $UNKNOWN)
{
- $this_property_info->{$MISSINGS} = $default;
+ $this_property_info->{$MISSINGS} = $default_map;
}
else {
- $this_property_info->{$MISSINGS}
- = $property_object->table($default);
+ $default_map =
+ $property_object->table($default_map)->full_name;
+ $this_property_info->{$MISSINGS} = $default_map;
+ $this_property_info->{$DEFAULT_MAP} = $default_map;
+ if (! defined $property_object->default_map) {
+ $property_object->set_default_map($default_map);
+ }
}
}
- # Here, we have $default_map defined, possibly in terms of
- # $missings, but maybe not, and possibly is a dummy one.
+ # For later Unicode versions, multiple @missing lines for
+ # a single property can appear in the files. The first
+ # always applies to the entire Unicode range, and was
+ # handled above. The subsequent ones are for smaller
+ # ranges, and can be read as "But for this range, the
+ # default is ...". So each overrides all the preceding
+ # ones for the range it applies to. Typically they apply
+ # to disjoint ranges, but don't have to. What we do is to
+ # set them up to work in reverse order, so that after the
+ # rest of the table is filled, the highest priority
+ # default range fills in any code points that haven't been
+ # specified; then the next highest priority one is
+ # applied, and so forth.
+ if (@missings_list > 1 && $v_version ge v15.0.0) {
+ if ($property_type != $ENUM) {
+ Carp::my_carp_bug("Multiple \@missings lines only"
+ . " make sense for ENUM-type"
+ . " properties. Changing type to"
+ . " that");
+ $property_type = $this_property_info->{$TYPE}
+ = $ENUM;
+ $property_object->set_type($ENUM);
+ }
+
+ my $multi = Multi_Default->new();
+
+ # The overall default should be first on this list,
+ # and is handled differently than the rest.
+ $default_map = shift @missings_list;
+ Carp::my_carp_bug("\@missings needs to be entire range")
+ if $default_map->{start} != 0
+ || $default_map->{end} != $MAX_UNICODE_CODEPOINT;
+
+ # We already have looked at this line above. Use that
+ # result.
+ $multi->set_final_default($this_property_info->
+ {$MISSINGS});
+
+ # Now get the individual range elements, and add them
+ # to Multi_Default object
+ while (@missings_list) {
+ my $this_entry = pop @missings_list;
+ my $subrange_default = $this_entry->{default};
+
+ # Use the short name as a standard
+ $subrange_default = $property_object->
+ table($subrange_default)->short_name;
+ $multi->append_default($subrange_default,
+ "Range_List->new(Initialize => Range->new("
+ . "$this_entry->{start}, $this_entry->{end}))");
+ }
+
+ # Override the property's simple default with this.
+ $property_object->set_default_map($multi);
+ }
+
+ if (! $default_map || $property_type != $ENUM) {
# Finished storing all the @missings defaults in the
# input file so far. Get the one for the current
@@ -11066,7 +11133,7 @@ END
= $this_property_info->{$DEFAULT_TABLE}
= $property_object->table($default_map);
}
-
+ }
} # End of is first time for this property
} # End of switching properties.
@@ -13422,7 +13489,7 @@ END
# Add mappings to the property for each code point in the list
foreach my $range ($list->ranges) {
$property->add_map($range->start, $range->end, $default,
- Replace => $CROAK);
+ Replace => $NO);
}
}
diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl
index bda496f96a..8c82cccf1c 100644
--- a/lib/unicore/uni_keywords.pl
+++ b/lib/unicore/uni_keywords.pl
@@ -1320,7 +1320,7 @@
# 43f6df50e4878f501b417e366b0ee097ae5ccb2d4ce942026bed3d62d78e7887 lib/unicore/extracted/DLineBreak.txt
# a04502ebb36a45d83cbe48a7d8132ea8143edb7b3d34d0aa6afe4a9685049741 lib/unicore/extracted/DNumType.txt
# 11075771b112e8e7ccf6ffa637c4c91eadc3ef3db0517b24e605df8fd3624239 lib/unicore/extracted/DNumValues.txt
-# d97aeb4312c8fdc0f44654834108596ecdf5d03c8fca231d6def4338687a89c9 lib/unicore/mktables
+# 3f7a81c6f40611d1e68f5e42699368ce95d36ca8a852ca5a252b41cee055391a lib/unicore/mktables
# c72bbdeda99714db1c8024d3311da4aef3c0db3b9b9f11455a7cfe10d5e9aba3 lib/unicore/version
# 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl
# c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl
diff --git a/regcharclass.h b/regcharclass.h
index 812baaf73e..685427e32e 100644
--- a/regcharclass.h
+++ b/regcharclass.h
@@ -3850,7 +3850,7 @@
* 43f6df50e4878f501b417e366b0ee097ae5ccb2d4ce942026bed3d62d78e7887 lib/unicore/extracted/DLineBreak.txt
* a04502ebb36a45d83cbe48a7d8132ea8143edb7b3d34d0aa6afe4a9685049741 lib/unicore/extracted/DNumType.txt
* 11075771b112e8e7ccf6ffa637c4c91eadc3ef3db0517b24e605df8fd3624239 lib/unicore/extracted/DNumValues.txt
- * d97aeb4312c8fdc0f44654834108596ecdf5d03c8fca231d6def4338687a89c9 lib/unicore/mktables
+ * 3f7a81c6f40611d1e68f5e42699368ce95d36ca8a852ca5a252b41cee055391a lib/unicore/mktables
* c72bbdeda99714db1c8024d3311da4aef3c0db3b9b9f11455a7cfe10d5e9aba3 lib/unicore/version
* 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl
* acc94e4afc339fe2cf2ae74d6e1cbcf2c396328d78e56236ad314eadbfc84125 regen/regcharclass.pl
diff --git a/uni_keywords.h b/uni_keywords.h
index 9c905f5d6b..8baec415bb 100644
--- a/uni_keywords.h
+++ b/uni_keywords.h
@@ -7677,7 +7677,7 @@ match_uniprop( const unsigned char * const key, const U16 key_len ) {
* 43f6df50e4878f501b417e366b0ee097ae5ccb2d4ce942026bed3d62d78e7887 lib/unicore/extracted/DLineBreak.txt
* a04502ebb36a45d83cbe48a7d8132ea8143edb7b3d34d0aa6afe4a9685049741 lib/unicore/extracted/DNumType.txt
* 11075771b112e8e7ccf6ffa637c4c91eadc3ef3db0517b24e605df8fd3624239 lib/unicore/extracted/DNumValues.txt
- * d97aeb4312c8fdc0f44654834108596ecdf5d03c8fca231d6def4338687a89c9 lib/unicore/mktables
+ * 3f7a81c6f40611d1e68f5e42699368ce95d36ca8a852ca5a252b41cee055391a lib/unicore/mktables
* c72bbdeda99714db1c8024d3311da4aef3c0db3b9b9f11455a7cfe10d5e9aba3 lib/unicore/version
* 0a6b5ab33bb1026531f816efe81aea1a8ffcd34a27cbea37dd6a70a63d73c844 regen/charset_translations.pl
* c7ff8e0d207d3538c7feb4a1a152b159e5e902d20293b303569ea8323e84633e regen/mk_PL_charclass.pl