summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2020-02-19 14:55:50 -0700
committerKarl Williamson <khw@cpan.org>2020-02-19 17:57:26 -0700
commitb5c66e73f5dfdc3424c469a7407d517635040a9c (patch)
treeafe3c6ea8f27db2fb4b9a02f7d99bdc602a3a16e
parentef0d5f5f6ff5fd68bc3eae8895729728d5b2579b (diff)
downloadperl-b5c66e73f5dfdc3424c469a7407d517635040a9c.tar.gz
mktables: Handle versioning of non-UCD files
Unicode has lately been asking implementations to support non-Unicode Character Database properties. Files for these contain a different versioning syntax than the UCD files. Previously I was hand-editing those files before committing to bring them to use a consistent style. But that is tedious, and I decided to invest a little time to be able to handle all the current versioning syntaxes automatically, to save having to manually update in the future. This was complicated by the fact that some Unicode non-UCD files have BOM marks on many comment lines. I submitted a trouble report to them.
-rw-r--r--charclass_invlists.h2
-rw-r--r--lib/unicore/mktables77
-rw-r--r--lib/unicore/uni_keywords.pl2
-rw-r--r--regcharclass.h2
-rw-r--r--uni_keywords.h2
5 files changed, 65 insertions, 20 deletions
diff --git a/charclass_invlists.h b/charclass_invlists.h
index b9a17d3b6c..f010188578 100644
--- a/charclass_invlists.h
+++ b/charclass_invlists.h
@@ -419812,7 +419812,7 @@ static const U8 WB_table[24][24] = {
* 0fea35394151afefbb4121b6380db1b480be6f9bafb4eba3382dc292dcf68526 lib/unicore/extracted/DLineBreak.txt
* 6d4a8c945dd7db83ed617cbb7d937de7f4ecf016ff22970d846e996a7c9a2a5d lib/unicore/extracted/DNumType.txt
* 5b7c14380d5cceeaffcfbc18db1ed936391d2af2d51f5a41f1a17b692c77e59b lib/unicore/extracted/DNumValues.txt
- * 45e23c57b8ddcfed895b1b7b8869e79f2336b9c3b2432b55f051b426ab5a15c6 lib/unicore/mktables
+ * 93f508a690aa8949f213d50b573710f0b4a4e843c17283938035ecf19e0220e2 lib/unicore/mktables
* 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version
* 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
* 6bbad21de0848e0236b02f34f5fa0edd3cdae9ba8173cc9469a5513936b9e728 regen/mk_PL_charclass.pl
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index 52c680f445..1820ad3a30 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -2375,6 +2375,11 @@ sub trace { return main::trace(@_); }
# giving the first release without this file.
main::set_access('withdrawn', \%withdrawn, 'c');
+ my %ucd;
+ # Some files are not actually part of the Unicode Character Database.
+ # These typically have a different way of indicating their version
+ main::set_access('ucd', \%ucd, 'c');
+
my %in_this_release;
# Calculated value from %first_released and %withdrawn. Are we compiling
# a Unicode release which includes this file?
@@ -2404,6 +2409,7 @@ sub trace { return main::trace(@_); }
$missings{$addr} = [ ];
$early{$addr} = [ ];
$optional{$addr} = [ ];
+ $ucd{$addr} = 1;
# Two positional parameters.
return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
@@ -2839,6 +2845,8 @@ END
&& ! $early{$addr}[0]
&& lc($file) ne 'unicodedata.txt')
{
+ my $this_version;
+
if ($file !~ /^Unihan/i) {
# The non-Unihan files started getting version numbers in
@@ -2849,17 +2857,39 @@ END
# numbers are correct.
if ($v_version ge v4.0.1) {
$_ = <$file_handle>; # The version number is in the
- # very first line
- if ($_ !~ / - $string_version \. /x) {
- chomp;
+ # very first line if it is a
+ # UCD file; otherwise, it
+ # might be
+ goto valid_version if $_ =~ / - $string_version \. /x;
+ chomp;
+ if ($ucd{$addr}) {
$_ =~ s/^#\s*//;
# 4.0.1 had some valid files that weren't updated.
- if (! ($v_version eq v4.0.1 && $_ =~ /4\.0\.0/)) {
- die Carp::my_carp("File '$file' is version "
- . "'$_'. It should be "
- . "version $string_version");
+ goto valid_version
+ if $v_version eq v4.0.1 && $_ =~ /4\.0\.0/;
+ $this_version = $_;
+ goto wrong_version;
+ }
+ else {
+ my $BOM = "\x{FEFF}";
+ utf8::encode($BOM);
+ my $BOM_re = qr/ ^ (?:$BOM)? /x;
+
+ while ($_ =~ s/$BOM_re//) { # BOM; seems to be on
+ # many lines in some files!!
+ $_ = <$file_handle>;
+ chomp;
+ if ($_ =~ /^# Version: (.*)/) {
+ $this_version = $1;
+ goto valid_version
+ if $this_version eq $string_version;
+ goto valid_version
+ if "$this_version.0" eq $string_version;
+ goto wrong_version;
+ }
}
+ goto no_version;
}
}
}
@@ -2869,23 +2899,30 @@ END
# 6.0. The version is somewhere in the first comment
# block
while (<$file_handle>) {
- if ($_ !~ /^#/) {
- Carp::my_carp_bug("Could not find the expected "
- . "version info in file '$file'");
- last;
- }
+ goto no_version if $_ !~ /^#/;
chomp;
$_ =~ s/^#\s*//;
next if $_ !~ / version: /x;
- last if $_ =~ /$string_version/;
- die Carp::my_carp("File '$file' is version "
- . "'$_'. It should be "
- . "version $string_version");
+ goto valid_version if $_ =~ /$string_version/;
+ goto wrong_version;
}
+ goto no_version;
+ }
+ else { # Old Unihan; have to assume is valid
+ goto valid_version;
}
+
+ wrong_version:
+ die Carp::my_carp("File '$file' is version "
+ . "'$this_version'. It should be "
+ . "version $string_version");
+ no_version:
+ Carp::my_carp_bug("Could not find the expected "
+ . "version info in file '$file'");
}
}
+ valid_version:
print "$progress_message{$addr}\n" if $verbosity >= $PROGRESS;
# Call any special handler for before the file.
@@ -20167,18 +20204,26 @@ my @input_file_objects = (
Pre_Handler => \&setup_emojidata,
Has_Missings_Defaults => $NOT_IGNORED,
Each_Line_Handler => \&filter_emojidata_line,
+ UCD => 0,
),
Input_file->new("$EMOJI/emoji.txt", v13.0.0,
Has_Missings_Defaults => $NOT_IGNORED,
+ UCD => 0,
+ ),
+ Input_file->new("$EMOJI/ReadMe.txt", v13.0.0,
+ Skip => $Documentation,
+ UCD => 0,
),
Input_file->new('IdStatus.txt', v13.0.0,
Pre_Handler => \&setup_IdStatus,
Property => 'Identifier_Status',
+ UCD => 0,
),
Input_file->new('IdType.txt', v13.0.0,
Pre_Handler => \&setup_IdType,
Each_Line_Handler => \&filter_IdType_line,
Property => 'Identifier_Type',
+ UCD => 0,
),
);
diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl
index e222178691..7fd25b241c 100644
--- a/lib/unicore/uni_keywords.pl
+++ b/lib/unicore/uni_keywords.pl
@@ -1295,7 +1295,7 @@
# 0fea35394151afefbb4121b6380db1b480be6f9bafb4eba3382dc292dcf68526 lib/unicore/extracted/DLineBreak.txt
# 6d4a8c945dd7db83ed617cbb7d937de7f4ecf016ff22970d846e996a7c9a2a5d lib/unicore/extracted/DNumType.txt
# 5b7c14380d5cceeaffcfbc18db1ed936391d2af2d51f5a41f1a17b692c77e59b lib/unicore/extracted/DNumValues.txt
-# 45e23c57b8ddcfed895b1b7b8869e79f2336b9c3b2432b55f051b426ab5a15c6 lib/unicore/mktables
+# 93f508a690aa8949f213d50b573710f0b4a4e843c17283938035ecf19e0220e2 lib/unicore/mktables
# 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version
# 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
# 6bbad21de0848e0236b02f34f5fa0edd3cdae9ba8173cc9469a5513936b9e728 regen/mk_PL_charclass.pl
diff --git a/regcharclass.h b/regcharclass.h
index f315cb464d..f8e9f0ab68 100644
--- a/regcharclass.h
+++ b/regcharclass.h
@@ -2247,7 +2247,7 @@
* 0fea35394151afefbb4121b6380db1b480be6f9bafb4eba3382dc292dcf68526 lib/unicore/extracted/DLineBreak.txt
* 6d4a8c945dd7db83ed617cbb7d937de7f4ecf016ff22970d846e996a7c9a2a5d lib/unicore/extracted/DNumType.txt
* 5b7c14380d5cceeaffcfbc18db1ed936391d2af2d51f5a41f1a17b692c77e59b lib/unicore/extracted/DNumValues.txt
- * 45e23c57b8ddcfed895b1b7b8869e79f2336b9c3b2432b55f051b426ab5a15c6 lib/unicore/mktables
+ * 93f508a690aa8949f213d50b573710f0b4a4e843c17283938035ecf19e0220e2 lib/unicore/mktables
* 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version
* 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
* f9a393e7add8c7c2728356473ce5b52246d51295b2da0c48fb6f0aa21799e2bb regen/regcharclass.pl
diff --git a/uni_keywords.h b/uni_keywords.h
index f754c9dda5..be271a14fc 100644
--- a/uni_keywords.h
+++ b/uni_keywords.h
@@ -7540,7 +7540,7 @@ MPH_VALt match_uniprop( const unsigned char * const key, const U16 key_len ) {
* 0fea35394151afefbb4121b6380db1b480be6f9bafb4eba3382dc292dcf68526 lib/unicore/extracted/DLineBreak.txt
* 6d4a8c945dd7db83ed617cbb7d937de7f4ecf016ff22970d846e996a7c9a2a5d lib/unicore/extracted/DNumType.txt
* 5b7c14380d5cceeaffcfbc18db1ed936391d2af2d51f5a41f1a17b692c77e59b lib/unicore/extracted/DNumValues.txt
- * 45e23c57b8ddcfed895b1b7b8869e79f2336b9c3b2432b55f051b426ab5a15c6 lib/unicore/mktables
+ * 93f508a690aa8949f213d50b573710f0b4a4e843c17283938035ecf19e0220e2 lib/unicore/mktables
* 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version
* 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
* 6bbad21de0848e0236b02f34f5fa0edd3cdae9ba8173cc9469a5513936b9e728 regen/mk_PL_charclass.pl