summaryrefslogtreecommitdiff
path: root/lib/Unicode
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2020-02-13 09:04:41 -0700
committerKarl Williamson <khw@cpan.org>2020-03-11 09:00:04 -0600
commitb555069b72f93a232deba173dc7bf7892cfa5868 (patch)
tree46616e3464efba862569f9f74b08f6369768c496 /lib/Unicode
parentd022bb19f99bdcaafe3758b24d031a0f0450e623 (diff)
downloadperl-b555069b72f93a232deba173dc7bf7892cfa5868.tar.gz
Reformat lib/unicore/Name.pl
This changes the format of this generated file so that it can more easily be used with the Unicode Name property in wildcard matching. Each line will now end with \n\n, and the \t characters are replaced by \n. Thus an entry will look like: 00001\nSTART OF HEADING\n\n -- This makes matching of user-defined patterns using anchors work under /m, which commit 4829f32decd128e6a122bd8ce35fe944bd87f104 forces. That commit also changed some anchors' definitions to make them match \n under /m with wildcards, so this makes it all transparent to user patterns. The double \n\n at the end of an entry is so that the code can distinguish between a line that contains a code point vs a name without relying on the content; it is a disambiguator, like the \t used to be.
Diffstat (limited to 'lib/Unicode')
-rw-r--r--lib/Unicode/UCD.pm41
-rw-r--r--lib/Unicode/UCD.t31
2 files changed, 48 insertions, 24 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index 531a37be64..11fa2bef6a 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -5,7 +5,7 @@ use warnings;
no warnings 'surrogate'; # surrogates can be inputs to this
use charnames ();
-our $VERSION = '0.74';
+our $VERSION = '0.75';
sub DEBUG () { 0 }
$|=1 if DEBUG;
@@ -2350,9 +2350,18 @@ sub _namedseq {
local $_;
local $/ = "\n";
while (<$namedseqfh>) {
- if (/^ [0-9A-F]+ \ /x) {
- chomp;
- my ($sequence, $name) = split /\t/;
+ next if m/ ^ \s* \# /x;
+
+ # Each entry is currently two lines. The first contains the code
+ # points in the sequence separated by spaces. If this entry
+ # doesn't have spaces, it isn't a named sequence.
+ if (/^ [0-9A-F]{4,5} (?: \ [0-9A-F]{4,5} )+ $ /x) {
+ my $sequence = $_;
+ chomp $sequence;
+
+ # And the second is the name
+ my $name = <$namedseqfh>;
+ chomp $name;
my @s = map { chr(hex($_)) } split(' ', $sequence);
$NAMEDSEQ{$name} = join("", @s);
}
@@ -3753,17 +3762,9 @@ them.
Instead of reading the Unicode Database directly from files, as you were able
to do for a long time, you are encouraged to use the supplied functions. So,
-instead of reading C<Name.pl> - which may disappear without notice in the
-future - directly, as with
-
- my (%name, %cp);
- for (split m/\s*\n/ => do "unicore/Name.pl") {
- my ($cp, $name) = split m/\t/ => $_;
- $cp{$name} = $cp;
- $name{$cp} = $name unless $cp =~ m/ /;
- }
-
-You ought to use L</prop_invmap()> like this:
+instead of reading C<Name.pl> directly, which changed formats in 5.32, and may
+do so again without notice in the future or even disappear, you ought to use
+L</prop_invmap()> like this:
my (%name, %cp, %cps, $n);
# All codepoints
@@ -3906,6 +3907,14 @@ RETRY:
my %names;
$names{'LIST'} = "";
my $original = do "unicore/Name.pl";
+
+ # Change the double \n format of the file back to single lines
+ # with a tab
+ $original =~ s/\n\n/\e/g; # Use a control that shouldn't occur
+ #in the file
+ $original =~ s/\n/\t/g;
+ $original =~ s/\e/\n/g;
+
my $algorithm_names = \@algorithmic_named_code_points;
# We need to remove the names from it that are aliases. For that
@@ -3934,7 +3943,7 @@ RETRY:
foreach my $line (split "\n", $original) {
my ($hex_code_point, $name) = split "\t", $line;
- # Weeds out all comments, blank lines, and named sequences
+ # Weeds out any comments, blank lines, and named sequences
next if $hex_code_point =~ /[^[:xdigit:]]/a;
my $code_point = hex $hex_code_point;
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t
index 6adb3ffef7..eb7fbd8f35 100644
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -2454,22 +2454,36 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) {
$official = do "unicore/Name.pl";
+ # Change the double \n format of the file back to single lines with a tab
+ $official =~ s/\n\n/\e/g; # Use a control that shouldn't occur
+ # in the file
+ $official =~ s/\n/\t/g;
+ $official =~ s/\e/\n/g;
+
# Get rid of the named sequences portion of the file. These don't
# have a tab before the first blank on a line.
$official =~ s/ ^ [^\t]+ \ .*? \n //xmg;
# And get rid of the controls. These are named in the file, but
- # shouldn't be in the property. This gets rid of the two ranges in
- # one fell swoop, and also all the Unicode1_Name values that may not
- # be in Name_Alias.
+ # shouldn't be in the property. On all supported platforms, there are
+ # two ranges of controls. The first range extends from 0..SPACE-1.
+ # The second depends on the platform.
+ $official =~ s/ ^ 00000 .*? ( .{5} \t SPACE ) $ /$1/xms;
+ my $range_2_start;
+ my $range_2_end_next;
if ($::IS_ASCII) {
- $official =~ s/ 00000 \t .* 0001F .*? \n//xs;
- $official =~ s/ 0007F \t .* 0009F .*? \n//xs;
+ $range_2_start = '0007F';
+ $range_2_end_next = '000A0';
+ }
+ elsif (ord '^' == 106) { # POSIX-BC
+ $range_2_start = '005F';
+ $range_2_end_next = '0060';
}
- elsif ($::IS_EBCDIC) { # Won't work for POSIX-BC
- $official =~ s/ 00000 \t .* 0003F .*? \n//xs;
- $official =~ s/ 000FF \t .* 000FF .*? \n//xs;
+ else {
+ $range_2_start = '00FF';
+ $range_2_end_next = '0100';
}
+ $official =~ s/ ^ $range_2_start .*? ( $range_2_end_next ) /$1/xms;
# And remove the aliases. We read in the Name_Alias property, and go
# through them one by one.
@@ -2499,6 +2513,7 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) {
$official =~ s/$hex_code_point \t $alias \n //x;
}
}
+
local $/ = "\n";
chomp $official;
$/ = $input_record_separator;