Reformat lib/unicore/Name.pl

This changes the format of this generated file so that it can more easily be used with the Unicode Name property in wildcard matching. Each entry will now end with \n\n, and the \t characters are replaced by \n. Thus an entry will look like "00001\nSTART OF HEADING\n\n". This makes matching of user-defined patterns using anchors work under /m, which commit 4829f32decd128e6a122bd8ce35fe944bd87f104 forces. That commit also changed some anchors' definitions to make them match \n under /m with wildcards, so this makes it all transparent to user patterns. The double \n\n at the end of an entry is so that the code can distinguish between a line that contains a code point vs a name without relying on the content; it is a disambiguator, like the \t that used to be there.
author: Karl Williamson <khw@cpan.org> 2020-02-13 09:04:41 -0700
committer: Karl Williamson <khw@cpan.org> 2020-03-11 09:00:04 -0600
commit: b555069b72f93a232deba173dc7bf7892cfa5868 (patch)
tree: 46616e3464efba862569f9f74b08f6369768c496 /lib
parent: d022bb19f99bdcaafe3758b24d031a0f0450e623 (diff)
download: perl-b555069b72f93a232deba173dc7bf7892cfa5868.tar.gz
6 files changed, 78 insertions, 54 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index 531a37be64..11fa2bef6a 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -5,7 +5,7 @@ use warnings;
 no warnings 'surrogate';    # surrogates can be inputs to this
 use charnames ();
 
-our $VERSION = '0.74';
+our $VERSION = '0.75';
 
 sub DEBUG () { 0 }
 $|=1 if DEBUG;
@@ -2350,9 +2350,18 @@ sub _namedseq {
         local $_;
         local $/ = "\n";
         while (<$namedseqfh>) {
-            if (/^ [0-9A-F]+ \  /x) {
-                chomp;
-                my ($sequence, $name) = split /\t/;
+            next if m/ ^ \s* \# /x;
+
+            # Each entry is currently two lines.  The first contains the code
+            # points in the sequence separated by spaces.  If this entry
+            # doesn't have spaces, it isn't a named sequence.
+            if (/^ [0-9A-F]{4,5} (?: \  [0-9A-F]{4,5} )+ $ /x) {
+                my $sequence = $_;
+                chomp $sequence;
+
+                # And the second is the name
+                my $name = <$namedseqfh>;
+                chomp $name;
                 my @s = map { chr(hex($_)) } split(' ', $sequence);
                 $NAMEDSEQ{$name} = join("", @s);
             }
@@ -3753,17 +3762,9 @@ them.
 
 Instead of reading the Unicode Database directly from files, as you were able
 to do for a long time, you are encouraged to use the supplied functions. So,
-instead of reading C<Name.pl> - which may disappear without notice in the
-future - directly, as with
-
-  my (%name, %cp);
-  for (split m/\s*\n/ => do "unicore/Name.pl") {
-      my ($cp, $name) = split m/\t/ => $_;
-      $cp{$name} = $cp;
-      $name{$cp} = $name unless $cp =~ m/ /;
-  }
-
-You ought to use L</prop_invmap()> like this:
+instead of reading C<Name.pl> directly, which changed formats in 5.32, and may
+do so again without notice in the future or even disappear, you ought to use
+L</prop_invmap()> like this:
 
   my (%name, %cp, %cps, $n);
   # All codepoints
@@ -3906,6 +3907,14 @@ RETRY:
             my %names;
             $names{'LIST'} = "";
             my $original = do "unicore/Name.pl";
+
+            # Change the double \n format of the file back to single lines
+            # with a tab
+            $original =~ s/\n\n/\e/g;   # Use a control that shouldn't occur
+                                        #in the file
+            $original =~ s/\n/\t/g;
+            $original =~ s/\e/\n/g;
+
             my $algorithm_names = \@algorithmic_named_code_points;
 
             # We need to remove the names from it that are aliases.  For that
@@ -3934,7 +3943,7 @@ RETRY:
             foreach my $line (split "\n", $original) {
                 my ($hex_code_point, $name) = split "\t", $line;
 
-                # Weeds out all comments, blank lines, and named sequences
+                # Weeds out any comments, blank lines, and named sequences
                 next if $hex_code_point =~ /[^[:xdigit:]]/a;
 
                 my $code_point = hex $hex_code_point;
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t
index 6adb3ffef7..eb7fbd8f35 100644
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -2454,22 +2454,36 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) {
 
         $official = do "unicore/Name.pl";
 
+        # Change the double \n format of the file back to single lines with a tab
+        $official =~ s/\n\n/\e/g;     # Use a control that shouldn't occur
+                                      # in the file
+        $official =~ s/\n/\t/g;
+        $official =~ s/\e/\n/g;
+
         # Get rid of the named sequences portion of the file.  These don't
         # have a tab before the first blank on a line.
         $official =~ s/ ^ [^\t]+ \  .*? \n //xmg;
 
         # And get rid of the controls.  These are named in the file, but
-        # shouldn't be in the property.  This gets rid of the two ranges in
-        # one fell swoop, and also all the Unicode1_Name values that may not
-        # be in Name_Alias.
+        # shouldn't be in the property.  On all supported platforms, there are
+        # two ranges of controls.  The first range extends from 0..SPACE-1.
+        # The second depends on the platform.
+        $official =~ s/ ^ 00000 .*? ( .{5} \t SPACE ) $ /$1/xms;
+        my $range_2_start;
+        my $range_2_end_next;
         if ($::IS_ASCII) {
-            $official =~ s/ 00000 \t .* 0001F .*? \n//xs;
-            $official =~ s/ 0007F \t .* 0009F .*? \n//xs;
+            $range_2_start    = '0007F';
+            $range_2_end_next = '000A0';
+        }
+        elsif (ord '^' == 106) { # POSIX-BC
+            $range_2_start    = '005F';
+            $range_2_end_next = '0060';
         }
-        elsif ($::IS_EBCDIC) { # Won't work for POSIX-BC
-            $official =~ s/ 00000 \t .* 0003F .*? \n//xs;
-            $official =~ s/ 000FF \t .* 000FF .*? \n//xs;
+        else {
+            $range_2_start    = '00FF';
+            $range_2_end_next = '0100';
         }
+        $official =~ s/ ^ $range_2_start .*? ( $range_2_end_next ) /$1/xms;
 
         # And remove the aliases.  We read in the Name_Alias property, and go
         # through them one by one.
@@ -2499,6 +2513,7 @@ foreach my $prop (sort(keys %props), sort keys %legacy_props) {
                 $official =~ s/$hex_code_point \t $alias \n //x;
             }
         }
+
         local $/ = "\n";
         chomp $official;
         $/ = $input_record_separator;
diff --git a/lib/_charnames.pm b/lib/_charnames.pm
index b383337d8f..be9fac4f89 100644
--- a/lib/_charnames.pm
+++ b/lib/_charnames.pm
@@ -6,7 +6,7 @@
 package _charnames;
 use strict;
 use warnings;
-our $VERSION = '1.46';
+our $VERSION = '1.47';
 use unicore::Name;    # mktables-generated algorithmically-defined names
 
 use bytes ();          # for $bytes::hint_bits
@@ -21,22 +21,22 @@ $Carp::Internal{ (__PACKAGE__) } = 1;
 #
 # The official names with their code points are stored in a table in
 # lib/unicore/Name.pl which is read in as a large string (almost 3/4 Mb in
-# Unicode 6.0).  Each code point/name combination is separated by a \n in the
-# string.  (Some of the CJK and the Hangul syllable names are instead
-# determined algorithmically via subroutines stored instead in
-# lib/unicore/Name.pm).  Because of the large size of this table, it isn't
-# converted into hashes for faster lookup.
+# Unicode 6.0).  Each code point appears on a line by itself, with its
+# corresponding name occupying the next line in the string.  (Some of the CJK
+# and the Hangul syllable names are instead determined algorithmically via
+# subroutines stored instead in lib/unicore/Name.pm).  Because of the large
+# size of this table, it isn't converted into hashes for faster lookup.
 #
 # But, user defined aliases are stored in their own hashes, as are Perl
 # extensions to the official names.  These are checked first before looking at
 # the official table.
 #
 # Basically, the table is grepped for the input code point (viacode()) or
-# name (the other functions), and the corresponding value on the same line is
-# returned.  The grepping is done by turning the input into a regular
-# expression.  Thus, the same table does double duty, used by both name and
-# code point lookup.  (If we were to have hashes, we would need two, one for
-# each lookup direction.)
+# name (the other functions), and the corresponding value on the next or
+# previous line is returned.  The grepping is done by turning the input into a
+# regular expression.  Thus, the same table does double duty, used by both
+# name and code point lookup.  (If we were to have hashes, we would need two,
+# one for each lookup direction.)
 #
 # For loose name matching, the logical thing would be to have a table
 # with all the ignorable characters squeezed out, and then grep it with the
@@ -48,9 +48,9 @@ $Carp::Internal{ (__PACKAGE__) } = 1;
 # regular expression of the input name is modified to have optional spaces and
 # dashes between characters.  For example, in strict matching, the regular
 # expression would be:
-#   qr/\tDIGIT ONE$/m
+#   qr/^DIGIT ONE$/m
 # Under loose matching, the blank would be squeezed out, and the re would be:
-#   qr/\tD[- ]?I[- ]?G[- ]?I[- ]?T[- ]?O[- ]?N[- ]?E$/m
+#   qr/^D[- ]?I[- ]?G[- ]?I[- ]?T[- ]?O[- ]?N[- ]?E$/m
 # which matches a blank or dash between any characters in the official table.
 #
 # This is also how script lookup is done.  Basically the re looks like
@@ -404,10 +404,10 @@ sub lookup_name ($$$;$) {
       my $cache_ref;
 
       ## Suck in the code/name list as a big string.
-      ## Lines look like:
-      ##     "00052\tLATIN CAPITAL LETTER R\n"
+      ## Entries look like:
+      ##     "00052\nLATIN CAPITAL LETTER R\n\n"
       # or
-      #      "0052 0303\tLATIN CAPITAL LETTER R WITH TILDE\n"
+      #      "0052 0303\nLATIN CAPITAL LETTER R WITH TILDE\n\n"
       $txt = do "unicore/Name.pl" unless $txt;
 
       ## @off will hold the index into the code/name string of the start and
@@ -461,8 +461,8 @@ sub lookup_name ($$$;$) {
 
         # Do the lookup in the full table if asked for, and if succeeds
         # save the offsets and set where to cache the result.
-        if (($loose || $^H{charnames_full}) && $txt =~ /\t$lookup_name$/m) {
-          @off = ($-[0] + 1, $+[0]);    # The 1 is for the tab
+        if (($loose || $^H{charnames_full}) && $txt =~ /^$lookup_name$/m) {
+          @off = ($-[0], $+[0]);
           $cache_ref = ($loose) ? \%loose_names_cache : \%full_names_cache;
         }
         elsif ($regex_loose) {
@@ -506,18 +506,18 @@ sub lookup_name ($$$;$) {
 
           my $case = $name_has_uppercase ? "CAPITAL" : "SMALL";
           return if (! $scripts_trie || $txt !~
-             /\t (?: $scripts_trie ) \ (?:$case\ )? LETTER \ \U$lookup_name $/xm);
+             /^ (?: $scripts_trie ) \ (?:$case\ )? LETTER \ \U$lookup_name $/xm);
 
           # Here have found the input name in the table.
-          @off = ($-[0] + 1, $+[0]);  # The 1 is for the tab
+          @off = ($-[0], $+[0]);
         }
 
         # Here, the input name has been found; we haven't set up the output,
         # but we know where in the string
         # the name starts.  The string is set up so that for single characters
-        # (and not named sequences), the name is preceded immediately by a
-        # tab and 5 hex digits for its code, with a \n before those.  Named
-        # sequences won't have the 7th preceding character be a \n.
+        # (and not named sequences), the name is on a line by itself, and the
+        # previous line contains precisely 5 hex digits for its code point.
+        # Named sequences won't have the 7th preceding character be a \n.
         # (Actually, for the very first entry in the table this isn't strictly
         # true: subtracting 7 will yield -1, and the substr below will
         # therefore yield the very last character in the table, which should
@@ -698,7 +698,7 @@ sub import
     $txt = do "unicore/Name.pl" unless $txt;
 
     for my $script (@scripts) {
-      if (not $txt =~ m/\t$script (?:CAPITAL |SMALL )?LETTER /) {
+      if (not $txt =~ m/^$script (?:CAPITAL |SMALL )?LETTER /m) {
         warnings::warn('utf8',  "No such script: '$script'");
         $script = quotemeta $script;  # Escape it, for use in the re.
       }
@@ -785,7 +785,7 @@ sub viacode {
     # Return the official name, if exists.  It's unclear to me (khw) at
     # this juncture if it is better to return a user-defined override, so
     # leaving it as is for now.
-    if ($txt =~ m/^$hex\t/m) {
+    if ($txt =~ m/^$hex\n/m) {
 
         # The name starts with the next character and goes up to the
         # next new-line.  Using capturing parentheses above instead of
diff --git a/lib/charnames.pm b/lib/charnames.pm
index 9f4a9683e1..497b50cc1e 100644
--- a/lib/charnames.pm
+++ b/lib/charnames.pm
@@ -1,7 +1,7 @@
 package charnames;
 use strict;
 use warnings;
-our $VERSION = '1.46';
+our $VERSION = '1.47';
 use unicore::Name;    # mktables-generated algorithmically-defined names
 use _charnames ();    # The submodule for this where most of the work gets done
 
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index 498a94d9f1..2126268709 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -11233,7 +11233,7 @@ sub output_perl_charnames_line ($$) {
     # ordinals, but they are all private use or noncharacters which do not
     # have names, so won't be in this table.
 
-    return sprintf "%05X\t%s\n", $_[0], $_[1];
+    return sprintf "%05X\n%s\n\n", $_[0], $_[1];
 }
 
 { # Closure
@@ -12579,7 +12579,7 @@ sub process_NamedSequences {
         # Perl_charnames.  But it turns out that the code points don't have to
         # be 5 digits long, like the rest, based on the internal workings of
         # charnames.pm.  This could be easily changed for consistency.
-        push @named_sequences, "$sequence\t$name";
+        push @named_sequences, "$sequence\n$name\n";
     }
     return;
 }
diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl
index 6c1f236ba9..bcfe12e20c 100644
--- a/lib/unicore/uni_keywords.pl
+++ b/lib/unicore/uni_keywords.pl
@@ -1247,7 +1247,7 @@
 1;
 
 # Generated from:
-# 64f46a4b25d29a7f952077ee277909df8599a7a803759805c865914d981671a2 lib/Unicode/UCD.pm
+# d60b1a3dffe16c4aaaf3f00d21993bb320d05e9828b106182214764d4c69935c lib/Unicode/UCD.pm
 # ce96627d4fc91b4fd886c409caeb9b76cf7bd345e12f05c6701add7f233f6437 lib/unicore/ArabicShaping.txt
 # 0e69eef3da722cc104522d8372e86d5b86bb7afcc761b0c991e39e832294946d lib/unicore/BidiBrackets.txt
 # a00d9d21585106a52113fb7b1d3d0373a5835be72e76862fb559ebddd474d70e lib/unicore/BidiMirroring.txt
@@ -1295,7 +1295,7 @@
 # baba9dfc133e3cb770a89aaf0973b1341fa61c2da6c176baf6428898b3b568d8 lib/unicore/extracted/DLineBreak.txt
 # 6d4a8c945dd7db83ed617cbb7d937de7f4ecf016ff22970d846e996a7c9a2a5d lib/unicore/extracted/DNumType.txt
 # 5b7c14380d5cceeaffcfbc18db1ed936391d2af2d51f5a41f1a17b692c77e59b lib/unicore/extracted/DNumValues.txt
-# a3f3caba903e4d39b6c7aaa7ea4d3a739e745b010ad51cf0e05f34ffa0ac2c04 lib/unicore/mktables
+# 91977d5f417fa9252fe9bfebeb61bb28bda9273b630a0e333b6c7b94c8445bca lib/unicore/mktables
 # 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version
 # 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
 # 6bbad21de0848e0236b02f34f5fa0edd3cdae9ba8173cc9469a5513936b9e728 regen/mk_PL_charclass.pl
author	Karl Williamson <khw@cpan.org>	2020-02-13 09:04:41 -0700
committer	Karl Williamson <khw@cpan.org>	2020-03-11 09:00:04 -0600
commit	b555069b72f93a232deba173dc7bf7892cfa5868 (patch)
tree	46616e3464efba862569f9f74b08f6369768c496 /lib
parent	d022bb19f99bdcaafe3758b24d031a0f0450e623 (diff)
download	perl-b555069b72f93a232deba173dc7bf7892cfa5868.tar.gz