summaryrefslogtreecommitdiff
path: root/lib/_charnames.pm
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2020-02-13 09:04:41 -0700
committerKarl Williamson <khw@cpan.org>2020-03-11 09:00:04 -0600
commitb555069b72f93a232deba173dc7bf7892cfa5868 (patch)
tree46616e3464efba862569f9f74b08f6369768c496 /lib/_charnames.pm
parentd022bb19f99bdcaafe3758b24d031a0f0450e623 (diff)
downloadperl-b555069b72f93a232deba173dc7bf7892cfa5868.tar.gz
Reformat lib/unicore/Name.pl
This changes the format of this generated file so that it can more easily be used with the Unicode Name property in wildcard matching. Each line will now end with \n\n, and the \t characters are replaced by \n. Thus an entry will look like "00001\nSTART OF HEADING\n\n". This makes matching of user-defined patterns using anchors work under /m, which commit 4829f32decd128e6a122bd8ce35fe944bd87f104 forces. That commit also changed some anchors' definitions to make them match \n under /m with wildcards, so this makes it all transparent to user patterns. The double \n\n at the end of an entry is so that the code can distinguish between a line that contains a code point vs a name without relying on the content; it is a disambiguator, like the \t used to be.
Diffstat (limited to 'lib/_charnames.pm')
-rw-r--r--lib/_charnames.pm50
1 files changed, 25 insertions, 25 deletions
diff --git a/lib/_charnames.pm b/lib/_charnames.pm
index b383337d8f..be9fac4f89 100644
--- a/lib/_charnames.pm
+++ b/lib/_charnames.pm
@@ -6,7 +6,7 @@
package _charnames;
use strict;
use warnings;
-our $VERSION = '1.46';
+our $VERSION = '1.47';
use unicore::Name; # mktables-generated algorithmically-defined names
use bytes (); # for $bytes::hint_bits
@@ -21,22 +21,22 @@ $Carp::Internal{ (__PACKAGE__) } = 1;
#
# The official names with their code points are stored in a table in
# lib/unicore/Name.pl which is read in as a large string (almost 3/4 Mb in
-# Unicode 6.0). Each code point/name combination is separated by a \n in the
-# string. (Some of the CJK and the Hangul syllable names are instead
-# determined algorithmically via subroutines stored instead in
-# lib/unicore/Name.pm). Because of the large size of this table, it isn't
-# converted into hashes for faster lookup.
+# Unicode 6.0). Each code point appears on a line by itself, with its
+# corresponding name occupying the next line in the string. (Some of the CJK
+# and the Hangul syllable names are instead determined algorithmically via
+# subroutines stored instead in lib/unicore/Name.pm). Because of the large
+# size of this table, it isn't converted into hashes for faster lookup.
#
# But, user defined aliases are stored in their own hashes, as are Perl
# extensions to the official names. These are checked first before looking at
# the official table.
#
# Basically, the table is grepped for the input code point (viacode()) or
-# name (the other functions), and the corresponding value on the same line is
-# returned. The grepping is done by turning the input into a regular
-# expression. Thus, the same table does double duty, used by both name and
-# code point lookup. (If we were to have hashes, we would need two, one for
-# each lookup direction.)
+# name (the other functions), and the corresponding value on the next or
+# previous line is returned. The grepping is done by turning the input into a
+# regular expression. Thus, the same table does double duty, used by both
+# name and code point lookup. (If we were to have hashes, we would need two,
+# one for each lookup direction.)
#
# For loose name matching, the logical thing would be to have a table
# with all the ignorable characters squeezed out, and then grep it with the
@@ -48,9 +48,9 @@ $Carp::Internal{ (__PACKAGE__) } = 1;
# regular expression of the input name is modified to have optional spaces and
# dashes between characters. For example, in strict matching, the regular
# expression would be:
-# qr/\tDIGIT ONE$/m
+# qr/^DIGIT ONE$/m
# Under loose matching, the blank would be squeezed out, and the re would be:
-# qr/\tD[- ]?I[- ]?G[- ]?I[- ]?T[- ]?O[- ]?N[- ]?E$/m
+# qr/^D[- ]?I[- ]?G[- ]?I[- ]?T[- ]?O[- ]?N[- ]?E$/m
# which matches a blank or dash between any characters in the official table.
#
# This is also how script lookup is done. Basically the re looks like
@@ -404,10 +404,10 @@ sub lookup_name ($$$;$) {
my $cache_ref;
## Suck in the code/name list as a big string.
- ## Lines look like:
- ## "00052\tLATIN CAPITAL LETTER R\n"
+ ## Entries look like:
+ ## "00052\nLATIN CAPITAL LETTER R\n\n"
# or
- # "0052 0303\tLATIN CAPITAL LETTER R WITH TILDE\n"
+ # "0052 0303\nLATIN CAPITAL LETTER R WITH TILDE\n\n"
$txt = do "unicore/Name.pl" unless $txt;
## @off will hold the index into the code/name string of the start and
@@ -461,8 +461,8 @@ sub lookup_name ($$$;$) {
# Do the lookup in the full table if asked for, and if succeeds
# save the offsets and set where to cache the result.
- if (($loose || $^H{charnames_full}) && $txt =~ /\t$lookup_name$/m) {
- @off = ($-[0] + 1, $+[0]); # The 1 is for the tab
+ if (($loose || $^H{charnames_full}) && $txt =~ /^$lookup_name$/m) {
+ @off = ($-[0], $+[0]);
$cache_ref = ($loose) ? \%loose_names_cache : \%full_names_cache;
}
elsif ($regex_loose) {
@@ -506,18 +506,18 @@ sub lookup_name ($$$;$) {
my $case = $name_has_uppercase ? "CAPITAL" : "SMALL";
return if (! $scripts_trie || $txt !~
- /\t (?: $scripts_trie ) \ (?:$case\ )? LETTER \ \U$lookup_name $/xm);
+ /^ (?: $scripts_trie ) \ (?:$case\ )? LETTER \ \U$lookup_name $/xm);
# Here have found the input name in the table.
- @off = ($-[0] + 1, $+[0]); # The 1 is for the tab
+ @off = ($-[0], $+[0]);
}
# Here, the input name has been found; we haven't set up the output,
# but we know where in the string
# the name starts. The string is set up so that for single characters
- # (and not named sequences), the name is preceded immediately by a
- # tab and 5 hex digits for its code, with a \n before those. Named
- # sequences won't have the 7th preceding character be a \n.
+ # (and not named sequences), the name is on a line by itself, and the
+ # previous line contains precisely 5 hex digits for its code point.
+ # Named sequences won't have the 7th preceding character be a \n.
# (Actually, for the very first entry in the table this isn't strictly
# true: subtracting 7 will yield -1, and the substr below will
# therefore yield the very last character in the table, which should
@@ -698,7 +698,7 @@ sub import
$txt = do "unicore/Name.pl" unless $txt;
for my $script (@scripts) {
- if (not $txt =~ m/\t$script (?:CAPITAL |SMALL )?LETTER /) {
+ if (not $txt =~ m/^$script (?:CAPITAL |SMALL )?LETTER /m) {
warnings::warn('utf8', "No such script: '$script'");
$script = quotemeta $script; # Escape it, for use in the re.
}
@@ -785,7 +785,7 @@ sub viacode {
# Return the official name, if exists. It's unclear to me (khw) at
# this juncture if it is better to return a user-defined override, so
# leaving it as is for now.
- if ($txt =~ m/^$hex\t/m) {
+ if ($txt =~ m/^$hex\n/m) {
# The name starts with the next character and goes up to the
# next new-line. Using capturing parentheses above instead of