summaryrefslogtreecommitdiff
path: root/lib/charnames.pm
diff options
context:
space:
mode:
authorKarl Williamson <khw@khw-desktop.(none)>2010-07-13 14:29:00 -0600
committerKarl Williamson <khw@khw-desktop.(none)>2010-07-13 17:16:34 -0600
commite79869e1c56fc83ab416405bb9ec6ebd41859dd8 (patch)
treea2d059577f279ecfa05fcf1997ab7f811c22d8e1 /lib/charnames.pm
parentdc023ef4ba9dbad8f45a2001a409b7f4b7371f6b (diff)
downloadperl-e79869e1c56fc83ab416405bb9ec6ebd41859dd8.tar.gz
charnames.pm: More refactoring for performance
I realized after the last commit that it might be faster to use a trie when there are multiple scripts that a letter could be in, instead of searching the table for each script. When there were 6 possible scripts and the letter was found in the final one, the speed-up was a factor of 5. This also simplified things. The list of scripts can be stored as a string like A|B|C instead of a stringified array, and the code just gets simpler. Also, there were complications to the code to keep from zapping the input name, just in case it was needed for an error message. But I realized that instead of using a shift to get the name, just copy it from $_[0], and on the error leg that needs the original, it still is in $_[0]. If a user-defined alias is to a character name to lookup and that one is invalid, we want to output the invalid one, so a further variable, $save_input is used to hold it
Diffstat (limited to 'lib/charnames.pm')
-rw-r--r--lib/charnames.pm65
1 files changed, 33 insertions, 32 deletions
diff --git a/lib/charnames.pm b/lib/charnames.pm
index dfe4ab4e90..b017b48bba 100644
--- a/lib/charnames.pm
+++ b/lib/charnames.pm
@@ -2,7 +2,7 @@ package charnames;
use strict;
use warnings;
use File::Spec;
-our $VERSION = '1.13';
+our $VERSION = '1.14';
use bytes (); # for $bytes::hint_bits
@@ -511,10 +511,10 @@ sub lookup_name ($;$) {
my $runtime = (@_ > 1); # compile vs run time
- my $name = shift;
- my $hints_ref = shift;
+ my ($name, $hints_ref) = @_;
my $ord;
+ my $save_input;
if ($runtime) {
# At runtime, but currently not at compile time, $^H gets
@@ -524,7 +524,7 @@ sub lookup_name ($;$) {
%{$^H{charnames_name_aliases}} = split ',', $hints_ref->{charnames_stringified_names};
%{$^H{charnames_ord_aliases}} = split ',', $hints_ref->{charnames_stringified_ords};
- @{$^H{charnames_scripts}} = split ',', $hints_ref->{charnames_stringified_scripts};
+ $^H{charnames_scripts} = $hints_ref->{charnames_scripts};
$^H{charnames_full} = $hints_ref->{charnames_full};
$^H{charnames_short} = $hints_ref->{charnames_short};
}
@@ -536,6 +536,7 @@ sub lookup_name ($;$) {
}
elsif (exists $^H{charnames_name_aliases}{$name}) {
$name = $^H{charnames_name_aliases}{$name};
+ $save_input = $name; # Cache the result for any error message
}
elsif (exists $system_aliases{$name}) {
$ord = $system_aliases{$name};
@@ -573,41 +574,35 @@ sub lookup_name ($;$) {
}
}
- # If we didn't get it above keep looking
+ # If we didn't get it above, keep looking
if (! $found_full_in_table) {
# If :short is allowed, see if input is like "greek:Sigma".
- my $scripts_ref;
- my $name_ref;
+ my $scripts_trie;
if (($^H{charnames_short})
&& $name =~ /^ \s* (.+?) \s* : \s* (.+?) \s* $ /xs)
{
- my @script = uc $1;
- my $character_name = $2;
- $scripts_ref = \@script;
- $name_ref = \$character_name;
+ $scripts_trie = "\U\Q$1";
+ $name = $2;
}
else {
- $scripts_ref = $^H{charnames_scripts};
- $name_ref = \$name;
+ $scripts_trie = $^H{charnames_scripts};
}
- my $case = $$name_ref =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
- for my $script (@{$scripts_ref}) {
- if ($txt =~
- m/\t\t \Q$script\E \ (?:$case\ )? LETTER \ \U\Q$$name_ref\E $/xm)
- {
- @off = ($-[0] + 2, $+[0]);
- goto found_one;
- }
- }
+ my $case = $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
+ if ($txt !~
+ /\t\t (?: $scripts_trie ) \ (?:$case\ )? LETTER \ \U\Q$name\E $/xm)
+ {
+ # Here we still don't have it, give up.
+ return if $runtime;
- # Here we still don't have it, give up.
- return if $runtime;
- carp "Unknown charname '$name'";
- return 0xFFFD;
+ # May have zapped input name, get it again.
+ $name = (defined $save_input) ? $save_input : $_[0];
+ carp "Unknown charname '$name'";
+ return 0xFFFD;
+ }
-found_one:
+ @off = ($-[0] + 2, $+[0]);
}
##
@@ -640,7 +635,12 @@ found_one:
# Here is compile time, "use bytes" is in effect, and the character
# won't fit in a byte
# Prefer any official name over the input one.
- $name = substr($txt, $off[0], $off[1] - $off[0]) if @off;
+ if (@off) {
+ $name = substr($txt, $off[0], $off[1] - $off[0]) if @off;
+ }
+ else {
+ $name = (defined $save_input) ? $save_input : $_[0];
+ }
croak not_legal_use_bytes_msg($name, $ord);
} # lookup_name
@@ -705,18 +705,19 @@ sub import
$^H{charnames_full} = delete $h{':full'};
$^H{charnames_short} = delete $h{':short'};
- $^H{charnames_scripts} = [map uc, keys %h];
+ my @scripts = map uc, keys %h;
##
## If utf8? warnings are enabled, and some scripts were given,
## see if at least we can find one letter from each script.
##
- if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) {
+ if (warnings::enabled('utf8') && @scripts) {
$txt = do "unicore/Name.pl" unless $txt;
- for my $script (@{$^H{charnames_scripts}}) {
+ for my $script (@scripts) {
if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) {
warnings::warn('utf8', "No such script: '$script'");
+ $script = quotemeta $script; # Escape it, for use in the re.
}
}
}
@@ -726,7 +727,7 @@ sub import
$^H{charnames_stringified_ords} = join ",", %{$^H{charnames_ord_aliases}};
$^H{charnames_stringified_names} = join ",", %{$^H{charnames_name_aliases}};
$^H{charnames_stringified_inverse_ords} = join ",", %{$^H{charnames_inverse_ords}};
- $^H{charnames_stringified_scripts} = join ",", @{$^H{charnames_scripts}};
+ $^H{charnames_scripts} = join "|", @scripts; # Stringifiy them as a trie
} # import
# Cache of already looked-up values. This is set to only contain