summaryrefslogtreecommitdiff
path: root/lib/Unicode
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2011-08-28 12:40:37 -0600
committerKarl Williamson <public@khwilliamson.com>2011-09-02 11:57:44 -0600
commit5c3b35c95645d687026b198c24e884914bfb6ce6 (patch)
treeae8ac11dc1ca3f9abb98b10b3882d0fd5a802e72 /lib/Unicode
parentcb366075e52d6ac4c3816d5b59a319395748363d (diff)
downloadperl-5c3b35c95645d687026b198c24e884914bfb6ce6.tar.gz
Unicode::UCD: speed up some look ups
A previous commit made it possible for a hash to be used as the data structure instead of an array of arrays. The latter data structure is used because most properties have long ranges in which each code point maps to the same thing, so instead of needing a hash entry for each code point, they can use the range as the base data structure. However, certain properties (or property-like structures) don't have long ranges, and hence don't save memory by working through ranges instead of individual code points. For these, a hash offers improved speed without the memory cost. This patch converts 4 of the data structures to hashes. Future extensions to this module will also take advantage of some of these being in hashes.
Diffstat (limited to 'lib/Unicode')
-rw-r--r--lib/Unicode/UCD.pm27
1 files changed, 13 insertions, 14 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index 890dfb4137..692cad0b0f 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -271,10 +271,10 @@ my @BIDIS;
my @CATEGORIES;
my @DECOMPOSITIONS;
my @NUMERIC_TYPES;
-my @SIMPLE_LOWER;
-my @SIMPLE_TITLE;
-my @SIMPLE_UPPER;
-my @UNICODE_1_NAMES;
+my %SIMPLE_LOWER;
+my %SIMPLE_TITLE;
+my %SIMPLE_UPPER;
+my %UNICODE_1_NAMES;
sub _charinfo_case {
@@ -284,20 +284,20 @@ sub _charinfo_case {
# $cased is the case-changed character
# $file is the file in lib/unicore/To/$file that contains the data
# needed for this, in the form that _search() understands.
- # $array_ref points to the array holding the contents of $file. It will
+ # $hash_ref points to the hash holding the contents of $file. It will
# be populated if empty.
# By using the 'uc', etc. functions, we avoid loading more files into
# memory except for those rare cases where the simple casing (which has
# been what charinfo() has always returned, is different than the full
# casing.
- my ($char, $cased, $file, $array_ref) = @_;
+ my ($char, $cased, $file, $hash_ref) = @_;
return "" if $cased eq $char;
return sprintf("%04X", ord $cased) if length($cased) == 1;
- @$array_ref =_read_table("unicore/To/$file") unless @$array_ref;
- return _search($array_ref, 0, $#$array_ref, ord $char) // "";
+ %$hash_ref =_read_table("unicore/To/$file", 'use_hash') unless %$hash_ref;
+ return $hash_ref->{ord $char} // "";
}
sub charinfo {
@@ -394,18 +394,17 @@ sub charinfo {
$prop{'mirrored'} = ($char =~ /\p{Bidi_Mirrored}/) ? 'Y' : 'N';
- @UNICODE_1_NAMES =_read_table("unicore/To/Na1.pl") unless @UNICODE_1_NAMES;
- $prop{'unicode10'} = _search(\@UNICODE_1_NAMES, 0, $#UNICODE_1_NAMES, $code)
- // "";
+ %UNICODE_1_NAMES =_read_table("unicore/To/Na1.pl", "use_hash") unless %UNICODE_1_NAMES;
+ $prop{'unicode10'} = $UNICODE_1_NAMES{$code} // "";
# This is true starting in 6.0, but, num() also requires 6.0, so
# don't need to test for version again here.
$prop{'comment'} = "";
- $prop{'upper'} = _charinfo_case($char, uc $char, '_suc.pl', \@SIMPLE_UPPER);
- $prop{'lower'} = _charinfo_case($char, lc $char, '_slc.pl', \@SIMPLE_LOWER);
+ $prop{'upper'} = _charinfo_case($char, uc $char, '_suc.pl', \%SIMPLE_UPPER);
+ $prop{'lower'} = _charinfo_case($char, lc $char, '_slc.pl', \%SIMPLE_LOWER);
$prop{'title'} = _charinfo_case($char, ucfirst $char, '_stc.pl',
- \@SIMPLE_TITLE);
+ \%SIMPLE_TITLE);
$prop{block} = charblock($code);
$prop{script} = charscript($code);