summary | refs | log | tree | commit | diff
path: root/lib/utf8_heavy.pl
diff options
context:
space:
mode:
author: Karl Williamson <public@khwilliamson.com> 2012-01-02 16:12:21 -0700
committer: Karl Williamson <public@khwilliamson.com> 2012-01-02 16:29:01 -0700
commit: 4de6d205aeab9ec737ca35ba4eb61f37cebefc55 (patch)
tree: b7219bbfbbc8c429210fc53c5037b4617758eacc /lib/utf8_heavy.pl
parent: 9b2257811e57d81f3d8a720e80bb0af3c15d292f (diff)
download: perl-4de6d205aeab9ec737ca35ba4eb61f37cebefc55.tar.gz
utf8_heavy.pl: Skip unnecessary work for official properties
The tables that mktables generates are well behaved, and so the checks and sorting that are done for user-defined properties may be skipped. Tainting needs to be preserved because $list can be passed in already tainted. This is also in preparation for Unicode 6.1, in which one table will legitimately have duplicate entries that the old code removed.
Diffstat (limited to 'lib/utf8_heavy.pl')
-rw-r--r-- lib/utf8_heavy.pl 22
1 files changed, 20 insertions, 2 deletions
diff --git a/lib/utf8_heavy.pl b/lib/utf8_heavy.pl
index 699a26f1dd..84a81676f0 100644
--- a/lib/utf8_heavy.pl
+++ b/lib/utf8_heavy.pl
@@ -525,9 +525,10 @@ sub _loose_name ($) {
if ($list) {
my $taint = substr($list,0,0); # maintain taint
- # Separate the extras from the code point list, and
- # make sure the latter are well-behaved
+ # Separate the extras from the code point list, and for
+ # user-defined properties, make sure the latter are well-behaved
# for downstream code.
+ if ($user_defined) {
my @tmp = split(/^/m, $list);
my %seen;
no warnings;
@@ -543,6 +544,23 @@ sub _loose_name ($) {
sort { $a->[0] <=> $b->[0] }
map { /^([0-9a-fA-F]+)/; [ CORE::hex($1), $_ ] }
grep { /^([0-9a-fA-F]+)/ and not $seen{$1}++ } @tmp; # XXX doesn't do ranges right
+ }
+ else {
+ # mktables has gone to some trouble to make non-user defined
+ # properties well-behaved, so we can skip the effort we do for
+ # user-defined ones. Any extras are at the very beginning of
+ # the string.
+
+ # This regex splits out the first lines of $list into $1 and
+ # strips them off from $list, until we get one that begins
+ # with a hex number, alone on the line, or followed by a tab.
+ # Either portion may be empty.
+ $list =~ s/ \A ( .*? )
+ (?: \z | (?= ^ [0-9a-fA-F]+ (?: \t | $) ) )
+ //msx;
+
+ $extras = "$taint$1";
+ }
}
if ($none) {