summary | refs | log | tree | commit | diff
path: root/lib/Unicode
diff options
context:
space:
mode:
author Karl Williamson <public@khwilliamson.com> 2013-10-16 21:44:23 -0600
committer Karl Williamson <public@khwilliamson.com> 2013-10-16 22:17:09 -0600
commit 2430372414b6a23811d2683eed43009456f4f8ea (patch)
tree f89748049b00cc82253e7c2993c1e9e45fb9a638 /lib/Unicode
parent cb4e82463ea6b3435de7bb0b1973c230d010b6c3 (diff)
download perl-2430372414b6a23811d2683eed43009456f4f8ea.tar.gz
Change mktables output for some tables to use hex
This makes all the tables in the lib/unicore/To directory that map from code point to code point be formatted so that the mapped-to code point is expressed in hexadecimal. This allows for uniform treatment of these tables in utf8.c, and removes the final use of strtol() in the (non-CPAN) core. strtol() should be avoided because it is subject to locale rules, and some older libc implementations have been buggy. It was used because Perl doesn't have an efficient way of parsing a decimal number and advancing the parse pointer beyond it; we do have such a method for hex numbers. The input to mktables published by Unicode is also in hex, so the output now conforms to that convention. This will also facilitate the new work currently being done to read in the tables that find the closing bracket given an opening one.
Diffstat (limited to 'lib/Unicode')
-rw-r--r-- lib/Unicode/UCD.pm 8
-rw-r--r-- lib/Unicode/UCD.t 11
2 files changed, 13 insertions, 6 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index 81e671072f..14752ae2b1 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -5,7 +5,7 @@ use warnings;
no warnings 'surrogate'; # surrogates can be inputs to this
use charnames ();
-our $VERSION = '0.54';
+our $VERSION = '0.55';
require Exporter;
@@ -548,7 +548,7 @@ sub _read_table ($;$) {
my $property = $table =~ s/\.pl//r;
$property = $utf8::file_to_swash_name{$property};
my $to_adjust = defined $property
- && $utf8::SwashInfo{$property}{'format'} eq 'a';
+ && $utf8::SwashInfo{$property}{'format'} =~ / ^ a /x;
for (split /^/m, $list) {
my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?)
@@ -556,6 +556,8 @@ sub _read_table ($;$) {
$ /x;
my $decimal_start = hex $start;
my $decimal_end = ($end eq "") ? $decimal_start : hex $end;
+ $value = hex $value if $to_adjust
+ && $utf8::SwashInfo{$property}{'format'} eq 'ax';
if ($return_hash) {
foreach my $i ($decimal_start .. $decimal_end) {
$return{$i} = ($to_adjust)
@@ -3360,7 +3362,7 @@ RETRY:
# Otherwise, convert hex formatted list entries to decimal; add a
# 'Y' map for the missing value in binary properties, or
# otherwise, use the input map unchanged.
- $map = ($format eq 'x')
+ $map = ($format eq 'x' || $format eq 'ax')
? hex $map
: $format eq 'b'
? 'Y'
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t
index 0ba312e632..c21b7a9800 100644
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -1465,7 +1465,7 @@ foreach my $prop (sort keys %props) {
my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?)
\s* ( \# .* )? $ /x;
$end = $start if $end eq "";
- push @list, [ hex $start, hex $end, $value ];
+ push @list, [ hex $start, hex $end, hex $value ];
}
# For these mappings, the file contains all the simple mappings,
@@ -1523,10 +1523,10 @@ foreach my $prop (sort keys %props) {
for my $element (@list) {
$official .= "\n" if $official;
if ($element->[1] == $element->[0]) {
- $official .= sprintf "%04X\t\t%s", $element->[0], $element->[2];
+ $official .= sprintf "%04X\t\t%X", $element->[0], $element->[2];
}
else {
- $official .= sprintf "%04X\t%04X\t%s", $element->[0], $element->[1], $element->[2];
+ $official .= sprintf "%04X\t%04X\t%X", $element->[0], $element->[1], $element->[2];
}
}
}
@@ -1646,6 +1646,11 @@ foreach my $prop (sort keys %props) {
next PROPERTY;
}
}
+ elsif ($full_name =~ # These maps are in hex
+ /(Simple_)?(Case_Folding|(Lower|Title|Upper)case_Mapping)/)
+ {
+ $invmap_ref->[$i] = sprintf("%X", $invmap_ref->[$i]);
+ }
elsif ($format eq 'ad' || $format eq 'ale') {
# The numerics in the returned map are stored as adjusted