From 4f143a729773018c42dbe31ef4414f1d7fd9f9e9 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Fri, 10 Feb 2012 15:13:10 -0700 Subject: Unicode::UCD::prop_invmap: Store Nv property as adjusted type By converting this property to requiring adjustments to get the proper values, its storage size decreases by more than half. --- lib/Unicode/UCD.pm | 59 ++++++++++++++++++++++++++++++++-------------------- lib/Unicode/UCD.t | 5 +++-- lib/unicore/mktables | 2 ++ 3 files changed, 41 insertions(+), 25 deletions(-) diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm index ceb491797c..6cefe977a0 100644 --- a/lib/Unicode/UCD.pm +++ b/lib/Unicode/UCD.pm @@ -1324,10 +1324,17 @@ sub _numeric { my $real = $rational[0] / $rational[1]; $real_to_rational{$real} = $value; $value = $real; - } - for my $i ($start .. $end) { - $NUMERIC{$i} = $value; + # Should only be single element, but just in case... + for my $i ($start .. $end) { + $NUMERIC{$i} = $value; + } + } + else { + # The values require adjusting, as is in 'a' format + for my $i ($start .. $end) { + $NUMERIC{$i} = $value + $i - $start; + } } } @@ -2415,43 +2422,43 @@ An example slice is: 0x00B0 0 ... -=item B<C<r>> +=item B<C<ar>> means that all the elements of the map array are either rational numbers or the string C<"NaN">, meaning "Not a Number". A rational number is either an integer, or two integers separated by a solidus (C<"/">). The second integer represents the denominator of the division implied by the solidus, and is -guaranteed not to be 0. If you want to convert them to scalar numbers, you +guaranteed not to be 0. When the element is a plain integer (without the +solidus), it may need to be adjusted to get the correct value by adding the +offset, just as for other C<"a"> properties. No adjustment is needed for +fractions, as the range is guaranteed to have just a single element, and so +the offset is always 0.
+ +If you want to convert the returned map to entirely scalar numbers, you can use something like this: my ($invlist_ref, $invmap_ref, $format) = prop_invmap($property); - if ($format && $format eq "r") { + if ($format && $format eq "ar") { map { $_ = eval $_ } @$invmap_ref; } Here's some entries from the output of the property "Nv", which has format -C<"r">. +C<"ar">. - @numerics_ranges @numerics_maps Note + @numerics_ranges @numerics_maps Note 0x00 "NaN" - 0x30 0 DIGIT 0 - 0x31 1 - 0x32 2 - ... - 0x37 7 - 0x38 8 - 0x39 9 DIGIT 9 + 0x30 0 DIGIT 0 .. DIGIT 9 0x3A "NaN" - 0xB2 2 SUPERSCRIPT 2 - 0xB3 3 SUPERSCRIPT 2 + 0xB2 2 SUPERSCRIPTs 2 and 3 0xB4 "NaN" - 0xB9 1 SUPERSCRIPT 1 + 0xB9 1 SUPERSCRIPT 1 0xBA "NaN" - 0xBC 1/4 VULGAR FRACTION 1/4 - 0xBD 1/2 VULGAR FRACTION 1/2 - 0xBE 3/4 VULGAR FRACTION 3/4 + 0xBC 1/4 VULGAR FRACTION 1/4 + 0xBD 1/2 VULGAR FRACTION 1/2 + 0xBE 3/4 VULGAR FRACTION 3/4 0xBF "NaN" - 0x660 0 ARABIC-INDIC DIGIT ZERO + 0x660 0 ARABIC-INDIC DIGIT ZERO .. NINE + 0x66A "NaN" =item B> @@ -3253,7 +3260,13 @@ RETRY: elsif ($returned_prop eq 'ToPerlDecimalDigit') { $format = 'ae'; } - elsif ($format ne 'n' && $format ne 'r') { + elsif ($returned_prop eq 'ToNv') { + + # The one property that has this format is stored as a delta, so needs + # to indicate that need to add code point to it. 
+ $format = 'ar'; + } + elsif ($format ne 'n') { # All others are simple scalars $format = 's'; diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t index 4188671a80..99ffc9dbc4 100644 --- a/lib/Unicode/UCD.t +++ b/lib/Unicode/UCD.t @@ -509,6 +509,7 @@ is(num("\N{SUPERSCRIPT TWO}"), 2, 'Verify num("\N{SUPERSCRIPT TWO} == 2'); is(num("\N{ETHIOPIC NUMBER TEN THOUSAND}"), 10000, 'Verify num("\N{ETHIOPIC NUMBER TEN THOUSAND}") == 10000'); is(num("\N{NORTH INDIC FRACTION ONE HALF}"), .5, 'Verify num("\N{NORTH INDIC FRACTION ONE HALF}") == .5'); is(num("\N{U+12448}"), 9, 'Verify num("\N{U+12448}") == 9'); +is(num("\N{U+5146}"), 1000000000000, 'Verify num("\N{U+5146}") == 1000000000000'); # Create a user-defined property sub InKana {<<'END'} @@ -1259,7 +1260,7 @@ foreach my $prop (keys %props) { next PROPERTY; } } - elsif ($format =~ /^ a /x) { + elsif ($format =~ /^ a (?!r) /x) { if ($full_name eq 'Perl_Decimal_Digit') { if ($missing ne "") { fail("prop_invmap('$mod_prop')"); @@ -1891,7 +1892,7 @@ foreach my $prop (keys %props) { next PROPERTY; } } - elsif ($format eq 's' || $format eq 'r') { + elsif ($format eq 's') { # Here the map is not more or less directly from a file stored on # disk. We try a different tack. These should all be properties that diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 5a223ad7a8..394b6226a9 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -8919,6 +8919,8 @@ sub finish_property_setup { $bmg->set_range_size_1(1); } + property_ref('Numeric_Value')->set_to_output_map($OUTPUT_DELTAS); + return; } -- cgit v1.2.1