diff options
author | Karl Williamson <khw@cpan.org> | 2018-04-29 21:08:37 -0600 |
---|---|---|
committer | Karl Williamson <khw@cpan.org> | 2018-06-25 07:33:29 -0600 |
commit | 0426f63574a2379bce80c33f85f158ae093be0c2 (patch) | |
tree | b1dae5cd535465fbd78b01f364ae9b9732f2eb55 | |
parent | 7a6f68415b295f4315b6181237ea0000dd706cd5 (diff) | |
download | perl-0426f63574a2379bce80c33f85f158ae093be0c2.tar.gz |
Revise \p{nv=float} lookup
The Numeric Value property allows one to find all code points that have
a certain numeric value. An example would be to match against any
character in any of the world's scripts which is effectively equivalent
to the digit zero.
It is documented that we accept either integers (like \p{nv=9}) or
rationals (like \p{nv=1/2}). But we also accept floating point
representations in case a conversion to numeric has happened. I think
it is right that we not document these and their vagaries. One reason
is that Unicode might someday create a new rational number that, to the
precision we currently accept, is indistinguishable from an existing
one, so that we would have to increase the precision.
But there was a bug I introduced years ago. I thought that in order for
a float to be considered to match a close rational, that 3 significant
digits of precision would be needed, like .667 to match 2/3. That still
seems reasonable. But I didn't implement that concept. Instead, prior
to this commit, it was 3 (not necessarily significant) digits, so that
for 1/160, it would match .001.
This commit corrects that, and makes the lookup simpler. mktables will
use sprintf %e to get the number normalized and having the 3 signicant
digits required. At runtime, a floating number is normalized using the
same format, and the result looked up in a hash. This eliminates the
need to worry about matching within some epsilon.
Further simplifications in utf8_heavy.pl are achieved by making a more
precise definition as to what an acceptable number looks like, so we
don't have to check later to see if what matched really was one.
-rw-r--r-- | charclass_invlists.h | 2 | ||||
-rw-r--r-- | lib/unicore/mktables | 172 | ||||
-rw-r--r-- | lib/utf8_heavy.pl | 126 | ||||
-rw-r--r-- | regcharclass.h | 2 | ||||
-rw-r--r-- | uni_keywords.h | 2 |
5 files changed, 120 insertions, 184 deletions
diff --git a/charclass_invlists.h b/charclass_invlists.h index 97565214cc..5efe5f35db 100644 --- a/charclass_invlists.h +++ b/charclass_invlists.h @@ -374323,7 +374323,7 @@ static const U8 WB_table[24][24] = { * be0f129691d479aa38646e4ca0ec1ee576ae7f75b0300a5624a7fa862fa8abba lib/unicore/extracted/DLineBreak.txt * 92449d354d9f6b6f2f97a292ebb59f6344ffdeb83d120d7d23e569c43ba67cd5 lib/unicore/extracted/DNumType.txt * e3a319527153b0c6c0c549b40fc6f3a01a7a0dcd6620784391db25901df3b154 lib/unicore/extracted/DNumValues.txt - * 6f7e75c46e2c6e4cff53fd9c14a0fbc77611809565d609b15cb98868c5891cdd lib/unicore/mktables + * c237f9e6bda604db4388693b42a20ee0d5c2cf9c08152beca27aa0e1ee735550 lib/unicore/mktables * 21653d2744fdd071f9ef138c805393901bb9547cf3e777ebf50215a191f986ea lib/unicore/version * 4bb677187a1a64e39d48f2e341b5ecb6c99857e49d7a79cf503bd8a3c709999b regen/charset_translations.pl * 03e51b0f07beebd5da62ab943899aa4934eee1f792fa27c1fb638c33bf4ac6ea regen/mk_PL_charclass.pl diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 000fefe7d8..aa6c9fc568 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -1436,10 +1436,10 @@ my @missing_early_files; # Generated list of absent files that we need to my @files_actually_output; # List of files we generated. my @more_Names; # Some code point names are compound; this is used # to store the extra components of them. -my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at - # the minimum before we consider it equivalent to a - # candidate rational -my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms +my $E_FLOAT_PRECISION = 2; # The minimum number of digits after the decimal + # point of a normalized floating point number + # needed to match before we consider it equivalent + # to a candidate rational # These store references to certain commonly used property objects my $age; @@ -12955,6 +12955,7 @@ sub register_fraction($) { my $rational = shift; my $float = eval $rational; + $float = sprintf "%.*e", $E_FLOAT_PRECISION, $float; $nv_floating_to_rational{$float} = $rational; return; } @@ -17656,10 +17657,10 @@ $loose_to_file_of $nv_floating_to_rational ); -# If a floating point number doesn't have enough digits in it to get this -# close to a fraction, it isn't considered to be that fraction even if all the -# digits it does have match. -\$utf8::max_floating_slop = $MAX_FLOATING_SLOP; +# If a %e floating point number doesn't have this number of digits in it after +# the decimal point to get this close to a fraction, it isn't considered to be +# that fraction even if all the digits it does have match. +\$utf8::e_precision = $E_FLOAT_PRECISION; # Deprecated tables to generate a warning for. The key is the file containing # the table, so as to avoid duplication, as many property names can map to the @@ -18982,21 +18983,12 @@ sub make_property_test_script() { $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name - # Keep going down an order of magnitude - # until find that adding this quantity to - # 1 remains 1; but put an upper limit on - # this so in case this algorithm doesn't - # work properly on some platform, that we - # won't loop forever. - my $digits = 0; - my $min_floating_slop = 1; - while (1+ $min_floating_slop != 1 - && $digits++ < 50) - { - my $next = $min_floating_slop / 10; - last if $next == 0; # If underflows, - # use previous one - $min_floating_slop = $next; + # Create a list of what the %f representation is for each rational number. + # This will be used below. + my @valid_base_floats = '0.0'; + foreach my $e_representation (keys %nv_floating_to_rational) { + push @valid_base_floats, + eval $nv_floating_to_rational{$e_representation}; } # It doesn't matter whether the elements of this array contain single lines @@ -19144,70 +19136,82 @@ EOF_CODE # floating point equivalent. if ($table_name =~ qr{/}) { - # Calculate the float, and find just the fraction. + # Calculate the float, and the %e representation my $float = eval $table_name; - my ($whole, $fraction) - = $float =~ / (.*) \. (.*) /x; - - # Starting with one digit after the decimal point, - # create a test for each possible precision (number of - # digits past the decimal point) until well beyond the - # native number found on this machine. (If we started - # with 0 digits, it would be an integer, which could - # well match an unrelated table) - PLACE: - for my $i (1 .. $min_floating_slop + 3) { - my $table_name = sprintf("%.*f", $i, $float); - if ($i < $MIN_FRACTION_LENGTH) { - - # If the test case has fewer digits than the - # minimum acceptable precision, it shouldn't - # succeed, so we expect an error for it. - # E.g., 2/3 = .7 at one decimal point, and we - # shouldn't say it matches .7. We should make - # it be .667 at least before agreeing that the - # intent was to match 2/3. But at the - # less-than- acceptable level of precision, it - # might actually match an unrelated number. - # So don't generate a test case if this - # conflating is possible. In our example, we - # don't want 2/3 matching 7/10, if there is - # a 7/10 code point. - - # First, integers are not in the rationals - # table. Don't generate an error if this - # rounds to an integer using the given - # precision. - my $round = sprintf "%.0f", $table_name; - next PLACE if abs($table_name - $round) - < $MAX_FLOATING_SLOP; - - # Here, isn't close enough to an integer to be - # confusable with one. Now, see it it's - # "close" to a known rational - for my $existing - (keys %nv_floating_to_rational) + my $e_representation = sprintf("%.*e", + $E_FLOAT_PRECISION, $float); + # Parse that + my ($non_zeros, $zeros, $exponent_sign, $exponent) + = $e_representation + =~ / -? [1-9] \. (\d*?) (0*) e ([+-]) (\d+) /x; + my $min_e_precision; + my $min_f_precision; + + if ($exponent_sign eq '+' && $exponent != 0) { + Carp::my_carp_bug("Not yet equipped to handle" + . " positive exponents"); + return; + } + else { + # We're trying to find the minimum precision that + # is needed to indicate this particular rational + # for the given $E_FLOAT_PRECISION. For %e, any + # trailing zeros, like 1.500e-02 aren't needed, so + # the correct value is how many non-trailing zeros + # there are after the decimal point. + $min_e_precision = length $non_zeros; + + # For %f, like .01500, we want at least + # $E_FLOAT_PRECISION digits, but any trailing + # zeros aren't needed, so we can subtract the + # length of those. But we also need to include + # the zeros after the decimal point, but before + # the first significant digit. + $min_f_precision = $E_FLOAT_PRECISION + + $exponent + - length $zeros; + } + + # Make tests for each possible precision from 1 to + # just past the worst case. + my $upper_limit = ($min_e_precision > $min_f_precision) + ? $min_e_precision + : $min_f_precision; + + for my $i (1 .. $upper_limit + 1) { + for my $format ("e", "f") { + my $this_table + = sprintf("%.*$format", $i, $float); + + # If we don't have enough precision digits, + # make a fail test; otherwise a pass test. + my $pass = ($format eq "e") + ? $i >= $min_e_precision + : $i >= $min_f_precision; + if ($pass) { + push @output, generate_tests($property_name, + $this_table, + $valid, + $invalid, + $warning, + ); + } + elsif ( $format eq "e" + + # Here we would fail, but in the %f + # case, the representation at this + # precision could actually be a + # valid one for some other rational + || ! grep { $_ eq $this_table } + @valid_base_floats) { - next PLACE - if abs($table_name - $existing) - < $MAX_FLOATING_SLOP; + push @output, + generate_error($property_name, + $this_table, + 1 # 1 => already an + # error + ); } - push @output, generate_error($property_name, - $table_name, - 1 # 1 => already an error - ); - } - else { - - # Here the number of digits exceeds the - # minimum we think is needed. So generate a - # success test case for it. - push @output, generate_tests($property_name, - $table_name, - $valid, - $invalid, - $warning, - ); } } } diff --git a/lib/utf8_heavy.pl b/lib/utf8_heavy.pl index 66c968aa50..8882cf4d84 100644 --- a/lib/utf8_heavy.pl +++ b/lib/utf8_heavy.pl @@ -13,6 +13,21 @@ my %Cache; sub croak { require Carp; Carp::croak(@_) } +# Digits may be separated by a single underscore +my $digits = qr/ ( [0-9] _? )+ (?!:_) /x; + +# A sign can be surrounded by white space +my $sign = qr/ \s* [+-]? \s* /x; + +my $f_float = qr/ $sign $digits+ \. $digits* # e.g., 5.0, 5. + | $sign $digits* \. $digits+/x; # 0.7, .7 + +# A number may be an integer, a rational, or a float with an optional exponent +# We (shudder) accept a signed denominator +my $number = qr{ ^ $sign $digits+ $ + | ^ $sign $digits+ \/ $sign $digits+ $ + | ^ $f_float (?: [Ee] [+-]? $digits )? $}x; + sub _loose_name ($) { # Given a lowercase property or property-value name, return its # standardized version that is expected for look-up in the 'loose' hashes @@ -226,25 +241,14 @@ sub _loose_name ($) { # If the rhs looks like it is a number... print STDERR __LINE__, ": table=$table\n" if DEBUG; - if ($table =~ m{ ^ [ \s 0-9 _ + / . -]+ $ }x) { - print STDERR __LINE__, ": table=$table\n" if DEBUG; - # Don't allow leading nor trailing slashes - if ($table =~ / ^ \/ | \/ $ /x) { - pop @recursed if @recursed; - return $type; - } + if ($table =~ $number) { + print STDERR __LINE__, ": table=$table\n" if DEBUG; # Split on slash, in case it is a rational, like \p{1/5} my @parts = split m{ \s* / \s* }x, $table, -1; print __LINE__, ": $type\n" if @parts > 2 && DEBUG; - # Can have maximum of one slash - if (@parts > 2) { - pop @recursed if @recursed; - return $type; - } - foreach my $part (@parts) { print __LINE__, ": part=$part\n" if DEBUG; @@ -261,7 +265,7 @@ sub _loose_name ($) { $part .= '0' if $part eq '-' || $part eq ""; # No trailing zeros after a decimal point - $part =~ s/ ( \. .*? ) 0+ $ /$1/x; + $part =~ s/ ( \. [0-9]*? ) 0+ $ /$1/x; # Begin with a 0 if a leading decimal point $part =~ s/ ^ ( -? ) \. /${1}0./x; @@ -272,14 +276,6 @@ sub _loose_name ($) { print STDERR __LINE__, ": part=$part\n" if DEBUG; #return $type if $part eq ""; - - # Result better look like a number. (This test is - # needed because, for example could have a plus in - # the middle.) - if ($part !~ / ^ -? [0-9]+ ( \. [0-9]+)? $ /x) { - pop @recursed if @recursed; - return $type; - } } # If a rational... @@ -310,83 +306,19 @@ sub _loose_name ($) { $table = $parts[0]; } else { - # Here is a floating point numeric_value. Try to - # convert to rational. First see if is in the list - # of known ones. - if (exists $utf8::nv_floating_to_rational{$parts[0]}) { - $table = $utf8::nv_floating_to_rational{$parts[0]}; - } else { + # Here is a floating point numeric_value. Convert + # to rational. Get a normalized form, like + # 5.00E-01, and look that up in the hash - # Here not in the list. See if is close - # enough to something in the list. First - # determine what 'close enough' means. It has - # to be as tight as what mktables says is the - # maximum slop, and as tight as how many - # digits we were passed. That is, if the user - # said .667, .6667, .66667, etc. we match as - # many digits as they passed until get to - # where it doesn't matter any more due to the - # machine's precision. If they said .6666668, - # we fail. - (my $fraction = $parts[0]) =~ s/^.*\.//; - my $epsilon = 10 ** - (length($fraction)); - if ($epsilon > $utf8::max_floating_slop) { - $epsilon = $utf8::max_floating_slop; - } + my $float = sprintf "%.*e", + $utf8::e_precision, + 0 + $parts[0]; - # But it can't be tighter than the minimum - # precision for this machine. If haven't - # already calculated that minimum, do so now. - if (! defined $min_floating_slop) { - - # Keep going down an order of magnitude - # until find that adding this quantity to - # 1 remains 1; but put an upper limit on - # this so in case this algorithm doesn't - # work properly on some platform, that we - # won't loop forever. - my $count = 0; - $min_floating_slop = 1; - while (1+ $min_floating_slop != 1 - && $count++ < 50) - { - my $next = $min_floating_slop / 10; - last if $next == 0; # If underflows, - # use previous one - $min_floating_slop = $next; - print STDERR __LINE__, ": min_float_slop=$min_floating_slop\n" if DEBUG; - } - - # Back off a couple orders of magnitude, - # just to be safe. - $min_floating_slop *= 100; - } - - if ($epsilon < $min_floating_slop) { - $epsilon = $min_floating_slop; - } - print STDERR __LINE__, ": fraction=.$fraction; epsilon=$epsilon\n" if DEBUG; - - undef $table; - - # And for each possible rational in the table, - # see if it is within epsilon of the input. - foreach my $official - (keys %utf8::nv_floating_to_rational) - { - print STDERR __LINE__, ": epsilon=$epsilon, official=$official, diff=", abs($parts[0] - $official), "\n" if DEBUG; - if (abs($parts[0] - $official) < $epsilon) { - $table = - $utf8::nv_floating_to_rational{$official}; - last; - } - } - - # Quit if didn't find one. - if (! defined $table) { - pop @recursed if @recursed; - return $type; - } + if (exists $utf8::nv_floating_to_rational{$float}) { + $table = $utf8::nv_floating_to_rational{$float}; + } else { + pop @recursed if @recursed; + return $type; } } print STDERR __LINE__, ": $property=$table\n" if DEBUG; diff --git a/regcharclass.h b/regcharclass.h index 9646473e87..a1a67dfacc 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -1898,7 +1898,7 @@ * be0f129691d479aa38646e4ca0ec1ee576ae7f75b0300a5624a7fa862fa8abba lib/unicore/extracted/DLineBreak.txt * 92449d354d9f6b6f2f97a292ebb59f6344ffdeb83d120d7d23e569c43ba67cd5 lib/unicore/extracted/DNumType.txt * e3a319527153b0c6c0c549b40fc6f3a01a7a0dcd6620784391db25901df3b154 lib/unicore/extracted/DNumValues.txt - * 6f7e75c46e2c6e4cff53fd9c14a0fbc77611809565d609b15cb98868c5891cdd lib/unicore/mktables + * c237f9e6bda604db4388693b42a20ee0d5c2cf9c08152beca27aa0e1ee735550 lib/unicore/mktables * 21653d2744fdd071f9ef138c805393901bb9547cf3e777ebf50215a191f986ea lib/unicore/version * 4bb677187a1a64e39d48f2e341b5ecb6c99857e49d7a79cf503bd8a3c709999b regen/charset_translations.pl * 9ea6338945a7d70e5ea4b31ac7856c0b521df96be002e94b4b3b7d31debbf3ab regen/regcharclass.pl diff --git a/uni_keywords.h b/uni_keywords.h index 418651a8d9..ef959407af 100644 --- a/uni_keywords.h +++ b/uni_keywords.h @@ -6751,7 +6751,7 @@ MPH_VALt match_uniprop( const unsigned char * const key, const U16 key_len ) { * be0f129691d479aa38646e4ca0ec1ee576ae7f75b0300a5624a7fa862fa8abba lib/unicore/extracted/DLineBreak.txt * 92449d354d9f6b6f2f97a292ebb59f6344ffdeb83d120d7d23e569c43ba67cd5 lib/unicore/extracted/DNumType.txt * e3a319527153b0c6c0c549b40fc6f3a01a7a0dcd6620784391db25901df3b154 lib/unicore/extracted/DNumValues.txt - * 6f7e75c46e2c6e4cff53fd9c14a0fbc77611809565d609b15cb98868c5891cdd lib/unicore/mktables + * c237f9e6bda604db4388693b42a20ee0d5c2cf9c08152beca27aa0e1ee735550 lib/unicore/mktables * 21653d2744fdd071f9ef138c805393901bb9547cf3e777ebf50215a191f986ea lib/unicore/version * 4bb677187a1a64e39d48f2e341b5ecb6c99857e49d7a79cf503bd8a3c709999b regen/charset_translations.pl * 03e51b0f07beebd5da62ab943899aa4934eee1f792fa27c1fb638c33bf4ac6ea regen/mk_PL_charclass.pl |