diff options
-rw-r--r-- | charclass_invlists.h | 2 | ||||
-rw-r--r-- | lib/Unicode/UCD.pm | 47 | ||||
-rw-r--r-- | lib/Unicode/UCD.t | 24 | ||||
-rw-r--r-- | pod/perldelta.pod | 7 | ||||
-rw-r--r-- | regcharclass.h | 2 |
5 files changed, 65 insertions, 17 deletions
diff --git a/charclass_invlists.h b/charclass_invlists.h index 3d3b57b756..0c2e8b5830 100644 --- a/charclass_invlists.h +++ b/charclass_invlists.h @@ -109344,7 +109344,7 @@ static const U8 WB_table[24][24] = { #endif /* defined(PERL_IN_REGEXEC_C) */ /* Generated from: - * 251db67a0c884878c52e063af87b61d8b6e86f23ca6c8032877069b8ebf8e5cb lib/Unicode/UCD.pm + * 0512c6af7435cd0cb3482d76f1ce75e6a310694d3f3dbf9091678b785a7e72e5 lib/Unicode/UCD.pm * ff4404ec64f308bdf7714c50f9fdf0d1d0bf3c34db4d0a67e58ef0c6f88e818f lib/unicore/ArabicShaping.txt * 292171a0a1c13d7e581e8781eb4cdf248243b1ab267354a63c7a14429dea2740 lib/unicore/BidiBrackets.txt * 8f2695cc42989a79a715ab0d2892bd0c998759180cfdfb998674447f48231940 lib/unicore/BidiMirroring.txt diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm index ca0f4006e3..cd54e2ea50 100644 --- a/lib/Unicode/UCD.pm +++ b/lib/Unicode/UCD.pm @@ -1865,14 +1865,18 @@ sub _numeric { my $val = num("123"); my $one_quarter = num("\N{VULGAR FRACTION 1/4}"); + my ($val = num("12a", \$valid_length); # $valid_length contains 2 C<num()> returns the numeric value of the input Unicode string; or C<undef> if it doesn't think the entire string has a completely valid, safe numeric value. +If called with an optional second parameter, a reference to a scalar, C<num()> +will set the scalar to the length of any valid initial substring; or to 0 if none. If the string is just one character in length, the Unicode numeric value -is returned if it has one, or C<undef> otherwise. Note that this need -not be a whole number. C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for -example returns -0.5. +is returned if it has one, or C<undef> otherwise. If the optional scalar ref +is passed, it would be set to 1 if the return is valid; or 0 if the return is +C<undef>. Note that the numeric value returned need not be a whole number. +C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for example returns -0.5. =cut @@ -1894,7 +1898,9 @@ is returned. A further restriction is that the digits all have to be of the same form. A half-width digit mixed with a full-width one will return C<undef>. The Arabic script has two sets of digits; C<num> will return C<undef> unless all the digits in the string come from the same -set. +set. In all cases, the optional scalar ref parameter is set to how +long any valid initial substring of digits is; hence it will be set to the +entire string length if the main return value is not C<undef>. C<num> errs on the side of safety, and there may be valid strings of decimal digits that it doesn't recognize. Note that Unicode defines @@ -1918,16 +1924,30 @@ change these into digits, and then call C<num> on the result. # consider those, and return the <decomposition> type in the second # array element. -sub num { - my $string = $_[0]; +sub num ($;$) { + my ($string, $retlen_ref) = @_; + + use feature 'unicode_strings'; _numeric unless %NUMERIC; + $$retlen_ref = 0 if $retlen_ref; # Assume will fail + + my $length = length $string; + return if $length == 0; - my $length = length($string); - return $NUMERIC{ord($string)} if $length == 1; - return if $string =~ /\D/; my $first_ord = ord(substr($string, 0, 1)); + return if ! exists $NUMERIC{$first_ord} + || ! defined $NUMERIC{$first_ord}; + + # Here, we know the first character is numeric my $value = $NUMERIC{$first_ord}; + $$retlen_ref = 1 if $retlen_ref; # Assume only this one is numeric + + return $value if $length == 1; + + # Here, the input is longer than a single character. To be valid, it must + # be entirely decimal digits, which means it must start with one. + return if $string =~ / ^ \D /x; # To be a valid decimal number, it should be in a block of 10 consecutive # characters, whose values are 0, 1, 2, ... 9. Therefore this digit's @@ -1939,7 +1959,8 @@ sub num { # release, we verify that this first character is a member of such a # block. That is, that the block of characters surrounding this one # consists of all \d characters whose numeric values are the expected - # ones. + # ones. If not, then this single character is numeric, but the string as + # a whole is not considered to be. UnicodeVersion() unless defined $v_unicode_version; if ($v_unicode_version lt v6.0.0) { for my $i (0 .. 9) { @@ -1961,10 +1982,14 @@ sub num { # function. my $ord = ord(substr($string, $i, 1)); my $digit = $ord - $zero_ord; - return unless $digit >= 0 && $digit <= 9; + if ($digit < 0 || $digit > 9) { + $$retlen_ref = $i if $retlen_ref; + return; + } $value = $value * 10 + $digit; } + $$retlen_ref = $length if $retlen_ref; return $value; } diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t index b3476a68fa..0538bda305 100644 --- a/lib/Unicode/UCD.t +++ b/lib/Unicode/UCD.t @@ -819,10 +819,19 @@ use charnames (); # Don't use \N{} on things not in original Unicode # version; else will get a compilation error when this .t # is run on an older version. +my $ret_len; is(num("0"), 0, 'Verify num("0") == 0'); -is(num("98765"), 98765, 'Verify num("98765") == 98765'); -ok(! defined num("98765\N{FULLWIDTH DIGIT FOUR}"), +is(num("0", \$ret_len), 0, 'Verify num("0", \$ret_len) == 0'); +is($ret_len, 1, "... and the returned length is 1"); +ok(! defined num("", \$ret_len), 'Verify num("", \$ret_len) isnt defined'); +is($ret_len, 0, "... and the returned length is 0"); +ok(! defined num("A", \$ret_len), 'Verify num("A") isnt defined'); +is($ret_len, 0, "... and the returned length is 0"); +is(num("98765", \$ret_len), 98765, 'Verify num("98765") == 98765'); +is($ret_len, 5, "... and the returned length is 5"); +ok(! defined num("98765\N{FULLWIDTH DIGIT FOUR}", \$ret_len), 'Verify num("98765\N{FULLWIDTH DIGIT FOUR}") isnt defined'); +is($ret_len, 5, "... but the returned length is 5"); my $tai_lue_2; if ($v_unicode_version ge v4.1.0) { my $tai_lue_1 = charnames::string_vianame("NEW TAI LUE DIGIT ONE"); @@ -834,8 +843,13 @@ if ($v_unicode_version ge v4.1.0) { } if ($v_unicode_version ge v5.2.0) { ok(! defined num($tai_lue_2 - . charnames::string_vianame("NEW TAI LUE THAM DIGIT ONE")), + . charnames::string_vianame("NEW TAI LUE THAM DIGIT ONE"), \$ret_len), 'Verify num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE THAM DIGIT ONE}") isnt defined'); + is($ret_len, 1, "... but the returned length is 1"); + ok(! defined num(charnames::string_vianame("NEW TAI LUE THAM DIGIT ONE") + . $tai_lue_2, \$ret_len), + 'Verify num("\N{NEW TAI LUE THAM DIGIT ONE}\N{NEW TAI LUE DIGIT TWO}") isnt defined'); + is($ret_len, 1, "... but the returned length is 1"); } if ($v_unicode_version ge v5.1.0) { my $cham_0 = charnames::string_vianame("CHAM DIGIT ZERO"); @@ -843,8 +857,10 @@ if ($v_unicode_version ge v5.1.0) { 'Verify num("\N{CHAM DIGIT ZERO}\N{CHAM DIGIT THREE}") == 3'); if ($v_unicode_version ge v5.2.0) { ok(! defined num( $cham_0 - . charnames::string_vianame("JAVANESE DIGIT NINE")), + . charnames::string_vianame("JAVANESE DIGIT NINE"), + \$ret_len), 'Verify num("\N{CHAM DIGIT ZERO}\N{JAVANESE DIGIT NINE}") isnt defined'); + is($ret_len, 1, "... but the returned length is 1"); } } is(num("\N{SUPERSCRIPT TWO}"), 2, 'Verify num("\N{SUPERSCRIPT TWO} == 2'); diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 1a90b33d84..fdc8016f62 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -154,6 +154,13 @@ L<XXX> has been upgraded from version A.xx to B.yy. If there was something important to note about this change, include that here. +=item * + +L<Unicode::UCD> has been upgraded from version 0.69 to 0.70. + +The function C<num> now accepts an optional parameter to help in +diagnosing error returns. + =back =head2 Removed Modules and Pragmata diff --git a/regcharclass.h b/regcharclass.h index b575bd8b22..5f3c07fc37 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -1854,7 +1854,7 @@ #endif /* PERL_REGCHARCLASS_H_ */ /* Generated from: - * 251db67a0c884878c52e063af87b61d8b6e86f23ca6c8032877069b8ebf8e5cb lib/Unicode/UCD.pm + * 0512c6af7435cd0cb3482d76f1ce75e6a310694d3f3dbf9091678b785a7e72e5 lib/Unicode/UCD.pm * ff4404ec64f308bdf7714c50f9fdf0d1d0bf3c34db4d0a67e58ef0c6f88e818f lib/unicore/ArabicShaping.txt * 292171a0a1c13d7e581e8781eb4cdf248243b1ab267354a63c7a14429dea2740 lib/unicore/BidiBrackets.txt * 8f2695cc42989a79a715ab0d2892bd0c998759180cfdfb998674447f48231940 lib/unicore/BidiMirroring.txt |