summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--charclass_invlists.h2
-rw-r--r--lib/Unicode/UCD.pm47
-rw-r--r--lib/Unicode/UCD.t24
-rw-r--r--pod/perldelta.pod7
-rw-r--r--regcharclass.h2
5 files changed, 65 insertions, 17 deletions
diff --git a/charclass_invlists.h b/charclass_invlists.h
index 3d3b57b756..0c2e8b5830 100644
--- a/charclass_invlists.h
+++ b/charclass_invlists.h
@@ -109344,7 +109344,7 @@ static const U8 WB_table[24][24] = {
#endif /* defined(PERL_IN_REGEXEC_C) */
/* Generated from:
- * 251db67a0c884878c52e063af87b61d8b6e86f23ca6c8032877069b8ebf8e5cb lib/Unicode/UCD.pm
+ * 0512c6af7435cd0cb3482d76f1ce75e6a310694d3f3dbf9091678b785a7e72e5 lib/Unicode/UCD.pm
* ff4404ec64f308bdf7714c50f9fdf0d1d0bf3c34db4d0a67e58ef0c6f88e818f lib/unicore/ArabicShaping.txt
* 292171a0a1c13d7e581e8781eb4cdf248243b1ab267354a63c7a14429dea2740 lib/unicore/BidiBrackets.txt
* 8f2695cc42989a79a715ab0d2892bd0c998759180cfdfb998674447f48231940 lib/unicore/BidiMirroring.txt
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index ca0f4006e3..cd54e2ea50 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -1865,14 +1865,18 @@ sub _numeric {
my $val = num("123");
my $one_quarter = num("\N{VULGAR FRACTION 1/4}");
+ my ($val = num("12a", \$valid_length); # $valid_length contains 2
C<num()> returns the numeric value of the input Unicode string; or C<undef> if it
doesn't think the entire string has a completely valid, safe numeric value.
+If called with an optional second parameter, a reference to a scalar, C<num()>
+will set the scalar to the length of any valid initial substring; or to 0 if none.
If the string is just one character in length, the Unicode numeric value
-is returned if it has one, or C<undef> otherwise. Note that this need
-not be a whole number. C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for
-example returns -0.5.
+is returned if it has one, or C<undef> otherwise. If the optional scalar ref
+is passed, it would be set to 1 if the return is valid; or 0 if the return is
+C<undef>. Note that the numeric value returned need not be a whole number.
+C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for example returns -0.5.
=cut
@@ -1894,7 +1898,9 @@ is returned. A further restriction is that the digits all have to be of
the same form. A half-width digit mixed with a full-width one will
return C<undef>. The Arabic script has two sets of digits; C<num> will
return C<undef> unless all the digits in the string come from the same
-set.
+set. In all cases, the optional scalar ref parameter is set to how
+long any valid initial substring of digits is; hence it will be set to the
+entire string length if the main return value is not C<undef>.
C<num> errs on the side of safety, and there may be valid strings of
decimal digits that it doesn't recognize. Note that Unicode defines
@@ -1918,16 +1924,30 @@ change these into digits, and then call C<num> on the result.
# consider those, and return the <decomposition> type in the second
# array element.
-sub num {
- my $string = $_[0];
+sub num ($;$) {
+ my ($string, $retlen_ref) = @_;
+
+ use feature 'unicode_strings';
_numeric unless %NUMERIC;
+ $$retlen_ref = 0 if $retlen_ref; # Assume will fail
+
+ my $length = length $string;
+ return if $length == 0;
- my $length = length($string);
- return $NUMERIC{ord($string)} if $length == 1;
- return if $string =~ /\D/;
my $first_ord = ord(substr($string, 0, 1));
+ return if ! exists $NUMERIC{$first_ord}
+ || ! defined $NUMERIC{$first_ord};
+
+ # Here, we know the first character is numeric
my $value = $NUMERIC{$first_ord};
+ $$retlen_ref = 1 if $retlen_ref; # Assume only this one is numeric
+
+ return $value if $length == 1;
+
+ # Here, the input is longer than a single character. To be valid, it must
+ # be entirely decimal digits, which means it must start with one.
+ return if $string =~ / ^ \D /x;
# To be a valid decimal number, it should be in a block of 10 consecutive
# characters, whose values are 0, 1, 2, ... 9. Therefore this digit's
@@ -1939,7 +1959,8 @@ sub num {
# release, we verify that this first character is a member of such a
# block. That is, that the block of characters surrounding this one
# consists of all \d characters whose numeric values are the expected
- # ones.
+ # ones. If not, then this single character is numeric, but the string as
+ # a whole is not considered to be.
UnicodeVersion() unless defined $v_unicode_version;
if ($v_unicode_version lt v6.0.0) {
for my $i (0 .. 9) {
@@ -1961,10 +1982,14 @@ sub num {
# function.
my $ord = ord(substr($string, $i, 1));
my $digit = $ord - $zero_ord;
- return unless $digit >= 0 && $digit <= 9;
+ if ($digit < 0 || $digit > 9) {
+ $$retlen_ref = $i if $retlen_ref;
+ return;
+ }
$value = $value * 10 + $digit;
}
+ $$retlen_ref = $length if $retlen_ref;
return $value;
}
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t
index b3476a68fa..0538bda305 100644
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -819,10 +819,19 @@ use charnames (); # Don't use \N{} on things not in original Unicode
# version; else will get a compilation error when this .t
# is run on an older version.
+my $ret_len;
is(num("0"), 0, 'Verify num("0") == 0');
-is(num("98765"), 98765, 'Verify num("98765") == 98765');
-ok(! defined num("98765\N{FULLWIDTH DIGIT FOUR}"),
+is(num("0", \$ret_len), 0, 'Verify num("0", \$ret_len) == 0');
+is($ret_len, 1, "... and the returned length is 1");
+ok(! defined num("", \$ret_len), 'Verify num("", \$ret_len) isnt defined');
+is($ret_len, 0, "... and the returned length is 0");
+ok(! defined num("A", \$ret_len), 'Verify num("A") isnt defined');
+is($ret_len, 0, "... and the returned length is 0");
+is(num("98765", \$ret_len), 98765, 'Verify num("98765") == 98765');
+is($ret_len, 5, "... and the returned length is 5");
+ok(! defined num("98765\N{FULLWIDTH DIGIT FOUR}", \$ret_len),
'Verify num("98765\N{FULLWIDTH DIGIT FOUR}") isnt defined');
+is($ret_len, 5, "... but the returned length is 5");
my $tai_lue_2;
if ($v_unicode_version ge v4.1.0) {
my $tai_lue_1 = charnames::string_vianame("NEW TAI LUE DIGIT ONE");
@@ -834,8 +843,13 @@ if ($v_unicode_version ge v4.1.0) {
}
if ($v_unicode_version ge v5.2.0) {
ok(! defined num($tai_lue_2
- . charnames::string_vianame("NEW TAI LUE THAM DIGIT ONE")),
+ . charnames::string_vianame("NEW TAI LUE THAM DIGIT ONE"), \$ret_len),
'Verify num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE THAM DIGIT ONE}") isnt defined');
+ is($ret_len, 1, "... but the returned length is 1");
+ ok(! defined num(charnames::string_vianame("NEW TAI LUE THAM DIGIT ONE")
+ . $tai_lue_2, \$ret_len),
+ 'Verify num("\N{NEW TAI LUE THAM DIGIT ONE}\N{NEW TAI LUE DIGIT TWO}") isnt defined');
+ is($ret_len, 1, "... but the returned length is 1");
}
if ($v_unicode_version ge v5.1.0) {
my $cham_0 = charnames::string_vianame("CHAM DIGIT ZERO");
@@ -843,8 +857,10 @@ if ($v_unicode_version ge v5.1.0) {
'Verify num("\N{CHAM DIGIT ZERO}\N{CHAM DIGIT THREE}") == 3');
if ($v_unicode_version ge v5.2.0) {
ok(! defined num( $cham_0
- . charnames::string_vianame("JAVANESE DIGIT NINE")),
+ . charnames::string_vianame("JAVANESE DIGIT NINE"),
+ \$ret_len),
'Verify num("\N{CHAM DIGIT ZERO}\N{JAVANESE DIGIT NINE}") isnt defined');
+ is($ret_len, 1, "... but the returned length is 1");
}
}
is(num("\N{SUPERSCRIPT TWO}"), 2, 'Verify num("\N{SUPERSCRIPT TWO} == 2');
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 1a90b33d84..fdc8016f62 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -154,6 +154,13 @@ L<XXX> has been upgraded from version A.xx to B.yy.
If there was something important to note about this change, include that here.
+=item *
+
+L<Unicode::UCD> has been upgraded from version 0.69 to 0.70.
+
+The function C<num> now accepts an optional parameter to help in
+diagnosing error returns.
+
=back
=head2 Removed Modules and Pragmata
diff --git a/regcharclass.h b/regcharclass.h
index b575bd8b22..5f3c07fc37 100644
--- a/regcharclass.h
+++ b/regcharclass.h
@@ -1854,7 +1854,7 @@
#endif /* PERL_REGCHARCLASS_H_ */
/* Generated from:
- * 251db67a0c884878c52e063af87b61d8b6e86f23ca6c8032877069b8ebf8e5cb lib/Unicode/UCD.pm
+ * 0512c6af7435cd0cb3482d76f1ce75e6a310694d3f3dbf9091678b785a7e72e5 lib/Unicode/UCD.pm
* ff4404ec64f308bdf7714c50f9fdf0d1d0bf3c34db4d0a67e58ef0c6f88e818f lib/unicore/ArabicShaping.txt
* 292171a0a1c13d7e581e8781eb4cdf248243b1ab267354a63c7a14429dea2740 lib/unicore/BidiBrackets.txt
* 8f2695cc42989a79a715ab0d2892bd0c998759180cfdfb998674447f48231940 lib/unicore/BidiMirroring.txt