5 files changed, 65 insertions, 17 deletions
diff --git a/charclass_invlists.h b/charclass_invlists.h
index 3d3b57b756..0c2e8b5830 100644
--- a/charclass_invlists.h
+++ b/charclass_invlists.h
@@ -109344,7 +109344,7 @@ static const U8 WB_table[24][24] = {
 #endif	/* defined(PERL_IN_REGEXEC_C) */
 
 /* Generated from:
- * 251db67a0c884878c52e063af87b61d8b6e86f23ca6c8032877069b8ebf8e5cb lib/Unicode/UCD.pm
+ * 0512c6af7435cd0cb3482d76f1ce75e6a310694d3f3dbf9091678b785a7e72e5 lib/Unicode/UCD.pm
  * ff4404ec64f308bdf7714c50f9fdf0d1d0bf3c34db4d0a67e58ef0c6f88e818f lib/unicore/ArabicShaping.txt
  * 292171a0a1c13d7e581e8781eb4cdf248243b1ab267354a63c7a14429dea2740 lib/unicore/BidiBrackets.txt
  * 8f2695cc42989a79a715ab0d2892bd0c998759180cfdfb998674447f48231940 lib/unicore/BidiMirroring.txt
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index ca0f4006e3..cd54e2ea50 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -1865,14 +1865,18 @@ sub _numeric {
 
     my $val = num("123");
     my $one_quarter = num("\N{VULGAR FRACTION 1/4}");
+    my ($val = num("12a", \$valid_length);  # $valid_length contains 2
 
 C<num()> returns the numeric value of the input Unicode string; or C<undef> if it
 doesn't think the entire string has a completely valid, safe numeric value.
+If called with an optional second parameter, a reference to a scalar, C<num()>
+will set the scalar to the length of any valid initial substring; or to 0 if none.
 
 If the string is just one character in length, the Unicode numeric value
-is returned if it has one, or C<undef> otherwise.  Note that this need
-not be a whole number.  C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for
-example returns -0.5.
+is returned if it has one, or C<undef> otherwise.  If the optional scalar ref
+is passed, it would be set to 1 if the return is valid; or 0 if the return is
+C<undef>.  Note that the numeric value returned need not be a whole number.
+C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for example returns -0.5.
 
 =cut
 
@@ -1894,7 +1898,9 @@ is returned.  A further restriction is that the digits all have to be of
 the same form.  A half-width digit mixed with a full-width one will
 return C<undef>.  The Arabic script has two sets of digits;  C<num> will
 return C<undef> unless all the digits in the string come from the same
-set.
+set.  In all cases, the optional scalar ref parameter is set to how
+long any valid initial substring of digits is; hence it will be set to the
+entire string length if the main return value is not C<undef>.
 
 C<num> errs on the side of safety, and there may be valid strings of
 decimal digits that it doesn't recognize.  Note that Unicode defines
@@ -1918,16 +1924,30 @@ change these into digits, and then call C<num> on the result.
 # consider those, and return the <decomposition> type in the second
 # array element.
 
-sub num {
-    my $string = $_[0];
+sub num ($;$) {
+    my ($string, $retlen_ref) = @_;
+
+    use feature 'unicode_strings';
 
     _numeric unless %NUMERIC;
+    $$retlen_ref = 0 if $retlen_ref;    # Assume will fail
+
+    my $length = length $string;
+    return if $length == 0;
 
-    my $length = length($string);
-    return $NUMERIC{ord($string)} if $length == 1;
-    return if $string =~ /\D/;
     my $first_ord = ord(substr($string, 0, 1));
+    return if ! exists  $NUMERIC{$first_ord}
+           || ! defined $NUMERIC{$first_ord};
+
+    # Here, we know the first character is numeric
     my $value = $NUMERIC{$first_ord};
+    $$retlen_ref = 1 if $retlen_ref;    # Assume only this one is numeric
+
+    return $value if $length == 1;
+
+    # Here, the input is longer than a single character.  To be valid, it must
+    # be entirely decimal digits, which means it must start with one.
+    return if $string =~ / ^ \D /x;
 
     # To be a valid decimal number, it should be in a block of 10 consecutive
     # characters, whose values are 0, 1, 2, ... 9.  Therefore this digit's
@@ -1939,7 +1959,8 @@ sub num {
     # release, we verify that this first character is a member of such a
     # block.  That is, that the block of characters surrounding this one
     # consists of all \d characters whose numeric values are the expected
-    # ones.
+    # ones.  If not, then this single character is numeric, but the string as
+    # a whole is not considered to be.
     UnicodeVersion() unless defined $v_unicode_version;
     if ($v_unicode_version lt v6.0.0) {
         for my $i (0 .. 9) {
@@ -1961,10 +1982,14 @@ sub num {
         # function.
         my $ord = ord(substr($string, $i, 1));
         my $digit = $ord - $zero_ord;
-        return unless $digit >= 0 && $digit <= 9;
+        if ($digit < 0 || $digit > 9) {
+            $$retlen_ref = $i if $retlen_ref;
+            return;
+        }
         $value = $value * 10 + $digit;
     }
 
+    $$retlen_ref = $length if $retlen_ref;
     return $value;
 }
 
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t
index b3476a68fa..0538bda305 100644
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -819,10 +819,19 @@ use charnames ();   # Don't use \N{} on things not in original Unicode
                     # version; else will get a compilation error when this .t
                     # is run on an older version.
 
+my $ret_len;
 is(num("0"), 0, 'Verify num("0") == 0');
-is(num("98765"), 98765, 'Verify num("98765") == 98765');
-ok(! defined num("98765\N{FULLWIDTH DIGIT FOUR}"),
+is(num("0", \$ret_len), 0, 'Verify num("0", \$ret_len) == 0');
+is($ret_len, 1, "... and the returned length is 1");
+ok(! defined num("", \$ret_len), 'Verify num("", \$ret_len) isnt defined');
+is($ret_len, 0, "... and the returned length is 0");
+ok(! defined num("A", \$ret_len), 'Verify num("A") isnt defined');
+is($ret_len, 0, "... and the returned length is 0");
+is(num("98765", \$ret_len), 98765, 'Verify num("98765") == 98765');
+is($ret_len, 5, "... and the returned length is 5");
+ok(! defined num("98765\N{FULLWIDTH DIGIT FOUR}", \$ret_len),
    'Verify num("98765\N{FULLWIDTH DIGIT FOUR}") isnt defined');
+is($ret_len, 5, "... but the returned length is 5");
 my $tai_lue_2;
 if ($v_unicode_version ge v4.1.0) {
     my $tai_lue_1 = charnames::string_vianame("NEW TAI LUE DIGIT ONE");
@@ -834,8 +843,13 @@ if ($v_unicode_version ge v4.1.0) {
 }
 if ($v_unicode_version ge v5.2.0) {
     ok(! defined num($tai_lue_2
-         . charnames::string_vianame("NEW TAI LUE THAM DIGIT ONE")),
+         . charnames::string_vianame("NEW TAI LUE THAM DIGIT ONE"), \$ret_len),
          'Verify num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE THAM DIGIT ONE}") isnt defined');
+    is($ret_len, 1, "... but the returned length is 1");
+    ok(! defined num(charnames::string_vianame("NEW TAI LUE THAM DIGIT ONE")
+                     .  $tai_lue_2, \$ret_len),
+         'Verify num("\N{NEW TAI LUE THAM DIGIT ONE}\N{NEW TAI LUE DIGIT TWO}") isnt defined');
+    is($ret_len, 1, "... but the returned length is 1");
 }
 if ($v_unicode_version ge v5.1.0) {
     my $cham_0 = charnames::string_vianame("CHAM DIGIT ZERO");
@@ -843,8 +857,10 @@ if ($v_unicode_version ge v5.1.0) {
        'Verify num("\N{CHAM DIGIT ZERO}\N{CHAM DIGIT THREE}") == 3');
     if ($v_unicode_version ge v5.2.0) {
         ok(! defined num(  $cham_0
-                         . charnames::string_vianame("JAVANESE DIGIT NINE")),
+                         . charnames::string_vianame("JAVANESE DIGIT NINE"),
+                         \$ret_len),
         'Verify num("\N{CHAM DIGIT ZERO}\N{JAVANESE DIGIT NINE}") isnt defined');
+    is($ret_len, 1, "... but the returned length is 1");
     }
 }
 is(num("\N{SUPERSCRIPT TWO}"), 2, 'Verify num("\N{SUPERSCRIPT TWO} == 2');
diff --git a/pod/perldelta.pod b/pod/perldelta.pod
index 1a90b33d84..fdc8016f62 100644
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -154,6 +154,13 @@ L<XXX> has been upgraded from version A.xx to B.yy.
 
 If there was something important to note about this change, include that here.
 
+=item *
+
+L<Unicode::UCD> has been upgraded from version 0.69 to 0.70.
+
+The function C<num> now accepts an optional parameter to help in
+diagnosing error returns.
+
 =back
 
 =head2 Removed Modules and Pragmata
diff --git a/regcharclass.h b/regcharclass.h
index b575bd8b22..5f3c07fc37 100644
--- a/regcharclass.h
+++ b/regcharclass.h
@@ -1854,7 +1854,7 @@
 #endif /* PERL_REGCHARCLASS_H_ */
 
 /* Generated from:
- * 251db67a0c884878c52e063af87b61d8b6e86f23ca6c8032877069b8ebf8e5cb lib/Unicode/UCD.pm
+ * 0512c6af7435cd0cb3482d76f1ce75e6a310694d3f3dbf9091678b785a7e72e5 lib/Unicode/UCD.pm
  * ff4404ec64f308bdf7714c50f9fdf0d1d0bf3c34db4d0a67e58ef0c6f88e818f lib/unicore/ArabicShaping.txt
  * 292171a0a1c13d7e581e8781eb4cdf248243b1ab267354a63c7a14429dea2740 lib/unicore/BidiBrackets.txt
  * 8f2695cc42989a79a715ab0d2892bd0c998759180cfdfb998674447f48231940 lib/unicore/BidiMirroring.txt