Unicode::UCD: Add optional paramter to num()

As discussed in http://nntp.perl.org/group/perl.perl5.porters/244444, this sets the optional scalar ref paramater to the length of the valid initial portion of the first parameter passed to num(). This is useful in teasing apart why the input is invalid.
author: Karl Williamson <khw@cpan.org> 2017-12-27 13:50:43 -0700
committer: Karl Williamson <khw@cpan.org> 2017-12-27 14:36:22 -0700
commit: 56da55346bed3bc3537f276968054d464175b71e (patch)
tree: 87c212a4089ac31244f9c9a74a29b48dd89587fb /lib/Unicode
parent: 857320cbf85e762add18885ae8a197b5e0c21b69 (diff)
download: perl-56da55346bed3bc3537f276968054d464175b71e.tar.gz
2 files changed, 56 insertions, 15 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index ca0f4006e3..cd54e2ea50 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -1865,14 +1865,18 @@ sub _numeric {
 
     my $val = num("123");
     my $one_quarter = num("\N{VULGAR FRACTION 1/4}");
+    my ($val = num("12a", \$valid_length);  # $valid_length contains 2
 
 C<num()> returns the numeric value of the input Unicode string; or C<undef> if it
 doesn't think the entire string has a completely valid, safe numeric value.
+If called with an optional second parameter, a reference to a scalar, C<num()>
+will set the scalar to the length of any valid initial substring; or to 0 if none.
 
 If the string is just one character in length, the Unicode numeric value
-is returned if it has one, or C<undef> otherwise.  Note that this need
-not be a whole number.  C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for
-example returns -0.5.
+is returned if it has one, or C<undef> otherwise.  If the optional scalar ref
+is passed, it would be set to 1 if the return is valid; or 0 if the return is
+C<undef>.  Note that the numeric value returned need not be a whole number.
+C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for example returns -0.5.
 
 =cut
 
@@ -1894,7 +1898,9 @@ is returned.  A further restriction is that the digits all have to be of
 the same form.  A half-width digit mixed with a full-width one will
 return C<undef>.  The Arabic script has two sets of digits;  C<num> will
 return C<undef> unless all the digits in the string come from the same
-set.
+set.  In all cases, the optional scalar ref parameter is set to how
+long any valid initial substring of digits is; hence it will be set to the
+entire string length if the main return value is not C<undef>.
 
 C<num> errs on the side of safety, and there may be valid strings of
 decimal digits that it doesn't recognize.  Note that Unicode defines
@@ -1918,16 +1924,30 @@ change these into digits, and then call C<num> on the result.
 # consider those, and return the <decomposition> type in the second
 # array element.
 
-sub num {
-    my $string = $_[0];
+sub num ($;$) {
+    my ($string, $retlen_ref) = @_;
+
+    use feature 'unicode_strings';
 
     _numeric unless %NUMERIC;
+    $$retlen_ref = 0 if $retlen_ref;    # Assume will fail
+
+    my $length = length $string;
+    return if $length == 0;
 
-    my $length = length($string);
-    return $NUMERIC{ord($string)} if $length == 1;
-    return if $string =~ /\D/;
     my $first_ord = ord(substr($string, 0, 1));
+    return if ! exists  $NUMERIC{$first_ord}
+           || ! defined $NUMERIC{$first_ord};
+
+    # Here, we know the first character is numeric
     my $value = $NUMERIC{$first_ord};
+    $$retlen_ref = 1 if $retlen_ref;    # Assume only this one is numeric
+
+    return $value if $length == 1;
+
+    # Here, the input is longer than a single character.  To be valid, it must
+    # be entirely decimal digits, which means it must start with one.
+    return if $string =~ / ^ \D /x;
 
     # To be a valid decimal number, it should be in a block of 10 consecutive
     # characters, whose values are 0, 1, 2, ... 9.  Therefore this digit's
@@ -1939,7 +1959,8 @@ sub num {
     # release, we verify that this first character is a member of such a
     # block.  That is, that the block of characters surrounding this one
     # consists of all \d characters whose numeric values are the expected
-    # ones.
+    # ones.  If not, then this single character is numeric, but the string as
+    # a whole is not considered to be.
     UnicodeVersion() unless defined $v_unicode_version;
     if ($v_unicode_version lt v6.0.0) {
         for my $i (0 .. 9) {
@@ -1961,10 +1982,14 @@ sub num {
         # function.
         my $ord = ord(substr($string, $i, 1));
         my $digit = $ord - $zero_ord;
-        return unless $digit >= 0 && $digit <= 9;
+        if ($digit < 0 || $digit > 9) {
+            $$retlen_ref = $i if $retlen_ref;
+            return;
+        }
         $value = $value * 10 + $digit;
     }
 
+    $$retlen_ref = $length if $retlen_ref;
     return $value;
 }
 
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t
index b3476a68fa..0538bda305 100644
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -819,10 +819,19 @@ use charnames ();   # Don't use \N{} on things not in original Unicode
                     # version; else will get a compilation error when this .t
                     # is run on an older version.
 
+my $ret_len;
 is(num("0"), 0, 'Verify num("0") == 0');
-is(num("98765"), 98765, 'Verify num("98765") == 98765');
-ok(! defined num("98765\N{FULLWIDTH DIGIT FOUR}"),
+is(num("0", \$ret_len), 0, 'Verify num("0", \$ret_len) == 0');
+is($ret_len, 1, "... and the returned length is 1");
+ok(! defined num("", \$ret_len), 'Verify num("", \$ret_len) isnt defined');
+is($ret_len, 0, "... and the returned length is 0");
+ok(! defined num("A", \$ret_len), 'Verify num("A") isnt defined');
+is($ret_len, 0, "... and the returned length is 0");
+is(num("98765", \$ret_len), 98765, 'Verify num("98765") == 98765');
+is($ret_len, 5, "... and the returned length is 5");
+ok(! defined num("98765\N{FULLWIDTH DIGIT FOUR}", \$ret_len),
    'Verify num("98765\N{FULLWIDTH DIGIT FOUR}") isnt defined');
+is($ret_len, 5, "... but the returned length is 5");
 my $tai_lue_2;
 if ($v_unicode_version ge v4.1.0) {
     my $tai_lue_1 = charnames::string_vianame("NEW TAI LUE DIGIT ONE");
@@ -834,8 +843,13 @@ if ($v_unicode_version ge v4.1.0) {
 }
 if ($v_unicode_version ge v5.2.0) {
     ok(! defined num($tai_lue_2
-         . charnames::string_vianame("NEW TAI LUE THAM DIGIT ONE")),
+         . charnames::string_vianame("NEW TAI LUE THAM DIGIT ONE"), \$ret_len),
          'Verify num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE THAM DIGIT ONE}") isnt defined');
+    is($ret_len, 1, "... but the returned length is 1");
+    ok(! defined num(charnames::string_vianame("NEW TAI LUE THAM DIGIT ONE")
+                     .  $tai_lue_2, \$ret_len),
+         'Verify num("\N{NEW TAI LUE THAM DIGIT ONE}\N{NEW TAI LUE DIGIT TWO}") isnt defined');
+    is($ret_len, 1, "... but the returned length is 1");
 }
 if ($v_unicode_version ge v5.1.0) {
     my $cham_0 = charnames::string_vianame("CHAM DIGIT ZERO");
@@ -843,8 +857,10 @@ if ($v_unicode_version ge v5.1.0) {
        'Verify num("\N{CHAM DIGIT ZERO}\N{CHAM DIGIT THREE}") == 3');
     if ($v_unicode_version ge v5.2.0) {
         ok(! defined num(  $cham_0
-                         . charnames::string_vianame("JAVANESE DIGIT NINE")),
+                         . charnames::string_vianame("JAVANESE DIGIT NINE"),
+                         \$ret_len),
         'Verify num("\N{CHAM DIGIT ZERO}\N{JAVANESE DIGIT NINE}") isnt defined');
+    is($ret_len, 1, "... but the returned length is 1");
     }
 }
 is(num("\N{SUPERSCRIPT TWO}"), 2, 'Verify num("\N{SUPERSCRIPT TWO} == 2');
author	Karl Williamson <khw@cpan.org>	2017-12-27 13:50:43 -0700
committer	Karl Williamson <khw@cpan.org>	2017-12-27 14:36:22 -0700
commit	56da55346bed3bc3537f276968054d464175b71e (patch)
tree	87c212a4089ac31244f9c9a74a29b48dd89587fb /lib/Unicode
parent	857320cbf85e762add18885ae8a197b5e0c21b69 (diff)
download	perl-56da55346bed3bc3537f276968054d464175b71e.tar.gz