Add UCD::num() to get safe numeric value of a string

This function will return the numeric value of the string passed it, and undef if the entire string has no safe numeric value. To be safe, a string must be a single character which has a numeric value, or consist entirely of characters that match \d, coming from the same Unicode block of digits. Thus, a mix of Bengali and Western digits would be considered unsafe, as well as a mix of half- and full-width digits.
author: Karl Williamson <public@khwilliamson.com> 2011-02-15 08:37:16 -0700
committer: Karl Williamson <public@khwilliamson.com> 2011-02-15 09:27:15 -0700
commit: 7319f91d2a5083b255891bbdec7edc0bdfa49f4d (patch)
tree: ae85af39985bef6ef49af7557189d7d81e25f324 /lib
parent: cef6a343d5e19fe2dc2c3655ecf621c8ff26f252 (diff)
download: perl-7319f91d2a5083b255891bbdec7edc0bdfa49f4d.tar.gz
2 files changed, 128 insertions, 3 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index 3de4d11b42..ecfa1703ac 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -4,7 +4,7 @@ use strict;
 use warnings;
 use charnames ();
 
-our $VERSION = '0.30';
+our $VERSION = '0.31';
 
 use Storable qw(dclone);
 
@@ -19,7 +19,9 @@ our @EXPORT_OK = qw(charinfo
 		    general_categories bidi_types
 		    compexcl
 		    casefold casespec
-		    namedseq);
+		    namedseq
+                    num
+                );
 
 use Carp;
 
@@ -66,6 +68,9 @@ Unicode::UCD - Unicode character database
 
     my $unicode_version = Unicode::UCD::UnicodeVersion();
 
+    my $convert_to_numeric =
+                Unicode::UCD::num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");
+
 =head1 DESCRIPTION
 
 The Unicode::UCD module offers a series of functions that
@@ -1179,6 +1184,113 @@ sub namedseq {
     return;
 }
 
+my %NUMERIC;
+
+sub _numeric {
+
+    # Unicode 6.0 instituted the rule that only digits in a consecutive
+    # block of 10 would be considered decimal digits.  Before that, the only
+    # problematic code point that I'm (khw) aware of is U+019DA, NEW TAI LUE
+    # THAM DIGIT ONE, which is an alternate form of U+019D1, NEW TAI LUE DIGIT
+    # ONE.  The code could be modified to handle that, but not bothering, as
+    # in TUS 6.0, U+19DA was changed to Nt=Di.
+    if ((pack "C*", split /\./, UnicodeVersion()) lt 6.0.0) {
+	croak __PACKAGE__, "::num requires Unicode 6.0 or greater"
+    }
+    my @temp = split /^/m, do "unicore/To/Nv.pl";
+    foreach my $line (@temp) {
+        chomp $line;
+        my @value = split /\t\t/, $line;
+        if ((my @rational = split /\//, $value[1]) == 2) {
+            $value[1] = $rational[0] / $rational[1];
+        }
+        $NUMERIC{$value[0]} = $value[1];
+    }
+    use Math::Trig;
+    my $pi = acos(-1.0);
+    $NUMERIC{"03C0"} = $pi;
+
+    # Euler's constant, not to be confused with Euler's number
+    $NUMERIC{"2107"} = 0.57721566490153286060651209008240243104215933593992;
+
+    # Euler's number
+    $NUMERIC{"212F"} = 2.7182818284590452353602874713526624977572;
+    return;
+}
+
+=pod
+
+=head2 num
+
+C<num> returns the numeric value of the input Unicode string; or C<undef> if it
+doesn't think the entire string has a completely valid, safe numeric value.
+
+If the string is just one character in length, the Unicode numeric value
+is returned if it has one, or C<undef> otherwise.  Note that this need
+not be a whole number.  C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for
+example returns -0.5.  A few characters to which Unicode doesn't officially
+assign a numeric value are considered numeric by C<num>.
+These are:
+
+ EULER CONSTANT             0.5772...  (this is NOT Euler's number)
+ SCRIPT SMALL E             2.71828... (this IS Euler's number)
+ GREEK SMALL LETTER PI      3.14159...
+
+If the string is more than one character, C<undef> is returned unless
+all its characters are decimal digits (that is they would match C<\d+>),
+from the same script.  For example if you have an ASCII '0' and a Bengali
+'3', mixed together, they aren't considered a valid number, and C<undef>
+is returned.  A further restriction is that the digits all have to be of
+the same form.  A half-width digit mixed with a full-width one will
+return C<undef>.  The Arabic script has two sets of digits;  C<num> will
+return C<undef> unless all the digits in the string come from the same
+set.
+
+C<num> errs on the side of safety, and there may be valid strings of
+decimal digits that it doesn't recognize.  Note that Unicode defines
+a number of "digit" characters that aren't "decimal digit" characters.
+"Decimal digits" have the property that they have a positional value, that is
+there is a units position, a 10's position, a 100's, etc, AND they are
+arranged in Unicode in blocks of 10 contiguous code points.  The Chinese
+digits, for example, are not in such a contiguous block, and so Unicode
+doesn't view them as decimal digits, but merely digits, and so C<\d> will not
+match them.  A single-character string containing one of these digits will
+have its decimal value returned by C<num>, but any longer string containing
+only these digits will return C<undef>.
+
+Strings of sub- and superscripts are not recognized as numbers.  You can
+use either of the compatibility decompositions in Unicode::Normalize to
+change these into digits, and then call C<num> on the result.
+
+=cut
+
+# To handle sub, superscripts, this could if called in list context,
+# consider those, and return the <decomposition> type in the second
+# array element.
+
+sub num {
+    my $string = $_[0];
+
+    _numeric unless %NUMERIC;
+
+    my $length = length($string);
+    return $NUMERIC{sprintf("%04X", ord($string))} if $length == 1;
+    return if $string =~ /\D/;
+    my $first_ord = ord(substr($string, 0, 1));
+    my $value = $NUMERIC{sprintf("%04X", $first_ord)};
+    my $zero_ord = $first_ord - $value;
+
+    for my $i (1 .. $length -1) {
+        my $ord = ord(substr($string, $i, 1));
+        my $digit = $ord - $zero_ord;
+        return unless $digit >= 0 && $digit <= 9;
+        $value = $value * 10 + $digit;
+    }
+    return $value;
+}
+
+
+
 =head2 Unicode::UCD::UnicodeVersion
 
 This returns the version of the Unicode Character Database, in other words, the
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t
index b764c6daee..01276f2f16 100644
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -17,7 +17,7 @@ use strict;
 use Unicode::UCD;
 use Test::More;
 
-BEGIN { plan tests => 258 };
+BEGIN { plan tests => 268 };
 
 use Unicode::UCD 'charinfo';
 
@@ -448,3 +448,16 @@ is($ns{"KATAKANA LETTER AINU P"}, "\x{31F7}\x{309A}");
 @ns = namedseq(42);
 is(@ns, 0);
 
+use Unicode::UCD qw(num);
+use charnames ":full";
+
+is(num("0"), 0, 'Verify num("0") == 0');
+is(num("98765"), 98765, 'Verify num("98765") == 98765');
+ok(! defined num("98765\N{FULLWIDTH DIGIT FOUR}"), 'Verify num("98765\N{FULLWIDTH DIGIT FOUR}") isnt defined');
+is(num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE DIGIT ONE}"), 21, 'Verify \N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE DIGIT ONE}" == 21');
+ok(! defined num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE THAM DIGIT ONE}"), 'Verify \N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE DIGIT ONE}" isnt defined');
+is(num("\N{CHAM DIGIT ZERO}\N{CHAM DIGIT THREE}"), 3, 'Verify num("\N{CHAM DIGIT ZERO}\N{CHAM DIGIT THREE}") == 3');
+ok(! defined num("\N{CHAM DIGIT ZERO}\N{JAVANESE DIGIT NINE}"), 'Verify num("\N{CHAM DIGIT ZERO}\N{JAVANESE DIGIT NINE}") isnt defined');
+is(num("\N{SUPERSCRIPT TWO}"), 2, 'Verify num("\N{SUPERSCRIPT TWO} == 2');
+is(num("\N{ETHIOPIC NUMBER TEN THOUSAND}"), 10000, 'Verify num("\N{ETHIOPIC NUMBER TEN THOUSAND}") == 10000');
+is(num("\N{NORTH INDIC FRACTION ONE HALF}"), .5, 'Verify num("\N{NORTH INDIC FRACTION ONE HALF}") == .5');
author	Karl Williamson <public@khwilliamson.com>	2011-02-15 08:37:16 -0700
committer	Karl Williamson <public@khwilliamson.com>	2011-02-15 09:27:15 -0700
commit	7319f91d2a5083b255891bbdec7edc0bdfa49f4d (patch)
tree	ae85af39985bef6ef49af7557189d7d81e25f324 /lib
parent	cef6a343d5e19fe2dc2c3655ecf621c8ff26f252 (diff)
download	perl-7319f91d2a5083b255891bbdec7edc0bdfa49f4d.tar.gz