summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorKarl Williamson <public@khwilliamson.com>2011-02-15 08:37:16 -0700
committerKarl Williamson <public@khwilliamson.com>2011-02-15 09:27:15 -0700
commit7319f91d2a5083b255891bbdec7edc0bdfa49f4d (patch)
treeae85af39985bef6ef49af7557189d7d81e25f324 /lib
parentcef6a343d5e19fe2dc2c3655ecf621c8ff26f252 (diff)
downloadperl-7319f91d2a5083b255891bbdec7edc0bdfa49f4d.tar.gz
Add UCD::num() to get safe numeric value of a string
This function will return the numeric value of the string passed it, and undef if the entire string has no safe numeric value. To be safe, a string must be a single character which has a numeric value, or consist entirely of characters that match \d, coming from the same Unicode block of digits. Thus, a mix of Bengali and Western digits would be considered unsafe, as well as a mix of half- and full-width digits.
Diffstat (limited to 'lib')
-rw-r--r--lib/Unicode/UCD.pm116
-rw-r--r--lib/Unicode/UCD.t15
2 files changed, 128 insertions, 3 deletions
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm
index 3de4d11b42..ecfa1703ac 100644
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -4,7 +4,7 @@ use strict;
use warnings;
use charnames ();
-our $VERSION = '0.30';
+our $VERSION = '0.31';
use Storable qw(dclone);
@@ -19,7 +19,9 @@ our @EXPORT_OK = qw(charinfo
general_categories bidi_types
compexcl
casefold casespec
- namedseq);
+ namedseq
+ num
+ );
use Carp;
@@ -66,6 +68,9 @@ Unicode::UCD - Unicode character database
my $unicode_version = Unicode::UCD::UnicodeVersion();
+ my $convert_to_numeric =
+ Unicode::UCD::num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");
+
=head1 DESCRIPTION
The Unicode::UCD module offers a series of functions that
@@ -1179,6 +1184,113 @@ sub namedseq {
return;
}
+my %NUMERIC;
+
+sub _numeric {
+
+ # Unicode 6.0 instituted the rule that only digits in a consecutive
+ # block of 10 would be considered decimal digits. Before that, the only
+ # problematic code point that I'm (khw) aware of is U+019DA, NEW TAI LUE
+ # THAM DIGIT ONE, which is an alternate form of U+019D1, NEW TAI LUE DIGIT
+ # ONE. The code could be modified to handle that, but not bothering, as
+ # in TUS 6.0, U+19DA was changed to Nt=Di.
+ if ((pack "C*", split /\./, UnicodeVersion()) lt 6.0.0) {
+ croak __PACKAGE__, "::num requires Unicode 6.0 or greater"
+ }
+ my @temp = split /^/m, do "unicore/To/Nv.pl";
+ foreach my $line (@temp) {
+ chomp $line;
+ my @value = split /\t\t/, $line;
+ if ((my @rational = split /\//, $value[1]) == 2) {
+ $value[1] = $rational[0] / $rational[1];
+ }
+ $NUMERIC{$value[0]} = $value[1];
+ }
+ use Math::Trig;
+ my $pi = acos(-1.0);
+ $NUMERIC{"03C0"} = $pi;
+
+ # Euler's constant, not to be confused with Euler's number
+ $NUMERIC{"2107"} = 0.57721566490153286060651209008240243104215933593992;
+
+ # Euler's number
+ $NUMERIC{"212F"} = 2.7182818284590452353602874713526624977572;
+ return;
+}
+
+=pod
+
+=head2 num
+
+C<num> returns the numeric value of the input Unicode string; or C<undef> if it
+doesn't think the entire string has a completely valid, safe numeric value.
+
+If the string is just one character in length, the Unicode numeric value
+is returned if it has one, or C<undef> otherwise. Note that this need
+not be a whole number. C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for
+example returns -0.5. A few characters to which Unicode doesn't officially
+assign a numeric value are considered numeric by C<num>.
+These are:
+
+ EULER CONSTANT 0.5772... (this is NOT Euler's number)
+ SCRIPT SMALL E 2.71828... (this IS Euler's number)
+ GREEK SMALL LETTER PI 3.14159...
+
+If the string is more than one character, C<undef> is returned unless
+all its characters are decimal digits (that is they would match C<\d+>),
+from the same script. For example if you have an ASCII '0' and a Bengali
+'3', mixed together, they aren't considered a valid number, and C<undef>
+is returned. A further restriction is that the digits all have to be of
+the same form. A half-width digit mixed with a full-width one will
+return C<undef>. The Arabic script has two sets of digits; C<num> will
+return C<undef> unless all the digits in the string come from the same
+set.
+
+C<num> errs on the side of safety, and there may be valid strings of
+decimal digits that it doesn't recognize. Note that Unicode defines
+a number of "digit" characters that aren't "decimal digit" characters.
+"Decimal digits" have the property that they have a positional value, that is
+there is a units position, a 10's position, a 100's, etc, AND they are
+arranged in Unicode in blocks of 10 contiguous code points. The Chinese
+digits, for example, are not in such a contiguous block, and so Unicode
+doesn't view them as decimal digits, but merely digits, and so C<\d> will not
+match them. A single-character string containing one of these digits will
+have its decimal value returned by C<num>, but any longer string containing
+only these digits will return C<undef>.
+
+Strings of sub- and superscripts are not recognized as numbers. You can
+use either of the compatibility decompositions in Unicode::Normalize to
+change these into digits, and then call C<num> on the result.
+
+=cut
+
+# To handle sub, superscripts, this could if called in list context,
+# consider those, and return the <decomposition> type in the second
+# array element.
+
+sub num {
+ my $string = $_[0];
+
+ _numeric unless %NUMERIC;
+
+ my $length = length($string);
+ return $NUMERIC{sprintf("%04X", ord($string))} if $length == 1;
+ return if $string =~ /\D/;
+ my $first_ord = ord(substr($string, 0, 1));
+ my $value = $NUMERIC{sprintf("%04X", $first_ord)};
+ my $zero_ord = $first_ord - $value;
+
+ for my $i (1 .. $length -1) {
+ my $ord = ord(substr($string, $i, 1));
+ my $digit = $ord - $zero_ord;
+ return unless $digit >= 0 && $digit <= 9;
+ $value = $value * 10 + $digit;
+ }
+ return $value;
+}
+
+
+
=head2 Unicode::UCD::UnicodeVersion
This returns the version of the Unicode Character Database, in other words, the
diff --git a/lib/Unicode/UCD.t b/lib/Unicode/UCD.t
index b764c6daee..01276f2f16 100644
--- a/lib/Unicode/UCD.t
+++ b/lib/Unicode/UCD.t
@@ -17,7 +17,7 @@ use strict;
use Unicode::UCD;
use Test::More;
-BEGIN { plan tests => 258 };
+BEGIN { plan tests => 268 };
use Unicode::UCD 'charinfo';
@@ -448,3 +448,16 @@ is($ns{"KATAKANA LETTER AINU P"}, "\x{31F7}\x{309A}");
@ns = namedseq(42);
is(@ns, 0);
+use Unicode::UCD qw(num);
+use charnames ":full";
+
+is(num("0"), 0, 'Verify num("0") == 0');
+is(num("98765"), 98765, 'Verify num("98765") == 98765');
+ok(! defined num("98765\N{FULLWIDTH DIGIT FOUR}"), 'Verify num("98765\N{FULLWIDTH DIGIT FOUR}") isnt defined');
+is(num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE DIGIT ONE}"), 21, 'Verify \N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE DIGIT ONE}" == 21');
+ok(! defined num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE THAM DIGIT ONE}"), 'Verify \N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE DIGIT ONE}" isnt defined');
+is(num("\N{CHAM DIGIT ZERO}\N{CHAM DIGIT THREE}"), 3, 'Verify num("\N{CHAM DIGIT ZERO}\N{CHAM DIGIT THREE}") == 3');
+ok(! defined num("\N{CHAM DIGIT ZERO}\N{JAVANESE DIGIT NINE}"), 'Verify num("\N{CHAM DIGIT ZERO}\N{JAVANESE DIGIT NINE}") isnt defined');
+is(num("\N{SUPERSCRIPT TWO}"), 2, 'Verify num("\N{SUPERSCRIPT TWO} == 2');
+is(num("\N{ETHIOPIC NUMBER TEN THOUSAND}"), 10000, 'Verify num("\N{ETHIOPIC NUMBER TEN THOUSAND}") == 10000');
+is(num("\N{NORTH INDIC FRACTION ONE HALF}"), .5, 'Verify num("\N{NORTH INDIC FRACTION ONE HALF}") == .5');