The parsing regular expressions support unicode decimals.

The regular expressions that separate numbers from non-numbers have been updated to accept more than just ASCII numbers.
author: Seth M Morton <seth.m.morton@gmail.com> 2018-04-10 22:30:35 -0700
committer: Seth M Morton <seth.m.morton@gmail.com> 2018-04-10 22:30:35 -0700
commit: d495a090f07942f0eacbd1830abb150138f404a2 (patch)
tree: d28cfae1dbd78def0c840c01499e586509326f1f
parent: 7a700ae26299878a3e59ac68dd517a2aef799b5f (diff)
download: natsort-d495a090f07942f0eacbd1830abb150138f404a2.tar.gz
2 files changed, 18 insertions, 10 deletions
diff --git a/natsort/unicode_numbers.py b/natsort/unicode_numbers.py
index 921c5c1..e87654f 100644
--- a/natsort/unicode_numbers.py
+++ b/natsort/unicode_numbers.py
@@ -280,10 +280,17 @@ for a in numeric_hex:
 digit_chars = [a for a in numeric_chars
                if unicodedata.digit(a, None) is not None]
 
+# The decimal characters are a subset of the numerals
+# (probably of the digits, but let's be safe).
+decimal_chars = [a for a in numeric_chars
+                 if unicodedata.decimal(a, None) is not None]
+
 # Create a single string with the above data.
+decimals = ''.join(decimal_chars)
 digits = ''.join(digit_chars)
 numeric = ''.join(numeric_chars)
-
+digits_no_decimals = ''.join([x for x in digits if x not in decimals])
+numeric_no_decimals = ''.join([x for x in numeric if x not in decimals])
 
 # Some code that can be used to create the above list of hex numbers.
 if __name__ == '__main__':
diff --git a/natsort/utils.py b/natsort/utils.py
index b6484b0..306762a 100644
--- a/natsort/utils.py
+++ b/natsort/utils.py
@@ -58,7 +58,7 @@ from unicodedata import normalize
 
 # Local imports.
 from natsort.ns_enum import ns
-from natsort.unicode_numbers import digits, numeric
+from natsort.unicode_numbers import numeric_no_decimals, digits_no_decimals
 from natsort.compat.pathlib import PurePath, has_pathlib
 from natsort.compat.locale import (
     get_strxfrm,
@@ -80,25 +80,26 @@ if PY_VERSION >= 3:
     long = int
 
 # The regex that locates floats - include Unicode numerals.
-_exp = r'(?:[eE][-+]?[0-9]+)?'
-_num = r'(?:[0-9]+\.?[0-9]*|\.[0-9]+)'
+_nnd = numeric_no_decimals
+_exp = r'(?:[eE][-+]?\d+)?'
+_num = r'(?:\d+\.?\d*|\.\d+)'
 _float_sign_exp_re = r'([-+]?{0}{1}|[{2}])'
-_float_sign_exp_re = _float_sign_exp_re.format(_num, _exp, numeric)
+_float_sign_exp_re = _float_sign_exp_re.format(_num, _exp, _nnd)
 _float_sign_exp_re = re.compile(_float_sign_exp_re, flags=re.U)
 _float_nosign_exp_re = r'({0}{1}|[{2}])'
-_float_nosign_exp_re = _float_nosign_exp_re.format(_num, _exp, numeric)
+_float_nosign_exp_re = _float_nosign_exp_re.format(_num, _exp, _nnd)
 _float_nosign_exp_re = re.compile(_float_nosign_exp_re, flags=re.U)
 _float_sign_noexp_re = r'([-+]?{0}|[{1}])'
-_float_sign_noexp_re = _float_sign_noexp_re.format(_num, numeric)
+_float_sign_noexp_re = _float_sign_noexp_re.format(_num, _nnd)
 _float_sign_noexp_re = re.compile(_float_sign_noexp_re, flags=re.U)
 _float_nosign_noexp_re = r'({0}|[{1}])'
-_float_nosign_noexp_re = _float_nosign_noexp_re.format(_num, numeric)
+_float_nosign_noexp_re = _float_nosign_noexp_re.format(_num, _nnd)
 _float_nosign_noexp_re = re.compile(_float_nosign_noexp_re, flags=re.U)
 
 # Integer regexes - include Unicode digits.
-_int_nosign_re = r'([0-9]+|[{0}])'.format(digits)
+_int_nosign_re = r'(\d+|[{0}])'.format(digits_no_decimals)
 _int_nosign_re = re.compile(_int_nosign_re, flags=re.U)
-_int_sign_re = r'([-+]?[0-9]+|[{0}])'.format(digits)
+_int_sign_re = r'([-+]?\d+|[{0}])'.format(digits_no_decimals)
 _int_sign_re = re.compile(_int_sign_re, flags=re.U)
 
 # This dict will help select the correct regex and number conversion function.
author	Seth M Morton <seth.m.morton@gmail.com>	2018-04-10 22:30:35 -0700
committer	Seth M Morton <seth.m.morton@gmail.com>	2018-04-10 22:30:35 -0700
commit	d495a090f07942f0eacbd1830abb150138f404a2 (patch)
tree	d28cfae1dbd78def0c840c01499e586509326f1f
parent	7a700ae26299878a3e59ac68dd517a2aef799b5f (diff)
download	natsort-d495a090f07942f0eacbd1830abb150138f404a2.tar.gz