summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSeth M Morton <seth.m.morton@gmail.com>2018-04-10 22:30:35 -0700
committerSeth M Morton <seth.m.morton@gmail.com>2018-04-10 22:30:35 -0700
commitd495a090f07942f0eacbd1830abb150138f404a2 (patch)
treed28cfae1dbd78def0c840c01499e586509326f1f
parent7a700ae26299878a3e59ac68dd517a2aef799b5f (diff)
downloadnatsort-d495a090f07942f0eacbd1830abb150138f404a2.tar.gz
The parsing regular expressions support unicode decimals.
The regular expressions that separate numbers from non-numbers have been updated to accept more than just ASCII numbers.
-rw-r--r--natsort/unicode_numbers.py9
-rw-r--r--natsort/utils.py19
2 files changed, 18 insertions, 10 deletions
diff --git a/natsort/unicode_numbers.py b/natsort/unicode_numbers.py
index 921c5c1..e87654f 100644
--- a/natsort/unicode_numbers.py
+++ b/natsort/unicode_numbers.py
@@ -280,10 +280,17 @@ for a in numeric_hex:
digit_chars = [a for a in numeric_chars
if unicodedata.digit(a, None) is not None]
+# The decimal characters are a subset of the numerals
+# (probably of the digits, but let's be safe).
+decimal_chars = [a for a in numeric_chars
+ if unicodedata.decimal(a, None) is not None]
+
# Create a single string with the above data.
+decimals = ''.join(decimal_chars)
digits = ''.join(digit_chars)
numeric = ''.join(numeric_chars)
-
+digits_no_decimals = ''.join([x for x in digits if x not in decimals])
+numeric_no_decimals = ''.join([x for x in numeric if x not in decimals])
# Some code that can be used to create the above list of hex numbers.
if __name__ == '__main__':
diff --git a/natsort/utils.py b/natsort/utils.py
index b6484b0..306762a 100644
--- a/natsort/utils.py
+++ b/natsort/utils.py
@@ -58,7 +58,7 @@ from unicodedata import normalize
# Local imports.
from natsort.ns_enum import ns
-from natsort.unicode_numbers import digits, numeric
+from natsort.unicode_numbers import numeric_no_decimals, digits_no_decimals
from natsort.compat.pathlib import PurePath, has_pathlib
from natsort.compat.locale import (
get_strxfrm,
@@ -80,25 +80,26 @@ if PY_VERSION >= 3:
long = int
# The regex that locates floats - include Unicode numerals.
-_exp = r'(?:[eE][-+]?[0-9]+)?'
-_num = r'(?:[0-9]+\.?[0-9]*|\.[0-9]+)'
+_nnd = numeric_no_decimals
+_exp = r'(?:[eE][-+]?\d+)?'
+_num = r'(?:\d+\.?\d*|\.\d+)'
_float_sign_exp_re = r'([-+]?{0}{1}|[{2}])'
-_float_sign_exp_re = _float_sign_exp_re.format(_num, _exp, numeric)
+_float_sign_exp_re = _float_sign_exp_re.format(_num, _exp, _nnd)
_float_sign_exp_re = re.compile(_float_sign_exp_re, flags=re.U)
_float_nosign_exp_re = r'({0}{1}|[{2}])'
-_float_nosign_exp_re = _float_nosign_exp_re.format(_num, _exp, numeric)
+_float_nosign_exp_re = _float_nosign_exp_re.format(_num, _exp, _nnd)
_float_nosign_exp_re = re.compile(_float_nosign_exp_re, flags=re.U)
_float_sign_noexp_re = r'([-+]?{0}|[{1}])'
-_float_sign_noexp_re = _float_sign_noexp_re.format(_num, numeric)
+_float_sign_noexp_re = _float_sign_noexp_re.format(_num, _nnd)
_float_sign_noexp_re = re.compile(_float_sign_noexp_re, flags=re.U)
_float_nosign_noexp_re = r'({0}|[{1}])'
-_float_nosign_noexp_re = _float_nosign_noexp_re.format(_num, numeric)
+_float_nosign_noexp_re = _float_nosign_noexp_re.format(_num, _nnd)
_float_nosign_noexp_re = re.compile(_float_nosign_noexp_re, flags=re.U)
# Integer regexes - include Unicode digits.
-_int_nosign_re = r'([0-9]+|[{0}])'.format(digits)
+_int_nosign_re = r'(\d+|[{0}])'.format(digits_no_decimals)
_int_nosign_re = re.compile(_int_nosign_re, flags=re.U)
-_int_sign_re = r'([-+]?[0-9]+|[{0}])'.format(digits)
+_int_sign_re = r'([-+]?\d+|[{0}])'.format(digits_no_decimals)
_int_sign_re = re.compile(_int_sign_re, flags=re.U)
# This dict will help select the correct regex and number conversion function.