From 83e6c031994d553b74991501c6cd85e3517fadd8 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Thu, 12 Aug 2021 16:58:41 +0200 Subject: Implement a dedicated int/float parser for XML (schema) values in lxml.objectify. This disables support for "_" in numbers, which are allowed by Python but not by XMLSchema. We keep a few additional literals, such as "+NaN", simply because they shouldn't hurt. See https://mail.python.org/archives/list/lxml@python.org/thread/6F7VIDKWZTJ6LB6VOX6IJNNWICYHFPNR/ --- src/lxml/objectify.pyx | 119 ++++++++++++++++++++++++++++++++++++++- src/lxml/tests/test_objectify.py | 69 ++++++++++++++++++++--- 2 files changed, 179 insertions(+), 9 deletions(-) diff --git a/src/lxml/objectify.pyx b/src/lxml/objectify.pyx index e587e4f2..cacbe806 100644 --- a/src/lxml/objectify.pyx +++ b/src/lxml/objectify.pyx @@ -943,6 +943,121 @@ cdef object _parseNumber(NumberElement element): return element._parse_value(textOf(element._c_node)) +cdef enum NumberParserState: + NPS_SPACE_PRE = 0 + NPS_SIGN = 1 + NPS_DIGITS = 2 + NPS_POINT_LEAD = 3 + NPS_POINT = 4 + NPS_FRACTION = 5 + NPS_EXP = 6 + NPS_EXP_SIGN = 7 + NPS_DIGITS_EXP = 8 + NPS_SPACE_TAIL = 9 + NPS_INF1 = 20 + NPS_INF2 = 21 + NPS_INF3 = 22 + NPS_NAN1 = 23 + NPS_NAN2 = 24 + NPS_NAN3 = 25 + NPS_ERROR = 99 + + +ctypedef fused bytes_unicode: + bytes + unicode + + +cdef _checkNumber(bytes_unicode s, bint allow_float): + cdef Py_UCS4 c + cdef NumberParserState state = NPS_SPACE_PRE + + for c in s: + if c.isdigit() if (bytes_unicode is unicode) else c in b'0123456789': + if state in (NPS_DIGITS, NPS_FRACTION, NPS_DIGITS_EXP): + pass + elif state in (NPS_SPACE_PRE, NPS_SIGN): + state = NPS_DIGITS + elif state in (NPS_POINT_LEAD, NPS_POINT): + state = NPS_FRACTION + elif state in (NPS_EXP, NPS_EXP_SIGN): + state = NPS_DIGITS_EXP + else: + state = NPS_ERROR + else: + if c == u'.': + if state in (NPS_SPACE_PRE, NPS_SIGN): + state = NPS_POINT_LEAD + elif state == NPS_DIGITS: + state = NPS_POINT + else: + state = 
NPS_ERROR + if not allow_float: + state = NPS_ERROR + elif c in u'-+': + if state == NPS_SPACE_PRE: + state = NPS_SIGN + elif state == NPS_EXP: + state = NPS_EXP_SIGN + else: + state = NPS_ERROR + elif c == u'E': + if state in (NPS_DIGITS, NPS_POINT, NPS_FRACTION): + state = NPS_EXP + else: + state = NPS_ERROR + if not allow_float: + state = NPS_ERROR + # Allow INF and NaN. XMLSchema requires case, we don't, like Python. + elif c in u'iI': + state = NPS_INF1 if allow_float and state in (NPS_SPACE_PRE, NPS_SIGN) else NPS_ERROR + elif c in u'fF': + state = NPS_INF3 if state == NPS_INF2 else NPS_ERROR + elif c in u'aA': + state = NPS_NAN2 if state == NPS_NAN1 else NPS_ERROR + elif c in u'nN': + # Python also allows [+-]NaN, so let's accept that. + if state in (NPS_SPACE_PRE, NPS_SIGN): + state = NPS_NAN1 if allow_float else NPS_ERROR + elif state == NPS_NAN2: + state = NPS_NAN3 + elif state == NPS_INF1: + state = NPS_INF2 + else: + state = NPS_ERROR + # Allow spaces around text values. + else: + if c.isspace() if (bytes_unicode is unicode) else c in b'\x09\x0a\x0b\x0c\x0d\x20': + if state in (NPS_SPACE_PRE, NPS_SPACE_TAIL): + pass + elif state in (NPS_DIGITS, NPS_POINT, NPS_FRACTION, NPS_DIGITS_EXP, NPS_INF3, NPS_NAN3): + state = NPS_SPACE_TAIL + else: + state = NPS_ERROR + else: + state = NPS_ERROR + + if state == NPS_ERROR: + break + + if state not in (NPS_DIGITS, NPS_FRACTION, NPS_POINT, NPS_DIGITS_EXP, NPS_INF3, NPS_NAN3, NPS_SPACE_TAIL): + raise ValueError + + +cdef _checkInt(s): + if python.IS_PYTHON2 and type(s) is bytes: + return _checkNumber(s, allow_float=False) + else: + return _checkNumber(s, allow_float=False) + + +cdef _checkFloat(s): + if python.IS_PYTHON2 and type(s) is bytes: + return _checkNumber(s, allow_float=True) + else: + return _checkNumber(s, allow_float=True) + + cdef object _strValueOf(obj): if python._isString(obj): return obj @@ -1104,7 +1219,7 @@ def pytypename(obj): return _pytypename(obj) cdef _registerPyTypes(): - pytype = 
PyType(u'int', int, IntElement) + pytype = PyType(u'int', _checkInt, IntElement) # wraps functions for Python pytype.xmlSchemaTypes = (u"integer", u"int", u"short", u"byte", u"unsignedShort", u"unsignedByte", u"nonPositiveInteger", u"negativeInteger", u"long", u"nonNegativeInteger", @@ -1115,7 +1230,7 @@ cdef _registerPyTypes(): pytype = PyType(u'long', None, IntElement) pytype.register() - pytype = PyType(u'float', float, FloatElement, repr) + pytype = PyType(u'float', _checkFloat, FloatElement, repr) # wraps _parseFloat for Python pytype.xmlSchemaTypes = (u"double", u"float") pytype.register() diff --git a/src/lxml/tests/test_objectify.py b/src/lxml/tests/test_objectify.py index a12ae7e1..178ba256 100644 --- a/src/lxml/tests/test_objectify.py +++ b/src/lxml/tests/test_objectify.py @@ -6,7 +6,9 @@ Tests specific to the lxml.objectify API from __future__ import absolute_import -import unittest, operator +import operator +import random +import unittest from .common_imports import ( etree, HelperTestCase, fileInTestDir, doctest, make_doctest, _bytes, _str, BytesIO @@ -2641,6 +2643,9 @@ class ObjectifyTestCase(HelperTestCase): 4294967296 -4294967296 1.1 + .1 + .1E23 + .1E-23 true false Strange things happen, where strings collide @@ -2649,6 +2654,11 @@ class ObjectifyTestCase(HelperTestCase): t f + 12_34 + 1.2_34 + 34E + .E + . 
None @@ -2656,20 +2666,65 @@ class ObjectifyTestCase(HelperTestCase): root = XML(xml) for i in root.i: - self.assertTrue(isinstance(i, objectify.IntElement)) + self.assertTrue(isinstance(i, objectify.IntElement), (i.text, type(i))) for l in root.l: - self.assertTrue(isinstance(l, objectify.IntElement)) + self.assertTrue(isinstance(l, objectify.IntElement), (l.text, type(l))) for f in root.f: - self.assertTrue(isinstance(f, objectify.FloatElement)) + self.assertTrue(isinstance(f, objectify.FloatElement), (f.text, type(f))) for b in root.b: - self.assertTrue(isinstance(b, objectify.BoolElement)) + self.assertTrue(isinstance(b, objectify.BoolElement), (b.text, type(b))) self.assertEqual(True, root.b[0]) self.assertEqual(False, root.b[1]) for s in root.s: - self.assertTrue(isinstance(s, objectify.StringElement)) - self.assertTrue(isinstance(root.n, objectify.NoneElement)) + self.assertTrue(isinstance(s, objectify.StringElement), (s.text, type(s))) + self.assertTrue(isinstance(root.n, objectify.NoneElement), root.n) self.assertEqual(None, root.n) + def test_standard_lookup_fuzz(self): + SPACES = ('',) * 10 + ('\t', 'x', '\n', '\r\n', u'\xA0', u'\x0A', u'\u200A', u'\u200B') + DIGITS = ('', '0', '1', '11', '21', '345678', '9'*20) + + def space(_choice=random.choice): + return _choice(SPACES) + + fuzz = [ + '%s\n' % (space() + sign + digits + point + fraction + exp + exp_sign + exp_digits + special + space()) + for sign in ('', '+', '-') + for digits in DIGITS + for point in ('', '.') + for fraction in DIGITS + for exp in ('', 'E') + for exp_sign in ('', '+', '-') + for exp_digits in DIGITS + for special in ('', 'INF', 'inf', 'NaN', 'nan', 'an', 'na', 'ana', 'nf') + ] + + root = self.XML(_bytes('''\ + + ''' + ''.join(fuzz) + ''' + + ''')) + + test_count = 0 + for el in root.iterchildren(): + text = el.text + expected_type = objectify.ObjectifiedElement + if text: + try: + int(text) + expected_type = objectify.IntElement + except ValueError: + try: + float(text) + 
expected_type = objectify.FloatElement + except ValueError: + expected_type = objectify.StringElement + + self.assertTrue(isinstance(el, expected_type), (text, expected_type, type(el))) + test_count += 1 + self.assertEqual(len(fuzz), test_count) + + def test_suite(): suite = unittest.TestSuite() suite.addTests([unittest.makeSuite(ObjectifyTestCase)]) -- cgit v1.2.1