diff options
author | Lennart Regebro <regebro@gmail.com> | 2018-11-23 15:18:50 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-11-23 15:18:50 +0100 |
commit | 51308a28ac6e4e5ec7e014932a1ef39c1f99c5de (patch) | |
tree | 8ad507b1ee511f4172c179a6d2f7bb4a95784f87 | |
parent | 8c8e6136cd35f12ad0b90e8265eb13c5ea58e29b (diff) | |
parent | 2ea6f97c5758b80d6a8394724c36091234fc9191 (diff) | |
download | python-lxml-51308a28ac6e4e5ec7e014932a1ef39c1f99c5de.tar.gz |
Merge branch 'master' into master
-rw-r--r-- | CHANGES.txt | 5 | ||||
-rw-r--r-- | src/lxml/apihelpers.pxi | 48 | ||||
-rw-r--r-- | src/lxml/etree.pyx | 6 | ||||
-rw-r--r-- | src/lxml/html/__init__.py | 1 | ||||
-rw-r--r-- | src/lxml/html/_setmixin.py | 6 | ||||
-rw-r--r-- | src/lxml/serializer.pxi | 2 |
6 files changed, 56 insertions, 12 deletions
diff --git a/CHANGES.txt b/CHANGES.txt index 33f929aa..4b2503ca 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -14,6 +14,11 @@ Features added point to the same URI, the first prefix in alphabetical order is used for attributes. +Bugs fixed +---------- + +* LP#1799755: Fix a DeprecationWarning in Py3.7+. + 4.2.6 (2018-??-??) ================== diff --git a/src/lxml/apihelpers.pxi b/src/lxml/apihelpers.pxi index 91f85e4f..5366fcaf 100644 --- a/src/lxml/apihelpers.pxi +++ b/src/lxml/apihelpers.pxi @@ -1340,14 +1340,50 @@ cdef int _addSibling(_Element element, _Element sibling, bint as_next) except -1 moveNodeToDocument(element._doc, c_source_doc, c_node) return 0 -cdef inline int isutf8(const_xmlChar* s): +cdef inline bint isutf8(const_xmlChar* s): cdef xmlChar c = s[0] while c != c'\0': if c & 0x80: - return 1 + return True s += 1 c = s[0] - return 0 + return False + +cdef bint isutf8l(const_xmlChar* s, size_t length): + """ + Search for non-ASCII characters in the string, knowing its length in advance. + """ + cdef int i + cdef unsigned long non_ascii_mask + cdef const unsigned long *lptr = <const unsigned long*> s + + cdef const unsigned long *end = lptr + length // sizeof(unsigned long) + if length >= sizeof(non_ascii_mask): + # Build constant 0x80808080... mask (and let the C compiler fold it). + non_ascii_mask = 0 + for i in range(sizeof(non_ascii_mask) // 2): + non_ascii_mask = (non_ascii_mask << 16) | 0x8080 + + # Advance to long-aligned character before we start reading longs. + while (<size_t>s) % sizeof(unsigned long) and s < <const_xmlChar *>end: + if s[0] & 0x80: + return True + s += 1 + + # Read one long at a time + lptr = <const unsigned long*> s + while lptr < end: + if lptr[0] & non_ascii_mask: + return True + lptr += 1 + s = <const_xmlChar *>lptr + + while s < (<const_xmlChar *>end + length % sizeof(unsigned long)): + if s[0] & 0x80: + return True + s += 1 + + return False cdef int _is_valid_xml_ascii(bytes pystring): """Check if a string is XML ascii content.""" @@ -1411,7 +1447,7 @@ cdef object funicode(const_xmlChar* s): spos += 1 slen = spos - s if spos[0] != c'\0': - slen += tree.xmlStrlen(spos) + slen += cstring_h.strlen(<const char*> spos) if is_non_ascii: return s[:slen].decode('UTF-8') return <bytes>s[:slen] @@ -1520,7 +1556,7 @@ cdef object _encodeFilenameUTF8(object filename): if filename is None: return None elif isinstance(filename, bytes): - if not isutf8(<bytes>filename): + if not isutf8l(<bytes>filename, len(<bytes>filename)): # plain ASCII! return filename c_filename = _cstr(<bytes>filename) @@ -1657,7 +1693,7 @@ cdef object _namespacedNameFromNsName(const_xmlChar* href, const_xmlChar* name): return python.PyUnicode_FromFormat("{%s}%s", href, name) else: s = python.PyBytes_FromFormat("{%s}%s", href, name) - if python.IS_PYPY and (python.LXML_UNICODE_STRINGS or isutf8(_xcstr(s))): + if python.IS_PYPY and (python.LXML_UNICODE_STRINGS or isutf8l(s, len(s))): return (<bytes>s).decode('utf8') else: return s diff --git a/src/lxml/etree.pyx b/src/lxml/etree.pyx index 69a553bd..a38440ba 100644 --- a/src/lxml/etree.pyx +++ b/src/lxml/etree.pyx @@ -3274,9 +3274,9 @@ def tostring(element_or_tree, *, encoding=None, method="xml", declaration by default. You can also serialise to a Unicode string without declaration by - passing the ``unicode`` function as encoding (or ``str`` in Py3), - or the name 'unicode'. This changes the return value from a byte - string to an unencoded unicode string. + passing the name ``'unicode'`` as encoding (or the ``str`` function + in Py3 or ``unicode`` in Py2). This changes the return value from + a byte string to an unencoded unicode string. The keyword argument 'pretty_print' (bool) enables formatted XML. diff --git a/src/lxml/html/__init__.py b/src/lxml/html/__init__.py index 4502373e..5751f709 100644 --- a/src/lxml/html/__init__.py +++ b/src/lxml/html/__init__.py @@ -46,7 +46,6 @@ import re from functools import partial try: - # while unnecessary, importing from 'collections.abc' is the right way to do it from collections.abc import MutableMapping, MutableSet except ImportError: from collections import MutableMapping, MutableSet diff --git a/src/lxml/html/_setmixin.py b/src/lxml/html/_setmixin.py index c14a3eb0..c99738e3 100644 --- a/src/lxml/html/_setmixin.py +++ b/src/lxml/html/_setmixin.py @@ -1,4 +1,8 @@ -from collections import MutableSet +try: + from collections.abc import MutableSet +except ImportError: + from collections import MutableSet + class SetMixin(MutableSet): diff --git a/src/lxml/serializer.pxi b/src/lxml/serializer.pxi index 15327511..3c70258a 100644 --- a/src/lxml/serializer.pxi +++ b/src/lxml/serializer.pxi @@ -61,7 +61,7 @@ cdef _textToString(xmlNode* c_node, encoding, bint with_tail): encoding = encoding.lower() if encoding not in (u'utf8', u'utf-8'): if encoding == u'ascii': - if isutf8(c_text): + if isutf8l(c_text, tree.xmlBufferLength(c_buffer)): # will raise a decode error below needs_conversion = 1 else: |