Merge branch 'master' into master

author: Lennart Regebro <regebro@gmail.com> 2018-11-23 15:18:50 +0100
committer: GitHub <noreply@github.com> 2018-11-23 15:18:50 +0100
commit: 51308a28ac6e4e5ec7e014932a1ef39c1f99c5de (patch)
tree: 8ad507b1ee511f4172c179a6d2f7bb4a95784f87
parent: 8c8e6136cd35f12ad0b90e8265eb13c5ea58e29b (diff)
parent: 2ea6f97c5758b80d6a8394724c36091234fc9191 (diff)
download: python-lxml-51308a28ac6e4e5ec7e014932a1ef39c1f99c5de.tar.gz
6 files changed, 56 insertions, 12 deletions
diff --git a/CHANGES.txt b/CHANGES.txt
index 33f929aa..4b2503ca 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -14,6 +14,11 @@ Features added
   point to the same URI, the first prefix in alphabetical order is used
   for attributes.
 
+Bugs fixed
+----------
+
+* LP#1799755: Fix a DeprecationWarning in Py3.7+.
+
 
 4.2.6 (2018-??-??)
 ==================
diff --git a/src/lxml/apihelpers.pxi b/src/lxml/apihelpers.pxi
index 91f85e4f..5366fcaf 100644
--- a/src/lxml/apihelpers.pxi
+++ b/src/lxml/apihelpers.pxi
@@ -1340,14 +1340,50 @@ cdef int _addSibling(_Element element, _Element sibling, bint as_next) except -1
     moveNodeToDocument(element._doc, c_source_doc, c_node)
     return 0
 
-cdef inline int isutf8(const_xmlChar* s):
+cdef inline bint isutf8(const_xmlChar* s):
     cdef xmlChar c = s[0]
     while c != c'\0':
         if c & 0x80:
-            return 1
+            return True
         s += 1
         c = s[0]
-    return 0
+    return False
+
+cdef bint isutf8l(const_xmlChar* s, size_t length):
+    """
+    Search for non-ASCII characters in the string, knowing its length in advance.
+    """
+    cdef int i
+    cdef unsigned long non_ascii_mask
+    cdef const unsigned long *lptr = <const unsigned long*> s
+
+    cdef const unsigned long *end = lptr + length // sizeof(unsigned long)
+    if length >= sizeof(non_ascii_mask):
+        # Build constant 0x80808080... mask (and let the C compiler fold it).
+        non_ascii_mask = 0
+        for i in range(sizeof(non_ascii_mask) // 2):
+            non_ascii_mask = (non_ascii_mask << 16) | 0x8080
+
+        # Advance to long-aligned character before we start reading longs.
+        while (<size_t>s) % sizeof(unsigned long) and s < <const_xmlChar *>end:
+            if s[0] & 0x80:
+                return True
+            s += 1
+
+        # Read one long at a time
+        lptr = <const unsigned long*> s
+        while lptr < end:
+            if lptr[0] & non_ascii_mask:
+                return True
+            lptr += 1
+        s = <const_xmlChar *>lptr
+
+    while s < (<const_xmlChar *>end + length % sizeof(unsigned long)):
+        if s[0] & 0x80:
+            return True
+        s += 1
+
+    return False
 
 cdef int _is_valid_xml_ascii(bytes pystring):
     """Check if a string is XML ascii content."""
@@ -1411,7 +1447,7 @@ cdef object funicode(const_xmlChar* s):
         spos += 1
     slen = spos - s
     if spos[0] != c'\0':
-        slen += tree.xmlStrlen(spos)
+        slen += cstring_h.strlen(<const char*> spos)
     if is_non_ascii:
         return s[:slen].decode('UTF-8')
     return <bytes>s[:slen]
@@ -1520,7 +1556,7 @@ cdef object _encodeFilenameUTF8(object filename):
     if filename is None:
         return None
     elif isinstance(filename, bytes):
-        if not isutf8(<bytes>filename):
+        if not isutf8l(<bytes>filename, len(<bytes>filename)):
             # plain ASCII!
             return filename
         c_filename = _cstr(<bytes>filename)
@@ -1657,7 +1693,7 @@ cdef object _namespacedNameFromNsName(const_xmlChar* href, const_xmlChar* name):
         return python.PyUnicode_FromFormat("{%s}%s", href, name)
     else:
         s = python.PyBytes_FromFormat("{%s}%s", href, name)
-        if python.IS_PYPY and (python.LXML_UNICODE_STRINGS or isutf8(_xcstr(s))):
+        if python.IS_PYPY and (python.LXML_UNICODE_STRINGS or isutf8l(s, len(s))):
             return (<bytes>s).decode('utf8')
         else:
             return s
diff --git a/src/lxml/etree.pyx b/src/lxml/etree.pyx
index 69a553bd..a38440ba 100644
--- a/src/lxml/etree.pyx
+++ b/src/lxml/etree.pyx
@@ -3274,9 +3274,9 @@ def tostring(element_or_tree, *, encoding=None, method="xml",
     declaration by default.
 
     You can also serialise to a Unicode string without declaration by
-    passing the ``unicode`` function as encoding (or ``str`` in Py3),
-    or the name 'unicode'.  This changes the return value from a byte
-    string to an unencoded unicode string.
+    passing the name ``'unicode'`` as encoding (or the ``str`` function
+    in Py3 or ``unicode`` in Py2).  This changes the return value from
+    a byte string to an unencoded unicode string.
 
     The keyword argument 'pretty_print' (bool) enables formatted XML.
 
diff --git a/src/lxml/html/__init__.py b/src/lxml/html/__init__.py
index 4502373e..5751f709 100644
--- a/src/lxml/html/__init__.py
+++ b/src/lxml/html/__init__.py
@@ -46,7 +46,6 @@ import re
 from functools import partial
 
 try:
-    # while unnecessary, importing from 'collections.abc' is the right way to do it
     from collections.abc import MutableMapping, MutableSet
 except ImportError:
     from collections import MutableMapping, MutableSet
diff --git a/src/lxml/html/_setmixin.py b/src/lxml/html/_setmixin.py
index c14a3eb0..c99738e3 100644
--- a/src/lxml/html/_setmixin.py
+++ b/src/lxml/html/_setmixin.py
@@ -1,4 +1,8 @@
-from collections import MutableSet
+try:
+    from collections.abc import MutableSet
+except ImportError:
+    from collections import MutableSet
+
 
 class SetMixin(MutableSet):
 
diff --git a/src/lxml/serializer.pxi b/src/lxml/serializer.pxi
index 15327511..3c70258a 100644
--- a/src/lxml/serializer.pxi
+++ b/src/lxml/serializer.pxi
@@ -61,7 +61,7 @@ cdef _textToString(xmlNode* c_node, encoding, bint with_tail):
             encoding = encoding.lower()
             if encoding not in (u'utf8', u'utf-8'):
                 if encoding == u'ascii':
-                    if isutf8(c_text):
+                    if isutf8l(c_text, tree.xmlBufferLength(c_buffer)):
                         # will raise a decode error below
                         needs_conversion = 1
                 else:
author	Lennart Regebro <regebro@gmail.com>	2018-11-23 15:18:50 +0100
committer	GitHub <noreply@github.com>	2018-11-23 15:18:50 +0100
commit	51308a28ac6e4e5ec7e014932a1ef39c1f99c5de (patch)
tree	8ad507b1ee511f4172c179a6d2f7bb4a95784f87
parent	8c8e6136cd35f12ad0b90e8265eb13c5ea58e29b (diff)
parent	2ea6f97c5758b80d6a8394724c36091234fc9191 (diff)
download	python-lxml-51308a28ac6e4e5ec7e014932a1ef39c1f99c5de.tar.gz