summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Lennart Regebro <regebro@gmail.com> 2018-11-23 15:18:50 +0100
committer: GitHub <noreply@github.com> 2018-11-23 15:18:50 +0100
commit: 51308a28ac6e4e5ec7e014932a1ef39c1f99c5de (patch)
tree: 8ad507b1ee511f4172c179a6d2f7bb4a95784f87
parent: 8c8e6136cd35f12ad0b90e8265eb13c5ea58e29b (diff)
parent: 2ea6f97c5758b80d6a8394724c36091234fc9191 (diff)
download: python-lxml-51308a28ac6e4e5ec7e014932a1ef39c1f99c5de.tar.gz
Merge branch 'master' into master
-rw-r--r-- CHANGES.txt 5
-rw-r--r-- src/lxml/apihelpers.pxi 48
-rw-r--r-- src/lxml/etree.pyx 6
-rw-r--r-- src/lxml/html/__init__.py 1
-rw-r--r-- src/lxml/html/_setmixin.py 6
-rw-r--r-- src/lxml/serializer.pxi 2
6 files changed, 56 insertions, 12 deletions
diff --git a/CHANGES.txt b/CHANGES.txt
index 33f929aa..4b2503ca 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -14,6 +14,11 @@ Features added
point to the same URI, the first prefix in alphabetical order is used
for attributes.
+Bugs fixed
+----------
+
+* LP#1799755: Fix a DeprecationWarning in Py3.7+.
+
4.2.6 (2018-??-??)
==================
diff --git a/src/lxml/apihelpers.pxi b/src/lxml/apihelpers.pxi
index 91f85e4f..5366fcaf 100644
--- a/src/lxml/apihelpers.pxi
+++ b/src/lxml/apihelpers.pxi
@@ -1340,14 +1340,50 @@ cdef int _addSibling(_Element element, _Element sibling, bint as_next) except -1
moveNodeToDocument(element._doc, c_source_doc, c_node)
return 0
-cdef inline int isutf8(const_xmlChar* s):
+cdef inline bint isutf8(const_xmlChar* s):
cdef xmlChar c = s[0]
while c != c'\0':
if c & 0x80:
- return 1
+ return True
s += 1
c = s[0]
- return 0
+ return False
+
+cdef bint isutf8l(const_xmlChar* s, size_t length):
+ """
+ Search for non-ASCII characters in the string, knowing its length in advance.
+ """
+ cdef int i
+ cdef unsigned long non_ascii_mask
+ cdef const unsigned long *lptr = <const unsigned long*> s
+
+ cdef const unsigned long *end = lptr + length // sizeof(unsigned long)
+ if length >= sizeof(non_ascii_mask):
+ # Build constant 0x80808080... mask (and let the C compiler fold it).
+ non_ascii_mask = 0
+ for i in range(sizeof(non_ascii_mask) // 2):
+ non_ascii_mask = (non_ascii_mask << 16) | 0x8080
+
+ # Advance to long-aligned character before we start reading longs.
+ while (<size_t>s) % sizeof(unsigned long) and s < <const_xmlChar *>end:
+ if s[0] & 0x80:
+ return True
+ s += 1
+
+ # Read one long at a time
+ lptr = <const unsigned long*> s
+ while lptr < end:
+ if lptr[0] & non_ascii_mask:
+ return True
+ lptr += 1
+ s = <const_xmlChar *>lptr
+
+ while s < (<const_xmlChar *>end + length % sizeof(unsigned long)):
+ if s[0] & 0x80:
+ return True
+ s += 1
+
+ return False
cdef int _is_valid_xml_ascii(bytes pystring):
"""Check if a string is XML ascii content."""
@@ -1411,7 +1447,7 @@ cdef object funicode(const_xmlChar* s):
spos += 1
slen = spos - s
if spos[0] != c'\0':
- slen += tree.xmlStrlen(spos)
+ slen += cstring_h.strlen(<const char*> spos)
if is_non_ascii:
return s[:slen].decode('UTF-8')
return <bytes>s[:slen]
@@ -1520,7 +1556,7 @@ cdef object _encodeFilenameUTF8(object filename):
if filename is None:
return None
elif isinstance(filename, bytes):
- if not isutf8(<bytes>filename):
+ if not isutf8l(<bytes>filename, len(<bytes>filename)):
# plain ASCII!
return filename
c_filename = _cstr(<bytes>filename)
@@ -1657,7 +1693,7 @@ cdef object _namespacedNameFromNsName(const_xmlChar* href, const_xmlChar* name):
return python.PyUnicode_FromFormat("{%s}%s", href, name)
else:
s = python.PyBytes_FromFormat("{%s}%s", href, name)
- if python.IS_PYPY and (python.LXML_UNICODE_STRINGS or isutf8(_xcstr(s))):
+ if python.IS_PYPY and (python.LXML_UNICODE_STRINGS or isutf8l(s, len(s))):
return (<bytes>s).decode('utf8')
else:
return s
diff --git a/src/lxml/etree.pyx b/src/lxml/etree.pyx
index 69a553bd..a38440ba 100644
--- a/src/lxml/etree.pyx
+++ b/src/lxml/etree.pyx
@@ -3274,9 +3274,9 @@ def tostring(element_or_tree, *, encoding=None, method="xml",
declaration by default.
You can also serialise to a Unicode string without declaration by
- passing the ``unicode`` function as encoding (or ``str`` in Py3),
- or the name 'unicode'. This changes the return value from a byte
- string to an unencoded unicode string.
+ passing the name ``'unicode'`` as encoding (or the ``str`` function
+ in Py3 or ``unicode`` in Py2). This changes the return value from
+ a byte string to an unencoded unicode string.
The keyword argument 'pretty_print' (bool) enables formatted XML.
diff --git a/src/lxml/html/__init__.py b/src/lxml/html/__init__.py
index 4502373e..5751f709 100644
--- a/src/lxml/html/__init__.py
+++ b/src/lxml/html/__init__.py
@@ -46,7 +46,6 @@ import re
from functools import partial
try:
- # while unnecessary, importing from 'collections.abc' is the right way to do it
from collections.abc import MutableMapping, MutableSet
except ImportError:
from collections import MutableMapping, MutableSet
diff --git a/src/lxml/html/_setmixin.py b/src/lxml/html/_setmixin.py
index c14a3eb0..c99738e3 100644
--- a/src/lxml/html/_setmixin.py
+++ b/src/lxml/html/_setmixin.py
@@ -1,4 +1,8 @@
-from collections import MutableSet
+try:
+ from collections.abc import MutableSet
+except ImportError:
+ from collections import MutableSet
+
class SetMixin(MutableSet):
diff --git a/src/lxml/serializer.pxi b/src/lxml/serializer.pxi
index 15327511..3c70258a 100644
--- a/src/lxml/serializer.pxi
+++ b/src/lxml/serializer.pxi
@@ -61,7 +61,7 @@ cdef _textToString(xmlNode* c_node, encoding, bint with_tail):
encoding = encoding.lower()
if encoding not in (u'utf8', u'utf-8'):
if encoding == u'ascii':
- if isutf8(c_text):
+ if isutf8l(c_text, tree.xmlBufferLength(c_buffer)):
# will raise a decode error below
needs_conversion = 1
else: