summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/lxml/includes/tree.pxd3
-rw-r--r--src/lxml/lxml.etree.pyx23
-rw-r--r--src/lxml/serializer.pxi217
-rw-r--r--src/lxml/tests/common_imports.py7
-rw-r--r--src/lxml/tests/test_etree.py100
-rw-r--r--src/lxml/tests/test_incremental_xmlfile.py21
6 files changed, 296 insertions, 75 deletions
diff --git a/src/lxml/includes/tree.pxd b/src/lxml/includes/tree.pxd
index b3bab52f..0d9d8843 100644
--- a/src/lxml/includes/tree.pxd
+++ b/src/lxml/includes/tree.pxd
@@ -61,6 +61,7 @@ cdef extern from "libxml/encoding.h":
cdef extern from "libxml/chvalid.h":
cdef int xmlIsChar_ch(char c) nogil
+ cdef int xmlIsCharQ(int ch) nogil
cdef extern from "libxml/hash.h":
ctypedef struct xmlHashTable
@@ -357,6 +358,8 @@ cdef extern from "libxml/tree.h":
cdef void xmlNodeDumpOutput(xmlOutputBuffer* buf,
xmlDoc* doc, xmlNode* cur, int level,
int format, const_char* encoding) nogil
+ cdef void xmlBufAttrSerializeTxtContent(xmlOutputBuffer *buf, xmlDoc *doc,
+ xmlAttr *attr, const_xmlChar *string) nogil
cdef void xmlNodeSetName(xmlNode* cur, const_xmlChar* name) nogil
cdef void xmlNodeSetContent(xmlNode* cur, const_xmlChar* content) nogil
cdef xmlDtd* xmlCopyDtd(xmlDtd* dtd) nogil
diff --git a/src/lxml/lxml.etree.pyx b/src/lxml/lxml.etree.pyx
index 67ff69b2..c336cef2 100644
--- a/src/lxml/lxml.etree.pyx
+++ b/src/lxml/lxml.etree.pyx
@@ -1954,11 +1954,12 @@ cdef public class _ElementTree [ type LxmlElementTreeType,
def write(self, file, *, encoding=None, method=u"xml",
pretty_print=False, xml_declaration=None, with_tail=True,
- standalone=None, docstring=None, compression=0,
- exclusive=False, with_comments=True, inclusive_ns_prefixes=None):
+ standalone=None, doctype=None, compression=0,
+ exclusive=False, with_comments=True, inclusive_ns_prefixes=None,
+ docstring=None):
u"""write(self, file, encoding=None, method="xml",
pretty_print=False, xml_declaration=None, with_tail=True,
- standalone=None, compression=0,
+ standalone=None, doctype=None, compression=0,
exclusive=False, with_comments=True, inclusive_ns_prefixes=None)
Write the tree to a filename, file or file-like object.
@@ -1976,6 +1977,12 @@ cdef public class _ElementTree [ type LxmlElementTreeType,
output an XML declaration with the corresponding
``standalone`` flag.
+ The ``doctype`` option allows passing in a plain string that will
+ be serialised before the XML tree. Note that passing in non
+ well-formed content here will make the XML output non well-formed.
+ Also, an existing doctype in the document tree will not be removed
+ when serialising an ElementTree instance.
+
The ``compression`` option enables GZip compression level 1-9.
The ``inclusive_ns_prefixes`` should be a list of namespace strings
@@ -2030,7 +2037,15 @@ cdef public class _ElementTree [ type LxmlElementTreeType,
else:
write_declaration = 1
is_standalone = 0
- _tofilelike(file, self._context_node, encoding, docstring, method,
+
+ if docstring is not None and doctype is None:
+ import warnings
+ warnings.warn(
+ "The 'docstring' option is deprecated. Use 'doctype' instead.",
+ DeprecationWarning)
+ doctype = docstring
+
+ _tofilelike(file, self._context_node, encoding, doctype, method,
write_declaration, 1, pretty_print, with_tail,
is_standalone, compression)
diff --git a/src/lxml/serializer.pxi b/src/lxml/serializer.pxi
index 4b264a50..4ef53bc9 100644
--- a/src/lxml/serializer.pxi
+++ b/src/lxml/serializer.pxi
@@ -399,6 +399,208 @@ cdef void _writeNextSiblings(tree.xmlOutputBuffer* c_buffer, xmlNode* c_node,
pretty_print, encoding)
c_sibling = c_sibling.next
+
+# copied and adapted from libxml2
+#
+# Write the hexadecimal character reference "&#x...;" for the code point
+# ``val`` into ``out`` and NUL-terminate it.
+#
+# Returns a pointer to the terminating NUL byte, so that
+# ``<return value> - out`` is the length of the reference in bytes.
+#
+# ``out`` must provide room for "&#x" + up to 6 digits + ";" + NUL, i.e. at
+# least 11 bytes.  ``val`` must fit into 6 hexadecimal digits; for larger
+# values the digit loop would write in front of the digit field.  The
+# caller in this file rejects values that fail xmlIsCharQ() before calling.
+cdef unsigned char *xmlSerializeHexCharRef(unsigned char *out, int val):
+    cdef const char *hex_digits = "0123456789ABCDEF"
+    cdef xmlChar *ptr
+
+    out[0] = '&'
+    out[1] = '#'
+    out[2] = 'x'
+    out += 3
+
+    # Point ``ptr`` at the slot of the least significant digit; the digits
+    # are then filled in from right to left.
+    if val < 0x10:
+        ptr = out
+    elif val < 0x100:
+        ptr = out + 1
+    elif val < 0x1000:
+        ptr = out + 2
+    elif val < 0x10000:
+        ptr = out + 3
+    elif val < 0x100000:
+        ptr = out + 4
+    else:
+        ptr = out + 5
+
+    # ``out`` now marks the position right after the last digit.
+    out = ptr + 1
+
+    if val <= 0:
+        # The loop below emits nothing for such values; write an explicit
+        # digit instead of leaving the slot uninitialised.
+        ptr[0] = '0'
+
+    while val > 0:
+        ptr[0] = hex_digits[val & 0xF]
+        ptr -= 1
+        val >>= 4
+
+    out[0] = ';'
+    out += 1
+    out[0] = 0
+
+    return out
+
+
+# copied and adapted from libxml2 (xmlBufAttrSerializeTxtContent())
+#
+# Write the NUL-terminated byte string ``string`` to ``buf``, escaped for
+# use inside a double-quoted attribute value:
+#
+# * newline, carriage return and tab become numeric character references,
+# * '"', '<', '>' and '&' become the corresponding named entities,
+# * multi-byte UTF-8 sequences become hexadecimal character references,
+# * everything else is copied through unchanged.
+#
+# Does nothing if ``string`` is NULL.  Raises ValueError for a byte
+# sequence that cannot be decoded as UTF-8 or that decodes to a value
+# rejected by xmlIsCharQ().  Bytes preceding the offending sequence have
+# already been written to ``buf`` at that point.
+cdef _write_attr_string(tree.xmlOutputBuffer* buf, const char *string):
+    cdef const char *base
+    cdef const char *cur
+    cdef const char *entity
+    cdef const unsigned char *ucur
+    cdef unsigned char tmp[12]
+    cdef unsigned char *tmp_end
+    cdef int entity_len
+    cdef int val = 0
+    cdef int l
+
+    if string == NULL:
+        return
+
+    # ``base`` marks the start of the pending run of bytes that need no
+    # escaping, ``cur`` is the current scan position.
+    base = cur = string
+    while cur[0] != 0:
+        # Plain ``char`` may be signed, in which case a comparison such as
+        # ``cur[0] >= 0x80`` can never be true.  All range checks on the
+        # byte value therefore go through an unsigned view of the data.
+        ucur = <const unsigned char*> cur
+        entity = NULL
+        entity_len = 0
+
+        if cur[0] == '\n':
+            entity = "&#10;"
+            entity_len = 5
+        elif cur[0] == '\r':
+            entity = "&#13;"
+            entity_len = 5
+        elif cur[0] == '\t':
+            entity = "&#9;"
+            entity_len = 4
+        elif cur[0] == '"':
+            entity = "&quot;"
+            entity_len = 6
+        elif cur[0] == '<':
+            entity = "&lt;"
+            entity_len = 4
+        elif cur[0] == '>':
+            entity = "&gt;"
+            entity_len = 4
+        elif cur[0] == '&':
+            entity = "&amp;"
+            entity_len = 5
+        elif ucur[0] < 0x80 or ucur[1] == 0:
+            # Ordinary byte (or a lone high byte right before the
+            # terminator): keep it in the pending run.
+            cur += 1
+            continue
+
+        # Something needs escaping: flush the pending run first.
+        if base != cur:
+            tree.xmlOutputBufferWrite(buf, cur - base, base)
+
+        if entity != NULL:
+            tree.xmlOutputBufferWrite(buf, entity_len, entity)
+            cur += 1
+            base = cur
+            continue
+
+        # Decode one multi-byte UTF-8 sequence into ``val``; ``l`` is the
+        # number of bytes consumed, with 1 signalling an invalid sequence.
+        if ucur[0] < 0xC0:
+            # stray continuation byte
+            val = ucur[0]
+            l = 1
+        elif ucur[0] < 0xE0:
+            val = ((ucur[0] & 0x1F) << 6) | (ucur[1] & 0x3F)
+            l = 2
+        elif ucur[0] < 0xF0 and ucur[2] != 0:
+            val = (((ucur[0] & 0x0F) << 12) |
+                   ((ucur[1] & 0x3F) << 6) |
+                   (ucur[2] & 0x3F))
+            l = 3
+        elif ucur[0] < 0xF8 and ucur[2] != 0 and ucur[3] != 0:
+            val = (((ucur[0] & 0x07) << 18) |
+                   ((ucur[1] & 0x3F) << 12) |
+                   ((ucur[2] & 0x3F) << 6) |
+                   (ucur[3] & 0x3F))
+            l = 4
+        else:
+            # truncated sequence or invalid lead byte
+            val = ucur[0]
+            l = 1
+
+        if l == 1 or not tree.xmlIsCharQ(val):
+            raise ValueError("Invalid character: %X" % val)
+
+        # We could do multiple things here. Just save as a char ref.
+        # xmlSerializeHexCharRef() returns a pointer to the terminating
+        # NUL, which gives the exact length to write.  A negative length
+        # must not be passed: xmlOutputBufferWrite() would write nothing.
+        tmp_end = xmlSerializeHexCharRef(tmp, val)
+        tree.xmlOutputBufferWrite(buf, tmp_end - tmp, <const char*> tmp)
+        cur += l
+        base = cur
+
+    # Flush whatever is left of the final unescaped run.
+    if base != cur:
+        tree.xmlOutputBufferWrite(buf, cur - base, base)
+
+
############################################################
# output to file-like objects
@@ -897,7 +1099,8 @@ cdef class _IncrementalFileWriter:
tree.xmlOutputBufferWrite(self._c_out, 1, ' ')
self._write_qname(name, prefix)
tree.xmlOutputBufferWrite(self._c_out, 2, '="')
- tree.xmlOutputBufferWriteEscape(self._c_out, _xcstr(value), NULL)
+ _write_attr_string(self._c_out, _cstr(value))
+
tree.xmlOutputBufferWrite(self._c_out, 1, '"')
cdef _write_end_element(self, element_config):
@@ -969,14 +1172,12 @@ cdef class _IncrementalFileWriter:
raise LxmlSyntaxError("not in an element")
content = _utf8(content)
- if len(self._element_stack) > 0:
- ns, name, _, _ = self._element_stack[-1]
- else:
- ns, name = None, None
+ ns, name, _, _ = self._element_stack[-1]
+ if (c_method == OUTPUT_METHOD_HTML and
+ ns in (None, b'http://www.w3.org/1999/xhtml') and
+ name in (b'script', b'style')):
+ tree.xmlOutputBufferWrite(self._c_out, len(content), _cstr(content))
- if c_method == OUTPUT_METHOD_HTML and \
- ns in (None, 'http://www.w3.org/1999/xhtml') and name in ('script', 'style'):
- tree.xmlOutputBufferWrite(self._c_out, len(content), content)
else:
tree.xmlOutputBufferWriteEscape(self._c_out, _xcstr(content), NULL)
diff --git a/src/lxml/tests/common_imports.py b/src/lxml/tests/common_imports.py
index 85b1157d..4547d2d5 100644
--- a/src/lxml/tests/common_imports.py
+++ b/src/lxml/tests/common_imports.py
@@ -117,12 +117,15 @@ def _get_caller_relative_path(filename, frame_depth=2):
from io import StringIO
+unichr_escape = re.compile(r'\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8}')
+
if sys.version_info[0] >= 3:
# Python 3
from builtins import str as unicode
+ from codecs import unicode_escape_decode
_chr = chr
def _str(s, encoding="UTF-8"):
- return s
+ return unichr_escape.sub(lambda x: unicode_escape_decode(x.group(0))[0], s)
def _bytes(s, encoding="UTF-8"):
return s.encode(encoding)
from io import BytesIO as _BytesIO
@@ -144,8 +147,6 @@ if sys.version_info[0] >= 3:
doctests, {}, os.path.basename(filename), filename, 0))
else:
# Python 2
- unichr_escape = re.compile(r'\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8}')
-
from __builtin__ import unicode
_chr = unichr
def _str(s, encoding="UTF-8"):
diff --git a/src/lxml/tests/test_etree.py b/src/lxml/tests/test_etree.py
index 4ec59096..d1c79e05 100644
--- a/src/lxml/tests/test_etree.py
+++ b/src/lxml/tests/test_etree.py
@@ -20,6 +20,7 @@ import tempfile
import textwrap
import zlib
import gzip
+from contextlib import closing, contextmanager
from .common_imports import etree, StringIO, BytesIO, HelperTestCase
from .common_imports import fileInTestDir, fileUrlInTestDir, read_file, path2url
@@ -43,6 +44,16 @@ except NameError:
_unicode = str
+@contextmanager
+def tmpfile():
+    """Yield the path of a fresh temporary file; close and delete it on exit.
+
+    The descriptor returned by ``tempfile.mkstemp()`` stays open while the
+    ``with`` body runs and is closed right before the file is removed.
+    """
+    fd, path = tempfile.mkstemp()
+    try:
+        yield path
+    finally:
+        os.close(fd)
+        os.remove(path)
+
+
class ETreeOnlyTestCase(HelperTestCase):
"""Tests only for etree, not ElementTree"""
etree = etree
@@ -4062,39 +4073,25 @@ class ETreeC14NTestCase(HelperTestCase):
tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
f = BytesIO()
tree.write_c14n(f, compression=9)
- gzfile = gzip.GzipFile(fileobj=BytesIO(f.getvalue()))
- try:
+ with closing(gzip.GzipFile(fileobj=BytesIO(f.getvalue()))) as gzfile:
s = gzfile.read()
- finally:
- gzfile.close()
self.assertEqual(_bytes('<a>'+'<b></b>'*200+'</a>'),
s)
def test_c14n_file(self):
tree = self.parse(_bytes('<a><b/></a>'))
- handle, filename = tempfile.mkstemp()
- try:
+ with tmpfile() as filename:
tree.write_c14n(filename)
data = read_file(filename, 'rb')
- finally:
- os.close(handle)
- os.remove(filename)
self.assertEqual(_bytes('<a><b></b></a>'),
data)
def test_c14n_file_gzip(self):
tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
- handle, filename = tempfile.mkstemp()
- try:
+ with tmpfile() as filename:
tree.write_c14n(filename, compression=9)
- f = gzip.open(filename, 'rb')
- try:
+ with closing(gzip.open(filename, 'rb')) as f:
data = f.read()
- finally:
- f.close()
- finally:
- os.close(handle)
- os.remove(filename)
self.assertEqual(_bytes('<a>'+'<b></b>'*200+'</a>'),
data)
@@ -4225,18 +4222,32 @@ class ETreeWriteTestCase(HelperTestCase):
self.assertEqual(_bytes('<a><b/></a>'),
s)
+ def test_write_doctype(self):
+ # The string passed as ``doctype`` is expected verbatim at the very
+ # start of the output, followed by a newline and the serialised tree.
+ # 'HUHU' is deliberately not a real doctype declaration.
+ tree = self.parse(_bytes('<a><b/></a>'))
+ f = BytesIO()
+ tree.write(f, doctype='HUHU')
+ s = f.getvalue()
+ self.assertEqual(_bytes('HUHU\n<a><b/></a>'),
+ s)
+
def test_write_gzip(self):
tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
f = BytesIO()
tree.write(f, compression=9)
- gzfile = gzip.GzipFile(fileobj=BytesIO(f.getvalue()))
- try:
+ with closing(gzip.GzipFile(fileobj=BytesIO(f.getvalue()))) as gzfile:
s = gzfile.read()
- finally:
- gzfile.close()
self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'),
s)
+ def test_write_gzip_doctype(self):
+ # Same check as for the plain ``doctype`` option, but combined with
+ # ``compression=9``: the written bytes are read back through
+ # gzip.GzipFile and must start with the doctype string and a newline.
+ tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
+ f = BytesIO()
+ tree.write(f, compression=9, doctype='<!DOCTYPE a>')
+ with closing(gzip.GzipFile(fileobj=BytesIO(f.getvalue()))) as gzfile:
+ s = gzfile.read()
+ self.assertEqual(_bytes('<!DOCTYPE a>\n<a>'+'<b/>'*200+'</a>'),
+ s)
+
def test_write_gzip_level(self):
tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
f = BytesIO()
@@ -4251,21 +4262,15 @@ class ETreeWriteTestCase(HelperTestCase):
tree.write(f, compression=1)
s = f.getvalue()
self.assertTrue(len(s) <= len(s0))
- gzfile = gzip.GzipFile(fileobj=BytesIO(s))
- try:
+ with closing(gzip.GzipFile(fileobj=BytesIO(s))) as gzfile:
s1 = gzfile.read()
- finally:
- gzfile.close()
f = BytesIO()
tree.write(f, compression=9)
s = f.getvalue()
self.assertTrue(len(s) <= len(s0))
- gzfile = gzip.GzipFile(fileobj=BytesIO(s))
- try:
+ with closing(gzip.GzipFile(fileobj=BytesIO(s))) as gzfile:
s9 = gzfile.read()
- finally:
- gzfile.close()
self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'),
s0)
@@ -4276,57 +4281,39 @@ class ETreeWriteTestCase(HelperTestCase):
def test_write_file(self):
tree = self.parse(_bytes('<a><b/></a>'))
- handle, filename = tempfile.mkstemp()
- try:
+ with tmpfile() as filename:
tree.write(filename)
data = read_file(filename, 'rb')
- finally:
- os.close(handle)
- os.remove(filename)
self.assertEqual(_bytes('<a><b/></a>'),
data)
def test_write_file_gzip(self):
tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
- handle, filename = tempfile.mkstemp()
- try:
+ with tmpfile() as filename:
tree.write(filename, compression=9)
- f = gzip.open(filename, 'rb')
- try:
+ with closing(gzip.open(filename, 'rb')) as f:
data = f.read()
- finally:
- f.close()
- finally:
- os.close(handle)
- os.remove(filename)
self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'),
data)
def test_write_file_gzip_parse(self):
tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
- handle, filename = tempfile.mkstemp()
- try:
+ with tmpfile() as filename:
tree.write(filename, compression=9)
data = etree.tostring(etree.parse(filename))
- finally:
- os.close(handle)
- os.remove(filename)
self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'),
data)
def test_write_file_gzipfile_parse(self):
tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
- handle, filename = tempfile.mkstemp()
- try:
+ with tmpfile() as filename:
tree.write(filename, compression=9)
- data = etree.tostring(etree.parse(
- gzip.GzipFile(filename)))
- finally:
- os.close(handle)
- os.remove(filename)
+ with closing(gzip.GzipFile(filename)) as f:
+ data = etree.tostring(etree.parse(f))
self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'),
data)
+
class ETreeErrorLogTest(HelperTestCase):
etree = etree
@@ -4527,5 +4514,6 @@ def test_suite():
[make_doctest('../../../doc/resolvers.txt')])
return suite
+
if __name__ == '__main__':
print('to test use test.py %s' % __file__)
diff --git a/src/lxml/tests/test_incremental_xmlfile.py b/src/lxml/tests/test_incremental_xmlfile.py
index 81f49ac6..c2f162b2 100644
--- a/src/lxml/tests/test_incremental_xmlfile.py
+++ b/src/lxml/tests/test_incremental_xmlfile.py
@@ -2,11 +2,9 @@
"""
Tests for the incremental XML serialisation API.
-
-Tests require Python 2.5 or later.
"""
-from __future__ import with_statement
+from __future__ import with_statement, absolute_import
import unittest
import tempfile, os, sys
@@ -17,7 +15,8 @@ this_dir = os.path.dirname(__file__)
if this_dir not in sys.path:
sys.path.insert(0, this_dir) # needed for Py3
-from common_imports import etree, BytesIO, HelperTestCase, skipIf
+from .common_imports import etree, BytesIO, HelperTestCase, skipIf, _str
+
class _XmlFileTestCaseBase(HelperTestCase):
_file = None # to be set by specific subtypes below
@@ -454,6 +453,20 @@ class HtmlFileTestCase(_XmlFileTestCaseBase):
'</root>')
self._file = BytesIO()
+ def test_attribute_quoting(self):
+ # Double quotes inside an attribute value written through htmlfile()
+ # are expected to come out as &quot; in the serialised attribute.
+ with etree.htmlfile(self._file) as xf:
+ with xf.element("tagname", attrib={"attr": '"misquoted"'}):
+ xf.write("foo")
+
+ self.assertXml('<tagname attr="&quot;misquoted&quot;">foo</tagname>')
+
+ def test_attribute_quoting_unicode(self):
+ # Non-ASCII characters in an attribute value (one Latin-1 character,
+ # one BMP character, one character beyond the BMP) are expected to be
+ # written as numeric character references.  ``_str`` is used so that
+ # the \uXXXX / \UXXXXXXXX escapes in the literal become real characters.
+ # 246 == 0xF6, 13124 == 0x3344, 78660 == 0x13344.
+ # NOTE(review): the expected string uses decimal references while the
+ # serialiser helper added in this change writes hexadecimal ones;
+ # presumably assertXml() compares in a normalised form -- confirm.
+ with etree.htmlfile(self._file) as xf:
+ with xf.element("tagname", attrib={"attr": _str('"misquöted\\u3344\\U00013344"')}):
+ xf.write("foo")
+
+ self.assertXml('<tagname attr="&quot;misqu&#246;ted&#13124;&#78660;&quot;">foo</tagname>')
+
def test_unescaped_script(self):
with etree.htmlfile(self._file) as xf:
elt = etree.Element('script')