diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/lxml/includes/tree.pxd | 3 | ||||
-rw-r--r-- | src/lxml/lxml.etree.pyx | 23 | ||||
-rw-r--r-- | src/lxml/serializer.pxi | 217 | ||||
-rw-r--r-- | src/lxml/tests/common_imports.py | 7 | ||||
-rw-r--r-- | src/lxml/tests/test_etree.py | 100 | ||||
-rw-r--r-- | src/lxml/tests/test_incremental_xmlfile.py | 21 |
6 files changed, 296 insertions, 75 deletions
diff --git a/src/lxml/includes/tree.pxd b/src/lxml/includes/tree.pxd index b3bab52f..0d9d8843 100644 --- a/src/lxml/includes/tree.pxd +++ b/src/lxml/includes/tree.pxd @@ -61,6 +61,7 @@ cdef extern from "libxml/encoding.h": cdef extern from "libxml/chvalid.h": cdef int xmlIsChar_ch(char c) nogil + cdef int xmlIsCharQ(int ch) nogil cdef extern from "libxml/hash.h": ctypedef struct xmlHashTable @@ -357,6 +358,8 @@ cdef extern from "libxml/tree.h": cdef void xmlNodeDumpOutput(xmlOutputBuffer* buf, xmlDoc* doc, xmlNode* cur, int level, int format, const_char* encoding) nogil + cdef void xmlBufAttrSerializeTxtContent(xmlOutputBuffer *buf, xmlDoc *doc, + xmlAttr *attr, const_xmlChar *string) nogil cdef void xmlNodeSetName(xmlNode* cur, const_xmlChar* name) nogil cdef void xmlNodeSetContent(xmlNode* cur, const_xmlChar* content) nogil cdef xmlDtd* xmlCopyDtd(xmlDtd* dtd) nogil diff --git a/src/lxml/lxml.etree.pyx b/src/lxml/lxml.etree.pyx index 67ff69b2..c336cef2 100644 --- a/src/lxml/lxml.etree.pyx +++ b/src/lxml/lxml.etree.pyx @@ -1954,11 +1954,12 @@ cdef public class _ElementTree [ type LxmlElementTreeType, def write(self, file, *, encoding=None, method=u"xml", pretty_print=False, xml_declaration=None, with_tail=True, - standalone=None, docstring=None, compression=0, - exclusive=False, with_comments=True, inclusive_ns_prefixes=None): + standalone=None, doctype=None, compression=0, + exclusive=False, with_comments=True, inclusive_ns_prefixes=None, + docstring=None): u"""write(self, file, encoding=None, method="xml", pretty_print=False, xml_declaration=None, with_tail=True, - standalone=None, compression=0, + standalone=None, doctype=None, compression=0, exclusive=False, with_comments=True, inclusive_ns_prefixes=None) Write the tree to a filename, file or file-like object. @@ -1976,6 +1977,12 @@ cdef public class _ElementTree [ type LxmlElementTreeType, output an XML declaration with the corresponding ``standalone`` flag. 
+ The ``doctype`` option allows passing in a plain string that will + be serialised before the XML tree. Note that passing in non + well-formed content here will make the XML output non well-formed. + Also, an existing doctype in the document tree will not be removed + when serialising an ElementTree instance. + The ``compression`` option enables GZip compression level 1-9. The ``inclusive_ns_prefixes`` should be a list of namespace strings @@ -2030,7 +2037,15 @@ cdef public class _ElementTree [ type LxmlElementTreeType, else: write_declaration = 1 is_standalone = 0 - _tofilelike(file, self._context_node, encoding, docstring, method, + + if docstring is not None and doctype is None: + import warnings + warnings.warn( + "The 'docstring' option is deprecated. Use 'doctype' instead.", + DeprecationWarning) + doctype = docstring + + _tofilelike(file, self._context_node, encoding, doctype, method, write_declaration, 1, pretty_print, with_tail, is_standalone, compression) diff --git a/src/lxml/serializer.pxi b/src/lxml/serializer.pxi index 4b264a50..4ef53bc9 100644 --- a/src/lxml/serializer.pxi +++ b/src/lxml/serializer.pxi @@ -399,6 +399,208 @@ cdef void _writeNextSiblings(tree.xmlOutputBuffer* c_buffer, xmlNode* c_node, pretty_print, encoding) c_sibling = c_sibling.next + +# copied and adapted from libxml2 +cdef unsigned char *xmlSerializeHexCharRef(unsigned char *out, int val): + cdef xmlChar *ptr + cdef xmlChar c + + out[0] = '&' + out += 1 + + out[0] = '#' + out += 1 + + out[0] = 'x' + out += 1 + + if (val < 0x10): + ptr = out + elif (val < 0x100): + ptr = out + 1 + elif (val < 0x1000): + ptr = out + 2 + elif (val < 0x10000): + ptr = out + 3 + elif (val < 0x100000): + ptr = out + 4 + else: + ptr = out + 5 + + out = ptr + 1 + while val > 0: + c = (val & 0xF) + + if c == 0: + ptr[0] = '0' + elif c == 1: + ptr[0] = '1' + elif c == 2: + ptr[0] = '2' + elif c == 3: + ptr[0] = '3' + elif c == 4: + ptr[0] = '4' + elif c == 5: + ptr[0] = '5' + elif c == 6: + ptr[0] = '6' + 
elif c == 7: + ptr[0] = '7' + elif c == 8: + ptr[0] = '8' + elif c == 9: + ptr[0] = '9' + elif c == 0xA: + ptr[0] = 'A' + elif c == 0xB: + ptr[0] = 'B' + elif c == 0xC: + ptr[0] = 'C' + elif c == 0xD: + ptr[0] = 'D' + elif c == 0xE: + ptr[0] = 'E' + elif c == 0xF: + ptr[0] = 'F' + else: + ptr[0] = '0' + + ptr -= 1 + + val >>= 4 + + out[0] = ';' + out += 1 + out[0] = 0 + + return out + + +# copied and adapted from libxml2 (xmlBufAttrSerializeTxtContent()) +cdef _write_attr_string(tree.xmlOutputBuffer* buf, const char *string): + cdef const char *base + cdef const char *cur + + cdef unsigned char tmp[12] + cdef int val = 0 + cdef int l + + if string == NULL: + return + + base = cur = <const char*>string + while (cur[0] != 0): + if (cur[0] == '\n'): + if (base != cur): + tree.xmlOutputBufferWrite(buf, cur - base, base) + + tree.xmlOutputBufferWrite(buf, 5, "&#10;") + cur += 1 + base = cur + + elif (cur[0] == '\r'): + if (base != cur): + tree.xmlOutputBufferWrite(buf, cur - base, base) + + tree.xmlOutputBufferWrite(buf, 5, "&#13;") + cur += 1 + base = cur + + elif (cur[0] == '\t'): + if (base != cur): + tree.xmlOutputBufferWrite(buf, cur - base, base) + + tree.xmlOutputBufferWrite(buf, 4, "&#9;") + cur += 1 + base = cur + + elif (cur[0] == '"'): + if (base != cur): + tree.xmlOutputBufferWrite(buf, cur - base, base) + + tree.xmlOutputBufferWrite(buf, 6, "&quot;") + cur += 1 + base = cur + + elif (cur[0] == '<'): + if (base != cur): + tree.xmlOutputBufferWrite(buf, cur - base, base) + + tree.xmlOutputBufferWrite(buf, 4, "&lt;") + cur += 1 + base = cur + + elif (cur[0] == '>'): + if (base != cur): + tree.xmlOutputBufferWrite(buf, cur - base, base) + + tree.xmlOutputBufferWrite(buf, 4, "&gt;") + cur += 1 + base = cur + elif (cur[0] == '&'): + if (base != cur): + tree.xmlOutputBufferWrite(buf, cur - base, base) + + tree.xmlOutputBufferWrite(buf, 5, "&amp;") + cur += 1 + base = cur + + elif (cur[0] >= 0x80) and (cur[1] != 0): + + if (base != cur): + tree.xmlOutputBufferWrite(buf, cur - base, base) +
+ if (cur[0] < 0xC0): + # invalid UTF-8 sequence + val = cur[0] + l = 1 + + elif (cur[0] < 0xE0): + val = (cur[0]) & 0x1F + val <<= 6 + val |= (cur[1]) & 0x3F + l = 2 + + elif ((cur[0] < 0xF0) and (cur[2] != 0)): + val = (cur[0]) & 0x0F + val <<= 6 + val |= (cur[1]) & 0x3F + val <<= 6 + val |= (cur[2]) & 0x3F + l = 3 + + elif ((cur[0] < 0xF8) and (cur[2] != 0) and (cur[3] != 0)): + val = (cur[0]) & 0x07 + val <<= 6 + val |= (cur[1]) & 0x3F + val <<= 6 + val |= (cur[2]) & 0x3F + val <<= 6 + val |= (cur[3]) & 0x3F + l = 4 + else: + # invalid UTF-8 sequence + val = cur[0] + l = 1 + + if ((l == 1) or (not tree.xmlIsCharQ(val))): + raise ValueError("Invalid character: %X" % val) + + # We could do multiple things here. Just save + # as a char ref + xmlSerializeHexCharRef(tmp, val) + tree.xmlOutputBufferWrite(buf, -1, <const char*> tmp) + cur += l + base = cur + + else: + cur += 1 + + if (base != cur): + tree.xmlOutputBufferWrite(buf, cur - base, base) + + ############################################################ # output to file-like objects @@ -897,7 +1099,8 @@ cdef class _IncrementalFileWriter: tree.xmlOutputBufferWrite(self._c_out, 1, ' ') self._write_qname(name, prefix) tree.xmlOutputBufferWrite(self._c_out, 2, '="') - tree.xmlOutputBufferWriteEscape(self._c_out, _xcstr(value), NULL) + _write_attr_string(self._c_out, _cstr(value)) + tree.xmlOutputBufferWrite(self._c_out, 1, '"') cdef _write_end_element(self, element_config): @@ -969,14 +1172,12 @@ cdef class _IncrementalFileWriter: raise LxmlSyntaxError("not in an element") content = _utf8(content) - if len(self._element_stack) > 0: - ns, name, _, _ = self._element_stack[-1] - else: - ns, name = None, None + ns, name, _, _ = self._element_stack[-1] + if (c_method == OUTPUT_METHOD_HTML and + ns in (None, b'http://www.w3.org/1999/xhtml') and + name in (b'script', b'style')): + tree.xmlOutputBufferWrite(self._c_out, len(content), _cstr(content)) - if c_method == OUTPUT_METHOD_HTML and \ - ns in (None, 
'http://www.w3.org/1999/xhtml') and name in ('script', 'style'): - tree.xmlOutputBufferWrite(self._c_out, len(content), content) else: tree.xmlOutputBufferWriteEscape(self._c_out, _xcstr(content), NULL) diff --git a/src/lxml/tests/common_imports.py b/src/lxml/tests/common_imports.py index 85b1157d..4547d2d5 100644 --- a/src/lxml/tests/common_imports.py +++ b/src/lxml/tests/common_imports.py @@ -117,12 +117,15 @@ def _get_caller_relative_path(filename, frame_depth=2): from io import StringIO +unichr_escape = re.compile(r'\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8}') + if sys.version_info[0] >= 3: # Python 3 from builtins import str as unicode + from codecs import unicode_escape_decode _chr = chr def _str(s, encoding="UTF-8"): - return s + return unichr_escape.sub(lambda x: unicode_escape_decode(x.group(0))[0], s) def _bytes(s, encoding="UTF-8"): return s.encode(encoding) from io import BytesIO as _BytesIO @@ -144,8 +147,6 @@ if sys.version_info[0] >= 3: doctests, {}, os.path.basename(filename), filename, 0)) else: # Python 2 - unichr_escape = re.compile(r'\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8}') - from __builtin__ import unicode _chr = unichr def _str(s, encoding="UTF-8"): diff --git a/src/lxml/tests/test_etree.py b/src/lxml/tests/test_etree.py index 4ec59096..d1c79e05 100644 --- a/src/lxml/tests/test_etree.py +++ b/src/lxml/tests/test_etree.py @@ -20,6 +20,7 @@ import tempfile import textwrap import zlib import gzip +from contextlib import closing, contextmanager from .common_imports import etree, StringIO, BytesIO, HelperTestCase from .common_imports import fileInTestDir, fileUrlInTestDir, read_file, path2url @@ -43,6 +44,16 @@ except NameError: _unicode = str +@contextmanager +def tmpfile(): + handle, filename = tempfile.mkstemp() + try: + yield filename + finally: + os.close(handle) + os.remove(filename) + + class ETreeOnlyTestCase(HelperTestCase): """Tests only for etree, not ElementTree""" etree = etree @@ -4062,39 +4073,25 @@ class ETreeC14NTestCase(HelperTestCase): 
tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>')) f = BytesIO() tree.write_c14n(f, compression=9) - gzfile = gzip.GzipFile(fileobj=BytesIO(f.getvalue())) - try: + with closing(gzip.GzipFile(fileobj=BytesIO(f.getvalue()))) as gzfile: s = gzfile.read() - finally: - gzfile.close() self.assertEqual(_bytes('<a>'+'<b></b>'*200+'</a>'), s) def test_c14n_file(self): tree = self.parse(_bytes('<a><b/></a>')) - handle, filename = tempfile.mkstemp() - try: + with tmpfile() as filename: tree.write_c14n(filename) data = read_file(filename, 'rb') - finally: - os.close(handle) - os.remove(filename) self.assertEqual(_bytes('<a><b></b></a>'), data) def test_c14n_file_gzip(self): tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>')) - handle, filename = tempfile.mkstemp() - try: + with tmpfile() as filename: tree.write_c14n(filename, compression=9) - f = gzip.open(filename, 'rb') - try: + with closing(gzip.open(filename, 'rb')) as f: data = f.read() - finally: - f.close() - finally: - os.close(handle) - os.remove(filename) self.assertEqual(_bytes('<a>'+'<b></b>'*200+'</a>'), data) @@ -4225,18 +4222,32 @@ class ETreeWriteTestCase(HelperTestCase): self.assertEqual(_bytes('<a><b/></a>'), s) + def test_write_doctype(self): + tree = self.parse(_bytes('<a><b/></a>')) + f = BytesIO() + tree.write(f, doctype='HUHU') + s = f.getvalue() + self.assertEqual(_bytes('HUHU\n<a><b/></a>'), + s) + def test_write_gzip(self): tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>')) f = BytesIO() tree.write(f, compression=9) - gzfile = gzip.GzipFile(fileobj=BytesIO(f.getvalue())) - try: + with closing(gzip.GzipFile(fileobj=BytesIO(f.getvalue()))) as gzfile: s = gzfile.read() - finally: - gzfile.close() self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'), s) + def test_write_gzip_doctype(self): + tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>')) + f = BytesIO() + tree.write(f, compression=9, doctype='<!DOCTYPE a>') + with closing(gzip.GzipFile(fileobj=BytesIO(f.getvalue()))) as gzfile: + s = gzfile.read() + 
self.assertEqual(_bytes('<!DOCTYPE a>\n<a>'+'<b/>'*200+'</a>'), + s) + def test_write_gzip_level(self): tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>')) f = BytesIO() @@ -4251,21 +4262,15 @@ class ETreeWriteTestCase(HelperTestCase): tree.write(f, compression=1) s = f.getvalue() self.assertTrue(len(s) <= len(s0)) - gzfile = gzip.GzipFile(fileobj=BytesIO(s)) - try: + with closing(gzip.GzipFile(fileobj=BytesIO(s))) as gzfile: s1 = gzfile.read() - finally: - gzfile.close() f = BytesIO() tree.write(f, compression=9) s = f.getvalue() self.assertTrue(len(s) <= len(s0)) - gzfile = gzip.GzipFile(fileobj=BytesIO(s)) - try: + with closing(gzip.GzipFile(fileobj=BytesIO(s))) as gzfile: s9 = gzfile.read() - finally: - gzfile.close() self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'), s0) @@ -4276,57 +4281,39 @@ class ETreeWriteTestCase(HelperTestCase): def test_write_file(self): tree = self.parse(_bytes('<a><b/></a>')) - handle, filename = tempfile.mkstemp() - try: + with tmpfile() as filename: tree.write(filename) data = read_file(filename, 'rb') - finally: - os.close(handle) - os.remove(filename) self.assertEqual(_bytes('<a><b/></a>'), data) def test_write_file_gzip(self): tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>')) - handle, filename = tempfile.mkstemp() - try: + with tmpfile() as filename: tree.write(filename, compression=9) - f = gzip.open(filename, 'rb') - try: + with closing(gzip.open(filename, 'rb')) as f: data = f.read() - finally: - f.close() - finally: - os.close(handle) - os.remove(filename) self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'), data) def test_write_file_gzip_parse(self): tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>')) - handle, filename = tempfile.mkstemp() - try: + with tmpfile() as filename: tree.write(filename, compression=9) data = etree.tostring(etree.parse(filename)) - finally: - os.close(handle) - os.remove(filename) self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'), data) def test_write_file_gzipfile_parse(self): tree = 
self.parse(_bytes('<a>'+'<b/>'*200+'</a>')) - handle, filename = tempfile.mkstemp() - try: + with tmpfile() as filename: tree.write(filename, compression=9) - data = etree.tostring(etree.parse( - gzip.GzipFile(filename))) - finally: - os.close(handle) - os.remove(filename) + with closing(gzip.GzipFile(filename)) as f: + data = etree.tostring(etree.parse(f)) self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'), data) + class ETreeErrorLogTest(HelperTestCase): etree = etree @@ -4527,5 +4514,6 @@ def test_suite(): [make_doctest('../../../doc/resolvers.txt')]) return suite + if __name__ == '__main__': print('to test use test.py %s' % __file__) diff --git a/src/lxml/tests/test_incremental_xmlfile.py b/src/lxml/tests/test_incremental_xmlfile.py index 81f49ac6..c2f162b2 100644 --- a/src/lxml/tests/test_incremental_xmlfile.py +++ b/src/lxml/tests/test_incremental_xmlfile.py @@ -2,11 +2,9 @@ """ Tests for the incremental XML serialisation API. - -Tests require Python 2.5 or later. """ -from __future__ import with_statement +from __future__ import with_statement, absolute_import import unittest import tempfile, os, sys @@ -17,7 +15,8 @@ this_dir = os.path.dirname(__file__) if this_dir not in sys.path: sys.path.insert(0, this_dir) # needed for Py3 -from common_imports import etree, BytesIO, HelperTestCase, skipIf +from .common_imports import etree, BytesIO, HelperTestCase, skipIf, _str + class _XmlFileTestCaseBase(HelperTestCase): _file = None # to be set by specific subtypes below @@ -454,6 +453,20 @@ class HtmlFileTestCase(_XmlFileTestCaseBase): '</root>') self._file = BytesIO() + def test_attribute_quoting(self): + with etree.htmlfile(self._file) as xf: + with xf.element("tagname", attrib={"attr": '"misquoted"'}): + xf.write("foo") + + self.assertXml('<tagname attr="&quot;misquoted&quot;">foo</tagname>') + + def test_attribute_quoting_unicode(self): + with etree.htmlfile(self._file) as xf: + with xf.element("tagname", attrib={"attr": _str('"misquöted\\u3344\\U00013344"')}): +
xf.write("foo") + + self.assertXml('<tagname attr="&quot;misqu&#xF6;ted&#x3344;&#x13344;&quot;">foo</tagname>') + def test_unescaped_script(self): with etree.htmlfile(self._file) as xf: elt = etree.Element('script') |