6 files changed, 296 insertions, 75 deletions
diff --git a/src/lxml/includes/tree.pxd b/src/lxml/includes/tree.pxd
index b3bab52f..0d9d8843 100644
--- a/src/lxml/includes/tree.pxd
+++ b/src/lxml/includes/tree.pxd
@@ -61,6 +61,7 @@ cdef extern from "libxml/encoding.h":
 
 cdef extern from "libxml/chvalid.h":
     cdef int xmlIsChar_ch(char c) nogil
+    cdef int xmlIsCharQ(int ch) nogil
 
 cdef extern from "libxml/hash.h":
     ctypedef struct xmlHashTable
@@ -357,6 +358,8 @@ cdef extern from "libxml/tree.h":
     cdef void xmlNodeDumpOutput(xmlOutputBuffer* buf,
                                 xmlDoc* doc, xmlNode* cur, int level,
                                 int format, const_char* encoding) nogil
+    cdef void xmlBufAttrSerializeTxtContent(xmlOutputBuffer *buf, xmlDoc *doc,
+                                xmlAttr *attr, const_xmlChar *string) nogil
     cdef void xmlNodeSetName(xmlNode* cur, const_xmlChar* name) nogil
     cdef void xmlNodeSetContent(xmlNode* cur, const_xmlChar* content) nogil
     cdef xmlDtd* xmlCopyDtd(xmlDtd* dtd) nogil
diff --git a/src/lxml/lxml.etree.pyx b/src/lxml/lxml.etree.pyx
index 67ff69b2..c336cef2 100644
--- a/src/lxml/lxml.etree.pyx
+++ b/src/lxml/lxml.etree.pyx
@@ -1954,11 +1954,12 @@ cdef public class _ElementTree [ type LxmlElementTreeType,
 
     def write(self, file, *, encoding=None, method=u"xml",
               pretty_print=False, xml_declaration=None, with_tail=True,
-              standalone=None, docstring=None, compression=0,
-              exclusive=False, with_comments=True, inclusive_ns_prefixes=None):
+              standalone=None, doctype=None, compression=0,
+              exclusive=False, with_comments=True, inclusive_ns_prefixes=None,
+              docstring=None):
         u"""write(self, file, encoding=None, method="xml",
                   pretty_print=False, xml_declaration=None, with_tail=True,
-                  standalone=None, compression=0,
+                  standalone=None, doctype=None, compression=0,
                   exclusive=False, with_comments=True, inclusive_ns_prefixes=None)
 
         Write the tree to a filename, file or file-like object.
@@ -1976,6 +1977,12 @@ cdef public class _ElementTree [ type LxmlElementTreeType,
         output an XML declaration with the corresponding
         ``standalone`` flag.
 
+        The ``doctype`` option allows passing in a plain string that will
+        be serialised before the XML tree.  Note that passing in non
+        well-formed content here will make the XML output non well-formed.
+        Also, an existing doctype in the document tree will not be removed
+        when serialising an ElementTree instance.
+
         The ``compression`` option enables GZip compression level 1-9.
 
         The ``inclusive_ns_prefixes`` should be a list of namespace strings
@@ -2030,7 +2037,15 @@ cdef public class _ElementTree [ type LxmlElementTreeType,
         else:
             write_declaration = 1
             is_standalone = 0
-        _tofilelike(file, self._context_node, encoding, docstring, method,
+
+        if docstring is not None and doctype is None:
+            import warnings
+            warnings.warn(
+                "The 'docstring' option is deprecated. Use 'doctype' instead.",
+                DeprecationWarning)
+            doctype = docstring
+
+        _tofilelike(file, self._context_node, encoding, doctype, method,
                     write_declaration, 1, pretty_print, with_tail,
                     is_standalone, compression)
 
diff --git a/src/lxml/serializer.pxi b/src/lxml/serializer.pxi
index 4b264a50..4ef53bc9 100644
--- a/src/lxml/serializer.pxi
+++ b/src/lxml/serializer.pxi
@@ -399,6 +399,208 @@ cdef void _writeNextSiblings(tree.xmlOutputBuffer* c_buffer, xmlNode* c_node,
                                pretty_print, encoding)
         c_sibling = c_sibling.next
 
+
+# copied and adapted from libxml2
+cdef unsigned char *xmlSerializeHexCharRef(unsigned char *out, int val):
+    # Write 'val' to 'out' as a hexadecimal character reference of the
+    # form "&#xH..H;", followed by a terminating NUL byte.
+    #
+    # Adapted from libxml2.
+    #
+    # The digits are upper case and have no leading zeros, e.g.
+    #
+    #     0x9     ->  "&#x9;"
+    #     0xF6    ->  "&#xF6;"
+    #     0x3344  ->  "&#x3344;"
+    #     0x13344 ->  "&#x13344;"
+    #
+    # A 'val' of zero or less is written as "&#x0;".
+    #
+    # 'out' must provide room for "&#x", the digits, ";" and the NUL
+    # byte.  At most six digit slots are laid out, so values of 0x1000000
+    # and above are not supported and must be rejected by the caller.
+    #
+    # Returns a pointer to the terminating NUL byte, so the difference
+    # between the return value and the original 'out' is the length of
+    # the reference in bytes.
+
+    cdef xmlChar *ptr
+    cdef const char *hex_digits = "0123456789ABCDEF"
+
+    # fixed prefix of every hexadecimal reference
+    out[0] = '&'
+    out += 1
+
+    out[0] = '#'
+    out += 1
+
+    out[0] = 'x'
+    out += 1
+
+    # The digits are produced starting with the least significant one,
+    # so locate the slot of the last digit first and fill the field from
+    # right to left.
+    if (val < 0x10):
+        ptr = out
+    elif (val < 0x100):
+        ptr = out + 1
+    elif (val < 0x1000):
+        ptr = out + 2
+    elif (val < 0x10000):
+        ptr = out + 3
+    elif (val < 0x100000):
+        ptr = out + 4
+    else:
+        ptr = out + 5
+
+    # 'out' now marks the position of the closing ';'
+    out = ptr + 1
+
+    if val <= 0:
+        # The loop below writes no digit for these values, which would
+        # leave the digit slot uninitialised.
+        ptr[0] = '0'
+
+    while val > 0:
+        # The low four bits select the digit.  The lookup table replaces
+        # a sixteen-way branch.
+        ptr[0] = hex_digits[val & 0xF]
+        # step to the slot of the next more significant digit
+        ptr -= 1
+        val >>= 4
+
+    # close the reference and terminate the string
+    out[0] = ';'
+    out += 1
+    out[0] = 0
+
+    return out
+
+
+# copied and adapted from libxml2 (xmlBufAttrSerializeTxtContent())
+cdef _write_attr_string(tree.xmlOutputBuffer* buf, const char *string):
+    # Write the NUL-terminated byte string 'string' to 'buf', escaped for
+    # use inside a double-quoted attribute value.  NULL writes nothing.
+    #
+    # Newline, carriage return, tab, '"', '<', '>' and '&' are replaced
+    # by character or entity references.  Bytes that need no escaping
+    # are collected between 'base' and 'cur' and written in one call.
+    #
+    # Raises ValueError from the multi-byte branch if a sequence cannot
+    # be decoded or the decoded value is rejected by xmlIsCharQ().
+    cdef const char *base
+    cdef const char *cur
+    cdef unsigned char tmp[12]
+    cdef unsigned char *tmp_end
+    cdef int val = 0
+    cdef int l
+
+    if string == NULL:
+        return
+
+    base = cur = <const char*>string
+    while (cur[0] != 0):
+        if (cur[0] == '\n'):
+            # flush the pending run, then write the replacement
+            if (base != cur):
+                tree.xmlOutputBufferWrite(buf, cur - base, base)
+            tree.xmlOutputBufferWrite(buf, 5, "&#10;")
+            cur += 1
+            base = cur
+        elif (cur[0] == '\r'):
+            if (base != cur):
+                tree.xmlOutputBufferWrite(buf, cur - base, base)
+            tree.xmlOutputBufferWrite(buf, 5, "&#13;")
+            cur += 1
+            base = cur
+        elif (cur[0] == '\t'):
+            if (base != cur):
+                tree.xmlOutputBufferWrite(buf, cur - base, base)
+            tree.xmlOutputBufferWrite(buf, 4, "&#9;")
+            cur += 1
+            base = cur
+        elif (cur[0] == '"'):
+            if (base != cur):
+                tree.xmlOutputBufferWrite(buf, cur - base, base)
+            tree.xmlOutputBufferWrite(buf, 6, "&quot;")
+            cur += 1
+            base = cur
+        elif (cur[0] == '<'):
+            if (base != cur):
+                tree.xmlOutputBufferWrite(buf, cur - base, base)
+            tree.xmlOutputBufferWrite(buf, 4, "&lt;")
+            cur += 1
+            base = cur
+        elif (cur[0] == '>'):
+            if (base != cur):
+                tree.xmlOutputBufferWrite(buf, cur - base, base)
+            tree.xmlOutputBufferWrite(buf, 4, "&gt;")
+            cur += 1
+            base = cur
+        elif (cur[0] == '&'):
+            if (base != cur):
+                tree.xmlOutputBufferWrite(buf, cur - base, base)
+            tree.xmlOutputBufferWrite(buf, 5, "&amp;")
+            cur += 1
+            base = cur
+
+        elif (cur[0] >= 0x80) and (cur[1] != 0):
+            # NOTE(review): 'cur' is a plain 'char*'.  Where C 'char' is
+            # signed, bytes above 0x7F compare as negative, so this test
+            # is false and they are copied through by the final 'else'.
+            # Confirm the intended behaviour before making it unsigned.
+            if (base != cur):
+                tree.xmlOutputBufferWrite(buf, cur - base, base)
+
+            if (cur[0] < 0xC0):
+                # invalid UTF-8 sequence
+                val = cur[0]
+                l = 1
+            elif (cur[0] < 0xE0):
+                val = (cur[0]) & 0x1F
+                val <<= 6
+                val |= (cur[1]) & 0x3F
+                l = 2
+            elif ((cur[0] < 0xF0) and (cur[2] != 0)):
+                val = (cur[0]) & 0x0F
+                val <<= 6
+                val |= (cur[1]) & 0x3F
+                val <<= 6
+                val |= (cur[2]) & 0x3F
+                l = 3
+            elif ((cur[0] < 0xF8) and (cur[2] != 0) and (cur[3] != 0)):
+                val = (cur[0]) & 0x07
+                val <<= 6
+                val |= (cur[1]) & 0x3F
+                val <<= 6
+                val |= (cur[2]) & 0x3F
+                val <<= 6
+                val |= (cur[3]) & 0x3F
+                l = 4
+            else:
+                # invalid UTF-8 sequence
+                val = cur[0]
+                l = 1
+
+            if ((l == 1) or (not tree.xmlIsCharQ(val))):
+                raise ValueError("Invalid character: %X" % val)
+
+            # We could do multiple things here. Just save
+            # as a char ref
+            tmp_end = xmlSerializeHexCharRef(tmp, val)
+            # The length is a byte count here; unlike xmlBufAdd() in the
+            # C original, a negative value does not mean "up to the NUL".
+            tree.xmlOutputBufferWrite(buf, tmp_end - tmp, <const char*> tmp)
+            cur += l
+            base = cur
+
+        else:
+            cur += 1
+
+    if (base != cur):
+        tree.xmlOutputBufferWrite(buf, cur - base, base)
+
+
 ############################################################
 # output to file-like objects
 
@@ -897,7 +1099,8 @@ cdef class _IncrementalFileWriter:
             tree.xmlOutputBufferWrite(self._c_out, 1, ' ')
             self._write_qname(name, prefix)
             tree.xmlOutputBufferWrite(self._c_out, 2, '="')
-            tree.xmlOutputBufferWriteEscape(self._c_out, _xcstr(value), NULL)
+            _write_attr_string(self._c_out, _cstr(value))
+
             tree.xmlOutputBufferWrite(self._c_out, 1, '"')
 
     cdef _write_end_element(self, element_config):
@@ -969,14 +1172,12 @@ cdef class _IncrementalFileWriter:
                         raise LxmlSyntaxError("not in an element")
                 content = _utf8(content)
 
-                if len(self._element_stack) > 0:
-                    ns, name, _, _ = self._element_stack[-1]
-                else:
-                    ns, name = None, None
+                ns, name, _, _ = self._element_stack[-1]
+                if (c_method == OUTPUT_METHOD_HTML and
+                        ns in (None, b'http://www.w3.org/1999/xhtml') and
+                        name in (b'script', b'style')):
+                    tree.xmlOutputBufferWrite(self._c_out, len(content), _cstr(content))
 
-                if c_method == OUTPUT_METHOD_HTML and \
-                        ns in (None, 'http://www.w3.org/1999/xhtml') and name in ('script', 'style'):
-                    tree.xmlOutputBufferWrite(self._c_out, len(content), content)
                 else:
                     tree.xmlOutputBufferWriteEscape(self._c_out, _xcstr(content), NULL)
 
diff --git a/src/lxml/tests/common_imports.py b/src/lxml/tests/common_imports.py
index 85b1157d..4547d2d5 100644
--- a/src/lxml/tests/common_imports.py
+++ b/src/lxml/tests/common_imports.py
@@ -117,12 +117,15 @@ def _get_caller_relative_path(filename, frame_depth=2):
 
 from io import StringIO
 
+unichr_escape = re.compile(r'\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8}')
+
 if sys.version_info[0] >= 3:
     # Python 3
     from builtins import str as unicode
+    from codecs import unicode_escape_decode
     _chr = chr
     def _str(s, encoding="UTF-8"):
-        return s
+        return unichr_escape.sub(lambda x: unicode_escape_decode(x.group(0))[0], s)
     def _bytes(s, encoding="UTF-8"):
         return s.encode(encoding)
     from io import BytesIO as _BytesIO
@@ -144,8 +147,6 @@ if sys.version_info[0] >= 3:
                 doctests, {}, os.path.basename(filename), filename, 0))
 else:
     # Python 2
-    unichr_escape = re.compile(r'\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8}')
-
     from __builtin__ import unicode
     _chr = unichr
     def _str(s, encoding="UTF-8"):
diff --git a/src/lxml/tests/test_etree.py b/src/lxml/tests/test_etree.py
index 4ec59096..d1c79e05 100644
--- a/src/lxml/tests/test_etree.py
+++ b/src/lxml/tests/test_etree.py
@@ -20,6 +20,7 @@ import tempfile
 import textwrap
 import zlib
 import gzip
+from contextlib import closing, contextmanager
 
 from .common_imports import etree, StringIO, BytesIO, HelperTestCase
 from .common_imports import fileInTestDir, fileUrlInTestDir, read_file, path2url
@@ -43,6 +44,16 @@ except NameError:
     _unicode = str
 
 
+@contextmanager
+def tmpfile():  # yields the path of a new temporary file, removed on exit
+    fd, path = tempfile.mkstemp()
+    try:
+        yield path
+    finally:
+        os.close(fd)
+        os.remove(path)
+
+
 class ETreeOnlyTestCase(HelperTestCase):
     """Tests only for etree, not ElementTree"""
     etree = etree
@@ -4062,39 +4073,25 @@ class ETreeC14NTestCase(HelperTestCase):
         tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
         f = BytesIO()
         tree.write_c14n(f, compression=9)
-        gzfile = gzip.GzipFile(fileobj=BytesIO(f.getvalue()))
-        try:
+        with closing(gzip.GzipFile(fileobj=BytesIO(f.getvalue()))) as gzfile:
             s = gzfile.read()
-        finally:
-            gzfile.close()
         self.assertEqual(_bytes('<a>'+'<b></b>'*200+'</a>'),
                           s)
 
     def test_c14n_file(self):
         tree = self.parse(_bytes('<a><b/></a>'))
-        handle, filename = tempfile.mkstemp()
-        try:
+        with tmpfile() as filename:
             tree.write_c14n(filename)
             data = read_file(filename, 'rb')
-        finally:
-            os.close(handle)
-            os.remove(filename)
         self.assertEqual(_bytes('<a><b></b></a>'),
                           data)
 
     def test_c14n_file_gzip(self):
         tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
-        handle, filename = tempfile.mkstemp()
-        try:
+        with tmpfile() as filename:
             tree.write_c14n(filename, compression=9)
-            f = gzip.open(filename, 'rb')
-            try:
+            with closing(gzip.open(filename, 'rb')) as f:
                 data = f.read()
-            finally:
-                f.close()
-        finally:
-            os.close(handle)
-            os.remove(filename)
         self.assertEqual(_bytes('<a>'+'<b></b>'*200+'</a>'),
                           data)
 
@@ -4225,18 +4222,32 @@ class ETreeWriteTestCase(HelperTestCase):
         self.assertEqual(_bytes('<a><b/></a>'),
                           s)
 
+    def test_write_doctype(self):
+        # the doctype string is written first, followed by a newline
+        doc = self.parse(_bytes('<a><b/></a>'))
+        out = BytesIO()
+        doc.write(out, doctype='HUHU')
+        expected = _bytes('HUHU\n<a><b/></a>')
+        self.assertEqual(expected, out.getvalue())
+
     def test_write_gzip(self):
         tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
         f = BytesIO()
         tree.write(f, compression=9)
-        gzfile = gzip.GzipFile(fileobj=BytesIO(f.getvalue()))
-        try:
+        with closing(gzip.GzipFile(fileobj=BytesIO(f.getvalue()))) as gzfile:
             s = gzfile.read()
-        finally:
-            gzfile.close()
         self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'),
                           s)
 
+    def test_write_gzip_doctype(self):
+        # the doctype string must be part of the compressed stream
+        doc = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
+        out = BytesIO()
+        doc.write(out, compression=9, doctype='<!DOCTYPE a>')
+        with closing(gzip.GzipFile(fileobj=BytesIO(out.getvalue()))) as gz:
+            unpacked = gz.read()
+        self.assertEqual(_bytes('<!DOCTYPE a>\n<a>'+'<b/>'*200+'</a>'), unpacked)
+
     def test_write_gzip_level(self):
         tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
         f = BytesIO()
@@ -4251,21 +4262,15 @@ class ETreeWriteTestCase(HelperTestCase):
         tree.write(f, compression=1)
         s = f.getvalue()
         self.assertTrue(len(s) <= len(s0))
-        gzfile = gzip.GzipFile(fileobj=BytesIO(s))
-        try:
+        with closing(gzip.GzipFile(fileobj=BytesIO(s))) as gzfile:
             s1 = gzfile.read()
-        finally:
-            gzfile.close()
 
         f = BytesIO()
         tree.write(f, compression=9)
         s = f.getvalue()
         self.assertTrue(len(s) <= len(s0))
-        gzfile = gzip.GzipFile(fileobj=BytesIO(s))
-        try:
+        with closing(gzip.GzipFile(fileobj=BytesIO(s))) as gzfile:
             s9 = gzfile.read()
-        finally:
-            gzfile.close()
 
         self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'),
                           s0)
@@ -4276,57 +4281,39 @@ class ETreeWriteTestCase(HelperTestCase):
 
     def test_write_file(self):
         tree = self.parse(_bytes('<a><b/></a>'))
-        handle, filename = tempfile.mkstemp()
-        try:
+        with tmpfile() as filename:
             tree.write(filename)
             data = read_file(filename, 'rb')
-        finally:
-            os.close(handle)
-            os.remove(filename)
         self.assertEqual(_bytes('<a><b/></a>'),
                           data)
 
     def test_write_file_gzip(self):
         tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
-        handle, filename = tempfile.mkstemp()
-        try:
+        with tmpfile() as filename:
             tree.write(filename, compression=9)
-            f = gzip.open(filename, 'rb')
-            try:
+            with closing(gzip.open(filename, 'rb')) as f:
                 data = f.read()
-            finally:
-                f.close()
-        finally:
-            os.close(handle)
-            os.remove(filename)
         self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'),
                           data)
 
     def test_write_file_gzip_parse(self):
         tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
-        handle, filename = tempfile.mkstemp()
-        try:
+        with tmpfile() as filename:
             tree.write(filename, compression=9)
             data = etree.tostring(etree.parse(filename))
-        finally:
-            os.close(handle)
-            os.remove(filename)
         self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'),
                           data)
 
     def test_write_file_gzipfile_parse(self):
         tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
-        handle, filename = tempfile.mkstemp()
-        try:
+        with tmpfile() as filename:
             tree.write(filename, compression=9)
-            data = etree.tostring(etree.parse(
-                gzip.GzipFile(filename)))
-        finally:
-            os.close(handle)
-            os.remove(filename)
+            with closing(gzip.GzipFile(filename)) as f:
+                data = etree.tostring(etree.parse(f))
         self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'),
                           data)
 
+
 class ETreeErrorLogTest(HelperTestCase):
     etree = etree
 
@@ -4527,5 +4514,6 @@ def test_suite():
         [make_doctest('../../../doc/resolvers.txt')])
     return suite
 
+
 if __name__ == '__main__':
     print('to test use test.py %s' % __file__)
diff --git a/src/lxml/tests/test_incremental_xmlfile.py b/src/lxml/tests/test_incremental_xmlfile.py
index 81f49ac6..c2f162b2 100644
--- a/src/lxml/tests/test_incremental_xmlfile.py
+++ b/src/lxml/tests/test_incremental_xmlfile.py
@@ -2,11 +2,9 @@
 
 """
 Tests for the incremental XML serialisation API.
-
-Tests require Python 2.5 or later.
 """
 
-from __future__ import with_statement
+from __future__ import with_statement, absolute_import
 
 import unittest
 import tempfile, os, sys
@@ -17,7 +15,8 @@ this_dir = os.path.dirname(__file__)
 if this_dir not in sys.path:
     sys.path.insert(0, this_dir) # needed for Py3
 
-from common_imports import etree, BytesIO, HelperTestCase, skipIf
+from .common_imports import etree, BytesIO, HelperTestCase, skipIf, _str
+
 
 class _XmlFileTestCaseBase(HelperTestCase):
     _file = None  # to be set by specific subtypes below
@@ -454,6 +453,20 @@ class HtmlFileTestCase(_XmlFileTestCaseBase):
             '</root>')
         self._file = BytesIO()
 
+    def test_attribute_quoting(self):
+        # double quotes inside an attribute value must come out as &quot;
+        with etree.htmlfile(self._file) as writer:
+            with writer.element("tagname", attrib={"attr": '"misquoted"'}):
+                writer.write("foo")
+        self.assertXml('<tagname attr="&quot;misquoted&quot;">foo</tagname>')
+
+    def test_attribute_quoting_unicode(self):
+        # non-ASCII characters in the value are expected as character references
+        with etree.htmlfile(self._file) as writer:
+            with writer.element("tagname", attrib={"attr": _str('"misquöted\\u3344\\U00013344"')}):
+                writer.write("foo")
+        self.assertXml('<tagname attr="&quot;misqu&#246;ted&#13124;&#78660;&quot;">foo</tagname>')
+
     def test_unescaped_script(self):
         with etree.htmlfile(self._file) as xf:
             elt = etree.Element('script')