summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Stefan Behnel <stefan_ml@behnel.de> 2017-01-08 15:22:45 +0100
committer: Stefan Behnel <stefan_ml@behnel.de> 2017-01-08 15:22:45 +0100
commit: 765f0399740169128fcad89685f25e607bf1f1cc (patch)
tree: d088b04a68ff13fe1a94322c885e1720e4db1b20
parent: 1f0fbb14dcd7e4fd5388be482280d1c7dfd7e5fa (diff)
parent: 2256f345a0e6b5168cc66b6c97288f4846dfdb29 (diff)
download: python-lxml-765f0399740169128fcad89685f25e607bf1f1cc.tar.gz
integrate recent master changes into lxml-3.7 branch
-rw-r--r-- CHANGES.txt 13
-rw-r--r-- src/lxml/includes/tree.pxd 3
-rw-r--r-- src/lxml/serializer.pxi 214
-rw-r--r-- src/lxml/tests/test_incremental_xmlfile.py 31
4 files changed, 258 insertions, 3 deletions
diff --git a/CHANGES.txt b/CHANGES.txt
index 0c9d59fd..9d8d0edb 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -2,6 +2,19 @@
lxml changelog
==============
+Latest
+==================
+
+Bugs fixed
+----------
+
+* GH#219: ``xmlfile.element()`` was not properly quoting attribute values.
+ Patch by Burak Arslan.
+
+* GH#218: ``xmlfile.element()`` was not properly escaping text content of
+ script/style tags. Patch by Burak Arslan.
+
+
3.7.1 (2016-12-23)
==================
diff --git a/src/lxml/includes/tree.pxd b/src/lxml/includes/tree.pxd
index b3bab52f..0d9d8843 100644
--- a/src/lxml/includes/tree.pxd
+++ b/src/lxml/includes/tree.pxd
@@ -61,6 +61,7 @@ cdef extern from "libxml/encoding.h":
cdef extern from "libxml/chvalid.h":
cdef int xmlIsChar_ch(char c) nogil
+ cdef int xmlIsCharQ(int ch) nogil
cdef extern from "libxml/hash.h":
ctypedef struct xmlHashTable
@@ -357,6 +358,8 @@ cdef extern from "libxml/tree.h":
cdef void xmlNodeDumpOutput(xmlOutputBuffer* buf,
xmlDoc* doc, xmlNode* cur, int level,
int format, const_char* encoding) nogil
+ cdef void xmlBufAttrSerializeTxtContent(xmlOutputBuffer *buf, xmlDoc *doc,
+ xmlAttr *attr, const_xmlChar *string) nogil
cdef void xmlNodeSetName(xmlNode* cur, const_xmlChar* name) nogil
cdef void xmlNodeSetContent(xmlNode* cur, const_xmlChar* content) nogil
cdef xmlDtd* xmlCopyDtd(xmlDtd* dtd) nogil
diff --git a/src/lxml/serializer.pxi b/src/lxml/serializer.pxi
index a3d22365..0f93bb09 100644
--- a/src/lxml/serializer.pxi
+++ b/src/lxml/serializer.pxi
@@ -399,6 +399,208 @@ cdef void _writeNextSiblings(tree.xmlOutputBuffer* c_buffer, xmlNode* c_node,
pretty_print, encoding)
c_sibling = c_sibling.next
+
+# copied and adapted from libxml2
+cdef unsigned char *xmlSerializeHexCharRef(unsigned char *out, int val):
+ # Write the hexadecimal character reference "&#x<HEX>;" for `val` into
+ # `out`, NUL-terminate it and return a pointer to that terminating NUL
+ # (so "returned pointer - original out" is the length of the text).
+ # The caller must supply a large enough buffer: 3 prefix bytes, up to
+ # 6 hex digits, ';' and the NUL byte.
+ cdef xmlChar *ptr
+ cdef xmlChar c
+
+ out[0] = '&'
+ out += 1
+
+ out[0] = '#'
+ out += 1
+
+ out[0] = 'x'
+ out += 1
+
+ # Choose the position of the last (least significant) digit from the
+ # magnitude of `val`; the digits are filled in backwards from there.
+ if (val < 0x10):
+ ptr = out
+ elif (val < 0x100):
+ ptr = out + 1
+ elif (val < 0x1000):
+ ptr = out + 2
+ elif (val < 0x10000):
+ ptr = out + 3
+ elif (val < 0x100000):
+ ptr = out + 4
+ else:
+ ptr = out + 5
+
+ # `out` now points just behind the last digit, where ';' will be stored.
+ out = ptr + 1
+ # NOTE(review): for val <= 0 this loop does not run, so no digit byte is
+ # stored between "&#x" and ";".
+ while val > 0:
+ c = (val & 0xF)
+
+ if c == 0:
+ ptr[0] = '0'
+ elif c == 1:
+ ptr[0] = '1'
+ elif c == 2:
+ ptr[0] = '2'
+ elif c == 3:
+ ptr[0] = '3'
+ elif c == 4:
+ ptr[0] = '4'
+ elif c == 5:
+ ptr[0] = '5'
+ elif c == 6:
+ ptr[0] = '6'
+ elif c == 7:
+ ptr[0] = '7'
+ elif c == 8:
+ ptr[0] = '8'
+ elif c == 9:
+ ptr[0] = '9'
+ elif c == 0xA:
+ ptr[0] = 'A'
+ elif c == 0xB:
+ ptr[0] = 'B'
+ elif c == 0xC:
+ ptr[0] = 'C'
+ elif c == 0xD:
+ ptr[0] = 'D'
+ elif c == 0xE:
+ ptr[0] = 'E'
+ elif c == 0xF:
+ ptr[0] = 'F'
+ else:
+ # not reachable: c was masked with 0xF above
+ ptr[0] = '0'
+
+ # step to the next more significant digit position
+ ptr -= 1
+
+ val >>= 4
+
+ out[0] = ';'
+ out += 1
+ out[0] = 0
+
+ return out
+
+
+# copied and adapted from libxml2 (xmlBufAttrSerializeTxtContent())
+cdef _write_attr_string(tree.xmlOutputBuffer* buf, const char *string):
+    # Write the NUL-terminated attribute value `string` to `buf` in escaped
+    # form.  Runs of bytes that need no escaping are written verbatim
+    # (`base` marks the start of the pending run, `cur` the scan position).
+    # Newline, carriage return and tab are written as numeric character
+    # references, '"', '<', '>' and '&' as named entities, and multi-byte
+    # UTF-8 sequences as hexadecimal character references.
+    # A NULL `string` writes nothing.
+    #
+    # Raises ValueError for a byte sequence that cannot be decoded or that
+    # decodes to a value rejected by xmlIsCharQ().  Everything before the
+    # offending sequence has already been written at that point.
+    cdef const char *base
+    cdef const char *cur
+
+    # room for "&#x" + up to 6 hex digits + ";" + NUL
+    cdef unsigned char tmp[12]
+    cdef unsigned char *tmp_end
+    cdef int val = 0
+    cdef int l
+
+    if string == NULL:
+        return
+
+    base = cur = <const char*>string
+    while (cur[0] != 0):
+        if (cur[0] == '\n'):
+            if (base != cur):
+                tree.xmlOutputBufferWrite(buf, cur - base, base)
+
+            tree.xmlOutputBufferWrite(buf, 5, "&#10;")
+            cur += 1
+            base = cur
+
+        elif (cur[0] == '\r'):
+            if (base != cur):
+                tree.xmlOutputBufferWrite(buf, cur - base, base)
+
+            tree.xmlOutputBufferWrite(buf, 5, "&#13;")
+            cur += 1
+            base = cur
+
+        elif (cur[0] == '\t'):
+            if (base != cur):
+                tree.xmlOutputBufferWrite(buf, cur - base, base)
+
+            tree.xmlOutputBufferWrite(buf, 4, "&#9;")
+            cur += 1
+            base = cur
+
+        elif (cur[0] == '"'):
+            if (base != cur):
+                tree.xmlOutputBufferWrite(buf, cur - base, base)
+
+            tree.xmlOutputBufferWrite(buf, 6, "&quot;")
+            cur += 1
+            base = cur
+
+        elif (cur[0] == '<'):
+            if (base != cur):
+                tree.xmlOutputBufferWrite(buf, cur - base, base)
+
+            tree.xmlOutputBufferWrite(buf, 4, "&lt;")
+            cur += 1
+            base = cur
+
+        elif (cur[0] == '>'):
+            if (base != cur):
+                tree.xmlOutputBufferWrite(buf, cur - base, base)
+
+            tree.xmlOutputBufferWrite(buf, 4, "&gt;")
+            cur += 1
+            base = cur
+        elif (cur[0] == '&'):
+            if (base != cur):
+                tree.xmlOutputBufferWrite(buf, cur - base, base)
+
+            tree.xmlOutputBufferWrite(buf, 5, "&amp;")
+            cur += 1
+            base = cur
+
+        # NOTE(review): `cur` is a plain `char` pointer.  Where C `char` is
+        # signed, `cur[0] >= 0x80` can never be true, so this branch is
+        # only taken on unsigned-char platforms and non-ASCII bytes are
+        # otherwise passed through in the verbatim run.  Left unchanged
+        # here because switching to unsigned comparisons changes the
+        # serialized output -- confirm the intended behaviour first.
+        elif (cur[0] >= 0x80) and (cur[1] != 0):
+
+            if (base != cur):
+                tree.xmlOutputBufferWrite(buf, cur - base, base)
+
+            if (cur[0] < 0xC0):
+                # invalid UTF-8 sequence
+                # (was `char[0]`, which names the C type, not the pointer)
+                val = <unsigned char> cur[0]
+                l = 1
+
+            elif (cur[0] < 0xE0):
+                val = (cur[0]) & 0x1F
+                val <<= 6
+                val |= (cur[1]) & 0x3F
+                l = 2
+
+            elif ((cur[0] < 0xF0) and (cur[2] != 0)):
+                val = (cur[0]) & 0x0F
+                val <<= 6
+                val |= (cur[1]) & 0x3F
+                val <<= 6
+                val |= (cur[2]) & 0x3F
+                l = 3
+
+            elif ((cur[0] < 0xF8) and (cur[2] != 0) and (cur[3] != 0)):
+                val = (cur[0]) & 0x07
+                val <<= 6
+                val |= (cur[1]) & 0x3F
+                val <<= 6
+                val |= (cur[2]) & 0x3F
+                val <<= 6
+                val |= (cur[3]) & 0x3F
+                l = 4
+            else:
+                # invalid UTF-8 sequence
+                val = <unsigned char> cur[0]
+                l = 1
+
+            if ((l == 1) or (not tree.xmlIsCharQ(val))):
+                raise ValueError("Invalid character: %X" % val)
+
+            # We could do multiple things here. Just save
+            # as a char ref
+            # xmlSerializeHexCharRef() returns a pointer to the terminating
+            # NUL, which gives the exact length to write.  The previous
+            # length argument of -1 made xmlOutputBufferWrite() write
+            # nothing, silently dropping the character.
+            tmp_end = xmlSerializeHexCharRef(tmp, val)
+            tree.xmlOutputBufferWrite(buf, tmp_end - &tmp[0], <const char*> tmp)
+            cur += l
+            base = cur
+
+        else:
+            cur += 1
+
+    # flush the trailing run of unescaped bytes
+    if (base != cur):
+        tree.xmlOutputBufferWrite(buf, cur - base, base)
+
+
############################################################
# output to file-like objects
@@ -883,7 +1085,8 @@ cdef class _IncrementalFileWriter:
tree.xmlOutputBufferWrite(self._c_out, 1, ' ')
self._write_qname(name, prefix)
tree.xmlOutputBufferWrite(self._c_out, 2, '="')
- tree.xmlOutputBufferWriteEscape(self._c_out, _xcstr(value), NULL)
+ _write_attr_string(self._c_out, _cstr(value))
+
tree.xmlOutputBufferWrite(self._c_out, 1, '"')
cdef _write_end_element(self, element_config):
@@ -954,7 +1157,14 @@ cdef class _IncrementalFileWriter:
if self._status > WRITER_IN_ELEMENT or content.strip():
raise LxmlSyntaxError("not in an element")
content = _utf8(content)
- tree.xmlOutputBufferWriteEscape(self._c_out, _xcstr(content), NULL)
+
+ ns, name, _, _ = self._element_stack[-1]
+ if c_method == OUTPUT_METHOD_HTML and \
+ ns in (None, 'http://www.w3.org/1999/xhtml') and name in ('script', 'style'):
+ tree.xmlOutputBufferWrite(self._c_out, len(content), content)
+ else:
+ tree.xmlOutputBufferWriteEscape(self._c_out, _xcstr(content), NULL)
+
elif iselement(content):
if self._status > WRITER_IN_ELEMENT:
raise LxmlSyntaxError("cannot append trailing element to complete XML document")
diff --git a/src/lxml/tests/test_incremental_xmlfile.py b/src/lxml/tests/test_incremental_xmlfile.py
index b6245618..7a7e0730 100644
--- a/src/lxml/tests/test_incremental_xmlfile.py
+++ b/src/lxml/tests/test_incremental_xmlfile.py
@@ -15,7 +15,8 @@ this_dir = os.path.dirname(__file__)
if this_dir not in sys.path:
sys.path.insert(0, this_dir) # needed for Py3
-from common_imports import etree, BytesIO, HelperTestCase, skipIf
+from common_imports import etree, BytesIO, HelperTestCase, skipIf, _str
+
class _XmlFileTestCaseBase(HelperTestCase):
_file = None # to be set by specific subtypes below
@@ -418,6 +419,34 @@ class HtmlFileTestCase(_XmlFileTestCaseBase):
'</root>')
self._file = BytesIO()
+ def test_attribute_quoting(self):
+ # Double quotes inside an attribute value passed to xf.element() must
+ # be serialized as &quot; so that the attribute stays well-formed.
+ with etree.htmlfile(self._file) as xf:
+ with xf.element("tagname", attrib={"attr": '"misquoted"'}):
+ xf.write("foo")
+
+ self.assertXml('<tagname attr="&quot;misquoted&quot;">foo</tagname>')
+
+ def test_attribute_quoting_unicode(self):
+ # Same as test_attribute_quoting, but with non-ASCII characters in the
+ # value. The expected output has them as decimal character references
+ # (246, 13124 == 0x3344, 78660 == 0x13344).
+ # NOTE(review): presumably _str() turns the escaped \\u3344 / \\U00013344
+ # text into the actual characters -- confirm in common_imports.
+ with etree.htmlfile(self._file) as xf:
+ with xf.element("tagname", attrib={"attr": _str('"misquöted\\u3344\\U00013344"')}):
+ xf.write("foo")
+
+ self.assertXml('<tagname attr="&quot;misqu&#246;ted&#13124;&#78660;&quot;">foo</tagname>')
+
+ def test_unescaped_script(self):
+ # Text of a <script> Element written through htmlfile must not be
+ # escaped: the '<' in the script source is expected verbatim.
+ with etree.htmlfile(self._file) as xf:
+ elt = etree.Element('script')
+ elt.text = "if (a < b);"
+ xf.write(elt)
+ self.assertXml('<script>if (a < b);</script>')
+
+ def test_unescaped_script_incremental(self):
+ # Same expectation as test_unescaped_script, but the <script> element is
+ # opened with xf.element() and its text is written as a plain string.
+ with etree.htmlfile(self._file) as xf:
+ with xf.element('script'):
+ xf.write("if (a < b);")
+
+ self.assertXml('<script>if (a < b);</script>')
+
def test_write_declaration(self):
with etree.htmlfile(self._file) as xf:
try: