diff options
author | Stefan Behnel <stefan_ml@behnel.de> | 2017-01-08 15:22:45 +0100 |
---|---|---|
committer | Stefan Behnel <stefan_ml@behnel.de> | 2017-01-08 15:22:45 +0100 |
commit | 765f0399740169128fcad89685f25e607bf1f1cc (patch) | |
tree | d088b04a68ff13fe1a94322c885e1720e4db1b20 | |
parent | 1f0fbb14dcd7e4fd5388be482280d1c7dfd7e5fa (diff) | |
parent | 2256f345a0e6b5168cc66b6c97288f4846dfdb29 (diff) | |
download | python-lxml-765f0399740169128fcad89685f25e607bf1f1cc.tar.gz |
integrate recent master changes into lxml-3.7 branch
-rw-r--r-- | CHANGES.txt | 13 | ||||
-rw-r--r-- | src/lxml/includes/tree.pxd | 3 | ||||
-rw-r--r-- | src/lxml/serializer.pxi | 214 | ||||
-rw-r--r-- | src/lxml/tests/test_incremental_xmlfile.py | 31 |
4 files changed, 258 insertions, 3 deletions
diff --git a/CHANGES.txt b/CHANGES.txt index 0c9d59fd..9d8d0edb 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -2,6 +2,19 @@ lxml changelog ============== +Latest +================== + +Bugs fixed +---------- + +* GH#219: ``xmlfile.element()`` was not properly quoting attribute values. + Patch by Burak Arslan. + +* GH#218: ``xmlfile.element()`` was not properly escaping text content of + script/style tags. Patch by Burak Arslan. + + 3.7.1 (2016-12-23) ================== diff --git a/src/lxml/includes/tree.pxd b/src/lxml/includes/tree.pxd index b3bab52f..0d9d8843 100644 --- a/src/lxml/includes/tree.pxd +++ b/src/lxml/includes/tree.pxd @@ -61,6 +61,7 @@ cdef extern from "libxml/encoding.h": cdef extern from "libxml/chvalid.h": cdef int xmlIsChar_ch(char c) nogil + cdef int xmlIsCharQ(int ch) nogil cdef extern from "libxml/hash.h": ctypedef struct xmlHashTable @@ -357,6 +358,8 @@ cdef extern from "libxml/tree.h": cdef void xmlNodeDumpOutput(xmlOutputBuffer* buf, xmlDoc* doc, xmlNode* cur, int level, int format, const_char* encoding) nogil + cdef void xmlBufAttrSerializeTxtContent(xmlOutputBuffer *buf, xmlDoc *doc, + xmlAttr *attr, const_xmlChar *string) nogil cdef void xmlNodeSetName(xmlNode* cur, const_xmlChar* name) nogil cdef void xmlNodeSetContent(xmlNode* cur, const_xmlChar* content) nogil cdef xmlDtd* xmlCopyDtd(xmlDtd* dtd) nogil diff --git a/src/lxml/serializer.pxi b/src/lxml/serializer.pxi index a3d22365..0f93bb09 100644 --- a/src/lxml/serializer.pxi +++ b/src/lxml/serializer.pxi @@ -399,6 +399,208 @@ cdef void _writeNextSiblings(tree.xmlOutputBuffer* c_buffer, xmlNode* c_node, pretty_print, encoding) c_sibling = c_sibling.next + +# copied and adapted from libxml2 +cdef unsigned char *xmlSerializeHexCharRef(unsigned char *out, int val): + cdef xmlChar *ptr + cdef xmlChar c + + out[0] = '&' + out += 1 + + out[0] = '#' + out += 1 + + out[0] = 'x' + out += 1 + + if (val < 0x10): + ptr = out + elif (val < 0x100): + ptr = out + 1 + elif (val < 0x1000): + ptr = out 
+ 2 + elif (val < 0x10000): + ptr = out + 3 + elif (val < 0x100000): + ptr = out + 4 + else: + ptr = out + 5 + + out = ptr + 1 + while val > 0: + c = (val & 0xF) + + if c == 0: + ptr[0] = '0' + elif c == 1: + ptr[0] = '1' + elif c == 2: + ptr[0] = '2' + elif c == 3: + ptr[0] = '3' + elif c == 4: + ptr[0] = '4' + elif c == 5: + ptr[0] = '5' + elif c == 6: + ptr[0] = '6' + elif c == 7: + ptr[0] = '7' + elif c == 8: + ptr[0] = '8' + elif c == 9: + ptr[0] = '9' + elif c == 0xA: + ptr[0] = 'A' + elif c == 0xB: + ptr[0] = 'B' + elif c == 0xC: + ptr[0] = 'C' + elif c == 0xD: + ptr[0] = 'D' + elif c == 0xE: + ptr[0] = 'E' + elif c == 0xF: + ptr[0] = 'F' + else: + ptr[0] = '0' + + ptr -= 1 + + val >>= 4 + + out[0] = ';' + out += 1 + out[0] = 0 + + return out + + +# copied and adapted from libxml2 (xmlBufAttrSerializeTxtContent()) +cdef _write_attr_string(tree.xmlOutputBuffer* buf, const char *string): + cdef const char *base + cdef const char *cur + + cdef unsigned char tmp[12] + cdef int val = 0 + cdef int l + + if string == NULL: + return + + base = cur = <const char*>string + while (cur[0] != 0): + if (cur[0] == '\n'): + if (base != cur): + tree.xmlOutputBufferWrite(buf, cur - base, base) + + tree.xmlOutputBufferWrite(buf, 5, "&#10;") + cur += 1 + base = cur + + elif (cur[0] == '\r'): + if (base != cur): + tree.xmlOutputBufferWrite(buf, cur - base, base) + + tree.xmlOutputBufferWrite(buf, 5, "&#13;") + cur += 1 + base = cur + + elif (cur[0] == '\t'): + if (base != cur): + tree.xmlOutputBufferWrite(buf, cur - base, base) + + tree.xmlOutputBufferWrite(buf, 4, "&#9;") + cur += 1 + base = cur + + elif (cur[0] == '"'): + if (base != cur): + tree.xmlOutputBufferWrite(buf, cur - base, base) + + tree.xmlOutputBufferWrite(buf, 6, "&quot;") + cur += 1 + base = cur + + elif (cur[0] == '<'): + if (base != cur): + tree.xmlOutputBufferWrite(buf, cur - base, base) + + tree.xmlOutputBufferWrite(buf, 4, "&lt;") + cur += 1 + base = cur + + elif (cur[0] == '>'): + if (base != cur): + 
tree.xmlOutputBufferWrite(buf, cur - base, base) + + tree.xmlOutputBufferWrite(buf, 4, "&gt;") + cur += 1 + base = cur + elif (cur[0] == '&'): + if (base != cur): + tree.xmlOutputBufferWrite(buf, cur - base, base) + + tree.xmlOutputBufferWrite(buf, 5, "&amp;") + cur += 1 + base = cur + + elif (cur[0] >= 0x80) and (cur[1] != 0): + + if (base != cur): + tree.xmlOutputBufferWrite(buf, cur - base, base) + + if (cur[0] < 0xC0): + # invalid UTF-8 sequence + val = char[0] + l = 1 + + elif (cur[0] < 0xE0): + val = (cur[0]) & 0x1F + val <<= 6 + val |= (cur[1]) & 0x3F + l = 2 + + elif ((cur[0] < 0xF0) and (cur[2] != 0)): + val = (cur[0]) & 0x0F + val <<= 6 + val |= (cur[1]) & 0x3F + val <<= 6 + val |= (cur[2]) & 0x3F + l = 3 + + elif ((cur[0] < 0xF8) and (cur[2] != 0) and (cur[3] != 0)): + val = (cur[0]) & 0x07 + val <<= 6 + val |= (cur[1]) & 0x3F + val <<= 6 + val |= (cur[2]) & 0x3F + val <<= 6 + val |= (cur[3]) & 0x3F + l = 4 + else: + # invalid UTF-8 sequence + val = char[0] + l = 1 + + if ((l == 1) or (not tree.xmlIsCharQ(val))): + raise ValueError("Invalid character: %X" % val) + + # We could do multiple things here. 
Just save + # as a char ref + xmlSerializeHexCharRef(tmp, val) + tree.xmlOutputBufferWrite(buf, -1, <const char*> tmp) + cur += l + base = cur + + else: + cur += 1 + + if (base != cur): + tree.xmlOutputBufferWrite(buf, cur - base, base) + + ############################################################ # output to file-like objects @@ -883,7 +1085,8 @@ cdef class _IncrementalFileWriter: tree.xmlOutputBufferWrite(self._c_out, 1, ' ') self._write_qname(name, prefix) tree.xmlOutputBufferWrite(self._c_out, 2, '="') - tree.xmlOutputBufferWriteEscape(self._c_out, _xcstr(value), NULL) + _write_attr_string(self._c_out, _cstr(value)) + tree.xmlOutputBufferWrite(self._c_out, 1, '"') cdef _write_end_element(self, element_config): @@ -954,7 +1157,14 @@ cdef class _IncrementalFileWriter: if self._status > WRITER_IN_ELEMENT or content.strip(): raise LxmlSyntaxError("not in an element") content = _utf8(content) - tree.xmlOutputBufferWriteEscape(self._c_out, _xcstr(content), NULL) + + ns, name, _, _ = self._element_stack[-1] + if c_method == OUTPUT_METHOD_HTML and \ + ns in (None, 'http://www.w3.org/1999/xhtml') and name in ('script', 'style'): + tree.xmlOutputBufferWrite(self._c_out, len(content), content) + else: + tree.xmlOutputBufferWriteEscape(self._c_out, _xcstr(content), NULL) + elif iselement(content): if self._status > WRITER_IN_ELEMENT: raise LxmlSyntaxError("cannot append trailing element to complete XML document") diff --git a/src/lxml/tests/test_incremental_xmlfile.py b/src/lxml/tests/test_incremental_xmlfile.py index b6245618..7a7e0730 100644 --- a/src/lxml/tests/test_incremental_xmlfile.py +++ b/src/lxml/tests/test_incremental_xmlfile.py @@ -15,7 +15,8 @@ this_dir = os.path.dirname(__file__) if this_dir not in sys.path: sys.path.insert(0, this_dir) # needed for Py3 -from common_imports import etree, BytesIO, HelperTestCase, skipIf +from common_imports import etree, BytesIO, HelperTestCase, skipIf, _str + class _XmlFileTestCaseBase(HelperTestCase): _file = None # to be 
set by specific subtypes below @@ -418,6 +419,34 @@ class HtmlFileTestCase(_XmlFileTestCaseBase): '</root>') self._file = BytesIO() + def test_attribute_quoting(self): + with etree.htmlfile(self._file) as xf: + with xf.element("tagname", attrib={"attr": '"misquoted"'}): + xf.write("foo") + + self.assertXml('<tagname attr="&quot;misquoted&quot;">foo</tagname>') + + def test_attribute_quoting_unicode(self): + with etree.htmlfile(self._file) as xf: + with xf.element("tagname", attrib={"attr": _str('"misquöted\\u3344\\U00013344"')}): + xf.write("foo") + + self.assertXml('<tagname attr="&quot;misqu&#xF6;ted&#x3344;&#x13344;&quot;">foo</tagname>') + + def test_unescaped_script(self): + with etree.htmlfile(self._file) as xf: + elt = etree.Element('script') + elt.text = "if (a < b);" + xf.write(elt) + self.assertXml('<script>if (a < b);</script>') + + def test_unescaped_script_incremental(self): + with etree.htmlfile(self._file) as xf: + with xf.element('script'): + xf.write("if (a < b);") + + self.assertXml('<script>if (a < b);</script>') + + def test_write_declaration(self): with etree.htmlfile(self._file) as xf: try: |