diff options
-rw-r--r-- | CHANGES.txt | 26 | ||||
-rw-r--r-- | buildlibxml.py | 9 | ||||
-rw-r--r-- | doc/lxmlhtml.txt | 4 | ||||
-rw-r--r-- | setup.py | 2 | ||||
-rw-r--r-- | src/lxml/html/__init__.py | 27 | ||||
-rw-r--r-- | src/lxml/html/tests/test_forms.txt | 26 | ||||
-rw-r--r-- | src/lxml/lxml.etree.pyx | 23 | ||||
-rw-r--r-- | src/lxml/serializer.pxi | 60 | ||||
-rw-r--r-- | src/lxml/tests/test_etree.py | 100 | ||||
-rw-r--r-- | src/lxml/tests/test_incremental_xmlfile.py | 36 |
10 files changed, 222 insertions, 91 deletions
diff --git a/CHANGES.txt b/CHANGES.txt index fada054b..fbb906e3 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -2,6 +2,32 @@ lxml changelog ============== +3.8.0 (2017-??-??) +================== + +Features added +-------------- + +* ``ElementTree.write()`` has a new option ``doctype`` that writes out a + doctype string before the serialisation, in the same way as ``tostring()``. + +* GH#220: ``xmlfile`` allows switching output methods at an element level. + Patch by Burak Arslan. + +Bugs fixed +---------- + +* LP#1665241, GH#228: Form data handling in lxml.html no longer strips the + option values specified in form attributes but only the text values. + Patch by Ashish Kulkarni. + +Other changes +------------- + +* The previously undocumented ``docstring`` option in ``ElementTree.write()`` + produces a deprecation warning and will eventually be removed. + + 3.7.3 (2017-??-??) ================== diff --git a/buildlibxml.py b/buildlibxml.py index 5b32034c..f55f03cc 100644 --- a/buildlibxml.py +++ b/buildlibxml.py @@ -56,9 +56,12 @@ def download_and_extract_zlatkovic_binaries(destdir): for libname, libfn in libs.items(): srcfile = urljoin(url, libfn) destfile = os.path.join(destdir, libfn) - print('Retrieving "%s" to "%s"' % (srcfile, destfile)) - urlcleanup() # work around FTP bug 27973 in Py2.7.12+ - urlretrieve(srcfile, destfile) + if os.path.exists(destfile + ".keep"): + print('Using local copy of "{}"'.format(srcfile)) + else: + print('Retrieving "%s" to "%s"' % (srcfile, destfile)) + urlcleanup() # work around FTP bug 27973 in Py2.7.12+ + urlretrieve(srcfile, destfile) d = unpack_zipfile(destfile, destdir) libs[libname] = d diff --git a/doc/lxmlhtml.txt b/doc/lxmlhtml.txt index cc59d97a..9827ed9f 100644 --- a/doc/lxmlhtml.txt +++ b/doc/lxmlhtml.txt @@ -477,8 +477,8 @@ Example: >>> from lxml.html import parse, submit_form >>> page = parse('http://tinyurl.com').getroot() - >>> page.forms[1].fields['url'] = 'http://lxml.de/' - >>> result = parse(submit_form(page.forms[1])).getroot() + >>> page.forms[0].fields['url'] = 'http://lxml.de/' + >>> result = parse(submit_form(page.forms[0])).getroot() >>> [a.attrib['href'] for a in result.xpath("//a[@target='_blank']")] ['http://tinyurl.com/2xae8s', 'http://preview.tinyurl.com/2xae8s'] @@ -219,10 +219,10 @@ an appropriate version of Cython installed. 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.2', 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', 'Programming Language :: C', 'Operating System :: OS Independent', 'Topic :: Text Processing :: Markup :: HTML', diff --git a/src/lxml/html/__init__.py b/src/lxml/html/__init__.py index 525f9dc2..c0297d67 100644 --- a/src/lxml/html/__init__.py +++ b/src/lxml/html/__init__.py @@ -1137,6 +1137,8 @@ def open_http_urllib(method, url, values): data = None else: data = urlencode(values) + if not isinstance(data, bytes): + data = data.encode('ASCII') return urlopen(url, data) @@ -1325,9 +1327,7 @@ class SelectElement(InputMixin, HtmlElement): if el.get('selected') is not None: value = el.get('value') if value is None: - value = el.text or '' - if value: - value = value.strip() + value = (el.text or '').strip() return value return None @@ -1342,13 +1342,10 @@ class SelectElement(InputMixin, HtmlElement): return checked_option = None if value is not None: - value = value.strip() for el in _options_xpath(self): opt_value = el.get('value') if opt_value is None: - opt_value = el.text or '' - if opt_value: - opt_value = opt_value.strip() + opt_value = (el.text or '').strip() if opt_value == value: checked_option = el break @@ -1379,9 +1376,7 @@ class SelectElement(InputMixin, HtmlElement): for el in _options_xpath(self): value = el.get('value') if value is None: - value = el.text or '' - if value: - value = value.strip() + value = (el.text or '').strip() options.append(value) return options @@ -1426,18 +1421,14 @@ class MultipleSelectOptions(SetMixin): if 'selected' in option.attrib: opt_value = option.get('value') if opt_value is None: - opt_value = option.text or '' - if opt_value: - opt_value = opt_value.strip() + opt_value = (option.text or '').strip() yield opt_value def add(self, item): for option in self.options: opt_value = option.get('value') if opt_value is None: - opt_value = option.text or '' - if opt_value: - opt_value = opt_value.strip() + opt_value = (option.text or '').strip() if opt_value == item: option.set('selected', '') break @@ -1449,9 +1440,7 @@ class MultipleSelectOptions(SetMixin): for option in self.options: opt_value = option.get('value') if opt_value is None: - opt_value = option.text or '' - if opt_value: - opt_value = opt_value.strip() + opt_value = (option.text or '').strip() if opt_value == item: if 'selected' in option.attrib: del option.attrib['selected'] diff --git a/src/lxml/html/tests/test_forms.txt b/src/lxml/html/tests/test_forms.txt index 25773013..e475587b 100644 --- a/src/lxml/html/tests/test_forms.txt +++ b/src/lxml/html/tests/test_forms.txt @@ -28,6 +28,14 @@ ... <option value="3">number 3</option> ... <option>number 4</option> ... </select> +... <select name="select3"> +... <option value="01 " selected>text 1</option> +... <option value=" 02">text 2</option> +... </select> +... <select name="select4" multiple> +... <option value="01 " selected>text 1</option> +... <option value=" 02">text 2</option> +... </select> ... <input type="file" name="file_field" value="nonsense_value"> ... <input type="submit" name="submit1" value="submit"> ... <input type="submit" name="submit2" value="submit"> @@ -133,10 +141,24 @@ ValueError: There is no option with the value 'asdf' >>> select.value.remove('number 4') >>> select.value_options ['1', '2', '3', 'number 4'] +>>> select = f.inputs['select3'] +>>> select.value +'01 ' +>>> select.value_options +['01 ', ' 02'] +>>> select.value = " 02" +>>> select.value +' 02' +>>> select = f.inputs['select4'] +>>> select.value # doctest:+NOPARSE_MARKUP +<MultipleSelectOptions {'01 '} for select name='select4'> +>>> select.value.add(' 02') +>>> select.value # doctest:+NOPARSE_MARKUP +<MultipleSelectOptions {'01 ', ' 02'} for select name='select4'> >>> try: from urllib import urlencode ... except ImportError: from urllib.parse import urlencode >>> print(urlencode(f.form_values())) -hidden_field=new+value&text_field=text_value&single_checkbox=on&single_checkbox2=good&check_group=1&check_group=2&check_group=3&textarea_field=some+text&select1=No+value&select2=2 +hidden_field=new+value&text_field=text_value&single_checkbox=on&single_checkbox2=good&check_group=1&check_group=2&check_group=3&textarea_field=some+text&select1=No+value&select2=2&select3=+02&select4=01+&select4=+02 >>> fields = f.fields >>> fields # doctest:+NOPARSE_MARKUP <FieldsDict for form 0> @@ -149,6 +171,8 @@ radios: None reset1: None select1: 'No value' select2: <MultipleSelectOptions {'2'} for select name='select2'> +select3: ' 02' +select4: <MultipleSelectOptions {'01 ', ' 02'} for select name='select4'> single_checkbox: 'on' single_checkbox2: 'good' submit1: 'submit' diff --git a/src/lxml/lxml.etree.pyx b/src/lxml/lxml.etree.pyx index 67ff69b2..c336cef2 100644 --- a/src/lxml/lxml.etree.pyx +++ b/src/lxml/lxml.etree.pyx @@ -1954,11 +1954,12 @@ cdef public class _ElementTree [ type LxmlElementTreeType, def write(self, file, *, encoding=None, method=u"xml", pretty_print=False, xml_declaration=None, with_tail=True, - standalone=None, docstring=None, compression=0, - exclusive=False, with_comments=True, inclusive_ns_prefixes=None): + standalone=None, doctype=None, compression=0, + exclusive=False, with_comments=True, inclusive_ns_prefixes=None, + docstring=None): u"""write(self, file, encoding=None, method="xml", pretty_print=False, xml_declaration=None, with_tail=True, - standalone=None, compression=0, + standalone=None, doctype=None, compression=0, exclusive=False, with_comments=True, inclusive_ns_prefixes=None) Write the tree to a filename, file or file-like object. @@ -1976,6 +1977,12 @@ cdef public class _ElementTree [ type LxmlElementTreeType, output an XML declaration with the corresponding ``standalone`` flag. + The ``doctype`` option allows passing in a plain string that will + be serialised before the XML tree. Note that passing in non + well-formed content here will make the XML output non well-formed. + Also, an existing doctype in the document tree will not be removed + when serialising an ElementTree instance. + The ``compression`` option enables GZip compression level 1-9. The ``inclusive_ns_prefixes`` should be a list of namespace strings @@ -2030,7 +2037,15 @@ cdef public class _ElementTree [ type LxmlElementTreeType, else: write_declaration = 1 is_standalone = 0 - _tofilelike(file, self._context_node, encoding, docstring, method, + + if docstring is not None and doctype is None: + import warnings + warnings.warn( + "The 'docstring' option is deprecated. Use 'doctype' instead.", + DeprecationWarning) + doctype = docstring + + _tofilelike(file, self._context_node, encoding, doctype, method, write_declaration, 1, pretty_print, with_tail, is_standalone, compression) diff --git a/src/lxml/serializer.pxi b/src/lxml/serializer.pxi index 8cee18d8..4ef53bc9 100644 --- a/src/lxml/serializer.pxi +++ b/src/lxml/serializer.pxi @@ -1014,10 +1014,21 @@ cdef class _IncrementalFileWriter: tree.xmlOutputBufferFlush(self._c_out) self._handle_error(self._c_out.error) - def element(self, tag, attrib=None, nsmap=None, **_extra): - """element(self, tag, attrib=None, nsmap=None, **_extra) + def method(self, method): + """method(self, method) + + Returns a context manager that overrides and restores the output method. + method is one of (None, 'xml', 'html') where None means 'xml'. + """ + assert self._c_out is not NULL + c_method = self._method if method is None else _findOutputMethod(method) + return _MethodChanger(self, c_method) + + def element(self, tag, attrib=None, nsmap=None, method=None, **_extra): + """element(self, tag, attrib=None, nsmap=None, method, **_extra) Returns a context manager that writes an opening and closing tag. + method is one of (None, 'xml', 'html') where None means 'xml'. """ assert self._c_out is not NULL attributes = [] @@ -1038,7 +1049,10 @@ cdef class _IncrementalFileWriter: _prefixValidOrRaise(prefix) reversed_nsmap[_utf8(ns)] = prefix ns, name = _getNsTag(tag) - return _FileWriterElement(self, (ns, name, attributes, reversed_nsmap)) + + c_method = self._method if method is None else _findOutputMethod(method) + + return _FileWriterElement(self, (ns, name, attributes, reversed_nsmap), c_method) cdef _write_qname(self, bytes name, bytes prefix): if prefix: # empty bytes for no prefix (not None to allow sorting) @@ -1163,6 +1177,7 @@ cdef class _IncrementalFileWriter: ns in (None, b'http://www.w3.org/1999/xhtml') and name in (b'script', b'style')): tree.xmlOutputBufferWrite(self._c_out, len(content), _cstr(content)) + else: tree.xmlOutputBufferWriteEscape(self._c_out, _xcstr(content), NULL) @@ -1219,14 +1234,51 @@ cdef class _IncrementalFileWriter: @cython.freelist(8) cdef class _FileWriterElement: cdef object _element + cdef int _new_method + cdef int _old_method cdef _IncrementalFileWriter _writer - def __cinit__(self, _IncrementalFileWriter writer not None, element_config): + def __cinit__(self, _IncrementalFileWriter writer not None, element_config, int method): self._writer = writer self._element = element_config + self._new_method = method + self._old_method = writer._method def __enter__(self): + self._writer._method = self._new_method self._writer._write_start_element(self._element) def __exit__(self, exc_type, exc_val, exc_tb): self._writer._write_end_element(self._element) + self._writer._method = self._old_method + +@cython.final +@cython.internal +@cython.freelist(8) +cdef class _MethodChanger: + cdef int _new_method + cdef int _old_method + cdef bint _entered + cdef bint _exited + cdef _IncrementalFileWriter _writer + + def __cinit__(self, _IncrementalFileWriter writer not None, int method): + self._writer = writer + self._new_method = method + self._old_method = writer._method + self._entered = False + self._exited = False + + def __enter__(self): + if self._entered: + raise LxmlSyntaxError("Inconsistent enter action in context manager") + self._writer._method = self._new_method + self._entered = True + + def __exit__(self, exc_type, exc_val, exc_tb): + if self._exited: + raise LxmlSyntaxError("Inconsistent exit action in context manager") + if self._writer._method != self._new_method: + raise LxmlSyntaxError("Method changed outside of context manager") + self._writer._method = self._old_method + self._exited = True diff --git a/src/lxml/tests/test_etree.py b/src/lxml/tests/test_etree.py index 4ec59096..d1c79e05 100644 --- a/src/lxml/tests/test_etree.py +++ b/src/lxml/tests/test_etree.py @@ -20,6 +20,7 @@ import tempfile import textwrap import zlib import gzip +from contextlib import closing, contextmanager from .common_imports import etree, StringIO, BytesIO, HelperTestCase from .common_imports import fileInTestDir, fileUrlInTestDir, read_file, path2url @@ -43,6 +44,16 @@ except NameError: _unicode = str +@contextmanager +def tmpfile(): + handle, filename = tempfile.mkstemp() + try: + yield filename + finally: + os.close(handle) + os.remove(filename) + + class ETreeOnlyTestCase(HelperTestCase): """Tests only for etree, not ElementTree""" etree = etree @@ -4062,39 +4073,25 @@ class ETreeC14NTestCase(HelperTestCase): tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>')) f = BytesIO() tree.write_c14n(f, compression=9) - gzfile = gzip.GzipFile(fileobj=BytesIO(f.getvalue())) - try: + with closing(gzip.GzipFile(fileobj=BytesIO(f.getvalue()))) as gzfile: s = gzfile.read() - finally: - gzfile.close() self.assertEqual(_bytes('<a>'+'<b></b>'*200+'</a>'), s) def test_c14n_file(self): tree = self.parse(_bytes('<a><b/></a>')) - handle, filename = tempfile.mkstemp() - try: + with tmpfile() as filename: tree.write_c14n(filename) data = read_file(filename, 'rb') - finally: - os.close(handle) - os.remove(filename) self.assertEqual(_bytes('<a><b></b></a>'), data) def test_c14n_file_gzip(self): tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>')) - handle, filename = tempfile.mkstemp() - try: + with tmpfile() as filename: tree.write_c14n(filename, compression=9) - f = gzip.open(filename, 'rb') - try: + with closing(gzip.open(filename, 'rb')) as f: data = f.read() - finally: - f.close() - finally: - os.close(handle) - os.remove(filename) self.assertEqual(_bytes('<a>'+'<b></b>'*200+'</a>'), data) @@ -4225,18 +4222,32 @@ class ETreeWriteTestCase(HelperTestCase): self.assertEqual(_bytes('<a><b/></a>'), s) + def test_write_doctype(self): + tree = self.parse(_bytes('<a><b/></a>')) + f = BytesIO() + tree.write(f, doctype='HUHU') + s = f.getvalue() + self.assertEqual(_bytes('HUHU\n<a><b/></a>'), + s) + def test_write_gzip(self): tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>')) f = BytesIO() tree.write(f, compression=9) - gzfile = gzip.GzipFile(fileobj=BytesIO(f.getvalue())) - try: + with closing(gzip.GzipFile(fileobj=BytesIO(f.getvalue()))) as gzfile: s = gzfile.read() - finally: - gzfile.close() self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'), s) + def test_write_gzip_doctype(self): + tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>')) + f = BytesIO() + tree.write(f, compression=9, doctype='<!DOCTYPE a>') + with closing(gzip.GzipFile(fileobj=BytesIO(f.getvalue()))) as gzfile: + s = gzfile.read() + self.assertEqual(_bytes('<!DOCTYPE a>\n<a>'+'<b/>'*200+'</a>'), + s) + def test_write_gzip_level(self): tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>')) f = BytesIO() @@ -4251,21 +4262,15 @@ class ETreeWriteTestCase(HelperTestCase): tree.write(f, compression=1) s = f.getvalue() self.assertTrue(len(s) <= len(s0)) - gzfile = gzip.GzipFile(fileobj=BytesIO(s)) - try: + with closing(gzip.GzipFile(fileobj=BytesIO(s))) as gzfile: s1 = gzfile.read() - finally: - gzfile.close() f = BytesIO() tree.write(f, compression=9) s = f.getvalue() self.assertTrue(len(s) <= len(s0)) - gzfile = gzip.GzipFile(fileobj=BytesIO(s)) - try: + with closing(gzip.GzipFile(fileobj=BytesIO(s))) as gzfile: s9 = gzfile.read() - finally: - gzfile.close() self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'), s0) @@ -4276,57 +4281,39 @@ class ETreeWriteTestCase(HelperTestCase): def test_write_file(self): tree = self.parse(_bytes('<a><b/></a>')) - handle, filename = tempfile.mkstemp() - try: + with tmpfile() as filename: tree.write(filename) data = read_file(filename, 'rb') - finally: - os.close(handle) - os.remove(filename) self.assertEqual(_bytes('<a><b/></a>'), data) def test_write_file_gzip(self): tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>')) - handle, filename = tempfile.mkstemp() - try: + with tmpfile() as filename: tree.write(filename, compression=9) - f = gzip.open(filename, 'rb') - try: + with closing(gzip.open(filename, 'rb')) as f: data = f.read() - finally: - f.close() - finally: - os.close(handle) - os.remove(filename) self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'), data) def test_write_file_gzip_parse(self): tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>')) - handle, filename = tempfile.mkstemp() - try: + with tmpfile() as filename: tree.write(filename, compression=9) data = etree.tostring(etree.parse(filename)) - finally: - os.close(handle) - os.remove(filename) self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'), data) def test_write_file_gzipfile_parse(self): tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>')) - handle, filename = tempfile.mkstemp() - try: + with tmpfile() as filename: tree.write(filename, compression=9) - data = etree.tostring(etree.parse( - gzip.GzipFile(filename))) - finally: - os.close(handle) - os.remove(filename) + with closing(gzip.GzipFile(filename)) as f: + data = etree.tostring(etree.parse(f)) self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'), data) + class ETreeErrorLogTest(HelperTestCase): etree = etree @@ -4527,5 +4514,6 @@ def test_suite(): [make_doctest('../../../doc/resolvers.txt')]) return suite + if __name__ == '__main__': print('to test use test.py %s' % __file__) diff --git a/src/lxml/tests/test_incremental_xmlfile.py b/src/lxml/tests/test_incremental_xmlfile.py index 867db4a8..e73758df 100644 --- a/src/lxml/tests/test_incremental_xmlfile.py +++ b/src/lxml/tests/test_incremental_xmlfile.py @@ -9,6 +9,8 @@ from __future__ import with_statement, absolute_import import unittest import tempfile, os, sys +from lxml.etree import LxmlSyntaxError + this_dir = os.path.dirname(__file__) if this_dir not in sys.path: sys.path.insert(0, this_dir) # needed for Py3 @@ -380,8 +382,29 @@ class HtmlFileTestCase(_XmlFileTestCaseBase): self.assertXml('<%s>' % tag) self._file = BytesIO() + def test_method_context_manager_misuse(self): + with etree.htmlfile(self._file) as xf: + with xf.element('foo'): + cm = xf.method('xml') + cm.__enter__() + + self.assertRaises(LxmlSyntaxError, cm.__enter__) + + cm2 = xf.method('xml') + cm2.__enter__() + cm2.__exit__(None, None, None) + + self.assertRaises(LxmlSyntaxError, cm2.__exit__, None, None, None) + + cm3 = xf.method('xml') + cm3.__enter__() + with xf.method('html'): + self.assertRaises(LxmlSyntaxError, cm3.__exit__, None, None, None) + def test_xml_mode_write_inside_html(self): - elt = etree.Element("foo", attrib={'selected': 'bar'}) + tag = 'foo' + attrib = {'selected': 'bar'} + elt = etree.Element(tag, attrib=attrib) with etree.htmlfile(self._file) as xf: with xf.element("root"): @@ -393,11 +416,22 @@ class HtmlFileTestCase(_XmlFileTestCaseBase): elt.text = "" xf.write(elt, method='xml') # 3 + with xf.element(tag, attrib=attrib, method='xml'): + pass # 4 + + xf.write(elt) # 5 + + with xf.method('xml'): + xf.write(elt) # 6 + self.assertXml( '<root>' '<foo selected></foo>' # 1 '<foo selected="bar"/>' # 2 '<foo selected="bar"></foo>' # 3 + '<foo selected="bar"></foo>' # 4 + '<foo selected></foo>' # 5 + '<foo selected="bar"></foo>' # 6 '</root>') self._file = BytesIO() |