summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- CHANGES.txt 26
-rw-r--r-- buildlibxml.py 9
-rw-r--r-- doc/lxmlhtml.txt 4
-rw-r--r-- setup.py 2
-rw-r--r-- src/lxml/html/__init__.py 27
-rw-r--r-- src/lxml/html/tests/test_forms.txt 26
-rw-r--r-- src/lxml/lxml.etree.pyx 23
-rw-r--r-- src/lxml/serializer.pxi 60
-rw-r--r-- src/lxml/tests/test_etree.py 100
-rw-r--r-- src/lxml/tests/test_incremental_xmlfile.py 36
10 files changed, 222 insertions, 91 deletions
diff --git a/CHANGES.txt b/CHANGES.txt
index 029b64a1..9e9e7d9d 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -2,6 +2,32 @@
lxml changelog
==============
+3.8.0 (2017-??-??)
+==================
+
+Features added
+--------------
+
+* ``ElementTree.write()`` has a new option ``doctype`` that writes out a
+ doctype string before the serialisation, in the same way as ``tostring()``.
+
+* GH#220: ``xmlfile`` allows switching output methods at an element level.
+ Patch by Burak Arslan.
+
+Bugs fixed
+----------
+
+* LP#1665241, GH#228: Form data handling in lxml.html no longer strips
+ option values given in ``value`` attributes; only values taken from the
+ option text are stripped. Patch by Ashish Kulkarni.
+
+Other changes
+-------------
+
+* The previously undocumented ``docstring`` option in ``ElementTree.write()``
+ produces a deprecation warning and will eventually be removed.
+
+
3.7.3 (2017-02-18)
==================
diff --git a/buildlibxml.py b/buildlibxml.py
index 5b32034c..f55f03cc 100644
--- a/buildlibxml.py
+++ b/buildlibxml.py
@@ -56,9 +56,12 @@ def download_and_extract_zlatkovic_binaries(destdir):
for libname, libfn in libs.items():
srcfile = urljoin(url, libfn)
destfile = os.path.join(destdir, libfn)
- print('Retrieving "%s" to "%s"' % (srcfile, destfile))
- urlcleanup() # work around FTP bug 27973 in Py2.7.12+
- urlretrieve(srcfile, destfile)
+ if os.path.exists(destfile + ".keep"):
+ print('Using local copy of "{}"'.format(srcfile))
+ else:
+ print('Retrieving "%s" to "%s"' % (srcfile, destfile))
+ urlcleanup() # work around FTP bug 27973 in Py2.7.12+
+ urlretrieve(srcfile, destfile)
d = unpack_zipfile(destfile, destdir)
libs[libname] = d
diff --git a/doc/lxmlhtml.txt b/doc/lxmlhtml.txt
index cc59d97a..9827ed9f 100644
--- a/doc/lxmlhtml.txt
+++ b/doc/lxmlhtml.txt
@@ -477,8 +477,8 @@ Example:
>>> from lxml.html import parse, submit_form
>>> page = parse('http://tinyurl.com').getroot()
- >>> page.forms[1].fields['url'] = 'http://lxml.de/'
- >>> result = parse(submit_form(page.forms[1])).getroot()
+ >>> page.forms[0].fields['url'] = 'http://lxml.de/'
+ >>> result = parse(submit_form(page.forms[0])).getroot()
>>> [a.attrib['href'] for a in result.xpath("//a[@target='_blank']")]
['http://tinyurl.com/2xae8s', 'http://preview.tinyurl.com/2xae8s']
diff --git a/setup.py b/setup.py
index 68f064ef..0118a547 100644
--- a/setup.py
+++ b/setup.py
@@ -219,10 +219,10 @@ an appropriate version of Cython installed.
'Programming Language :: Python :: 2.6',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
- 'Programming Language :: Python :: 3.2',
'Programming Language :: Python :: 3.3',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
+ 'Programming Language :: Python :: 3.6',
'Programming Language :: C',
'Operating System :: OS Independent',
'Topic :: Text Processing :: Markup :: HTML',
diff --git a/src/lxml/html/__init__.py b/src/lxml/html/__init__.py
index 525f9dc2..c0297d67 100644
--- a/src/lxml/html/__init__.py
+++ b/src/lxml/html/__init__.py
@@ -1137,6 +1137,8 @@ def open_http_urllib(method, url, values):
data = None
else:
data = urlencode(values)
+ if not isinstance(data, bytes):
+ data = data.encode('ASCII')
return urlopen(url, data)
@@ -1325,9 +1327,7 @@ class SelectElement(InputMixin, HtmlElement):
if el.get('selected') is not None:
value = el.get('value')
if value is None:
- value = el.text or ''
- if value:
- value = value.strip()
+ value = (el.text or '').strip()
return value
return None
@@ -1342,13 +1342,10 @@ class SelectElement(InputMixin, HtmlElement):
return
checked_option = None
if value is not None:
- value = value.strip()
for el in _options_xpath(self):
opt_value = el.get('value')
if opt_value is None:
- opt_value = el.text or ''
- if opt_value:
- opt_value = opt_value.strip()
+ opt_value = (el.text or '').strip()
if opt_value == value:
checked_option = el
break
@@ -1379,9 +1376,7 @@ class SelectElement(InputMixin, HtmlElement):
for el in _options_xpath(self):
value = el.get('value')
if value is None:
- value = el.text or ''
- if value:
- value = value.strip()
+ value = (el.text or '').strip()
options.append(value)
return options
@@ -1426,18 +1421,14 @@ class MultipleSelectOptions(SetMixin):
if 'selected' in option.attrib:
opt_value = option.get('value')
if opt_value is None:
- opt_value = option.text or ''
- if opt_value:
- opt_value = opt_value.strip()
+ opt_value = (option.text or '').strip()
yield opt_value
def add(self, item):
for option in self.options:
opt_value = option.get('value')
if opt_value is None:
- opt_value = option.text or ''
- if opt_value:
- opt_value = opt_value.strip()
+ opt_value = (option.text or '').strip()
if opt_value == item:
option.set('selected', '')
break
@@ -1449,9 +1440,7 @@ class MultipleSelectOptions(SetMixin):
for option in self.options:
opt_value = option.get('value')
if opt_value is None:
- opt_value = option.text or ''
- if opt_value:
- opt_value = opt_value.strip()
+ opt_value = (option.text or '').strip()
if opt_value == item:
if 'selected' in option.attrib:
del option.attrib['selected']
diff --git a/src/lxml/html/tests/test_forms.txt b/src/lxml/html/tests/test_forms.txt
index 25773013..e475587b 100644
--- a/src/lxml/html/tests/test_forms.txt
+++ b/src/lxml/html/tests/test_forms.txt
@@ -28,6 +28,14 @@
... <option value="3">number 3</option>
... <option>number 4</option>
... </select>
+... <select name="select3">
+... <option value="01 " selected>text 1</option>
+... <option value=" 02">text 2</option>
+... </select>
+... <select name="select4" multiple>
+... <option value="01 " selected>text 1</option>
+... <option value=" 02">text 2</option>
+... </select>
... <input type="file" name="file_field" value="nonsense_value">
... <input type="submit" name="submit1" value="submit">
... <input type="submit" name="submit2" value="submit">
@@ -133,10 +141,24 @@ ValueError: There is no option with the value 'asdf'
>>> select.value.remove('number 4')
>>> select.value_options
['1', '2', '3', 'number 4']
+>>> select = f.inputs['select3']
+>>> select.value
+'01 '
+>>> select.value_options
+['01 ', ' 02']
+>>> select.value = " 02"
+>>> select.value
+' 02'
+>>> select = f.inputs['select4']
+>>> select.value # doctest:+NOPARSE_MARKUP
+<MultipleSelectOptions {'01 '} for select name='select4'>
+>>> select.value.add(' 02')
+>>> select.value # doctest:+NOPARSE_MARKUP
+<MultipleSelectOptions {'01 ', ' 02'} for select name='select4'>
>>> try: from urllib import urlencode
... except ImportError: from urllib.parse import urlencode
>>> print(urlencode(f.form_values()))
-hidden_field=new+value&text_field=text_value&single_checkbox=on&single_checkbox2=good&check_group=1&check_group=2&check_group=3&textarea_field=some+text&select1=No+value&select2=2
+hidden_field=new+value&text_field=text_value&single_checkbox=on&single_checkbox2=good&check_group=1&check_group=2&check_group=3&textarea_field=some+text&select1=No+value&select2=2&select3=+02&select4=01+&select4=+02
>>> fields = f.fields
>>> fields # doctest:+NOPARSE_MARKUP
<FieldsDict for form 0>
@@ -149,6 +171,8 @@ radios: None
reset1: None
select1: 'No value'
select2: <MultipleSelectOptions {'2'} for select name='select2'>
+select3: ' 02'
+select4: <MultipleSelectOptions {'01 ', ' 02'} for select name='select4'>
single_checkbox: 'on'
single_checkbox2: 'good'
submit1: 'submit'
diff --git a/src/lxml/lxml.etree.pyx b/src/lxml/lxml.etree.pyx
index 67ff69b2..c336cef2 100644
--- a/src/lxml/lxml.etree.pyx
+++ b/src/lxml/lxml.etree.pyx
@@ -1954,11 +1954,12 @@ cdef public class _ElementTree [ type LxmlElementTreeType,
def write(self, file, *, encoding=None, method=u"xml",
pretty_print=False, xml_declaration=None, with_tail=True,
- standalone=None, docstring=None, compression=0,
- exclusive=False, with_comments=True, inclusive_ns_prefixes=None):
+ standalone=None, doctype=None, compression=0,
+ exclusive=False, with_comments=True, inclusive_ns_prefixes=None,
+ docstring=None):
u"""write(self, file, encoding=None, method="xml",
pretty_print=False, xml_declaration=None, with_tail=True,
- standalone=None, compression=0,
+ standalone=None, doctype=None, compression=0,
exclusive=False, with_comments=True, inclusive_ns_prefixes=None)
Write the tree to a filename, file or file-like object.
@@ -1976,6 +1977,12 @@ cdef public class _ElementTree [ type LxmlElementTreeType,
output an XML declaration with the corresponding
``standalone`` flag.
+ The ``doctype`` option allows passing in a plain string that will
+ be serialised before the XML tree. Note that passing in non
+ well-formed content here will make the XML output non well-formed.
+ Also, an existing doctype in the document tree will not be removed
+ when serialising an ElementTree instance.
+
The ``compression`` option enables GZip compression level 1-9.
The ``inclusive_ns_prefixes`` should be a list of namespace strings
@@ -2030,7 +2037,15 @@ cdef public class _ElementTree [ type LxmlElementTreeType,
else:
write_declaration = 1
is_standalone = 0
- _tofilelike(file, self._context_node, encoding, docstring, method,
+
+ if docstring is not None and doctype is None:
+ import warnings
+ warnings.warn(
+ "The 'docstring' option is deprecated. Use 'doctype' instead.",
+ DeprecationWarning)
+ doctype = docstring
+
+ _tofilelike(file, self._context_node, encoding, doctype, method,
write_declaration, 1, pretty_print, with_tail,
is_standalone, compression)
diff --git a/src/lxml/serializer.pxi b/src/lxml/serializer.pxi
index 8cee18d8..4ef53bc9 100644
--- a/src/lxml/serializer.pxi
+++ b/src/lxml/serializer.pxi
@@ -1014,10 +1014,21 @@ cdef class _IncrementalFileWriter:
tree.xmlOutputBufferFlush(self._c_out)
self._handle_error(self._c_out.error)
- def element(self, tag, attrib=None, nsmap=None, **_extra):
- """element(self, tag, attrib=None, nsmap=None, **_extra)
+ def method(self, method):
+ """method(self, method)
+
+ Returns a context manager that overrides and restores the output method.
+ method is one of (None, 'xml', 'html') where None keeps the writer's current output method.
+ """
+ assert self._c_out is not NULL
+ c_method = self._method if method is None else _findOutputMethod(method)
+ return _MethodChanger(self, c_method)
+
+ def element(self, tag, attrib=None, nsmap=None, method=None, **_extra):
+ """element(self, tag, attrib=None, nsmap=None, method=None, **_extra)
Returns a context manager that writes an opening and closing tag.
+ method is one of (None, 'xml', 'html') where None keeps the writer's current output method.
"""
assert self._c_out is not NULL
attributes = []
@@ -1038,7 +1049,10 @@ cdef class _IncrementalFileWriter:
_prefixValidOrRaise(prefix)
reversed_nsmap[_utf8(ns)] = prefix
ns, name = _getNsTag(tag)
- return _FileWriterElement(self, (ns, name, attributes, reversed_nsmap))
+
+ c_method = self._method if method is None else _findOutputMethod(method)
+
+ return _FileWriterElement(self, (ns, name, attributes, reversed_nsmap), c_method)
cdef _write_qname(self, bytes name, bytes prefix):
if prefix: # empty bytes for no prefix (not None to allow sorting)
@@ -1163,6 +1177,7 @@ cdef class _IncrementalFileWriter:
ns in (None, b'http://www.w3.org/1999/xhtml') and
name in (b'script', b'style')):
tree.xmlOutputBufferWrite(self._c_out, len(content), _cstr(content))
+
else:
tree.xmlOutputBufferWriteEscape(self._c_out, _xcstr(content), NULL)
@@ -1219,14 +1234,51 @@ cdef class _IncrementalFileWriter:
@cython.freelist(8)
cdef class _FileWriterElement:
cdef object _element
+ cdef int _new_method
+ cdef int _old_method
cdef _IncrementalFileWriter _writer
- def __cinit__(self, _IncrementalFileWriter writer not None, element_config):
+ def __cinit__(self, _IncrementalFileWriter writer not None, element_config, int method):
self._writer = writer
self._element = element_config
+ self._new_method = method
+ self._old_method = writer._method
def __enter__(self):
+ self._writer._method = self._new_method
self._writer._write_start_element(self._element)
def __exit__(self, exc_type, exc_val, exc_tb):
self._writer._write_end_element(self._element)
+ self._writer._method = self._old_method
+
+@cython.final
+@cython.internal
+@cython.freelist(8)
+cdef class _MethodChanger:
+ cdef int _new_method
+ cdef int _old_method
+ cdef bint _entered
+ cdef bint _exited
+ cdef _IncrementalFileWriter _writer
+
+ def __cinit__(self, _IncrementalFileWriter writer not None, int method):
+ self._writer = writer
+ self._new_method = method
+ self._old_method = writer._method
+ self._entered = False
+ self._exited = False
+
+ def __enter__(self):
+ if self._entered:
+ raise LxmlSyntaxError("Inconsistent enter action in context manager")
+ self._writer._method = self._new_method
+ self._entered = True
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ if self._exited:
+ raise LxmlSyntaxError("Inconsistent exit action in context manager")
+ if self._writer._method != self._new_method:
+ raise LxmlSyntaxError("Method changed outside of context manager")
+ self._writer._method = self._old_method
+ self._exited = True
diff --git a/src/lxml/tests/test_etree.py b/src/lxml/tests/test_etree.py
index 4ec59096..d1c79e05 100644
--- a/src/lxml/tests/test_etree.py
+++ b/src/lxml/tests/test_etree.py
@@ -20,6 +20,7 @@ import tempfile
import textwrap
import zlib
import gzip
+from contextlib import closing, contextmanager
from .common_imports import etree, StringIO, BytesIO, HelperTestCase
from .common_imports import fileInTestDir, fileUrlInTestDir, read_file, path2url
@@ -43,6 +44,16 @@ except NameError:
_unicode = str
+@contextmanager
+def tmpfile():
+ handle, filename = tempfile.mkstemp()
+ try:
+ yield filename
+ finally:
+ os.close(handle)
+ os.remove(filename)
+
+
class ETreeOnlyTestCase(HelperTestCase):
"""Tests only for etree, not ElementTree"""
etree = etree
@@ -4062,39 +4073,25 @@ class ETreeC14NTestCase(HelperTestCase):
tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
f = BytesIO()
tree.write_c14n(f, compression=9)
- gzfile = gzip.GzipFile(fileobj=BytesIO(f.getvalue()))
- try:
+ with closing(gzip.GzipFile(fileobj=BytesIO(f.getvalue()))) as gzfile:
s = gzfile.read()
- finally:
- gzfile.close()
self.assertEqual(_bytes('<a>'+'<b></b>'*200+'</a>'),
s)
def test_c14n_file(self):
tree = self.parse(_bytes('<a><b/></a>'))
- handle, filename = tempfile.mkstemp()
- try:
+ with tmpfile() as filename:
tree.write_c14n(filename)
data = read_file(filename, 'rb')
- finally:
- os.close(handle)
- os.remove(filename)
self.assertEqual(_bytes('<a><b></b></a>'),
data)
def test_c14n_file_gzip(self):
tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
- handle, filename = tempfile.mkstemp()
- try:
+ with tmpfile() as filename:
tree.write_c14n(filename, compression=9)
- f = gzip.open(filename, 'rb')
- try:
+ with closing(gzip.open(filename, 'rb')) as f:
data = f.read()
- finally:
- f.close()
- finally:
- os.close(handle)
- os.remove(filename)
self.assertEqual(_bytes('<a>'+'<b></b>'*200+'</a>'),
data)
@@ -4225,18 +4222,32 @@ class ETreeWriteTestCase(HelperTestCase):
self.assertEqual(_bytes('<a><b/></a>'),
s)
+ def test_write_doctype(self):
+ tree = self.parse(_bytes('<a><b/></a>'))
+ f = BytesIO()
+ tree.write(f, doctype='HUHU')
+ s = f.getvalue()
+ self.assertEqual(_bytes('HUHU\n<a><b/></a>'),
+ s)
+
def test_write_gzip(self):
tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
f = BytesIO()
tree.write(f, compression=9)
- gzfile = gzip.GzipFile(fileobj=BytesIO(f.getvalue()))
- try:
+ with closing(gzip.GzipFile(fileobj=BytesIO(f.getvalue()))) as gzfile:
s = gzfile.read()
- finally:
- gzfile.close()
self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'),
s)
+ def test_write_gzip_doctype(self):
+ tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
+ f = BytesIO()
+ tree.write(f, compression=9, doctype='<!DOCTYPE a>')
+ with closing(gzip.GzipFile(fileobj=BytesIO(f.getvalue()))) as gzfile:
+ s = gzfile.read()
+ self.assertEqual(_bytes('<!DOCTYPE a>\n<a>'+'<b/>'*200+'</a>'),
+ s)
+
def test_write_gzip_level(self):
tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
f = BytesIO()
@@ -4251,21 +4262,15 @@ class ETreeWriteTestCase(HelperTestCase):
tree.write(f, compression=1)
s = f.getvalue()
self.assertTrue(len(s) <= len(s0))
- gzfile = gzip.GzipFile(fileobj=BytesIO(s))
- try:
+ with closing(gzip.GzipFile(fileobj=BytesIO(s))) as gzfile:
s1 = gzfile.read()
- finally:
- gzfile.close()
f = BytesIO()
tree.write(f, compression=9)
s = f.getvalue()
self.assertTrue(len(s) <= len(s0))
- gzfile = gzip.GzipFile(fileobj=BytesIO(s))
- try:
+ with closing(gzip.GzipFile(fileobj=BytesIO(s))) as gzfile:
s9 = gzfile.read()
- finally:
- gzfile.close()
self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'),
s0)
@@ -4276,57 +4281,39 @@ class ETreeWriteTestCase(HelperTestCase):
def test_write_file(self):
tree = self.parse(_bytes('<a><b/></a>'))
- handle, filename = tempfile.mkstemp()
- try:
+ with tmpfile() as filename:
tree.write(filename)
data = read_file(filename, 'rb')
- finally:
- os.close(handle)
- os.remove(filename)
self.assertEqual(_bytes('<a><b/></a>'),
data)
def test_write_file_gzip(self):
tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
- handle, filename = tempfile.mkstemp()
- try:
+ with tmpfile() as filename:
tree.write(filename, compression=9)
- f = gzip.open(filename, 'rb')
- try:
+ with closing(gzip.open(filename, 'rb')) as f:
data = f.read()
- finally:
- f.close()
- finally:
- os.close(handle)
- os.remove(filename)
self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'),
data)
def test_write_file_gzip_parse(self):
tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
- handle, filename = tempfile.mkstemp()
- try:
+ with tmpfile() as filename:
tree.write(filename, compression=9)
data = etree.tostring(etree.parse(filename))
- finally:
- os.close(handle)
- os.remove(filename)
self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'),
data)
def test_write_file_gzipfile_parse(self):
tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
- handle, filename = tempfile.mkstemp()
- try:
+ with tmpfile() as filename:
tree.write(filename, compression=9)
- data = etree.tostring(etree.parse(
- gzip.GzipFile(filename)))
- finally:
- os.close(handle)
- os.remove(filename)
+ with closing(gzip.GzipFile(filename)) as f:
+ data = etree.tostring(etree.parse(f))
self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'),
data)
+
class ETreeErrorLogTest(HelperTestCase):
etree = etree
@@ -4527,5 +4514,6 @@ def test_suite():
[make_doctest('../../../doc/resolvers.txt')])
return suite
+
if __name__ == '__main__':
print('to test use test.py %s' % __file__)
diff --git a/src/lxml/tests/test_incremental_xmlfile.py b/src/lxml/tests/test_incremental_xmlfile.py
index 867db4a8..e73758df 100644
--- a/src/lxml/tests/test_incremental_xmlfile.py
+++ b/src/lxml/tests/test_incremental_xmlfile.py
@@ -9,6 +9,8 @@ from __future__ import with_statement, absolute_import
import unittest
import tempfile, os, sys
+from lxml.etree import LxmlSyntaxError
+
this_dir = os.path.dirname(__file__)
if this_dir not in sys.path:
sys.path.insert(0, this_dir) # needed for Py3
@@ -380,8 +382,29 @@ class HtmlFileTestCase(_XmlFileTestCaseBase):
self.assertXml('<%s>' % tag)
self._file = BytesIO()
+ def test_method_context_manager_misuse(self):
+ with etree.htmlfile(self._file) as xf:
+ with xf.element('foo'):
+ cm = xf.method('xml')
+ cm.__enter__()
+
+ self.assertRaises(LxmlSyntaxError, cm.__enter__)
+
+ cm2 = xf.method('xml')
+ cm2.__enter__()
+ cm2.__exit__(None, None, None)
+
+ self.assertRaises(LxmlSyntaxError, cm2.__exit__, None, None, None)
+
+ cm3 = xf.method('xml')
+ cm3.__enter__()
+ with xf.method('html'):
+ self.assertRaises(LxmlSyntaxError, cm3.__exit__, None, None, None)
+
def test_xml_mode_write_inside_html(self):
- elt = etree.Element("foo", attrib={'selected': 'bar'})
+ tag = 'foo'
+ attrib = {'selected': 'bar'}
+ elt = etree.Element(tag, attrib=attrib)
with etree.htmlfile(self._file) as xf:
with xf.element("root"):
@@ -393,11 +416,22 @@ class HtmlFileTestCase(_XmlFileTestCaseBase):
elt.text = ""
xf.write(elt, method='xml') # 3
+ with xf.element(tag, attrib=attrib, method='xml'):
+ pass # 4
+
+ xf.write(elt) # 5
+
+ with xf.method('xml'):
+ xf.write(elt) # 6
+
self.assertXml(
'<root>'
'<foo selected></foo>' # 1
'<foo selected="bar"/>' # 2
'<foo selected="bar"></foo>' # 3
+ '<foo selected="bar"></foo>' # 4
+ '<foo selected></foo>' # 5
+ '<foo selected="bar"></foo>' # 6
'</root>')
self._file = BytesIO()