summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> 2023-04-06 18:41:10 +0000
committer milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> 2023-04-06 18:41:10 +0000
commit a1b8f453f4e75b0b2d06cca1b8782645cef57285 (patch)
tree b62dc034e10447c7db611b42c6d06270ac08598c
parent 4fa49d410aa413cbc1799047616e68c8b1ac0b9d (diff)
download docutils-a1b8f453f4e75b0b2d06cca1b8782645cef57285.tar.gz
Add "auto_encode" argument to `publish_string()`
Add "auto_encode" argument to publish_string() and publish_programmatically() to give the user an option to select the output type (`bytes` or `OutString`) in a way that does not interfere with the intended encoding of the output (the problem with the "dummy" output encoding name ``unicode``). The default will change from ``True`` to ``False`` in Docutils 0.22.

New class for `io.StringOutput`: `io.OutString` adds "encoding" and "errors" attributes to `str`. Allows storing the "output_encoding" and "output_encoding_error_handler" settings in a transparent and easy to process way.

git-svn-id: https://svn.code.sf.net/p/docutils/code/trunk@9336 929543f6-e4f2-0310-98a6-ba3bd3dd1d04
-rw-r--r-- docutils/HISTORY.txt 7
-rw-r--r-- docutils/RELEASE-NOTES.txt 9
-rw-r--r-- docutils/docutils/core.py 36
-rw-r--r-- docutils/docutils/io.py 84
-rwxr-xr-x docutils/test/test_io.py 59
-rwxr-xr-x docutils/test/test_publisher.py 21
6 files changed, 196 insertions, 20 deletions
diff --git a/docutils/HISTORY.txt b/docutils/HISTORY.txt
index ef841514b..46c878fae 100644
--- a/docutils/HISTORY.txt
+++ b/docutils/HISTORY.txt
@@ -25,6 +25,8 @@ Release 0.20b (unpublished)
- Added new `publish_bytes()` function to explicitly return
output as binary data in a `bytes` object.
+ - New argument "auto_encode" for `publish_string()` and
+ `publish_programmatically()`.
- New functions `rst2…()` for use as "console_scripts" `entry point`_.
* docutils/frontend.py
@@ -32,6 +34,11 @@ Release 0.20b (unpublished)
- New setting ``output``. Obsoletes the ``<destination>`` positional
argument (cf. "Future changes" in the RELEASE-NOTES).
+* docutils/io.py
+
+ - New `str` sub-class `io.OutString` with "encoding" and "errors"
+ attributes.
+
* docutils/languages/
docutils/parsers/rst/languages/
diff --git a/docutils/RELEASE-NOTES.txt b/docutils/RELEASE-NOTES.txt
index 24539bad7..29deb9189 100644
--- a/docutils/RELEASE-NOTES.txt
+++ b/docutils/RELEASE-NOTES.txt
@@ -131,6 +131,10 @@ Drop support for Python 3.7 and 3.8 in Docutils 0.21.
- Remove ``use_verbatim_when_possible`` setting
(use literal_block_env_: verbatim) in Docutils 2.0.
+* The default value of the `auto_encode` argument of
+ `core.publish_string()` and `core.publish_programmatically()`
+ will change to ``False`` in Docutils 0.22.
+
* Remove the "rawsource" argument from `nodes.Text.__init__()`
(deprecated and ignored since Docutils 0.18) in Docutils 2.0.
@@ -188,6 +192,11 @@ Release 0.20 (unpublished)
.. _[latex writers]: docs/user/config.html#latex-writers
+* The new function argument `auto_encode` for `core.publish_string()` and
+ `core.publish_programmatically()` selects whether the output document is
+ encoded and returned as a `bytes` instance. The default will change to
+ ``False`` in Docutils 0.22.
+
* Bugfixes and improvements (see HISTORY_).
diff --git a/docutils/docutils/core.py b/docutils/docutils/core.py
index b369a9cfe..d9a769557 100644
--- a/docutils/docutils/core.py
+++ b/docutils/docutils/core.py
@@ -435,26 +435,24 @@ def publish_string(source, source_path=None, destination_path=None,
writer=None, writer_name='pseudoxml',
settings=None, settings_spec=None,
settings_overrides=None, config_section=None,
- enable_exit_status=False):
+ enable_exit_status=False,
+ auto_encode=True):
"""
Set up & run a `Publisher` for programmatic use with string I/O.
Accepts a `bytes` or `str` instance as `source`.
- The output is encoded according to the "output_encoding" setting;
- the return value is a `bytes` instance (unless `output_encoding`_
- is "unicode", see below).
- To get Docutils output as `str` instance, use `publish_parts()`::
+ If `auto_encode` is True, the output is encoded according to the
+ `output_encoding`_ setting; the return value is a `bytes` instance
+ (unless `output_encoding`_ is "unicode",
+ cf. `docutils.io.StringOutput.write()`).
- output = publish_parts(...)['whole']
+ If `auto_encode` is False, the output is an instance of a `str`
+ sub-class with "output_encoding" and "output_encoding_error_handler"
+ settings stored as `encoding` and `errors` attributes.
- or set `output_encoding`_ to the pseudo encoding name "unicode", e.g.::
-
- publish_string(..., settings_overrides={'output_encoding': 'unicode'})
-
- Beware that the `output_encoding`_ setting may affect the content
- of the output (e.g. an encoding declaration in HTML or XML or the
- representation of characters as LaTeX macro vs. literal character).
+ The default value of `auto_encode` will change to ``False`` in
+ Docutils 0.22.
Parameters: see `publish_programmatically()`.
@@ -471,7 +469,8 @@ def publish_string(source, source_path=None, destination_path=None,
settings=settings, settings_spec=settings_spec,
settings_overrides=settings_overrides,
config_section=config_section,
- enable_exit_status=enable_exit_status)
+ enable_exit_status=enable_exit_status,
+ auto_encode=auto_encode)
return output
@@ -662,7 +661,8 @@ def publish_programmatically(source_class, source, source_path,
writer, writer_name,
settings, settings_spec,
settings_overrides, config_section,
- enable_exit_status):
+ enable_exit_status,
+ auto_encode=True):
"""
Set up & run a `Publisher` for custom programmatic use.
@@ -754,6 +754,10 @@ def publish_programmatically(source_class, source, source_path,
defined by `settings_spec`. Used only if no `settings` specified.
* `enable_exit_status`: Boolean; enable exit status at end of processing?
+
+ * `auto_encode`: Boolean; encode string output and return `bytes`?
+ Ignored with `io.FileOutput`.
+ The default value will change to ``False`` in Docutils 0.22.
"""
publisher = Publisher(reader, parser, writer, settings=settings,
source_class=source_class,
@@ -763,6 +767,8 @@ def publish_programmatically(source_class, source, source_path,
settings_spec, settings_overrides, config_section)
publisher.set_source(source, source_path)
publisher.set_destination(destination, destination_path)
+ if not auto_encode and isinstance(publisher.destination, io.StringOutput):
+ publisher.destination.auto_encode = auto_encode
output = publisher.publish(enable_exit_status=enable_exit_status)
return output, publisher
diff --git a/docutils/docutils/io.py b/docutils/docutils/io.py
index c291f40cb..68fde9ae0 100644
--- a/docutils/docutils/io.py
+++ b/docutils/docutils/io.py
@@ -74,6 +74,57 @@ def error_string(err):
return f'{err.__class__.__name__}: {err}'
+class OutString(str):
+ """Return a string representation of `object` with known encoding.
+
+ Differences to `str()`:
+
+ If the `encoding` is given, both `str` instances and byte-like objects
+ are stored as text string, the latter decoded with `encoding` and
+ `errors` (defaulting to 'strict').
+
+ The encoding is never guessed. If `encoding` is None (the default),
+ an informal string representation is used, also if `errors` are given.
+
+ The original or intended encoding and error handler are stored in the
+ attributes `encoding` and `errors`.
+ Typecasting to `bytes` uses the stored values.
+ """
+
+ def __new__(cls, object, encoding=None, errors='strict'):
+ """Return a new OutString object.
+
+ Provisional.
+ """
+ try:
+ # decode bytes-like objects if encoding is known
+ return super().__new__(cls, object, encoding, errors)
+ except TypeError:
+ return super().__new__(cls, object)
+
+ def __init__(self, object, encoding=None, errors='strict'):
+ """Set "encoding" and "errors" attributes."""
+ self.encoding = encoding
+ self.errors = errors
+
+ def __bytes__(self):
+ try:
+ return super().encode(self.encoding, self.errors)
+ except TypeError:
+ raise TypeError('OutString instance without known encoding')
+
+ def __repr__(self):
+ if self.errors != 'strict':
+ errors_arg = f', errors={self.errors!r}'
+ else:
+ errors_arg = ''
+ return (f'{self.__class__.__name__}({super().__repr__()}, '
+ f'encoding={self.encoding!r}{errors_arg})')
+
+ def encode(self, encoding=None, errors=None):
+ return super().encode(encoding or self.encoding, errors or self.errors)
+
+
class Input(TransformSpec):
"""
Abstract base class for input wrappers.
@@ -264,14 +315,14 @@ class Output(TransformSpec):
raise NotImplementedError
def encode(self, data):
- """Encode and return `data`.
+ """
+ Encode and return `data`.
If `data` is a `bytes` instance, it is returned unchanged.
Otherwise it is encoded with `self.encoding`.
If `self.encoding` is set to the pseudo encoding name "unicode",
`data` must be a `str` instance and is returned unchanged.
-
"""
if self.encoding and self.encoding.lower() == 'unicode':
assert isinstance(data, str), ('output encoding is "unicode" '
@@ -616,14 +667,39 @@ class StringOutput(Output):
default_destination_path = '<string>'
+ def __init__(self, destination=None, destination_path=None,
+ encoding=None, error_handler='strict', auto_encode=True):
+ self.auto_encode = auto_encode
+ """Let `write()` encode the output document and return `bytes`."""
+ super().__init__(destination, destination_path,
+ encoding, error_handler)
+
def write(self, data):
- """Encode `data`, store it in `self.destination`, and return it.
+ """Store `data` in `self.destination`, and return it.
+
+ If `self.auto_encode` is False, store and return a `str`
+ sub-class instance with "encoding" and "errors" attributes
+ set to `self.encoding` and `self.error_handler`.
+ If `self.auto_encode` is True, encode `data` with `self.encoding`
+ and `self.error_handler` and store/return a `bytes` instance.
+ Exception:
If `self.encoding` is set to the pseudo encoding name "unicode",
`data` must be a `str` instance and is returned unchanged
(cf. `Output.encode`).
+ Beware that the `output_encoding`_ setting may affect the content
+ of the output (e.g. an encoding declaration in HTML or XML or the
+ representation of characters as LaTeX macro vs. literal character).
"""
- self.destination = self.encode(data)
+ if self.auto_encode:
+ self.destination = self.encode(data)
+ return self.destination
+
+ if not self.encoding or self.encoding.lower() == 'unicode':
+ encoding = None
+ else:
+ encoding = self.encoding
+ self.destination = OutString(data, encoding, self.error_handler)
return self.destination
diff --git a/docutils/test/test_io.py b/docutils/test/test_io.py
index b1e55a148..6c8c70254 100755
--- a/docutils/test/test_io.py
+++ b/docutils/test/test_io.py
@@ -190,6 +190,19 @@ class OutputTests(unittest.TestCase):
fo.write(self.udata)
self.assertEqual(self.udrain.getvalue(), self.udata)
+ def test_write_auto_encode_false(self):
+ so = du_io.StringOutput(encoding='latin1', error_handler='replace',
+ auto_encode=False)
+ output = so.write(self.udata)
+ # store output in self.destination and also return it
+ self.assertEqual(output, self.udata)
+ self.assertEqual(so.destination, self.udata)
+ # store also encoding and encoding error handler ...
+ self.assertEqual(output.encoding, 'latin1')
+ self.assertEqual(output.errors, 'replace')
+ # ... to allow easy conversion to `bytes`:
+ self.assertEqual(bytes(output), self.bdata)
+
def test_FileOutput_hande_io_errors_deprection_warning(self):
with self.assertWarnsRegex(DeprecationWarning,
'"handle_io_errors" is ignored'):
@@ -225,6 +238,52 @@ class OutputTests(unittest.TestCase):
self.assertRaises(ValueError, fo.write, self.udata)
+class OutStringTests(unittest.TestCase):
+
+ def test__init__defaults(self):
+ """Test `__new__()` and `__init__()` with default values."""
+
+ os = du_io.OutString('Grüße')
+ self.assertEqual(str(os), 'Grüße')
+ self.assertEqual(os.encoding, None)
+ self.assertEqual(os.errors, 'strict')
+ # converting to `bytes` fails if the encoding is not known:
+ with self.assertRaises(TypeError):
+ self.assertEqual(bytes(os), 'Grüße')
+ # without known encoding, `bytes` and other incompatible types
+ # are converted to their string representation ...
+ bos = du_io.OutString(b'gut')
+ self.assertEqual(str(bos), "b'gut'")
+ bos_e = du_io.OutString('Grüße'.encode('latin1'), errors='ignore')
+ self.assertEqual(str(bos_e), r"b'Gr\xfc\xdfe'")
+ bos = du_io.OutString(b'gut', encoding=None)
+ self.assertEqual(str(bos), "b'gut'")
+
+ def test__init__custom_attributes(self):
+ """Test `__new__()` and `__init__()` with custom encoding."""
+ os8 = du_io.OutString('Grüße', encoding='utf-8')
+ self.assertEqual(str(os8), 'Grüße')
+ self.assertEqual(bytes(os8), b'Gr\xc3\xbc\xc3\x9fe')
+ self.assertEqual(repr(os8), "OutString('Grüße', encoding='utf-8')")
+ # With known encoding, "bytes-like" objects are decoded
+ bos1 = du_io.OutString(b'Gr\xfc\xdfe', encoding='latin1')
+ self.assertEqual(str(bos1), 'Grüße')
+ self.assertEqual(bytes(bos1), b'Gr\xfc\xdfe')
+ # Invalid encodings (including the empty string) raise an error
+ with self.assertRaises(LookupError):
+ du_io.OutString(b'Gr\xfc\xdfe', encoding='')
+
+ def test__init__custom_errors(self):
+ """Test `__new__()` and `__init__()` with custom `errors`."""
+ ts8_r = du_io.OutString('Grüße', encoding='utf-8', errors='replace')
+ # Encoding uses the stored error handler:
+ self.assertEqual(ts8_r.encode('ascii'), b'Gr??e')
+ # Initialization with a `bytes` object uses the error handler, too:
+ bts8_r = du_io.OutString(b'Gr\xfc\xdfe', encoding='utf-8',
+ errors='replace')
+ self.assertEqual(str(bts8_r), 'Gr��e')
+
+
class ErrorOutputTests(unittest.TestCase):
def test_defaults(self):
e = du_io.ErrorOutput()
diff --git a/docutils/test/test_publisher.py b/docutils/test/test_publisher.py
index 492cdbac1..c3b6394c4 100755
--- a/docutils/test/test_publisher.py
+++ b/docutils/test/test_publisher.py
@@ -99,7 +99,8 @@ class PublisherTests(unittest.TestCase):
core.publish_cmdline(argv=['-', 'dest_name'],
settings_overrides=settings)
- def test_publish_string(self):
+ def test_publish_string_input_encoding(self):
+ """Test handling of encoded input."""
# Transparently decode `bytes` source (with "input_encoding" setting)
# default: auto-detect, fallback utf-8
# Output is encoded according to "output_encoding" setting.
@@ -121,6 +122,24 @@ class PublisherTests(unittest.TestCase):
settings_overrides=settings)
self.assertTrue(output.endswith('Grüße\n'))
+ def test_publish_string_output_encoding(self):
+ settings = {'_disable_config': True,
+ 'datestamp': False,
+ 'output_encoding': 'latin1',
+ 'output_encoding_error_handler': 'replace'}
+ source = 'Grüß → dich'
+ expected = ('<document source="<string>">\n'
+ ' <paragraph>\n'
+ ' Grüß → dich\n')
+ # current default: encode output, return `bytes`
+ output = core.publish_string(source, settings_overrides=settings)
+ self.assertEqual(output, expected.encode('latin1', 'replace'))
+ # no encoding if `auto_encode` is False:
+ output = core.publish_string(source, settings_overrides=settings,
+ auto_encode=False)
+ self.assertEqual(output, expected)
+ # self.assertEqual(output.encoding, 'latin1')
+
class PublishDoctreeTestCase(unittest.TestCase, docutils.SettingsSpec):