diff options
author | milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> | 2023-04-06 18:41:10 +0000 |
---|---|---|
committer | milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> | 2023-04-06 18:41:10 +0000 |
commit | a1b8f453f4e75b0b2d06cca1b8782645cef57285 (patch) | |
tree | b62dc034e10447c7db611b42c6d06270ac08598c | |
parent | 4fa49d410aa413cbc1799047616e68c8b1ac0b9d (diff) | |
download | docutils-a1b8f453f4e75b0b2d06cca1b8782645cef57285.tar.gz |
Add "auto_encode" argument to `publish_string()`
Add "auto_encode" argument to publish_string() and
publish_programmatically() to give the user an option to select the
output type (`bytes` or `OutString`) in a way that does not interfere
with the intended encoding of the output
(the problem with the "dummy" output encoding name ``unicode``).
The default will change from ``True`` to ``False`` in Docutils 0.22.
New class for `io.StringOutput`: `io.OutString` adds "encoding"
and "errors" attributes to `str`.
Allows storing the "output_encoding" and "output_encoding_error_handler"
settings in a transparent and easy to process way.
git-svn-id: https://svn.code.sf.net/p/docutils/code/trunk@9336 929543f6-e4f2-0310-98a6-ba3bd3dd1d04
-rw-r--r-- | docutils/HISTORY.txt | 7 | ||||
-rw-r--r-- | docutils/RELEASE-NOTES.txt | 9 | ||||
-rw-r--r-- | docutils/docutils/core.py | 36 | ||||
-rw-r--r-- | docutils/docutils/io.py | 84 | ||||
-rwxr-xr-x | docutils/test/test_io.py | 59 | ||||
-rwxr-xr-x | docutils/test/test_publisher.py | 21 |
6 files changed, 196 insertions, 20 deletions
diff --git a/docutils/HISTORY.txt b/docutils/HISTORY.txt index ef841514b..46c878fae 100644 --- a/docutils/HISTORY.txt +++ b/docutils/HISTORY.txt @@ -25,6 +25,8 @@ Release 0.20b (unpublished) - Added new `publish_bytes()` function to explicitly return output as binary data in a `bytes` object. + - New argument "auto_encode" for `publish_string()` and + `publish_programmatically()`. - New functions `rst2…()` for use as "console_scripts" `entry point`_. * docutils/frontend.py @@ -32,6 +34,11 @@ Release 0.20b (unpublished) - New setting ``output``. Obsoletes the ``<destination>`` positional argument (cf. "Future changes" in the RELEASE-NOTES). +* docutils/io.py + + - New `str` sub-class `io.OutString` with "encoding" and "errors" + attributes. + * docutils/languages/ docutils/parsers/rst/languages/ diff --git a/docutils/RELEASE-NOTES.txt b/docutils/RELEASE-NOTES.txt index 24539bad7..29deb9189 100644 --- a/docutils/RELEASE-NOTES.txt +++ b/docutils/RELEASE-NOTES.txt @@ -131,6 +131,10 @@ Drop support for Python 3.7 and 3.8 in Docutils 0.21. - Remove ``use_verbatim_when_possible`` setting (use literal_block_env_: verbatim) in Docutils 2.0. +* The default value of the `auto_encode` argument of + `core.publish_string()` and `core.publish_programmatically()` + will change to ``False`` in Docutils 0.22. + * Remove the "rawsource" argument from `nodes.Text.__init__()` (deprecated and ignored since Docutils 0.18) in Docutils 2.0. @@ -188,6 +192,11 @@ Release 0.20 (unpublished) .. _[latex writers]: docs/user/config.html#latex-writers +* The new function argument `auto_encode` for `core.publish_string()` and + `core.publish_programmatically()` selects whether the output document is + encoded and returned as `bytes` instance. The default will change to + ``False`` in Docutils 0.22. + * Bugfixes and improvements (see HISTORY_). 
diff --git a/docutils/docutils/core.py b/docutils/docutils/core.py index b369a9cfe..d9a769557 100644 --- a/docutils/docutils/core.py +++ b/docutils/docutils/core.py @@ -435,26 +435,24 @@ def publish_string(source, source_path=None, destination_path=None, writer=None, writer_name='pseudoxml', settings=None, settings_spec=None, settings_overrides=None, config_section=None, - enable_exit_status=False): + enable_exit_status=False, + auto_encode=True): """ Set up & run a `Publisher` for programmatic use with string I/O. Accepts a `bytes` or `str` instance as `source`. - The output is encoded according to the "output_encoding" setting; - the return value is a `bytes` instance (unless `output_encoding`_ - is "unicode", see below). - To get Docutils output as `str` instance, use `publish_parts()`:: + If `auto_encode` is True, the output is encoded according to the + `output_encoding`_ setting; the return value is a `bytes` instance + (unless `output_encoding`_ is "unicode", + cf. `docutils.io.StringOutput.write()`). - output = publish_parts(...)['whole'] + If `auto_encode` is False, the output is an instance of a `str` + sub-class with "output_encoding" and "output_encoding_error_handler" + settings stored as `encoding` and `errors` attributes. - or set `output_encoding`_ to the pseudo encoding name "unicode", e.g.:: - - publish_string(..., settings_overrides={'output_encoding': 'unicode'}) - - Beware that the `output_encoding`_ setting may affect the content - of the output (e.g. an encoding declaration in HTML or XML or the - representation of characters as LaTeX macro vs. literal character). + The default value of `auto_encode` will change to ``False`` in + Docutils 0.22. Parameters: see `publish_programmatically()`. 
@@ -471,7 +469,8 @@ def publish_string(source, source_path=None, destination_path=None, settings=settings, settings_spec=settings_spec, settings_overrides=settings_overrides, config_section=config_section, - enable_exit_status=enable_exit_status) + enable_exit_status=enable_exit_status, + auto_encode=auto_encode) return output @@ -662,7 +661,8 @@ def publish_programmatically(source_class, source, source_path, writer, writer_name, settings, settings_spec, settings_overrides, config_section, - enable_exit_status): + enable_exit_status, + auto_encode=True): """ Set up & run a `Publisher` for custom programmatic use. @@ -754,6 +754,10 @@ def publish_programmatically(source_class, source, source_path, defined by `settings_spec`. Used only if no `settings` specified. * `enable_exit_status`: Boolean; enable exit status at end of processing? + + * `auto_encode`: Boolean; encode string output and return `bytes`? + Ignored with `io.FileOutput`. + The default value will change to ``False`` in Docutils 0.22. """ publisher = Publisher(reader, parser, writer, settings=settings, source_class=source_class, @@ -763,6 +767,8 @@ def publish_programmatically(source_class, source, source_path, settings_spec, settings_overrides, config_section) publisher.set_source(source, source_path) publisher.set_destination(destination, destination_path) + if not auto_encode and isinstance(publisher.destination, io.StringOutput): + publisher.destination.auto_encode = auto_encode output = publisher.publish(enable_exit_status=enable_exit_status) return output, publisher diff --git a/docutils/docutils/io.py b/docutils/docutils/io.py index c291f40cb..68fde9ae0 100644 --- a/docutils/docutils/io.py +++ b/docutils/docutils/io.py @@ -74,6 +74,57 @@ def error_string(err): return f'{err.__class__.__name__}: {err}' +class OutString(str): + """Return a string representation of `object` with known encoding. 
+ + Differences to `str()`: + + If the `encoding` is given, both `str` instances and byte-like objects + are stored as text string, the latter decoded with `encoding` and + `errors` (defaulting to 'strict'). + + The encoding is never guessed. If `encoding` is None (the default), + an informal string representation is used, also if `errors` are given. + + The original or intended encoding and error handler are stored in the + attributes `encoding` and `errors`. + Typecasting to `bytes` uses the stored values. + """ + + def __new__(cls, object, encoding=None, errors='strict'): + """Return a new OutString object. + + Provisional. + """ + try: + # decode bytes-like objects if encoding is known + return super().__new__(cls, object, encoding, errors) + except TypeError: + return super().__new__(cls, object) + + def __init__(self, object, encoding=None, errors='strict'): + """Set "encoding" and "errors" attributes.""" + self.encoding = encoding + self.errors = errors + + def __bytes__(self): + try: + return super().encode(self.encoding, self.errors) + except TypeError: + raise TypeError('OutString instance without known encoding') + + def __repr__(self): + if self.errors != 'strict': + errors_arg = f', errors={self.errors!r}' + else: + errors_arg = '' + return (f'{self.__class__.__name__}({super().__repr__()}, ' + f'encoding={self.encoding!r}{errors_arg})') + + def encode(self, encoding=None, errors=None): + return super().encode(encoding or self.encoding, errors or self.errors) + + class Input(TransformSpec): """ Abstract base class for input wrappers. @@ -264,14 +315,14 @@ class Output(TransformSpec): raise NotImplementedError def encode(self, data): - """Encode and return `data`. + """ + Encode and return `data`. If `data` is a `bytes` instance, it is returned unchanged. Otherwise it is encoded with `self.encoding`. If `self.encoding` is set to the pseudo encoding name "unicode", `data` must be a `str` instance and is returned unchanged. 
- """ if self.encoding and self.encoding.lower() == 'unicode': assert isinstance(data, str), ('output encoding is "unicode" ' @@ -616,14 +667,39 @@ class StringOutput(Output): default_destination_path = '<string>' + def __init__(self, destination=None, destination_path=None, + encoding=None, error_handler='strict', auto_encode=True): + self.auto_encode = auto_encode + """Let `write()` encode the output document and return `bytes`.""" + super().__init__(destination, destination_path, + encoding, error_handler) + def write(self, data): - """Encode `data`, store it in `self.destination`, and return it. + """Store `data` in `self.destination`, and return it. + + If `self.auto_encode` is False, store and return a `str` + sub-class instance with "encoding" and "errors" attributes + set to `self.encoding` and `self.error_handler`. + If `self.auto_encode` is True, encode `data` with `self.encoding` + and `self.error_handler` and store/return a `bytes` instance. + Exception: If `self.encoding` is set to the pseudo encoding name "unicode", `data` must be a `str` instance and is returned unchanged (cf. `Output.encode`). + Beware that the `output_encoding`_ setting may affect the content + of the output (e.g. an encoding declaration in HTML or XML or the + representation of characters as LaTeX macro vs. literal character). 
""" - self.destination = self.encode(data) + if self.auto_encode: + self.destination = self.encode(data) + return self.destination + + if not self.encoding or self.encoding.lower() == 'unicode': + encoding = None + else: + encoding = self.encoding + self.destination = OutString(data, encoding, self.error_handler) return self.destination diff --git a/docutils/test/test_io.py b/docutils/test/test_io.py index b1e55a148..6c8c70254 100755 --- a/docutils/test/test_io.py +++ b/docutils/test/test_io.py @@ -190,6 +190,19 @@ class OutputTests(unittest.TestCase): fo.write(self.udata) self.assertEqual(self.udrain.getvalue(), self.udata) + def test_write_auto_encode_false(self): + so = du_io.StringOutput(encoding='latin1', error_handler='replace', + auto_encode=False) + output = so.write(self.udata) + # store output in self.destination and also return it + self.assertEqual(output, self.udata) + self.assertEqual(so.destination, self.udata) + # store also encoding and encoding error handler ... + self.assertEqual(output.encoding, 'latin1') + self.assertEqual(output.errors, 'replace') + # ... to allow easy conversion to `bytes`: + self.assertEqual(bytes(output), self.bdata) + def test_FileOutput_hande_io_errors_deprection_warning(self): with self.assertWarnsRegex(DeprecationWarning, '"handle_io_errors" is ignored'): @@ -225,6 +238,52 @@ class OutputTests(unittest.TestCase): self.assertRaises(ValueError, fo.write, self.udata) +class OutStringTests(unittest.TestCase): + + def test__init__defaults(self): + """Test `__new__()` and `__init__()` with default values.""" + + os = du_io.OutString('Grüße') + self.assertEqual(str(os), 'Grüße') + self.assertEqual(os.encoding, None) + self.assertEqual(os.errors, 'strict') + # converting to `bytes` fails if the encoding is not known: + with self.assertRaises(TypeError): + self.assertEqual(bytes(os), 'Grüße') + # without known encoding, `bytes` and other incompatible types + # are converted to their string representation ... 
+ bos = du_io.OutString(b'gut') + self.assertEqual(str(bos), "b'gut'") + bos_e = du_io.OutString('Grüße'.encode('latin1'), errors='ignore') + self.assertEqual(str(bos_e), r"b'Gr\xfc\xdfe'") + bos = du_io.OutString(b'gut', encoding=None) + self.assertEqual(str(bos), "b'gut'") + + def test__init__custom_attributes(self): + """Test `__new__()` and `__init__()` with custom encoding.""" + os8 = du_io.OutString('Grüße', encoding='utf-8') + self.assertEqual(str(os8), 'Grüße') + self.assertEqual(bytes(os8), b'Gr\xc3\xbc\xc3\x9fe') + self.assertEqual(repr(os8), "OutString('Grüße', encoding='utf-8')") + # With known encoding, "bytes-like" objects are decoded + bos1 = du_io.OutString(b'Gr\xfc\xdfe', encoding='latin1') + self.assertEqual(str(bos1), 'Grüße') + self.assertEqual(bytes(bos1), b'Gr\xfc\xdfe') + # Invalid encodings (including the empty string) raise an error + with self.assertRaises(LookupError): + du_io.OutString(b'Gr\xfc\xdfe', encoding='') + + def test__init__custom_errors(self): + """Test `__new__()` and `__init__()` with custom `errors`.""" + ts8_r = du_io.OutString('Grüße', encoding='utf-8', errors='replace') + # Encoding uses the stored error handler: + self.assertEqual(ts8_r.encode('ascii'), b'Gr??e') + # Initialization with a `bytes` object uses the error handler, too: + bts8_r = du_io.OutString(b'Gr\xfc\xdfe', encoding='utf-8', + errors='replace') + self.assertEqual(str(bts8_r), 'Gr��e') + + class ErrorOutputTests(unittest.TestCase): def test_defaults(self): e = du_io.ErrorOutput() diff --git a/docutils/test/test_publisher.py b/docutils/test/test_publisher.py index 492cdbac1..c3b6394c4 100755 --- a/docutils/test/test_publisher.py +++ b/docutils/test/test_publisher.py @@ -99,7 +99,8 @@ class PublisherTests(unittest.TestCase): core.publish_cmdline(argv=['-', 'dest_name'], settings_overrides=settings) - def test_publish_string(self): + def test_publish_string_input_encoding(self): + """Test handling of encoded input.""" # Transparently decode `bytes` source 
(with "input_encoding" setting) # default: auto-detect, fallback utf-8 # Output is encoded according to "output_encoding" setting. @@ -121,6 +122,24 @@ class PublisherTests(unittest.TestCase): settings_overrides=settings) self.assertTrue(output.endswith('Grüße\n')) + def test_publish_string_output_encoding(self): + settings = {'_disable_config': True, + 'datestamp': False, + 'output_encoding': 'latin1', + 'output_encoding_error_handler': 'replace'} + source = 'Grüß → dich' + expected = ('<document source="<string>">\n' + ' <paragraph>\n' + ' Grüß → dich\n') + # current default: encode output, return `bytes` + output = core.publish_string(source, settings_overrides=settings) + self.assertEqual(output, expected.encode('latin1', 'replace')) + # no encoding if `auto_encode` is False: + output = core.publish_string(source, settings_overrides=settings, + auto_encode=False) + self.assertEqual(output, expected) + # self.assertEqual(output.encoding, 'latin1') + class PublishDoctreeTestCase(unittest.TestCase, docutils.SettingsSpec): |