Add "auto_encode" argument to `publish_string()`

Add "auto_encode" argument to publish_string() and publish_programmatically() to give the user an option to select the output type (`bytes` or `OutString`) in a way that does not interfere with the intended encoding of the output (the problem with the "dummy" output encoding name ``unicode``). The default will change from ``True`` to ``False`` in Docutils 0.22. New class for `io.StringOutput`: `io.OutString` adds "encoding" and "errors" attributes to `str`. Allows storing the "output_encoding" and "output_encoding_error_handler" settings in a transparent and easy to process way. git-svn-id: https://svn.code.sf.net/p/docutils/code/trunk@9336 929543f6-e4f2-0310-98a6-ba3bd3dd1d04
author: milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> 2023-04-06 18:41:10 +0000
committer: milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04> 2023-04-06 18:41:10 +0000
commit: a1b8f453f4e75b0b2d06cca1b8782645cef57285 (patch)
tree: b62dc034e10447c7db611b42c6d06270ac08598c
parent: 4fa49d410aa413cbc1799047616e68c8b1ac0b9d (diff)
download: docutils-a1b8f453f4e75b0b2d06cca1b8782645cef57285.tar.gz
6 files changed, 196 insertions, 20 deletions
diff --git a/docutils/HISTORY.txt b/docutils/HISTORY.txt
index ef841514b..46c878fae 100644
--- a/docutils/HISTORY.txt
+++ b/docutils/HISTORY.txt
@@ -25,6 +25,8 @@ Release 0.20b (unpublished)
 
   - Added new `publish_bytes()` function to explicitly return
     output as binary data in a `bytes` object.
+  - New argument "auto_encode" for `publish_string()` and
+    `publish_programmatically()`.
   - New functions `rst2…()` for use as "console_scripts" `entry point`_.
 
 * docutils/frontend.py
@@ -32,6 +34,11 @@ Release 0.20b (unpublished)
   - New setting ``output``. Obsoletes the ``<destination>`` positional
     argument (cf. "Future changes" in the RELEASE-NOTES).
 
+* docutils/io.py
+
+  - New `str` sub-class `io.OutString` with "encoding" and "errors"
+    attributes.
+
 * docutils/languages/
   docutils/parsers/rst/languages/
 
diff --git a/docutils/RELEASE-NOTES.txt b/docutils/RELEASE-NOTES.txt
index 24539bad7..29deb9189 100644
--- a/docutils/RELEASE-NOTES.txt
+++ b/docutils/RELEASE-NOTES.txt
@@ -131,6 +131,10 @@ Drop support for Python 3.7 and 3.8 in Docutils 0.21.
   - Remove ``use_verbatim_when_possible`` setting
     (use literal_block_env_: verbatim) in Docutils 2.0.
 
+* The default value of the `auto_encode` argument of
+  `core.publish_string()` and `core.publish_programmatically()`
+  will change to ``False`` in Docutils 0.22.
+
 * Remove the "rawsource" argument from `nodes.Text.__init__()`
   (deprecated and ignored since Docutils 0.18) in Docutils 2.0.
 
@@ -188,6 +192,11 @@ Release 0.20 (unpublished)
 
     .. _[latex writers]: docs/user/config.html#latex-writers
 
+* The new function argument `auto_encode` for `core.publish_string()` and
+  `core.publish_programmatically()` selects whether the output document is
+  encoded and returned as a `bytes` instance.  The default will change to
+  ``False`` in Docutils 0.22.
+
 * Bugfixes and improvements (see HISTORY_).
 
 
diff --git a/docutils/docutils/core.py b/docutils/docutils/core.py
index b369a9cfe..d9a769557 100644
--- a/docutils/docutils/core.py
+++ b/docutils/docutils/core.py
@@ -435,26 +435,24 @@ def publish_string(source, source_path=None, destination_path=None,
                    writer=None, writer_name='pseudoxml',
                    settings=None, settings_spec=None,
                    settings_overrides=None, config_section=None,
-                   enable_exit_status=False):
+                   enable_exit_status=False,
+                   auto_encode=True):
     """
     Set up & run a `Publisher` for programmatic use with string I/O.
 
     Accepts a `bytes` or `str` instance as `source`.
-    The output is encoded according to the "output_encoding" setting;
-    the return value is a `bytes` instance (unless `output_encoding`_
-    is "unicode", see below).
 
-    To get Docutils output as `str` instance, use `publish_parts()`::
+    If `auto_encode` is True, the output is encoded according to the
+    `output_encoding`_ setting; the return value is a `bytes` instance
+    (unless `output_encoding`_ is "unicode",
+    cf. `docutils.io.StringOutput.write()`).
 
-      output = publish_parts(...)['whole']
+    If `auto_encode` is False, the output is an instance of a `str`
+    sub-class with "output_encoding" and "output_encoding_error_handler"
+    settings stored as `encoding` and `errors` attributes.
 
-    or set `output_encoding`_ to the pseudo encoding name "unicode", e.g.::
-
-      publish_string(..., settings_overrides={'output_encoding': 'unicode'})
-
-    Beware that the `output_encoding`_ setting may affect the content
-    of the output (e.g. an encoding declaration in HTML or XML or the
-    representation of characters as LaTeX macro vs. literal character).
+    The default value of `auto_encode` will change to ``False`` in
+    Docutils 0.22.
 
     Parameters: see `publish_programmatically()`.
 
@@ -471,7 +469,8 @@ def publish_string(source, source_path=None, destination_path=None,
         settings=settings, settings_spec=settings_spec,
         settings_overrides=settings_overrides,
         config_section=config_section,
-        enable_exit_status=enable_exit_status)
+        enable_exit_status=enable_exit_status,
+        auto_encode=auto_encode)
     return output
 
 
@@ -662,7 +661,8 @@ def publish_programmatically(source_class, source, source_path,
                              writer, writer_name,
                              settings, settings_spec,
                              settings_overrides, config_section,
-                             enable_exit_status):
+                             enable_exit_status,
+                             auto_encode=True):
     """
     Set up & run a `Publisher` for custom programmatic use.
 
@@ -754,6 +754,10 @@ def publish_programmatically(source_class, source, source_path,
       defined by `settings_spec`.  Used only if no `settings` specified.
 
     * `enable_exit_status`: Boolean; enable exit status at end of processing?
+
+    * `auto_encode`: Boolean; encode string output and return `bytes`?
+      Ignored with `io.FileOutput`.
+      The default value will change to ``False`` in Docutils 0.22.
     """
     publisher = Publisher(reader, parser, writer, settings=settings,
                           source_class=source_class,
@@ -763,6 +767,8 @@ def publish_programmatically(source_class, source, source_path,
         settings_spec, settings_overrides, config_section)
     publisher.set_source(source, source_path)
     publisher.set_destination(destination, destination_path)
+    if not auto_encode and isinstance(publisher.destination, io.StringOutput):
+        publisher.destination.auto_encode = auto_encode
     output = publisher.publish(enable_exit_status=enable_exit_status)
     return output, publisher
 
diff --git a/docutils/docutils/io.py b/docutils/docutils/io.py
index c291f40cb..68fde9ae0 100644
--- a/docutils/docutils/io.py
+++ b/docutils/docutils/io.py
@@ -74,6 +74,57 @@ def error_string(err):
     return f'{err.__class__.__name__}: {err}'
 
 
+class OutString(str):
+    """Return a string representation of `object` with known encoding.
+
+    Differences to `str()`:
+
+    If the `encoding` is given, both `str` instances and byte-like objects
+    are stored as text string, the latter decoded with `encoding` and
+    `errors` (defaulting to 'strict').
+
+    The encoding is never guessed. If `encoding` is None (the default),
+    an informal string representation is used, also if `errors` are given.
+
+    The original or intended encoding and error handler are stored in the
+    attributes `encoding` and `errors`.
+    Typecasting to `bytes` uses the stored values.
+    """
+
+    def __new__(cls, object, encoding=None, errors='strict'):
+        """Return a new OutString object.
+
+        Provisional.
+        """
+        try:
+            # decode bytes-like objects if encoding is known
+            return super().__new__(cls, object, encoding, errors)
+        except TypeError:
+            return super().__new__(cls, object)
+
+    def __init__(self, object, encoding=None, errors='strict'):
+        """Set "encoding" and "errors" attributes."""
+        self.encoding = encoding
+        self.errors = errors
+
+    def __bytes__(self):
+        try:
+            return super().encode(self.encoding, self.errors)
+        except TypeError:
+            raise TypeError('OutString instance without known encoding')
+
+    def __repr__(self):
+        if self.errors != 'strict':
+            errors_arg = f', errors={self.errors!r}'
+        else:
+            errors_arg = ''
+        return (f'{self.__class__.__name__}({super().__repr__()}, '
+                f'encoding={self.encoding!r}{errors_arg})')
+
+    def encode(self, encoding=None, errors=None):
+        return super().encode(encoding or self.encoding, errors or self.errors)
+
+
 class Input(TransformSpec):
     """
     Abstract base class for input wrappers.
@@ -264,14 +315,14 @@ class Output(TransformSpec):
         raise NotImplementedError
 
     def encode(self, data):
-        """Encode and return `data`.
+        """
+        Encode and return `data`.
 
         If `data` is a `bytes` instance, it is returned unchanged.
         Otherwise it is encoded with `self.encoding`.
 
         If `self.encoding` is set to the pseudo encoding name "unicode",
         `data` must be a `str` instance and is returned unchanged.
-
         """
         if self.encoding and self.encoding.lower() == 'unicode':
             assert isinstance(data, str), ('output encoding is "unicode" '
@@ -616,14 +667,39 @@ class StringOutput(Output):
 
     default_destination_path = '<string>'
 
+    def __init__(self, destination=None, destination_path=None,
+                 encoding=None, error_handler='strict', auto_encode=True):
+        self.auto_encode = auto_encode
+        """Let `write()` encode the output document and return `bytes`."""
+        super().__init__(destination, destination_path,
+                         encoding, error_handler)
+
     def write(self, data):
-        """Encode `data`, store it in `self.destination`, and return it.
+        """Store `data` in `self.destination`, and return it.
+
+        If `self.auto_encode` is False, store and return a `str`
+        sub-class instance with "encoding" and "errors" attributes
+        set to `self.encoding` and `self.error_handler`.
 
+        If `self.auto_encode` is True, encode `data` with `self.encoding`
+        and `self.error_handler` and store/return a `bytes` instance.
+        Exception:
         If `self.encoding` is set to the pseudo encoding name "unicode",
         `data` must be a `str` instance and is returned unchanged
         (cf. `Output.encode`).
+        Beware that the `output_encoding`_ setting may affect the content
+        of the output (e.g. an encoding declaration in HTML or XML or the
+        representation of characters as LaTeX macro vs. literal character).
         """
-        self.destination = self.encode(data)
+        if self.auto_encode:
+            self.destination = self.encode(data)
+            return self.destination
+
+        if not self.encoding or self.encoding.lower() == 'unicode':
+            encoding = None
+        else:
+            encoding = self.encoding
+        self.destination = OutString(data, encoding, self.error_handler)
         return self.destination
 
 
diff --git a/docutils/test/test_io.py b/docutils/test/test_io.py
index b1e55a148..6c8c70254 100755
--- a/docutils/test/test_io.py
+++ b/docutils/test/test_io.py
@@ -190,6 +190,19 @@ class OutputTests(unittest.TestCase):
         fo.write(self.udata)
         self.assertEqual(self.udrain.getvalue(), self.udata)
 
+    def test_write_auto_encode_false(self):
+        so = du_io.StringOutput(encoding='latin1', error_handler='replace',
+                                auto_encode=False)
+        output = so.write(self.udata)
+        # store output in self.destination and also return it
+        self.assertEqual(output, self.udata)
+        self.assertEqual(so.destination, self.udata)
+        # store also encoding and encoding error handler ...
+        self.assertEqual(output.encoding, 'latin1')
+        self.assertEqual(output.errors, 'replace')
+        # ... to allow easy conversion to `bytes`:
+        self.assertEqual(bytes(output), self.bdata)
+
     def test_FileOutput_hande_io_errors_deprection_warning(self):
         with self.assertWarnsRegex(DeprecationWarning,
                                    '"handle_io_errors" is ignored'):
@@ -225,6 +238,52 @@ class OutputTests(unittest.TestCase):
         self.assertRaises(ValueError, fo.write, self.udata)
 
 
+class OutStringTests(unittest.TestCase):
+
+    def test__init__defaults(self):
+        """Test `__new__()` and `__init__()` with default values."""
+
+        os = du_io.OutString('Grüße')
+        self.assertEqual(str(os), 'Grüße')
+        self.assertEqual(os.encoding, None)
+        self.assertEqual(os.errors, 'strict')
+        # converting to `bytes` fails if the encoding is not known:
+        with self.assertRaises(TypeError):
+            self.assertEqual(bytes(os), 'Grüße')
+        # without known encoding, `bytes` and other incompatible types
+        # are converted to their string representation ...
+        bos = du_io.OutString(b'gut')
+        self.assertEqual(str(bos), "b'gut'")
+        bos_e = du_io.OutString('Grüße'.encode('latin1'), errors='ignore')
+        self.assertEqual(str(bos_e), r"b'Gr\xfc\xdfe'")
+        bos = du_io.OutString(b'gut', encoding=None)
+        self.assertEqual(str(bos), "b'gut'")
+
+    def test__init__custom_attributes(self):
+        """Test `__new__()` and `__init__()` with custom encoding."""
+        os8 = du_io.OutString('Grüße', encoding='utf-8')
+        self.assertEqual(str(os8), 'Grüße')
+        self.assertEqual(bytes(os8), b'Gr\xc3\xbc\xc3\x9fe')
+        self.assertEqual(repr(os8), "OutString('Grüße', encoding='utf-8')")
+        # With known encoding, "bytes-like" objects are decoded
+        bos1 = du_io.OutString(b'Gr\xfc\xdfe', encoding='latin1')
+        self.assertEqual(str(bos1), 'Grüße')
+        self.assertEqual(bytes(bos1), b'Gr\xfc\xdfe')
+        # Invalid encodings (including the empty string) raise an error
+        with self.assertRaises(LookupError):
+            du_io.OutString(b'Gr\xfc\xdfe', encoding='')
+
+    def test__init__custom_errors(self):
+        """Test `__new__()` and `__init__()` with custom `errors`."""
+        ts8_r = du_io.OutString('Grüße', encoding='utf-8', errors='replace')
+        # Encoding uses the stored error handler:
+        self.assertEqual(ts8_r.encode('ascii'), b'Gr??e')
+        # Initialization with a `bytes` object uses the error handler, too:
+        bts8_r = du_io.OutString(b'Gr\xfc\xdfe', encoding='utf-8',
+                                 errors='replace')
+        self.assertEqual(str(bts8_r), 'Gr��e')
+
+
 class ErrorOutputTests(unittest.TestCase):
     def test_defaults(self):
         e = du_io.ErrorOutput()
diff --git a/docutils/test/test_publisher.py b/docutils/test/test_publisher.py
index 492cdbac1..c3b6394c4 100755
--- a/docutils/test/test_publisher.py
+++ b/docutils/test/test_publisher.py
@@ -99,7 +99,8 @@ class PublisherTests(unittest.TestCase):
             core.publish_cmdline(argv=['-', 'dest_name'],
                                  settings_overrides=settings)
 
-    def test_publish_string(self):
+    def test_publish_string_input_encoding(self):
+        """Test handling of encoded input."""
         # Transparently decode `bytes` source (with "input_encoding" setting)
         # default: auto-detect, fallback utf-8
         # Output is encoded according to "output_encoding" setting.
@@ -121,6 +122,24 @@ class PublisherTests(unittest.TestCase):
                                      settings_overrides=settings)
         self.assertTrue(output.endswith('Grüße\n'))
 
+    def test_publish_string_output_encoding(self):
+        settings = {'_disable_config': True,
+                    'datestamp': False,
+                    'output_encoding': 'latin1',
+                    'output_encoding_error_handler': 'replace'}
+        source = 'Grüß → dich'
+        expected = ('<document source="<string>">\n'
+                    '    <paragraph>\n'
+                    '        Grüß → dich\n')
+        # current default: encode output, return `bytes`
+        output = core.publish_string(source, settings_overrides=settings)
+        self.assertEqual(output, expected.encode('latin1', 'replace'))
+        # no encoding if `auto_encode` is False:
+        output = core.publish_string(source, settings_overrides=settings,
+                                     auto_encode=False)
+        self.assertEqual(output, expected)
+        # self.assertEqual(output.encoding, 'latin1')
+
 
 class PublishDoctreeTestCase(unittest.TestCase, docutils.SettingsSpec):
author	milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04>	2023-04-06 18:41:10 +0000
committer	milde <milde@929543f6-e4f2-0310-98a6-ba3bd3dd1d04>	2023-04-06 18:41:10 +0000
commit	a1b8f453f4e75b0b2d06cca1b8782645cef57285 (patch)
tree	b62dc034e10447c7db611b42c6d06270ac08598c
parent	4fa49d410aa413cbc1799047616e68c8b1ac0b9d (diff)
download	docutils-a1b8f453f4e75b0b2d06cca1b8782645cef57285.tar.gz