summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorZuul <zuul@review.openstack.org>2017-12-04 15:48:15 +0000
committerGerrit Code Review <review@openstack.org>2017-12-04 15:48:15 +0000
commite14f905a4fa79817bd73a103c6a677fe344b0983 (patch)
tree7b6931c05ae06bc462bd28cb7b3fe839ac9b9c7f
parent10963e0abad1a69118df61bc77aa23b0a9c29e3c (diff)
parentc61cc30060ca56257ca3504153578e02e68e7f0a (diff)
downloadcliff-e14f905a4fa79817bd73a103c6a677fe344b0983.tar.gz
Merge "Fix codec error when format=csv"
-rw-r--r--cliff/app.py4
-rw-r--r--cliff/formatters/commaseparated.py23
-rw-r--r--cliff/tests/test_app.py50
-rw-r--r--cliff/utils.py52
4 files changed, 122 insertions, 7 deletions
diff --git a/cliff/app.py b/cliff/app.py
index c632f4f..62f822e 100644
--- a/cliff/app.py
+++ b/cliff/app.py
@@ -118,10 +118,10 @@ class App(object):
stdin = codecs.getreader(encoding)(sys.stdin)
if not (stdout or isinstance(sys.stdout, codecs.StreamWriter)):
- stdout = codecs.getwriter(encoding)(sys.stdout)
+ stdout = utils.getwriter(encoding)(sys.stdout)
if not (stderr or isinstance(sys.stderr, codecs.StreamWriter)):
- stderr = codecs.getwriter(encoding)(sys.stderr)
+ stderr = utils.getwriter(encoding)(sys.stderr)
self.stdin = stdin or sys.stdin
self.stdout = stdout or sys.stdout
diff --git a/cliff/formatters/commaseparated.py b/cliff/formatters/commaseparated.py
index 46a7bc5..c3511b4 100644
--- a/cliff/formatters/commaseparated.py
+++ b/cliff/formatters/commaseparated.py
@@ -47,11 +47,24 @@ class CSVLister(ListFormatter):
)
def emit_list(self, column_names, data, stdout, parsed_args):
- writer = csv.writer(stdout,
- quoting=self.QUOTE_MODES[parsed_args.quote_mode],
- lineterminator=os.linesep,
- escapechar='\\',
- )
+ writer_kwargs = dict(
+ quoting=self.QUOTE_MODES[parsed_args.quote_mode],
+ lineterminator=os.linesep,
+ escapechar='\\',
+ )
+
+ # In Py2 we replace the csv module with unicodecsv because the
+ # Py2 csv module cannot handle unicode. unicodecsv encodes
+ # unicode objects based on the value of its encoding keyword,
+ # with the result that unicodecsv emits encoded bytes in a str
+ # object. The utils.getwriter ensures no attempt is made to
+ # re-encode the encoded bytes in the str object.
+
+ if six.PY2:
+ writer_kwargs['encoding'] = (getattr(stdout, 'encoding', None)
+ or 'utf-8')
+
+ writer = csv.writer(stdout, **writer_kwargs)
writer.writerow(column_names)
for row in data:
writer.writerow(
diff --git a/cliff/tests/test_app.py b/cliff/tests/test_app.py
index e26e4f7..35f19c4 100644
--- a/cliff/tests/test_app.py
+++ b/cliff/tests/test_app.py
@@ -498,3 +498,53 @@ class TestIO(base.TestBase):
self.assertIs(sys.stdin, app.stdin)
self.assertIs(sys.stdout, app.stdout)
self.assertIs(io, app.stderr)
+
+ def test_writer_encoding(self):
+ # The word "test" with the e replaced by
+ # Unicode latin small letter e with acute,
+ # U+00E9, utf-8 encoded as 0xC3 0xA9
+ text = u't\u00E9st'
+ text_utf8 = text.encode('utf-8')
+
+ if six.PY2:
+ # In PY2 StreamWriter can't accept non-ASCII encoded characters
+ # because it must first promote the encoded byte stream to
+ # unicode in order to encode it in the desired encoding.
+ # Because the encoding of the byte stream is not known at this
+ # point the default-encoding of ASCII is utilized, but you can't
+ # decode a non-ASCII character to ASCII.
+ io = six.StringIO()
+ writer = codecs.getwriter('utf-8')(io)
+ self.assertRaises(UnicodeDecodeError,
+ writer.write,
+ text_utf8)
+
+ # In PY2 with our override of codecs.getwriter we do not
+ # attempt to encode bytes in a str object (only unicode
+ # objects) therefore the final output string should be the
+ # utf-8 encoded byte sequence
+ io = six.StringIO()
+ writer = utils.getwriter('utf-8')(io)
+ writer.write(text)
+ output = io.getvalue()
+ self.assertEqual(text_utf8, output)
+
+ io = six.StringIO()
+ writer = utils.getwriter('utf-8')(io)
+ writer.write(text_utf8)
+ output = io.getvalue()
+ self.assertEqual(text_utf8, output)
+ else:
+ # In PY3 you can't write encoded bytes to a text writer;
+ # instead, text functions require text.
+ io = six.StringIO()
+ writer = utils.getwriter('utf-8')(io)
+ self.assertRaises(TypeError,
+ writer.write,
+ text)
+
+ io = six.StringIO()
+ writer = utils.getwriter('utf-8')(io)
+ self.assertRaises(TypeError,
+ writer.write,
+ text_utf8)
diff --git a/cliff/utils.py b/cliff/utils.py
index 50f3ab6..a9ee975 100644
--- a/cliff/utils.py
+++ b/cliff/utils.py
@@ -11,11 +11,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import codecs
import ctypes
import os
import struct
import sys
+import six
+
# Each edit operation is assigned different cost, such as:
# 'w' means swap operation, the cost is 0;
# 's' means substitution operation, the cost is 2;
@@ -153,3 +156,52 @@ def _get_terminal_width_ioctl(stdout):
return columns
except IOError:
return None
+
+
+if six.PY2:
+ def getwriter(encoding):
+ '''Override codecs.getwriter() to prevent codec errors.
+
+ The StreamWriter returned by codecs.getwriter has an unfortunate
+ property: it will attempt to encode every object presented to its
+ write() function. Normally we only want unicode objects to be
+ encoded to a byte stream. If bytes are presented (e.g. str in
+ Python2) we make the assumption those bytes represent an already
+ encoded text stream or they are indeed binary bytes and hence
+ should not be encoded.
+
+ When the core StreamWriter attempts to encode a str object Python
+ will first promote the str object to a unicode object. The
+ promotion of str to unicode requires the str bytes to be
+ decoded. However the encoding associated with the str object is
+ not known therefore Python applies the default-encoding which is
+ ASCII. In the case where the str object contains utf-8 encoded
+ non-ASCII characters a decoding error is raised. By not attempting
+ to encode a byte stream we avoid this error.
+
+ It really does not make much sense to try and encode a byte
+ stream. First of all a byte stream should not be encoded if it's
+ not text (e.g. binary data). If the byte stream is encoded text
+ the only way to re-encode it is if we know its encoding so we
+ can decode it into a canonical form (e.g. unicode). Thus to
+ re-encode it we encode from the canonical form (e.g. unicode) to
+ the new binary encoding. The problem in Python2 is we never know
+ if the bytes in a str object are text or binary data and if it's
+ text which encoding it is, hence we should not try to apply
+ an encoding to a str object.
+ '''
+ class _StreamWriter(codecs.StreamWriter):
+ def __init__(self, stream, errors='strict'):
+ codecs.StreamWriter.__init__(self, stream, errors)
+
+ def encode(self, msg, errors='strict'):
+ if isinstance(msg, six.text_type):
+ return self.encoder(msg, errors)
+ return msg, len(msg)
+
+ _StreamWriter.encoder = codecs.getencoder(encoding)
+ _StreamWriter.encoding = encoding
+ return _StreamWriter
+
+else:
+ getwriter = codecs.getwriter