diff options
author | Zuul <zuul@review.openstack.org> | 2017-12-04 15:48:15 +0000 |
---|---|---|
committer | Gerrit Code Review <review@openstack.org> | 2017-12-04 15:48:15 +0000 |
commit | e14f905a4fa79817bd73a103c6a677fe344b0983 (patch) | |
tree | 7b6931c05ae06bc462bd28cb7b3fe839ac9b9c7f | |
parent | 10963e0abad1a69118df61bc77aa23b0a9c29e3c (diff) | |
parent | c61cc30060ca56257ca3504153578e02e68e7f0a (diff) | |
download | cliff-e14f905a4fa79817bd73a103c6a677fe344b0983.tar.gz |
Merge "Fix codec error when format=csv"
-rw-r--r-- | cliff/app.py | 4 | ||||
-rw-r--r-- | cliff/formatters/commaseparated.py | 23 | ||||
-rw-r--r-- | cliff/tests/test_app.py | 50 | ||||
-rw-r--r-- | cliff/utils.py | 52 |
4 files changed, 122 insertions, 7 deletions
diff --git a/cliff/app.py b/cliff/app.py index c632f4f..62f822e 100644 --- a/cliff/app.py +++ b/cliff/app.py @@ -118,10 +118,10 @@ class App(object): stdin = codecs.getreader(encoding)(sys.stdin) if not (stdout or isinstance(sys.stdout, codecs.StreamWriter)): - stdout = codecs.getwriter(encoding)(sys.stdout) + stdout = utils.getwriter(encoding)(sys.stdout) if not (stderr or isinstance(sys.stderr, codecs.StreamWriter)): - stderr = codecs.getwriter(encoding)(sys.stderr) + stderr = utils.getwriter(encoding)(sys.stderr) self.stdin = stdin or sys.stdin self.stdout = stdout or sys.stdout diff --git a/cliff/formatters/commaseparated.py b/cliff/formatters/commaseparated.py index 46a7bc5..c3511b4 100644 --- a/cliff/formatters/commaseparated.py +++ b/cliff/formatters/commaseparated.py @@ -47,11 +47,24 @@ class CSVLister(ListFormatter): ) def emit_list(self, column_names, data, stdout, parsed_args): - writer = csv.writer(stdout, - quoting=self.QUOTE_MODES[parsed_args.quote_mode], - lineterminator=os.linesep, - escapechar='\\', - ) + writer_kwargs = dict( + quoting=self.QUOTE_MODES[parsed_args.quote_mode], + lineterminator=os.linesep, + escapechar='\\', + ) + + # In Py2 we replace the csv module with unicodecsv because the + # Py2 csv module cannot handle unicode. unicodecsv encodes + # unicode objects based on the value of its encoding keyword + # with the result unicodecsv emits encoded bytes in a str + # object. The utils.getwriter assures no attempt is made to + # re-encode the encoded bytes in the str object. 
+ + if six.PY2: + writer_kwargs['encoding'] = (getattr(stdout, 'encoding', None) + or 'utf-8') + + writer = csv.writer(stdout, **writer_kwargs) writer.writerow(column_names) for row in data: writer.writerow( diff --git a/cliff/tests/test_app.py b/cliff/tests/test_app.py index e26e4f7..35f19c4 100644 --- a/cliff/tests/test_app.py +++ b/cliff/tests/test_app.py @@ -498,3 +498,53 @@ class TestIO(base.TestBase): self.assertIs(sys.stdin, app.stdin) self.assertIs(sys.stdout, app.stdout) self.assertIs(io, app.stderr) + + def test_writer_encoding(self): + # The word "test" with the e replaced by + # Unicode latin small letter e with acute, + # U+00E9, utf-8 encoded as 0xC3 0xA9 + text = u't\u00E9st' + text_utf8 = text.encode('utf-8') + + if six.PY2: + # In PY2 StreamWriter can't accept non-ASCII encoded characters + # because it must first promote the encoded byte stream to + # unicode in order to encode it in the desired encoding. + # Because the encoding of the byte stream is not known at this + # point the default-encoding of ASCII is utilized, but you can't + # decode a non-ASCII character to ASCII. + io = six.StringIO() + writer = codecs.getwriter('utf-8')(io) + self.assertRaises(UnicodeDecodeError, + writer.write, + text_utf8) + + # In PY2 with our override of codecs.getwriter we do not + # attempt to encode bytes in a str object (only unicode + # objects) therefore the final output string should be the + # utf-8 encoded byte sequence + io = six.StringIO() + writer = utils.getwriter('utf-8')(io) + writer.write(text) + output = io.getvalue() + self.assertEqual(text_utf8, output) + + io = six.StringIO() + writer = utils.getwriter('utf-8')(io) + writer.write(text_utf8) + output = io.getvalue() + self.assertEqual(text_utf8, output) + else: + # In PY3 you can't write encoded bytes to a text writer + # instead text functions require text. 
+ io = six.StringIO() + writer = utils.getwriter('utf-8')(io) + self.assertRaises(TypeError, + writer.write, + text) + + io = six.StringIO() + writer = utils.getwriter('utf-8')(io) + self.assertRaises(TypeError, + writer.write, + text_utf8) diff --git a/cliff/utils.py b/cliff/utils.py index 50f3ab6..a9ee975 100644 --- a/cliff/utils.py +++ b/cliff/utils.py @@ -11,11 +11,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +import codecs import ctypes import os import struct import sys +import six + # Each edit operation is assigned different cost, such as: # 'w' means swap operation, the cost is 0; # 's' means substitution operation, the cost is 2; @@ -153,3 +156,52 @@ def _get_terminal_width_ioctl(stdout): return columns except IOError: return None + + +if six.PY2: + def getwriter(encoding): + '''Override codecs.getwriter() to prevent codec errors. + + The StreamWriter returned by codecs.getwriter has an unfortunate + property, it will attempt to encode every object presented to its + write() function. Normally we only want unicode objects to be + encoded to a byte stream. If bytes are presented (e.g. str in + Python2) we make the assumption those bytes represent an already + encoded text stream or they are indeed binary bytes and hence + should not be encoded. + + When the core StreamWriter attempts to encode a str object Python + will first promote the str object to a unicode object. The + promotion of str to unicode requires the str bytes to be + decoded. However the encoding associated with the str object is + not known therefore Python applies the default-encoding which is + ASCII. In the case where the str object contains utf-8 encoded + non-ASCII characters a decoding error is raised. By not attempting + to encode a byte stream we avoid this error. + + It really does not make much sense to try and encode a byte + stream. First of all a byte stream should not be encoded if it's + not text (e.g. 
binary data). If the byte stream is encoded text + the only way to re-encode it is if we know its encoding so we + can decode it into a canonical form (e.g. unicode). Thus to + re-encode it we encode from the canonical form (e.g. unicode) to + the new binary encoding. The problem in Python2 is we never know + if the bytes in a str object are text or binary data and if it's + text which encoding it is, hence we should not try to apply + an encoding to a str object. + ''' + class _StreamWriter(codecs.StreamWriter): + def __init__(self, stream, errors='strict'): + codecs.StreamWriter.__init__(self, stream, errors) + + def encode(self, msg, errors='strict'): + if isinstance(msg, six.text_type): + return self.encoder(msg, errors) + return msg, len(msg) + + _StreamWriter.encoder = codecs.getencoder(encoding) + _StreamWriter.encoding = encoding + return _StreamWriter + +else: + getwriter = codecs.getwriter |