buildlogger: make everything unicode

author: Dan Crosta <dcrosta@10gen.com> 2012-04-11 10:57:25 -0400
committer: Dan Crosta <dcrosta@10gen.com> 2012-04-12 17:36:24 -0400
commit: 5c8053463a38a8c554e0d8a02bef8e5942df8aee (patch)
tree: 46a5b02e7ee2056ad13ba91e44319586ec247e63 /buildscripts/utils.py
parent: 7c024a1dd496da0e8975e7ab6b6d4cbfa47f5d51 (diff)
download: mongo-5c8053463a38a8c554e0d8a02bef8e5942df8aee.tar.gz
1 files changed, 19 insertions, 0 deletions
diff --git a/buildscripts/utils.py b/buildscripts/utils.py
index 413f22681af..be16d0b9f82 100644
--- a/buildscripts/utils.py
+++ b/buildscripts/utils.py
@@ -1,4 +1,5 @@
 
+import codecs
 import re
 import socket
 import time
@@ -191,3 +192,21 @@ def run_smoke_command(*args):
     # otherwise SCons treats it as a list of dependencies.
     return [smoke_command(*args)]
 
+# Codec error handler for byte strings that cannot be decoded
+# to unicode. Rather than failing or dropping data, it keeps
+# the bytes human-readable by substituting the repr() of the
+# offending bytes (surrounding quotes stripped) into the
+# decoded string at the position where they occurred.
+def replace_with_repr(unicode_error):
+    offender = unicode_error.object[unicode_error.start:unicode_error.end]
+    return (unicode(repr(offender).strip("'").strip('"')), unicode_error.end)
+
+codecs.register_error('repr', replace_with_repr)  # selectable as errors='repr' in encode/decode calls
+
+def unicode_dammit(string, encoding='utf8'):
+    # Decode a byte string to unicode with the given encoding
+    # (utf8 by default); bytes that cannot be decoded appear as
+    # their Python repr() via the 'repr' error handler.
+    # Name inspired by BeautifulSoup's "UnicodeDammit".
+    return string.decode(encoding, 'repr')
+
author	Dan Crosta <dcrosta@10gen.com>	2012-04-11 10:57:25 -0400
committer	Dan Crosta <dcrosta@10gen.com>	2012-04-12 17:36:24 -0400
commit	5c8053463a38a8c554e0d8a02bef8e5942df8aee (patch)
tree	46a5b02e7ee2056ad13ba91e44319586ec247e63 /buildscripts/utils.py
parent	7c024a1dd496da0e8975e7ab6b6d4cbfa47f5d51 (diff)
download	mongo-5c8053463a38a8c554e0d8a02bef8e5942df8aee.tar.gz