buildlogger: make everything unicode

author: Dan Crosta <dcrosta@10gen.com> 2012-04-11 10:57:25 -0400
committer: Dan Crosta <dcrosta@10gen.com> 2012-04-12 17:36:24 -0400
commit: 5c8053463a38a8c554e0d8a02bef8e5942df8aee (patch)
tree: 46a5b02e7ee2056ad13ba91e44319586ec247e63
parent: 7c024a1dd496da0e8975e7ab6b6d4cbfa47f5d51 (diff)
download: mongo-5c8053463a38a8c554e0d8a02bef8e5942df8aee.tar.gz
2 files changed, 21 insertions, 0 deletions
diff --git a/buildscripts/buildlogger.py b/buildscripts/buildlogger.py
index a6f91d840f8..02016317348 100644
--- a/buildscripts/buildlogger.py
+++ b/buildscripts/buildlogger.py
@@ -32,6 +32,7 @@ import sys
 import time
 import traceback
 import urllib2
+import utils
 
 try:
     import json
@@ -322,6 +323,7 @@ def loop_and_callback(command, callback):
     while proc.poll() is None:
         try:
             line = proc.stdout.readline().strip('\r\n')
+            line = utils.unicode_dammit(line)
             callback(line)
         except IOError:
             # if the signal handler is called while
diff --git a/buildscripts/utils.py b/buildscripts/utils.py
index 413f22681af..be16d0b9f82 100644
--- a/buildscripts/utils.py
+++ b/buildscripts/utils.py
@@ -1,4 +1,5 @@
 
+import codecs
 import re
 import socket
 import time
@@ -191,3 +192,21 @@ def run_smoke_command(*args):
     # otherwise SCons treats it as a list of dependencies.
     return [smoke_command(*args)]
 
+# unicode is a pain. some strings cannot be unicode()'d
+# but we want to just preserve the bytes in a human-readable
+# fashion. this codec error handler will substitute the
+# repr() of the offending bytes into the decoded string
+# at the position they occurred
+def replace_with_repr(unicode_error):
+    offender = unicode_error.object[unicode_error.start:unicode_error.end]
+    return (unicode(repr(offender).strip("'").strip('"')), unicode_error.end)
+
+codecs.register_error('repr', replace_with_repr)
+
+def unicode_dammit(string, encoding='utf8'):
+    # convert a string to a unicode, using the Python
+    # representation of non-ascii bytes when necessary
+    #
+    # name inspired by BeautifulSoup's "UnicodeDammit"
+    return string.decode(encoding, 'repr')
+
author	Dan Crosta <dcrosta@10gen.com>	2012-04-11 10:57:25 -0400
committer	Dan Crosta <dcrosta@10gen.com>	2012-04-12 17:36:24 -0400
commit	5c8053463a38a8c554e0d8a02bef8e5942df8aee (patch)
tree	46a5b02e7ee2056ad13ba91e44319586ec247e63
parent	7c024a1dd496da0e8975e7ab6b6d4cbfa47f5d51 (diff)
download	mongo-5c8053463a38a8c554e0d8a02bef8e5942df8aee.tar.gz