6 files changed, 194 insertions, 15 deletions
diff --git a/AUTHORS b/AUTHORS
index 5b15e8c7..1bedef4b 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -122,6 +122,7 @@ Other contributors, listed alphabetically, are:
 * Ronny Pfannschmidt -- BBCode lexer
 * Benjamin Peterson -- Test suite refactoring
 * Dominik Picheta -- Nimrod lexer
+* Andrew Pinkham -- RTF Formatter Refactoring
 * Clément Prévost -- UrbiScript lexer
 * raichoo -- Idris lexer
 * Kashif Rasul -- CUDA lexer
diff --git a/pygments/formatters/rtf.py b/pygments/formatters/rtf.py
index 4b03f8a7..cf65a927 100644
--- a/pygments/formatters/rtf.py
+++ b/pygments/formatters/rtf.py
@@ -10,7 +10,7 @@
 """
 
 from pygments.formatter import Formatter
-from pygments.util import get_int_opt
+from pygments.util import get_int_opt, _surrogatepair
 
 
 __all__ = ['RtfFormatter']
@@ -22,6 +22,10 @@ class RtfFormatter(Formatter):
     documents with color information and other useful stuff. Perfect for Copy and
     Paste into Microsoft® Word® documents.
 
+    Please note that ``encoding`` and ``outencoding`` options are ignored.
+    The RTF format is ASCII natively, but handles unicode characters correctly
+    thanks to escape sequences.
+
     .. versionadded:: 0.6
 
     Additional options accepted:
@@ -74,28 +78,27 @@ class RtfFormatter(Formatter):
 
         # escape text
         text = self._escape(text)
-        if self.encoding in ('utf-8', 'utf-16', 'utf-32'):
-            encoding = 'iso-8859-15'
-        else:
-            encoding = self.encoding or 'iso-8859-15'
 
         buf = []
         for c in text:
-            if ord(c) > 128:
-                ansic = c.encode(encoding, 'ignore')
-                if ansic and ord(ansic) > 128:
-                    ansic = '\\\'%x' % ord(ansic)
-                else:
-                    ansic = '?'
-                buf.append(r'\ud{\u%d%s}' % (ord(c), ansic))
-            else:
+            cn = ord(c)
+            if cn < (2**7):
+                # ASCII character
                 buf.append(str(c))
+            elif (2**7) <= cn < (2**16):
+                # single unicode escape sequence
+                buf.append(r'{\u%d}' % cn)
+            elif (2**16) <= cn:
+                # RTF limits unicode to 16 bits.
+                # Force surrogate pairs
+                h,l = _surrogatepair(cn)
+                buf.append(r'{\u%d}{\u%d}' % (h,l))
 
         return ''.join(buf).replace('\n', '\\par\n')
 
     def format_unencoded(self, tokensource, outfile):
         # rtf 1.8 header
-        outfile.write(r'{\rtf1\ansi\deff0'
+        outfile.write(r'{\rtf1\ansi\uc0\deff0'
                       r'{\fonttbl{\f0\fmodern\fprq1\fcharset0%s;}}'
                       r'{\colortbl;' % (self.fontface and
                                         ' ' + self._escape(self.fontface) or
@@ -114,7 +117,7 @@ class RtfFormatter(Formatter):
                         int(color[4:6], 16)
                     ))
                     offset += 1
-        outfile.write(r'}\f0')
+        outfile.write(r'}\f0 ')
         if self.fontsize:
             outfile.write(r'\fs%d' % (self.fontsize))
 
diff --git a/pygments/util.py b/pygments/util.py
index c302900f..5dc6981f 100644
--- a/pygments/util.py
+++ b/pygments/util.py
@@ -208,6 +208,11 @@ def looks_like_xml(text):
 # Python narrow build compatibility
 
 def _surrogatepair(c):
+    # Split the code point ``c``, which must be above U+FFFF (too wide
+    # for 16 bits), into its (high, low) pair of 16-bit surrogates.
+    # 0xd7c0 is 0xd800 - (0x10000 >> 10): the base and offset pre-folded.
+    # From example D28 of:
+    # http://www.unicode.org/book/ch03.pdf
     return (0xd7c0 + (c >> 10), (0xdc00 + (c & 0x3ff)))
 
 def unirange(a, b):
diff --git a/tests/string_asserts.py b/tests/string_asserts.py
new file mode 100644
index 00000000..025a5281
--- /dev/null
+++ b/tests/string_asserts.py
@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+"""
+    Pygments string assert utility
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    :copyright: Copyright 2006-2014 by the Pygments team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+class StringTests(object):
+
+    def assertStartsWith(self, haystack, needle, msg=None):
+        """Raise AssertionError (``msg`` or a default) on a prefix mismatch."""
+        if not haystack.startswith(needle):
+            text = "'{0}' does not start with '{1}'".format(haystack, needle)
+            raise AssertionError(text if msg is None else msg)
+
+    def assertEndsWith(self, haystack, needle, msg=None):
+        """Raise AssertionError (``msg`` or a default) on a suffix mismatch."""
+        if not haystack.endswith(needle):
+            text = "'{0}' does not end with '{1}'".format(haystack, needle)
+            raise AssertionError(text if msg is None else msg)
diff --git a/tests/test_rtf_formatter.py b/tests/test_rtf_formatter.py
new file mode 100644
index 00000000..30b136fd
--- /dev/null
+++ b/tests/test_rtf_formatter.py
@@ -0,0 +1,109 @@
+# -*- coding: utf-8 -*-
+"""
+    Pygments RTF formatter tests
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    :copyright: Copyright 2006-2014 by the Pygments team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+import unittest
+from string_asserts import StringTests
+
+from pygments.util import StringIO
+from pygments.formatters import RtfFormatter
+from pygments.lexers.special import TextLexer
+
+class RtfFormatterTest(StringTests, unittest.TestCase):
+    foot = (r'\par' '\n' r'}')  # tail expected on every document
+
+    def _escape(self, string):  # make newlines visible in messages
+        return string.replace("\n", r"\n")
+
+    def _build_message(self, *args, **kwargs):
+        """Describe a mismatch between expected and actual formatter output.
+
+        Accepts ``t``, ``expected``, ``result`` and an optional ``string``
+        template; quotes only the part of ``result`` just before the footer.
+        """
+        string = kwargs.get('string')
+        if string is None:
+            string = (u"The expected output of '{t}'\n"
+                      u"\t\tShould be '{expected}'\n"
+                      u"\t\tActually outputs '{result}'\n"
+                      u"\t(WARNING: Partial Output of Result!)")
+        t = self._escape(kwargs.get('t', ''))
+        expected = self._escape(kwargs.get('expected', ''))
+        result = self._escape(kwargs.get('result', ''))
+        end = -len(self._escape(self.foot))
+        start = end - len(expected)
+        return string.format(t=t, result=result[start:end], expected=expected)
+
+    def format_rtf(self, t):
+        """Lex ``t`` with TextLexer and return RtfFormatter's output."""
+        tokensource = list(TextLexer().get_tokens(t))
+        buf = StringIO()
+        RtfFormatter().format(tokensource, buf)
+        result = buf.getvalue()
+        buf.close()
+        return result
+
+    def test_rtf_header(self):
+        # An empty input still has to produce the document header.
+        result = self.format_rtf(u'')
+        expected = r'{\rtf1\ansi\uc0'
+        msg = (u"RTF documents are expected to start with '{expected}'\n"
+               u"\t\tStarts instead with '{result}'\n"
+               u"\t(WARNING: Partial Output of Result!)".format(
+                   expected=expected,
+                   result=result[:len(expected)]))
+        self.assertStartsWith(result, expected, msg)
+
+    def test_rtf_footer(self):
+        # An empty input still has to produce the document footer.
+        result = self.format_rtf(u'')
+        expected = self.foot
+        msg = (u"RTF documents are expected to end with '{expected}'\n"
+               u"\t\tEnds instead with '{result}'\n"
+               u"\t(WARNING: Partial Output of Result!)".format(
+                   expected=self._escape(expected),
+                   result=self._escape(result[-len(expected):])))
+        self.assertEndsWith(result, expected, msg)
+
+    def test_ascii_characters(self):
+        t = u'a b c d ~'
+        result = self.format_rtf(t)
+        expected = r'a b c d ~'
+        if not result.endswith(self.foot):  # reported by test_rtf_footer
+            self.skipTest('RTF Footer incorrect')
+        msg = self._build_message(t=t, result=result, expected=expected)
+        self.assertEndsWith(result, expected+self.foot, msg)
+
+    def test_escape_characters(self):
+        t = u'\\ {{'
+        result = self.format_rtf(t)
+        expected = r'\\ \{\{'
+        if not result.endswith(self.foot):  # reported by test_rtf_footer
+            self.skipTest('RTF Footer incorrect')
+        msg = self._build_message(t=t, result=result, expected=expected)
+        self.assertEndsWith(result, expected+self.foot, msg)
+
+    def test_single_characters(self):
+        t = u'â € ¤ каждой'
+        result = self.format_rtf(t)
+        expected = (r'{\u226} {\u8364} {\u164} '
+                    r'{\u1082}{\u1072}{\u1078}{\u1076}{\u1086}{\u1081}')
+        if not result.endswith(self.foot):  # reported by test_rtf_footer
+            self.skipTest('RTF Footer incorrect')
+        msg = self._build_message(t=t, result=result, expected=expected)
+        self.assertEndsWith(result, expected+self.foot, msg)
+
+    def test_double_characters(self):  # incl. a surrogate-pair code point
+        t = u'\u1000 \ud7a3 \u2195 \u2195\ufe0e \U0002fa1b'
+        result = self.format_rtf(t)
+        expected = (r'{\u4096} {\u55203} {\u8597} '
+                    r'{\u8597}{\u65038} {\u55422}{\u56859}')
+        if not result.endswith(self.foot):  # reported by test_rtf_footer
+            self.skipTest('RTF Footer incorrect')
+        msg = self._build_message(t=t, result=result, expected=expected)
+        self.assertEndsWith(result, expected+self.foot, msg)
diff --git a/tests/test_string_asserts.py b/tests/test_string_asserts.py
new file mode 100644
index 00000000..0beed15c
--- /dev/null
+++ b/tests/test_string_asserts.py
@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+"""
+    Pygments string assert utility tests
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    :copyright: Copyright 2006-2014 by the Pygments team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+import unittest
+from string_asserts import StringTests
+
+class TestStringTests(StringTests, unittest.TestCase):
+    """Tests for the ``StringTests`` assertion mixin.
+
+    Failures are asserted directly: nose lacks ``@unittest.expectedFailure``.
+    """
+
+    def test_startswith_correct(self):
+        self.assertStartsWith(haystack="AAA", needle="A")
+
+    def test_startswith_incorrect(self):
+        # a prefix that is not present must be rejected
+        self.assertRaises(AssertionError, self.assertStartsWith, "AAA", "B")
+
+    def test_startswith_short(self):
+        # a needle longer than the haystack must be rejected
+        self.assertRaises(AssertionError, self.assertStartsWith, "A", "AA")
+
+    def test_endswith_correct(self):
+        self.assertEndsWith(haystack="AAA", needle="A")
+
+    def test_endswith_incorrect(self):
+        # a suffix that is not present must be rejected
+        self.assertRaises(AssertionError, self.assertEndsWith, "AAA", "B")
+
+    def test_endswith_short(self):
+        # a needle longer than the haystack must be rejected
+        self.assertRaises(AssertionError, self.assertEndsWith, "A", "AA")