Merge pull request #211 from richvdh/rav/fix_linebreak_encoding

Avoid escaping U+2028 and U+2029 without ensure_ascii
author: Bob Ippolito <bob@redivi.com> 2018-04-02 10:40:47 -0700
committer: GitHub <noreply@github.com> 2018-04-02 10:40:47 -0700
commit: ca2f85ed1f6a7d7679af942faa695f95c30ccbda (patch)
tree: d887b379d3ba115d7b7c9cad48f134d17e4a6d4f
parent: 0406430cbcb0e31d27bd064db57837b94ed05294 (diff)
parent: 3309d122f922540102da1463744e1ad54c3ccb18 (diff)
download: simplejson-ca2f85ed1f6a7d7679af942faa695f95c30ccbda.tar.gz
4 files changed, 27 insertions, 8 deletions
diff --git a/index.rst b/index.rst
index 3869a50..8c9c7ab 100644
--- a/index.rst
+++ b/index.rst
@@ -767,6 +767,11 @@ Encoders and decoders
 
    Subclass of :class:`JSONEncoder` that escapes &, <, and > for embedding in HTML.
 
+   It also escapes the characters U+2028 (LINE SEPARATOR) and
+   U+2029 (PARAGRAPH SEPARATOR), irrespective of the *ensure_ascii* setting,
+   as these characters are not valid in JavaScript strings (see
+   http://timelessrepo.com/json-isnt-a-javascript-subset).
+
    .. versionchanged:: 2.1.0
       New in 2.1.0
 
diff --git a/simplejson/encoder.py b/simplejson/encoder.py
index ae76ae3..ec73ce3 100644
--- a/simplejson/encoder.py
+++ b/simplejson/encoder.py
@@ -17,10 +17,7 @@ c_encode_basestring_ascii, c_make_encoder = _import_speedups()
 from .decoder import PosInf
 from .raw_json import RawJSON
 
-#ESCAPE = re.compile(ur'[\x00-\x1f\\"\b\f\n\r\t\u2028\u2029]')
-# This is required because u() will mangle the string and ur'' isn't valid
-# python3 syntax
-ESCAPE = re.compile(u'[\\x00-\\x1f\\\\"\\b\\f\\n\\r\\t\u2028\u2029]')
+ESCAPE = re.compile(r'[\x00-\x1f\\"]')
 ESCAPE_ASCII = re.compile(r'([\\"]|[^\ -~])')
 HAS_UTF8 = re.compile(r'[\x80-\xff]')
 ESCAPE_DCT = {
@@ -35,8 +32,6 @@ ESCAPE_DCT = {
 for i in range(0x20):
     #ESCAPE_DCT.setdefault(chr(i), '\\u{0:04x}'.format(i))
     ESCAPE_DCT.setdefault(chr(i), '\\u%04x' % (i,))
-for i in [0x2028, 0x2029]:
-    ESCAPE_DCT.setdefault(unichr(i), '\\u%04x' % (i,))
 
 FLOAT_REPR = repr
 
@@ -382,6 +377,11 @@ class JSONEncoderForHTML(JSONEncoder):
     characters &, < and > should be escaped. They cannot be escaped
     with the usual entities (e.g. &amp;) because they are not expanded
     within <script> tags.
+
+    This class also escapes the line separator and paragraph separator
+    characters U+2028 and U+2029, irrespective of the ensure_ascii setting,
+    as these characters are not valid in JavaScript strings (see
+    http://timelessrepo.com/json-isnt-a-javascript-subset).
     """
 
     def encode(self, o):
@@ -399,6 +399,11 @@ class JSONEncoderForHTML(JSONEncoder):
             chunk = chunk.replace('&', '\\u0026')
             chunk = chunk.replace('<', '\\u003c')
             chunk = chunk.replace('>', '\\u003e')
+
+            if not self.ensure_ascii:
+                chunk = chunk.replace(u'\u2028', '\\u2028')
+                chunk = chunk.replace(u'\u2029', '\\u2029')
+
             yield chunk
 
 
diff --git a/simplejson/tests/test_encode_for_html.py b/simplejson/tests/test_encode_for_html.py
index f995254..3a840aa 100644
--- a/simplejson/tests/test_encode_for_html.py
+++ b/simplejson/tests/test_encode_for_html.py
@@ -7,11 +7,19 @@ class TestEncodeForHTML(unittest.TestCase):
     def setUp(self):
         self.decoder = json.JSONDecoder()
         self.encoder = json.JSONEncoderForHTML()
+        self.non_ascii_encoder = json.JSONEncoderForHTML(ensure_ascii=False)
 
     def test_basic_encode(self):
         self.assertEqual(r'"\u0026"', self.encoder.encode('&'))
         self.assertEqual(r'"\u003c"', self.encoder.encode('<'))
         self.assertEqual(r'"\u003e"', self.encoder.encode('>'))
+        self.assertEqual(r'"\u2028"', self.encoder.encode(u'\u2028'))
+
+    def test_non_ascii_basic_encode(self):
+        self.assertEqual(r'"\u0026"', self.non_ascii_encoder.encode('&'))
+        self.assertEqual(r'"\u003c"', self.non_ascii_encoder.encode('<'))
+        self.assertEqual(r'"\u003e"', self.non_ascii_encoder.encode('>'))
+        self.assertEqual(r'"\u2028"', self.non_ascii_encoder.encode(u'\u2028'))
 
     def test_basic_roundtrip(self):
         for char in '&<>':
diff --git a/simplejson/tests/test_unicode.py b/simplejson/tests/test_unicode.py
index 3b37f65..1c7e95e 100644
--- a/simplejson/tests/test_unicode.py
+++ b/simplejson/tests/test_unicode.py
@@ -106,10 +106,11 @@ class TestUnicode(TestCase):
         s1 = u'\u2029\u2028'
         s2 = s1.encode('utf8')
         expect = '"\\u2029\\u2028"'
+        expect_non_ascii = u'"\u2029\u2028"'
         self.assertEqual(json.dumps(s1), expect)
         self.assertEqual(json.dumps(s2), expect)
-        self.assertEqual(json.dumps(s1, ensure_ascii=False), expect)
-        self.assertEqual(json.dumps(s2, ensure_ascii=False), expect)
+        self.assertEqual(json.dumps(s1, ensure_ascii=False), expect_non_ascii)
+        self.assertEqual(json.dumps(s2, ensure_ascii=False), expect_non_ascii)
 
     def test_invalid_escape_sequences(self):
         # incomplete escape sequence
author	Bob Ippolito <bob@redivi.com>	2018-04-02 10:40:47 -0700
committer	GitHub <noreply@github.com>	2018-04-02 10:40:47 -0700
commit	ca2f85ed1f6a7d7679af942faa695f95c30ccbda (patch)
tree	d887b379d3ba115d7b7c9cad48f134d17e4a6d4f
parent	0406430cbcb0e31d27bd064db57837b94ed05294 (diff)
parent	3309d122f922540102da1463744e1ad54c3ccb18 (diff)
download	simplejson-ca2f85ed1f6a7d7679af942faa695f95c30ccbda.tar.gz