summary refs log tree commit diff
diff options
context:
space:
mode:
author Bob Ippolito <bob@redivi.com> 2018-04-02 10:40:47 -0700
committer GitHub <noreply@github.com> 2018-04-02 10:40:47 -0700
commit ca2f85ed1f6a7d7679af942faa695f95c30ccbda (patch)
tree d887b379d3ba115d7b7c9cad48f134d17e4a6d4f
parent 0406430cbcb0e31d27bd064db57837b94ed05294 (diff)
parent 3309d122f922540102da1463744e1ad54c3ccb18 (diff)
download simplejson-ca2f85ed1f6a7d7679af942faa695f95c30ccbda.tar.gz
Merge pull request #211 from richvdh/rav/fix_linebreak_encoding
Avoid escaping U+2028 and U+2029 without ensure_ascii
-rw-r--r-- index.rst 5
-rw-r--r-- simplejson/encoder.py 17
-rw-r--r-- simplejson/tests/test_encode_for_html.py 8
-rw-r--r-- simplejson/tests/test_unicode.py 5
4 files changed, 27 insertions, 8 deletions
diff --git a/index.rst b/index.rst
index 3869a50..8c9c7ab 100644
--- a/index.rst
+++ b/index.rst
@@ -767,6 +767,11 @@ Encoders and decoders
Subclass of :class:`JSONEncoder` that escapes &, <, and > for embedding in HTML.
+ It also escapes the characters U+2028 (LINE SEPARATOR) and
+ U+2029 (PARAGRAPH SEPARATOR), irrespective of the *ensure_ascii* setting,
+ as these characters are not valid in JavaScript strings (see
+ http://timelessrepo.com/json-isnt-a-javascript-subset).
+
.. versionchanged:: 2.1.0
New in 2.1.0
diff --git a/simplejson/encoder.py b/simplejson/encoder.py
index ae76ae3..ec73ce3 100644
--- a/simplejson/encoder.py
+++ b/simplejson/encoder.py
@@ -17,10 +17,7 @@ c_encode_basestring_ascii, c_make_encoder = _import_speedups()
from .decoder import PosInf
from .raw_json import RawJSON
-#ESCAPE = re.compile(ur'[\x00-\x1f\\"\b\f\n\r\t\u2028\u2029]')
-# This is required because u() will mangle the string and ur'' isn't valid
-# python3 syntax
-ESCAPE = re.compile(u'[\\x00-\\x1f\\\\"\\b\\f\\n\\r\\t\u2028\u2029]')
+ESCAPE = re.compile(r'[\x00-\x1f\\"]')
ESCAPE_ASCII = re.compile(r'([\\"]|[^\ -~])')
HAS_UTF8 = re.compile(r'[\x80-\xff]')
ESCAPE_DCT = {
@@ -35,8 +32,6 @@ ESCAPE_DCT = {
for i in range(0x20):
#ESCAPE_DCT.setdefault(chr(i), '\\u{0:04x}'.format(i))
ESCAPE_DCT.setdefault(chr(i), '\\u%04x' % (i,))
-for i in [0x2028, 0x2029]:
- ESCAPE_DCT.setdefault(unichr(i), '\\u%04x' % (i,))
FLOAT_REPR = repr
@@ -382,6 +377,11 @@ class JSONEncoderForHTML(JSONEncoder):
characters &, < and > should be escaped. They cannot be escaped
with the usual entities (e.g. &amp;) because they are not expanded
within <script> tags.
+
+ This class also escapes the line separator and paragraph separator
+ characters U+2028 and U+2029, irrespective of the ensure_ascii setting,
+ as these characters are not valid in JavaScript strings (see
+ http://timelessrepo.com/json-isnt-a-javascript-subset).
"""
def encode(self, o):
@@ -399,6 +399,11 @@ class JSONEncoderForHTML(JSONEncoder):
chunk = chunk.replace('&', '\\u0026')
chunk = chunk.replace('<', '\\u003c')
chunk = chunk.replace('>', '\\u003e')
+
+ if not self.ensure_ascii:
+ chunk = chunk.replace(u'\u2028', '\\u2028')
+ chunk = chunk.replace(u'\u2029', '\\u2029')
+
yield chunk
diff --git a/simplejson/tests/test_encode_for_html.py b/simplejson/tests/test_encode_for_html.py
index f995254..3a840aa 100644
--- a/simplejson/tests/test_encode_for_html.py
+++ b/simplejson/tests/test_encode_for_html.py
@@ -7,11 +7,19 @@ class TestEncodeForHTML(unittest.TestCase):
def setUp(self):
self.decoder = json.JSONDecoder()
self.encoder = json.JSONEncoderForHTML()
+ self.non_ascii_encoder = json.JSONEncoderForHTML(ensure_ascii=False)
def test_basic_encode(self):
self.assertEqual(r'"\u0026"', self.encoder.encode('&'))
self.assertEqual(r'"\u003c"', self.encoder.encode('<'))
self.assertEqual(r'"\u003e"', self.encoder.encode('>'))
+ self.assertEqual(r'"\u2028"', self.encoder.encode(u'\u2028'))
+
+ def test_non_ascii_basic_encode(self):
+ self.assertEqual(r'"\u0026"', self.non_ascii_encoder.encode('&'))
+ self.assertEqual(r'"\u003c"', self.non_ascii_encoder.encode('<'))
+ self.assertEqual(r'"\u003e"', self.non_ascii_encoder.encode('>'))
+ self.assertEqual(r'"\u2028"', self.non_ascii_encoder.encode(u'\u2028'))
def test_basic_roundtrip(self):
for char in '&<>':
diff --git a/simplejson/tests/test_unicode.py b/simplejson/tests/test_unicode.py
index 3b37f65..1c7e95e 100644
--- a/simplejson/tests/test_unicode.py
+++ b/simplejson/tests/test_unicode.py
@@ -106,10 +106,11 @@ class TestUnicode(TestCase):
s1 = u'\u2029\u2028'
s2 = s1.encode('utf8')
expect = '"\\u2029\\u2028"'
+ expect_non_ascii = u'"\u2029\u2028"'
self.assertEqual(json.dumps(s1), expect)
self.assertEqual(json.dumps(s2), expect)
- self.assertEqual(json.dumps(s1, ensure_ascii=False), expect)
- self.assertEqual(json.dumps(s2, ensure_ascii=False), expect)
+ self.assertEqual(json.dumps(s1, ensure_ascii=False), expect_non_ascii)
+ self.assertEqual(json.dumps(s2, ensure_ascii=False), expect_non_ascii)
def test_invalid_escape_sequences(self):
# incomplete escape sequence