From 7c2ccb773ab9343e25f14d72356dab75420e288f Mon Sep 17 00:00:00 2001 From: Bob Ippolito Date: Tue, 4 Apr 2023 09:59:33 -0700 Subject: SJ-PT-23-01: Fix invalid handling of unicode escape sequences in Python decoder --- CHANGES.txt | 5 +++++ conf.py | 4 ++-- simplejson/__init__.py | 2 +- simplejson/decoder.py | 44 +++++++++++++++++-------------------- simplejson/tests/test_scanstring.py | 4 +++- 5 files changed, 31 insertions(+), 28 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index d2f7a94..06ac399 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,3 +1,8 @@ +Version 3.19.0 released 2023-04-XX + +* Fix invalid handling of unicode escape sequences in the pure Python + implementation of the decoder (SJ-PT-23-01) + Version 3.18.4 released 2023-03-14 * Test the sdist to prevent future regressions diff --git a/conf.py b/conf.py index 921bbef..5a2dded 100644 --- a/conf.py +++ b/conf.py @@ -42,9 +42,9 @@ copyright = '2023, Bob Ippolito' # other places throughout the built documents. # # The short X.Y version. -version = '3.18' +version = '3.19' # The full version, including alpha/beta/rc tags. -release = '3.18.4' +release = '3.19.0' # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: diff --git a/simplejson/__init__.py b/simplejson/__init__.py index 47e49a3..2f0bebf 100644 --- a/simplejson/__init__.py +++ b/simplejson/__init__.py @@ -118,7 +118,7 @@ Serializing multiple objects to JSON lines (newline-delimited JSON):: """ from __future__ import absolute_import -__version__ = '3.18.4' +__version__ = '3.19.0' __all__ = [ 'dump', 'dumps', 'load', 'loads', 'JSONDecoder', 'JSONDecodeError', 'JSONEncoder', diff --git a/simplejson/decoder.py b/simplejson/decoder.py index 1a8f772..e1f10ae 100644 --- a/simplejson/decoder.py +++ b/simplejson/decoder.py @@ -46,9 +46,22 @@ BACKSLASH = { DEFAULT_ENCODING = "utf-8" +def scan_four_digit_hex(s, end, _m=re.compile(r'^[0-9a-fA-F]{4}$').match): + """Scan a four digit hex number from s[end:end + 4] + """ + msg = "Invalid \\uXXXX escape sequence" + esc = s[end:end + 4] + if not _m(esc): + raise JSONDecodeError(msg, s, end - 2) + try: + return int(esc, 16), end + 4 + except ValueError: + raise JSONDecodeError(msg, s, end - 2) + def py_scanstring(s, end, encoding=None, strict=True, _b=BACKSLASH, _m=STRINGCHUNK.match, _join=u''.join, - _PY3=PY3, _maxunicode=sys.maxunicode): + _PY3=PY3, _maxunicode=sys.maxunicode, + _scan_four_digit_hex=scan_four_digit_hex): """Scan the string s for a JSON string. End is the index of the character in s after the quote that started the JSON string. Unescapes all valid JSON string escape sequences and raises ValueError @@ -100,35 +113,18 @@ def py_scanstring(s, end, encoding=None, strict=True, end += 1 else: # Unicode escape sequence - msg = "Invalid \\uXXXX escape sequence" - esc = s[end + 1:end + 5] - escX = esc[1:2] - if len(esc) != 4 or escX == 'x' or escX == 'X': - raise JSONDecodeError(msg, s, end - 1) - try: - uni = int(esc, 16) - except ValueError: - raise JSONDecodeError(msg, s, end - 1) - if uni < 0 or uni > _maxunicode: - raise JSONDecodeError(msg, s, end - 1) - end += 5 + uni, end = _scan_four_digit_hex(s, end + 1) # Check for surrogate pair on UCS-4 systems # Note that this will join high/low surrogate pairs # but will also pass unpaired surrogates through if (_maxunicode > 65535 and uni & 0xfc00 == 0xd800 and s[end:end + 2] == '\\u'): - esc2 = s[end + 2:end + 6] - escX = esc2[1:2] - if len(esc2) == 4 and not (escX == 'x' or escX == 'X'): - try: - uni2 = int(esc2, 16) - except ValueError: - raise JSONDecodeError(msg, s, end) - if uni2 & 0xfc00 == 0xdc00: - uni = 0x10000 + (((uni - 0xd800) << 10) | - (uni2 - 0xdc00)) - end += 6 + uni2, end2 = _scan_four_digit_hex(s, end + 2) + if uni2 & 0xfc00 == 0xdc00: + uni = 0x10000 + (((uni - 0xd800) << 10) | + (uni2 - 0xdc00)) + end = end2 char = unichr(uni) # Append the unescaped character _append(char) diff --git a/simplejson/tests/test_scanstring.py b/simplejson/tests/test_scanstring.py index c6c53b8..1f54483 100644 --- a/simplejson/tests/test_scanstring.py +++ b/simplejson/tests/test_scanstring.py @@ -132,7 +132,9 @@ class TestScanString(TestCase): self.assertRaises(ValueError, scanstring, '\\ud834\\x0123"', 0, None, True) - self.assertRaises(json.JSONDecodeError, scanstring, "\\u-123", 0, None, True) + self.assertRaises(json.JSONDecodeError, scanstring, '\\u-123"', 0, None, True) + # SJ-PT-23-01: Invalid Handling of Broken Unicode Escape Sequences + self.assertRaises(json.JSONDecodeError, scanstring, '\\u EDD"', 0, None, True) def test_issue3623(self): self.assertRaises(ValueError, json.decoder.scanstring, "xxx", 1, -- cgit v1.2.1