summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Bob Ippolito <bob@redivi.com> 2023-04-04 09:59:33 -0700
committer: Bob Ippolito <bob@redivi.com> 2023-04-04 09:59:33 -0700
commit: 7c2ccb773ab9343e25f14d72356dab75420e288f (patch)
tree: 5f8370a129f71fc30c632c9e411681c09de30b68
parent: 9559fc756deaf20b6bae961b58c5289d8582c8b7 (diff)
download: simplejson-7c2ccb773ab9343e25f14d72356dab75420e288f.tar.gz
SJ-PT-23-01: Fix invalid handling of unicode escape sequences in Python decoder
-rw-r--r-- CHANGES.txt 5
-rw-r--r-- conf.py 4
-rw-r--r-- simplejson/__init__.py 2
-rw-r--r-- simplejson/decoder.py 44
-rw-r--r-- simplejson/tests/test_scanstring.py 4
5 files changed, 31 insertions, 28 deletions
diff --git a/CHANGES.txt b/CHANGES.txt
index d2f7a94..06ac399 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,8 @@
+Version 3.19.0 released 2023-04-XX
+
+* Fix invalid handling of unicode escape sequences in the pure Python
+ implementation of the decoder (SJ-PT-23-01)
+
Version 3.18.4 released 2023-03-14
* Test the sdist to prevent future regressions
diff --git a/conf.py b/conf.py
index 921bbef..5a2dded 100644
--- a/conf.py
+++ b/conf.py
@@ -42,9 +42,9 @@ copyright = '2023, Bob Ippolito'
# other places throughout the built documents.
#
# The short X.Y version.
-version = '3.18'
+version = '3.19'
# The full version, including alpha/beta/rc tags.
-release = '3.18.4'
+release = '3.19.0'
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
diff --git a/simplejson/__init__.py b/simplejson/__init__.py
index 47e49a3..2f0bebf 100644
--- a/simplejson/__init__.py
+++ b/simplejson/__init__.py
@@ -118,7 +118,7 @@ Serializing multiple objects to JSON lines (newline-delimited JSON)::
"""
from __future__ import absolute_import
-__version__ = '3.18.4'
+__version__ = '3.19.0'
__all__ = [
'dump', 'dumps', 'load', 'loads',
'JSONDecoder', 'JSONDecodeError', 'JSONEncoder',
diff --git a/simplejson/decoder.py b/simplejson/decoder.py
index 1a8f772..e1f10ae 100644
--- a/simplejson/decoder.py
+++ b/simplejson/decoder.py
@@ -46,9 +46,22 @@ BACKSLASH = {
DEFAULT_ENCODING = "utf-8"
+def scan_four_digit_hex(s, end, _m=re.compile(r'^[0-9a-fA-F]{4}$').match):
+ """Scan a four digit hex number from s[end:end + 4]
+ """
+ msg = "Invalid \\uXXXX escape sequence"
+ esc = s[end:end + 4]
+ if not _m(esc):
+ raise JSONDecodeError(msg, s, end - 2)
+ try:
+ return int(esc, 16), end + 4
+ except ValueError:
+ raise JSONDecodeError(msg, s, end - 2)
+
def py_scanstring(s, end, encoding=None, strict=True,
_b=BACKSLASH, _m=STRINGCHUNK.match, _join=u''.join,
- _PY3=PY3, _maxunicode=sys.maxunicode):
+ _PY3=PY3, _maxunicode=sys.maxunicode,
+ _scan_four_digit_hex=scan_four_digit_hex):
"""Scan the string s for a JSON string. End is the index of the
character in s after the quote that started the JSON string.
Unescapes all valid JSON string escape sequences and raises ValueError
@@ -100,35 +113,18 @@ def py_scanstring(s, end, encoding=None, strict=True,
end += 1
else:
# Unicode escape sequence
- msg = "Invalid \\uXXXX escape sequence"
- esc = s[end + 1:end + 5]
- escX = esc[1:2]
- if len(esc) != 4 or escX == 'x' or escX == 'X':
- raise JSONDecodeError(msg, s, end - 1)
- try:
- uni = int(esc, 16)
- except ValueError:
- raise JSONDecodeError(msg, s, end - 1)
- if uni < 0 or uni > _maxunicode:
- raise JSONDecodeError(msg, s, end - 1)
- end += 5
+ uni, end = _scan_four_digit_hex(s, end + 1)
# Check for surrogate pair on UCS-4 systems
# Note that this will join high/low surrogate pairs
# but will also pass unpaired surrogates through
if (_maxunicode > 65535 and
uni & 0xfc00 == 0xd800 and
s[end:end + 2] == '\\u'):
- esc2 = s[end + 2:end + 6]
- escX = esc2[1:2]
- if len(esc2) == 4 and not (escX == 'x' or escX == 'X'):
- try:
- uni2 = int(esc2, 16)
- except ValueError:
- raise JSONDecodeError(msg, s, end)
- if uni2 & 0xfc00 == 0xdc00:
- uni = 0x10000 + (((uni - 0xd800) << 10) |
- (uni2 - 0xdc00))
- end += 6
+ uni2, end2 = _scan_four_digit_hex(s, end + 2)
+ if uni2 & 0xfc00 == 0xdc00:
+ uni = 0x10000 + (((uni - 0xd800) << 10) |
+ (uni2 - 0xdc00))
+ end = end2
char = unichr(uni)
# Append the unescaped character
_append(char)
diff --git a/simplejson/tests/test_scanstring.py b/simplejson/tests/test_scanstring.py
index c6c53b8..1f54483 100644
--- a/simplejson/tests/test_scanstring.py
+++ b/simplejson/tests/test_scanstring.py
@@ -132,7 +132,9 @@ class TestScanString(TestCase):
self.assertRaises(ValueError,
scanstring, '\\ud834\\x0123"', 0, None, True)
- self.assertRaises(json.JSONDecodeError, scanstring, "\\u-123", 0, None, True)
+ self.assertRaises(json.JSONDecodeError, scanstring, '\\u-123"', 0, None, True)
+ # SJ-PT-23-01: Invalid Handling of Broken Unicode Escape Sequences
+ self.assertRaises(json.JSONDecodeError, scanstring, '\\u EDD"', 0, None, True)
def test_issue3623(self):
self.assertRaises(ValueError, json.decoder.scanstring, "xxx", 1,