summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorEric V. Smith <eric@trueblade.com>2016-10-31 09:22:08 -0400
committerEric V. Smith <eric@trueblade.com>2016-10-31 09:22:08 -0400
commit770891917b1ffe9eb66e72c61355c3a95d4a6db5 (patch)
treed0e7606eb8f1854e44682a11ba99b81912eb0e38
parentcd7172b25ab066ae464bce14ced01c93bc7f6617 (diff)
downloadcpython-770891917b1ffe9eb66e72c61355c3a95d4a6db5.tar.gz
Issue 28128: Print out better error/warning messages for invalid string escapes.
-rw-r--r--Include/bytesobject.h5
-rw-r--r--Include/unicodeobject.h11
-rw-r--r--Lib/test/test_string_literals.py27
-rw-r--r--Lib/test/test_unicode.py7
-rw-r--r--Misc/NEWS4
-rw-r--r--Objects/bytesobject.c37
-rw-r--r--Objects/unicodeobject.c38
-rw-r--r--Python/ast.c66
8 files changed, 173 insertions, 22 deletions
diff --git a/Include/bytesobject.h b/Include/bytesobject.h
index 11d8218402..98e29b6879 100644
--- a/Include/bytesobject.h
+++ b/Include/bytesobject.h
@@ -74,6 +74,11 @@ PyAPI_FUNC(PyObject*) _PyBytes_FromHex(
PyAPI_FUNC(PyObject *) PyBytes_DecodeEscape(const char *, Py_ssize_t,
const char *, Py_ssize_t,
const char *);
+/* Helper for PyBytes_DecodeEscape that detects invalid escape chars.
+ The final argument is an output parameter: it receives a pointer to the
+ first invalid escaped char found in the input, or NULL when none is
+ found. Check the returned object for NULL before relying on it. */
+PyAPI_FUNC(PyObject *) _PyBytes_DecodeEscape(const char *, Py_ssize_t,
+ const char *, Py_ssize_t,
+ const char *,
+ const char **);
/* Macro, trading safety for speed */
#ifndef Py_LIMITED_API
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index 5711de0164..b5ef3e4130 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -1486,6 +1486,17 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
const char *errors /* error handling */
);
+/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
+ chars. */
+PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscape(
+ const char *string, /* Unicode-Escape encoded string */
+ Py_ssize_t length, /* size of string */
+ const char *errors, /* error handling */
+ const char **first_invalid_escape /* on return, points to first
+ invalid escaped char in
+ string, or is NULL if there
+ is none. */
+);
+
PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
PyObject *unicode /* Unicode object */
);
diff --git a/Lib/test/test_string_literals.py b/Lib/test/test_string_literals.py
index 37ace230f5..54f2be3598 100644
--- a/Lib/test/test_string_literals.py
+++ b/Lib/test/test_string_literals.py
@@ -31,6 +31,7 @@ import os
import sys
import shutil
import tempfile
+import warnings
import unittest
@@ -104,6 +105,19 @@ class TestLiterals(unittest.TestCase):
self.assertRaises(SyntaxError, eval, r""" '\U000000' """)
self.assertRaises(SyntaxError, eval, r""" '\U0000000' """)
+ def test_eval_str_invalid_escape(self):
+ # For every code point 1..127 not listed in the bytes literal below,
+ # evaluating the str literal '\<char>' must emit a DeprecationWarning
+ # and yield a backslash followed by that character.
+ for b in range(1, 128):
+ if b in b"""\n\r"'01234567NU\\abfnrtuvx""":
+ continue
+ with self.assertWarns(DeprecationWarning):
+ self.assertEqual(eval(r"'\%c'" % b), '\\' + chr(b))
+ # The invalid escape sits on the second line of a triple-quoted literal:
+ # exactly one warning must be recorded, attributed to '<string>', line 2.
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always', category=DeprecationWarning)
+ eval("'''\n\\z'''")
+ self.assertEqual(len(w), 1)
+ self.assertEqual(w[0].filename, '<string>')
+ self.assertEqual(w[0].lineno, 2)
+
def test_eval_str_raw(self):
self.assertEqual(eval(""" r'x' """), 'x')
self.assertEqual(eval(r""" r'\x01' """), '\\' + 'x01')
@@ -130,6 +144,19 @@ class TestLiterals(unittest.TestCase):
self.assertRaises(SyntaxError, eval, r""" b'\x' """)
self.assertRaises(SyntaxError, eval, r""" b'\x0' """)
+ def test_eval_bytes_invalid_escape(self):
+ # For every byte value 1..127 not listed in the bytes literal below,
+ # evaluating the bytes literal b'\<char>' must emit a DeprecationWarning
+ # and yield a backslash followed by that byte.
+ for b in range(1, 128):
+ if b in b"""\n\r"'01234567\\abfnrtvx""":
+ continue
+ with self.assertWarns(DeprecationWarning):
+ self.assertEqual(eval(r"b'\%c'" % b), b'\\' + bytes([b]))
+ # The invalid escape sits on the second line of a triple-quoted literal:
+ # exactly one warning must be recorded, attributed to '<string>', line 2.
+ with warnings.catch_warnings(record=True) as w:
+ warnings.simplefilter('always', category=DeprecationWarning)
+ eval("b'''\n\\z'''")
+ self.assertEqual(len(w), 1)
+ self.assertEqual(w[0].filename, '<string>')
+ self.assertEqual(w[0].lineno, 2)
+
def test_eval_bytes_raw(self):
self.assertEqual(eval(""" br'x' """), b'x')
self.assertEqual(eval(""" rb'x' """), b'x')
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index fe6cd28c63..0737140ccf 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -2413,13 +2413,6 @@ class UnicodeTest(string_tests.CommonTest,
support.check_free_after_iterating(self, iter, str)
support.check_free_after_iterating(self, reversed, str)
- def test_invalid_sequences(self):
- for letter in string.ascii_letters + "89": # 0-7 are octal escapes
- if letter in "abfnrtuvxNU":
- continue
- with self.assertWarns(DeprecationWarning):
- eval(r"'\%s'" % letter)
-
class CAPITest(unittest.TestCase):
diff --git a/Misc/NEWS b/Misc/NEWS
index fde9ab1960..a03ab3a9d1 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,10 @@ What's New in Python 3.7.0 alpha 1
Core and Builtins
-----------------
+- Issue #28128: Deprecation warning for invalid str and byte escape
+ sequences now prints better information about where the error
+ occurs. Patch by Serhiy Storchaka and Eric Smith.
+
- Issue #28509: dict.update() no longer allocate unnecessary large memory.
- Issue #28426: Fixed potential crash in PyUnicode_AsDecodedObject() in debug
diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c
index 598f6a13cf..779fe295db 100644
--- a/Objects/bytesobject.c
+++ b/Objects/bytesobject.c
@@ -1105,11 +1105,12 @@ _PyBytes_DecodeEscapeRecode(const char **s, const char *end,
return p;
}
-PyObject *PyBytes_DecodeEscape(const char *s,
+PyObject *_PyBytes_DecodeEscape(const char *s,
Py_ssize_t len,
const char *errors,
Py_ssize_t unicode,
- const char *recode_encoding)
+ const char *recode_encoding,
+ const char **first_invalid_escape)
{
int c;
char *p;
@@ -1123,6 +1124,8 @@ PyObject *PyBytes_DecodeEscape(const char *s,
return NULL;
writer.overallocate = 1;
+ *first_invalid_escape = NULL;
+
end = s + len;
while (s < end) {
if (*s != '\\') {
@@ -1207,9 +1210,12 @@ PyObject *PyBytes_DecodeEscape(const char *s,
break;
default:
- if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, "invalid escape sequence '\\%c'", *(--s)) < 0)
- goto failed;
+ if (*first_invalid_escape == NULL) {
+ *first_invalid_escape = s-1; /* Back up one char, since we've
+ already incremented s. */
+ }
*p++ = '\\';
+ s--;
goto non_esc; /* an arbitrary number of unescaped
UTF-8 bytes may follow. */
}
@@ -1222,6 +1228,29 @@ PyObject *PyBytes_DecodeEscape(const char *s,
return NULL;
}
+/* Public entry point: forwards all arguments to _PyBytes_DecodeEscape() and
+   returns its result, or NULL on failure.  If the helper reports an invalid
+   escape sequence, a DeprecationWarning naming the first offending character
+   is issued; if issuing the warning fails, the decoded object is released
+   and NULL is returned. */
+PyObject *PyBytes_DecodeEscape(const char *s,
+                                Py_ssize_t len,
+                                const char *errors,
+                                Py_ssize_t unicode,
+                                const char *recode_encoding)
+{
+    const char *first_invalid_escape;
+    PyObject *result = _PyBytes_DecodeEscape(s, len, errors, unicode,
+                                             recode_encoding,
+                                             &first_invalid_escape);
+    if (result == NULL)
+        return NULL;
+    if (first_invalid_escape != NULL) {
+        /* Cast to unsigned char: plain char may be signed, so a byte
+           >= 0x80 would otherwise be passed to %c as a negative int. */
+        if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+                             "invalid escape sequence '\\%c'",
+                             (unsigned char)*first_invalid_escape) < 0) {
+            Py_DECREF(result);
+            return NULL;
+        }
+    }
+    return result;
+}
/* -------------------------------------------------------------------- */
/* object api */
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 134fa07600..6e63e009a9 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -5877,9 +5877,10 @@ PyUnicode_AsUTF16String(PyObject *unicode)
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
PyObject *
-PyUnicode_DecodeUnicodeEscape(const char *s,
- Py_ssize_t size,
- const char *errors)
+_PyUnicode_DecodeUnicodeEscape(const char *s,
+ Py_ssize_t size,
+ const char *errors,
+ const char **first_invalid_escape)
{
const char *starts = s;
_PyUnicodeWriter writer;
@@ -5887,6 +5888,9 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
+ // so we can remember if we've seen an invalid escape char or not
+ *first_invalid_escape = NULL;
+
if (size == 0) {
_Py_RETURN_UNICODE_EMPTY();
}
@@ -6061,9 +6065,10 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
goto error;
default:
- if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
- "invalid escape sequence '\\%c'", c) < 0)
- goto onError;
+ if (*first_invalid_escape == NULL) {
+ *first_invalid_escape = s-1; /* Back up one char, since we've
+ already incremented s. */
+ }
WRITE_ASCII_CHAR('\\');
WRITE_CHAR(c);
continue;
@@ -6098,6 +6103,27 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
return NULL;
}
+/* Public entry point: forwards s, size and errors to
+   _PyUnicode_DecodeUnicodeEscape() and returns its result, or NULL on
+   failure.  If the helper reports an invalid escape sequence, a
+   DeprecationWarning naming the first offending character is issued; if
+   issuing the warning fails, the decoded object is released and NULL is
+   returned. */
+PyObject *
+PyUnicode_DecodeUnicodeEscape(const char *s,
+                              Py_ssize_t size,
+                              const char *errors)
+{
+    const char *first_invalid_escape;
+    PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
+                                                      &first_invalid_escape);
+    if (result == NULL)
+        return NULL;
+    if (first_invalid_escape != NULL) {
+        /* Cast to unsigned char: plain char may be signed, so a byte
+           >= 0x80 would otherwise be passed to %c as a negative int. */
+        if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+                             "invalid escape sequence '\\%c'",
+                             (unsigned char)*first_invalid_escape) < 0) {
+            Py_DECREF(result);
+            return NULL;
+        }
+    }
+    return result;
+}
+
/* Return a Unicode-Escape string version of the Unicode object.
If quotes is true, the string is enclosed in u"" or u'' quotes as
diff --git a/Python/ast.c b/Python/ast.c
index 76daf6f446..91e7d0129f 100644
--- a/Python/ast.c
+++ b/Python/ast.c
@@ -4113,8 +4113,34 @@ decode_utf8(struct compiling *c, const char **sPtr, const char *end)
return PyUnicode_DecodeUTF8(t, s - t, NULL);
}
+/* Issue a DeprecationWarning for an invalid escape sequence, attributed to
+   c->c_filename and the line of node n.
+
+   first_invalid_escape_char is the character that followed the backslash.
+
+   Returns 0 when the warning was issued without error and -1 on any
+   failure.  When the warning itself was raised as a DeprecationWarning
+   exception, the message is additionally passed to ast_error() for node n
+   (presumably so the error carries the source position -- confirm against
+   ast_error). */
+static int
+warn_invalid_escape_sequence(struct compiling *c, const node *n,
+                             char first_invalid_escape_char)
+{
+    /* Cast to unsigned char: plain char may be signed, so a byte >= 0x80
+       would otherwise be passed to %c as a negative int. */
+    PyObject *msg = PyUnicode_FromFormat(
+        "invalid escape sequence \\%c",
+        (unsigned char)first_invalid_escape_char);
+    if (msg == NULL) {
+        return -1;
+    }
+    if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg,
+                                 c->c_filename, LINENO(n),
+                                 NULL, NULL) < 0)
+    {
+        if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
+            const char *s = PyUnicode_AsUTF8(msg);
+            if (s != NULL) {
+                ast_error(c, n, s);
+            }
+        }
+        /* Propagate every failure, not only the DeprecationWarning case:
+           returning 0 here would leave an exception pending while
+           reporting success to the caller. */
+        Py_DECREF(msg);
+        return -1;
+    }
+    Py_DECREF(msg);
+    return 0;
+}
+
static PyObject *
-decode_unicode_with_escapes(struct compiling *c, const char *s, size_t len)
+decode_unicode_with_escapes(struct compiling *c, const node *n, const char *s,
+ size_t len)
{
PyObject *v, *u;
char *buf;
@@ -4167,11 +4193,41 @@ decode_unicode_with_escapes(struct compiling *c, const char *s, size_t len)
len = p - buf;
s = buf;
- v = PyUnicode_DecodeUnicodeEscape(s, len, NULL);
+ const char *first_invalid_escape;
+ v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
+
+ if (v != NULL && first_invalid_escape != NULL) {
+ if (warn_invalid_escape_sequence(c, n, *first_invalid_escape) < 0) {
+ /* We have not decref u before because first_invalid_escape points
+ inside u. */
+ Py_XDECREF(u);
+ Py_DECREF(v);
+ return NULL;
+ }
+ }
Py_XDECREF(u);
return v;
}
+/* Decode the len bytes at s with _PyBytes_DecodeEscape() and return the
+   resulting object.  If decoding succeeds but an invalid escape sequence
+   was seen, warn about its first character via
+   warn_invalid_escape_sequence(); should that fail, the decoded object is
+   released and NULL is returned.  Returns NULL if decoding itself fails. */
+static PyObject *
+decode_bytes_with_escapes(struct compiling *c, const node *n, const char *s,
+                          size_t len)
+{
+    const char *bad_escape;
+    PyObject *decoded = _PyBytes_DecodeEscape(s, len, NULL, 0, NULL,
+                                              &bad_escape);
+
+    /* bad_escape is only inspected once decoding is known to have
+       succeeded. */
+    if (decoded != NULL && bad_escape != NULL
+        && warn_invalid_escape_sequence(c, n, *bad_escape) < 0)
+    {
+        Py_DECREF(decoded);
+        return NULL;
+    }
+    return decoded;
+}
+
/* Compile this expression in to an expr_ty. Add parens around the
expression, in order to allow leading spaces in the expression. */
static expr_ty
@@ -4310,7 +4366,7 @@ done:
literal_end-literal_start,
NULL, NULL);
else
- *literal = decode_unicode_with_escapes(c, literal_start,
+ *literal = decode_unicode_with_escapes(c, n, literal_start,
literal_end-literal_start);
if (!*literal)
return -1;
@@ -5048,12 +5104,12 @@ parsestr(struct compiling *c, const node *n, int *bytesmode, int *rawmode,
if (*rawmode)
*result = PyBytes_FromStringAndSize(s, len);
else
- *result = PyBytes_DecodeEscape(s, len, NULL, /* ignored */ 0, NULL);
+ *result = decode_bytes_with_escapes(c, n, s, len);
} else {
if (*rawmode)
*result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
else
- *result = decode_unicode_with_escapes(c, s, len);
+ *result = decode_unicode_with_escapes(c, n, s, len);
}
return *result == NULL ? -1 : 0;
}