summaryrefslogtreecommitdiff
path: root/Parser/tokenizer.c
diff options
context:
space:
mode:
Diffstat (limited to 'Parser/tokenizer.c')
-rw-r--r--Parser/tokenizer.c67
1 files changed, 43 insertions, 24 deletions
diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c
index 5ba12a409d..f4d7e3fc68 100644
--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -128,7 +128,6 @@ tok_new(void)
tok->prompt = tok->nextprompt = NULL;
tok->lineno = 0;
tok->level = 0;
- tok->filename = NULL;
tok->altwarning = 1;
tok->alterror = 1;
tok->alttabsize = 1;
@@ -140,6 +139,7 @@ tok_new(void)
tok->encoding = NULL;
tok->cont_line = 0;
#ifndef PGEN
+ tok->filename = NULL;
tok->decoding_readline = NULL;
tok->decoding_buffer = NULL;
#endif
@@ -545,7 +545,6 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
{
char *line = NULL;
int badchar = 0;
- PyObject *filename;
for (;;) {
if (tok->decoding_state == STATE_NORMAL) {
/* We already have a codec associated with
@@ -586,19 +585,12 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
if (badchar) {
/* Need to add 1 to the line number, since this line
has not been counted, yet. */
- if (tok->filename != NULL)
- filename = PyUnicode_DecodeFSDefault(tok->filename);
- else
- filename = PyUnicode_FromString("<file>");
- if (filename != NULL) {
- PyErr_Format(PyExc_SyntaxError,
- "Non-UTF-8 code starting with '\\x%.2x' "
- "in file %U on line %i, "
- "but no encoding declared; "
- "see http://python.org/dev/peps/pep-0263/ for details",
- badchar, filename, tok->lineno + 1);
- Py_DECREF(filename);
- }
+ PyErr_Format(PyExc_SyntaxError,
+ "Non-UTF-8 code starting with '\\x%.2x' "
+ "in file %U on line %i, "
+ "but no encoding declared; "
+ "see http://python.org/dev/peps/pep-0263/ for details",
+ badchar, tok->filename, tok->lineno + 1);
return error_ret(tok);
}
#endif
@@ -856,6 +848,7 @@ PyTokenizer_Free(struct tok_state *tok)
#ifndef PGEN
Py_XDECREF(tok->decoding_readline);
Py_XDECREF(tok->decoding_buffer);
+ Py_XDECREF(tok->filename);
#endif
if (tok->fp != NULL && tok->buf != NULL)
PyMem_FREE(tok->buf);
@@ -1250,8 +1243,13 @@ indenterror(struct tok_state *tok)
return 1;
}
if (tok->altwarning) {
- PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
+#ifdef PGEN
+ PySys_WriteStderr("inconsistent use of tabs and spaces "
+ "in indentation\n");
+#else
+ PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
"in indentation\n", tok->filename);
+#endif
tok->altwarning = 0;
}
return 0;
@@ -1692,17 +1690,18 @@ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
return result;
}
-/* Get -*- encoding -*- from a Python file.
+/* Get the encoding of a Python file. Check for the coding cookie and check if
+ the file starts with a BOM.
- PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
- the first or second line of the file (in which case the encoding
- should be assumed to be PyUnicode_GetDefaultEncoding()).
+ PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
+ encoding in the first or second line of the file (in which case the encoding
+ should be assumed to be UTF-8).
+
+ The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
+ by the caller. */
- The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
- by the caller.
-*/
char *
-PyTokenizer_FindEncoding(int fd)
+PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
{
struct tok_state *tok;
FILE *fp;
@@ -1721,6 +1720,20 @@ PyTokenizer_FindEncoding(int fd)
fclose(fp);
return NULL;
}
+#ifndef PGEN
+ if (filename != NULL) {
+ Py_INCREF(filename);
+ tok->filename = filename;
+ }
+ else {
+ tok->filename = PyUnicode_FromString("<string>");
+ if (tok->filename == NULL) {
+ fclose(fp);
+ PyTokenizer_Free(tok);
+ return encoding;
+ }
+ }
+#endif
while (tok->lineno < 2 && tok->done == E_OK) {
PyTokenizer_Get(tok, &p_start, &p_end);
}
@@ -1734,6 +1747,12 @@ PyTokenizer_FindEncoding(int fd)
return encoding;
}
+char *
+PyTokenizer_FindEncoding(int fd)
+{
+ return PyTokenizer_FindEncodingFilename(fd, NULL);
+}
+
#ifdef Py_DEBUG
void