implicit-str-concat-in-sequence: Handling lines with multi-byte characters - fix #2610 (#2611)

author: Lucas Cimon <lucas.cimon@gmail.com> 2018-11-26 14:22:04 +0100
committer: Claudiu Popa <pcmanticore@gmail.com> 2018-11-26 14:22:04 +0100
commit: f4ebdce59f7043eb612d3363244f3c3cb7b7b8ce (patch)
tree: c6284f3d79aa3f0e98d4200e801acf9f88f4cb8c
parent: 1ac83855b38a8d053ec5d0403a5c0e6e8b191fbb (diff)
download: pylint-git-f4ebdce59f7043eb612d3363244f3c3cb7b7b8ce.tar.gz
6 files changed, 24 insertions, 4 deletions
diff --git a/pylint/checkers/strings.py b/pylint/checkers/strings.py
index d1f227133..92dcbbeec 100644
--- a/pylint/checkers/strings.py
+++ b/pylint/checkers/strings.py
@@ -592,12 +592,20 @@ class StringConstantChecker(BaseTokenChecker):
         self._unicode_literals = "unicode_literals" in module.future_imports
 
     def process_tokens(self, tokens):
-        for i, (tok_type, token, start, _, _) in enumerate(tokens):
-            if tok_type == tokenize.STRING:
+        encoding = "ascii"
+        for i, (tok_type, token, start, _, line) in enumerate(tokens):
+            if tok_type == tokenize.ENCODING:
+                # this is always the first token processed
+                encoding = token
+            elif tok_type == tokenize.STRING:
                 # 'token' is the whole un-parsed token; we can look at the start
                 # of it to see whether it's a raw or unicode string etc.
                 self.process_string_token(token, start[0])
                 next_token = tokens[i + 1] if i + 1 < len(tokens) else None
+                if encoding != "ascii":
+                    # We convert `tokenize` character count into a byte count,
+                    # to match with astroid `.col_offset`
+                    start = (start[0], len(line[: start[1]].encode(encoding)))
                 self.string_tokens[start] = (str_eval(token), next_token)
 
     @check_messages(*(MSGS.keys()))
@@ -618,6 +626,10 @@ class StringConstantChecker(BaseTokenChecker):
                 if elt.col_offset < 0:
                     # This can happen in case of escaped newlines
                     continue
+                if (elt.lineno, elt.col_offset) not in self.string_tokens:
+                    # This may happen with Latin1 encoding
+                    # cf. https://github.com/PyCQA/pylint/issues/2610
+                    continue
                 matching_token, next_token = self.string_tokens[
                     (elt.lineno, elt.col_offset)
                 ]
diff --git a/pylint/test/functional/implicit_str_concat_in_sequence_latin1.py b/pylint/test/functional/implicit_str_concat_in_sequence_latin1.py
new file mode 100644
index 000000000..44a2a94cb
--- /dev/null
+++ b/pylint/test/functional/implicit_str_concat_in_sequence_latin1.py
@@ -0,0 +1,4 @@
+# coding: latin_1
+#pylint: disable=bad-continuation,invalid-name,missing-docstring
+
+TOTO = ('Café', 'Café', 'Café')
diff --git a/pylint/test/functional/implicit_str_concat_in_sequence_latin1.txt b/pylint/test/functional/implicit_str_concat_in_sequence_latin1.txt
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/pylint/test/functional/implicit_str_concat_in_sequence_latin1.txt
diff --git a/pylint/test/functional/implicit_str_concat_in_sequence_utf8.py b/pylint/test/functional/implicit_str_concat_in_sequence_utf8.py
new file mode 100644
index 000000000..fa996e90c
--- /dev/null
+++ b/pylint/test/functional/implicit_str_concat_in_sequence_utf8.py
@@ -0,0 +1,3 @@
+#pylint: disable=bad-continuation,invalid-name,missing-docstring
+
+TOTO = ('Café', 'Café', 'Café')
diff --git a/pylint/test/functional/implicit_str_concat_in_sequence_utf8.txt b/pylint/test/functional/implicit_str_concat_in_sequence_utf8.txt
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/pylint/test/functional/implicit_str_concat_in_sequence_utf8.txt
diff --git a/pylint/test/test_functional.py b/pylint/test/test_functional.py
index c5d19bb09..576cd0d56 100644
--- a/pylint/test/test_functional.py
+++ b/pylint/test/test_functional.py
@@ -279,8 +279,9 @@ class LintModuleTest(object):
     def _open_source_file(self):
         if self._test_file.base == "invalid_encoded_data":
             return open(self._test_file.source)
-        else:
-            return io.open(self._test_file.source, encoding="utf8")
+        if "latin1" in self._test_file.base:
+            return io.open(self._test_file.source, encoding="latin1")
+        return io.open(self._test_file.source, encoding="utf8")
 
     def _get_expected(self):
         with self._open_source_file() as fobj:
author	Lucas Cimon <lucas.cimon@gmail.com>	2018-11-26 14:22:04 +0100
committer	Claudiu Popa <pcmanticore@gmail.com>	2018-11-26 14:22:04 +0100
commit	f4ebdce59f7043eb612d3363244f3c3cb7b7b8ce (patch)
tree	c6284f3d79aa3f0e98d4200e801acf9f88f4cb8c
parent	1ac83855b38a8d053ec5d0403a5c0e6e8b191fbb (diff)
download	pylint-git-f4ebdce59f7043eb612d3363244f3c3cb7b7b8ce.tar.gz