# Licensed under the GPL: https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
# For details: https://github.com/pylint-dev/pylint/blob/main/LICENSE
# Copyright (c) https://github.com/pylint-dev/pylint/blob/main/CONTRIBUTORS.txt

"""Tests for the private helper functions of ``pylint.checkers.unicode``."""

from __future__ import annotations

import itertools
from pathlib import Path

import pytest

import pylint.checkers.unicode

# Lookup table keyed by the UTF-8 encoded form of every bad character, used
# by the byte-oriented test cases below.
SEARCH_DICT_BYTE_UTF8 = {
    char.unescaped.encode("utf-8"): char
    for char in pylint.checkers.unicode.BAD_CHARS
}

# Text samples shared by the UTF-16 and UTF-32 line length tests; each test
# encodes them with its own codec before measuring.
_TEXT_LINE_SAMPLES = [
    pytest.param("1234567890", id="no_line_ending"),
    pytest.param("1234567890\n", id="linux"),
    pytest.param("1234567890\r\n", id="windows"),
    pytest.param("12345678\n\r", id="wrong_order"),
]


@pytest.mark.parametrize(
    "line, expected, search_dict",
    [
        # Test special carriage return cases
        pytest.param(
            "valid windows\r\n",
            {},
            pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT,
            id="valid-windows",
        ),
        pytest.param(
            b"TOTO = ('Caf\xe9', 'Caf\xe9', 'Caf\xe9')\r\n",
            {},
            SEARCH_DICT_BYTE_UTF8,
            id="valid-windows-bytes",
        ),
        pytest.param(
            "invalid\r windows\r\n",
            {7: pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT["\r"]},
            pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT,
            id="invalid-carrier-return-windows",
        ),
        pytest.param(
            "invalid\r linux\n",
            {7: pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT["\r"]},
            pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT,
            id="invalid-carrier-return-linux",
        ),
        pytest.param(
            b"invalid\r windows\r\n",
            {7: pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT["\r"]},
            SEARCH_DICT_BYTE_UTF8,
            id="invalid-carrier-return-windows-bytes",
        ),
        pytest.param(
            b"invalid\r linux\n",
            {7: pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT["\r"]},
            SEARCH_DICT_BYTE_UTF8,
            id="invalid-carrier-return-linux-bytes",
        ),
        # Auto-generate all remaining Linux cases ...
        *(
            pytest.param(
                f"invalid{char.unescaped} back\n",
                {7: char},
                pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT,
                id=f"invalid-{char.name}-linux",
            )
            for char in pylint.checkers.unicode.BAD_CHARS
            if char.unescaped != "\r"
        ),
        # ... also byte encoded
        # NOTE(review): the last BAD_CHARS entry is skipped here, presumably
        # because it cannot be encoded as ASCII - confirm against BAD_CHARS.
        *(
            pytest.param(
                f"invalid{char.unescaped} back\n".encode("ASCII"),
                {7: char},
                SEARCH_DICT_BYTE_UTF8,
                id=f"invalid-{char.name}-linux-bytes",
            )
            for char in pylint.checkers.unicode.BAD_CHARS[:-1]
            if char.unescaped != "\r"
        ),
        # Test all remaining windows cases ...
        *(
            pytest.param(
                f"invalid{char.unescaped} back\r\n",
                {7: char},
                pylint.checkers.unicode.BAD_ASCII_SEARCH_DICT,
                id=f"invalid-{char.name}-windows",
            )
            for char in pylint.checkers.unicode.BAD_CHARS
            if char.unescaped != "\r"
        ),
        # ... also byte encoded
        # NOTE(review): same [:-1] exclusion as the Linux byte cases above.
        *(
            pytest.param(
                f"invalid{char.unescaped} back\r\n".encode("ASCII"),
                {7: char},
                SEARCH_DICT_BYTE_UTF8,
                id=f"invalid-{char.name}-windows-bytes",
            )
            for char in pylint.checkers.unicode.BAD_CHARS[:-1]
            if char.unescaped != "\r"
        ),
    ],
)
def test_map_positions_to_result(
    line: pylint.checkers.unicode._StrLike,
    expected: dict[int, pylint.checkers.unicode._BadChar],
    search_dict: dict[
        pylint.checkers.unicode._StrLike, pylint.checkers.unicode._BadChar
    ],
) -> None:
    """Test all possible outcomes for map position function in UTF-8 and ASCII.

    ``expected`` maps a position in ``line`` to the bad character reported
    there; in the generated cases the offending character sits at index 7,
    directly after the word "invalid".
    """
    # The newline marker must have the same type (str or bytes) as the line.
    if isinstance(line, bytes):
        newline = b"\n"
    else:
        newline = "\n"
    assert (
        pylint.checkers.unicode._map_positions_to_result(
            line, search_dict, new_line=newline
        )
        == expected
    )


@pytest.mark.parametrize(
    "line",
    [
        pytest.param("1234567890", id="no_line_ending"),
        pytest.param(b"1234567890", id="no_line_ending_byte"),
        pytest.param("1234567890\n", id="linux"),
        pytest.param(b"1234567890\n", id="linux_byte"),
        pytest.param("1234567890\r\n", id="windows"),
        pytest.param(b"1234567890\r\n", id="windows_byte"),
        # Only eight digits here: the expected length of 10 means the
        # reversed line ending is counted as two regular characters.
        pytest.param("12345678\n\r", id="wrong_order"),
        pytest.param(b"12345678\n\r", id="wrong_order_byte"),
    ],
)
def test_line_length(line: pylint.checkers.unicode._StrLike) -> None:
    """Every str and bytes sample must be reported with a length of 10."""
    assert pylint.checkers.unicode._line_length(line, "utf-8") == 10


@pytest.mark.parametrize("line", _TEXT_LINE_SAMPLES)
def test_line_length_utf16(line: str) -> None:
    """UTF-16 encoded samples must still be reported with a length of 10."""
    assert pylint.checkers.unicode._line_length(line.encode("utf-16"), "utf-16") == 10


@pytest.mark.parametrize("line", _TEXT_LINE_SAMPLES)
def test_line_length_utf32(line: str) -> None:
    """UTF-32 encoded samples must still be reported with a length of 10."""
    assert pylint.checkers.unicode._line_length(line.encode("utf-32"), "utf-32") == 10


@pytest.mark.parametrize(
    "codec, expected",
    [
        ("utf-8sig", "utf-8"),
        ("utf8", "utf-8"),
        ("utf 8", "utf-8"),
        ("utf-8", "utf-8"),
        ("utf-16", "utf-16"),
        ("utf-32", "utf-32"),
        ("utf 16", "utf-16"),
        ("utf 32", "utf-32"),
        ("utf 16 LE", "utf-16le"),
        ("utf 32-BE", "utf-32be"),
        ("UTF-32", "utf-32"),
        ("UTF-32-le", "utf-32le"),
        ("UTF-16 LE", "utf-16le"),
        ("UTF-16BE", "utf-16be"),
        ("UTF8", "utf-8"),
        ("Latin1", "latin1"),
        ("ASCII", "ascii"),
    ],
)
def test__normalize_codec_name(codec: str, expected: str) -> None:
    """Each codec spelling must be normalized to the expected name."""
    assert pylint.checkers.unicode._normalize_codec_name(codec) == expected


@pytest.mark.parametrize(
    "codec, line_ending, final_new_line",
    [
        pytest.param(
            codec,
            line_ending[0],
            final_nl[0],
            id=f"{codec}_{line_ending[1]}_{final_nl[1]}",
        )
        # Each option is a (value, id-fragment) pair; the full cross product
        # of codecs, line endings and final-newline settings is tested.
        for codec, line_ending, final_nl in itertools.product(
            (
                "utf-8",
                "utf-16",
                "utf-16le",
                "utf-16be",
                "utf-32",
                "utf-32le",
                "utf-32be",
            ),
            (("\n", "linux"), ("\r\n", "windows")),
            ((True, "final_nl"), (False, "no_final_nl")),
        )
    ],
)
def test___fix_utf16_32_line_stream(
    tmp_path: Path, codec: str, line_ending: str, final_new_line: bool
) -> None:
    """Content of stream should be the same as should be the length.

    A small multi-line text is written to a temporary file in ``codec``; the
    lines yielded by ``_fix_utf16_32_line_stream`` must decode back to the
    original lines and, concatenated, reproduce the file's bytes exactly.
    """
    file = tmp_path / "test.txt"

    content = [
        f"line1{line_ending}",
        f"# Line 2{line_ending}",
        f"łöł{line_ending}",
        f"last line{line_ending if final_new_line else ''}",
    ]

    text = "".join(content)
    encoded = text.encode(codec)

    file.write_bytes(encoded)

    # Consume the stream while the file is still open.
    with file.open("rb") as f:
        raw_lines = list(
            pylint.checkers.unicode._fix_utf16_32_line_stream(f, codec)
        )

    # Test content equality
    assert [raw_line.decode(codec) for raw_line in raw_lines] == content
    # Test byte equality
    assert b"".join(raw_lines) == encoded


@pytest.mark.parametrize(
    "codec, expected",
    [
        ("utf-32", 4),
        ("utf-32-le", 4),
        ("utf-16", 2),
        ("utf-8", 1),
        ("latin1", 1),
        ("ascii", 1),
    ],
)
def test__byte_to_str_length(codec: str, expected: int) -> None:
    """``_byte_to_str_length`` must return the expected value per codec."""
    assert pylint.checkers.unicode._byte_to_str_length(codec) == expected