From 6703bb3a1893eb6c8e10425ad336cbb5b002666f Mon Sep 17 00:00:00 2001 From: Jason Yundt Date: Tue, 2 Jan 2024 09:35:36 -0500 Subject: [PATCH 1/6] tests: Use correct encoding for path Before this change, build_temp_workspace() would always encode a path using UTF-8 and the strict error handler [1]. Most of the time, this is fine, but systems do not necessarily use UTF-8 and the strict error handler for paths [2]. [1]: [2]: --- tests/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/common.py b/tests/common.py index 29dcfb9c..25b2f6e1 100644 --- a/tests/common.py +++ b/tests/common.py @@ -87,7 +87,7 @@ def build_temp_workspace(files): tempdir = tempfile.mkdtemp(prefix='yamllint-tests-') for path, content in files.items(): - path = os.path.join(tempdir, path).encode('utf-8') + path = os.fsencode(os.path.join(tempdir, path)) if not os.path.exists(os.path.dirname(path)): os.makedirs(os.path.dirname(path)) From 48817899df0cef3f6d09dddbdb232b6d5734cce8 Mon Sep 17 00:00:00 2001 From: Jason Yundt Date: Wed, 3 Jan 2024 11:50:42 -0500 Subject: [PATCH 2/6] tests: Restore stdout and stderr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before this commit, test_run_default_format_output_in_tty() changed the values of sys.stdout and sys.stderr, but it would never change them back. This commit makes sure that they get changed back. At the moment, this commit doesn’t make a user-visible difference. A future commit will add a new test named test_ignored_from_file_with_multiple_encodings(). That new test requires stdout and stderr to be restored, or else it will fail. 
--- tests/test_cli.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_cli.py b/tests/test_cli.py index e0ae0fee..765d7083 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -499,6 +499,7 @@ def test_run_default_format_output_in_tty(self): path = os.path.join(self.wd, 'a.yaml') # Create a pseudo-TTY and redirect stdout to it + old_stdout, old_stderr = sys.stdout, sys.stderr master, slave = pty.openpty() sys.stdout = sys.stderr = os.fdopen(slave, 'w') @@ -518,6 +519,7 @@ def test_run_default_format_output_in_tty(self): sys.stdout.close() sys.stderr.close() output.close() + sys.stdout, sys.stderr = old_stdout, old_stderr self.assertEqual(out, ( f'\033[4m{path}\033[0m\n' From e5ef0391c0dd9de41ac69f2990821d6b3221a732 Mon Sep 17 00:00:00 2001 From: Jason Yundt Date: Sat, 30 Dec 2023 12:51:24 -0500 Subject: [PATCH 3/6] decoder: Autodetect encoding of YAML files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before this change, yamllint would open YAML files using open()’s default encoding. As long as UTF-8 mode isn’t enabled, open() defaults to using the system’s locale encoding [1][2]. This can cause problems in multiple different scenarios. The first scenario involves linting UTF-8 YAML files on Linux systems. Most of the time, the locale encoding on Linux systems is set to UTF-8 [3][4], but it can be set to something else [5]. In the unlikely event that someone was using Linux with a locale encoding other than UTF-8, there was a chance that yamllint would crash with a UnicodeDecodeError. The second scenario involves linting UTF-8 YAML files on Windows systems. The locale encoding on Windows systems is the system’s ANSI code page [6]. The ANSI code page on Windows systems is NOT set to UTF-8 by default [7]. In the very likely event that someone was using Windows with a locale encoding other than UTF-8, there was a chance that yamllint would crash with a UnicodeDecodeError. 
Additionally, using open()’s default encoding is a violation of the YAML spec. Chapter 5.2 says: “On input, a YAML processor must support the UTF-8 and UTF-16 character encodings. For JSON compatibility, the UTF-32 encodings must also be supported. If a character stream begins with a byte order mark, the character encoding will be taken to be as indicated by the byte order mark. Otherwise, the stream must begin with an ASCII character. This allows the encoding to be deduced by the pattern of null (x00) characters.” [8] This change fixes all of those problems by implementing the YAML spec’s character encoding detection algorithm. Now, as long as YAML files begin with either a byte order mark or an ASCII character, yamllint will automatically detect them as being UTF-8, UTF-16 or UTF-32. Other character encodings are not supported at the moment. Credit for the idea of having tests with pre-encoded strings goes to @adrienverge [9]. Fixes #218. Fixes #238. Fixes #347. [1]: [2]: [3]: [4]: [5]: [6]: [7]: [8]: [9]: --- tests/common.py | 182 +++++++++++++---- tests/test_cli.py | 58 +++++- tests/test_decoder.py | 452 ++++++++++++++++++++++++++++++++++++++++++ yamllint/cli.py | 2 +- yamllint/config.py | 5 +- yamllint/decoder.py | 65 ++++++ yamllint/linter.py | 4 +- 7 files changed, 729 insertions(+), 39 deletions(-) create mode 100644 tests/test_decoder.py create mode 100644 yamllint/decoder.py diff --git a/tests/common.py b/tests/common.py index 25b2f6e1..7b73e3ca 100644 --- a/tests/common.py +++ b/tests/common.py @@ -13,6 +13,7 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . 
+import codecs import contextlib from io import StringIO import os @@ -20,6 +21,8 @@ import sys import tempfile import unittest +import warnings +from codecs import CodecInfo import yaml @@ -27,6 +30,151 @@ from yamllint.config import YamlLintConfig +# Encoding related stuff: +UTF_CODECS = ( + 'utf_32_be', + 'utf_32_be_sig', + 'utf_32_le', + 'utf_32_le_sig', + 'utf_16_be', + 'utf_16_be_sig', + 'utf_16_le', + 'utf_16_le_sig', + 'utf_8', + 'utf_8_sig' +) + + +def encode_utf_32_be_sig(obj, errors='strict'): + return ( + codecs.BOM_UTF32_BE + codecs.encode(obj, 'utf_32_be', errors), + len(obj) + ) + + +def encode_utf_32_le_sig(obj, errors='strict'): + return ( + codecs.BOM_UTF32_LE + codecs.encode(obj, 'utf_32_le', errors), + len(obj) + ) + + +def encode_utf_16_be_sig(obj, errors='strict'): + return ( + codecs.BOM_UTF16_BE + codecs.encode(obj, 'utf_16_be', errors), + len(obj) + ) + + +def encode_utf_16_le_sig(obj, errors='strict'): + return ( + codecs.BOM_UTF16_LE + codecs.encode(obj, 'utf_16_le', errors), + len(obj) + ) + + +test_codec_infos = { + 'utf_32_be_sig': CodecInfo(encode_utf_32_be_sig, codecs.getdecoder('utf_32')), # noqa: E501 + 'utf_32_le_sig': CodecInfo(encode_utf_32_le_sig, codecs.getdecoder('utf_32')), # noqa: E501 + 'utf_16_be_sig': CodecInfo(encode_utf_16_be_sig, codecs.getdecoder('utf_16')), # noqa: E501 + 'utf_16_le_sig': CodecInfo(encode_utf_16_le_sig, codecs.getdecoder('utf_16')), # noqa: E501 +} + + +def register_test_codecs(): + codecs.register(test_codec_infos.get) + + +def unregister_test_codecs(): + if sys.version_info >= (3, 10, 0): + codecs.unregister(test_codec_infos.get) + else: + warnings.warn( + "This version of Python doesn’t allow us to unregister codecs.", + stacklevel=1 + ) + + +def is_test_codec(codec): + return codec in test_codec_infos.keys() + + +def test_codec_built_in_equivalent(test_codec): + return_value = test_codec + for suffix in ('_sig', '_be', '_le'): + return_value = return_value.replace(suffix, '') + return 
return_value + + +def uses_bom(codec): + for suffix in ('_32', '_16', '_sig'): + if codec.endswith(suffix): + return True + return False + + +def encoding_detectable(string, codec): + """ + Returns True if encoding can be detected after string is encoded + + Encoding detection only works if you’re using a BOM or the first character + is ASCII. See yamllint.decoder.auto_decode()’s docstring. + """ + return uses_bom(codec) or (len(string) > 0 and string[0].isascii()) + + +# Workspace related stuff: +class Blob: + def __init__(self, text, encoding): + self.text = text + self.encoding = encoding + + +def build_temp_workspace(files): + tempdir = tempfile.mkdtemp(prefix='yamllint-tests-') + + for path, content in files.items(): + path = os.fsencode(os.path.join(tempdir, path)) + if not os.path.exists(os.path.dirname(path)): + os.makedirs(os.path.dirname(path)) + + if isinstance(content, list): + os.mkdir(path) + elif isinstance(content, str) and content.startswith('symlink://'): + os.symlink(content[10:], path) + else: + if isinstance(content, Blob): + content = content.text.encode(content.encoding) + mode = 'wb' if isinstance(content, bytes) else 'w' + with open(path, mode) as f: + f.write(content) + + return tempdir + + +@contextlib.contextmanager +def temp_workspace(files): + """Provide a temporary workspace that is automatically cleaned up.""" + backup_wd = os.getcwd() + wd = build_temp_workspace(files) + + try: + os.chdir(wd) + yield + finally: + os.chdir(backup_wd) + shutil.rmtree(wd) + + +def temp_workspace_with_files_in_many_codecs(path_template, text): + workspace = {} + for codec in UTF_CODECS: + if encoding_detectable(text, codec): + workspace[path_template.format(codec)] = Blob(text, codec) + return workspace + + +# Miscellaneous stuff: class RuleTestCase(unittest.TestCase): def build_fake_config(self, conf): if conf is None: @@ -81,37 +229,3 @@ def __exit__(self, *exc_info): @property def returncode(self): return self._raises_ctx.exception.code - - -def 
build_temp_workspace(files): - tempdir = tempfile.mkdtemp(prefix='yamllint-tests-') - - for path, content in files.items(): - path = os.fsencode(os.path.join(tempdir, path)) - if not os.path.exists(os.path.dirname(path)): - os.makedirs(os.path.dirname(path)) - - if isinstance(content, list): - os.mkdir(path) - elif isinstance(content, str) and content.startswith('symlink://'): - os.symlink(content[10:], path) - else: - mode = 'wb' if isinstance(content, bytes) else 'w' - with open(path, mode) as f: - f.write(content) - - return tempdir - - -@contextlib.contextmanager -def temp_workspace(files): - """Provide a temporary workspace that is automatically cleaned up.""" - backup_wd = os.getcwd() - wd = build_temp_workspace(files) - - try: - os.chdir(wd) - yield - finally: - os.chdir(backup_wd) - shutil.rmtree(wd) diff --git a/tests/test_cli.py b/tests/test_cli.py index 765d7083..51825efc 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -23,7 +23,14 @@ import unittest from io import StringIO -from tests.common import build_temp_workspace, RunContext, temp_workspace +from tests.common import ( + build_temp_workspace, + register_test_codecs, + RunContext, + temp_workspace, + unregister_test_codecs, + temp_workspace_with_files_in_many_codecs, +) from yamllint import cli, config @@ -819,3 +826,52 @@ def test_multiple_parent_config_file(self): self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr), (0, './4spaces.yml:2:5: [warning] wrong indentation: ' 'expected 3 but found 4 (indentation)\n', '')) + + +class CommandLineEncodingTestCase(unittest.TestCase): + @classmethod + def setUpClass(cls): + super().setUpClass() + register_test_codecs() + + @classmethod + def tearDownClass(cls): + super().tearDownClass() + unregister_test_codecs() + + def test_valid_encodings(self): + conf = ('---\n' + 'rules:\n' + ' key-ordering: enable\n') + config_files = temp_workspace_with_files_in_many_codecs( + 'config_{}.yaml', + conf + ) + sorted_correctly = ('---\n' + 'A: YAML\n' + 
'Z: YAML\n') + sorted_correctly_files = temp_workspace_with_files_in_many_codecs( + 'sorted_correctly/{}.yaml', + sorted_correctly + ) + sorted_incorrectly = ('---\n' + 'Z: YAML\n' + 'A: YAML\n') + sorted_incorrectly_files = temp_workspace_with_files_in_many_codecs( + 'sorted_incorrectly/{}.yaml', + sorted_incorrectly + ) + workspace = { + **config_files, + **sorted_correctly_files, + **sorted_incorrectly_files + } + + with temp_workspace(workspace): + for config_path in config_files.keys(): + with RunContext(self) as ctx: + cli.run(('-c', config_path, 'sorted_correctly/')) + self.assertEqual(ctx.returncode, 0) + with RunContext(self) as ctx: + cli.run(('-c', config_path, 'sorted_incorrectly/')) + self.assertNotEqual(ctx.returncode, 0) diff --git a/tests/test_decoder.py b/tests/test_decoder.py new file mode 100644 index 00000000..f7ef1650 --- /dev/null +++ b/tests/test_decoder.py @@ -0,0 +1,452 @@ +# Copyright (C) 2023–2024 Jason Yundt +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+ +import codecs +import unittest + +from tests.common import ( + UTF_CODECS, + encoding_detectable, + is_test_codec, + register_test_codecs, + test_codec_built_in_equivalent, + unregister_test_codecs, + uses_bom, +) + +from yamllint import decoder + + +class PreEncodedTestStringInfo(): + def __init__( + self, + input_bytes, + codec_for_input_bytes, + expected_output_str + ): + self.input_bytes = input_bytes + self.codec_for_input_bytes = codec_for_input_bytes + self.expected_output_str = expected_output_str + + +PRE_ENCODED_TEST_STRING_INFOS = ( + # An empty string + PreEncodedTestStringInfo( + b'', + None, + '' + ), + + # A single ASCII character + PreEncodedTestStringInfo( + b'\x00\x00\x00|', + 'utf_32_be', + '|' + ), + PreEncodedTestStringInfo( + b'\x00\x00\xfe\xff\x00\x00\x00|', + 'utf_32', + '|' + ), + PreEncodedTestStringInfo( + b'|\x00\x00\x00', + 'utf_32_le', + '|' + ), + PreEncodedTestStringInfo( + b'\xff\xfe\x00\x00|\x00\x00\x00', + 'utf_32', # LE with BOM + '|' + ), + PreEncodedTestStringInfo( + b'\x00|', + 'utf_16_be', + '|' + ), + PreEncodedTestStringInfo( + b'\xfe\xff\x00|', + 'utf_16', # BE with BOM + '|' + ), + PreEncodedTestStringInfo( + b'|\x00', + 'utf_16_le', + '|' + ), + PreEncodedTestStringInfo( + b'\xff\xfe|\x00', + 'utf_16', # LE with BOM + '|' + ), + PreEncodedTestStringInfo( + b'|', + 'utf_8', + '|' + ), + PreEncodedTestStringInfo( + b'\xef\xbb\xbf|', + 'utf_8_sig', + '|' + ), + + # A string that starts with an ASCII character + PreEncodedTestStringInfo( + b'\x00\x00\x00W\x00\x00\x00h\x00\x00\x00a\x00\x00\x00t\x00\x00 \x19\x00\x00\x00s\x00\x00\x00 \x00\x00\x00u\x00\x00\x00p\x00\x00\x00?', # noqa: E501 + 'utf_32_be', + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\x00\x00\xfe\xff\x00\x00\x00W\x00\x00\x00h\x00\x00\x00a\x00\x00\x00t\x00\x00 \x19\x00\x00\x00s\x00\x00\x00 \x00\x00\x00u\x00\x00\x00p\x00\x00\x00?', # noqa: E501 + 'utf_32', # BE with BOM + 'What’s up?' 
+ ), + PreEncodedTestStringInfo( + b'W\x00\x00\x00h\x00\x00\x00a\x00\x00\x00t\x00\x00\x00\x19 \x00\x00s\x00\x00\x00 \x00\x00\x00u\x00\x00\x00p\x00\x00\x00?\x00\x00\x00', # noqa: E501 + 'utf_32_le', + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\xff\xfe\x00\x00W\x00\x00\x00h\x00\x00\x00a\x00\x00\x00t\x00\x00\x00\x19 \x00\x00s\x00\x00\x00 \x00\x00\x00u\x00\x00\x00p\x00\x00\x00?\x00\x00\x00', # noqa: E501 + 'utf_32', # LE with BOM + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\x00W\x00h\x00a\x00t \x19\x00s\x00 \x00u\x00p\x00?', + 'utf_16_be', + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\xfe\xff\x00W\x00h\x00a\x00t \x19\x00s\x00 \x00u\x00p\x00?', + 'utf_16', # BE with BOM + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'W\x00h\x00a\x00t\x00\x19 s\x00 \x00u\x00p\x00?\x00', + 'utf_16_le', + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\xff\xfeW\x00h\x00a\x00t\x00\x19 s\x00 \x00u\x00p\x00?\x00', + 'utf_16', # LE with BOM + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'What\xe2\x80\x99s up?', + 'utf_8', + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\xef\xbb\xbfWhat\xe2\x80\x99s up?', + 'utf_8_sig', + 'What’s up?' + ), + + # A single non-ASCII character + PreEncodedTestStringInfo( + b'\x00\x00\xfe\xff\x00\x01\xf4;', + 'utf_32', # BE with BOM + '🐻' + ), + PreEncodedTestStringInfo( + b'\xff\xfe\x00\x00;\xf4\x01\x00', + 'utf_32', # LE with BOM + '🐻' + ), + PreEncodedTestStringInfo( + b'\xfe\xff\xd8=\xdc;', + 'utf_16', # BE with BOM + '🐻' + ), + PreEncodedTestStringInfo( + b'\xff\xfe=\xd8;\xdc', + 'utf_16', # LE with BOM + '🐻' + ), + PreEncodedTestStringInfo( + b'\xef\xbb\xbf\xf0\x9f\x90\xbb', + 'utf_8_sig', + '🐻' + ), + + # A string that starts with a non-ASCII character + PreEncodedTestStringInfo( + b'\x00\x00\xfe\xff\x00\x00\x00\xc7\x00\x00\x00a\x00\x00\x00 \x00\x00\x00v\x00\x00\x00a\x00\x00\x00?', # noqa: E501 + 'utf_32', # BE with BOM + 'Ça va?' 
+ ), + PreEncodedTestStringInfo( + b'\xff\xfe\x00\x00\xc7\x00\x00\x00a\x00\x00\x00 \x00\x00\x00v\x00\x00\x00a\x00\x00\x00?\x00\x00\x00', # noqa: E501 + 'utf_32', # LE with BOM + 'Ça va?' + ), + PreEncodedTestStringInfo( + b'\xfe\xff\x00\xc7\x00a\x00 \x00v\x00a\x00?', + 'utf_16', # BE with BOM + 'Ça va?' + ), + PreEncodedTestStringInfo( + b'\xff\xfe\xc7\x00a\x00 \x00v\x00a\x00?\x00', + 'utf_16', # LE with BOM + 'Ça va?' + ), + PreEncodedTestStringInfo( + b'\xef\xbb\xbf\xc3\x87a va?', + 'utf_8_sig', + 'Ça va?' + ) +) +TEST_STRINGS_TO_ENCODE_AT_RUNTIME = ( + "", + "y", + "yaml", + "🇾⁠🇦⁠🇲⁠🇱⁠❗" +) +setUpModule = register_test_codecs +tearDownModule = unregister_test_codecs + + +class EncodingStuffFromCommonTestCase(unittest.TestCase): + def test_test_codecs_and_utf_codecs(self): + error = "{} failed to correctly encode then decode {}." + for string in TEST_STRINGS_TO_ENCODE_AT_RUNTIME: + for codec in UTF_CODECS: + self.assertEqual( + string, + string.encode(codec).decode(codec), + msg=error.format(repr(codec), repr(string)) + ) + + def test_is_test_codec(self): + self.assertFalse(is_test_codec('utf_32')) + self.assertFalse(is_test_codec('utf_32_be')) + self.assertTrue(is_test_codec('utf_32_be_sig')) + self.assertFalse(is_test_codec('utf_32_le')) + self.assertTrue(is_test_codec('utf_32_le_sig')) + + self.assertFalse(is_test_codec('utf_16')) + self.assertFalse(is_test_codec('utf_16_be')) + self.assertTrue(is_test_codec('utf_16_be_sig')) + self.assertFalse(is_test_codec('utf_16_le')) + self.assertTrue(is_test_codec('utf_16_le_sig')) + + self.assertFalse(is_test_codec('utf_8')) + self.assertFalse(is_test_codec('utf_8_be')) + + def test_test_codec_built_in_equivalent(self): + self.assertEqual( + 'utf_32', + test_codec_built_in_equivalent('utf_32_be_sig') + ) + self.assertEqual( + 'utf_32', + test_codec_built_in_equivalent('utf_32_le_sig') + ) + + self.assertEqual( + 'utf_16', + test_codec_built_in_equivalent('utf_16_be_sig') + ) + self.assertEqual( + 'utf_16', + 
test_codec_built_in_equivalent('utf_16_le_sig') + ) + + def test_uses_bom(self): + self.assertTrue(uses_bom('utf_32')) + self.assertFalse(uses_bom('utf_32_be')) + self.assertTrue(uses_bom('utf_32_be_sig')) + self.assertFalse(uses_bom('utf_32_le')) + self.assertTrue(uses_bom('utf_32_le_sig')) + + self.assertTrue(uses_bom('utf_16')) + self.assertFalse(uses_bom('utf_16_be')) + self.assertTrue(uses_bom('utf_16_be_sig')) + self.assertFalse(uses_bom('utf_16_le')) + self.assertTrue(uses_bom('utf_16_le_sig')) + + self.assertFalse(uses_bom('utf_8')) + self.assertTrue(uses_bom('utf_8_sig')) + + def test_encoding_detectable(self): + # No BOM + nothing + self.assertFalse(encoding_detectable('', 'utf_32_be')) + self.assertFalse(encoding_detectable('', 'utf_32_le')) + + self.assertFalse(encoding_detectable('', 'utf_16_be')) + self.assertFalse(encoding_detectable('', 'utf_16_le')) + + self.assertFalse(encoding_detectable('', 'utf_8')) + # BOM + nothing + self.assertTrue(encoding_detectable('', 'utf_32')) + self.assertTrue(encoding_detectable('', 'utf_32_be_sig')) + self.assertTrue(encoding_detectable('', 'utf_32_le_sig')) + + self.assertTrue(encoding_detectable('', 'utf_16')) + self.assertTrue(encoding_detectable('', 'utf_16_be_sig')) + self.assertTrue(encoding_detectable('', 'utf_16_le_sig')) + + self.assertTrue(encoding_detectable('', 'utf_8_sig')) + # No BOM + non-ASCII + self.assertFalse(encoding_detectable('Ⓝⓔ', 'utf_32_be')) + self.assertFalse(encoding_detectable('ⓥⓔ', 'utf_32_le')) + + self.assertFalse(encoding_detectable('ⓡ ', 'utf_16_be')) + self.assertFalse(encoding_detectable('ⓖⓞ', 'utf_16_le')) + + self.assertFalse(encoding_detectable('ⓝⓝ', 'utf_8')) + # No BOM + ASCII + self.assertTrue(encoding_detectable('a ', 'utf_32_be')) + self.assertTrue(encoding_detectable('gi', 'utf_32_le')) + + self.assertTrue(encoding_detectable('ve', 'utf_16_be')) + self.assertTrue(encoding_detectable(' y', 'utf_16_le')) + + self.assertTrue(encoding_detectable('ou', 'utf_8')) + # BOM + 
non-ASCII + self.assertTrue(encoding_detectable('␣ⓤ', 'utf_32')) + self.assertTrue(encoding_detectable('ⓟ␤', 'utf_32_be_sig')) + self.assertTrue(encoding_detectable('Ⓝⓔ', 'utf_32_le_sig')) + + self.assertTrue(encoding_detectable('ⓥⓔ', 'utf_16')) + self.assertTrue(encoding_detectable('ⓡ␣', 'utf_16_be_sig')) + self.assertTrue(encoding_detectable('ⓖⓞ', 'utf_16_le_sig')) + + self.assertTrue(encoding_detectable('ⓝⓝ', 'utf_8_sig')) + # BOM + ASCII + self.assertTrue(encoding_detectable('a ', 'utf_32')) + self.assertTrue(encoding_detectable('le', 'utf_32_be_sig')) + self.assertTrue(encoding_detectable('t ', 'utf_32_le_sig')) + + self.assertTrue(encoding_detectable('yo', 'utf_16')) + self.assertTrue(encoding_detectable('u ', 'utf_16_be_sig')) + self.assertTrue(encoding_detectable('do', 'utf_16_le_sig')) + + self.assertTrue(encoding_detectable('wn', 'utf_8_sig')) + + +class DecoderTestCase(unittest.TestCase): + def detect_encoding_test_helper( + self, + original_string, + input_bytes, + expected_output + ): + ERROR1 = "{} was encoded with {}, but detect_encoding() returned {}." + ERROR2 = "detect_encoding({}) returned a codec that isn’t built-in." 
+ actual_output = decoder.detect_encoding(input_bytes) + if expected_output is not None: + self.assertEqual( + expected_output, + actual_output, + msg=ERROR1.format( + input_bytes, + repr(expected_output), + repr(actual_output) + ) + ) + + codec_info = codecs.lookup(actual_output) + self.assertFalse( + is_test_codec(codec_info), + msg=ERROR2.format(input_bytes) + ) + + def test_detect_encoding_with_pre_encoded_strings(self): + for pre_encoded_test_string_info in PRE_ENCODED_TEST_STRING_INFOS: + self.detect_encoding_test_helper( + pre_encoded_test_string_info.expected_output_str, + pre_encoded_test_string_info.input_bytes, + pre_encoded_test_string_info.codec_for_input_bytes + ) + + def test_detect_encoding_with_strings_encoded_at_runtime(self): + for string in TEST_STRINGS_TO_ENCODE_AT_RUNTIME: + for codec in UTF_CODECS: + if not uses_bom(codec) and len(string) == 0: + expected_output = 'utf_8' + elif not encoding_detectable(string, codec): + expected_output = None + elif is_test_codec(codec): + expected_output = test_codec_built_in_equivalent(codec) + else: + expected_output = codec + self.detect_encoding_test_helper( + string, + string.encode(codec), + expected_output + ) + + def auto_decode_test_helper( + self, + input_bytes, + codec_for_input_bytes, + expected_output + ): + ERROR = "auto_decode({}) returned the wrong value." 
+ does_auto_detect_encodings_return_value_matter = ( + codec_for_input_bytes is not None and ( + encoding_detectable(expected_output, codec_for_input_bytes) + or len(input_bytes) == 0 + ) + ) + if does_auto_detect_encodings_return_value_matter: + actual_output = decoder.auto_decode(input_bytes) + self.assertEqual( + expected_output, + actual_output, + msg=ERROR.format(repr(input_bytes)) + ) + self.assertIsInstance(actual_output, str) + else: + try: + decoder.auto_decode(input_bytes) + except UnicodeDecodeError as exception: + return exception + return None + + def test_auto_decode_with_pre_encoded_strings(self): + ERROR = "auto_decode({}) should not have raised an exception" + for pre_encoded_test_string_info in PRE_ENCODED_TEST_STRING_INFOS: + exception = self.auto_decode_test_helper( + pre_encoded_test_string_info.input_bytes, + pre_encoded_test_string_info.codec_for_input_bytes, + pre_encoded_test_string_info.expected_output_str + ) + if exception is not None: + new_exception = self.failureException( + msg=ERROR.format( + repr(pre_encoded_test_string_info.input_bytes) + ) + ) + raise new_exception from exception + + def test_auto_decode_with_strings_encoded_at_runtime(self): + at_least_one_decode_error = False + for string in TEST_STRINGS_TO_ENCODE_AT_RUNTIME: + for codec in UTF_CODECS: + exception = self.auto_decode_test_helper( + string.encode(codec), + codec, + string + ) + if exception is not None: + at_least_one_decode_error = True + self.assertTrue( + at_least_one_decode_error, + msg=( + "None of the TEST_STRINGS_TO_ENCODE_AT_RUNTIME triggered a " + + "decoding error." 
+ ) + ) diff --git a/yamllint/cli.py b/yamllint/cli.py index 9a39bd8c..7059b852 100644 --- a/yamllint/cli.py +++ b/yamllint/cli.py @@ -219,7 +219,7 @@ def run(argv=None): for file in find_files_recursively(args.files, conf): filepath = file[2:] if file.startswith('./') else file try: - with open(file, newline='') as f: + with open(file, mode='rb') as f: problems = linter.run(f, conf, filepath) except OSError as e: print(e, file=sys.stderr) diff --git a/yamllint/config.py b/yamllint/config.py index 9ce62549..c40d8205 100644 --- a/yamllint/config.py +++ b/yamllint/config.py @@ -20,6 +20,7 @@ import yaml import yamllint.rules +from yamllint import decoder class YamlLintConfigError(Exception): @@ -38,8 +39,8 @@ def __init__(self, content=None, file=None): self.locale = None if file is not None: - with open(file) as f: - content = f.read() + with open(file, mode='rb') as f: + content = decoder.auto_decode(f.read()) self.parse(content) self.validate() diff --git a/yamllint/decoder.py b/yamllint/decoder.py new file mode 100644 index 00000000..1e3c2f32 --- /dev/null +++ b/yamllint/decoder.py @@ -0,0 +1,65 @@ +# Copyright (C) 2023 Jason Yundt +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+ +import codecs + + +def detect_encoding(stream_data): + """ + Return stream_data’s character encoding + + Specifically, this function will take a bytes object and return a string + that contains the name of one of Python’s built-in codecs [1]. + + The YAML spec says that streams must begin with a BOM or an ASCII + character. If stream_data doesn’t begin with either of those, then this + function might return the wrong encoding. See chapter 5.2 of the YAML spec + for details [2]. + + [1]: + [2]: + """ + if stream_data.startswith(codecs.BOM_UTF32_BE): + return 'utf_32' + elif stream_data.startswith(b'\x00\x00\x00') and len(stream_data) >= 4: + return 'utf_32_be' + elif stream_data.startswith(codecs.BOM_UTF32_LE): + return 'utf_32' + elif stream_data[1:4] == b'\x00\x00\x00': + return 'utf_32_le' + elif stream_data.startswith(codecs.BOM_UTF16_BE): + return 'utf_16' + elif stream_data.startswith(b'\x00') and len(stream_data) >= 2: + return 'utf_16_be' + elif stream_data.startswith(codecs.BOM_UTF16_LE): + return 'utf_16' + elif stream_data[1:2] == b'\x00': + return 'utf_16_le' + elif stream_data.startswith(codecs.BOM_UTF8): + return 'utf_8_sig' + else: + return 'utf_8' + + +def auto_decode(stream_data): + return stream_data.decode(encoding=detect_encoding(stream_data)) + + +def lines_in_files(paths): + """Autodecodes files and yields their lines.""" + for path in paths: + with open(path, 'rb') as file: + text = auto_decode(file.read()) + yield from text.splitlines() diff --git a/yamllint/linter.py b/yamllint/linter.py index a2faa061..2230a600 100644 --- a/yamllint/linter.py +++ b/yamllint/linter.py @@ -18,7 +18,7 @@ import yaml -from yamllint import parser +from yamllint import decoder, parser PROBLEM_LEVELS = { 0: None, @@ -187,6 +187,8 @@ def get_syntax_error(buffer): def _run(buffer, conf, filepath): assert hasattr(buffer, '__getitem__'), \ '_run() argument must be a buffer, not a stream' + if isinstance(buffer, bytes): + buffer = decoder.auto_decode(buffer) 
first_line = next(parser.line_generator(buffer)).content if re.match(r'^#\s*yamllint disable-file\s*$', first_line): From 4f97d1f5447038bb5c08fcf9d8f3ea5d851f8c98 Mon Sep 17 00:00:00 2001 From: Jason Yundt Date: Sun, 31 Dec 2023 18:10:38 -0500 Subject: [PATCH 4/6] decoder: Autodetect encoding for ignore-from-file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before this change, yamllint would decode files on the ignore-from-file list using open()’s default encoding [1][2]. This can cause decoding to fail in some situations (see the previous commit message for details). This change makes yamllint automatically detect the encoding for files on the ignore-from-file list. It uses the same algorithm that it uses for detecting the encoding of YAML files, so the same limitations apply: files must use UTF-8, UTF-16 or UTF-32 and they must begin with either a byte order mark or an ASCII character. [1]: [2]: --- docs/configuration.rst | 4 ++++ tests/test_config.py | 49 +++++++++++++++++++++++++++++++++++++++++- tests/test_decoder.py | 30 ++++++++++++++++++++++++++ yamllint/config.py | 14 ++++++------ 4 files changed, 90 insertions(+), 7 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 9624b496..8e7a10ab 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -228,6 +228,10 @@ or: .. note:: However, this is mutually exclusive with the ``ignore`` key. +.. note:: Files on the ``ignore-from-file`` list must use either UTF-8, UTF-16 + or UTF-32. Additionally, they must start with either an ASCII character or a + byte order mark. 
+ If you need to know the exact list of files that yamllint would process, without really linting them, you can use ``--list-files``: diff --git a/tests/test_config.py b/tests/test_config.py index fb570c66..a98b847f 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -13,6 +13,7 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. +import itertools import os import shutil import sys @@ -20,7 +21,12 @@ import unittest from io import StringIO -from tests.common import build_temp_workspace, RunContext +from tests.common import ( + build_temp_workspace, + register_test_codecs, + RunContext, + unregister_test_codecs, +) from yamllint import cli, config from yamllint.config import YamlLintConfigError @@ -820,3 +826,44 @@ def test_run_with_ignore_on_ignored_file(self): sys.stdout.getvalue().strip(), 'file-at-root.yaml:4:17: [error] trailing spaces (trailing-spaces)' ) + + def create_ignore_file(self, text, codec): + path = os.path.join(self.wd, f'{codec}.ignore') + with open(path, 'wb') as f: + f.write(text.encode(codec)) + self.addCleanup(lambda: os.remove(path)) + return path + + def test_ignored_from_file_with_multiple_encodings(self): + register_test_codecs() + self.addCleanup(unregister_test_codecs) + + ignore_files = itertools.starmap( + self.create_ignore_file, ( + ('bin/file.lint-me-anyway.yaml\n', 'utf_32_be'), + ('bin/file.yaml\n', 'utf_32_be_sig'), + ('file-at-root.yaml\n', 'utf_32_le'), + ('file.dont-lint-me.yaml\n', 'utf_32_le_sig'), + + ('ign-dup/file.yaml\n', 'utf_16_be'), + ('ign-dup/sub/dir/file.yaml\n', 'utf_16_be_sig'), + ('ign-trail/file.yaml\n', 'utf_16_le'), + ('include/ign-dup/sub/dir/file.yaml\n', 'utf_16_le_sig'), + + ('s/s/ign-trail/file.yaml\n', 'utf_8'), + ( + 's/s/ign-trail/s/s/file.yaml\n' + 's/s/ign-trail/s/s/file2.lint-me-anyway.yaml\n' + '.yamllint\n', + + 'utf_8_sig' + ), + ) + ) + conf = ('---\n' + 'extends: default\n' + f'ignore-from-file: [{", 
".join(ignore_files)}]\n') + + with self.assertRaises(SystemExit) as cm: + cli.run(('-d', conf, '.')) + self.assertEqual(cm.exception.code, 0) diff --git a/tests/test_decoder.py b/tests/test_decoder.py index f7ef1650..7f0198bc 100644 --- a/tests/test_decoder.py +++ b/tests/test_decoder.py @@ -14,6 +14,7 @@ # along with this program. If not, see <http://www.gnu.org/licenses/>. import codecs +import itertools import unittest from tests.common import ( @@ -21,6 +22,8 @@ encoding_detectable, is_test_codec, register_test_codecs, + temp_workspace, + temp_workspace_with_files_in_many_codecs, test_codec_built_in_equivalent, unregister_test_codecs, uses_bom, @@ -450,3 +453,30 @@ def test_auto_decode_with_strings_encoded_at_runtime(self): + "decoding error." ) ) + + def perform_lines_in_file_test(self, strings): + workspace = temp_workspace_with_files_in_many_codecs( + '{}', + '\n'.join(strings) + ) + with temp_workspace(workspace): + iterable = zip( + itertools.cycle(strings), + decoder.lines_in_files(workspace.keys()) + ) + for item in iterable: + self.assertEqual(item[0], item[1]) + + def test_lines_in_file(self): + self.perform_lines_in_file_test(( + "YAML", + "ⓎⒶⓂⓁ", + "🅨🅐🅜🅛", + "YAML" + )) + self.perform_lines_in_file_test(( + "𝐘𝐀𝐌𝐋", + "𝖄𝕬𝕸𝕷", + "𝒀𝑨𝑴𝑳", + "𝓨𝓐𝓜𝓛" + )) diff --git a/yamllint/config.py b/yamllint/config.py index c40d8205..b7d389fc 100644 --- a/yamllint/config.py +++ b/yamllint/config.py @@ -13,7 +13,6 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. 
-import fileinput import os.path import pathspec @@ -110,8 +109,10 @@ def parse(self, raw_content): raise YamlLintConfigError( 'invalid config: ignore-from-file should contain ' 'filename(s), either as a list or string') - with fileinput.input(conf['ignore-from-file']) as f: - self.ignore = pathspec.PathSpec.from_lines('gitwildmatch', f) + self.ignore = pathspec.PathSpec.from_lines( + 'gitwildmatch', + decoder.lines_in_files(conf['ignore-from-file']) + ) elif 'ignore' in conf: if isinstance(conf['ignore'], str): self.ignore = pathspec.PathSpec.from_lines( @@ -164,9 +165,10 @@ def validate_rule_conf(rule, conf): raise YamlLintConfigError( 'invalid config: ignore-from-file should contain ' 'valid filename(s), either as a list or string') - with fileinput.input(conf['ignore-from-file']) as f: - conf['ignore'] = pathspec.PathSpec.from_lines( - 'gitwildmatch', f) + conf['ignore'] = pathspec.PathSpec.from_lines( + 'gitwildmatch', + decoder.lines_in_files(conf['ignore-from-file']) + ) elif ('ignore' in conf and not isinstance( conf['ignore'], pathspec.pathspec.PathSpec)): if isinstance(conf['ignore'], str): From a09f5f0b9486f3a9202da2dc7f7da02f5923eab3 Mon Sep 17 00:00:00 2001 From: Jason Yundt Date: Sun, 31 Dec 2023 17:36:48 -0500 Subject: [PATCH 5/6] =?UTF-8?q?tests:=20Stop=20using=20open()=E2=80=99s=20?= =?UTF-8?q?default=20encoding?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In general, using open()’s default encoding is a mistake [1]. This change makes sure that every time open() is called, the encoding parameter is specified. 
Specifically, it makes it so that all tests succeed when run like this: python -X warn_default_encoding -W error::EncodingWarning -m unittest discover [1]: --- tests/common.py | 5 +++-- tests/test_cli.py | 23 +++++++++++++-------- tests/test_config.py | 49 ++++++++++++++++++++++++++------------------ tests/test_module.py | 6 ++++-- 4 files changed, 50 insertions(+), 33 deletions(-) diff --git a/tests/common.py b/tests/common.py index 7b73e3ca..d2535310 100644 --- a/tests/common.py +++ b/tests/common.py @@ -145,8 +145,9 @@ def build_temp_workspace(files): else: if isinstance(content, Blob): content = content.text.encode(content.encoding) - mode = 'wb' if isinstance(content, bytes) else 'w' - with open(path, mode) as f: + elif isinstance(content, str): + content = content.encode('utf_8') + with open(path, 'wb') as f: f.write(content) return tempdir diff --git a/tests/test_cli.py b/tests/test_cli.py index 51825efc..76f2fa10 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -303,14 +303,14 @@ def test_run_with_implicit_extends_config(self): (ctx.returncode, ctx.stdout, ctx.stderr), (0, expected_out, '')) def test_run_with_config_file(self): - with open(os.path.join(self.wd, 'config'), 'w') as f: + with open(os.path.join(self.wd, 'config'), 'w', encoding='utf_8') as f: f.write('rules: {trailing-spaces: disable}') with RunContext(self) as ctx: cli.run(('-c', f.name, os.path.join(self.wd, 'a.yaml'))) self.assertEqual(ctx.returncode, 0) - with open(os.path.join(self.wd, 'config'), 'w') as f: + with open(os.path.join(self.wd, 'config'), 'w', encoding='utf_8') as f: f.write('rules: {trailing-spaces: enable}') with RunContext(self) as ctx: @@ -326,14 +326,14 @@ def test_run_with_user_global_config_file(self): self.addCleanup(os.environ.__delitem__, 'HOME') os.environ['HOME'] = home - with open(config, 'w') as f: + with open(config, 'w', encoding='utf_8') as f: f.write('rules: {trailing-spaces: disable}') with RunContext(self) as ctx: cli.run((os.path.join(self.wd, 
'a.yaml'), )) self.assertEqual(ctx.returncode, 0) - with open(config, 'w') as f: + with open(config, 'w', encoding='utf_8') as f: f.write('rules: {trailing-spaces: enable}') with RunContext(self) as ctx: @@ -346,7 +346,8 @@ def test_run_with_user_xdg_config_home_in_env(self): with tempfile.TemporaryDirectory('w') as d: os.environ['XDG_CONFIG_HOME'] = d os.makedirs(os.path.join(d, 'yamllint')) - with open(os.path.join(d, 'yamllint', 'config'), 'w') as f: + path = os.path.join(d, 'yamllint', 'config') + with open(path, 'w', encoding='utf_8') as f: f.write('extends: relaxed') with RunContext(self) as ctx: cli.run(('-f', 'parsable', os.path.join(self.wd, 'warn.yaml'))) @@ -356,7 +357,7 @@ def test_run_with_user_xdg_config_home_in_env(self): def test_run_with_user_yamllint_config_file_in_env(self): self.addCleanup(os.environ.__delitem__, 'YAMLLINT_CONFIG_FILE') - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: os.environ['YAMLLINT_CONFIG_FILE'] = f.name f.write('rules: {trailing-spaces: disable}') f.flush() @@ -364,7 +365,7 @@ def test_run_with_user_yamllint_config_file_in_env(self): cli.run((os.path.join(self.wd, 'a.yaml'), )) self.assertEqual(ctx.returncode, 0) - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: os.environ['YAMLLINT_CONFIG_FILE'] = f.name f.write('rules: {trailing-spaces: enable}') f.flush() @@ -508,7 +509,11 @@ def test_run_default_format_output_in_tty(self): # Create a pseudo-TTY and redirect stdout to it old_stdout, old_stderr = sys.stdout, sys.stderr master, slave = pty.openpty() - sys.stdout = sys.stderr = os.fdopen(slave, 'w') + sys.stdout = sys.stderr = os.fdopen( + slave, + 'w', + encoding=os.device_encoding(slave) + ) with self.assertRaises(SystemExit) as ctx: cli.run((path, )) @@ -517,7 +522,7 @@ def test_run_default_format_output_in_tty(self): self.assertEqual(ctx.exception.code, 1) # Read output from TTY - output = 
os.fdopen(master, 'r') + output = os.fdopen(master, 'r', encoding=os.device_encoding(master)) flag = fcntl.fcntl(master, fcntl.F_GETFD) fcntl.fcntl(master, fcntl.F_SETFL, flag | os.O_NONBLOCK) diff --git a/tests/test_config.py b/tests/test_config.py index a98b847f..8071211d 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -258,7 +258,7 @@ def test_extend_on_object(self): self.assertEqual(len(new.enabled_rules(None)), 2) def test_extend_on_file(self): - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: f.write('rules:\n' ' colons:\n' ' max-spaces-before: 0\n' @@ -277,7 +277,7 @@ def test_extend_on_file(self): self.assertEqual(len(c.enabled_rules(None)), 2) def test_extend_remove_rule(self): - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: f.write('rules:\n' ' colons:\n' ' max-spaces-before: 0\n' @@ -296,7 +296,7 @@ def test_extend_remove_rule(self): self.assertEqual(len(c.enabled_rules(None)), 1) def test_extend_edit_rule(self): - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: f.write('rules:\n' ' colons:\n' ' max-spaces-before: 0\n' @@ -318,7 +318,7 @@ def test_extend_edit_rule(self): self.assertEqual(len(c.enabled_rules(None)), 2) def test_extend_reenable_rule(self): - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: f.write('rules:\n' ' colons:\n' ' max-spaces-before: 0\n' @@ -338,7 +338,7 @@ def test_extend_reenable_rule(self): self.assertEqual(len(c.enabled_rules(None)), 2) def test_extend_recursive_default_values(self): - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: f.write('rules:\n' ' braces:\n' ' max-spaces-inside: 1248\n') @@ -353,7 +353,7 @@ def test_extend_recursive_default_values(self): 
self.assertEqual(c.rules['braces']['min-spaces-inside-empty'], 2357) self.assertEqual(c.rules['braces']['max-spaces-inside-empty'], -1) - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: f.write('rules:\n' ' colons:\n' ' max-spaces-before: 1337\n') @@ -365,8 +365,8 @@ def test_extend_recursive_default_values(self): self.assertEqual(c.rules['colons']['max-spaces-before'], 1337) self.assertEqual(c.rules['colons']['max-spaces-after'], 1) - with tempfile.NamedTemporaryFile('w') as f1, \ - tempfile.NamedTemporaryFile('w') as f2: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f1, \ + tempfile.NamedTemporaryFile('w', encoding='utf_8') as f2: f1.write('rules:\n' ' colons:\n' ' max-spaces-before: 1337\n') @@ -383,7 +383,7 @@ def test_extend_recursive_default_values(self): self.assertEqual(c.rules['colons']['max-spaces-after'], 1) def test_extended_ignore_str(self): - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: f.write('ignore: |\n' ' *.template.yaml\n') f.flush() @@ -393,7 +393,7 @@ def test_extended_ignore_str(self): self.assertEqual(c.ignore.match_file('test.yaml'), False) def test_extended_ignore_list(self): - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: f.write('ignore:\n' ' - "*.template.yaml"\n') f.flush() @@ -563,7 +563,8 @@ def test_no_ignore(self): ))) def test_run_with_ignore_str(self): - with open(os.path.join(self.wd, '.yamllint'), 'w') as f: + path = os.path.join(self.wd, '.yamllint') + with open(path, 'w', encoding='utf_8') as f: f.write('extends: default\n' 'ignore: |\n' ' *.dont-lint-me.yaml\n' @@ -617,7 +618,8 @@ def test_run_with_ignore_str(self): ))) def test_run_with_ignore_list(self): - with open(os.path.join(self.wd, '.yamllint'), 'w') as f: + path = os.path.join(self.wd, '.yamllint') + with open(path, 'w', encoding='utf_8') as f: f.write('extends: 
default\n' 'ignore:\n' ' - "*.dont-lint-me.yaml"\n' @@ -671,19 +673,22 @@ def test_run_with_ignore_list(self): ))) def test_run_with_ignore_from_file(self): - with open(os.path.join(self.wd, '.yamllint'), 'w') as f: + path = os.path.join(self.wd, '.yamllint') + with open(path, 'w', encoding='utf_8') as f: f.write('extends: default\n' 'ignore-from-file: .gitignore\n' 'rules:\n' ' key-duplicates:\n' ' ignore-from-file: .ignore-key-duplicates\n') - with open(os.path.join(self.wd, '.gitignore'), 'w') as f: + path = os.path.join(self.wd, '.gitignore') + with open(path, 'w', encoding='utf_8') as f: f.write('*.dont-lint-me.yaml\n' '/bin/\n' '!/bin/*.lint-me-anyway.yaml\n') - with open(os.path.join(self.wd, '.ignore-key-duplicates'), 'w') as f: + path = os.path.join(self.wd, '.ignore-key-duplicates') + with open(path, 'w', encoding='utf_8') as f: f.write('/ign-dup\n') sys.stdout = StringIO() @@ -728,13 +733,16 @@ def test_run_with_ignore_from_file(self): ))) def test_run_with_ignored_from_file(self): - with open(os.path.join(self.wd, '.yamllint'), 'w') as f: + path = os.path.join(self.wd, '.yamllint') + with open(path, 'w', encoding='utf_8') as f: f.write('ignore-from-file: [.gitignore, .yamlignore]\n' 'extends: default\n') - with open(os.path.join(self.wd, '.gitignore'), 'w') as f: + path = os.path.join(self.wd, '.gitignore') + with open(path, 'w', encoding='utf_8') as f: f.write('*.dont-lint-me.yaml\n' '/bin/\n') - with open(os.path.join(self.wd, '.yamlignore'), 'w') as f: + path = os.path.join(self.wd, '.yamlignore') + with open(path, 'w', encoding='utf_8') as f: f.write('!/bin/*.lint-me-anyway.yaml\n') sys.stdout = StringIO() @@ -793,7 +801,7 @@ def test_run_with_ignore_with_broken_symlink(self): cli.run(('-f', 'parsable', '.')) self.assertNotEqual(ctx.returncode, 0) - with open(os.path.join(wd, '.yamllint'), 'w') as f: + with open(os.path.join(wd, '.yamllint'), 'w', encoding='utf_8') as f: f.write('extends: default\n' 'ignore: |\n' ' *404.yaml\n') @@ -811,7 +819,8 @@ 
def test_run_with_ignore_with_broken_symlink(self): shutil.rmtree(wd) def test_run_with_ignore_on_ignored_file(self): - with open(os.path.join(self.wd, '.yamllint'), 'w') as f: + path = os.path.join(self.wd, '.yamllint') + with open(path, 'w', encoding='utf_8') as f: f.write('ignore: file.dont-lint-me.yaml\n' 'rules:\n' ' trailing-spaces: enable\n' diff --git a/tests/test_module.py b/tests/test_module.py index 7f4f62ba..b4e24e38 100644 --- a/tests/test_module.py +++ b/tests/test_module.py @@ -28,12 +28,14 @@ def setUp(self): self.wd = tempfile.mkdtemp(prefix='yamllint-tests-') # file with only one warning - with open(os.path.join(self.wd, 'warn.yaml'), 'w') as f: + path = os.path.join(self.wd, 'warn.yaml') + with open(path, 'w', encoding='utf_8') as f: f.write('key: value\n') # file in dir os.mkdir(os.path.join(self.wd, 'sub')) - with open(os.path.join(self.wd, 'sub', 'nok.yaml'), 'w') as f: + path = os.path.join(self.wd, 'sub', 'nok.yaml') + with open(path, 'w', encoding='utf_8') as f: f.write('---\n' 'list: [ 1, 1, 2, 3, 5, 8] \n') From a6031a401183187301341bb283a85c250f3030d3 Mon Sep 17 00:00:00 2001 From: Jason Yundt Date: Wed, 3 Jan 2024 12:03:13 -0500 Subject: [PATCH 6/6] =?UTF-8?q?CI:=20Fail=20when=20open()=E2=80=99s=20defa?= =?UTF-8?q?ult=20encoding=20is=20used?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous few commits have removed all calls to open() that use its default encoding. That being said, it’s still possible that code added in the future will contain that same mistake. This commit makes it so that the CI test job will fail if that mistake is made again. Unfortunately, it doesn’t look like coverage.py allows you to specify -X options [1] or warning filters [2] when running your tests [3]. To work around this problem, I’m running all of the Python code, including coverage.py itself, with -X warn_default_encoding and -W error::EncodingWarning. 
As a result, the CI test job will also fail if coverage.py uses open()’s default encoding. Hopefully, coverage.py won’t do that. If it does, then we can always temporarily revert this commit. [1]: [2]: [3]: --- .github/workflows/ci.yaml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 7e55e6ad..34a9f7b8 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -57,6 +57,13 @@ jobs: - run: pip install . # https://github.com/AndreMiras/coveralls-python-action/issues/18 - run: echo -e "[run]\nrelative_files = True" > .coveragerc - - run: coverage run -m unittest discover + - run: >- + python + -X warn_default_encoding + -W error::EncodingWarning + -m coverage + run + -m unittest + discover - name: Coveralls uses: AndreMiras/coveralls-python-action@develop