diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 7e55e6ad..34a9f7b8 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -57,6 +57,13 @@ jobs: - run: pip install . # https://github.com/AndreMiras/coveralls-python-action/issues/18 - run: echo -e "[run]\nrelative_files = True" > .coveragerc - - run: coverage run -m unittest discover + - run: >- + python + -X warn_default_encoding + -W error::EncodingWarning + -m coverage + run + -m unittest + discover - name: Coveralls uses: AndreMiras/coveralls-python-action@develop diff --git a/docs/configuration.rst b/docs/configuration.rst index 9624b496..8e7a10ab 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -228,6 +228,10 @@ or: .. note:: However, this is mutually exclusive with the ``ignore`` key. +.. note:: Files on the ``ignore-from-file`` list must use either UTF-8, UTF-16 + or UTF-32. Additionally, they must start with either an ASCII character or a + byte order mark. + If you need to know the exact list of files that yamllint would process, without really linting them, you can use ``--list-files``: diff --git a/tests/common.py b/tests/common.py index 29dcfb9c..d2535310 100644 --- a/tests/common.py +++ b/tests/common.py @@ -13,6 +13,7 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . 
+import codecs import contextlib from io import StringIO import os @@ -20,6 +21,8 @@ import sys import tempfile import unittest +import warnings +from codecs import CodecInfo import yaml @@ -27,6 +30,152 @@ from yamllint.config import YamlLintConfig +# Encoding related stuff: +UTF_CODECS = ( + 'utf_32_be', + 'utf_32_be_sig', + 'utf_32_le', + 'utf_32_le_sig', + 'utf_16_be', + 'utf_16_be_sig', + 'utf_16_le', + 'utf_16_le_sig', + 'utf_8', + 'utf_8_sig' +) + + +def encode_utf_32_be_sig(obj, errors='strict'): + return ( + codecs.BOM_UTF32_BE + codecs.encode(obj, 'utf_32_be', errors), + len(obj) + ) + + +def encode_utf_32_le_sig(obj, errors='strict'): + return ( + codecs.BOM_UTF32_LE + codecs.encode(obj, 'utf_32_le', errors), + len(obj) + ) + + +def encode_utf_16_be_sig(obj, errors='strict'): + return ( + codecs.BOM_UTF16_BE + codecs.encode(obj, 'utf_16_be', errors), + len(obj) + ) + + +def encode_utf_16_le_sig(obj, errors='strict'): + return ( + codecs.BOM_UTF16_LE + codecs.encode(obj, 'utf_16_le', errors), + len(obj) + ) + + +test_codec_infos = { + 'utf_32_be_sig': CodecInfo(encode_utf_32_be_sig, codecs.getdecoder('utf_32')), # noqa: E501 + 'utf_32_le_sig': CodecInfo(encode_utf_32_le_sig, codecs.getdecoder('utf_32')), # noqa: E501 + 'utf_16_be_sig': CodecInfo(encode_utf_16_be_sig, codecs.getdecoder('utf_16')), # noqa: E501 + 'utf_16_le_sig': CodecInfo(encode_utf_16_le_sig, codecs.getdecoder('utf_16')), # noqa: E501 +} + + +def register_test_codecs(): + codecs.register(test_codec_infos.get) + + +def unregister_test_codecs(): + if sys.version_info >= (3, 10, 0): + codecs.unregister(test_codec_infos.get) + else: + warnings.warn( + "This version of Python doesn’t allow us to unregister codecs.", + stacklevel=1 + ) + + +def is_test_codec(codec): + return codec in test_codec_infos.keys() + + +def test_codec_built_in_equivalent(test_codec): + return_value = test_codec + for suffix in ('_sig', '_be', '_le'): + return_value = return_value.replace(suffix, '') + return 
return_value + + +def uses_bom(codec): + for suffix in ('_32', '_16', '_sig'): + if codec.endswith(suffix): + return True + return False + + +def encoding_detectable(string, codec): + """ + Returns True if encoding can be detected after string is encoded + + Encoding detection only works if you’re using a BOM or the first character + is ASCII. See yamllint.decoder.auto_decode()’s docstring. + """ + return uses_bom(codec) or (len(string) > 0 and string[0].isascii()) + + +# Workspace related stuff: +class Blob: + def __init__(self, text, encoding): + self.text = text + self.encoding = encoding + + +def build_temp_workspace(files): + tempdir = tempfile.mkdtemp(prefix='yamllint-tests-') + + for path, content in files.items(): + path = os.fsencode(os.path.join(tempdir, path)) + if not os.path.exists(os.path.dirname(path)): + os.makedirs(os.path.dirname(path)) + + if isinstance(content, list): + os.mkdir(path) + elif isinstance(content, str) and content.startswith('symlink://'): + os.symlink(content[10:], path) + else: + if isinstance(content, Blob): + content = content.text.encode(content.encoding) + elif isinstance(content, str): + content = content.encode('utf_8') + with open(path, 'wb') as f: + f.write(content) + + return tempdir + + +@contextlib.contextmanager +def temp_workspace(files): + """Provide a temporary workspace that is automatically cleaned up.""" + backup_wd = os.getcwd() + wd = build_temp_workspace(files) + + try: + os.chdir(wd) + yield + finally: + os.chdir(backup_wd) + shutil.rmtree(wd) + + +def temp_workspace_with_files_in_many_codecs(path_template, text): + workspace = {} + for codec in UTF_CODECS: + if encoding_detectable(text, codec): + workspace[path_template.format(codec)] = Blob(text, codec) + return workspace + + +# Miscellaneous stuff: class RuleTestCase(unittest.TestCase): def build_fake_config(self, conf): if conf is None: @@ -81,37 +230,3 @@ def __exit__(self, *exc_info): @property def returncode(self): return 
self._raises_ctx.exception.code - - -def build_temp_workspace(files): - tempdir = tempfile.mkdtemp(prefix='yamllint-tests-') - - for path, content in files.items(): - path = os.path.join(tempdir, path).encode('utf-8') - if not os.path.exists(os.path.dirname(path)): - os.makedirs(os.path.dirname(path)) - - if isinstance(content, list): - os.mkdir(path) - elif isinstance(content, str) and content.startswith('symlink://'): - os.symlink(content[10:], path) - else: - mode = 'wb' if isinstance(content, bytes) else 'w' - with open(path, mode) as f: - f.write(content) - - return tempdir - - -@contextlib.contextmanager -def temp_workspace(files): - """Provide a temporary workspace that is automatically cleaned up.""" - backup_wd = os.getcwd() - wd = build_temp_workspace(files) - - try: - os.chdir(wd) - yield - finally: - os.chdir(backup_wd) - shutil.rmtree(wd) diff --git a/tests/test_cli.py b/tests/test_cli.py index e0ae0fee..76f2fa10 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -23,7 +23,14 @@ import unittest from io import StringIO -from tests.common import build_temp_workspace, RunContext, temp_workspace +from tests.common import ( + build_temp_workspace, + register_test_codecs, + RunContext, + temp_workspace, + unregister_test_codecs, + temp_workspace_with_files_in_many_codecs, +) from yamllint import cli, config @@ -296,14 +303,14 @@ def test_run_with_implicit_extends_config(self): (ctx.returncode, ctx.stdout, ctx.stderr), (0, expected_out, '')) def test_run_with_config_file(self): - with open(os.path.join(self.wd, 'config'), 'w') as f: + with open(os.path.join(self.wd, 'config'), 'w', encoding='utf_8') as f: f.write('rules: {trailing-spaces: disable}') with RunContext(self) as ctx: cli.run(('-c', f.name, os.path.join(self.wd, 'a.yaml'))) self.assertEqual(ctx.returncode, 0) - with open(os.path.join(self.wd, 'config'), 'w') as f: + with open(os.path.join(self.wd, 'config'), 'w', encoding='utf_8') as f: f.write('rules: {trailing-spaces: enable}') with 
RunContext(self) as ctx: @@ -319,14 +326,14 @@ def test_run_with_user_global_config_file(self): self.addCleanup(os.environ.__delitem__, 'HOME') os.environ['HOME'] = home - with open(config, 'w') as f: + with open(config, 'w', encoding='utf_8') as f: f.write('rules: {trailing-spaces: disable}') with RunContext(self) as ctx: cli.run((os.path.join(self.wd, 'a.yaml'), )) self.assertEqual(ctx.returncode, 0) - with open(config, 'w') as f: + with open(config, 'w', encoding='utf_8') as f: f.write('rules: {trailing-spaces: enable}') with RunContext(self) as ctx: @@ -339,7 +346,8 @@ def test_run_with_user_xdg_config_home_in_env(self): with tempfile.TemporaryDirectory('w') as d: os.environ['XDG_CONFIG_HOME'] = d os.makedirs(os.path.join(d, 'yamllint')) - with open(os.path.join(d, 'yamllint', 'config'), 'w') as f: + path = os.path.join(d, 'yamllint', 'config') + with open(path, 'w', encoding='utf_8') as f: f.write('extends: relaxed') with RunContext(self) as ctx: cli.run(('-f', 'parsable', os.path.join(self.wd, 'warn.yaml'))) @@ -349,7 +357,7 @@ def test_run_with_user_xdg_config_home_in_env(self): def test_run_with_user_yamllint_config_file_in_env(self): self.addCleanup(os.environ.__delitem__, 'YAMLLINT_CONFIG_FILE') - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: os.environ['YAMLLINT_CONFIG_FILE'] = f.name f.write('rules: {trailing-spaces: disable}') f.flush() @@ -357,7 +365,7 @@ def test_run_with_user_yamllint_config_file_in_env(self): cli.run((os.path.join(self.wd, 'a.yaml'), )) self.assertEqual(ctx.returncode, 0) - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: os.environ['YAMLLINT_CONFIG_FILE'] = f.name f.write('rules: {trailing-spaces: enable}') f.flush() @@ -499,8 +507,13 @@ def test_run_default_format_output_in_tty(self): path = os.path.join(self.wd, 'a.yaml') # Create a pseudo-TTY and redirect stdout to it + old_stdout, old_stderr = sys.stdout, 
sys.stderr master, slave = pty.openpty() - sys.stdout = sys.stderr = os.fdopen(slave, 'w') + sys.stdout = sys.stderr = os.fdopen( + slave, + 'w', + encoding=os.device_encoding(slave) + ) with self.assertRaises(SystemExit) as ctx: cli.run((path, )) @@ -509,7 +522,7 @@ def test_run_default_format_output_in_tty(self): self.assertEqual(ctx.exception.code, 1) # Read output from TTY - output = os.fdopen(master, 'r') + output = os.fdopen(master, 'r', encoding=os.device_encoding(master)) flag = fcntl.fcntl(master, fcntl.F_GETFD) fcntl.fcntl(master, fcntl.F_SETFL, flag | os.O_NONBLOCK) @@ -518,6 +531,7 @@ def test_run_default_format_output_in_tty(self): sys.stdout.close() sys.stderr.close() output.close() + sys.stdout, sys.stderr = old_stdout, old_stderr self.assertEqual(out, ( f'\033[4m{path}\033[0m\n' @@ -817,3 +831,52 @@ def test_multiple_parent_config_file(self): self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr), (0, './4spaces.yml:2:5: [warning] wrong indentation: ' 'expected 3 but found 4 (indentation)\n', '')) + + +class CommandLineEncodingTestCase(unittest.TestCase): + @classmethod + def setUpClass(cls): + super().setUpClass() + register_test_codecs() + + @classmethod + def tearDownClass(cls): + super().tearDownClass() + unregister_test_codecs() + + def test_valid_encodings(self): + conf = ('---\n' + 'rules:\n' + ' key-ordering: enable\n') + config_files = temp_workspace_with_files_in_many_codecs( + 'config_{}.yaml', + conf + ) + sorted_correctly = ('---\n' + 'A: YAML\n' + 'Z: YAML\n') + sorted_correctly_files = temp_workspace_with_files_in_many_codecs( + 'sorted_correctly/{}.yaml', + sorted_correctly + ) + sorted_incorrectly = ('---\n' + 'Z: YAML\n' + 'A: YAML\n') + sorted_incorrectly_files = temp_workspace_with_files_in_many_codecs( + 'sorted_incorrectly/{}.yaml', + sorted_incorrectly + ) + workspace = { + **config_files, + **sorted_correctly_files, + **sorted_incorrectly_files + } + + with temp_workspace(workspace): + for config_path in 
config_files.keys(): + with RunContext(self) as ctx: + cli.run(('-c', config_path, 'sorted_correctly/')) + self.assertEqual(ctx.returncode, 0) + with RunContext(self) as ctx: + cli.run(('-c', config_path, 'sorted_incorrectly/')) + self.assertNotEqual(ctx.returncode, 0) diff --git a/tests/test_config.py b/tests/test_config.py index fb570c66..8071211d 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -13,6 +13,7 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . +import itertools import os import shutil import sys @@ -20,7 +21,12 @@ import unittest from io import StringIO -from tests.common import build_temp_workspace, RunContext +from tests.common import ( + build_temp_workspace, + register_test_codecs, + RunContext, + unregister_test_codecs, +) from yamllint import cli, config from yamllint.config import YamlLintConfigError @@ -252,7 +258,7 @@ def test_extend_on_object(self): self.assertEqual(len(new.enabled_rules(None)), 2) def test_extend_on_file(self): - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: f.write('rules:\n' ' colons:\n' ' max-spaces-before: 0\n' @@ -271,7 +277,7 @@ def test_extend_on_file(self): self.assertEqual(len(c.enabled_rules(None)), 2) def test_extend_remove_rule(self): - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: f.write('rules:\n' ' colons:\n' ' max-spaces-before: 0\n' @@ -290,7 +296,7 @@ def test_extend_remove_rule(self): self.assertEqual(len(c.enabled_rules(None)), 1) def test_extend_edit_rule(self): - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: f.write('rules:\n' ' colons:\n' ' max-spaces-before: 0\n' @@ -312,7 +318,7 @@ def test_extend_edit_rule(self): self.assertEqual(len(c.enabled_rules(None)), 2) def test_extend_reenable_rule(self): - with tempfile.NamedTemporaryFile('w') as 
f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: f.write('rules:\n' ' colons:\n' ' max-spaces-before: 0\n' @@ -332,7 +338,7 @@ def test_extend_reenable_rule(self): self.assertEqual(len(c.enabled_rules(None)), 2) def test_extend_recursive_default_values(self): - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: f.write('rules:\n' ' braces:\n' ' max-spaces-inside: 1248\n') @@ -347,7 +353,7 @@ def test_extend_recursive_default_values(self): self.assertEqual(c.rules['braces']['min-spaces-inside-empty'], 2357) self.assertEqual(c.rules['braces']['max-spaces-inside-empty'], -1) - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: f.write('rules:\n' ' colons:\n' ' max-spaces-before: 1337\n') @@ -359,8 +365,8 @@ def test_extend_recursive_default_values(self): self.assertEqual(c.rules['colons']['max-spaces-before'], 1337) self.assertEqual(c.rules['colons']['max-spaces-after'], 1) - with tempfile.NamedTemporaryFile('w') as f1, \ - tempfile.NamedTemporaryFile('w') as f2: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f1, \ + tempfile.NamedTemporaryFile('w', encoding='utf_8') as f2: f1.write('rules:\n' ' colons:\n' ' max-spaces-before: 1337\n') @@ -377,7 +383,7 @@ def test_extend_recursive_default_values(self): self.assertEqual(c.rules['colons']['max-spaces-after'], 1) def test_extended_ignore_str(self): - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: f.write('ignore: |\n' ' *.template.yaml\n') f.flush() @@ -387,7 +393,7 @@ def test_extended_ignore_str(self): self.assertEqual(c.ignore.match_file('test.yaml'), False) def test_extended_ignore_list(self): - with tempfile.NamedTemporaryFile('w') as f: + with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f: f.write('ignore:\n' ' - "*.template.yaml"\n') f.flush() @@ -557,7 +563,8 @@ def test_no_ignore(self): ))) def 
test_run_with_ignore_str(self): - with open(os.path.join(self.wd, '.yamllint'), 'w') as f: + path = os.path.join(self.wd, '.yamllint') + with open(path, 'w', encoding='utf_8') as f: f.write('extends: default\n' 'ignore: |\n' ' *.dont-lint-me.yaml\n' @@ -611,7 +618,8 @@ def test_run_with_ignore_str(self): ))) def test_run_with_ignore_list(self): - with open(os.path.join(self.wd, '.yamllint'), 'w') as f: + path = os.path.join(self.wd, '.yamllint') + with open(path, 'w', encoding='utf_8') as f: f.write('extends: default\n' 'ignore:\n' ' - "*.dont-lint-me.yaml"\n' @@ -665,19 +673,22 @@ def test_run_with_ignore_list(self): ))) def test_run_with_ignore_from_file(self): - with open(os.path.join(self.wd, '.yamllint'), 'w') as f: + path = os.path.join(self.wd, '.yamllint') + with open(path, 'w', encoding='utf_8') as f: f.write('extends: default\n' 'ignore-from-file: .gitignore\n' 'rules:\n' ' key-duplicates:\n' ' ignore-from-file: .ignore-key-duplicates\n') - with open(os.path.join(self.wd, '.gitignore'), 'w') as f: + path = os.path.join(self.wd, '.gitignore') + with open(path, 'w', encoding='utf_8') as f: f.write('*.dont-lint-me.yaml\n' '/bin/\n' '!/bin/*.lint-me-anyway.yaml\n') - with open(os.path.join(self.wd, '.ignore-key-duplicates'), 'w') as f: + path = os.path.join(self.wd, '.ignore-key-duplicates') + with open(path, 'w', encoding='utf_8') as f: f.write('/ign-dup\n') sys.stdout = StringIO() @@ -722,13 +733,16 @@ def test_run_with_ignore_from_file(self): ))) def test_run_with_ignored_from_file(self): - with open(os.path.join(self.wd, '.yamllint'), 'w') as f: + path = os.path.join(self.wd, '.yamllint') + with open(path, 'w', encoding='utf_8') as f: f.write('ignore-from-file: [.gitignore, .yamlignore]\n' 'extends: default\n') - with open(os.path.join(self.wd, '.gitignore'), 'w') as f: + path = os.path.join(self.wd, '.gitignore') + with open(path, 'w', encoding='utf_8') as f: f.write('*.dont-lint-me.yaml\n' '/bin/\n') - with open(os.path.join(self.wd, '.yamlignore'), 
'w') as f: + path = os.path.join(self.wd, '.yamlignore') + with open(path, 'w', encoding='utf_8') as f: f.write('!/bin/*.lint-me-anyway.yaml\n') sys.stdout = StringIO() @@ -787,7 +801,7 @@ def test_run_with_ignore_with_broken_symlink(self): cli.run(('-f', 'parsable', '.')) self.assertNotEqual(ctx.returncode, 0) - with open(os.path.join(wd, '.yamllint'), 'w') as f: + with open(os.path.join(wd, '.yamllint'), 'w', encoding='utf_8') as f: f.write('extends: default\n' 'ignore: |\n' ' *404.yaml\n') @@ -805,7 +819,8 @@ def test_run_with_ignore_with_broken_symlink(self): shutil.rmtree(wd) def test_run_with_ignore_on_ignored_file(self): - with open(os.path.join(self.wd, '.yamllint'), 'w') as f: + path = os.path.join(self.wd, '.yamllint') + with open(path, 'w', encoding='utf_8') as f: f.write('ignore: file.dont-lint-me.yaml\n' 'rules:\n' ' trailing-spaces: enable\n' @@ -820,3 +835,44 @@ def test_run_with_ignore_on_ignored_file(self): sys.stdout.getvalue().strip(), 'file-at-root.yaml:4:17: [error] trailing spaces (trailing-spaces)' ) + + def create_ignore_file(self, text, codec): + path = os.path.join(self.wd, f'{codec}.ignore') + with open(path, 'wb') as f: + f.write(text.encode(codec)) + self.addCleanup(lambda: os.remove(path)) + return path + + def test_ignored_from_file_with_multiple_encodings(self): + register_test_codecs() + self.addCleanup(unregister_test_codecs) + + ignore_files = itertools.starmap( + self.create_ignore_file, ( + ('bin/file.lint-me-anyway.yaml\n', 'utf_32_be'), + ('bin/file.yaml\n', 'utf_32_be_sig'), + ('file-at-root.yaml\n', 'utf_32_le'), + ('file.dont-lint-me.yaml\n', 'utf_32_le_sig'), + + ('ign-dup/file.yaml\n', 'utf_16_be'), + ('ign-dup/sub/dir/file.yaml\n', 'utf_16_be_sig'), + ('ign-trail/file.yaml\n', 'utf_16_le'), + ('include/ign-dup/sub/dir/file.yaml\n', 'utf_16_le_sig'), + + ('s/s/ign-trail/file.yaml\n', 'utf_8'), + ( + 's/s/ign-trail/s/s/file.yaml\n' + 's/s/ign-trail/s/s/file2.lint-me-anyway.yaml\n' + '.yamllint\n', + + 'utf_8_sig' + ), + ) 
+ ) + conf = ('---\n' + 'extends: default\n' + f'ignore-from-file: [{", ".join(ignore_files)}]\n') + + with self.assertRaises(SystemExit) as cm: + cli.run(('-d', conf, '.')) + self.assertEqual(cm.exception.code, 0) diff --git a/tests/test_decoder.py b/tests/test_decoder.py new file mode 100644 index 00000000..7f0198bc --- /dev/null +++ b/tests/test_decoder.py @@ -0,0 +1,482 @@ +# Copyright (C) 2023–2024 Jason Yundt +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+ +import codecs +import itertools +import unittest + +from tests.common import ( + UTF_CODECS, + encoding_detectable, + is_test_codec, + register_test_codecs, + temp_workspace, + temp_workspace_with_files_in_many_codecs, + test_codec_built_in_equivalent, + unregister_test_codecs, + uses_bom, +) + +from yamllint import decoder + + +class PreEncodedTestStringInfo(): + def __init__( + self, + input_bytes, + codec_for_input_bytes, + expected_output_str + ): + self.input_bytes = input_bytes + self.codec_for_input_bytes = codec_for_input_bytes + self.expected_output_str = expected_output_str + + +PRE_ENCODED_TEST_STRING_INFOS = ( + # An empty string + PreEncodedTestStringInfo( + b'', + None, + '' + ), + + # A single ASCII character + PreEncodedTestStringInfo( + b'\x00\x00\x00|', + 'utf_32_be', + '|' + ), + PreEncodedTestStringInfo( + b'\x00\x00\xfe\xff\x00\x00\x00|', + 'utf_32', + '|' + ), + PreEncodedTestStringInfo( + b'|\x00\x00\x00', + 'utf_32_le', + '|' + ), + PreEncodedTestStringInfo( + b'\xff\xfe\x00\x00|\x00\x00\x00', + 'utf_32', # LE with BOM + '|' + ), + PreEncodedTestStringInfo( + b'\x00|', + 'utf_16_be', + '|' + ), + PreEncodedTestStringInfo( + b'\xfe\xff\x00|', + 'utf_16', # BE with BOM + '|' + ), + PreEncodedTestStringInfo( + b'|\x00', + 'utf_16_le', + '|' + ), + PreEncodedTestStringInfo( + b'\xff\xfe|\x00', + 'utf_16', # LE with BOM + '|' + ), + PreEncodedTestStringInfo( + b'|', + 'utf_8', + '|' + ), + PreEncodedTestStringInfo( + b'\xef\xbb\xbf|', + 'utf_8_sig', + '|' + ), + + # A string that starts with an ASCII character + PreEncodedTestStringInfo( + b'\x00\x00\x00W\x00\x00\x00h\x00\x00\x00a\x00\x00\x00t\x00\x00 \x19\x00\x00\x00s\x00\x00\x00 \x00\x00\x00u\x00\x00\x00p\x00\x00\x00?', # noqa: E501 + 'utf_32_be', + 'What’s up?' 
+ ), + PreEncodedTestStringInfo( + b'\x00\x00\xfe\xff\x00\x00\x00W\x00\x00\x00h\x00\x00\x00a\x00\x00\x00t\x00\x00 \x19\x00\x00\x00s\x00\x00\x00 \x00\x00\x00u\x00\x00\x00p\x00\x00\x00?', # noqa: E501 + 'utf_32', # BE with BOM + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'W\x00\x00\x00h\x00\x00\x00a\x00\x00\x00t\x00\x00\x00\x19 \x00\x00s\x00\x00\x00 \x00\x00\x00u\x00\x00\x00p\x00\x00\x00?\x00\x00\x00', # noqa: E501 + 'utf_32_le', + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\xff\xfe\x00\x00W\x00\x00\x00h\x00\x00\x00a\x00\x00\x00t\x00\x00\x00\x19 \x00\x00s\x00\x00\x00 \x00\x00\x00u\x00\x00\x00p\x00\x00\x00?\x00\x00\x00', # noqa: E501 + 'utf_32', # LE with BOM + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\x00W\x00h\x00a\x00t \x19\x00s\x00 \x00u\x00p\x00?', + 'utf_16_be', + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\xfe\xff\x00W\x00h\x00a\x00t \x19\x00s\x00 \x00u\x00p\x00?', + 'utf_16', # BE with BOM + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'W\x00h\x00a\x00t\x00\x19 s\x00 \x00u\x00p\x00?\x00', + 'utf_16_le', + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\xff\xfeW\x00h\x00a\x00t\x00\x19 s\x00 \x00u\x00p\x00?\x00', + 'utf_16', # LE with BOM + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'What\xe2\x80\x99s up?', + 'utf_8', + 'What’s up?' + ), + PreEncodedTestStringInfo( + b'\xef\xbb\xbfWhat\xe2\x80\x99s up?', + 'utf_8_sig', + 'What’s up?' 
+ ), + + # A single non-ASCII character + PreEncodedTestStringInfo( + b'\x00\x00\xfe\xff\x00\x01\xf4;', + 'utf_32', # BE with BOM + '🐻' + ), + PreEncodedTestStringInfo( + b'\xff\xfe\x00\x00;\xf4\x01\x00', + 'utf_32', # LE with BOM + '🐻' + ), + PreEncodedTestStringInfo( + b'\xfe\xff\xd8=\xdc;', + 'utf_16', # BE with BOM + '🐻' + ), + PreEncodedTestStringInfo( + b'\xff\xfe=\xd8;\xdc', + 'utf_16', # LE with BOM + '🐻' + ), + PreEncodedTestStringInfo( + b'\xef\xbb\xbf\xf0\x9f\x90\xbb', + 'utf_8_sig', + '🐻' + ), + + # A string that starts with a non-ASCII character + PreEncodedTestStringInfo( + b'\x00\x00\xfe\xff\x00\x00\x00\xc7\x00\x00\x00a\x00\x00\x00 \x00\x00\x00v\x00\x00\x00a\x00\x00\x00?', # noqa: E501 + 'utf_32', # BE with BOM + 'Ça va?' + ), + PreEncodedTestStringInfo( + b'\xff\xfe\x00\x00\xc7\x00\x00\x00a\x00\x00\x00 \x00\x00\x00v\x00\x00\x00a\x00\x00\x00?\x00\x00\x00', # noqa: E501 + 'utf_32', # LE with BOM + 'Ça va?' + ), + PreEncodedTestStringInfo( + b'\xfe\xff\x00\xc7\x00a\x00 \x00v\x00a\x00?', + 'utf_16', # BE with BOM + 'Ça va?' + ), + PreEncodedTestStringInfo( + b'\xff\xfe\xc7\x00a\x00 \x00v\x00a\x00?\x00', + 'utf_16', # LE with BOM + 'Ça va?' + ), + PreEncodedTestStringInfo( + b'\xef\xbb\xbf\xc3\x87a va?', + 'utf_8_sig', + 'Ça va?' + ) +) +TEST_STRINGS_TO_ENCODE_AT_RUNTIME = ( + "", + "y", + "yaml", + "🇾⁠🇦⁠🇲⁠🇱⁠❗" +) +setUpModule = register_test_codecs +tearDownModule = unregister_test_codecs + + +class EncodingStuffFromCommonTestCase(unittest.TestCase): + def test_test_codecs_and_utf_codecs(self): + error = "{} failed to correctly encode then decode {}." 
+ for string in TEST_STRINGS_TO_ENCODE_AT_RUNTIME: + for codec in UTF_CODECS: + self.assertEqual( + string, + string.encode(codec).decode(codec), + msg=error.format(repr(codec), repr(string)) + ) + + def test_is_test_codec(self): + self.assertFalse(is_test_codec('utf_32')) + self.assertFalse(is_test_codec('utf_32_be')) + self.assertTrue(is_test_codec('utf_32_be_sig')) + self.assertFalse(is_test_codec('utf_32_le')) + self.assertTrue(is_test_codec('utf_32_le_sig')) + + self.assertFalse(is_test_codec('utf_16')) + self.assertFalse(is_test_codec('utf_16_be')) + self.assertTrue(is_test_codec('utf_16_be_sig')) + self.assertFalse(is_test_codec('utf_16_le')) + self.assertTrue(is_test_codec('utf_16_le_sig')) + + self.assertFalse(is_test_codec('utf_8')) + self.assertFalse(is_test_codec('utf_8_be')) + + def test_test_codec_built_in_equivalent(self): + self.assertEqual( + 'utf_32', + test_codec_built_in_equivalent('utf_32_be_sig') + ) + self.assertEqual( + 'utf_32', + test_codec_built_in_equivalent('utf_32_le_sig') + ) + + self.assertEqual( + 'utf_16', + test_codec_built_in_equivalent('utf_16_be_sig') + ) + self.assertEqual( + 'utf_16', + test_codec_built_in_equivalent('utf_16_le_sig') + ) + + def test_uses_bom(self): + self.assertTrue(uses_bom('utf_32')) + self.assertFalse(uses_bom('utf_32_be')) + self.assertTrue(uses_bom('utf_32_be_sig')) + self.assertFalse(uses_bom('utf_32_le')) + self.assertTrue(uses_bom('utf_32_le_sig')) + + self.assertTrue(uses_bom('utf_16')) + self.assertFalse(uses_bom('utf_16_be')) + self.assertTrue(uses_bom('utf_16_be_sig')) + self.assertFalse(uses_bom('utf_16_le')) + self.assertTrue(uses_bom('utf_16_le_sig')) + + self.assertFalse(uses_bom('utf_8')) + self.assertTrue(uses_bom('utf_8_sig')) + + def test_encoding_detectable(self): + # No BOM + nothing + self.assertFalse(encoding_detectable('', 'utf_32_be')) + self.assertFalse(encoding_detectable('', 'utf_32_le')) + + self.assertFalse(encoding_detectable('', 'utf_16_be')) + 
self.assertFalse(encoding_detectable('', 'utf_16_le')) + + self.assertFalse(encoding_detectable('', 'utf_8')) + # BOM + nothing + self.assertTrue(encoding_detectable('', 'utf_32')) + self.assertTrue(encoding_detectable('', 'utf_32_be_sig')) + self.assertTrue(encoding_detectable('', 'utf_32_le_sig')) + + self.assertTrue(encoding_detectable('', 'utf_16')) + self.assertTrue(encoding_detectable('', 'utf_16_be_sig')) + self.assertTrue(encoding_detectable('', 'utf_16_le_sig')) + + self.assertTrue(encoding_detectable('', 'utf_8_sig')) + # No BOM + non-ASCII + self.assertFalse(encoding_detectable('Ⓝⓔ', 'utf_32_be')) + self.assertFalse(encoding_detectable('ⓥⓔ', 'utf_32_le')) + + self.assertFalse(encoding_detectable('ⓡ ', 'utf_16_be')) + self.assertFalse(encoding_detectable('ⓖⓞ', 'utf_16_le')) + + self.assertFalse(encoding_detectable('ⓝⓝ', 'utf_8')) + # No BOM + ASCII + self.assertTrue(encoding_detectable('a ', 'utf_32_be')) + self.assertTrue(encoding_detectable('gi', 'utf_32_le')) + + self.assertTrue(encoding_detectable('ve', 'utf_16_be')) + self.assertTrue(encoding_detectable(' y', 'utf_16_le')) + + self.assertTrue(encoding_detectable('ou', 'utf_8')) + # BOM + non-ASCII + self.assertTrue(encoding_detectable('␣ⓤ', 'utf_32')) + self.assertTrue(encoding_detectable('ⓟ␤', 'utf_32_be_sig')) + self.assertTrue(encoding_detectable('Ⓝⓔ', 'utf_32_le_sig')) + + self.assertTrue(encoding_detectable('ⓥⓔ', 'utf_16')) + self.assertTrue(encoding_detectable('ⓡ␣', 'utf_16_be_sig')) + self.assertTrue(encoding_detectable('ⓖⓞ', 'utf_16_le_sig')) + + self.assertTrue(encoding_detectable('ⓝⓝ', 'utf_8_sig')) + # BOM + ASCII + self.assertTrue(encoding_detectable('a ', 'utf_32')) + self.assertTrue(encoding_detectable('le', 'utf_32_be_sig')) + self.assertTrue(encoding_detectable('t ', 'utf_32_le_sig')) + + self.assertTrue(encoding_detectable('yo', 'utf_16')) + self.assertTrue(encoding_detectable('u ', 'utf_16_be_sig')) + self.assertTrue(encoding_detectable('do', 'utf_16_le_sig')) + + 
self.assertTrue(encoding_detectable('wn', 'utf_8_sig')) + + +class DecoderTestCase(unittest.TestCase): + def detect_encoding_test_helper( + self, + original_string, + input_bytes, + expected_output + ): + ERROR1 = "{} was encoded with {}, but detect_encoding() returned {}." + ERROR2 = "detect_encoding({}) returned a codec that isn’t built-in." + actual_output = decoder.detect_encoding(input_bytes) + if expected_output is not None: + self.assertEqual( + expected_output, + actual_output, + msg=ERROR1.format( + input_bytes, + repr(expected_output), + repr(actual_output) + ) + ) + + codec_info = codecs.lookup(actual_output) + self.assertFalse( + is_test_codec(codec_info), + msg=ERROR2.format(input_bytes) + ) + + def test_detect_encoding_with_pre_encoded_strings(self): + for pre_encoded_test_string_info in PRE_ENCODED_TEST_STRING_INFOS: + self.detect_encoding_test_helper( + pre_encoded_test_string_info.expected_output_str, + pre_encoded_test_string_info.input_bytes, + pre_encoded_test_string_info.codec_for_input_bytes + ) + + def test_detect_encoding_with_strings_encoded_at_runtime(self): + for string in TEST_STRINGS_TO_ENCODE_AT_RUNTIME: + for codec in UTF_CODECS: + if not uses_bom(codec) and len(string) == 0: + expected_output = 'utf_8' + elif not encoding_detectable(string, codec): + expected_output = None + elif is_test_codec(codec): + expected_output = test_codec_built_in_equivalent(codec) + else: + expected_output = codec + self.detect_encoding_test_helper( + string, + string.encode(codec), + expected_output + ) + + def auto_decode_test_helper( + self, + input_bytes, + codec_for_input_bytes, + expected_output + ): + ERROR = "auto_decode({}) returned the wrong value." 
+ does_auto_detect_encodings_return_value_matter = ( + codec_for_input_bytes is not None and ( + encoding_detectable(expected_output, codec_for_input_bytes) + or len(input_bytes) == 0 + ) + ) + if does_auto_detect_encodings_return_value_matter: + actual_output = decoder.auto_decode(input_bytes) + self.assertEqual( + expected_output, + actual_output, + msg=ERROR.format(repr(input_bytes)) + ) + self.assertIsInstance(actual_output, str) + else: + try: + decoder.auto_decode(input_bytes) + except UnicodeDecodeError as exception: + return exception + return None + + def test_auto_decode_with_pre_encoded_strings(self): + ERROR = "auto_decode({}) should not have raised an exception" + for pre_encoded_test_string_info in PRE_ENCODED_TEST_STRING_INFOS: + exception = self.auto_decode_test_helper( + pre_encoded_test_string_info.input_bytes, + pre_encoded_test_string_info.codec_for_input_bytes, + pre_encoded_test_string_info.expected_output_str + ) + if exception is not None: + new_exception = self.failureException( + msg=ERROR.format( + repr(pre_encoded_test_string_info.input_bytes) + ) + ) + raise new_exception from exception + + def test_auto_decode_with_strings_encoded_at_runtime(self): + at_least_one_decode_error = False + for string in TEST_STRINGS_TO_ENCODE_AT_RUNTIME: + for codec in UTF_CODECS: + exception = self.auto_decode_test_helper( + string.encode(codec), + codec, + string + ) + if exception is not None: + at_least_one_decode_error = True + self.assertTrue( + at_least_one_decode_error, + msg=( + "None of the TEST_STRINGS_TO_ENCODE_AT_RUNTIME triggered a " + + "decoding error." 
+ ) + ) + + def perform_lines_in_file_test(self, strings): + workspace = temp_workspace_with_files_in_many_codecs( + '{}', + '\n'.join(strings) + ) + with temp_workspace(workspace): + iterable = zip( + itertools.cycle(strings), + decoder.lines_in_files(workspace.keys()) + ) + for item in iterable: + self.assertEqual(item[0], item[1]) + + def test_lines_in_file(self): + self.perform_lines_in_file_test(( + "YAML", + "ⓎⒶⓂⓁ", + "🅨🅐🅜🅛", + "YAML" + )) + self.perform_lines_in_file_test(( + "𝐘𝐀𝐌𝐋", + "𝖄𝕬𝕸𝕷", + "𝒀𝑨𝑴𝑳", + "𝓨𝓐𝓜𝓛" + )) diff --git a/tests/test_module.py b/tests/test_module.py index 7f4f62ba..b4e24e38 100644 --- a/tests/test_module.py +++ b/tests/test_module.py @@ -28,12 +28,14 @@ def setUp(self): self.wd = tempfile.mkdtemp(prefix='yamllint-tests-') # file with only one warning - with open(os.path.join(self.wd, 'warn.yaml'), 'w') as f: + path = os.path.join(self.wd, 'warn.yaml') + with open(path, 'w', encoding='utf_8') as f: f.write('key: value\n') # file in dir os.mkdir(os.path.join(self.wd, 'sub')) - with open(os.path.join(self.wd, 'sub', 'nok.yaml'), 'w') as f: + path = os.path.join(self.wd, 'sub', 'nok.yaml') + with open(path, 'w', encoding='utf_8') as f: f.write('---\n' 'list: [ 1, 1, 2, 3, 5, 8] \n') diff --git a/yamllint/cli.py b/yamllint/cli.py index 9a39bd8c..7059b852 100644 --- a/yamllint/cli.py +++ b/yamllint/cli.py @@ -219,7 +219,7 @@ def run(argv=None): for file in find_files_recursively(args.files, conf): filepath = file[2:] if file.startswith('./') else file try: - with open(file, newline='') as f: + with open(file, mode='rb') as f: problems = linter.run(f, conf, filepath) except OSError as e: print(e, file=sys.stderr) diff --git a/yamllint/config.py b/yamllint/config.py index 9ce62549..b7d389fc 100644 --- a/yamllint/config.py +++ b/yamllint/config.py @@ -13,13 +13,13 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . 
-import fileinput import os.path import pathspec import yaml import yamllint.rules +from yamllint import decoder class YamlLintConfigError(Exception): @@ -38,8 +38,8 @@ def __init__(self, content=None, file=None): self.locale = None if file is not None: - with open(file) as f: - content = f.read() + with open(file, mode='rb') as f: + content = decoder.auto_decode(f.read()) self.parse(content) self.validate() @@ -109,8 +109,10 @@ def parse(self, raw_content): raise YamlLintConfigError( 'invalid config: ignore-from-file should contain ' 'filename(s), either as a list or string') - with fileinput.input(conf['ignore-from-file']) as f: - self.ignore = pathspec.PathSpec.from_lines('gitwildmatch', f) + self.ignore = pathspec.PathSpec.from_lines( + 'gitwildmatch', + decoder.lines_in_files(conf['ignore-from-file']) + ) elif 'ignore' in conf: if isinstance(conf['ignore'], str): self.ignore = pathspec.PathSpec.from_lines( @@ -163,9 +165,10 @@ def validate_rule_conf(rule, conf): raise YamlLintConfigError( 'invalid config: ignore-from-file should contain ' 'valid filename(s), either as a list or string') - with fileinput.input(conf['ignore-from-file']) as f: - conf['ignore'] = pathspec.PathSpec.from_lines( - 'gitwildmatch', f) + conf['ignore'] = pathspec.PathSpec.from_lines( + 'gitwildmatch', + decoder.lines_in_files(conf['ignore-from-file']) + ) elif ('ignore' in conf and not isinstance( conf['ignore'], pathspec.pathspec.PathSpec)): if isinstance(conf['ignore'], str): diff --git a/yamllint/decoder.py b/yamllint/decoder.py new file mode 100644 index 00000000..1e3c2f32 --- /dev/null +++ b/yamllint/decoder.py @@ -0,0 +1,65 @@ +# Copyright (C) 2023 Jason Yundt +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import codecs
+
+
+def detect_encoding(stream_data):
+    """
+    Return stream_data’s character encoding
+
+    Specifically, this function will take a bytes object and return a string
+    that contains the name of one of Python’s built-in codecs [1].
+
+    The YAML spec says that streams must begin with a BOM or an ASCII
+    character. If stream_data doesn’t begin with either of those, then this
+    function might return the wrong encoding. See chapter 5.2 of the YAML spec
+    for details [2].
+
+    [1]: <https://docs.python.org/3/library/codecs.html#standard-encodings>
+    [2]: <https://yaml.org/spec/1.2.2/#52-character-encodings>
+    """
+    if stream_data.startswith(codecs.BOM_UTF32_BE):
+        return 'utf_32'
+    elif stream_data.startswith(b'\x00\x00\x00') and len(stream_data) >= 4:
+        return 'utf_32_be'
+    elif stream_data.startswith(codecs.BOM_UTF32_LE):
+        return 'utf_32'
+    elif stream_data[1:4] == b'\x00\x00\x00':
+        return 'utf_32_le'
+    elif stream_data.startswith(codecs.BOM_UTF16_BE):
+        return 'utf_16'
+    elif stream_data.startswith(b'\x00') and len(stream_data) >= 2:
+        return 'utf_16_be'
+    elif stream_data.startswith(codecs.BOM_UTF16_LE):
+        return 'utf_16'
+    elif stream_data[1:2] == b'\x00':
+        return 'utf_16_le'
+    elif stream_data.startswith(codecs.BOM_UTF8):
+        return 'utf_8_sig'
+    else:
+        return 'utf_8'
+
+
+def auto_decode(stream_data):
+    return stream_data.decode(encoding=detect_encoding(stream_data))
+
+
+def lines_in_files(paths):
+    """Autodecodes files and yields their lines."""
+    for path in paths:
+        with open(path, 'rb') as file:
+            text = auto_decode(file.read())
+            yield from text.splitlines()
diff --git a/yamllint/linter.py b/yamllint/linter.py
index a2faa061..2230a600 100644
--- a/yamllint/linter.py
+++ b/yamllint/linter.py
@@ -18,7 +18,7
@@ import yaml -from yamllint import parser +from yamllint import decoder, parser PROBLEM_LEVELS = { 0: None, @@ -187,6 +187,8 @@ def get_syntax_error(buffer): def _run(buffer, conf, filepath): assert hasattr(buffer, '__getitem__'), \ '_run() argument must be a buffer, not a stream' + if isinstance(buffer, bytes): + buffer = decoder.auto_decode(buffer) first_line = next(parser.line_generator(buffer)).content if re.match(r'^#\s*yamllint disable-file\s*$', first_line):