diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 7e55e6ad..34a9f7b8 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -57,6 +57,13 @@ jobs:
- run: pip install .
# https://github.com/AndreMiras/coveralls-python-action/issues/18
- run: echo -e "[run]\nrelative_files = True" > .coveragerc
- - run: coverage run -m unittest discover
+ - run: >-
+ python
+ -X warn_default_encoding
+ -W error::EncodingWarning
+ -m coverage
+ run
+ -m unittest
+ discover
- name: Coveralls
uses: AndreMiras/coveralls-python-action@develop
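
The two flags added above make the test suite strict about encodings: -X warn_default_encoding makes Python emit an EncodingWarning whenever open() silently falls back to the locale's default encoding, and -W error::EncodingWarning turns that warning into a failure. A minimal sketch of the effect (the script and file names are only placeholders):

    # run as: python -X warn_default_encoding -W error::EncodingWarning sketch.py
    with open('example.txt', 'w') as f:                     # no encoding= -> EncodingWarning -> error
        f.write('data\n')
    with open('example.txt', 'w', encoding='utf_8') as f:   # explicit encoding -> fine
        f.write('data\n')
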
diff --git a/docs/configuration.rst b/docs/configuration.rst
index 9624b496..8e7a10ab 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -228,6 +228,10 @@ or:
.. note:: However, this is mutually exclusive with the ``ignore`` key.
+.. note:: Files on the ``ignore-from-file`` list must use UTF-8, UTF-16, or
+ UTF-32. Additionally, they must start with either an ASCII character or a
+ byte order mark.
+
If you need to know the exact list of files that yamllint would process,
without really linting them, you can use ``--list-files``:
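
To illustrate the note added above: an ignore file written with Python's utf_16 codec automatically starts with a byte order mark, so its encoding stays detectable even when the first pattern begins with a non-ASCII character (the file name and pattern below are made up):

    with open('.yamlignore', 'w', encoding='utf_16') as f:  # 'utf_16' prepends a BOM
        f.write('*.généré.yaml\n')
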
diff --git a/tests/common.py b/tests/common.py
index 29dcfb9c..d2535310 100644
--- a/tests/common.py
+++ b/tests/common.py
@@ -13,6 +13,7 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
+import codecs
import contextlib
from io import StringIO
import os
@@ -20,6 +21,8 @@
import sys
import tempfile
import unittest
+import warnings
+from codecs import CodecInfo
import yaml
@@ -27,6 +30,152 @@
from yamllint.config import YamlLintConfig
+# Encoding-related stuff:
+UTF_CODECS = (
+ 'utf_32_be',
+ 'utf_32_be_sig',
+ 'utf_32_le',
+ 'utf_32_le_sig',
+ 'utf_16_be',
+ 'utf_16_be_sig',
+ 'utf_16_le',
+ 'utf_16_le_sig',
+ 'utf_8',
+ 'utf_8_sig'
+)
+
+
+def encode_utf_32_be_sig(obj, errors='strict'):
+ return (
+ codecs.BOM_UTF32_BE + codecs.encode(obj, 'utf_32_be', errors),
+ len(obj)
+ )
+
+
+def encode_utf_32_le_sig(obj, errors='strict'):
+ return (
+ codecs.BOM_UTF32_LE + codecs.encode(obj, 'utf_32_le', errors),
+ len(obj)
+ )
+
+
+def encode_utf_16_be_sig(obj, errors='strict'):
+ return (
+ codecs.BOM_UTF16_BE + codecs.encode(obj, 'utf_16_be', errors),
+ len(obj)
+ )
+
+
+def encode_utf_16_le_sig(obj, errors='strict'):
+ return (
+ codecs.BOM_UTF16_LE + codecs.encode(obj, 'utf_16_le', errors),
+ len(obj)
+ )
+
+
+test_codec_infos = {
+ 'utf_32_be_sig': CodecInfo(encode_utf_32_be_sig, codecs.getdecoder('utf_32')), # noqa: E501
+ 'utf_32_le_sig': CodecInfo(encode_utf_32_le_sig, codecs.getdecoder('utf_32')), # noqa: E501
+ 'utf_16_be_sig': CodecInfo(encode_utf_16_be_sig, codecs.getdecoder('utf_16')), # noqa: E501
+ 'utf_16_le_sig': CodecInfo(encode_utf_16_le_sig, codecs.getdecoder('utf_16')), # noqa: E501
+}
+
+
+def register_test_codecs():
+ codecs.register(test_codec_infos.get)
+
+
+def unregister_test_codecs():
+ if sys.version_info >= (3, 10, 0):
+ codecs.unregister(test_codec_infos.get)
+ else:
+ warnings.warn(
+ "This version of Python doesn’t allow us to unregister codecs.",
+ stacklevel=1
+ )
+
+
+def is_test_codec(codec):
+ return codec in test_codec_infos.keys()
+
+
+def test_codec_built_in_equivalent(test_codec):
+ return_value = test_codec
+ for suffix in ('_sig', '_be', '_le'):
+ return_value = return_value.replace(suffix, '')
+ return return_value
+
+
+def uses_bom(codec):
+ for suffix in ('_32', '_16', '_sig'):
+ if codec.endswith(suffix):
+ return True
+ return False
+
+
+def encoding_detectable(string, codec):
+ """
+ Return True if the encoding can be detected after string is encoded
+
+ Encoding detection only works if the codec uses a BOM or the first character
+ is ASCII. See yamllint.decoder.detect_encoding()’s docstring.
+ """
+ return uses_bom(codec) or (len(string) > 0 and string[0].isascii())
+
+
+# Workspace-related stuff:
+class Blob:
+ def __init__(self, text, encoding):
+ self.text = text
+ self.encoding = encoding
+
+
+def build_temp_workspace(files):
+ tempdir = tempfile.mkdtemp(prefix='yamllint-tests-')
+
+ for path, content in files.items():
+ path = os.fsencode(os.path.join(tempdir, path))
+ if not os.path.exists(os.path.dirname(path)):
+ os.makedirs(os.path.dirname(path))
+
+ if isinstance(content, list):
+ os.mkdir(path)
+ elif isinstance(content, str) and content.startswith('symlink://'):
+ os.symlink(content[10:], path)
+ else:
+ if isinstance(content, Blob):
+ content = content.text.encode(content.encoding)
+ elif isinstance(content, str):
+ content = content.encode('utf_8')
+ with open(path, 'wb') as f:
+ f.write(content)
+
+ return tempdir
+
+
+@contextlib.contextmanager
+def temp_workspace(files):
+ """Provide a temporary workspace that is automatically cleaned up."""
+ backup_wd = os.getcwd()
+ wd = build_temp_workspace(files)
+
+ try:
+ os.chdir(wd)
+ yield
+ finally:
+ os.chdir(backup_wd)
+ shutil.rmtree(wd)
+
+
+def temp_workspace_with_files_in_many_codecs(path_template, text):
+ workspace = {}
+ for codec in UTF_CODECS:
+ if encoding_detectable(text, codec):
+ workspace[path_template.format(codec)] = Blob(text, codec)
+ return workspace
+
+
+# Miscellaneous stuff:
class RuleTestCase(unittest.TestCase):
def build_fake_config(self, conf):
if conf is None:
@@ -81,37 +230,3 @@ def __exit__(self, *exc_info):
@property
def returncode(self):
return self._raises_ctx.exception.code
-
-
-def build_temp_workspace(files):
- tempdir = tempfile.mkdtemp(prefix='yamllint-tests-')
-
- for path, content in files.items():
- path = os.path.join(tempdir, path).encode('utf-8')
- if not os.path.exists(os.path.dirname(path)):
- os.makedirs(os.path.dirname(path))
-
- if isinstance(content, list):
- os.mkdir(path)
- elif isinstance(content, str) and content.startswith('symlink://'):
- os.symlink(content[10:], path)
- else:
- mode = 'wb' if isinstance(content, bytes) else 'w'
- with open(path, mode) as f:
- f.write(content)
-
- return tempdir
-
-
-@contextlib.contextmanager
-def temp_workspace(files):
- """Provide a temporary workspace that is automatically cleaned up."""
- backup_wd = os.getcwd()
- wd = build_temp_workspace(files)
-
- try:
- os.chdir(wd)
- yield
- finally:
- os.chdir(backup_wd)
- shutil.rmtree(wd)
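
A minimal sketch of how the custom *_sig test codecs defined above behave once registered; the byte values follow directly from the encoder functions in this file:

    import codecs

    from tests.common import register_test_codecs, unregister_test_codecs

    register_test_codecs()
    encoded = 'A'.encode('utf_16_be_sig')          # custom codec: explicit BOM + big-endian bytes
    assert encoded == codecs.BOM_UTF16_BE + b'\x00A'
    assert encoded.decode('utf_16_be_sig') == 'A'  # decoding reuses the built-in utf_16 decoder
    unregister_test_codecs()                       # only warns on Python < 3.10
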
diff --git a/tests/test_cli.py b/tests/test_cli.py
index e0ae0fee..76f2fa10 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -23,7 +23,14 @@
import unittest
from io import StringIO
-from tests.common import build_temp_workspace, RunContext, temp_workspace
+from tests.common import (
+ build_temp_workspace,
+ register_test_codecs,
+ RunContext,
+ temp_workspace,
+ temp_workspace_with_files_in_many_codecs,
+ unregister_test_codecs,
+)
from yamllint import cli, config
@@ -296,14 +303,14 @@ def test_run_with_implicit_extends_config(self):
(ctx.returncode, ctx.stdout, ctx.stderr), (0, expected_out, ''))
def test_run_with_config_file(self):
- with open(os.path.join(self.wd, 'config'), 'w') as f:
+ with open(os.path.join(self.wd, 'config'), 'w', encoding='utf_8') as f:
f.write('rules: {trailing-spaces: disable}')
with RunContext(self) as ctx:
cli.run(('-c', f.name, os.path.join(self.wd, 'a.yaml')))
self.assertEqual(ctx.returncode, 0)
- with open(os.path.join(self.wd, 'config'), 'w') as f:
+ with open(os.path.join(self.wd, 'config'), 'w', encoding='utf_8') as f:
f.write('rules: {trailing-spaces: enable}')
with RunContext(self) as ctx:
@@ -319,14 +326,14 @@ def test_run_with_user_global_config_file(self):
self.addCleanup(os.environ.__delitem__, 'HOME')
os.environ['HOME'] = home
- with open(config, 'w') as f:
+ with open(config, 'w', encoding='utf_8') as f:
f.write('rules: {trailing-spaces: disable}')
with RunContext(self) as ctx:
cli.run((os.path.join(self.wd, 'a.yaml'), ))
self.assertEqual(ctx.returncode, 0)
- with open(config, 'w') as f:
+ with open(config, 'w', encoding='utf_8') as f:
f.write('rules: {trailing-spaces: enable}')
with RunContext(self) as ctx:
@@ -339,7 +346,8 @@ def test_run_with_user_xdg_config_home_in_env(self):
with tempfile.TemporaryDirectory('w') as d:
os.environ['XDG_CONFIG_HOME'] = d
os.makedirs(os.path.join(d, 'yamllint'))
- with open(os.path.join(d, 'yamllint', 'config'), 'w') as f:
+ path = os.path.join(d, 'yamllint', 'config')
+ with open(path, 'w', encoding='utf_8') as f:
f.write('extends: relaxed')
with RunContext(self) as ctx:
cli.run(('-f', 'parsable', os.path.join(self.wd, 'warn.yaml')))
@@ -349,7 +357,7 @@ def test_run_with_user_xdg_config_home_in_env(self):
def test_run_with_user_yamllint_config_file_in_env(self):
self.addCleanup(os.environ.__delitem__, 'YAMLLINT_CONFIG_FILE')
- with tempfile.NamedTemporaryFile('w') as f:
+ with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f:
os.environ['YAMLLINT_CONFIG_FILE'] = f.name
f.write('rules: {trailing-spaces: disable}')
f.flush()
@@ -357,7 +365,7 @@ def test_run_with_user_yamllint_config_file_in_env(self):
cli.run((os.path.join(self.wd, 'a.yaml'), ))
self.assertEqual(ctx.returncode, 0)
- with tempfile.NamedTemporaryFile('w') as f:
+ with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f:
os.environ['YAMLLINT_CONFIG_FILE'] = f.name
f.write('rules: {trailing-spaces: enable}')
f.flush()
@@ -499,8 +507,13 @@ def test_run_default_format_output_in_tty(self):
path = os.path.join(self.wd, 'a.yaml')
# Create a pseudo-TTY and redirect stdout to it
+ old_stdout, old_stderr = sys.stdout, sys.stderr
master, slave = pty.openpty()
- sys.stdout = sys.stderr = os.fdopen(slave, 'w')
+ sys.stdout = sys.stderr = os.fdopen(
+ slave,
+ 'w',
+ encoding=os.device_encoding(slave)
+ )
with self.assertRaises(SystemExit) as ctx:
cli.run((path, ))
@@ -509,7 +522,7 @@ def test_run_default_format_output_in_tty(self):
self.assertEqual(ctx.exception.code, 1)
# Read output from TTY
- output = os.fdopen(master, 'r')
+ output = os.fdopen(master, 'r', encoding=os.device_encoding(master))
flag = fcntl.fcntl(master, fcntl.F_GETFD)
fcntl.fcntl(master, fcntl.F_SETFL, flag | os.O_NONBLOCK)
@@ -518,6 +531,7 @@ def test_run_default_format_output_in_tty(self):
sys.stdout.close()
sys.stderr.close()
output.close()
+ sys.stdout, sys.stderr = old_stdout, old_stderr
self.assertEqual(out, (
f'\033[4m{path}\033[0m\n'
@@ -817,3 +831,52 @@ def test_multiple_parent_config_file(self):
self.assertEqual((ctx.returncode, ctx.stdout, ctx.stderr),
(0, './4spaces.yml:2:5: [warning] wrong indentation: '
'expected 3 but found 4 (indentation)\n', ''))
+
+
+class CommandLineEncodingTestCase(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ super().setUpClass()
+ register_test_codecs()
+
+ @classmethod
+ def tearDownClass(cls):
+ super().tearDownClass()
+ unregister_test_codecs()
+
+ def test_valid_encodings(self):
+ conf = ('---\n'
+ 'rules:\n'
+ ' key-ordering: enable\n')
+ config_files = temp_workspace_with_files_in_many_codecs(
+ 'config_{}.yaml',
+ conf
+ )
+ sorted_correctly = ('---\n'
+ 'A: YAML\n'
+ 'Z: YAML\n')
+ sorted_correctly_files = temp_workspace_with_files_in_many_codecs(
+ 'sorted_correctly/{}.yaml',
+ sorted_correctly
+ )
+ sorted_incorrectly = ('---\n'
+ 'Z: YAML\n'
+ 'A: YAML\n')
+ sorted_incorrectly_files = temp_workspace_with_files_in_many_codecs(
+ 'sorted_incorrectly/{}.yaml',
+ sorted_incorrectly
+ )
+ workspace = {
+ **config_files,
+ **sorted_correctly_files,
+ **sorted_incorrectly_files
+ }
+
+ with temp_workspace(workspace):
+ for config_path in config_files.keys():
+ with RunContext(self) as ctx:
+ cli.run(('-c', config_path, 'sorted_correctly/'))
+ self.assertEqual(ctx.returncode, 0)
+ with RunContext(self) as ctx:
+ cli.run(('-c', config_path, 'sorted_incorrectly/'))
+ self.assertNotEqual(ctx.returncode, 0)
diff --git a/tests/test_config.py b/tests/test_config.py
index fb570c66..8071211d 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -13,6 +13,7 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
+import itertools
import os
import shutil
import sys
@@ -20,7 +21,12 @@
import unittest
from io import StringIO
-from tests.common import build_temp_workspace, RunContext
+from tests.common import (
+ build_temp_workspace,
+ register_test_codecs,
+ RunContext,
+ unregister_test_codecs,
+)
from yamllint import cli, config
from yamllint.config import YamlLintConfigError
@@ -252,7 +258,7 @@ def test_extend_on_object(self):
self.assertEqual(len(new.enabled_rules(None)), 2)
def test_extend_on_file(self):
- with tempfile.NamedTemporaryFile('w') as f:
+ with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f:
f.write('rules:\n'
' colons:\n'
' max-spaces-before: 0\n'
@@ -271,7 +277,7 @@ def test_extend_on_file(self):
self.assertEqual(len(c.enabled_rules(None)), 2)
def test_extend_remove_rule(self):
- with tempfile.NamedTemporaryFile('w') as f:
+ with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f:
f.write('rules:\n'
' colons:\n'
' max-spaces-before: 0\n'
@@ -290,7 +296,7 @@ def test_extend_remove_rule(self):
self.assertEqual(len(c.enabled_rules(None)), 1)
def test_extend_edit_rule(self):
- with tempfile.NamedTemporaryFile('w') as f:
+ with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f:
f.write('rules:\n'
' colons:\n'
' max-spaces-before: 0\n'
@@ -312,7 +318,7 @@ def test_extend_edit_rule(self):
self.assertEqual(len(c.enabled_rules(None)), 2)
def test_extend_reenable_rule(self):
- with tempfile.NamedTemporaryFile('w') as f:
+ with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f:
f.write('rules:\n'
' colons:\n'
' max-spaces-before: 0\n'
@@ -332,7 +338,7 @@ def test_extend_reenable_rule(self):
self.assertEqual(len(c.enabled_rules(None)), 2)
def test_extend_recursive_default_values(self):
- with tempfile.NamedTemporaryFile('w') as f:
+ with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f:
f.write('rules:\n'
' braces:\n'
' max-spaces-inside: 1248\n')
@@ -347,7 +353,7 @@ def test_extend_recursive_default_values(self):
self.assertEqual(c.rules['braces']['min-spaces-inside-empty'], 2357)
self.assertEqual(c.rules['braces']['max-spaces-inside-empty'], -1)
- with tempfile.NamedTemporaryFile('w') as f:
+ with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f:
f.write('rules:\n'
' colons:\n'
' max-spaces-before: 1337\n')
@@ -359,8 +365,8 @@ def test_extend_recursive_default_values(self):
self.assertEqual(c.rules['colons']['max-spaces-before'], 1337)
self.assertEqual(c.rules['colons']['max-spaces-after'], 1)
- with tempfile.NamedTemporaryFile('w') as f1, \
- tempfile.NamedTemporaryFile('w') as f2:
+ with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f1, \
+ tempfile.NamedTemporaryFile('w', encoding='utf_8') as f2:
f1.write('rules:\n'
' colons:\n'
' max-spaces-before: 1337\n')
@@ -377,7 +383,7 @@ def test_extend_recursive_default_values(self):
self.assertEqual(c.rules['colons']['max-spaces-after'], 1)
def test_extended_ignore_str(self):
- with tempfile.NamedTemporaryFile('w') as f:
+ with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f:
f.write('ignore: |\n'
' *.template.yaml\n')
f.flush()
@@ -387,7 +393,7 @@ def test_extended_ignore_str(self):
self.assertEqual(c.ignore.match_file('test.yaml'), False)
def test_extended_ignore_list(self):
- with tempfile.NamedTemporaryFile('w') as f:
+ with tempfile.NamedTemporaryFile('w', encoding='utf_8') as f:
f.write('ignore:\n'
' - "*.template.yaml"\n')
f.flush()
@@ -557,7 +563,8 @@ def test_no_ignore(self):
)))
def test_run_with_ignore_str(self):
- with open(os.path.join(self.wd, '.yamllint'), 'w') as f:
+ path = os.path.join(self.wd, '.yamllint')
+ with open(path, 'w', encoding='utf_8') as f:
f.write('extends: default\n'
'ignore: |\n'
' *.dont-lint-me.yaml\n'
@@ -611,7 +618,8 @@ def test_run_with_ignore_str(self):
)))
def test_run_with_ignore_list(self):
- with open(os.path.join(self.wd, '.yamllint'), 'w') as f:
+ path = os.path.join(self.wd, '.yamllint')
+ with open(path, 'w', encoding='utf_8') as f:
f.write('extends: default\n'
'ignore:\n'
' - "*.dont-lint-me.yaml"\n'
@@ -665,19 +673,22 @@ def test_run_with_ignore_list(self):
)))
def test_run_with_ignore_from_file(self):
- with open(os.path.join(self.wd, '.yamllint'), 'w') as f:
+ path = os.path.join(self.wd, '.yamllint')
+ with open(path, 'w', encoding='utf_8') as f:
f.write('extends: default\n'
'ignore-from-file: .gitignore\n'
'rules:\n'
' key-duplicates:\n'
' ignore-from-file: .ignore-key-duplicates\n')
- with open(os.path.join(self.wd, '.gitignore'), 'w') as f:
+ path = os.path.join(self.wd, '.gitignore')
+ with open(path, 'w', encoding='utf_8') as f:
f.write('*.dont-lint-me.yaml\n'
'/bin/\n'
'!/bin/*.lint-me-anyway.yaml\n')
- with open(os.path.join(self.wd, '.ignore-key-duplicates'), 'w') as f:
+ path = os.path.join(self.wd, '.ignore-key-duplicates')
+ with open(path, 'w', encoding='utf_8') as f:
f.write('/ign-dup\n')
sys.stdout = StringIO()
@@ -722,13 +733,16 @@ def test_run_with_ignore_from_file(self):
)))
def test_run_with_ignored_from_file(self):
- with open(os.path.join(self.wd, '.yamllint'), 'w') as f:
+ path = os.path.join(self.wd, '.yamllint')
+ with open(path, 'w', encoding='utf_8') as f:
f.write('ignore-from-file: [.gitignore, .yamlignore]\n'
'extends: default\n')
- with open(os.path.join(self.wd, '.gitignore'), 'w') as f:
+ path = os.path.join(self.wd, '.gitignore')
+ with open(path, 'w', encoding='utf_8') as f:
f.write('*.dont-lint-me.yaml\n'
'/bin/\n')
- with open(os.path.join(self.wd, '.yamlignore'), 'w') as f:
+ path = os.path.join(self.wd, '.yamlignore')
+ with open(path, 'w', encoding='utf_8') as f:
f.write('!/bin/*.lint-me-anyway.yaml\n')
sys.stdout = StringIO()
@@ -787,7 +801,7 @@ def test_run_with_ignore_with_broken_symlink(self):
cli.run(('-f', 'parsable', '.'))
self.assertNotEqual(ctx.returncode, 0)
- with open(os.path.join(wd, '.yamllint'), 'w') as f:
+ with open(os.path.join(wd, '.yamllint'), 'w', encoding='utf_8') as f:
f.write('extends: default\n'
'ignore: |\n'
' *404.yaml\n')
@@ -805,7 +819,8 @@ def test_run_with_ignore_with_broken_symlink(self):
shutil.rmtree(wd)
def test_run_with_ignore_on_ignored_file(self):
- with open(os.path.join(self.wd, '.yamllint'), 'w') as f:
+ path = os.path.join(self.wd, '.yamllint')
+ with open(path, 'w', encoding='utf_8') as f:
f.write('ignore: file.dont-lint-me.yaml\n'
'rules:\n'
' trailing-spaces: enable\n'
@@ -820,3 +835,44 @@ def test_run_with_ignore_on_ignored_file(self):
sys.stdout.getvalue().strip(),
'file-at-root.yaml:4:17: [error] trailing spaces (trailing-spaces)'
)
+
+ def create_ignore_file(self, text, codec):
+ path = os.path.join(self.wd, f'{codec}.ignore')
+ with open(path, 'wb') as f:
+ f.write(text.encode(codec))
+ self.addCleanup(lambda: os.remove(path))
+ return path
+
+ def test_ignored_from_file_with_multiple_encodings(self):
+ register_test_codecs()
+ self.addCleanup(unregister_test_codecs)
+
+ ignore_files = itertools.starmap(
+ self.create_ignore_file, (
+ ('bin/file.lint-me-anyway.yaml\n', 'utf_32_be'),
+ ('bin/file.yaml\n', 'utf_32_be_sig'),
+ ('file-at-root.yaml\n', 'utf_32_le'),
+ ('file.dont-lint-me.yaml\n', 'utf_32_le_sig'),
+
+ ('ign-dup/file.yaml\n', 'utf_16_be'),
+ ('ign-dup/sub/dir/file.yaml\n', 'utf_16_be_sig'),
+ ('ign-trail/file.yaml\n', 'utf_16_le'),
+ ('include/ign-dup/sub/dir/file.yaml\n', 'utf_16_le_sig'),
+
+ ('s/s/ign-trail/file.yaml\n', 'utf_8'),
+ (
+ 's/s/ign-trail/s/s/file.yaml\n'
+ 's/s/ign-trail/s/s/file2.lint-me-anyway.yaml\n'
+ '.yamllint\n',
+
+ 'utf_8_sig'
+ ),
+ )
+ )
+ conf = ('---\n'
+ 'extends: default\n'
+ f'ignore-from-file: [{", ".join(ignore_files)}]\n')
+
+ with self.assertRaises(SystemExit) as cm:
+ cli.run(('-d', conf, '.'))
+ self.assertEqual(cm.exception.code, 0)
diff --git a/tests/test_decoder.py b/tests/test_decoder.py
new file mode 100644
index 00000000..7f0198bc
--- /dev/null
+++ b/tests/test_decoder.py
@@ -0,0 +1,482 @@
+# Copyright (C) 2023–2024 Jason Yundt
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import codecs
+import itertools
+import unittest
+
+from tests.common import (
+ UTF_CODECS,
+ encoding_detectable,
+ is_test_codec,
+ register_test_codecs,
+ temp_workspace,
+ temp_workspace_with_files_in_many_codecs,
+ test_codec_built_in_equivalent,
+ unregister_test_codecs,
+ uses_bom,
+)
+
+from yamllint import decoder
+
+
+class PreEncodedTestStringInfo():
+ def __init__(
+ self,
+ input_bytes,
+ codec_for_input_bytes,
+ expected_output_str
+ ):
+ self.input_bytes = input_bytes
+ self.codec_for_input_bytes = codec_for_input_bytes
+ self.expected_output_str = expected_output_str
+
+
+PRE_ENCODED_TEST_STRING_INFOS = (
+ # An empty string
+ PreEncodedTestStringInfo(
+ b'',
+ None,
+ ''
+ ),
+
+ # A single ASCII character
+ PreEncodedTestStringInfo(
+ b'\x00\x00\x00|',
+ 'utf_32_be',
+ '|'
+ ),
+ PreEncodedTestStringInfo(
+ b'\x00\x00\xfe\xff\x00\x00\x00|',
+ 'utf_32',
+ '|'
+ ),
+ PreEncodedTestStringInfo(
+ b'|\x00\x00\x00',
+ 'utf_32_le',
+ '|'
+ ),
+ PreEncodedTestStringInfo(
+ b'\xff\xfe\x00\x00|\x00\x00\x00',
+ 'utf_32', # LE with BOM
+ '|'
+ ),
+ PreEncodedTestStringInfo(
+ b'\x00|',
+ 'utf_16_be',
+ '|'
+ ),
+ PreEncodedTestStringInfo(
+ b'\xfe\xff\x00|',
+ 'utf_16', # BE with BOM
+ '|'
+ ),
+ PreEncodedTestStringInfo(
+ b'|\x00',
+ 'utf_16_le',
+ '|'
+ ),
+ PreEncodedTestStringInfo(
+ b'\xff\xfe|\x00',
+ 'utf_16', # LE with BOM
+ '|'
+ ),
+ PreEncodedTestStringInfo(
+ b'|',
+ 'utf_8',
+ '|'
+ ),
+ PreEncodedTestStringInfo(
+ b'\xef\xbb\xbf|',
+ 'utf_8_sig',
+ '|'
+ ),
+
+ # A string that starts with an ASCII character
+ PreEncodedTestStringInfo(
+ b'\x00\x00\x00W\x00\x00\x00h\x00\x00\x00a\x00\x00\x00t\x00\x00 \x19\x00\x00\x00s\x00\x00\x00 \x00\x00\x00u\x00\x00\x00p\x00\x00\x00?', # noqa: E501
+ 'utf_32_be',
+ 'What’s up?'
+ ),
+ PreEncodedTestStringInfo(
+ b'\x00\x00\xfe\xff\x00\x00\x00W\x00\x00\x00h\x00\x00\x00a\x00\x00\x00t\x00\x00 \x19\x00\x00\x00s\x00\x00\x00 \x00\x00\x00u\x00\x00\x00p\x00\x00\x00?', # noqa: E501
+ 'utf_32', # BE with BOM
+ 'What’s up?'
+ ),
+ PreEncodedTestStringInfo(
+ b'W\x00\x00\x00h\x00\x00\x00a\x00\x00\x00t\x00\x00\x00\x19 \x00\x00s\x00\x00\x00 \x00\x00\x00u\x00\x00\x00p\x00\x00\x00?\x00\x00\x00', # noqa: E501
+ 'utf_32_le',
+ 'What’s up?'
+ ),
+ PreEncodedTestStringInfo(
+ b'\xff\xfe\x00\x00W\x00\x00\x00h\x00\x00\x00a\x00\x00\x00t\x00\x00\x00\x19 \x00\x00s\x00\x00\x00 \x00\x00\x00u\x00\x00\x00p\x00\x00\x00?\x00\x00\x00', # noqa: E501
+ 'utf_32', # LE with BOM
+ 'What’s up?'
+ ),
+ PreEncodedTestStringInfo(
+ b'\x00W\x00h\x00a\x00t \x19\x00s\x00 \x00u\x00p\x00?',
+ 'utf_16_be',
+ 'What’s up?'
+ ),
+ PreEncodedTestStringInfo(
+ b'\xfe\xff\x00W\x00h\x00a\x00t \x19\x00s\x00 \x00u\x00p\x00?',
+ 'utf_16', # BE with BOM
+ 'What’s up?'
+ ),
+ PreEncodedTestStringInfo(
+ b'W\x00h\x00a\x00t\x00\x19 s\x00 \x00u\x00p\x00?\x00',
+ 'utf_16_le',
+ 'What’s up?'
+ ),
+ PreEncodedTestStringInfo(
+ b'\xff\xfeW\x00h\x00a\x00t\x00\x19 s\x00 \x00u\x00p\x00?\x00',
+ 'utf_16', # LE with BOM
+ 'What’s up?'
+ ),
+ PreEncodedTestStringInfo(
+ b'What\xe2\x80\x99s up?',
+ 'utf_8',
+ 'What’s up?'
+ ),
+ PreEncodedTestStringInfo(
+ b'\xef\xbb\xbfWhat\xe2\x80\x99s up?',
+ 'utf_8_sig',
+ 'What’s up?'
+ ),
+
+ # A single non-ASCII character
+ PreEncodedTestStringInfo(
+ b'\x00\x00\xfe\xff\x00\x01\xf4;',
+ 'utf_32', # BE with BOM
+ '🐻'
+ ),
+ PreEncodedTestStringInfo(
+ b'\xff\xfe\x00\x00;\xf4\x01\x00',
+ 'utf_32', # LE with BOM
+ '🐻'
+ ),
+ PreEncodedTestStringInfo(
+ b'\xfe\xff\xd8=\xdc;',
+ 'utf_16', # BE with BOM
+ '🐻'
+ ),
+ PreEncodedTestStringInfo(
+ b'\xff\xfe=\xd8;\xdc',
+ 'utf_16', # LE with BOM
+ '🐻'
+ ),
+ PreEncodedTestStringInfo(
+ b'\xef\xbb\xbf\xf0\x9f\x90\xbb',
+ 'utf_8_sig',
+ '🐻'
+ ),
+
+ # A string that starts with a non-ASCII character
+ PreEncodedTestStringInfo(
+ b'\x00\x00\xfe\xff\x00\x00\x00\xc7\x00\x00\x00a\x00\x00\x00 \x00\x00\x00v\x00\x00\x00a\x00\x00\x00?', # noqa: E501
+ 'utf_32', # BE with BOM
+ 'Ça va?'
+ ),
+ PreEncodedTestStringInfo(
+ b'\xff\xfe\x00\x00\xc7\x00\x00\x00a\x00\x00\x00 \x00\x00\x00v\x00\x00\x00a\x00\x00\x00?\x00\x00\x00', # noqa: E501
+ 'utf_32', # LE with BOM
+ 'Ça va?'
+ ),
+ PreEncodedTestStringInfo(
+ b'\xfe\xff\x00\xc7\x00a\x00 \x00v\x00a\x00?',
+ 'utf_16', # BE with BOM
+ 'Ça va?'
+ ),
+ PreEncodedTestStringInfo(
+ b'\xff\xfe\xc7\x00a\x00 \x00v\x00a\x00?\x00',
+ 'utf_16', # LE with BOM
+ 'Ça va?'
+ ),
+ PreEncodedTestStringInfo(
+ b'\xef\xbb\xbf\xc3\x87a va?',
+ 'utf_8_sig',
+ 'Ça va?'
+ )
+)
+TEST_STRINGS_TO_ENCODE_AT_RUNTIME = (
+ "",
+ "y",
+ "yaml",
+ "🇾🇦🇲🇱❗"
+)
+setUpModule = register_test_codecs
+tearDownModule = unregister_test_codecs
+
+
+class EncodingStuffFromCommonTestCase(unittest.TestCase):
+ def test_test_codecs_and_utf_codecs(self):
+ error = "{} failed to correctly encode then decode {}."
+ for string in TEST_STRINGS_TO_ENCODE_AT_RUNTIME:
+ for codec in UTF_CODECS:
+ self.assertEqual(
+ string,
+ string.encode(codec).decode(codec),
+ msg=error.format(repr(codec), repr(string))
+ )
+
+ def test_is_test_codec(self):
+ self.assertFalse(is_test_codec('utf_32'))
+ self.assertFalse(is_test_codec('utf_32_be'))
+ self.assertTrue(is_test_codec('utf_32_be_sig'))
+ self.assertFalse(is_test_codec('utf_32_le'))
+ self.assertTrue(is_test_codec('utf_32_le_sig'))
+
+ self.assertFalse(is_test_codec('utf_16'))
+ self.assertFalse(is_test_codec('utf_16_be'))
+ self.assertTrue(is_test_codec('utf_16_be_sig'))
+ self.assertFalse(is_test_codec('utf_16_le'))
+ self.assertTrue(is_test_codec('utf_16_le_sig'))
+
+ self.assertFalse(is_test_codec('utf_8'))
+ self.assertFalse(is_test_codec('utf_8_be'))
+
+ def test_test_codec_built_in_equivalent(self):
+ self.assertEqual(
+ 'utf_32',
+ test_codec_built_in_equivalent('utf_32_be_sig')
+ )
+ self.assertEqual(
+ 'utf_32',
+ test_codec_built_in_equivalent('utf_32_le_sig')
+ )
+
+ self.assertEqual(
+ 'utf_16',
+ test_codec_built_in_equivalent('utf_16_be_sig')
+ )
+ self.assertEqual(
+ 'utf_16',
+ test_codec_built_in_equivalent('utf_16_le_sig')
+ )
+
+ def test_uses_bom(self):
+ self.assertTrue(uses_bom('utf_32'))
+ self.assertFalse(uses_bom('utf_32_be'))
+ self.assertTrue(uses_bom('utf_32_be_sig'))
+ self.assertFalse(uses_bom('utf_32_le'))
+ self.assertTrue(uses_bom('utf_32_le_sig'))
+
+ self.assertTrue(uses_bom('utf_16'))
+ self.assertFalse(uses_bom('utf_16_be'))
+ self.assertTrue(uses_bom('utf_16_be_sig'))
+ self.assertFalse(uses_bom('utf_16_le'))
+ self.assertTrue(uses_bom('utf_16_le_sig'))
+
+ self.assertFalse(uses_bom('utf_8'))
+ self.assertTrue(uses_bom('utf_8_sig'))
+
+ def test_encoding_detectable(self):
+ # No BOM + nothing
+ self.assertFalse(encoding_detectable('', 'utf_32_be'))
+ self.assertFalse(encoding_detectable('', 'utf_32_le'))
+
+ self.assertFalse(encoding_detectable('', 'utf_16_be'))
+ self.assertFalse(encoding_detectable('', 'utf_16_le'))
+
+ self.assertFalse(encoding_detectable('', 'utf_8'))
+ # BOM + nothing
+ self.assertTrue(encoding_detectable('', 'utf_32'))
+ self.assertTrue(encoding_detectable('', 'utf_32_be_sig'))
+ self.assertTrue(encoding_detectable('', 'utf_32_le_sig'))
+
+ self.assertTrue(encoding_detectable('', 'utf_16'))
+ self.assertTrue(encoding_detectable('', 'utf_16_be_sig'))
+ self.assertTrue(encoding_detectable('', 'utf_16_le_sig'))
+
+ self.assertTrue(encoding_detectable('', 'utf_8_sig'))
+ # No BOM + non-ASCII
+ self.assertFalse(encoding_detectable('Ⓝⓔ', 'utf_32_be'))
+ self.assertFalse(encoding_detectable('ⓥⓔ', 'utf_32_le'))
+
+ self.assertFalse(encoding_detectable('ⓡ ', 'utf_16_be'))
+ self.assertFalse(encoding_detectable('ⓖⓞ', 'utf_16_le'))
+
+ self.assertFalse(encoding_detectable('ⓝⓝ', 'utf_8'))
+ # No BOM + ASCII
+ self.assertTrue(encoding_detectable('a ', 'utf_32_be'))
+ self.assertTrue(encoding_detectable('gi', 'utf_32_le'))
+
+ self.assertTrue(encoding_detectable('ve', 'utf_16_be'))
+ self.assertTrue(encoding_detectable(' y', 'utf_16_le'))
+
+ self.assertTrue(encoding_detectable('ou', 'utf_8'))
+ # BOM + non-ASCII
+ self.assertTrue(encoding_detectable('␣ⓤ', 'utf_32'))
+ self.assertTrue(encoding_detectable('ⓟ', 'utf_32_be_sig'))
+ self.assertTrue(encoding_detectable('Ⓝⓔ', 'utf_32_le_sig'))
+
+ self.assertTrue(encoding_detectable('ⓥⓔ', 'utf_16'))
+ self.assertTrue(encoding_detectable('ⓡ␣', 'utf_16_be_sig'))
+ self.assertTrue(encoding_detectable('ⓖⓞ', 'utf_16_le_sig'))
+
+ self.assertTrue(encoding_detectable('ⓝⓝ', 'utf_8_sig'))
+ # BOM + ASCII
+ self.assertTrue(encoding_detectable('a ', 'utf_32'))
+ self.assertTrue(encoding_detectable('le', 'utf_32_be_sig'))
+ self.assertTrue(encoding_detectable('t ', 'utf_32_le_sig'))
+
+ self.assertTrue(encoding_detectable('yo', 'utf_16'))
+ self.assertTrue(encoding_detectable('u ', 'utf_16_be_sig'))
+ self.assertTrue(encoding_detectable('do', 'utf_16_le_sig'))
+
+ self.assertTrue(encoding_detectable('wn', 'utf_8_sig'))
+
+
+class DecoderTestCase(unittest.TestCase):
+ def detect_encoding_test_helper(
+ self,
+ original_string,
+ input_bytes,
+ expected_output
+ ):
+ ERROR1 = "{} was encoded with {}, but detect_encoding() returned {}."
+ ERROR2 = "detect_encoding({}) returned a codec that isn’t built-in."
+ actual_output = decoder.detect_encoding(input_bytes)
+ if expected_output is not None:
+ self.assertEqual(
+ expected_output,
+ actual_output,
+ msg=ERROR1.format(
+ input_bytes,
+ repr(expected_output),
+ repr(actual_output)
+ )
+ )
+
+ codecs.lookup(actual_output) # raises LookupError if the codec is unknown
+ self.assertFalse(
+ is_test_codec(actual_output),
+ msg=ERROR2.format(input_bytes)
+ )
+
+ def test_detect_encoding_with_pre_encoded_strings(self):
+ for pre_encoded_test_string_info in PRE_ENCODED_TEST_STRING_INFOS:
+ self.detect_encoding_test_helper(
+ pre_encoded_test_string_info.expected_output_str,
+ pre_encoded_test_string_info.input_bytes,
+ pre_encoded_test_string_info.codec_for_input_bytes
+ )
+
+ def test_detect_encoding_with_strings_encoded_at_runtime(self):
+ for string in TEST_STRINGS_TO_ENCODE_AT_RUNTIME:
+ for codec in UTF_CODECS:
+ if not uses_bom(codec) and len(string) == 0:
+ expected_output = 'utf_8'
+ elif not encoding_detectable(string, codec):
+ expected_output = None
+ elif is_test_codec(codec):
+ expected_output = test_codec_built_in_equivalent(codec)
+ else:
+ expected_output = codec
+ self.detect_encoding_test_helper(
+ string,
+ string.encode(codec),
+ expected_output
+ )
+
+ def auto_decode_test_helper(
+ self,
+ input_bytes,
+ codec_for_input_bytes,
+ expected_output
+ ):
+ ERROR = "auto_decode({}) returned the wrong value."
+ does_auto_decode_return_value_matter = (
+ codec_for_input_bytes is not None and (
+ encoding_detectable(expected_output, codec_for_input_bytes)
+ or len(input_bytes) == 0
+ )
+ )
+ if does_auto_decode_return_value_matter:
+ actual_output = decoder.auto_decode(input_bytes)
+ self.assertEqual(
+ expected_output,
+ actual_output,
+ msg=ERROR.format(repr(input_bytes))
+ )
+ self.assertIsInstance(actual_output, str)
+ else:
+ try:
+ decoder.auto_decode(input_bytes)
+ except UnicodeDecodeError as exception:
+ return exception
+ return None
+
+ def test_auto_decode_with_pre_encoded_strings(self):
+ ERROR = "auto_decode({}) should not have raised an exception"
+ for pre_encoded_test_string_info in PRE_ENCODED_TEST_STRING_INFOS:
+ exception = self.auto_decode_test_helper(
+ pre_encoded_test_string_info.input_bytes,
+ pre_encoded_test_string_info.codec_for_input_bytes,
+ pre_encoded_test_string_info.expected_output_str
+ )
+ if exception is not None:
+ new_exception = self.failureException(
+ msg=ERROR.format(
+ repr(pre_encoded_test_string_info.input_bytes)
+ )
+ )
+ raise new_exception from exception
+
+ def test_auto_decode_with_strings_encoded_at_runtime(self):
+ at_least_one_decode_error = False
+ for string in TEST_STRINGS_TO_ENCODE_AT_RUNTIME:
+ for codec in UTF_CODECS:
+ exception = self.auto_decode_test_helper(
+ string.encode(codec),
+ codec,
+ string
+ )
+ if exception is not None:
+ at_least_one_decode_error = True
+ self.assertTrue(
+ at_least_one_decode_error,
+ msg=(
+ "None of the TEST_STRINGS_TO_ENCODE_AT_RUNTIME triggered a "
+ + "decoding error."
+ )
+ )
+
+ def perform_lines_in_file_test(self, strings):
+ workspace = temp_workspace_with_files_in_many_codecs(
+ '{}',
+ '\n'.join(strings)
+ )
+ with temp_workspace(workspace):
+ iterable = zip(
+ itertools.cycle(strings),
+ decoder.lines_in_files(workspace.keys())
+ )
+ for item in iterable:
+ self.assertEqual(item[0], item[1])
+
+ def test_lines_in_file(self):
+ self.perform_lines_in_file_test((
+ "YAML",
+ "ⓎⒶⓂⓁ",
+ "🅨🅐🅜🅛",
+ "YAML"
+ ))
+ self.perform_lines_in_file_test((
+ "𝐘𝐀𝐌𝐋",
+ "𝖄𝕬𝕸𝕷",
+ "𝒀𝑨𝑴𝑳",
+ "𝓨𝓐𝓜𝓛"
+ ))
diff --git a/tests/test_module.py b/tests/test_module.py
index 7f4f62ba..b4e24e38 100644
--- a/tests/test_module.py
+++ b/tests/test_module.py
@@ -28,12 +28,14 @@ def setUp(self):
self.wd = tempfile.mkdtemp(prefix='yamllint-tests-')
# file with only one warning
- with open(os.path.join(self.wd, 'warn.yaml'), 'w') as f:
+ path = os.path.join(self.wd, 'warn.yaml')
+ with open(path, 'w', encoding='utf_8') as f:
f.write('key: value\n')
# file in dir
os.mkdir(os.path.join(self.wd, 'sub'))
- with open(os.path.join(self.wd, 'sub', 'nok.yaml'), 'w') as f:
+ path = os.path.join(self.wd, 'sub', 'nok.yaml')
+ with open(path, 'w', encoding='utf_8') as f:
f.write('---\n'
'list: [ 1, 1, 2, 3, 5, 8] \n')
diff --git a/yamllint/cli.py b/yamllint/cli.py
index 9a39bd8c..7059b852 100644
--- a/yamllint/cli.py
+++ b/yamllint/cli.py
@@ -219,7 +219,7 @@ def run(argv=None):
for file in find_files_recursively(args.files, conf):
filepath = file[2:] if file.startswith('./') else file
try:
- with open(file, newline='') as f:
+ with open(file, mode='rb') as f:
problems = linter.run(f, conf, filepath)
except OSError as e:
print(e, file=sys.stderr)
diff --git a/yamllint/config.py b/yamllint/config.py
index 9ce62549..b7d389fc 100644
--- a/yamllint/config.py
+++ b/yamllint/config.py
@@ -13,13 +13,13 @@
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
-import fileinput
import os.path
import pathspec
import yaml
import yamllint.rules
+from yamllint import decoder
class YamlLintConfigError(Exception):
@@ -38,8 +38,8 @@ def __init__(self, content=None, file=None):
self.locale = None
if file is not None:
- with open(file) as f:
- content = f.read()
+ with open(file, mode='rb') as f:
+ content = decoder.auto_decode(f.read())
self.parse(content)
self.validate()
@@ -109,8 +109,10 @@ def parse(self, raw_content):
raise YamlLintConfigError(
'invalid config: ignore-from-file should contain '
'filename(s), either as a list or string')
- with fileinput.input(conf['ignore-from-file']) as f:
- self.ignore = pathspec.PathSpec.from_lines('gitwildmatch', f)
+ self.ignore = pathspec.PathSpec.from_lines(
+ 'gitwildmatch',
+ decoder.lines_in_files(conf['ignore-from-file'])
+ )
elif 'ignore' in conf:
if isinstance(conf['ignore'], str):
self.ignore = pathspec.PathSpec.from_lines(
@@ -163,9 +165,10 @@ def validate_rule_conf(rule, conf):
raise YamlLintConfigError(
'invalid config: ignore-from-file should contain '
'valid filename(s), either as a list or string')
- with fileinput.input(conf['ignore-from-file']) as f:
- conf['ignore'] = pathspec.PathSpec.from_lines(
- 'gitwildmatch', f)
+ conf['ignore'] = pathspec.PathSpec.from_lines(
+ 'gitwildmatch',
+ decoder.lines_in_files(conf['ignore-from-file'])
+ )
elif ('ignore' in conf and not isinstance(
conf['ignore'], pathspec.pathspec.PathSpec)):
if isinstance(conf['ignore'], str):
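
Both hunks above replace fileinput.input(), which reads ignore files in text mode with the locale's default encoding, with decoder.lines_in_files() (added below), which reads each file in binary, detects its encoding and yields decoded lines. A minimal sketch of the resulting flow, with placeholder file names:

    import pathspec

    from yamllint import decoder

    # Each ignore file may use a different UTF encoding; the result is one flat
    # iterable of decoded lines handed straight to pathspec.
    lines = decoder.lines_in_files(['.gitignore', '.yamlignore'])
    ignore = pathspec.PathSpec.from_lines('gitwildmatch', lines)
    print(ignore.match_file('bin/file.yaml'))
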
diff --git a/yamllint/decoder.py b/yamllint/decoder.py
new file mode 100644
index 00000000..1e3c2f32
--- /dev/null
+++ b/yamllint/decoder.py
@@ -0,0 +1,65 @@
+# Copyright (C) 2023 Jason Yundt
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+import codecs
+
+
+def detect_encoding(stream_data):
+ """
+ Return stream_data’s character encoding
+
+ Specifically, this function will take a bytes object and return a string
+ that contains the name of one of Python’s built-in codecs [1].
+
+ The YAML spec says that streams must begin with a BOM or an ASCII
+ character. If stream_data doesn’t begin with either of those, then this
+ function might return the wrong encoding. See chapter 5.2 of the YAML spec
+ for details [2].
+
+ [1]: <https://docs.python.org/3/library/codecs.html#standard-encodings>
+ [2]: <https://yaml.org/spec/1.2.2/#52-character-encodings>
+ """
+ if stream_data.startswith(codecs.BOM_UTF32_BE):
+ return 'utf_32'
+ elif stream_data.startswith(b'\x00\x00\x00') and len(stream_data) >= 4:
+ return 'utf_32_be'
+ elif stream_data.startswith(codecs.BOM_UTF32_LE):
+ return 'utf_32'
+ elif stream_data[1:4] == b'\x00\x00\x00':
+ return 'utf_32_le'
+ elif stream_data.startswith(codecs.BOM_UTF16_BE):
+ return 'utf_16'
+ elif stream_data.startswith(b'\x00') and len(stream_data) >= 2:
+ return 'utf_16_be'
+ elif stream_data.startswith(codecs.BOM_UTF16_LE):
+ return 'utf_16'
+ elif stream_data[1:2] == b'\x00':
+ return 'utf_16_le'
+ elif stream_data.startswith(codecs.BOM_UTF8):
+ return 'utf_8_sig'
+ else:
+ return 'utf_8'
+
+
+def auto_decode(stream_data):
+ return stream_data.decode(encoding=detect_encoding(stream_data))
+
+
+def lines_in_files(paths):
+ """Autodecodes files and yields their lines."""
+ for path in paths:
+ with open(path, 'rb') as file:
+ text = auto_decode(file.read())
+ yield from text.splitlines()
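
A few illustrative checks of the module above, following directly from the detection logic:

    from yamllint import decoder

    assert decoder.detect_encoding(b'key: value\n') == 'utf_8'    # ASCII start, no BOM
    assert decoder.detect_encoding(b'\x00k') == 'utf_16_be'       # ASCII first char, no BOM
    assert decoder.detect_encoding(b'\xff\xfek\x00') == 'utf_16'  # UTF-16 LE BOM
    assert decoder.auto_decode(b'\xff\xfek\x00') == 'k'           # the BOM is stripped
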
diff --git a/yamllint/linter.py b/yamllint/linter.py
index a2faa061..2230a600 100644
--- a/yamllint/linter.py
+++ b/yamllint/linter.py
@@ -18,7 +18,7 @@
import yaml
-from yamllint import parser
+from yamllint import decoder, parser
PROBLEM_LEVELS = {
0: None,
@@ -187,6 +187,8 @@ def get_syntax_error(buffer):
def _run(buffer, conf, filepath):
assert hasattr(buffer, '__getitem__'), \
'_run() argument must be a buffer, not a stream'
+ if isinstance(buffer, bytes):
+ buffer = decoder.auto_decode(buffer)
first_line = next(parser.line_generator(buffer)).content
if re.match(r'^#\s*yamllint disable-file\s*$', first_line):
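
With this last change, linter.run() also accepts raw bytes (such as the contents of the files opened in binary mode in cli.py above) and decodes them before linting. A minimal usage sketch, assuming linter.run() forwards string and byte buffers to _run() as it does elsewhere in yamllint; the YAML content is made up:

    from yamllint import linter
    from yamllint.config import YamlLintConfig

    conf = YamlLintConfig('extends: default')
    buffer = 'key: value\n'.encode('utf_16')  # UTF-16 with a BOM
    for problem in linter.run(buffer, conf):
        print(problem)
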