diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cfccb39..cfe18dc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -4,7 +4,7 @@ on: branches: ["main"] pull_request: -jobs: +jobs: check-code-style: name: Check code style runs-on: ubuntu-latest @@ -13,11 +13,17 @@ jobs: - uses: actions/checkout@v4 - name: Install Rust uses: dtolnay/rust-toolchain@stable - - uses: actions/setup-python@v2 + + - name: Setup Poetry + uses: matrix-org/setup-python-poetry@v1 with: - python-version: "3.11" - - run: python -m pip install tox "poetry==1.8.3" - - run: tox -e check_codestyle + install-project: "false" + + - name: Run ruff check + run: poetry run ruff check --output-format=github . + + - name: Run ruff format + run: poetry run ruff format --check . check-types: name: Check types with Mypy @@ -26,12 +32,31 @@ jobs: - run: sudo apt-get install -y libmagic1 - uses: actions/checkout@v4 - name: Install Rust - uses: dtolnay/rust-toolchain@stable - - uses: actions/setup-python@v2 + uses: dtolnay/rust-toolchain@1.82.0 + - uses: Swatinem/rust-cache@v2 + + - name: Setup Poetry + uses: matrix-org/setup-python-poetry@v1 with: - python-version: "3.11" - - run: python -m pip install tox "poetry==1.8.3" - - run: tox -e check_types + # We have seen odd mypy failures that were resolved when we started + # installing the project again: + # https://github.com/matrix-org/synapse/pull/15376#issuecomment-1498983775 + # To make CI green, err towards caution and install the project. + install-project: "true" + + # Cribbed from + # https://github.com/AustinScola/mypy-cache-github-action/blob/85ea4f2972abed39b33bd02c36e341b28ca59213/src/restore.ts#L10-L17 + - name: Restore/persist mypy's cache + uses: actions/cache@v4 + with: + path: | + .mypy_cache + key: mypy-cache-${{ github.context.sha }} + restore-keys: mypy-cache- + + - name: Run mypy + run: poetry run mypy + unit-tests: name: Unit tests diff --git a/config.sample.yaml b/config.sample.yaml index f8d4f4e..954d343 100644 --- a/config.sample.yaml +++ b/config.sample.yaml @@ -36,9 +36,20 @@ scan: # List of allowed MIME types. If a file has a MIME type that's not in this list, its # scan is considered failed. + # Unrecognised binary files are considered to be `application/octet-stream`. + # Unrecognised text files are considered to be `text/plain`. # Optional, defaults to allowing all MIME types. allowed_mimetypes: ["image/jpeg"] + # List of blocked MIME types. + # If specified, `allowed_mimetypes` must not be specified as well. + # If specified, a file whose MIME type is on this list will produce a scan that is + # considered failed. + # Unrecognised binary files are considered to be `application/octet-stream`. + # Unrecognised text files are considered to be `text/plain`. + # Optional. + # blocked_mimetypes: ["image/jpeg"] + # Configuration of scan result caching. # # Results are stored in a cache to avoid having to download and scan a file twice. There diff --git a/mypy.ini b/mypy.ini index 1d781df..6f82dd9 100644 --- a/mypy.ini +++ b/mypy.ini @@ -1,2 +1,5 @@ [mypy] -strict = true \ No newline at end of file +strict = true +files = + tests/, + src/ diff --git a/perf/scanner_perf.py b/perf/scanner_perf.py old mode 100644 new mode 100755 diff --git a/poetry.lock b/poetry.lock index 57a8eea..cab0bb6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -167,52 +167,6 @@ docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphi tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] -[[package]] -name = "black" -version = "24.8.0" -description = "The uncompromising code formatter." -optional = false -python-versions = ">=3.8" -files = [ - {file = "black-24.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:09cdeb74d494ec023ded657f7092ba518e8cf78fa8386155e4a03fdcc44679e6"}, - {file = "black-24.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:81c6742da39f33b08e791da38410f32e27d632260e599df7245cccee2064afeb"}, - {file = "black-24.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:707a1ca89221bc8a1a64fb5e15ef39cd755633daa672a9db7498d1c19de66a42"}, - {file = "black-24.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:d6417535d99c37cee4091a2f24eb2b6d5ec42b144d50f1f2e436d9fe1916fe1a"}, - {file = "black-24.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:fb6e2c0b86bbd43dee042e48059c9ad7830abd5c94b0bc518c0eeec57c3eddc1"}, - {file = "black-24.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:837fd281f1908d0076844bc2b801ad2d369c78c45cf800cad7b61686051041af"}, - {file = "black-24.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:62e8730977f0b77998029da7971fa896ceefa2c4c4933fcd593fa599ecbf97a4"}, - {file = "black-24.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:72901b4913cbac8972ad911dc4098d5753704d1f3c56e44ae8dce99eecb0e3af"}, - {file = "black-24.8.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:7c046c1d1eeb7aea9335da62472481d3bbf3fd986e093cffd35f4385c94ae368"}, - {file = "black-24.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:649f6d84ccbae73ab767e206772cc2d7a393a001070a4c814a546afd0d423aed"}, - {file = "black-24.8.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2b59b250fdba5f9a9cd9d0ece6e6d993d91ce877d121d161e4698af3eb9c1018"}, - {file = "black-24.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:6e55d30d44bed36593c3163b9bc63bf58b3b30e4611e4d88a0c3c239930ed5b2"}, - {file = "black-24.8.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:505289f17ceda596658ae81b61ebbe2d9b25aa78067035184ed0a9d855d18afd"}, - {file = "black-24.8.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b19c9ad992c7883ad84c9b22aaa73562a16b819c1d8db7a1a1a49fb7ec13c7d2"}, - {file = "black-24.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f13f7f386f86f8121d76599114bb8c17b69d962137fc70efe56137727c7047e"}, - {file = "black-24.8.0-cp38-cp38-win_amd64.whl", hash = "sha256:f490dbd59680d809ca31efdae20e634f3fae27fba3ce0ba3208333b713bc3920"}, - {file = "black-24.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:eab4dd44ce80dea27dc69db40dab62d4ca96112f87996bca68cd75639aeb2e4c"}, - {file = "black-24.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3c4285573d4897a7610054af5a890bde7c65cb466040c5f0c8b732812d7f0e5e"}, - {file = "black-24.8.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9e84e33b37be070ba135176c123ae52a51f82306def9f7d063ee302ecab2cf47"}, - {file = "black-24.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:73bbf84ed136e45d451a260c6b73ed674652f90a2b3211d6a35e78054563a9bb"}, - {file = "black-24.8.0-py3-none-any.whl", hash = "sha256:972085c618ee94f402da1af548a4f218c754ea7e5dc70acb168bfaca4c2542ed"}, - {file = "black-24.8.0.tar.gz", hash = "sha256:2500945420b6784c38b9ee885af039f5e7471ef284ab03fa35ecdde4688cd83f"}, -] - -[package.dependencies] -click = ">=8.0.0" -mypy-extensions = ">=0.4.3" -packaging = ">=22.0" -pathspec = ">=0.9.0" -platformdirs = ">=2" -tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -typing-extensions = {version = ">=4.0.1", markers = "python_version < \"3.11\""} - -[package.extras] -colorama = ["colorama (>=0.4.3)"] -d = ["aiohttp (>=3.7.4)", "aiohttp (>=3.7.4,!=3.9.0)"] -jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] -uvloop = ["uvloop (>=0.15.2)"] - [[package]] name = "cachetools" version = "5.5.0" @@ -235,47 +189,6 @@ files = [ {file = "canonicaljson-2.0.0.tar.gz", hash = "sha256:e2fdaef1d7fadc5d9cb59bd3d0d41b064ddda697809ac4325dced721d12f113f"}, ] -[[package]] -name = "click" -version = "8.1.7" -description = "Composable command line interface toolkit" -optional = false -python-versions = ">=3.7" -files = [ - {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, - {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, -] - -[package.dependencies] -colorama = {version = "*", markers = "platform_system == \"Windows\""} - -[[package]] -name = "colorama" -version = "0.4.6" -description = "Cross-platform colored terminal text." -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" -files = [ - {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, - {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, -] - -[[package]] -name = "flake8" -version = "7.1.1" -description = "the modular source code checker: pep8 pyflakes and co" -optional = false -python-versions = ">=3.8.1" -files = [ - {file = "flake8-7.1.1-py2.py3-none-any.whl", hash = "sha256:597477df7860daa5aa0fdd84bf5208a043ab96b8e96ab708770ae0364dd03213"}, - {file = "flake8-7.1.1.tar.gz", hash = "sha256:049d058491e228e03e67b390f311bbf88fce2dbaa8fa673e7aea87b7198b8d38"}, -] - -[package.dependencies] -mccabe = ">=0.7.0,<0.8.0" -pycodestyle = ">=2.12.0,<2.13.0" -pyflakes = ">=3.2.0,<3.3.0" - [[package]] name = "frozenlist" version = "1.4.1" @@ -387,20 +300,6 @@ files = [ {file = "idna-3.8.tar.gz", hash = "sha256:d838c2c0ed6fced7693d5e8ab8e734d5f8fda53a039c0164afb0b82e771e3603"}, ] -[[package]] -name = "isort" -version = "5.13.2" -description = "A Python utility / library to sort Python imports." -optional = false -python-versions = ">=3.8.0" -files = [ - {file = "isort-5.13.2-py3-none-any.whl", hash = "sha256:8ca5e72a8d85860d5a3fa69b8745237f2939afe12dbf656afbcb47fe72d947a6"}, - {file = "isort-5.13.2.tar.gz", hash = "sha256:48fdfcb9face5d58a4f6dde2e72a1fb8dcaf8ab26f95ab49fab84c2ddefb0109"}, -] - -[package.extras] -colors = ["colorama (>=0.4.6)"] - [[package]] name = "jsonschema" version = "4.23.0" @@ -436,17 +335,6 @@ files = [ [package.dependencies] referencing = ">=0.31.0" -[[package]] -name = "mccabe" -version = "0.7.0" -description = "McCabe checker, plugin for flake8" -optional = false -python-versions = ">=3.6" -files = [ - {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, - {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, -] - [[package]] name = "multidict" version = "6.0.5" @@ -604,66 +492,6 @@ files = [ {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, ] -[[package]] -name = "packaging" -version = "24.1" -description = "Core utilities for Python packages" -optional = false -python-versions = ">=3.8" -files = [ - {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, - {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, -] - -[[package]] -name = "pathspec" -version = "0.12.1" -description = "Utility library for gitignore style pattern matching of file paths." -optional = false -python-versions = ">=3.8" -files = [ - {file = "pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08"}, - {file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"}, -] - -[[package]] -name = "platformdirs" -version = "4.2.2" -description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." -optional = false -python-versions = ">=3.8" -files = [ - {file = "platformdirs-4.2.2-py3-none-any.whl", hash = "sha256:2d7a1657e36a80ea911db832a8a6ece5ee53d8de21edd5cc5879af6530b1bfee"}, - {file = "platformdirs-4.2.2.tar.gz", hash = "sha256:38b7b51f512eed9e84a22788b4bce1de17c0adb134d6becb09836e37d8654cd3"}, -] - -[package.extras] -docs = ["furo (>=2023.9.10)", "proselint (>=0.13)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.25.2)"] -test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)"] -type = ["mypy (>=1.8)"] - -[[package]] -name = "pycodestyle" -version = "2.12.1" -description = "Python style guide checker" -optional = false -python-versions = ">=3.8" -files = [ - {file = "pycodestyle-2.12.1-py2.py3-none-any.whl", hash = "sha256:46f0fb92069a7c28ab7bb558f05bfc0110dac69a0cd23c61ea0040283a9d78b3"}, - {file = "pycodestyle-2.12.1.tar.gz", hash = "sha256:6838eae08bbce4f6accd5d5572075c63626a15ee3e6f842df996bf62f6d73521"}, -] - -[[package]] -name = "pyflakes" -version = "3.2.0" -description = "passive checker of Python programs" -optional = false -python-versions = ">=3.8" -files = [ - {file = "pyflakes-3.2.0-py2.py3-none-any.whl", hash = "sha256:84b5be138a2dfbb40689ca07e2152deb896a65c3a3e24c251c5c62489568074a"}, - {file = "pyflakes-3.2.0.tar.gz", hash = "sha256:1c61603ff154621fb2a9172037d84dca3500def8c8b630657d1701f026f8af3f"}, -] - [[package]] name = "pyreadline3" version = "3.4.1" @@ -875,6 +703,33 @@ files = [ {file = "rpds_py-0.20.0.tar.gz", hash = "sha256:d72a210824facfdaf8768cf2d7ca25a042c30320b3020de2fa04640920d4e121"}, ] +[[package]] +name = "ruff" +version = "0.7.2" +description = "An extremely fast Python linter and code formatter, written in Rust." +optional = false +python-versions = ">=3.7" +files = [ + {file = "ruff-0.7.2-py3-none-linux_armv6l.whl", hash = "sha256:b73f873b5f52092e63ed540adefc3c36f1f803790ecf2590e1df8bf0a9f72cb8"}, + {file = "ruff-0.7.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:5b813ef26db1015953daf476202585512afd6a6862a02cde63f3bafb53d0b2d4"}, + {file = "ruff-0.7.2-py3-none-macosx_11_0_arm64.whl", hash = "sha256:853277dbd9675810c6826dad7a428d52a11760744508340e66bf46f8be9701d9"}, + {file = "ruff-0.7.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21aae53ab1490a52bf4e3bf520c10ce120987b047c494cacf4edad0ba0888da2"}, + {file = "ruff-0.7.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ccc7e0fc6e0cb3168443eeadb6445285abaae75142ee22b2b72c27d790ab60ba"}, + {file = "ruff-0.7.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd77877a4e43b3a98e5ef4715ba3862105e299af0c48942cc6d51ba3d97dc859"}, + {file = "ruff-0.7.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:e00163fb897d35523c70d71a46fbaa43bf7bf9af0f4534c53ea5b96b2e03397b"}, + {file = "ruff-0.7.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f3c54b538633482dc342e9b634d91168fe8cc56b30a4b4f99287f4e339103e88"}, + {file = "ruff-0.7.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b792468e9804a204be221b14257566669d1db5c00d6bb335996e5cd7004ba80"}, + {file = "ruff-0.7.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dba53ed84ac19ae4bfb4ea4bf0172550a2285fa27fbb13e3746f04c80f7fa088"}, + {file = "ruff-0.7.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:b19fafe261bf741bca2764c14cbb4ee1819b67adb63ebc2db6401dcd652e3748"}, + {file = "ruff-0.7.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:28bd8220f4d8f79d590db9e2f6a0674f75ddbc3847277dd44ac1f8d30684b828"}, + {file = "ruff-0.7.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:9fd67094e77efbea932e62b5d2483006154794040abb3a5072e659096415ae1e"}, + {file = "ruff-0.7.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:576305393998b7bd6c46018f8104ea3a9cb3fa7908c21d8580e3274a3b04b691"}, + {file = "ruff-0.7.2-py3-none-win32.whl", hash = "sha256:fa993cfc9f0ff11187e82de874dfc3611df80852540331bc85c75809c93253a8"}, + {file = "ruff-0.7.2-py3-none-win_amd64.whl", hash = "sha256:dd8800cbe0254e06b8fec585e97554047fb82c894973f7ff18558eee33d1cb88"}, + {file = "ruff-0.7.2-py3-none-win_arm64.whl", hash = "sha256:bb8368cd45bba3f57bb29cbb8d64b4a33f8415d0149d2655c5c8539452ce7760"}, + {file = "ruff-0.7.2.tar.gz", hash = "sha256:2b14e77293380e475b4e3a7a368e14549288ed2931fce259a6f99978669e844f"}, +] + [[package]] name = "semantic-version" version = "2.10.0" @@ -1100,4 +955,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.10.0" -content-hash = "c323c458dab23e7c50da3b0549d282fe79cf287e44a1501862f0cd030ad4a82e" +content-hash = "9c601c1dba23ffa589203c39162eb727ca4a65851837d7d3e12318eae0d324f8" diff --git a/pyproject.toml b/pyproject.toml index 50e29b6..10e9a82 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,13 +2,63 @@ requires = ["poetry-core", "wheel", "setuptools-rust"] build-backend = "poetry.core.masonry.api" -[tool.isort] -profile = "black" -known_first_party = [ - "src", - "tests" +[tool.ruff] +line-length = 88 +target-version = "py38" + +[tool.ruff.lint] +# See https://beta.ruff.rs/docs/rules/#error-e +# for error codes. The ones we ignore are: +# E501: line too long (we don't normally run this check in other projects such as Synapse) +# E731: do not assign a lambda expression, use a def +# +# flake8-bugbear compatible checks. Its error codes are described at +# https://beta.ruff.rs/docs/rules/#flake8-bugbear-b +# B023: Functions defined inside a loop must not use variables redefined in the loop +ignore = [ + "B023", + "E501", + "E731", +] +select = [ + # pycodestyle + "E", + "W", + # pyflakes + "F", + # isort + "I001", + # flake8-bugbear + "B0", + # flake8-comprehensions + "C4", + # flake8-2020 + "YTT", + # flake8-slots + "SLOT", + # flake8-debugger + "T10", + # flake8-pie + "PIE", + # flake8-executable + "EXE", ] +[tool.ruff.lint.isort] +combine-as-imports = true +section-order = ["future", "standard-library", "third-party", "twisted", "first-party", "testing", "local-folder"] +known-first-party = ["matrix_content_scanner"] + +[tool.ruff.lint.isort.sections] +twisted = ["twisted", "OpenSSL"] +testing = ["tests"] + +[tool.ruff.format] +quote-style = "double" +indent-style = "space" +skip-magic-trailing-comma = false +line-ending = "auto" + [tool.maturin] manifest-path = "rust/Cargo.toml" module-name = "matrix_content_scanner.mcs_rust" @@ -54,10 +104,8 @@ canonicaljson = ">=1.6.3" setuptools_rust = ">=1.3" [tool.poetry.dev-dependencies] -# for linting -isort = ">=5.10.1" -black = ">=22.7.0" -flake8 = ">=4.0.1" +# for linting and formatting +ruff = "^0.7.2" # for type checking mypy = "*" types-jsonschema = ">=3.2.0" diff --git a/scripts-dev/lint.sh b/scripts-dev/lint.sh index 332f130..8931e64 100755 --- a/scripts-dev/lint.sh +++ b/scripts-dev/lint.sh @@ -1,8 +1,6 @@ #!/usr/bin/env bash # Runs linting scripts and type checking -# isort - sorts import statements -# black - opinionated code formatter -# flake8 - lints and finds mistakes +# ruff - sorts import statements, lints and finds mistakes, formats the code # mypy - checks type annotations set -e @@ -16,7 +14,12 @@ files=( # Print out the commands being run set -x -isort "${files[@]}" -python3 -m black "${files[@]}" -flake8 "${files[@]}" +# Catch any common programming mistakes in Python code. +# --quiet suppresses the update check. +ruff check --quiet --fix "${files[@]}" + +# Reformat Python code. +ruff format --quiet "${files[@]}" + +# Type-check the code. mypy "${files[@]}" diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 8d6fec0..0000000 --- a/setup.cfg +++ /dev/null @@ -1,9 +0,0 @@ -[flake8] -# see https://pycodestyle.readthedocs.io/en/latest/intro.html#error-codes -# for error codes. The ones we ignore are: -# W503: line break before binary operator -# W504: line break after binary operator -# E203: whitespace before ':' (which is contrary to pep8?) -# E501: Line too long (black enforces this for us) -# (this is a subset of those ignored in Synapse) -ignore=W503,W504,E203,E501 diff --git a/src/matrix_content_scanner/config.py b/src/matrix_content_scanner/config.py index 777ef44..d396a2e 100644 --- a/src/matrix_content_scanner/config.py +++ b/src/matrix_content_scanner/config.py @@ -75,6 +75,7 @@ def _parse_size(size: Optional[Union[str, float]]) -> Optional[float]: "temp_directory": {"type": "string"}, "removal_command": {"type": "string"}, "allowed_mimetypes": {"type": "array", "items": {"type": "string"}}, + "blocked_mimetypes": {"type": "array", "items": {"type": "string"}}, }, }, "download": { @@ -131,6 +132,7 @@ class ScanConfig: temp_directory: str removal_command: str = "rm" allowed_mimetypes: Optional[List[str]] = None + blocked_mimetypes: Optional[List[str]] = None @attr.s(auto_attribs=True, frozen=True, slots=True) @@ -175,3 +177,15 @@ def __init__(self, config_dict: Dict[str, Any]): self.crypto = CryptoConfig(**(config_dict.get("crypto") or {})) self.download = DownloadConfig(**(config_dict.get("download") or {})) self.result_cache = ResultCacheConfig(**(config_dict.get("result_cache") or {})) + + # Don't allow both allowlist and blocklist for MIME types, since we do not document + # the semantics for that and it is in any case pointless. + # This could have been expressed in JSONSchema but I suspect the error message would be poor + # in that case. + if ( + self.scan.allowed_mimetypes is not None + and self.scan.blocked_mimetypes is not None + ): + raise ConfigError( + "Both `scan.allowed_mimetypes` and `scan.blocked_mimetypes` are specified, which is not allowed!" + ) diff --git a/src/matrix_content_scanner/scanner/file_downloader.py b/src/matrix_content_scanner/scanner/file_downloader.py index ee2271e..389030d 100644 --- a/src/matrix_content_scanner/scanner/file_downloader.py +++ b/src/matrix_content_scanner/scanner/file_downloader.py @@ -29,8 +29,6 @@ class _PathNotFoundException(Exception): homeserver. """ - pass - class FileDownloader: MEDIA_DOWNLOAD_PREFIX = "_matrix/media/%s/download" @@ -62,7 +60,9 @@ async def download_file( ContentScannerRestError: The file was not found or could not be downloaded due to an error on the remote homeserver's side. """ - url = await self._build_https_url(media_path) + url = await self._build_https_url( + media_path, for_thumbnail=thumbnail_params is not None + ) # Attempt to retrieve the file at the generated URL. try: @@ -73,7 +73,11 @@ async def download_file( # again with an r0 endpoint. logger.info("File not found, trying legacy r0 path") - url = await self._build_https_url(media_path, endpoint_version="r0") + url = await self._build_https_url( + media_path, + endpoint_version="r0", + for_thumbnail=thumbnail_params is not None, + ) try: file = await self._get_file_content(url, thumbnail_params) @@ -91,6 +95,8 @@ async def _build_https_url( self, media_path: str, endpoint_version: str = "v3", + *, + for_thumbnail: bool, ) -> str: """Turn a `server_name/media_id` path into an https:// one we can use to fetch the media. @@ -102,6 +108,9 @@ async def _build_https_url( media_path: The media path to translate. endpoint_version: The version of the download endpoint to use. As of Matrix v1.1, this is either "v3" or "r0". + for_thumbnail: True if a server-side thumbnail is desired instead of the full + media. In that case, the URL for the `/thumbnail` endpoint is returned + instead of the `/download` endpoint. Returns: An https URL to use. If `base_homeserver_url` is set in the config, this @@ -131,7 +140,9 @@ async def _build_https_url( # didn't find a .well-known file. base_url = "https://" + server_name - prefix = self.MEDIA_DOWNLOAD_PREFIX + prefix = ( + self.MEDIA_THUMBNAIL_PREFIX if for_thumbnail else self.MEDIA_DOWNLOAD_PREFIX + ) # Build the full URL. path_prefix = prefix % endpoint_version diff --git a/src/matrix_content_scanner/scanner/scanner.py b/src/matrix_content_scanner/scanner/scanner.py index 774326f..53e9806 100644 --- a/src/matrix_content_scanner/scanner/scanner.py +++ b/src/matrix_content_scanner/scanner/scanner.py @@ -76,11 +76,18 @@ def __init__(self, mcs: "MatrixContentScanner"): self._max_size_to_cache = mcs.config.result_cache.max_file_size - # List of MIME types we should allow. If None, we don't fail files based on their + # List of MIME types we should allow. + # If None, we fall back to `_blocked_mimetypes`. + # If that's also None, we don't fail files based on their # MIME types (besides comparing it with the Content-Type header from the server # for unencrypted files). self._allowed_mimetypes = mcs.config.scan.allowed_mimetypes + # List of MIME types we should block. + # Must not be specified at the same time as `_allowed_mimetypes`. + # See the comment for `_allowed_mimetypes` for the semantics. + self._blocked_mimetypes = mcs.config.scan.blocked_mimetypes + # Cache of futures for files that are currently scanning and downloading, so that # concurrent requests don't cause a file to be downloaded and scanned twice. self._current_scans: Dict[str, Future[MediaDescription]] = {} @@ -272,7 +279,6 @@ async def _scan_file( self._max_size_to_cache is not None and len(media.content) > self._max_size_to_cache ): - # Don't cache the file's content if it exceeds the maximum allowed file # size, to minimise memory usage. logger.info( @@ -506,3 +512,17 @@ def _check_mimetype(self, media_content: bytes) -> None: raise FileMimeTypeForbiddenError( f"File type: {detected_mimetype} not allowed" ) + + # If there's a block list for MIME types, check that the MIME type detected for + # this file is NOT in it. + if ( + self._blocked_mimetypes is not None + and detected_mimetype in self._blocked_mimetypes + ): + logger.error( + "MIME type for file is forbidden: %s", + detected_mimetype, + ) + raise FileMimeTypeForbiddenError( + f"File type: {detected_mimetype} not allowed" + ) diff --git a/src/matrix_content_scanner/utils/errors.py b/src/matrix_content_scanner/utils/errors.py index 1469799..2274852 100644 --- a/src/matrix_content_scanner/utils/errors.py +++ b/src/matrix_content_scanner/utils/errors.py @@ -54,10 +54,6 @@ def __init__(self, info: Optional[str]) -> None: class ConfigError(Exception): """An error indicating an issue with the configuration file.""" - pass - class WellKnownDiscoveryError(Exception): """An error indicating a failure when attempting a .well-known discovery.""" - - pass diff --git a/tests/scanner/test_file_downloader.py b/tests/scanner/test_file_downloader.py index 921cd8b..ce709d4 100644 --- a/tests/scanner/test_file_downloader.py +++ b/tests/scanner/test_file_downloader.py @@ -14,6 +14,7 @@ WellKnownDiscoveryError, ) from matrix_content_scanner.utils.types import JsonDict + from tests.testutils import ( MEDIA_PATH, SMALL_PNG, @@ -42,9 +43,13 @@ async def _get( .well-known client file. """ if ( - url.endswith("/_matrix/media/v3/download/" + MEDIA_PATH) + url.endswith( + ( + "/_matrix/media/v3/download/" + MEDIA_PATH, + "/_matrix/media/r0/download/" + MEDIA_PATH, + ) + ) or "/_matrix/media/v3/thumbnail/" + MEDIA_PATH in url - or url.endswith("/_matrix/media/r0/download/" + MEDIA_PATH) or "/_matrix/media/r0/thumbnail/" + MEDIA_PATH in url ): return self.media_status, self.media_body, self.media_headers @@ -64,8 +69,8 @@ async def test_download(self) -> None: self.assertEqual(media.content_type, "image/png") # Check that we tried downloading from the set base URL. - args = self.get_mock.call_args - self.assertTrue(args[0][0].startswith("http://my-site.com/")) + args = self.get_mock.call_args.args + self.assertTrue(args[0].startswith("http://my-site.com/")) async def test_no_base_url(self) -> None: """Tests that configuring a base homeserver URL means files are downloaded from @@ -141,7 +146,9 @@ async def test_thumbnail(self) -> None: MEDIA_PATH, to_thumbnail_params({"height": "50"}) ) - query: CIMultiDictProxy[str] = self.get_mock.call_args[1]["query"] + url: str = self.get_mock.call_args.args[0] + query: CIMultiDictProxy[str] = self.get_mock.call_args.kwargs["query"] + self.assertIn("/thumbnail/", url) self.assertIn("height", query) self.assertEqual(query.get("height"), "50", query.getall("height")) diff --git a/tests/scanner/test_scanner.py b/tests/scanner/test_scanner.py index 0367a58..6ea61fd 100644 --- a/tests/scanner/test_scanner.py +++ b/tests/scanner/test_scanner.py @@ -18,11 +18,14 @@ FileMimeTypeForbiddenError, ) from matrix_content_scanner.utils.types import MediaDescription + from tests.testutils import ( ENCRYPTED_FILE_METADATA, MEDIA_PATH, + SMALL_BINARY_FILE, SMALL_PNG, SMALL_PNG_ENCRYPTED, + SMALL_TEXT_FILE, get_content_scanner, to_thumbnail_params, ) @@ -218,7 +221,7 @@ async def test_different_encryption_key(self) -> None: # But it also causes it to be downloaded again because its metadata have changed. self.assertEqual(self.downloader_mock.call_count, 2) - async def test_mimetype(self) -> None: + async def test_allowlist_mimetype(self) -> None: """Tests that, if there's an allow list for MIME types and the file's MIME type isn't in it, the file's scan fails. """ @@ -229,7 +232,7 @@ async def test_mimetype(self) -> None: with self.assertRaises(FileMimeTypeForbiddenError): await self.scanner.scan_file(MEDIA_PATH) - async def test_mimetype_encrypted(self) -> None: + async def test_allowlist_mimetype_encrypted(self) -> None: """Tests that the file's MIME type is correctly detected and compared with the allow list (if set), even if it's encrypted. """ @@ -242,6 +245,66 @@ async def test_mimetype_encrypted(self) -> None: with self.assertRaises(FileMimeTypeForbiddenError): await self.scanner.scan_file(MEDIA_PATH, ENCRYPTED_FILE_METADATA) + async def test_blocklist_mimetype(self) -> None: + """Tests that, if there's an allow list for MIME types and the file's MIME type + isn't in it, the file's scan fails. + """ + # Set a block list that blocks PNG images. + self.scanner._blocked_mimetypes = ["image/png"] + + # Check that the scan fails since the file is a PNG. + with self.assertRaises(FileMimeTypeForbiddenError): + await self.scanner.scan_file(MEDIA_PATH) + + async def test_blocklist_mimetype_encrypted(self) -> None: + """Tests that the file's MIME type is correctly detected and compared with the + allow list (if set), even if it's encrypted. + """ + self._setup_encrypted() + + # Set a block list that blocks PNG images. + self.scanner._blocked_mimetypes = ["image/png"] + + # Check that the scan fails since the file is a PNG. + with self.assertRaises(FileMimeTypeForbiddenError): + await self.scanner.scan_file(MEDIA_PATH, ENCRYPTED_FILE_METADATA) + + async def test_blocklist_mimetype_fallback_binary_file(self) -> None: + """Tests that unrecognised binary files' MIME type is assumed to be + `application/octet-stream` and that they can be blocked in this way. + """ + + self.downloader_res = MediaDescription( + # This is the *claimed* content-type by the uploader + content_type="application/vnd.io.element.generic_binary_file", + content=SMALL_BINARY_FILE, + response_headers=CIMultiDictProxy(CIMultiDict()), + ) + + # Set a block list that blocks uncategorised binary files. + self.scanner._blocked_mimetypes = ["application/octet-stream"] + + with self.assertRaises(FileMimeTypeForbiddenError): + await self.scanner.scan_file(MEDIA_PATH) + + async def test_blocklist_mimetype_fallback_text_file(self) -> None: + """Tests that unrecognised text files' MIME type is assumed to be + `text/plain` and that they can be blocked in this way. + """ + + self.downloader_res = MediaDescription( + # This is the *claimed* content-type by the uploader + content_type="application/vnd.io.element.generic_file", + content=SMALL_TEXT_FILE, + response_headers=CIMultiDictProxy(CIMultiDict()), + ) + + # Set a block list that blocks uncategorised text files. + self.scanner._blocked_mimetypes = ["text/plain"] + + with self.assertRaises(FileMimeTypeForbiddenError): + await self.scanner.scan_file(MEDIA_PATH) + async def test_dont_cache_exit_codes(self) -> None: """Tests that if the configuration specifies exit codes to ignore when running the scanning script, we don't cache them. diff --git a/tests/servlets/test_scan.py b/tests/servlets/test_scan.py index f666a05..5736919 100644 --- a/tests/servlets/test_scan.py +++ b/tests/servlets/test_scan.py @@ -12,6 +12,7 @@ from matrix_content_scanner.httpserver import HTTPServer from matrix_content_scanner.utils.constants import ErrCode from matrix_content_scanner.utils.errors import ContentScannerRestError + from tests.testutils import get_content_scanner SERVER_NAME = "test" diff --git a/tests/servlets/test_servlets.py b/tests/servlets/test_servlets.py index a6e169c..c50c834 100644 --- a/tests/servlets/test_servlets.py +++ b/tests/servlets/test_servlets.py @@ -9,6 +9,7 @@ from matrix_content_scanner.utils.constants import ErrCode from matrix_content_scanner.utils.errors import ContentScannerRestError from matrix_content_scanner.utils.types import JsonDict + from tests.testutils import ENCRYPTED_FILE_METADATA, get_content_scanner diff --git a/tests/testutils.py b/tests/testutils.py index 6bd9649..f57e816 100644 --- a/tests/testutils.py +++ b/tests/testutils.py @@ -22,6 +22,12 @@ b"0a2db40000000049454e44ae426082" ) +# A small binary file without any specific format. +SMALL_BINARY_FILE = unhexlify(b"010203") + +# A small text file without any specific format. +SMALL_TEXT_FILE = b"Hello world\nThis is a tiny text file" + # A small, encrypted PNG. SMALL_PNG_ENCRYPTED = unhexlify( b"9fd28dd7a1d845a04948f13af104e39402c888f7b601bce313ad" diff --git a/tests/utils/test_encrypted_file_metadata.py b/tests/utils/test_encrypted_file_metadata.py index e1c4053..102215e 100644 --- a/tests/utils/test_encrypted_file_metadata.py +++ b/tests/utils/test_encrypted_file_metadata.py @@ -10,6 +10,7 @@ validate_encrypted_file_metadata, ) from matrix_content_scanner.utils.errors import ContentScannerRestError + from tests.testutils import ENCRYPTED_FILE_METADATA diff --git a/tox.ini b/tox.ini index 75604ae..7208693 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py, check_codestyle, check_types +envlist = py # required for PEP 517 (pyproject.toml-style) builds isolated_build = true @@ -20,24 +20,3 @@ commands = poetry install usedevelop=true commands = poetry run python -m unittest discover tests - -[testenv:check_codestyle] -deps = - flake8 - black - isort - -commands = - poetry run flake8 perf src tests - poetry run black --check --diff perf src tests - poetry run isort --check-only --diff perf src tests - -[testenv:check_types] -deps = - mypy - types-jsonschema - types-PyYAML - types-cachetools - types-humanfriendly - -commands = poetry run mypy perf src tests