From d1be2bca4afb41173a3d6f9ba49664823ab1cff9 Mon Sep 17 00:00:00 2001 From: Greg Tatum Date: Tue, 12 Dec 2023 15:08:59 -0600 Subject: [PATCH] Update the find corpus tool to provide more information (#280) * Add pytest-clarity for better text diffs in tests * Add requests_mock for tests * Add the test_data artifact to the .gitignore * Use an underscore with find_corpus.py * Update the find corpus tool to provide more information * Add humanize to the dependency list --- .gitignore | 2 + Makefile | 4 +- poetry.lock | 138 ++++++++++++++--- pyproject.toml | 3 + tests/test_find_corpus.py | 154 +++++++++++++++++++ utils/find-corpus.py | 70 --------- utils/find_corpus.py | 308 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 589 insertions(+), 90 deletions(-) create mode 100644 tests/test_find_corpus.py delete mode 100755 utils/find-corpus.py create mode 100755 utils/find_corpus.py diff --git a/.gitignore b/.gitignore index 7ba905883..d197eea0c 100644 --- a/.gitignore +++ b/.gitignore @@ -136,3 +136,5 @@ dmypy.json .models .bin .snakemake + +tests_data diff --git a/Makefile b/Makefile index f924da325..7247b2ebd 100644 --- a/Makefile +++ b/Makefile @@ -164,8 +164,8 @@ fix-all: # Run unit tests run-tests: - poetry install --only tests - PYTHONPATH=$$(pwd) poetry run pytest tests + poetry install --only tests --only utils + PYTHONPATH=$$(pwd) poetry run pytest tests -vv # Validates Taskcluster task graph locally validate-taskgraph: diff --git a/poetry.lock b/poetry.lock index ab9180479..906b37f09 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. 
[[package]] name = "absl-py" @@ -1021,6 +1021,20 @@ files = [ [package.dependencies] pyreadline3 = {version = "*", markers = "sys_platform == \"win32\" and python_version >= \"3.8\""} +[[package]] +name = "humanize" +version = "4.9.0" +description = "Python humanize utilities" +optional = false +python-versions = ">=3.8" +files = [ + {file = "humanize-4.9.0-py3-none-any.whl", hash = "sha256:ce284a76d5b1377fd8836733b983bfb0b76f1aa1c090de2566fcf008d7f6ab16"}, + {file = "humanize-4.9.0.tar.gz", hash = "sha256:582a265c931c683a7e9b8ed9559089dea7edcf6cc95be39a3cbc2c5d5ac2bcfa"}, +] + +[package.extras] +tests = ["freezegun", "pytest", "pytest-cov"] + [[package]] name = "idna" version = "3.4" @@ -1189,6 +1203,30 @@ files = [ docs = ["mdx-gh-links (>=0.2)", "mkdocs (>=1.5)", "mkdocs-gen-files", "mkdocs-literate-nav", "mkdocs-nature (>=0.6)", "mkdocs-section-index", "mkdocstrings[python]"] testing = ["coverage", "pyyaml"] +[[package]] +name = "markdown-it-py" +version = "3.0.0" +description = "Python port of markdown-it. Markdown parsing, done right!" 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, + {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, +] + +[package.dependencies] +mdurl = ">=0.1,<1.0" + +[package.extras] +benchmarking = ["psutil", "pytest", "pytest-benchmark"] +code-style = ["pre-commit (>=3.0,<4.0)"] +compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"] +linkify = ["linkify-it-py (>=1,<3)"] +plugins = ["mdit-py-plugins"] +profiling = ["gprof2dot"] +rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] +testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] + [[package]] name = "markupsafe" version = "2.1.3" @@ -1258,6 +1296,17 @@ files = [ {file = "MarkupSafe-2.1.3.tar.gz", hash = "sha256:af598ed32d6ae86f1b747b82783958b1a4ab8f617b06fe68795c7f026abbdcad"}, ] +[[package]] +name = "mdurl" +version = "0.1.2" +description = "Markdown URL utilities" +optional = false +python-versions = ">=3.7" +files = [ + {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, + {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, +] + [[package]] name = "mohawk" version = "1.1.0" @@ -1720,6 +1769,17 @@ docs = ["sphinx (>=1.7.1)"] redis = ["redis"] tests = ["pytest (>=5.4.1)", "pytest-cov (>=2.8.1)", "pytest-flake8 (>=1.0.5)", "pytest-mypy (>=0.8.0)", "redis", "sphinx (>=3.0.3)"] +[[package]] +name = "pprintpp" +version = "0.4.0" +description = "A drop-in replacement for pprint that's actually pretty" +optional = false +python-versions = "*" +files = [ + {file = "pprintpp-0.4.0-py2.py3-none-any.whl", 
hash = "sha256:b6b4dcdd0c0c0d75e4d7b2f21a9e933e5b2ce62b26e1a54537f9651ae5a5c01d"}, + {file = "pprintpp-0.4.0.tar.gz", hash = "sha256:ea826108e2c7f49dc6d66c752973c3fc9749142a798d6b254e1e301cfdbc6403"}, +] + [[package]] name = "prefixed" version = "0.7.0" @@ -2008,6 +2068,21 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +[[package]] +name = "pytest-clarity" +version = "1.0.1" +description = "A plugin providing an alternative, colourful diff output for failing assertions." +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "pytest-clarity-1.0.1.tar.gz", hash = "sha256:505fe345fad4fe11c6a4187fe683f2c7c52c077caa1e135f3e483fe112db7772"}, +] + +[package.dependencies] +pprintpp = ">=0.4.0" +pytest = ">=3.5.0" +rich = ">=8.0.0" + [[package]] name = "python-dateutil" version = "2.8.2" @@ -2068,7 +2143,6 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, - {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = 
"PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -2076,15 +2150,8 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, - {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, - {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, - {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, - {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", 
hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -2101,7 +2168,6 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, - {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -2109,7 +2175,6 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, - {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -2234,6 +2299,25 @@ urllib3 = ">=1.21.1,<1.27" socks = ["PySocks (>=1.5.6,!=1.5.7)", "win-inet-pton"] use-chardet-on-py3 = ["chardet (>=3.0.2,<5)"] +[[package]] +name = "requests-mock" +version = "1.11.0" +description = "Mock out responses from the requests package" +optional = false +python-versions = "*" +files = [ + {file = "requests-mock-1.11.0.tar.gz", hash = "sha256:ef10b572b489a5f28e09b708697208c4a3b2b89ef80a9f01584340ea357ec3c4"}, + {file = "requests_mock-1.11.0-py2.py3-none-any.whl", hash = "sha256:f7fae383f228633f6bececebdab236c478ace2284d6292c6e7e2867b9ab74d15"}, +] + +[package.dependencies] +requests = ">=2.3,<3" +six = "*" + +[package.extras] +fixture = ["fixtures"] +test = ["fixtures", "mock", "purl", "pytest", "requests-futures", "sphinx", "testtools"] + [[package]] name = "requests-oauthlib" version = "1.3.1" @@ -2252,6 +2336,24 @@ requests = ">=2.0.0" [package.extras] rsa = ["oauthlib[signedtoken] (>=3.0.0)"] +[[package]] +name = "rich" +version = "13.7.0" +description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "rich-13.7.0-py3-none-any.whl", hash = "sha256:6da14c108c4866ee9520bbffa71f6fe3962e193b7da68720583850cd4548e235"}, + {file = "rich-13.7.0.tar.gz", hash = "sha256:5cb5123b5cf9ee70584244246816e9114227e0b98ad9176eede6ad54bf5403fa"}, +] + 
+[package.dependencies] +markdown-it-py = ">=2.2.0" +pygments = ">=2.13.0,<3.0.0" + +[package.extras] +jupyter = ["ipywidgets (>=7.5.1,<9)"] + [[package]] name = "rsa" version = "4.9" @@ -2293,24 +2395,24 @@ python-versions = ">=3.6" files = [ {file = "ruamel.yaml.clib-0.2.8-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b42169467c42b692c19cf539c38d4602069d8c1505e97b86387fcf7afb766e1d"}, {file = "ruamel.yaml.clib-0.2.8-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:07238db9cbdf8fc1e9de2489a4f68474e70dffcb32232db7c08fa61ca0c7c462"}, - {file = "ruamel.yaml.clib-0.2.8-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:d92f81886165cb14d7b067ef37e142256f1c6a90a65cd156b063a43da1708cfd"}, {file = "ruamel.yaml.clib-0.2.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:fff3573c2db359f091e1589c3d7c5fc2f86f5bdb6f24252c2d8e539d4e45f412"}, + {file = "ruamel.yaml.clib-0.2.8-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:aa2267c6a303eb483de8d02db2871afb5c5fc15618d894300b88958f729ad74f"}, {file = "ruamel.yaml.clib-0.2.8-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:840f0c7f194986a63d2c2465ca63af8ccbbc90ab1c6001b1978f05119b5e7334"}, {file = "ruamel.yaml.clib-0.2.8-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:024cfe1fc7c7f4e1aff4a81e718109e13409767e4f871443cbff3dba3578203d"}, {file = "ruamel.yaml.clib-0.2.8-cp310-cp310-win32.whl", hash = "sha256:c69212f63169ec1cfc9bb44723bf2917cbbd8f6191a00ef3410f5a7fe300722d"}, {file = "ruamel.yaml.clib-0.2.8-cp310-cp310-win_amd64.whl", hash = "sha256:cabddb8d8ead485e255fe80429f833172b4cadf99274db39abc080e068cbcc31"}, {file = "ruamel.yaml.clib-0.2.8-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:bef08cd86169d9eafb3ccb0a39edb11d8e25f3dae2b28f5c52fd997521133069"}, {file = "ruamel.yaml.clib-0.2.8-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:b16420e621d26fdfa949a8b4b47ade8810c56002f5389970db4ddda51dbff248"}, - {file = 
"ruamel.yaml.clib-0.2.8-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:b5edda50e5e9e15e54a6a8a0070302b00c518a9d32accc2346ad6c984aacd279"}, {file = "ruamel.yaml.clib-0.2.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:25c515e350e5b739842fc3228d662413ef28f295791af5e5110b543cf0b57d9b"}, + {file = "ruamel.yaml.clib-0.2.8-cp311-cp311-manylinux_2_24_aarch64.whl", hash = "sha256:1707814f0d9791df063f8c19bb51b0d1278b8e9a2353abbb676c2f685dee6afe"}, {file = "ruamel.yaml.clib-0.2.8-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:46d378daaac94f454b3a0e3d8d78cafd78a026b1d71443f4966c696b48a6d899"}, {file = "ruamel.yaml.clib-0.2.8-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:09b055c05697b38ecacb7ac50bdab2240bfca1a0c4872b0fd309bb07dc9aa3a9"}, {file = "ruamel.yaml.clib-0.2.8-cp311-cp311-win32.whl", hash = "sha256:53a300ed9cea38cf5a2a9b069058137c2ca1ce658a874b79baceb8f892f915a7"}, {file = "ruamel.yaml.clib-0.2.8-cp311-cp311-win_amd64.whl", hash = "sha256:c2a72e9109ea74e511e29032f3b670835f8a59bbdc9ce692c5b4ed91ccf1eedb"}, {file = "ruamel.yaml.clib-0.2.8-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:ebc06178e8821efc9692ea7544aa5644217358490145629914d8020042c24aa1"}, {file = "ruamel.yaml.clib-0.2.8-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:edaef1c1200c4b4cb914583150dcaa3bc30e592e907c01117c08b13a07255ec2"}, - {file = "ruamel.yaml.clib-0.2.8-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:7048c338b6c86627afb27faecf418768acb6331fc24cfa56c93e8c9780f815fa"}, {file = "ruamel.yaml.clib-0.2.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d176b57452ab5b7028ac47e7b3cf644bcfdc8cacfecf7e71759f7f51a59e5c92"}, + {file = "ruamel.yaml.clib-0.2.8-cp312-cp312-manylinux_2_24_aarch64.whl", hash = "sha256:1dc67314e7e1086c9fdf2680b7b6c2be1c0d8e3a8279f2e993ca2a7545fecf62"}, {file = "ruamel.yaml.clib-0.2.8-cp312-cp312-musllinux_1_1_i686.whl", hash = 
"sha256:3213ece08ea033eb159ac52ae052a4899b56ecc124bb80020d9bbceeb50258e9"}, {file = "ruamel.yaml.clib-0.2.8-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:aab7fd643f71d7946f2ee58cc88c9b7bfc97debd71dcc93e03e2d174628e7e2d"}, {file = "ruamel.yaml.clib-0.2.8-cp312-cp312-win32.whl", hash = "sha256:5c365d91c88390c8d0a8545df0b5857172824b1c604e867161e6b3d59a827eaa"}, @@ -2318,7 +2420,7 @@ files = [ {file = "ruamel.yaml.clib-0.2.8-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a5aa27bad2bb83670b71683aae140a1f52b0857a2deff56ad3f6c13a017a26ed"}, {file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c58ecd827313af6864893e7af0a3bb85fd529f862b6adbefe14643947cfe2942"}, {file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-macosx_12_0_arm64.whl", hash = "sha256:f481f16baec5290e45aebdc2a5168ebc6d35189ae6fea7a58787613a25f6e875"}, - {file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:3fcc54cb0c8b811ff66082de1680b4b14cf8a81dce0d4fbf665c2265a81e07a1"}, + {file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-manylinux_2_24_aarch64.whl", hash = "sha256:77159f5d5b5c14f7c34073862a6b7d34944075d9f93e681638f6d753606c6ce6"}, {file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:7f67a1ee819dc4562d444bbafb135832b0b909f81cc90f7aa00260968c9ca1b3"}, {file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:4ecbf9c3e19f9562c7fdd462e8d18dd902a47ca046a2e64dba80699f0b6c09b7"}, {file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:87ea5ff66d8064301a154b3933ae406b0863402a799b16e4a1d24d9fbbcbe0d3"}, @@ -2326,7 +2428,7 @@ files = [ {file = "ruamel.yaml.clib-0.2.8-cp37-cp37m-win_amd64.whl", hash = "sha256:3f215c5daf6a9d7bbed4a0a4f760f3113b10e82ff4c5c44bec20a68c8014f675"}, {file = "ruamel.yaml.clib-0.2.8-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1b617618914cb00bf5c34d4357c37aa15183fa229b24767259657746c9077615"}, {file = 
"ruamel.yaml.clib-0.2.8-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:a6a9ffd280b71ad062eae53ac1659ad86a17f59a0fdc7699fd9be40525153337"}, - {file = "ruamel.yaml.clib-0.2.8-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:665f58bfd29b167039f714c6998178d27ccd83984084c286110ef26b230f259f"}, + {file = "ruamel.yaml.clib-0.2.8-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:305889baa4043a09e5b76f8e2a51d4ffba44259f6b4c72dec8ca56207d9c6fe1"}, {file = "ruamel.yaml.clib-0.2.8-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:700e4ebb569e59e16a976857c8798aee258dceac7c7d6b50cab63e080058df91"}, {file = "ruamel.yaml.clib-0.2.8-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:e2b4c44b60eadec492926a7270abb100ef9f72798e18743939bdbf037aab8c28"}, {file = "ruamel.yaml.clib-0.2.8-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:e79e5db08739731b0ce4850bed599235d601701d5694c36570a99a0c5ca41a9d"}, @@ -2334,7 +2436,7 @@ files = [ {file = "ruamel.yaml.clib-0.2.8-cp38-cp38-win_amd64.whl", hash = "sha256:56f4252222c067b4ce51ae12cbac231bce32aee1d33fbfc9d17e5b8d6966c312"}, {file = "ruamel.yaml.clib-0.2.8-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:03d1162b6d1df1caa3a4bd27aa51ce17c9afc2046c31b0ad60a0a96ec22f8001"}, {file = "ruamel.yaml.clib-0.2.8-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:bba64af9fa9cebe325a62fa398760f5c7206b215201b0ec825005f1b18b9bccf"}, - {file = "ruamel.yaml.clib-0.2.8-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:9eb5dee2772b0f704ca2e45b1713e4e5198c18f515b52743576d196348f374d3"}, + {file = "ruamel.yaml.clib-0.2.8-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:a1a45e0bb052edf6a1d3a93baef85319733a888363938e1fc9924cb00c8df24c"}, {file = "ruamel.yaml.clib-0.2.8-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:da09ad1c359a728e112d60116f626cc9f29730ff3e0e7db72b9a2dbc2e4beed5"}, {file = "ruamel.yaml.clib-0.2.8-cp39-cp39-musllinux_1_1_i686.whl", hash = 
"sha256:184565012b60405d93838167f425713180b949e9d8dd0bbc7b49f074407c5a8b"}, {file = "ruamel.yaml.clib-0.2.8-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a75879bacf2c987c003368cf14bed0ffe99e8e85acfa6c0bfffc21a090f16880"}, @@ -2995,4 +3097,4 @@ multidict = ">=4.0" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "3e4e51ed7309819903d851969d04f5c12c43f63aafa46b8a5983512a40d20bf7" +content-hash = "75996c59b50c2ad361524908ad9e6cd88a21b782e6d99adaa39fa4bf57ed70b4" diff --git a/pyproject.toml b/pyproject.toml index 6d4057a46..a26b98c83 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,7 @@ marian-tensorboard = "^0.2.1" sacrebleu="2.0.0" mtdata="0.3.2" requests="2.26.0" +humanize = "^4.9.0" [tool.poetry.group.tests.dependencies] sacrebleu="2.0.0" @@ -37,6 +38,8 @@ requests="2.26.0" pytest="7.4.3" # use the latest main, switch to PyPi when released opustrainer = {git = "https://github.com/hplt-project/OpusTrainer.git", rev="9133e1525c7ee37f53ea14ee6a180152bf7ea192"} +pytest-clarity = "^1.0.1" +requests-mock = "^1.11.0" [tool.black] extend-exclude= "/3rd_party" diff --git a/tests/test_find_corpus.py b/tests/test_find_corpus.py new file mode 100644 index 000000000..b6b650a79 --- /dev/null +++ b/tests/test_find_corpus.py @@ -0,0 +1,154 @@ +from textwrap import dedent + +import pytest + +from utils.find_corpus import main as find_corpus + +""" +Tests the `utils/find_corpus.py` script. +""" + + +@pytest.fixture +def mock_opus_data(requests_mock): + """ + Provide a simplistic response from opus, with only 2 entries. 
+ """ + requests_mock.get( + "https://opus.nlpl.eu/opusapi/?source=en&target=ca&preprocessing=moses&version=latest", + text="""{ + "corpora": [ + { + "alignment_pairs": 4605, + "corpus": "Books", + "documents": "", + "id": 31736, + "latest": "True", + "preprocessing": "moses", + "size": 328, + "source": "ca", + "source_tokens": 73463, + "target": "en", + "target_tokens": 68625, + "url": "https://object.pouta.csc.fi/OPUS-Books/v1/moses/ca-en.txt.zip", + "version": "v1" + }, + { + "alignment_pairs": 5802549, + "corpus": "CCAligned", + "documents": "", + "id": 32571, + "latest": "True", + "preprocessing": "moses", + "size": 522860, + "source": "ca", + "source_tokens": 89704109, + "target": "en", + "target_tokens": 84373417, + "url": "https://object.pouta.csc.fi/OPUS-CCAligned/v1/moses/ca-en.txt.zip", + "version": "v1" + } + ] + }""", + ) + + +def assert_stdout(capsys, message: str, expected_output: str): + """ + Asserts the output from stdout matches a certain string. + """ + captured = capsys.readouterr() + + def clean_text(text): + text = dedent(text).strip() + result = "" + for line in text.split("\n"): + result += line.strip() + "\n" + return result + + assert clean_text(captured.out) == clean_text(expected_output), message + + +def test_opus(mock_opus_data, capsys): + find_corpus(["en", "ca", "--importer", "opus"]) + assert_stdout( + capsys, + "The opus dataset outputs nicely.", + """ + Fetching datasets from: + https://opus.nlpl.eu/opusapi/?source=en&target=ca&preprocessing=moses&version=latest + + + ┌──────────────────────────────┐ + │ OPUS - https://opus.nlpl.eu/ │ + └──────────────────────────────┘ + + Dataset Code Sentences Size URL + ───────── ───────────────── ───────── ──────── ───────────────────────────────────── + CCAligned opus_CCAligned/v1 5802549 535.4 MB https://opus.nlpl.eu/CCAligned-v1.php + Books opus_Books/v1 4605 335.9 kB https://opus.nlpl.eu/Books-v1.php + + YAML: + - opus_Books/v1 + - opus_CCAligned/v1 + """, + ) + + +def 
test_opus_download_url(mock_opus_data, capsys): + """ + This checks that the download URLs are shown instead of the information URLs. + """ + find_corpus(["en", "ca", "--importer", "opus", "--download_url"]) + output = capsys.readouterr() + assert "https://object.pouta.csc.fi/OPUS-CCAligned/v1/moses/ca-en.txt.zip" in output.out + assert "https://object.pouta.csc.fi/OPUS-Books/v1/moses/ca-en.txt.zip" in output.out + + +# mtdata has some deprecated dependencies +@pytest.mark.filterwarnings("ignore::DeprecationWarning") +def test_mtdata(requests_mock, capsys): + find_corpus(["en", "ca", "--importer", "mtdata"]) + assert_stdout( + capsys, + "mtdata outputs nicely", + """ + ┌────────────────────────────────────────────────┐ + │ mtdata - https://github.com/thammegowda/mtdata │ + └────────────────────────────────────────────────┘ + + Dataset URL + ────────────────────────────────────── ─────────────────────────────────────────────────────────────────────────────────────────────────────────── + mtdata_ELRC-wikipedia_health-1-cat-eng https://elrc-share.eu/repository/download/ac6d557e8de811ea913100155d026706b0c5fee96b88489781ddd7675f8ea2ae/ + mtdata_Facebook-wikimatrix-1-cat-eng https://dl.fbaipublicfiles.com/laser/WikiMatrix/v1/WikiMatrix.ca-en.tsv.gz + mtdata_Statmt-ccaligned-1-cat_ES-eng http://www.statmt.org/cc-aligned/sentence-aligned/ca_ES-en_XX.tsv.xz + + YAML: + - mtdata_ELRC-wikipedia_health-1-cat-eng + - mtdata_Facebook-wikimatrix-1-cat-eng + - mtdata_Statmt-ccaligned-1-cat_ES-eng + """, + ) + + +def test_sacrebleu(requests_mock, capsys): + # "iu" is the Inuktitut language, which has a small dataset available. 
+ find_corpus(["en", "iu", "--importer", "sacrebleu"]) + assert_stdout( + capsys, + "sacrebleu outputs nicely", + """ + ┌─────────────────────────────────────────────────┐ + │ sacrebleu - https://github.com/mjpost/sacrebleu │ + └─────────────────────────────────────────────────┘ + + Dataset Description URLs + ───────── ─────────────────────────────────────── ────────────────────────────────────────────────────── + wmt20 Official evaluation data for WMT20 http://data.statmt.org/wmt20/translation-task/test.tgz + wmt20/dev Development data for tasks new to 2020. http://data.statmt.org/wmt20/translation-task/dev.tgz + + YAML: + - sacrebleu_wmt20 + - sacrebleu_wmt20/dev + """, + ) diff --git a/utils/find-corpus.py b/utils/find-corpus.py deleted file mode 100755 index d74f0d82f..000000000 --- a/utils/find-corpus.py +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env python3 -""" -Finds all opus datasets for a language pair and prints them to set config settings. - -Usage: - python find-corpus.py - -Params: - src - source language code - trg - target language code - importer - importer type (mtdata, opus, sacrebleu) - -""" - -import sys - -import requests - -source = sys.argv[1] -target = sys.argv[2] -type = sys.argv[3] - -# exclude = ['bible', 'Ubuntu', 'Gnome', 'KDE', 'Multi', 'OPUS100v'] -exclude = [] -names = [] - -if type == "opus": - exclude += ["OPUS100v", "WMT-News"] - datasets = requests.get( - f"https://opus.nlpl.eu/opusapi/?source={source}&target={target}&preprocessing=moses&version=latest" - ).json() - names = [f'opus_{d["corpus"]}/{d["version"]}' for d in datasets["corpora"]] -elif type == "sacrebleu": - import sacrebleu - - names = [ - f"sacrebleu_{name}" - for name, meta in sacrebleu.DATASETS.items() - if f"{source}-{target}" in meta or f"{target}-{source}" in meta - ] -elif type == "mtdata": - from mtdata.entry import lang_pair - from mtdata.index import get_entries - from mtdata.iso import iso3_code - - source_tricode = iso3_code(source, fail_error=True) - 
target_tricode = iso3_code(target, fail_error=True)
-    exclude += ["opus", "newstest", "UNv1"]
-    entries = sorted(
-        get_entries(lang_pair(source_tricode + "-" + target_tricode), None, None, True),
-        key=lambda entry: entry.did.group,
-    )
-    names = [
-        f"mtdata_{entry.did.group}-{entry.did.name}-{entry.did.version}-{entry.did.lang_str}"
-        for entry in entries
-    ]
-else:
-    print(f"Importer type {type} is unsupported. Supported importers: opus, mtdata, sacrebleu")
-
-cleaned = set()
-for name in names:
-    filter = False
-    for ex in exclude:
-        if ex.lower() in name.lower():
-            filter = True
-            break
-    if not filter:
-        cleaned.add(name)
-
-print("\n".join(sorted([f" - {name}" for name in cleaned])))
diff --git a/utils/find_corpus.py b/utils/find_corpus.py
new file mode 100755
index 000000000..6e4e9fc0d
--- /dev/null
+++ b/utils/find_corpus.py
@@ -0,0 +1,360 @@
+#!/usr/bin/env python3
+"""
+Finds datasets for a language pair and prints them to set config settings.
+
+The opus, sacrebleu and mtdata importers are supported.
+
+Usage:
+    poetry install --only utils
+    poetry run ./utils/find_corpus.py "en" "ca"
+    poetry run ./utils/find_corpus.py "en" "fr" --importer opus
+"""
+
+import argparse
+import logging
+import sys
+from typing import Any, Literal, NamedTuple, Optional, TypeVar
+
+import humanize
+import requests
+
+# Seconds to wait for the OPUS API before giving up instead of hanging forever.
+OPUS_REQUEST_TIMEOUT = 30
+
+
+class OpusDataset(NamedTuple):
+    """
+    One entry of the "corpora" list returned by the OPUS API.
+    """
+
+    # The name of this dataset, e.g. "CCAligned"
+    corpus: str
+    # This is a blank string at the time of this writing.
+    documents: str
+
+    # 'moses'
+    preprocessing: str
+    # The language tag.
+    source: str
+    # The language tag.
+    target: str
+    # The URL to the download
+    url: str
+    # For example "v1"
+    version: str
+
+    alignment_pairs: int
+    id: int
+    # Size in KiB
+    size: int
+    source_tokens: int
+    target_tokens: int
+
+    # The API sends this flag as a string rather than a JSON boolean.
+    latest: Literal["True", "False"]
+
+    def name(self) -> str:
+        """
+        The dataset code to use in the config, e.g. "opus_CCAligned/v1".
+        """
+        return f"opus_{self.corpus}/{self.version}"
+
+    def website_url(self) -> str:
+        """
+        The informational page for this corpus on the OPUS website.
+        """
+        return f"https://opus.nlpl.eu/{self.corpus}-{self.version}.php"
+
+    def humanize_size(self) -> str:
+        """
+        The size as a human readable string; `size` is in KiB, so convert to bytes.
+        """
+        return humanize.naturalsize(self.size * 1024)
+
+
+def get_opus(source: str, target: str, download_url: bool = False) -> None:
+    """
+    Print a table and a YAML list of the OPUS datasets for a language pair.
+
+    When download_url is set the table shows the download URL of each dataset
+    rather than the URL of its informational page.
+    """
+    # This API is documented: https://opus.nlpl.eu/opusapi/
+    url = f"https://opus.nlpl.eu/opusapi/?source={source}&target={target}&preprocessing=moses&version=latest"
+
+    print(f"Fetching datasets from:\n{url}\n")
+
+    response = requests.get(url, timeout=OPUS_REQUEST_TIMEOUT)
+    # Fail with a clear HTTP error rather than an obscure JSON decoding error.
+    response.raise_for_status()
+    datasets = response.json()
+
+    # Convert the response into typed objects that are sorted, largest first. Keys
+    # that OpusDataset does not declare are dropped, so that a new field in the API
+    # response does not break the script.
+    known_fields = set(OpusDataset._fields)
+    datasets_typed = [
+        OpusDataset(**{key: value for key, value in corpus.items() if key in known_fields})
+        for corpus in datasets.get("corpora", [])
+    ]
+    datasets_typed.sort(key=lambda dataset: dataset.alignment_pairs or 0, reverse=True)
+
+    print("")
+    print("┌──────────────────────────────┐")
+    print("│ OPUS - https://opus.nlpl.eu/ │")
+    print("└──────────────────────────────┘")
+
+    print_table(
+        [
+            [
+                "Dataset",
+                "Code",
+                "Sentences",
+                "Size",
+                "URL",
+            ],
+            *[
+                [
+                    dataset.corpus,
+                    dataset.name(),
+                    dataset.alignment_pairs,
+                    dataset.humanize_size(),
+                    dataset.url if download_url else dataset.website_url(),
+                ]
+                for dataset in datasets_typed
+            ],
+        ]
+    )
+
+    # Derive the names from the typed datasets, so that a response without any
+    # "corpora" is reported as "(no datasets)" instead of raising a KeyError.
+    names = [dataset.name() for dataset in datasets_typed]
+    print_yaml(names, exclude=["OPUS100v", "WMT-News"])
+
+
+def get_sacrebleu(source: str, target: str) -> None:
+    """
+    Print a table and a YAML list of the sacrebleu datasets for a language pair.
+    """
+    import sacrebleu
+
+    # A dataset matches when the pair, in either direction, is found in its entry.
+    entries = [
+        (name, entry)
+        for name, entry in sacrebleu.DATASETS.items()
+        if f"{source}-{target}" in entry or f"{target}-{source}" in entry
+    ]
+
+    names = [f"sacrebleu_{name}" for name, _ in entries]
+
+    print("")
+    print("┌─────────────────────────────────────────────────┐")
+    print("│ sacrebleu - https://github.com/mjpost/sacrebleu │")
+    print("└─────────────────────────────────────────────────┘")
+    print_table(
+        [
+            ["Dataset", "Description", "URLs"],
+            *[
+                [
+                    name,
+                    entry["description"],
+                    ", ".join(entry["data"]),
+                ]
+                for name, entry in entries
+            ],
+        ]
+    )
+    print_yaml(names)
+
+
+def get_remote_file_size(url: str) -> Optional[str]:
+    """
+    Get the human readable size of a remote file using a HEAD request.
+
+    Returns None, after printing the reason, when the request fails or is not
+    answered with a 200 status.
+    """
+    try:
+        response = requests.head(url, timeout=1)
+    except requests.exceptions.RequestException as e:
+        print(f"An error occurred: {e}")
+        return None
+
+    if response.status_code != 200:
+        print(f"Failed to retrieve file information. Status code: {response.status_code}")
+        return None
+
+    # A missing Content-Length header is reported as a size of 0.
+    return humanize.naturalsize(int(response.headers.get("Content-Length", 0)))
+
+
+T = TypeVar("T")
+
+
+def _is_excluded(name: str, excludes: list[str]) -> bool:
+    """
+    Check, ignoring case, whether a name contains any of the excluded terms.
+    """
+    lowered_name = name.lower()
+    return any(exclude.lower() in lowered_name for exclude in excludes)
+
+
+def exclude_by_name(excludes: list[str], names: list[str], entries: list[T]) -> list[T]:
+    """
+    Exclude entries by an excludes list, and a name list.
+
+    The names and entries are paired up by position, and an entry is dropped when
+    its name contains one of the excludes.
+    """
+    return [entry for name, entry in zip(names, entries) if not _is_excluded(name, excludes)]
+
+
+def get_mtdata(source: str, target: str) -> None:
+    """
+    Print a table and a YAML list of the mtdata datasets for a language pair.
+    """
+    # mtdata outputs debug logs. Silence logging while mtdata is in use, then put
+    # the previous state back rather than muting the whole process for good.
+    previous_disable_level = logging.root.manager.disable
+    logging.disable(logging.CRITICAL)
+    try:
+        from mtdata.entry import lang_pair
+        from mtdata.index import get_entries
+        from mtdata.iso import iso3_code
+
+        source_tricode = iso3_code(source, fail_error=True)
+        target_tricode = iso3_code(target, fail_error=True)
+        entries = sorted(
+            get_entries(lang_pair(source_tricode + "-" + target_tricode), None, None, True),
+            key=lambda entry: entry.did.group,
+        )
+    finally:
+        logging.disable(previous_disable_level)
+
+    excludes = ["opus", "newstest", "UNv1"]
+
+    def get_name(entry) -> str:
+        return (
+            f"mtdata_{entry.did.group}-{entry.did.name}-{entry.did.version}-{entry.did.lang_str}"
+        )
+
+    names = [get_name(entry) for entry in entries]
+
+    print("")
+    print("┌────────────────────────────────────────────────┐")
+    print("│ mtdata - https://github.com/thammegowda/mtdata │")
+    print("└────────────────────────────────────────────────┘")
+    print_table(
+        [
+            [
+                "Dataset",
+                "URL",
+                # "Size",
+            ],
+            *[
+                [
+                    get_name(entry),
+                    entry.url,
+                    # get_remote_file_size(entry.url),
+                ]
+                for entry in
+                # Filter out the excludes
+                exclude_by_name(excludes, names, entries)
+            ],
+        ]
+    )
+
+    print_yaml(names, exclude=excludes)
+
+
+def print_yaml(names: list[str], exclude: Optional[list[str]] = None) -> None:
+    """
+    Print the dataset names as a sorted YAML list without duplicates.
+
+    Names that contain any of the exclude terms, ignoring case, are left out.
+    """
+    cleaned = {name for name in names if not _is_excluded(name, exclude or [])}
+
+    print("\nYAML:")
+    if not cleaned:
+        print("(no datasets)\n")
+    else:
+        print("\n".join(sorted(f" - {name}" for name in cleaned)))
+
+
+def run(source: str, target: str, importer: Optional[str], download_url: bool = False) -> None:
+    """
+    Run the requested importer, or every importer when none is requested.
+    """
+    if importer == "opus" or not importer:
+        get_opus(source, target, download_url)
+
+    if importer == "sacrebleu" or not importer:
+        get_sacrebleu(source, target)
+
+    if importer == "mtdata" or not importer:
+        get_mtdata(source, target)
+
+
+def print_table(table: list[list[Any]]) -> None:
+    """
+    Nicely print a table, the first row is the header
+    """
+
+    # Compute the column lengths.
+    column_lengths = [max(len(str(datum)) for datum in column) for column in zip(*table)]
+
+    print("")
+    for index, row in enumerate(table):
+        # Print the row.
+        for datum, max_len in zip(row, column_lengths):
+            print(str(datum).ljust(max_len), end=" ")
+        print("")
+
+        # Print a separator between the header and the rest of the table.
+        if index == 0:
+            for length in column_lengths:
+                print("".ljust(length, "─"), end=" ")
+            print("")
+
+    if len(table) == 1:
+        print("(no datasets)")
+
+
+def main(args: Optional[list[str]] = None) -> None:
+    """
+    Parse the command line arguments and print the datasets that were found.
+
+    When args is None, argparse reads the arguments from sys.argv.
+    """
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawTextHelpFormatter,  # Preserves whitespace in the help text.
+    )
+    parser.add_argument("source", type=str, nargs="?", help="Source language code")
+    parser.add_argument("target", type=str, nargs="?", help="Target language code")
+    parser.add_argument(
+        "--importer", type=str, help="The importer to use: mtdata, opus, sacrebleu"
+    )
+    parser.add_argument(
+        "--download_url",
+        action="store_true",
+        default=False,
+        help="Show the download url if available.",
+    )
+
+    parsed_args = parser.parse_args(args)
+
+    if not parsed_args.source or not parsed_args.target:
+        parser.print_help()
+        sys.exit(1)
+
+    if parsed_args.importer and parsed_args.importer not in ["opus", "sacrebleu", "mtdata"]:
+        print(f'"{parsed_args.importer}" is not a valid importer.')
+        sys.exit(1)
+
+    run(parsed_args.source, parsed_args.target, parsed_args.importer, parsed_args.download_url)
+
+
+if __name__ == "__main__":
+    main()