Skip to content

Commit

Permalink
Update the find corpus tool to provide more information (#280)
Browse files Browse the repository at this point in the history
* Add pytest-clarity for better text diffs in tests

* Add requests_mock for tests

* Add the tests_data artifact to the .gitignore

* Use an underscore with find_corpus.py

* Update the find corpus tool to provide more information

* Add humanize to the dependency list
  • Loading branch information
gregtatum authored Dec 12, 2023
1 parent ae78143 commit d1be2bc
Show file tree
Hide file tree
Showing 7 changed files with 589 additions and 90 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -136,3 +136,5 @@ dmypy.json
.models
.bin
.snakemake

tests_data
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,8 @@ fix-all:

# Run unit tests
run-tests:
poetry install --only tests
PYTHONPATH=$$(pwd) poetry run pytest tests
poetry install --only tests --only utils
PYTHONPATH=$$(pwd) poetry run pytest tests -vv

# Validates Taskcluster task graph locally
validate-taskgraph:
Expand Down
138 changes: 120 additions & 18 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ marian-tensorboard = "^0.2.1"
sacrebleu="2.0.0"
mtdata="0.3.2"
requests="2.26.0"
humanize = "^4.9.0"

[tool.poetry.group.tests.dependencies]
sacrebleu="2.0.0"
Expand All @@ -37,6 +38,8 @@ requests="2.26.0"
pytest="7.4.3"
# use the latest main, switch to PyPI when released
opustrainer = {git = "https://github.com/hplt-project/OpusTrainer.git", rev="9133e1525c7ee37f53ea14ee6a180152bf7ea192"}
pytest-clarity = "^1.0.1"
requests-mock = "^1.11.0"

[tool.black]
extend-exclude= "/3rd_party"
Expand Down
154 changes: 154 additions & 0 deletions tests/test_find_corpus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
from textwrap import dedent

import pytest

from utils.find_corpus import main as find_corpus

"""
Tests the `utils/find_corpus.py` script.
"""


@pytest.fixture
def mock_opus_data(requests_mock):
    """
    Register a canned OPUS API response for the en/ca query.

    The mocked GET returns a JSON document listing exactly two corpora
    ("Books" and "CCAligned").
    """
    opus_query_url = (
        "https://opus.nlpl.eu/opusapi/?source=en&target=ca&preprocessing=moses&version=latest"
    )
    # The payload is kept verbatim; only the "corpora" list with two entries is present.
    canned_payload = """{
"corpora": [
{
"alignment_pairs": 4605,
"corpus": "Books",
"documents": "",
"id": 31736,
"latest": "True",
"preprocessing": "moses",
"size": 328,
"source": "ca",
"source_tokens": 73463,
"target": "en",
"target_tokens": 68625,
"url": "https://object.pouta.csc.fi/OPUS-Books/v1/moses/ca-en.txt.zip",
"version": "v1"
},
{
"alignment_pairs": 5802549,
"corpus": "CCAligned",
"documents": "",
"id": 32571,
"latest": "True",
"preprocessing": "moses",
"size": 522860,
"source": "ca",
"source_tokens": 89704109,
"target": "en",
"target_tokens": 84373417,
"url": "https://object.pouta.csc.fi/OPUS-CCAligned/v1/moses/ca-en.txt.zip",
"version": "v1"
}
]
}"""
    requests_mock.get(opus_query_url, text=canned_payload)


def assert_stdout(capsys, message: str, expected_output: str):
    """
    Assert that the captured stdout equals `expected_output` after normalization.

    Both texts are dedented, trimmed of surrounding whitespace, and then every
    line is stripped individually and terminated with a newline, so indentation
    and trailing spaces never cause a mismatch. `message` is attached to the
    assertion and shown when the comparison fails.
    """
    actual_stdout = capsys.readouterr().out

    def normalize(raw):
        trimmed = dedent(raw).strip()
        return "".join(f"{row.strip()}\n" for row in trimmed.split("\n"))

    assert normalize(actual_stdout) == normalize(expected_output), message


def test_opus(mock_opus_data, capsys):
    """
    Run the opus importer for en/ca against the mocked API and compare stdout.
    """
    cli_args = ["en", "ca", "--importer", "opus"]
    # Leading whitespace is irrelevant here: assert_stdout strips every line.
    expected_report = """
Fetching datasets from:
https://opus.nlpl.eu/opusapi/?source=en&target=ca&preprocessing=moses&version=latest
┌──────────────────────────────┐
│ OPUS - https://opus.nlpl.eu/ │
└──────────────────────────────┘
Dataset Code Sentences Size URL
───────── ───────────────── ───────── ──────── ─────────────────────────────────────
CCAligned opus_CCAligned/v1 5802549 535.4 MB https://opus.nlpl.eu/CCAligned-v1.php
Books opus_Books/v1 4605 335.9 kB https://opus.nlpl.eu/Books-v1.php
YAML:
- opus_Books/v1
- opus_CCAligned/v1
"""

    find_corpus(cli_args)
    assert_stdout(capsys, "The opus dataset outputs nicely.", expected_report)


def test_opus_download_url(mock_opus_data, capsys):
    """
    With `--download_url`, the download URLs of both mocked corpora must appear
    in stdout (rather than the information URLs).
    """
    find_corpus(["en", "ca", "--importer", "opus", "--download_url"])
    printed = capsys.readouterr().out

    expected_download_urls = (
        "https://object.pouta.csc.fi/OPUS-CCAligned/v1/moses/ca-en.txt.zip",
        "https://object.pouta.csc.fi/OPUS-Books/v1/moses/ca-en.txt.zip",
    )
    for download_url in expected_download_urls:
        assert download_url in printed


# mtdata pulls in deprecated dependencies, so DeprecationWarning is silenced for this test.
@pytest.mark.filterwarnings("ignore::DeprecationWarning")
def test_mtdata(requests_mock, capsys):
    """
    Run the mtdata importer for en/ca and compare the printed table and YAML list.

    The `requests_mock` fixture is requested but no responses are registered on it
    in this test.
    """
    cli_args = ["en", "ca", "--importer", "mtdata"]
    # Leading whitespace is irrelevant here: assert_stdout strips every line.
    expected_report = """
┌────────────────────────────────────────────────┐
│ mtdata - https://github.com/thammegowda/mtdata │
└────────────────────────────────────────────────┘
Dataset URL
────────────────────────────────────── ───────────────────────────────────────────────────────────────────────────────────────────────────────────
mtdata_ELRC-wikipedia_health-1-cat-eng https://elrc-share.eu/repository/download/ac6d557e8de811ea913100155d026706b0c5fee96b88489781ddd7675f8ea2ae/
mtdata_Facebook-wikimatrix-1-cat-eng https://dl.fbaipublicfiles.com/laser/WikiMatrix/v1/WikiMatrix.ca-en.tsv.gz
mtdata_Statmt-ccaligned-1-cat_ES-eng http://www.statmt.org/cc-aligned/sentence-aligned/ca_ES-en_XX.tsv.xz
YAML:
- mtdata_ELRC-wikipedia_health-1-cat-eng
- mtdata_Facebook-wikimatrix-1-cat-eng
- mtdata_Statmt-ccaligned-1-cat_ES-eng
"""

    find_corpus(cli_args)
    assert_stdout(capsys, "mtdata outputs nicely", expected_report)


def test_sacrebleu(requests_mock, capsys):
    """
    Run the sacrebleu importer for en/iu and compare the printed table and YAML list.

    The `requests_mock` fixture is requested but no responses are registered on it
    in this test.
    """
    # "iu" is the Inuktitut language, which has a small dataset available.
    cli_args = ["en", "iu", "--importer", "sacrebleu"]
    # Leading whitespace is irrelevant here: assert_stdout strips every line.
    expected_report = """
┌─────────────────────────────────────────────────┐
│ sacrebleu - https://github.com/mjpost/sacrebleu │
└─────────────────────────────────────────────────┘
Dataset Description URLs
───────── ─────────────────────────────────────── ──────────────────────────────────────────────────────
wmt20 Official evaluation data for WMT20 http://data.statmt.org/wmt20/translation-task/test.tgz
wmt20/dev Development data for tasks new to 2020. http://data.statmt.org/wmt20/translation-task/dev.tgz
YAML:
- sacrebleu_wmt20
- sacrebleu_wmt20/dev
"""

    find_corpus(cli_args)
    assert_stdout(capsys, "sacrebleu outputs nicely", expected_report)
70 changes: 0 additions & 70 deletions utils/find-corpus.py

This file was deleted.

Loading

0 comments on commit d1be2bc

Please sign in to comment.