Update the find corpus tool to provide more information (#280)

* Add pytest-clarity for better text diffs in tests
* Add requests_mock for tests
* Add the test_data artifact to the .gitignore
* Use an underscore with find_corpus.py
* Update the find corpus tool to provide more information
* Add humanize to the dependency list
mozilla · Dec 12, 2023 · d1be2bc · d1be2bc
1 parent ae78143
commit d1be2bc
Show file tree

Hide file tree

Showing 7 changed files with 589 additions and 90 deletions.
diff --git a/.gitignore b/.gitignore
@@ -136,3 +136,5 @@ dmypy.json
 .models
 .bin
 .snakemake
+
+tests_data
diff --git a/Makefile b/Makefile
@@ -164,8 +164,8 @@ fix-all:
 
 # Run unit tests
 run-tests:
-	poetry install --only tests
-	PYTHONPATH=$$(pwd) poetry run pytest tests
+	poetry install --only tests --only utils
+	PYTHONPATH=$$(pwd) poetry run pytest tests -vv
 
 # Validates Taskcluster task graph locally
 validate-taskgraph:

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -29,6 +29,7 @@ marian-tensorboard = "^0.2.1"
 sacrebleu="2.0.0"
 mtdata="0.3.2"
 requests="2.26.0"
+humanize = "^4.9.0"
 
 [tool.poetry.group.tests.dependencies]
 sacrebleu="2.0.0"
@@ -37,6 +38,8 @@ requests="2.26.0"
 pytest="7.4.3"
 # use the latest main, switch to PyPi when released
 opustrainer = {git = "https://github.com/hplt-project/OpusTrainer.git", rev="9133e1525c7ee37f53ea14ee6a180152bf7ea192"}
+pytest-clarity = "^1.0.1"
+requests-mock = "^1.11.0"
 
 [tool.black]
 extend-exclude= "/3rd_party"

diff --git a/tests/test_find_corpus.py b/tests/test_find_corpus.py
@@ -0,0 +1,154 @@
+from textwrap import dedent
+
+import pytest
+
+from utils.find_corpus import main as find_corpus
+
+"""
+Tests the `utils/find_corpus.py` script.
+"""
+
+
+@pytest.fixture
+def mock_opus_data(requests_mock):
+    """
+    Provide a simplistic response from opus, with only 2 entries.
+    """
+    requests_mock.get(
+        "https://opus.nlpl.eu/opusapi/?source=en&target=ca&preprocessing=moses&version=latest",
+        text="""{
+            "corpora": [
+                {
+                    "alignment_pairs": 4605,
+                    "corpus": "Books",
+                    "documents": "",
+                    "id": 31736,
+                    "latest": "True",
+                    "preprocessing": "moses",
+                    "size": 328,
+                    "source": "ca",
+                    "source_tokens": 73463,
+                    "target": "en",
+                    "target_tokens": 68625,
+                    "url": "https://object.pouta.csc.fi/OPUS-Books/v1/moses/ca-en.txt.zip",
+                    "version": "v1"
+                },
+                {
+                    "alignment_pairs": 5802549,
+                    "corpus": "CCAligned",
+                    "documents": "",
+                    "id": 32571,
+                    "latest": "True",
+                    "preprocessing": "moses",
+                    "size": 522860,
+                    "source": "ca",
+                    "source_tokens": 89704109,
+                    "target": "en",
+                    "target_tokens": 84373417,
+                    "url": "https://object.pouta.csc.fi/OPUS-CCAligned/v1/moses/ca-en.txt.zip",
+                    "version": "v1"
+                }
+            ]
+        }""",
+    )
+
+
+def assert_stdout(capsys, message: str, expected_output: str):
+    """
+    Asserts the output from stdout matches a certain string.
+    """
+    captured = capsys.readouterr()
+
+    def clean_text(text):
+        text = dedent(text).strip()
+        result = ""
+        for line in text.split("\n"):
+            result += line.strip() + "\n"
+        return result
+
+    assert clean_text(captured.out) == clean_text(expected_output), message
+
+
+def test_opus(mock_opus_data, capsys):
+    find_corpus(["en", "ca", "--importer", "opus"])
+    assert_stdout(
+        capsys,
+        "The opus dataset outputs nicely.",
+        """
+        Fetching datasets from:
+        https://opus.nlpl.eu/opusapi/?source=en&target=ca&preprocessing=moses&version=latest
+
+
+        ┌──────────────────────────────┐
+        │ OPUS - https://opus.nlpl.eu/ │
+        └──────────────────────────────┘
+
+        Dataset   Code              Sentences Size     URL
+        ───────── ───────────────── ───────── ──────── ─────────────────────────────────────
+        CCAligned opus_CCAligned/v1 5802549   535.4 MB https://opus.nlpl.eu/CCAligned-v1.php
+        Books     opus_Books/v1     4605      335.9 kB https://opus.nlpl.eu/Books-v1.php
+
+        YAML:
+            - opus_Books/v1
+            - opus_CCAligned/v1
+        """,
+    )
+
+
+def test_opus_download_url(mock_opus_data, capsys):
+    """
+    This checks that the download URLs are shown instead of the information URLs.
+    """
+    find_corpus(["en", "ca", "--importer", "opus", "--download_url"])
+    output = capsys.readouterr()
+    assert "https://object.pouta.csc.fi/OPUS-CCAligned/v1/moses/ca-en.txt.zip" in output.out
+    assert "https://object.pouta.csc.fi/OPUS-Books/v1/moses/ca-en.txt.zip" in output.out
+
+
+# mtdata has some deprecated dependencies
+@pytest.mark.filterwarnings("ignore::DeprecationWarning")
+def test_mtdata(requests_mock, capsys):
+    find_corpus(["en", "ca", "--importer", "mtdata"])
+    assert_stdout(
+        capsys,
+        "mtdata outputs nicely",
+        """
+        ┌────────────────────────────────────────────────┐
+        │ mtdata - https://github.com/thammegowda/mtdata │
+        └────────────────────────────────────────────────┘
+
+        Dataset                                URL
+        ────────────────────────────────────── ───────────────────────────────────────────────────────────────────────────────────────────────────────────
+        mtdata_ELRC-wikipedia_health-1-cat-eng https://elrc-share.eu/repository/download/ac6d557e8de811ea913100155d026706b0c5fee96b88489781ddd7675f8ea2ae/
+        mtdata_Facebook-wikimatrix-1-cat-eng   https://dl.fbaipublicfiles.com/laser/WikiMatrix/v1/WikiMatrix.ca-en.tsv.gz
+        mtdata_Statmt-ccaligned-1-cat_ES-eng   http://www.statmt.org/cc-aligned/sentence-aligned/ca_ES-en_XX.tsv.xz
+
+        YAML:
+            - mtdata_ELRC-wikipedia_health-1-cat-eng
+            - mtdata_Facebook-wikimatrix-1-cat-eng
+            - mtdata_Statmt-ccaligned-1-cat_ES-eng
+        """,
+    )
+
+
+def test_sacrebleu(requests_mock, capsys):
+    # "iu" is the Inuktitut language, which has a small dataset available.
+    find_corpus(["en", "iu", "--importer", "sacrebleu"])
+    assert_stdout(
+        capsys,
+        "sacrebleu outputs nicely",
+        """
+        ┌─────────────────────────────────────────────────┐
+        │ sacrebleu - https://github.com/mjpost/sacrebleu │
+        └─────────────────────────────────────────────────┘
+
+        Dataset   Description                             URLs
+        ───────── ─────────────────────────────────────── ──────────────────────────────────────────────────────
+        wmt20     Official evaluation data for WMT20      http://data.statmt.org/wmt20/translation-task/test.tgz
+        wmt20/dev Development data for tasks new to 2020. http://data.statmt.org/wmt20/translation-task/dev.tgz
+
+        YAML:
+            - sacrebleu_wmt20
+            - sacrebleu_wmt20/dev
+        """,
+    )
diff --git a/utils/find-corpus.py b/utils/find-corpus.py
-Original file line number
+Diff line change
@@ Expand Up / @@ -136,3 +136,5 @@ dmypy.json @@
     .models
     .bin
     .snakemake
+    tests_data