From 9c1a6f5d7d5f1efb3f34add0a6c31227ef801943 Mon Sep 17 00:00:00 2001 From: Akshay Karle <1443108+akshaykarle@users.noreply.github.com> Date: Thu, 30 May 2024 10:20:43 +0100 Subject: [PATCH 1/2] migrate to pytest --- .github/workflows/test.yaml | 2 +- poetry.lock | 75 ++++++++++++++++++- pyproject.toml | 7 ++ .../analyzer_engine}/__init__.py | 0 .../analyzer_engine}/csv_analyzer_engine.py | 0 {config => src/config}/__init__.py | 0 {config => src/config}/nlp_engine_config.py | 0 {recognizer => src/recognizer}/__init__.py | 0 .../recognizer}/flair_recognizer.py | 0 tests/__init__.py | 0 tests/analyzer_engine/__init__.py | 0 .../csv_analyzer_engine_test.py | 25 +++---- tests/recognizer/__init__.py | 0 tests/recognizer/flair_recognizer_test.py | 27 +++---- 14 files changed, 103 insertions(+), 33 deletions(-) rename {analyzer_engine => src/analyzer_engine}/__init__.py (100%) rename {analyzer_engine => src/analyzer_engine}/csv_analyzer_engine.py (100%) rename {config => src/config}/__init__.py (100%) rename {config => src/config}/nlp_engine_config.py (100%) rename {recognizer => src/recognizer}/__init__.py (100%) rename {recognizer => src/recognizer}/flair_recognizer.py (100%) delete mode 100644 tests/__init__.py delete mode 100644 tests/analyzer_engine/__init__.py delete mode 100644 tests/recognizer/__init__.py diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index ef4559a..d74f636 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -28,5 +28,5 @@ jobs: shell: bash - name: "Test" - run: 'shopt -s globstar && poetry run python -m unittest tests/**/*.py' + run: 'poetry run pytest' shell: bash diff --git a/poetry.lock b/poetry.lock index 0238185..fdb4104 100644 --- a/poetry.lock +++ b/poetry.lock @@ -507,6 +507,20 @@ files = [ [package.extras] dev = ["coverage", "coveralls", "pytest"] +[[package]] +name = "exceptiongroup" +version = "1.2.1" +description = "Backport of PEP 654 (exception groups)" +optional = false +python-versions = ">=3.7" +files = [ + {file = "exceptiongroup-1.2.1-py3-none-any.whl", hash = "sha256:5258b9ed329c5bbdd31a309f53cbfb0b155341807f6ff7606a1e801a891b29ad"}, + {file = "exceptiongroup-1.2.1.tar.gz", hash = "sha256:a4785e48b045528f5bfe627b6ad554ff32def154f42372786903b7abcfe1aa16"}, +] + +[package.extras] +test = ["pytest (>=6)"] + [[package]] name = "filelock" version = "3.13.4" @@ -785,6 +799,17 @@ files = [ {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"}, ] +[[package]] +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.7" +files = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] + [[package]] name = "janome" version = "0.5.0" @@ -1661,6 +1686,21 @@ tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "pa typing = ["typing-extensions"] xmp = ["defusedxml"] +[[package]] +name = "pluggy" +version = "1.5.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, + {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, +] + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + [[package]] name = "pptree" version = "3.1" @@ -1980,6 +2020,28 @@ files = [ {file = "PySocks-1.7.1.tar.gz", hash = "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0"}, ] +[[package]] +name = "pytest" +version = "8.2.1" +description = "pytest: simple powerful testing with Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pytest-8.2.1-py3-none-any.whl", hash = "sha256:faccc5d332b8c3719f40283d0d44aa5cf101cec36f88cde9ed8f2bc0538612b1"}, + {file = "pytest-8.2.1.tar.gz", hash = "sha256:5046e5b46d8e4cac199c373041f26be56fdb81eb4e67dc11d4e10811fc3408fd"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=1.5,<2.0" +tomli = {version = ">=1", markers = "python_version < \"3.11\""} + +[package.extras] +dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] + [[package]] name = "python-dateutil" version = "2.9.0.post0" @@ -3070,6 +3132,17 @@ dev = ["tokenizers[testing]"] docs = ["setuptools_rust", "sphinx", "sphinx_rtd_theme"] testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests"] +[[package]] +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] + [[package]] name = "torch" version = "2.2.2" @@ -3450,4 +3523,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.12" -content-hash = "cf3e5185e4c9cadbc2a3f4474254bd79904b50247a447ab29d4049363143b45a" +content-hash = "3a9d4de2fa2f4472d0e59b8e541562a39b95611cd70172c3bda32413f5fdbbe1" diff --git a/pyproject.toml b/pyproject.toml index 54e78cc..ff611de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,8 +13,15 @@ spacy = "^3.7.4" scipy = "<1.13.0" presidio-anonymizer = "^2.2.354" presidio-analyzer = {version = "^2.2.354", extras = ["transformers", "stanza"]} +pytest = "^8.2.1" [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" + +[tool.pytest.ini_options] +pythonpath = "src" +addopts = [ + "--import-mode=importlib", +] \ No newline at end of file diff --git a/analyzer_engine/__init__.py b/src/analyzer_engine/__init__.py similarity index 100% rename from analyzer_engine/__init__.py rename to src/analyzer_engine/__init__.py diff --git a/analyzer_engine/csv_analyzer_engine.py b/src/analyzer_engine/csv_analyzer_engine.py similarity index 100% rename from analyzer_engine/csv_analyzer_engine.py rename to src/analyzer_engine/csv_analyzer_engine.py diff --git a/config/__init__.py b/src/config/__init__.py similarity index 100% rename from config/__init__.py rename to src/config/__init__.py diff --git a/config/nlp_engine_config.py b/src/config/nlp_engine_config.py similarity index 100% rename from config/nlp_engine_config.py rename to src/config/nlp_engine_config.py diff --git a/recognizer/__init__.py b/src/recognizer/__init__.py similarity index 100% rename from recognizer/__init__.py rename to src/recognizer/__init__.py diff --git a/recognizer/flair_recognizer.py b/src/recognizer/flair_recognizer.py similarity index 100% rename from recognizer/flair_recognizer.py rename to src/recognizer/flair_recognizer.py diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/analyzer_engine/__init__.py b/tests/analyzer_engine/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/analyzer_engine/csv_analyzer_engine_test.py b/tests/analyzer_engine/csv_analyzer_engine_test.py index 84db309..e45ef83 100644 --- a/tests/analyzer_engine/csv_analyzer_engine_test.py +++ b/tests/analyzer_engine/csv_analyzer_engine_test.py @@ -1,22 +1,15 @@ -import unittest +import pytest from analyzer_engine.csv_analyzer_engine import CSVAnalyzerEngine from config.nlp_engine_config import FlairNLPEngine -class CSVAnalayzerEngineTest(unittest.TestCase): - def setUp(self) -> None: - nlp_engine = FlairNLPEngine("flair/ner-english-large") - self.csv_analyser = CSVAnalyzerEngine(nlp_engine) +def test_csv_analyzer_engine_anonymizer(): + nlp_engine = FlairNLPEngine("flair/ner-english-large") + csv_analyzer = CSVAnalyzerEngine(nlp_engine) + from presidio_anonymizer import BatchAnonymizerEngine + analyzer_results = csv_analyzer.analyze_csv('./sample_data.csv', language="en") - def test_csv_analyzer_engine_anonymizer(self): - import pprint - from presidio_anonymizer import BatchAnonymizerEngine - analyzer_results = self.csv_analyser.analyze_csv('./sample_data.csv', language="en") - - pprint.pprint(analyzer_results) - - anonymizer = BatchAnonymizerEngine() - anonymized_results = anonymizer.anonymize_dict(analyzer_results) - pprint.pprint(anonymized_results) - self.assertIsNotNone(anonymized_results) \ No newline at end of file + anonymizer = BatchAnonymizerEngine() + anonymized_results = anonymizer.anonymize_dict(analyzer_results) + assert anonymized_results diff --git a/tests/recognizer/__init__.py b/tests/recognizer/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/recognizer/flair_recognizer_test.py b/tests/recognizer/flair_recognizer_test.py index 032b668..56cff4b 100644 --- a/tests/recognizer/flair_recognizer_test.py +++ b/tests/recognizer/flair_recognizer_test.py @@ -1,20 +1,17 @@ -import unittest +import pytest from recognizer.flair_recognizer import FlairRecognizer -class TestFlairRecognizer(unittest.TestCase): - def setUp(self) -> None: - self.recognizer = FlairRecognizer(model_path="flair/ner-english-large") - - def test_flair_recognizer_analyse(self): - test_data = "Sowmya is working in Berkley bank as an accountant since 2021" - result = self.recognizer.analyze(test_data) - self.assertGreater(len(result), 0) - - def test_flair_recognizes_persons_correctly(self): - test_data = "Sowmya is a person name" - self.assertGreater(len(self.recognizer.analyze(test_data)), 0) - test_data = "XXXXXX is a valid name?" - self.assertEquals(len(self.recognizer.analyze(test_data)), 0) +def test_flair_recognizer_analyze(): + recognizer = FlairRecognizer(model_path="flair/ner-english-large") + test_data = "Sowmya is working in Berkley bank as an accountant since 2021" + result = recognizer.analyze(test_data) + assert len(result) > 0 +def test_flair_recognizes_persons_correctly(): + recognizer = FlairRecognizer(model_path="flair/ner-english-large") + test_data = "Sowmya is a person name" + assert len(recognizer.analyze(test_data)) > 0 + test_data = "XXXXXX is a valid name?" + assert len(recognizer.analyze(test_data)) == 0 From f42a4308de981713bf310576f086a3c38b98bd40 Mon Sep 17 00:00:00 2001 From: l-r-sowmya <137146627+l-r-sowmya@users.noreply.github.com> Date: Mon, 3 Jun 2024 10:31:56 +0530 Subject: [PATCH 2/2] adding tesseract in flake.nix --- flake.nix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flake.nix b/flake.nix index dccecb1..b925431 100644 --- a/flake.nix +++ b/flake.nix @@ -11,7 +11,7 @@ outputs = { self, nixpkgs, flake-utils, poetry2nix }: flake-utils.lib.eachDefaultSystem (system: let - nativeBuildInputs = with pkgs; [ stdenv python3 poetry ]; + nativeBuildInputs = with pkgs; [ stdenv python3 poetry tesseract ]; buildInputs = with pkgs; [ ]; # see https://github.com/nix-community/poetry2nix/tree/master#api for more functions and examples.