diff --git a/poetry.lock b/poetry.lock index 62154b4..aa8c0e8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -42,6 +42,25 @@ files = [ {file = "annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89"}, ] +[[package]] +name = "azure-core" +version = "1.31.0" +description = "Microsoft Azure Core Library for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "azure_core-1.31.0-py3-none-any.whl", hash = "sha256:22954de3777e0250029360ef31d80448ef1be13b80a459bff80ba7073379e2cd"}, + {file = "azure_core-1.31.0.tar.gz", hash = "sha256:656a0dd61e1869b1506b7c6a3b31d62f15984b1a573d6326f6aa2f3e4123284b"}, +] + +[package.dependencies] +requests = ">=2.21.0" +six = ">=1.11.0" +typing-extensions = ">=4.6.0" + +[package.extras] +aio = ["aiohttp (>=3.0)"] + [[package]] name = "beautifulsoup4" version = "4.12.3" @@ -2086,17 +2105,36 @@ transformers = ["spacy-huggingface-pipelines"] [[package]] name = "presidio-anonymizer" -version = "2.2.354" -description = "Persidio Anonymizer package - replaces analyzed text with desired values." +version = "2.2.355" +description = "Presidio Anonymizer package - replaces analyzed text with desired values." optional = false -python-versions = ">=3.5" +python-versions = "<4.0,>=3.8" files = [ - {file = "presidio_anonymizer-2.2.354-py3-none-any.whl", hash = "sha256:2b44bfedf376aa0c21f463581bede543a632c23ac6bc427a2e026c8e81ecfa67"}, + {file = "presidio_anonymizer-2.2.355-py3-none-any.whl", hash = "sha256:c85f5f155fcb66aff8e962fcf3984552a5512ab34bb1a433b1a52193e635c23f"}, ] [package.dependencies] +azure-core = "*" pycryptodome = ">=3.10.1" +[package.extras] +server = ["flask (>=1.1)"] + +[[package]] +name = "presidio-vault" +version = "0.1.0" +description = "A HashiCorp Vault operator that allows anonymization and de-anonymization using Microsoft Presidio." +optional = false +python-versions = "<4.0,>=3.8" +files = [ + {file = "presidio_vault-0.1.0-py3-none-any.whl", hash = "sha256:1ebcd8797ad93dd9dde1f00ccf4df8c8e10d7ba670c573c3c9421755b783c9a2"}, + {file = "presidio_vault-0.1.0.tar.gz", hash = "sha256:6eb2a974be4ee87fe04265937a1633a7d42574bdda56f85c5ce338a543531303"}, +] + +[package.dependencies] +hvac = ">=2.3.0,<3.0.0" +presidio-anonymizer = ">=2.2.355,<3.0.0" + [[package]] name = "protobuf" version = "5.27.0" @@ -2324,13 +2362,13 @@ files = [ [[package]] name = "pytest" -version = "8.2.1" +version = "8.3.3" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.8" files = [ - {file = "pytest-8.2.1-py3-none-any.whl", hash = "sha256:faccc5d332b8c3719f40283d0d44aa5cf101cec36f88cde9ed8f2bc0538612b1"}, - {file = "pytest-8.2.1.tar.gz", hash = "sha256:5046e5b46d8e4cac199c373041f26be56fdb81eb4e67dc11d4e10811fc3408fd"}, + {file = "pytest-8.3.3-py3-none-any.whl", hash = "sha256:a6853c7375b2663155079443d2e45de913a911a11d669df02a50814944db57b2"}, + {file = "pytest-8.3.3.tar.gz", hash = "sha256:70b98107bd648308a7952b06e6ca9a50bc660be218d53c257cc1fc94fda10181"}, ] [package.dependencies] @@ -2338,7 +2376,7 @@ colorama = {version = "*", markers = "sys_platform == \"win32\""} exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} iniconfig = "*" packaging = "*" -pluggy = ">=1.5,<2.0" +pluggy = ">=1.5,<2" tomli = {version = ">=1", markers = "python_version < \"3.11\""} [package.extras] @@ -3827,4 +3865,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.12" -content-hash = "54b51113397da4c81a5c3878e89ca5ed3abfe6d2be277f9edcdb139f7934ec28" +content-hash = "74b33179297552cef1304d8587cd11c40b7f59a43ea6971ca7ddf44b6e1f22af" diff --git a/pyproject.toml b/pyproject.toml index f6a6c62..f7f66a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,8 +15,8 @@ presidio-anonymizer = "^2.2.354" presidio-analyzer = {version = "^2.2.354", extras = ["transformers", "stanza"]} pytest = "^8.2.1" flask = "^3.0.3" -hvac = "^2.3.0" black = "^24.8.0" +presidio-vault = "^0.1.0" [build-system] diff --git a/src/app.py b/src/app.py index 3f54b73..aed6dcc 100644 --- a/src/app.py +++ b/src/app.py @@ -11,7 +11,7 @@ from presidio_analyzer import AnalyzerEngine, DictAnalyzerResult, RecognizerResult from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine from config.nlp_engine_config import FlairNLPEngine -from operators.vault import Vault +from presidio_vault.vault import Vault DEFAULT_PORT = "3000" NLP_ENGINE = "flair/ner-english-large" diff --git a/src/cli.py b/src/cli.py index 0d0e71e..d70e7ed 100644 --- a/src/cli.py +++ b/src/cli.py @@ -7,7 +7,7 @@ from analyzer_engine.csv_analyzer_engine import CSVAnalyzerEngine from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine from config.nlp_engine_config import FlairNLPEngine -from operators.vault import Vault +from presidio_vault.vault import Vault import sys import logging diff --git a/src/operators/__init__.py b/src/operators/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/operators/vault.py b/src/operators/vault.py deleted file mode 100644 index 38ee474..0000000 --- a/src/operators/vault.py +++ /dev/null @@ -1,140 +0,0 @@ -import base64 -from typing import Dict, List -from urllib.parse import urlparse - -import hvac -from presidio_anonymizer import ConflictResolutionStrategy, OperatorResult -from presidio_anonymizer.anonymizer_engine import AnonymizerEngine -from presidio_anonymizer.deanonymize_engine import DeanonymizeEngine -from presidio_anonymizer.entities import ( - OperatorConfig, - InvalidParamException, - RecognizerResult, -) -from presidio_anonymizer.operators import Operator, OperatorType - - -class VaultEncrypt(Operator): - def _base64ify(self, bytes_or_str): - if isinstance(bytes_or_str, str): - input_bytes = bytes_or_str.encode("utf8") - else: - input_bytes = bytes_or_str - - output_bytes = base64.urlsafe_b64encode(input_bytes) - return output_bytes.decode("ascii") - - def operate(self, text: str, params: Dict = None) -> str: - vault_url = params.get("vault_url") - key = params.get("key") - - client = hvac.Client(url=vault_url) - if params.get("vault_token"): - client.token = params.get("vault_token") - encrypt_data_response = client.secrets.transit.encrypt_data( - name=key, - plaintext=self._base64ify(text), - ) - ciphertext = encrypt_data_response["data"]["ciphertext"] - - return ciphertext - - def validate(self, params: Dict = None) -> None: - vault_url = params.get("vault_url") - if isinstance(vault_url, str): - result = urlparse(vault_url) - if result.scheme and result.netloc: - pass - else: - raise InvalidParamException( - f"Invalid input, vault_url must be a valid URL." - ) - else: - raise InvalidParamException(f"Invalid input, vault_url must be a string.") - - key = params.get("key") - if isinstance(key, str) and key: - pass - else: - raise InvalidParamException( - f"Invalid input, key must be a valid encryption key name." - ) - - def operator_name(self) -> str: - return "vault_encrypt" - - def operator_type(self) -> OperatorType: - return OperatorType.Anonymize - - -class VaultDecrypt(Operator): - def operate(self, text: str, params: Dict = None) -> str: - vault_url = params.get("vault_url") - key = params.get("key") - - client = hvac.Client(url=vault_url) - if params.get("vault_token"): - client.token = params.get("vault_token") - decrypt_data_response = client.secrets.transit.decrypt_data( - name=key, - ciphertext=text, - ) - encodedtext = decrypt_data_response["data"]["plaintext"] - plaintext = base64.b64decode(encodedtext).decode("utf8") - - return plaintext - - def validate(self, params: Dict = None) -> None: - vault_url = params.get("vault_url") - if isinstance(vault_url, str): - result = urlparse(vault_url) - if result.scheme and result.netloc: - pass - else: - raise InvalidParamException( - f"Invalid input, vault_url must be a valid URL." - ) - else: - raise InvalidParamException(f"Invalid input, vault_url must be a string.") - - key = params.get("key") - if isinstance(key, str) and key: - pass - else: - raise InvalidParamException( - f"Invalid input, key must be a valid encryption key name." - ) - - def operator_name(self) -> str: - return "vault_decrypt" - - def operator_type(self) -> OperatorType: - return OperatorType.Deanonymize - - -class Vault: - def __init__(self, vault_url: str, vault_key: str, vault_token: str = None) -> None: - self.vault_config = { - "vault_url": vault_url, - "key": vault_key, - "vault_token": vault_token, - } - - def anonymize( - self, - text: str, - analyzer_results: List[RecognizerResult], - conflict_resolution: ConflictResolutionStrategy = None, - ): - anonymizer = AnonymizerEngine() - anonymizer.add_anonymizer(VaultEncrypt) - operators = {"DEFAULT": OperatorConfig("vault_encrypt", self.vault_config)} - return anonymizer.anonymize( - text, analyzer_results, operators, conflict_resolution - ) - - def deanonymize(self, text: str, anonymizer_result_items: List[OperatorResult]): - deanonymizer = DeanonymizeEngine() - deanonymizer.add_deanonymizer(VaultDecrypt) - operators = {"DEFAULT": OperatorConfig("vault_decrypt", self.vault_config)} - return deanonymizer.deanonymize(text, anonymizer_result_items, operators) diff --git a/tests/operators/vault_test.py b/tests/operators/vault_test.py deleted file mode 100644 index bc1a236..0000000 --- a/tests/operators/vault_test.py +++ /dev/null @@ -1,146 +0,0 @@ -import pytest -from unittest import mock - -from presidio_anonymizer.entities import InvalidParamException -import hvac -from operators.vault import VaultEncrypt, VaultDecrypt - - -class TestVaultEncrypt: - def test_given_valid_key_raises_no_exceptions(self): - VaultEncrypt().validate( - params={"vault_url": "http://127.0.0.1:8200", "key": "foobar"} - ) - - def test_given_invalid_key_raises_exceptions(self): - with pytest.raises( - InvalidParamException, - match="Invalid input, key must be a valid encryption key name.", - ): - VaultEncrypt().validate( - params={"vault_url": "http://127.0.0.1:8200", "key": 1} - ) - - def test_given_valid_url_raises_no_exceptions(self): - VaultEncrypt().validate( - params={"vault_url": "http://127.0.0.1:8200", "key": "foobar"} - ) - - def test_given_invalid_url_raises_exceptions(self): - with pytest.raises( - InvalidParamException, - match="Invalid input, vault_url must be a valid URL.", - ): - VaultEncrypt().validate( - params={"vault_url": "http:/127.0.0.1:8200", "key": "foobar"} - ) - - def test_vault_encrypt_and_result_is_returned(self): - expected_vault_url = "http://127.0.0.1:8200" - expected_vault_key = "key" - expected_text = "text" - expected_anonymized_text = "encrypted_text" - with mock.patch.object(hvac, "Client"): - expected_anonymized_text = "encrypted_text" - fake_client = mock.MagicMock() - fake_client.secrets.transit.encrypt_data.return_value = { - "data": {"ciphertext": expected_anonymized_text} - } - hvac.Client.return_value = fake_client - - anonymized_text = VaultEncrypt().operate( - text=expected_text, - params={"vault_url": expected_vault_url, "key": expected_vault_key}, - ) - - assert anonymized_text == expected_anonymized_text - hvac.Client.assert_called_once_with(url=expected_vault_url) - - def test_vault_token_when_supplied_is_used(self): - expected_vault_token = "secret-123" - with mock.patch.object(hvac, "Client"): - fake_client = mock.MagicMock() - hvac.Client.return_value = fake_client - - VaultEncrypt().operate( - text="", params={"vault_token": expected_vault_token} - ) - - assert fake_client.token == expected_vault_token - - -class TestVaultDecrypt: - def test_given_valid_key_raises_no_exceptions(self): - VaultDecrypt().validate( - params={"vault_url": "http://127.0.0.1:8200", "key": "foobar"} - ) - - def test_given_invalid_key_raises_exceptions(self): - with pytest.raises( - InvalidParamException, - match="Invalid input, key must be a valid encryption key name.", - ): - VaultDecrypt().validate( - params={"vault_url": "http://127.0.0.1:8200", "key": 1} - ) - - def test_given_valid_url_raises_no_exceptions(self): - VaultDecrypt().validate( - params={"vault_url": "http://127.0.0.1:8200", "key": "foobar"} - ) - - def test_given_invalid_url_raises_exceptions(self): - with pytest.raises( - InvalidParamException, - match="Invalid input, vault_url must be a valid URL.", - ): - VaultDecrypt().validate( - params={"vault_url": "http:/127.0.0.1:8200", "key": "foobar"} - ) - - def test_vault_decrypt_and_result_is_returned(self): - expected_vault_url = "http://127.0.0.1:8200" - expected_vault_key = "key" - expected_deanonymized_text = "text" - with mock.patch.object(hvac, "Client"): - import base64 - - fake_client = mock.MagicMock() - fake_client.secrets.transit.decrypt_data.return_value = { - "data": { - "plaintext": base64.urlsafe_b64encode( - expected_deanonymized_text.encode("utf8") - ) - } - } - hvac.Client.return_value = fake_client - - deanonymized_text = VaultDecrypt().operate( - text="encrypted_text", - params={"vault_url": expected_vault_url, "key": expected_vault_key}, - ) - - assert deanonymized_text == expected_deanonymized_text - hvac.Client.assert_called_once_with(url=expected_vault_url) - - def test_vault_token_when_supplied_is_used(self): - expected_vault_token = "secret-123" - expected_deanonymized_text = "text" - with mock.patch.object(hvac, "Client"): - import base64 - - fake_client = mock.MagicMock() - fake_client.secrets.transit.decrypt_data.return_value = { - "data": { - "plaintext": base64.urlsafe_b64encode( - expected_deanonymized_text.encode("utf8") - ) - } - } - hvac.Client.return_value = fake_client - - VaultDecrypt().operate( - text="encrypted_text", params={"vault_token": expected_vault_token} - ) - - assert fake_client.token == expected_vault_token