Skip to content

Commit

Permalink
Merge pull request #9 from sahajsoft/api_anonymize
Browse files Browse the repository at this point in the history
add support for an API to analyze and anonymize files
  • Loading branch information
l-r-sowmya authored Jun 5, 2024
2 parents f1ece7d + 6b9d88a commit 17f7939
Show file tree
Hide file tree
Showing 8 changed files with 1,110 additions and 700 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -159,4 +159,5 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

.direnv/
.direnv/
file_uploads/
1,598 changes: 900 additions & 698 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ scipy = "<1.13.0"
presidio-anonymizer = "^2.2.354"
presidio-analyzer = {version = "^2.2.354", extras = ["transformers", "stanza"]}
pytest = "^8.2.1"
flask = "^3.0.3"


[build-system]
Expand Down
127 changes: 127 additions & 0 deletions src/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
import csv
import io
import json
import logging
import os
import uuid
from typing import Tuple

from flask import Flask, request, jsonify, Response, send_file

from analyzer_engine.csv_analyzer_engine import CSVAnalyzerEngine
from config.nlp_engine_config import FlairNLPEngine
from presidio_analyzer import DictAnalyzerResult, RecognizerResult
from presidio_anonymizer import BatchAnonymizerEngine

DEFAULT_PORT = "3000"
NLP_ENGINE = "flair/ner-english-large"
UPLOAD_DIR = "file_uploads"

class Server:
    """HTTP server exposing Presidio-based PII analysis and anonymization.

    Routes:
        GET  /health    — liveness probe.
        POST /analyze   — multipart CSV upload + ``language`` form field;
                          returns per-column recognizer results as JSON.
        POST /anonymize — ``analyzer_results`` form field (JSON produced by
                          /analyze); returns the anonymized CSV as an
                          attachment download.
    """

    def __init__(self):
        self.logger = logging.getLogger("pii-detection-anonymizer")
        self.logger.setLevel(os.environ.get("LOG_LEVEL", self.logger.level))
        self.app = Flask(__name__)
        self.logger.info("Starting analyzer engine")
        nlp_engine = FlairNLPEngine(NLP_ENGINE)
        self.engine = CSVAnalyzerEngine(nlp_engine)
        self.logger.info("Started analyzer engine")
        # exist_ok avoids a race between the existence check and creation.
        os.makedirs(UPLOAD_DIR, exist_ok=True)

        @self.app.route("/health")
        def health() -> str:
            """Return basic health probe result."""
            return "PII detection and anonymizer service is up"

        @self.app.route("/analyze", methods=["POST"])
        def analyze() -> Tuple[str, int]:
            """Analyze an uploaded CSV for PII.

            Expects a multipart ``file`` and a ``language`` form field.
            Returns {column_key: {"value": [...], "recognizer_results": [...]}}
            with 200, a 400 for an empty filename, or a 500 on any failure
            (including a missing ``file``/``language`` field, via KeyError).
            """
            try:
                file = request.files['file']
                language = request.form['language']
                if file.filename == '':
                    return jsonify({'error': 'No selected file'}), 400

                # Persist under a random name so concurrent uploads never collide.
                filepath = f'{UPLOAD_DIR}/{uuid.uuid4()}'
                file.save(filepath)
                self.logger.info(f"Successfully saved file: {filepath}")

                try:
                    analyzer_results = self.engine.analyze_csv(
                        csv_full_path=filepath,
                        language=language
                    )
                finally:
                    # Remove the temp upload even when analysis fails,
                    # otherwise failed requests leak files in UPLOAD_DIR.
                    os.remove(filepath)
                    self.logger.info(f"Successfully removed file: {filepath}")
                self.logger.debug(f"Analyzed file with results: {analyzer_results}")

                # Flatten DictAnalyzerResult objects into JSON-serializable dicts.
                analyzer_results_list = {}
                for result in analyzer_results:
                    recognizer_results = [
                        [o.to_dict() for o in row_results]
                        for row_results in result.recognizer_results
                    ]
                    analyzer_results_list[result.key] = {
                        "value": result.value,
                        "recognizer_results": recognizer_results
                    }

                return jsonify(analyzer_results_list), 200
            except Exception as e:
                self.logger.error(
                    f"A fatal error occurred during execution of "
                    f"AnalyzerEngine.analyze(). {e}"
                )
                # str(e) is always safe; e.args[0] raises IndexError for
                # exceptions constructed without arguments.
                return jsonify(error=str(e)), 500

        @self.app.route("/anonymize", methods=["POST"])
        def anonymize() -> Response:
            """Anonymize values using previously computed analyzer results.

            Expects an ``analyzer_results`` form field holding the JSON
            returned by /analyze. Streams back the anonymized data as a CSV
            attachment, or returns a JSON error with 500.
            """
            try:
                analyzer_results = json.loads(request.form['analyzer_results'])

                # Rebuild the DictAnalyzerResult structure from its JSON form.
                dict_analyzer_results = []
                for key, value in analyzer_results.items():
                    recognizer_results = [
                        [
                            RecognizerResult(r["entity_type"],
                                             r["start"],
                                             r["end"],
                                             r["score"])
                            for r in results_for_each_entry
                        ]
                        for results_for_each_entry in value["recognizer_results"]
                    ]
                    dict_analyzer_results.append(
                        DictAnalyzerResult(key=key,
                                           value=value["value"],
                                           recognizer_results=recognizer_results))

                anonymizer = BatchAnonymizerEngine()
                anonymized_results = anonymizer.anonymize_dict(dict_analyzer_results)

                # zip() transposes columns into rows and is safe for empty
                # input (the previous index-based loop raised IndexError).
                keys = list(anonymized_results.keys())
                data = [
                    dict(zip(keys, row_values))
                    for row_values in zip(*(anonymized_results[k] for k in keys))
                ]

                # Build the CSV in memory instead of writing to UPLOAD_DIR:
                # the previous on-disk file was never deleted, so every
                # request leaked a file.
                buffer = io.StringIO()
                writer = csv.DictWriter(buffer, fieldnames=keys)
                writer.writeheader()
                writer.writerows(data)

                return Response(
                    buffer.getvalue(),
                    mimetype='text/csv',
                    headers={
                        'Content-Disposition':
                            'attachment; filename=anonymized_data.csv'
                    }
                )
            except Exception as e:
                self.logger.error(
                    f"A fatal error occurred during execution of "
                    f"AnonymizerEngine.anonymize(). {e}"
                )
                return jsonify(error=str(e)), 500

if __name__ == "__main__":
    # Entry point: PORT env var overrides the default listening port.
    listen_port = int(os.environ.get("PORT", DEFAULT_PORT))
    Server().app.run(host="0.0.0.0", port=listen_port)
2 changes: 1 addition & 1 deletion tests/analyzer_engine/csv_analyzer_engine_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def test_csv_analyzer_engine_anonymizer():
nlp_engine = FlairNLPEngine("flair/ner-english-large")
csv_analyzer = CSVAnalyzerEngine(nlp_engine)
from presidio_anonymizer import BatchAnonymizerEngine
analyzer_results = csv_analyzer.analyze_csv('./sample_data.csv', language="en")
analyzer_results = csv_analyzer.analyze_csv('./tests/sample_data/sample_data.csv', language="en")

anonymizer = BatchAnonymizerEngine()
anonymized_results = anonymizer.anonymize_dict(analyzer_results)
Expand Down
76 changes: 76 additions & 0 deletions tests/app_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import pytest
import json

from app import Server

@pytest.fixture()
def app():
    """Yield the Flask application configured for testing."""
    flask_app = Server().app
    flask_app.config.update({"TESTING": True})

    yield flask_app


@pytest.fixture()
def client(app):
    """Provide a test client bound to the ``app`` fixture."""
    test_client = app.test_client()
    return test_client


def test_health(client):
    """The health endpoint answers with HTTP 200."""
    resp = client.get("/health")
    assert resp.status_code == 200

def test_analyze_non_existent(client):
    """Posting to /analyze without a file yields a server error (500)."""
    resp = client.post("/analyze", data={"language": "en"})

    assert resp.status_code == 500


def test_analyze_invalid_csv(client):
    """Uploading a non-CSV file makes /analyze fail with 500.

    The ``with`` block closes the upload handle deterministically; the
    previous bare ``open`` leaked the file descriptor.
    """
    # NOTE(review): no "language" form field is sent, so the endpoint may
    # fail on the missing key before ever parsing the file — confirm intent.
    with open('./tests/sample_data/invalid.csv', 'rb') as upload:
        response = client.post("/analyze", data={
            "file": upload,
        })

    assert response.status_code == 500


def test_analyze_pii_csv(client):
    """/analyze returns per-column recognizer results for a PII-laden CSV.

    Opens the sample file in a ``with`` block so the handle is closed even
    when an assertion fails (the previous bare ``open`` leaked it).
    """
    expected_response_id = {'value': ['1', '2', '3'], 'recognizer_results': [[], [], []]}

    with open('./tests/sample_data/sample_data.csv', 'rb') as upload:
        response = client.post("/analyze", data={
            "file": upload,
            "language": "en",
        })

    assert response.status_code == 200
    data = json.loads(response.get_data(as_text=True))
    # No PII in id
    assert data['id'] == expected_response_id
    # first row has no PII
    assert data['comments']['recognizer_results'][0] == []
    # second row has PII
    assert data['comments']['recognizer_results'][1][0]['entity_type'] == 'US_DRIVER_LICENSE'
    assert data['comments']['recognizer_results'][1][0]['start'] == 34
    assert data['comments']['recognizer_results'][1][0]['end'] == 42

def test_anonymize_csv_pii(client):
    """Round-trip: /analyze output feeds /anonymize, which returns a CSV.

    Both uploads use ``with`` blocks so the file handles are closed
    deterministically (the previous bare ``open`` calls leaked them).
    """
    with open('./tests/sample_data/sample_data.csv', 'rb') as upload:
        analyze_response = client.post("/analyze", data={
            "file": upload,
            "language": "en",
        })

    assert analyze_response.status_code == 200
    analyzer_results = analyze_response.get_data(as_text=True)

    with open('./tests/sample_data/sample_data.csv', 'rb') as upload:
        anonymizer_response = client.post("/anonymize", data={
            "file": upload,
            "analyzer_results": analyzer_results
        })

    assert anonymizer_response.status_code == 200
    anonymizer_data = anonymizer_response.get_data(as_text=True)
    assert anonymizer_data
3 changes: 3 additions & 0 deletions tests/sample_data/invalid.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"hello": "json"
}
File renamed without changes.

0 comments on commit 17f7939

Please sign in to comment.