Skip to content

Commit

Permalink
Merge pull request #9 from sahajsoft/api_anonymize
Browse files Browse the repository at this point in the history
add support for an API to analyze and anonymize files
  • Loading branch information
l-r-sowmya authored Jun 5, 2024
2 parents f1ece7d + 6b9d88a commit 17f7939
Show file tree
Hide file tree
Showing 8 changed files with 1,110 additions and 700 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -159,4 +159,5 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

.direnv/
.direnv/
file_uploads/
1,598 changes: 900 additions & 698 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ scipy = "<1.13.0"
presidio-anonymizer = "^2.2.354"
presidio-analyzer = {version = "^2.2.354", extras = ["transformers", "stanza"]}
pytest = "^8.2.1"
flask = "^3.0.3"


[build-system]
Expand Down
127 changes: 127 additions & 0 deletions src/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
import csv
import io
import json
import logging
import os
import uuid
from typing import Tuple

from flask import Flask, request, jsonify, Response, send_file

from analyzer_engine.csv_analyzer_engine import CSVAnalyzerEngine
from config.nlp_engine_config import FlairNLPEngine
from presidio_analyzer import DictAnalyzerResult, RecognizerResult
from presidio_anonymizer import BatchAnonymizerEngine

DEFAULT_PORT = "3000"
NLP_ENGINE = "flair/ner-english-large"
UPLOAD_DIR = "file_uploads"

class Server:
    """HTTP server exposing Presidio-based PII analysis and anonymization.

    Routes:
        GET  /health    — liveness probe.
        POST /analyze   — multipart CSV upload + ``language`` form field;
                          returns per-column recognizer results as JSON.
        POST /anonymize — ``analyzer_results`` form field (JSON produced by
                          /analyze); returns the anonymized CSV as an
                          attachment download.
    """

    def __init__(self):
        self.logger = logging.getLogger("pii-detection-anonymizer")
        self.logger.setLevel(os.environ.get("LOG_LEVEL", self.logger.level))
        self.app = Flask(__name__)
        self.logger.info("Starting analyzer engine")
        nlp_engine = FlairNLPEngine(NLP_ENGINE)
        self.engine = CSVAnalyzerEngine(nlp_engine)
        self.logger.info("Started analyzer engine")
        # exist_ok avoids a race between the existence check and creation.
        os.makedirs(UPLOAD_DIR, exist_ok=True)

        @self.app.route("/health")
        def health() -> str:
            """Return basic health probe result."""
            return "PII detection and anonymizer service is up"

        @self.app.route("/analyze", methods=["POST"])
        def analyze() -> Tuple[str, int]:
            """Analyze an uploaded CSV for PII.

            Expects a multipart ``file`` and a ``language`` form field.
            Returns {column_key: {"value": [...], "recognizer_results": [...]}}
            with 200, a 400 for an empty filename, or a 500 on any failure
            (including a missing ``file``/``language`` field, via KeyError).
            """
            try:
                file = request.files['file']
                language = request.form['language']
                if file.filename == '':
                    return jsonify({'error': 'No selected file'}), 400

                # Persist under a random name so concurrent uploads never collide.
                filepath = f'{UPLOAD_DIR}/{uuid.uuid4()}'
                file.save(filepath)
                self.logger.info(f"Successfully saved file: {filepath}")

                try:
                    analyzer_results = self.engine.analyze_csv(
                        csv_full_path=filepath,
                        language=language
                    )
                finally:
                    # Remove the temp upload even when analysis fails,
                    # otherwise failed requests leak files in UPLOAD_DIR.
                    os.remove(filepath)
                    self.logger.info(f"Successfully removed file: {filepath}")
                self.logger.debug(f"Analyzed file with results: {analyzer_results}")

                # Flatten DictAnalyzerResult objects into JSON-serializable dicts.
                analyzer_results_list = {}
                for result in analyzer_results:
                    recognizer_results = [
                        [o.to_dict() for o in row_results]
                        for row_results in result.recognizer_results
                    ]
                    analyzer_results_list[result.key] = {
                        "value": result.value,
                        "recognizer_results": recognizer_results
                    }

                return jsonify(analyzer_results_list), 200
            except Exception as e:
                self.logger.error(
                    f"A fatal error occurred during execution of "
                    f"AnalyzerEngine.analyze(). {e}"
                )
                # str(e) is always safe; e.args[0] raises IndexError for
                # exceptions constructed without arguments.
                return jsonify(error=str(e)), 500

        @self.app.route("/anonymize", methods=["POST"])
        def anonymize() -> Response:
            """Anonymize values using previously computed analyzer results.

            Expects an ``analyzer_results`` form field holding the JSON
            returned by /analyze. Streams back the anonymized data as a CSV
            attachment, or returns a JSON error with 500.
            """
            try:
                analyzer_results = json.loads(request.form['analyzer_results'])

                # Rebuild the DictAnalyzerResult structure from its JSON form.
                dict_analyzer_results = []
                for key, value in analyzer_results.items():
                    recognizer_results = [
                        [
                            RecognizerResult(r["entity_type"],
                                             r["start"],
                                             r["end"],
                                             r["score"])
                            for r in results_for_each_entry
                        ]
                        for results_for_each_entry in value["recognizer_results"]
                    ]
                    dict_analyzer_results.append(
                        DictAnalyzerResult(key=key,
                                           value=value["value"],
                                           recognizer_results=recognizer_results))

                anonymizer = BatchAnonymizerEngine()
                anonymized_results = anonymizer.anonymize_dict(dict_analyzer_results)

                # zip() transposes columns into rows and is safe for empty
                # input (the previous index-based loop raised IndexError).
                keys = list(anonymized_results.keys())
                data = [
                    dict(zip(keys, row_values))
                    for row_values in zip(*(anonymized_results[k] for k in keys))
                ]

                # Build the CSV in memory instead of writing to UPLOAD_DIR:
                # the previous on-disk file was never deleted, so every
                # request leaked a file.
                buffer = io.StringIO()
                writer = csv.DictWriter(buffer, fieldnames=keys)
                writer.writeheader()
                writer.writerows(data)

                return Response(
                    buffer.getvalue(),
                    mimetype='text/csv',
                    headers={
                        'Content-Disposition':
                            'attachment; filename=anonymized_data.csv'
                    }
                )
            except Exception as e:
                self.logger.error(
                    f"A fatal error occurred during execution of "
                    f"AnonymizerEngine.anonymize(). {e}"
                )
                return jsonify(error=str(e)), 500

if __name__ == "__main__":
    # Entry point: PORT env var overrides the default listening port.
    listen_port = int(os.environ.get("PORT", DEFAULT_PORT))
    Server().app.run(host="0.0.0.0", port=listen_port)
2 changes: 1 addition & 1 deletion tests/analyzer_engine/csv_analyzer_engine_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def test_csv_analyzer_engine_anonymizer():
nlp_engine = FlairNLPEngine("flair/ner-english-large")
csv_analyzer = CSVAnalyzerEngine(nlp_engine)
from presidio_anonymizer import BatchAnonymizerEngine
analyzer_results = csv_analyzer.analyze_csv('./sample_data.csv', language="en")
analyzer_results = csv_analyzer.analyze_csv('./tests/sample_data/sample_data.csv', language="en")

anonymizer = BatchAnonymizerEngine()
anonymized_results = anonymizer.anonymize_dict(analyzer_results)
Expand Down
76 changes: 76 additions & 0 deletions tests/app_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import pytest
import json

from app import Server

@pytest.fixture()
def app():
    """Yield the Flask application configured for testing."""
    flask_app = Server().app
    flask_app.config.update({"TESTING": True})

    yield flask_app


@pytest.fixture()
def client(app):
    """Provide a test client bound to the ``app`` fixture."""
    test_client = app.test_client()
    return test_client


def test_health(client):
    """The health endpoint answers with HTTP 200."""
    resp = client.get("/health")
    assert resp.status_code == 200

def test_analyze_non_existent(client):
    """Posting to /analyze without a file yields a server error (500)."""
    resp = client.post("/analyze", data={"language": "en"})

    assert resp.status_code == 500


def test_analyze_invalid_csv(client):
    """Uploading a non-CSV file makes /analyze fail with 500.

    The ``with`` block closes the upload handle deterministically; the
    previous bare ``open`` leaked the file descriptor.
    """
    # NOTE(review): no "language" form field is sent, so the endpoint may
    # fail on the missing key before ever parsing the file — confirm intent.
    with open('./tests/sample_data/invalid.csv', 'rb') as upload:
        response = client.post("/analyze", data={
            "file": upload,
        })

    assert response.status_code == 500


def test_analyze_pii_csv(client):
    """/analyze returns per-column recognizer results for a PII-laden CSV.

    Opens the sample file in a ``with`` block so the handle is closed even
    when an assertion fails (the previous bare ``open`` leaked it).
    """
    expected_response_id = {'value': ['1', '2', '3'], 'recognizer_results': [[], [], []]}

    with open('./tests/sample_data/sample_data.csv', 'rb') as upload:
        response = client.post("/analyze", data={
            "file": upload,
            "language": "en",
        })

    assert response.status_code == 200
    data = json.loads(response.get_data(as_text=True))
    # No PII in id
    assert data['id'] == expected_response_id
    # first row has no PII
    assert data['comments']['recognizer_results'][0] == []
    # second row has PII
    assert data['comments']['recognizer_results'][1][0]['entity_type'] == 'US_DRIVER_LICENSE'
    assert data['comments']['recognizer_results'][1][0]['start'] == 34
    assert data['comments']['recognizer_results'][1][0]['end'] == 42

def test_anonymize_csv_pii(client):
    """Round-trip: /analyze output feeds /anonymize, which returns a CSV.

    Both uploads use ``with`` blocks so the file handles are closed
    deterministically (the previous bare ``open`` calls leaked them).
    """
    with open('./tests/sample_data/sample_data.csv', 'rb') as upload:
        analyze_response = client.post("/analyze", data={
            "file": upload,
            "language": "en",
        })

    assert analyze_response.status_code == 200
    analyzer_results = analyze_response.get_data(as_text=True)

    with open('./tests/sample_data/sample_data.csv', 'rb') as upload:
        anonymizer_response = client.post("/anonymize", data={
            "file": upload,
            "analyzer_results": analyzer_results
        })

    assert anonymizer_response.status_code == 200
    anonymizer_data = anonymizer_response.get_data(as_text=True)
    assert anonymizer_data
3 changes: 3 additions & 0 deletions tests/sample_data/invalid.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"hello": "json"
}
File renamed without changes.

0 comments on commit 17f7939

Please sign in to comment.