From 344d92f486ce948350989f89b5bfc86a4d856b29 Mon Sep 17 00:00:00 2001 From: mallikarjun-br <93917885+mallikarjun-br@users.noreply.github.com> Date: Wed, 9 Oct 2024 16:16:41 +0100 Subject: [PATCH] feat: add support to analyze images --- README.md | 5 +++++ src/cli.py | 58 ++++++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 53 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 88bd227..13e19b3 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,11 @@ cat sample.csv | pii analyze --csv | pii anonymize cat sample.csv | pii analyze --csv | pii anonymize | jq -r '.text' cat sample.csv | pii analyze --csv | pii anonymize | jq -r '.text' > anonymized.csv + +# img files +cat sample.png | pii analyze --img + + # vault integration ./vault.sh # start and configure vault server and transit secret engine keys echo "My name is Don Stark and my phone number is 212-555-5555" | pii anonymize --vaulturl "http://127.0.0.1:8200" --vaultkey "orders" diff --git a/src/cli.py b/src/cli.py index 0d0e71e..b07e387 100644 --- a/src/cli.py +++ b/src/cli.py @@ -1,13 +1,19 @@ import argparse +import io import json from presidio_analyzer import RecognizerResult from presidio_analyzer.analyzer_engine import AnalyzerEngine from presidio_anonymizer.entities.engine.result.operator_result import OperatorResult +from presidio_image_redactor import ImageAnalyzerEngine from analyzer_engine.csv_analyzer_engine import CSVAnalyzerEngine from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine from config.nlp_engine_config import FlairNLPEngine from operators.vault import Vault +from PIL import Image +from presidio_image_redactor import ImageRedactorEngine + + import sys import logging @@ -19,15 +25,46 @@ def analyze(args): analyzer_results = None - nlp_engine = FlairNLPEngine(NLP_ENGINE) - nlp_engine, registry = nlp_engine.create_nlp_engine() - engine = AnalyzerEngine(registry=registry, nlp_engine=nlp_engine) - text = sys.stdin.read() - if args.csv: - engine = CSVAnalyzerEngine(engine) - analyzer_results = engine.analyze(text=text, language=args.language) + text = None + image = None + if args.img: + image = Image.open(io.BytesIO(sys.stdin.buffer.read())) + analyzer_results = ImageAnalyzerEngine().analyze(image=image, language=args.language) + else: + nlp_engine = FlairNLPEngine(NLP_ENGINE) + nlp_engine, registry = nlp_engine.create_nlp_engine() + engine = AnalyzerEngine(registry=registry, nlp_engine=nlp_engine) + text = sys.stdin.read() + if args.csv: + engine = CSVAnalyzerEngine(engine) + analyzer_results = engine.analyze(text=text, language=args.language) + + output = format_output(analyzer_results, text, image) + print(json.dumps(output, indent=2)) + return analyzer_results - output = { +def format_output(analyzer_results, text, image): + if image: + output = io.BytesIO() + image.convert('RGB').save(output, format='JPEG') + return { + "image": list(output.getvalue()), + "analyzer_results": [ + { + "entity_type": result.entity_type, + "start": result.start, + "end": result.end, + "score": result.score, + "left" : result.left, + "top" : result.top, + "width" : result.width, + "height" : result.height + } + for result in analyzer_results + ] + } + + return { "text": text, "analyzer_results": [ { @@ -41,8 +78,8 @@ def analyze(args): for result in analyzer_results ], } - print(json.dumps(output, indent=2)) - return analyzer_results + + def anonymize(args): @@ -106,6 +143,7 @@ def main(): "analyze", description="Analyze inputs and return PII detection results" ) analyzer_parser.add_argument("--csv", action="store_true") + analyzer_parser.add_argument("--img", action="store_true") analyzer_parser.add_argument("--language", required=False, type=str, default="en") analyzer_parser.set_defaults(func=analyze)