From 344d92f486ce948350989f89b5bfc86a4d856b29 Mon Sep 17 00:00:00 2001
From: mallikarjun-br <93917885+mallikarjun-br@users.noreply.github.com>
Date: Wed, 9 Oct 2024 16:16:41 +0100
Subject: [PATCH] feat: add support to analyze images

---
 README.md  |  5 +++++
 src/cli.py | 58 ++++++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 53 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 88bd227..13e19b3 100644
--- a/README.md
+++ b/README.md
@@ -49,6 +49,11 @@ cat sample.csv | pii analyze --csv | pii anonymize
 cat sample.csv | pii analyze --csv | pii anonymize | jq -r '.text'
 cat sample.csv | pii analyze --csv | pii anonymize | jq -r '.text' > anonymized.csv
 
+
+# img files
+cat sample.png | pii analyze --img
+
+
 # vault integration
 ./vault.sh # start and configure vault server and transit secret engine keys
 echo "My name is Don Stark and my phone number is 212-555-5555" | pii anonymize --vaulturl "http://127.0.0.1:8200" --vaultkey "orders"
diff --git a/src/cli.py b/src/cli.py
index 0d0e71e..b07e387 100644
--- a/src/cli.py
+++ b/src/cli.py
@@ -1,13 +1,19 @@
 import argparse
+import io
 import json
 
 from presidio_analyzer import RecognizerResult
 from presidio_analyzer.analyzer_engine import AnalyzerEngine
 from presidio_anonymizer.entities.engine.result.operator_result import OperatorResult
+from presidio_image_redactor import ImageAnalyzerEngine
 from analyzer_engine.csv_analyzer_engine import CSVAnalyzerEngine
 from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
 from config.nlp_engine_config import FlairNLPEngine
 from operators.vault import Vault
+from PIL import Image
+from presidio_image_redactor import ImageRedactorEngine
+
+
 import sys
 import logging
 
@@ -19,15 +25,46 @@
 
 def analyze(args):
     analyzer_results = None
-    nlp_engine = FlairNLPEngine(NLP_ENGINE)
-    nlp_engine, registry = nlp_engine.create_nlp_engine()
-    engine = AnalyzerEngine(registry=registry, nlp_engine=nlp_engine)
-    text = sys.stdin.read()
-    if args.csv:
-        engine = CSVAnalyzerEngine(engine)
-    analyzer_results = engine.analyze(text=text, language=args.language)
+    text = None  # assigned only by the text/CSV branch below
+    image = None  # assigned only by the --img branch below
+    if args.img:  # tested first, so args.csv is never consulted when --img is set
+        image = Image.open(io.BytesIO(sys.stdin.buffer.read()))  # stdin is read as raw bytes, then opened with PIL
+        analyzer_results = ImageAnalyzerEngine().analyze(image=image, language=args.language)  # engine built with no arguments; the Flair engine from the else-branch is not passed in
+    else:
+        nlp_engine = FlairNLPEngine(NLP_ENGINE)
+        nlp_engine, registry = nlp_engine.create_nlp_engine()
+        engine = AnalyzerEngine(registry=registry, nlp_engine=nlp_engine)
+        text = sys.stdin.read()  # whole of stdin, read in text mode
+        if args.csv:
+            engine = CSVAnalyzerEngine(engine)  # wraps the AnalyzerEngine built just above
+        analyzer_results = engine.analyze(text=text, language=args.language)
+
+    output = format_output(analyzer_results, text, image)  # image is still None unless --img was given
+    print(json.dumps(output, indent=2))  # the JSON document is written to stdout
+    return analyzer_results
 
-    output = {
+def format_output(analyzer_results, text, image):
+    if image:
+        output = io.BytesIO()
+        image.convert('RGB').save(output, format='JPEG')
+        return {
+            "image": list(output.getvalue()),
+            "analyzer_results": [
+                {
+                    "entity_type": result.entity_type,
+                    "start": result.start,
+                    "end":  result.end,
+                    "score": result.score,
+                    "left" : result.left,
+                    "top" : result.top,
+                    "width" : result.width,
+                    "height" : result.height
+                }
+                for result in analyzer_results
+            ]
+        }
+
+    return {
         "text": text,
         "analyzer_results": [
             {
@@ -41,8 +78,8 @@ def analyze(args):
             for result in analyzer_results
         ],
     }
-    print(json.dumps(output, indent=2))
-    return analyzer_results
+
+
 
 
 def anonymize(args):
@@ -106,6 +143,7 @@ def main():
         "analyze", description="Analyze inputs and return PII detection results"
     )
     analyzer_parser.add_argument("--csv", action="store_true")
+    analyzer_parser.add_argument("--img", action="store_true")
     analyzer_parser.add_argument("--language", required=False, type=str, default="en")
     analyzer_parser.set_defaults(func=analyze)