# Patch summary (leading text before the first "diff --git" header; patch tools skip it).
#
# Dockerfile
#   - Adds a layer after the poetry install that runs
#     `apt-get update && apt-get install -y tesseract-ocr`.
#   - NOTE(review): the added RUN line does not pass --no-install-recommends and
#     does not remove /var/lib/apt/lists afterwards; consider both to keep the
#     image layer small.
#
# src/cli.py
#   - Imports Formatter from utils.formatter.
#   - analyze(): stdin is now read once, as bytes, into `input_buffer` before the
#     --img / text branch. The image branch wraps those bytes in io.BytesIO; the
#     text branch decodes them explicitly as UTF-8 instead of calling
#     sys.stdin.read().
#   - analyze(): output is built by Formatter().format_output(...) instead of the
#     module-level format_output(), which this patch removes from cli.py.
#   - NOTE(review): input_buffer.decode("utf-8") raises UnicodeDecodeError on
#     non-UTF-8 input; confirm that is the intended behaviour for text/CSV mode.
#
# src/utils/__init__.py
#   - New file added with no content hunks.
#
# src/utils/formatter.py
#   - New file (43 added lines) defining class Formatter with a no-op __init__
#     and a format_output(self, analyzer_results, text, image) method whose body
#     matches the function removed from cli.py: when `image` is truthy it returns
#     the image re-encoded as JPEG (as a list of byte values) together with
#     per-result bounding-box fields; otherwise it returns the text together
#     with per-result explanation/metadata fields.
#   - NOTE(review): the file ends without a trailing newline (see the
#     "\ No newline at end of file" marker).
#   - NOTE(review): Formatter holds no state; a plain function would do the same
#     job. Confirm whether the class is planned to grow.
#
# NOTE(review): in this copy of the patch the line breaks and indentation inside
# the hunks appear to have been collapsed onto two physical lines; it will
# likely not apply as-is. Regenerate from the original commit before use.
diff --git a/Dockerfile b/Dockerfile index cfa035f..bbb6897 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,6 +2,7 @@ FROM python:3.11 WORKDIR /usr/src/app RUN pip install poetry==1.8.3 +RUN apt-get update && apt-get install -y tesseract-ocr ENV POETRY_NO_INTERACTION=1 ENV POETRY_VIRTUALENVS_IN_PROJECT=1 ENV POETRY_VIRTUALENVS_CREATE=1 diff --git a/src/cli.py b/src/cli.py index 30f1c98..5b6b523 100644 --- a/src/cli.py +++ b/src/cli.py @@ -9,6 +9,7 @@ from analyzer_engine.csv_analyzer_engine import CSVAnalyzerEngine from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine from config.nlp_engine_config import FlairNLPEngine +from utils.formatter import Formatter from operators.vault import Vault from PIL import Image from presidio_image_redactor import ImageRedactorEngine @@ -25,21 +26,22 @@ def analyze(args): analyzer_results = None + input_buffer = sys.stdin.buffer.read() text = None image = None if args.img: - image = Image.open(io.BytesIO(sys.stdin.buffer.read())) + image = Image.open(io.BytesIO(input_buffer)) analyzer_results = ImageAnalyzerEngine().analyze(image=image, language=args.language) else: nlp_engine = FlairNLPEngine(NLP_ENGINE) nlp_engine, registry = nlp_engine.create_nlp_engine() engine = AnalyzerEngine(registry=registry, nlp_engine=nlp_engine) - text = sys.stdin.read() + text = input_buffer.decode("utf-8") if args.csv: engine = CSVAnalyzerEngine(engine) analyzer_results = engine.analyze(text=text, language=args.language) - output = format_output(analyzer_results, text, image) + output = Formatter().format_output(analyzer_results, text, image) print(json.dumps(output, indent=2)) return analyzer_results @@ -132,43 +134,5 @@ def main(): args = parser.parse_args() args.func(args) - -def format_output(analyzer_results, text, image): - if image: - output = io.BytesIO() - image.convert('RGB').save(output, format='JPEG') - return { - "image": list(output.getvalue()), - "analyzer_results": [ - { - "entity_type": result.entity_type, - "start": 
result.start, - "end": result.end, - "score": result.score, - "left" : result.left, - "top" : result.top, - "width" : result.width, - "height" : result.height - } - for result in analyzer_results - ] - } - - return { - "text": text, - "analyzer_results": [ - { - "entity_type": result.entity_type, - "start": result.start, - "end": result.end, - "score": result.score, - "analysis_explanation": result.analysis_explanation, - "recognition_metadata": result.recognition_metadata, - } - for result in analyzer_results - ], - } - - if __name__ == "__main__": main() diff --git a/src/utils/__init__.py b/src/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/utils/formatter.py b/src/utils/formatter.py new file mode 100644 index 0000000..5237105 --- /dev/null +++ b/src/utils/formatter.py @@ -0,0 +1,43 @@ +import io + + +class Formatter: + + def __init__(self): + pass + + def format_output(self ,analyzer_results, text, image): + if image: + output = io.BytesIO() + image.convert('RGB').save(output, format='JPEG') + return { + "image": list(output.getvalue()), + "analyzer_results": [ + { + "entity_type": result.entity_type, + "start": result.start, + "end": result.end, + "score": result.score, + "left" : result.left, + "top" : result.top, + "width" : result.width, + "height" : result.height + } + for result in analyzer_results + ] + } + + return { + "text": text, + "analyzer_results": [ + { + "entity_type": result.entity_type, + "start": result.start, + "end": result.end, + "score": result.score, + "analysis_explanation": result.analysis_explanation, + "recognition_metadata": result.recognition_metadata, + } + for result in analyzer_results + ], + } \ No newline at end of file