Skip to content

Commit

Permalink
Issue #45 : Add document analysis function (#74)
Browse files Browse the repository at this point in the history
* Add document analysis function to process fertilizer labels with OCR and LLM

* Update version to 0.0.9 and add test_label.py to .gitignore

* Bump version to 0.0.7 and add unit test for analyze_document function

* Update .gitignore to exclude analyze_label.py and modify test_pipeline to use organization name for manufacturer assertion
  • Loading branch information
snakedye authored Nov 26, 2024
1 parent c3493f7 commit 176c26f
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 2 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ __pycache__/
logs/
test_logs/
reports/
analyze_label.py

# VS Code
.vscode
Expand Down
28 changes: 28 additions & 0 deletions pipeline/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,31 @@ def analyze(label_storage: LabelStorage, ocr: OCR, gpt: GPT, log_dir_path: str =
os.remove(f"{log_dir_path}/{now}.json")

return inspection

def analyze_document(document: bytes, ocr: OCR, gpt: GPT, log_dir_path: str = './logs') -> FertilizerInspection:
    """
    Extract a FertilizerInspection from the raw bytes of a fertilizer label.

    The document is sent to the OCR, and the text it returns is handed to the
    LLM, which produces the inspection together with its reasoning.

    Three intermediate files, all named after the same timestamp, are written
    to `log_dir_path`: the OCR text (.md), the LLM reasoning (.txt) and the
    inspection serialized as JSON (.json). They are removed again once every
    step has succeeded; if a step raises, the files written so far are left
    in place.
    """
    ocr_result = ocr.extract_text(document=document)
    extracted_text = ocr_result.content

    # Every log file of a single run shares the same timestamped base name
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    md_log_path = f"{log_dir_path}/{timestamp}.md"
    txt_log_path = f"{log_dir_path}/{timestamp}.txt"
    json_log_path = f"{log_dir_path}/{timestamp}.json"

    # Keep a copy of what document intelligence returned
    save_text_to_file(extracted_text, md_log_path)

    # Ask the LLM to build the inspection from the extracted text
    prediction = gpt.create_inspection(extracted_text)
    inspection = prediction.inspection

    # Keep a copy of what the LLM returned
    save_text_to_file(prediction.reasoning, txt_log_path)
    save_text_to_file(inspection.model_dump_json(indent=2), json_log_path)

    # Nothing failed, so the logs are no longer needed
    for log_path in (md_log_path, txt_log_path, json_log_path):
        os.remove(log_path)

    return inspection
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "fertiscan_pipeline"
version = "0.0.8"
version = "0.0.9"
description = "A pipeline for the FertiScan project"
authors = [
{ name = "Albert Bryan Ndjeutcha", email = "[email protected]" }
Expand Down
35 changes: 34 additions & 1 deletion tests/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from dotenv import load_dotenv

from pipeline import GPT, OCR, LabelStorage, analyze
from pipeline import GPT, OCR, LabelStorage, analyze, analyze_document
from pipeline.inspection import FertilizerInspection, Value
from tests import curl_file, levenshtein_similarity

Expand Down Expand Up @@ -89,6 +89,39 @@ def test_analyze(self):
self.assertFalse(os.path.exists(json_log_path))
self.assertFalse(os.path.exists(txt_log_path))

def test_analyze_document(self):
    """
    Run analyze_document on the stored label and check the extracted data.

    Also verifies that the call leaves no new file behind in the log
    directory. The directory content is compared before and after the call
    instead of rebuilding the log file names: the names are derived from a
    timestamp taken inside analyze_document, which the test cannot reproduce.
    """
    # Re-run the class-level setup before exercising the pipeline
    # NOTE(review): setUpClass is not visible here; presumably it rebuilds
    # label_storage / ocr / gpt -- confirm it is safe to call per test.
    self.setUpClass()

    # Snapshot the log directory (it may not exist yet)
    if os.path.isdir(self.log_dir_path):
        logs_before = set(os.listdir(self.log_dir_path))
    else:
        logs_before = set()

    # Run the analyze_document function
    inspection = analyze_document(
        self.label_storage.get_document(), self.ocr, self.gpt, log_dir_path=self.log_dir_path
    )

    # Perform assertions
    self.assertIsInstance(inspection, FertilizerInspection, inspection)
    self.assertIn(Value(value="25", unit="kg"), inspection.weight, inspection)
    # Fail with a readable message rather than an IndexError below
    self.assertTrue(inspection.organizations, inspection)
    manufacturer_or_company = inspection.organizations[0].name
    self.assertIsNotNone(manufacturer_or_company, inspection)
    self.assertGreater(
        levenshtein_similarity(
            manufacturer_or_company, "TerraLink Horticulture Inc."
        ),
        0.95,
        inspection,
    )
    self.assertGreater(
        levenshtein_similarity(inspection.npk, "10-52-0"), 0.90, inspection
    )

    # Ensure the logs created during the run were deleted afterwards
    if os.path.isdir(self.log_dir_path):
        logs_after = set(os.listdir(self.log_dir_path))
    else:
        logs_after = set()
    leftover_logs = sorted(logs_after - logs_before)
    self.assertFalse(leftover_logs, f"log files were not deleted: {leftover_logs}")


class TestInspectionAnnotatedFields(unittest.TestCase):
@classmethod
Expand Down

0 comments on commit 176c26f

Please sign in to comment.