From 8aa52110562228a4d72e7c2121501c312ec88c6b Mon Sep 17 00:00:00 2001 From: saratvk Date: Thu, 10 Oct 2024 14:13:05 -0400 Subject: [PATCH 01/12] issue #41: feat: add phone_number format check in a field_validator --- pipeline/inspection.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pipeline/inspection.py b/pipeline/inspection.py index 237784f..7dbf753 100644 --- a/pipeline/inspection.py +++ b/pipeline/inspection.py @@ -138,6 +138,19 @@ def check_registration_number_format(cls, v): if re.match(pattern, v): return v return None + + @field_validator("company_phone_number", "manufacturer_phone_number", mode="before") + def check_phone_number_format(cls, v): + if v is not None: + phone_number_pattern = r"(\+?1[\s-]?)?(\(?\d{3}\)?)[\s-]?(\d{3})[\s-]?(\d{4})" + first_matched_phone_number = re.search(phone_number_pattern, v) + if phone_number_pattern: + area_code = first_matched_phone_number.group(2).replace('(', '').replace(')', '') + middle_digits = first_matched_phone_number.group(3) + last_digits = first_matched_phone_number.group(4) + + return f"{area_code}-{middle_digits}-{last_digits}" + return None class Config: populate_by_name = True From a7893884b365ee01224452c5bec67370f19e6163 Mon Sep 17 00:00:00 2001 From: saratvk Date: Thu, 10 Oct 2024 14:13:43 -0400 Subject: [PATCH 02/12] issue #41: add unit tests for phone_number format checking --- tests/test_inspection.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/test_inspection.py b/tests/test_inspection.py index d0d2842..e6d748f 100644 --- a/tests/test_inspection.py +++ b/tests/test_inspection.py @@ -298,6 +298,22 @@ def test_registration_number_mixed_format(self): instance = FertilizerInspection(registration_number="12A34567B") self.assertIsNone(instance.registration_number) +class TestFertilizerInspectionPhoneNumberFormat(unittest.TestCase): + def test_phone_number_with_country_code(self): + instance = FertilizerInspection(company_phone_number="1-800-640-9605") + self.assertEqual(instance.company_phone_number, "800-640-9605") + + def test_phone_number_with_parentheses(self): + instance = FertilizerInspection(manufacturer_phone_number="(757) 123-4567, (800) 456-7890") + self.assertEqual(instance.manufacturer_phone_number, "757-123-4567") + + def test_phone_number_with_parentheses_and_country_code(self): + instance = FertilizerInspection(manufacturer_phone_number="+1 (757) 123-4567, (800) 456-7890") + self.assertEqual(instance.manufacturer_phone_number, "757-123-4567") + + def test_phone_number_with_chars(self): + instance = FertilizerInspection(manufacturer_phone_number="+1 800 123-4567 FAX") + self.assertEqual(instance.manufacturer_phone_number, "800-123-4567") if __name__ == "__main__": unittest.main() From 701992299859035bfa221e1b18dc71c8c8b187c2 Mon Sep 17 00:00:00 2001 From: saratvk Date: Tue, 15 Oct 2024 23:33:42 -0400 Subject: [PATCH 03/12] feat: revise: using phonenumbers lib to convert phone number to international format --- pipeline/inspection.py | 15 +++++++++------ requirements.txt | 1 + 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/pipeline/inspection.py b/pipeline/inspection.py index 7dbf753..a9d87a3 100644 --- a/pipeline/inspection.py +++ b/pipeline/inspection.py @@ -1,6 +1,7 @@ import re from typing import List, Optional from pydantic import BaseModel, Field, field_validator, model_validator +import phonenumbers class npkError(ValueError): pass @@ -142,14 +143,16 @@ def check_registration_number_format(cls, v): @field_validator("company_phone_number", "manufacturer_phone_number", mode="before") def check_phone_number_format(cls, v): if v is not None: - phone_number_pattern = r"(\+?1[\s-]?)?(\(?\d{3}\)?)[\s-]?(\d{3})[\s-]?(\d{4})" + phone_number_pattern = re.compile(r'\+?(\d{1,4})?[\s-]?\(?\d{1,4}\)?[\s-]?\d{1,4}[\s-]?\d{1,4}[\s-]?\d{1,9}') first_matched_phone_number = re.search(phone_number_pattern, v) - if phone_number_pattern: - area_code = first_matched_phone_number.group(2).replace('(', '').replace(')', '') - middle_digits = first_matched_phone_number.group(3) - last_digits = first_matched_phone_number.group(4) + if first_matched_phone_number: + try: + phone_number = phonenumbers.parse(first_matched_phone_number.group(), "US") + phone_number = phonenumbers.format_number(phone_number, phonenumbers.PhoneNumberFormat.INTERNATIONAL) + return phone_number.replace(" ", "-") - return f"{area_code}-{middle_digits}-{last_digits}" + except phonenumbers.phonenumberutil.NumberParseException: + return None return None class Config: diff --git a/requirements.txt b/requirements.txt index a77eb01..8c33d65 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,5 +5,6 @@ pydantic>=2.7.1 python-dotenv reportlab setuptools +phonenumbers # Test dependencies Levenshtein From 3cb5d96e7569e80cda5f789de512350a6febe7ef Mon Sep 17 00:00:00 2001 From: saratvk Date: Tue, 15 Oct 2024 23:34:37 -0400 Subject: [PATCH 04/12] feat: add unit tests for the new phone number format --- tests/test_inspection.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/test_inspection.py b/tests/test_inspection.py index e6d748f..2f28afd 100644 --- a/tests/test_inspection.py +++ b/tests/test_inspection.py @@ -300,20 +300,24 @@ def test_registration_number_mixed_format(self): class TestFertilizerInspectionPhoneNumberFormat(unittest.TestCase): def test_phone_number_with_country_code(self): - instance = FertilizerInspection(company_phone_number="1-800-640-9605") - self.assertEqual(instance.company_phone_number, "800-640-9605") + instance = FertilizerInspection(company_phone_number="1 800 640 9605") + self.assertEqual(instance.company_phone_number, "+1-800-640-9605") def test_phone_number_with_parentheses(self): instance = FertilizerInspection(manufacturer_phone_number="(757) 123-4567, (800) 456-7890") - self.assertEqual(instance.manufacturer_phone_number, "757-123-4567") + self.assertEqual(instance.manufacturer_phone_number, "+1-757-123-4567") def test_phone_number_with_parentheses_and_country_code(self): instance = FertilizerInspection(manufacturer_phone_number="+1 (757) 123-4567, (800) 456-7890") - self.assertEqual(instance.manufacturer_phone_number, "757-123-4567") + self.assertEqual(instance.manufacturer_phone_number, "+1-757-123-4567") def test_phone_number_with_chars(self): instance = FertilizerInspection(manufacturer_phone_number="+1 800 123-4567 FAX") - self.assertEqual(instance.manufacturer_phone_number, "800-123-4567") + self.assertEqual(instance.manufacturer_phone_number, "+1-800-123-4567") + + def test_phone_number_from_other_countries(self): + instance = FertilizerInspection(manufacturer_phone_number="+98 919-678-8900") + self.assertEqual(instance.manufacturer_phone_number, "+98-919-678-8900") if __name__ == "__main__": unittest.main() From 62fdad9455d92ec0a689dfe1338c9b8d3e416cf2 Mon Sep 17 00:00:00 2001 From: k-allagbe Date: Thu, 17 Oct 2024 16:33:04 -0400 Subject: [PATCH 05/12] issue #41: reject all invalid values --- .gitignore | 3 ++ pipeline/inspection.py | 46 +++++++++++++------------ tests/test_inspection.py | 73 ++++++++++++++++++++++++++-------------- 3 files changed, 74 insertions(+), 48 deletions(-) diff --git a/.gitignore b/.gitignore index c793664..7a96a2d 100644 --- a/.gitignore +++ b/.gitignore @@ -169,3 +169,6 @@ cython_debug/ # Testing artifacts end_to_end_pipeline_artifacts + +logs/ +reports/ diff --git a/pipeline/inspection.py b/pipeline/inspection.py index a9d87a3..683653b 100644 --- a/pipeline/inspection.py +++ b/pipeline/inspection.py @@ -1,7 +1,9 @@ import re -from typing import List, Optional -from pydantic import BaseModel, Field, field_validator, model_validator +from typing import List, Optional + import phonenumbers +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator + class npkError(ValueError): pass @@ -59,10 +61,10 @@ def replace_none_with_empty_list(cls, v): if v is None: v = [] return v - + @model_validator(mode="after") def set_is_minimal(self): - pattern = r'\bminim\w*\b' + pattern = r"\bminim\w*\b" if self.title: self.is_minimal = re.search(pattern, self.title, re.IGNORECASE) is not None return self @@ -108,6 +110,7 @@ class FertilizerInspection(BaseModel): instructions_fr: List[str] = [] ingredients_en: List[NutrientValue] = [] ingredients_fr: List[NutrientValue] = [] + model_config = ConfigDict(populate_by_name=True) @field_validator("npk", mode="before") def validate_npk(cls, v): @@ -131,29 +134,28 @@ def replace_none_with_empty_list(cls, v): if v is None: v = [] return v - + @field_validator("registration_number", mode="before") def check_registration_number_format(cls, v): if v is not None: - pattern = r'^\d{7}[A-Z]$' + pattern = r"^\d{7}[A-Z]$" if re.match(pattern, v): return v return None - + @field_validator("company_phone_number", "manufacturer_phone_number", mode="before") def check_phone_number_format(cls, v): - if v is not None: - phone_number_pattern = re.compile(r'\+?(\d{1,4})?[\s-]?\(?\d{1,4}\)?[\s-]?\d{1,4}[\s-]?\d{1,4}[\s-]?\d{1,9}') - first_matched_phone_number = re.search(phone_number_pattern, v) - if first_matched_phone_number: - try: - phone_number = phonenumbers.parse(first_matched_phone_number.group(), "US") - phone_number = phonenumbers.format_number(phone_number, phonenumbers.PhoneNumberFormat.INTERNATIONAL) - return phone_number.replace(" ", "-") - - except phonenumbers.phonenumberutil.NumberParseException: - return None - return None - - class Config: - populate_by_name = True + if v is None: + return + + try: + phone_number = phonenumbers.parse(v, "CA") + if not phonenumbers.is_valid_number(phone_number): + return + phone_number = phonenumbers.format_number( + phone_number, phonenumbers.PhoneNumberFormat.E164 + ) + return phone_number + + except phonenumbers.phonenumberutil.NumberParseException: + return diff --git a/tests/test_inspection.py b/tests/test_inspection.py index 2f28afd..11dce0a 100644 --- a/tests/test_inspection.py +++ b/tests/test_inspection.py @@ -2,10 +2,10 @@ from pipeline.inspection import ( FertilizerInspection, + GuaranteedAnalysis, NutrientValue, Specification, Value, - GuaranteedAnalysis, ) @@ -206,26 +206,27 @@ def test_invalid_npk(self): inspection.npk, f"Expected None for npk with input {npk}" ) + class TestGuaranteedAnalysis(unittest.TestCase): - def setUp(self): self.nutrient_1 = NutrientValue(nutrient="Nitrogen", value="2", unit="mg/L") - self.nutrient_2 = NutrientValue(nutrient="Organic matter", value="15", unit="mg/L") + self.nutrient_2 = NutrientValue( + nutrient="Organic matter", value="15", unit="mg/L" + ) def test_set_is_minimal(self): guaranteed_analysis = GuaranteedAnalysis( title="Guaranteed minimum analysis", - nutrients=[self.nutrient_1, self.nutrient_2] + nutrients=[self.nutrient_1, self.nutrient_2], ) self.assertTrue(guaranteed_analysis.is_minimal) def test_set_is_not_minimal(self): guaranteed_analysis = GuaranteedAnalysis( - title="Guaranteed analysis", - nutrients=[self.nutrient_1, self.nutrient_2] + title="Guaranteed analysis", nutrients=[self.nutrient_1, self.nutrient_2] ) self.assertFalse(guaranteed_analysis.is_minimal) - + def test_is_minimal_in_none(self): guaranteed_analysis = GuaranteedAnalysis( nutrients=[self.nutrient_1, self.nutrient_2] @@ -268,7 +269,8 @@ def test_replace_none_with_empty_list(self): self.assertEqual(inspection.ingredients_en, []) self.assertEqual(inspection.ingredients_fr, []) self.assertEqual(inspection.weight, []) - + + class TestFertilizerInspectionRegistrationNumber(unittest.TestCase): def test_registration_number_with_less_digits(self): instance = FertilizerInspection(registration_number="1234") @@ -298,26 +300,45 @@ def test_registration_number_mixed_format(self): instance = FertilizerInspection(registration_number="12A34567B") self.assertIsNone(instance.registration_number) + class TestFertilizerInspectionPhoneNumberFormat(unittest.TestCase): - def test_phone_number_with_country_code(self): - instance = FertilizerInspection(company_phone_number="1 800 640 9605") - self.assertEqual(instance.company_phone_number, "+1-800-640-9605") - + def test_valid_phone_number_with_country_code(self): + instance = FertilizerInspection(company_phone_number="+1 800 640 9605") + self.assertEqual(instance.company_phone_number, "+18006409605") + + def test_valid_phone_number_without_country_code(self): + instance = FertilizerInspection(company_phone_number="800 640 9605") + self.assertEqual(instance.company_phone_number, "+18006409605") + def test_phone_number_with_parentheses(self): - instance = FertilizerInspection(manufacturer_phone_number="(757) 123-4567, (800) 456-7890") - self.assertEqual(instance.manufacturer_phone_number, "+1-757-123-4567") - - def test_phone_number_with_parentheses_and_country_code(self): - instance = FertilizerInspection(manufacturer_phone_number="+1 (757) 123-4567, (800) 456-7890") - self.assertEqual(instance.manufacturer_phone_number, "+1-757-123-4567") - - def test_phone_number_with_chars(self): - instance = FertilizerInspection(manufacturer_phone_number="+1 800 123-4567 FAX") - self.assertEqual(instance.manufacturer_phone_number, "+1-800-123-4567") - - def test_phone_number_from_other_countries(self): - instance = FertilizerInspection(manufacturer_phone_number="+98 919-678-8900") - self.assertEqual(instance.manufacturer_phone_number, "+98-919-678-8900") + instance = FertilizerInspection(company_phone_number="(757) 321-4567") + self.assertEqual(instance.company_phone_number, "+17573214567") + + def test_phone_number_with_extra_characters(self): + instance = FertilizerInspection(company_phone_number="+1 800 321-9605 FAX") + self.assertIsNone(instance.company_phone_number) + + def test_phone_number_with_multiple_numbers(self): + instance = FertilizerInspection( + company_phone_number="(757) 123-4567 (800) 456-7890, 1234567890" + ) + self.assertIsNone(instance.company_phone_number) + + def test_phone_number_from_other_country(self): + instance = FertilizerInspection(manufacturer_phone_number="+44 20 7946 0958") + self.assertEqual(instance.manufacturer_phone_number, "+442079460958") + + def test_invalid_phone_number(self): + instance = FertilizerInspection(company_phone_number="invalid phone") + self.assertIsNone(instance.company_phone_number) + + def test_phone_number_with_invalid_format(self): + instance = FertilizerInspection(company_phone_number="12345") + self.assertIsNone(instance.company_phone_number) + + +if __name__ == "__main__": + unittest.main() if __name__ == "__main__": unittest.main() From 98217483c05aac5a7cf6ead36113117f748a5b90 Mon Sep 17 00:00:00 2001 From: k-allagbe Date: Thu, 17 Oct 2024 16:37:53 -0400 Subject: [PATCH 06/12] issue #41: pkg version bump --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a0c6a14..572a99a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "fertiscan_pipeline" -version = "0.0.2" +version = "0.0.3" description = "A pipeline for the FertiScan project" authors = [ { name = "Albert Bryan Ndjeutcha", email = "albert.ndjeutcha@inspection.gc.ca" } From eb71ee00ad8314c23f9f48d918b92fd4540a84d7 Mon Sep 17 00:00:00 2001 From: "K. Allagbe" Date: Mon, 21 Oct 2024 14:41:52 -0400 Subject: [PATCH 07/12] issue #41: annotate the phone number fields --- pipeline/inspection.py | 10 +++-- script_for_testing_phone_numbers.py | 67 +++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 3 deletions(-) create mode 100644 script_for_testing_phone_numbers.py diff --git a/pipeline/inspection.py b/pipeline/inspection.py index 683653b..6f2d226 100644 --- a/pipeline/inspection.py +++ b/pipeline/inspection.py @@ -1,5 +1,5 @@ import re -from typing import List, Optional +from typing import List, Optional import phonenumbers from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator @@ -90,11 +90,15 @@ class FertilizerInspection(BaseModel): company_name: Optional[str] = None company_address: Optional[str] = None company_website: Optional[str] = None - company_phone_number: Optional[str] = None + company_phone_number: Optional[str] = Field( + description="The distributor's primary phone number. Return only one." + ) manufacturer_name: Optional[str] = None manufacturer_address: Optional[str] = None manufacturer_website: Optional[str] = None - manufacturer_phone_number: Optional[str] = None + manufacturer_phone_number: Optional[str] = Field( + description="The manufacturer's primary phone number. Return only one." + ) fertiliser_name: Optional[str] = None registration_number: Optional[str] = None lot_number: Optional[str] = None diff --git a/script_for_testing_phone_numbers.py b/script_for_testing_phone_numbers.py new file mode 100644 index 0000000..532672e --- /dev/null +++ b/script_for_testing_phone_numbers.py @@ -0,0 +1,67 @@ +import os +import pickle + +from dotenv import load_dotenv + +from pipeline import GPT, OCR, LabelStorage, analyze +from pipeline.inspection import FertilizerInspection + +# Load environment variables +load_dotenv() + +# Define the label folder numbers +label_folders = [8, 11, 19, 22, 24, 25, 27, 28, 30, 34] +# label_folders = [24, 25] + +# Define possible image filenames and extensions +image_filenames = ["img_001", "img_002"] # Basenames without extension +image_extensions = [".jpg", ".png"] # Possible extensions + +# Mock environment setup for OCR and GPT +api_endpoint_ocr = os.getenv("AZURE_API_ENDPOINT") +api_key_ocr = os.getenv("AZURE_API_KEY") +api_endpoint_gpt = os.getenv("AZURE_OPENAI_ENDPOINT") +api_key_gpt = os.getenv("AZURE_OPENAI_KEY") +api_deployment_gpt = os.getenv("AZURE_OPENAI_DEPLOYMENT") + +# Initialize OCR and GPT objects (reusable) +ocr = OCR(api_endpoint=api_endpoint_ocr, api_key=api_key_ocr) +gpt = GPT(api_endpoint=api_endpoint_gpt, api_key=api_key_gpt, deployment_id=api_deployment_gpt) + +# Dictionary to store inspection results for all labels +all_inspections = {} + +# Loop through each label folder +for label_num in label_folders: + label_folder = f"test_data/labels/label_{label_num:03d}" # Format as label_008, label_011, etc. + label_storage = LabelStorage() # Initialize a new LabelStorage for each label folder + + # Add relevant images to the label storage + for image_filename in image_filenames: + for ext in image_extensions: + image_path = os.path.join(label_folder, f"{image_filename}{ext}") + if os.path.exists(image_path): + print("Adding image:", image_path) + label_storage.add_image(image_path) + + # Run the analyze function + inspection = analyze(label_storage, ocr, gpt) + + # Store the result in the dictionary with the label number as the key + all_inspections[f"label_{label_num:03d}"] = inspection + +# Pickle all the results in a single file +pickle.dump(all_inspections, open("all_inspections.pkl", "wb")) + +print("All inspections have been processed and saved to all_inspections.pkl") + + +# Load the pickled data +with open("all_inspections.pkl", "rb") as f: + all_inspections: dict[str, FertilizerInspection] = pickle.load(f) + +for label, inspection in all_inspections.items(): + print(f"Label: {label}") + print(f" Company Phone Number: {inspection.company_phone_number}") + print(f" Manufacturer Phone Number: {inspection.manufacturer_phone_number}") + print() From 9560884c90edc1267a343f6721b2ecd1b79b5951 Mon Sep 17 00:00:00 2001 From: "K. Allagbe" Date: Mon, 21 Oct 2024 14:47:57 -0400 Subject: [PATCH 08/12] issue #41: fix expected manufacturer name "TerraLink Horticulture Inc." --- tests/test_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 50d6591..b64fa7b 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -53,7 +53,7 @@ def test_analyze(self): # Perform assertions self.assertIsInstance(inspection, FertilizerInspection, inspection) self.assertIn(Value(value='25', unit='kg'), inspection.weight, inspection) - self.assertGreater(levenshtein_similarity(inspection.manufacturer_name, "TerraLink"), 0.95, inspection) + self.assertGreater(levenshtein_similarity(inspection.manufacturer_name, "TerraLink Horticulture Inc."), 0.95, inspection) self.assertGreater(levenshtein_similarity(inspection.npk, "10-52-0"), 0.90, inspection) # Ensure logs are created and then deleted From 49b2e6cd436b13712346fe149904bd521eeb11ca Mon Sep 17 00:00:00 2001 From: "K. Allagbe" Date: Mon, 21 Oct 2024 15:04:22 -0400 Subject: [PATCH 09/12] issue #41: add `None` default value to phone number fields --- pipeline/inspection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipeline/inspection.py b/pipeline/inspection.py index 6f2d226..7763877 100644 --- a/pipeline/inspection.py +++ b/pipeline/inspection.py @@ -91,13 +91,13 @@ class FertilizerInspection(BaseModel): company_address: Optional[str] = None company_website: Optional[str] = None company_phone_number: Optional[str] = Field( - description="The distributor's primary phone number. Return only one." + None, description="The distributor's primary phone number. Return only one." ) manufacturer_name: Optional[str] = None manufacturer_address: Optional[str] = None manufacturer_website: Optional[str] = None manufacturer_phone_number: Optional[str] = Field( - description="The manufacturer's primary phone number. Return only one." + None, description="The manufacturer's primary phone number. Return only one." ) fertiliser_name: Optional[str] = None registration_number: Optional[str] = None From a7798b75902b515c41bb198ffdb6b97106e103c7 Mon Sep 17 00:00:00 2001 From: "K. Allagbe" Date: Mon, 21 Oct 2024 15:26:34 -0400 Subject: [PATCH 10/12] issue #41: temporary fix for #59 --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8c33d65..9e28a13 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ azure-ai-documentintelligence==1.0.0b3 -dspy-ai +dspy-ai==2.4.13 openai>=1.0 pydantic>=2.7.1 python-dotenv @@ -8,3 +8,4 @@ setuptools phonenumbers # Test dependencies Levenshtein +pytest From 4007a9e06ca8d3e7fe271e455baf464da723f2a6 Mon Sep 17 00:00:00 2001 From: "K. Allagbe" Date: Mon, 21 Oct 2024 15:40:36 -0400 Subject: [PATCH 11/12] issue #41: integration tests for phone numbers --- script_for_testing_phone_numbers.py | 67 -------------- tests/test_pipeline.py | 133 ++++++++++++++++++++++------ 2 files changed, 107 insertions(+), 93 deletions(-) delete mode 100644 script_for_testing_phone_numbers.py diff --git a/script_for_testing_phone_numbers.py b/script_for_testing_phone_numbers.py deleted file mode 100644 index 532672e..0000000 --- a/script_for_testing_phone_numbers.py +++ /dev/null @@ -1,67 +0,0 @@ -import os -import pickle - -from dotenv import load_dotenv - -from pipeline import GPT, OCR, LabelStorage, analyze -from pipeline.inspection import FertilizerInspection - -# Load environment variables -load_dotenv() - -# Define the label folder numbers -label_folders = [8, 11, 19, 22, 24, 25, 27, 28, 30, 34] -# label_folders = [24, 25] - -# Define possible image filenames and extensions -image_filenames = ["img_001", "img_002"] # Basenames without extension -image_extensions = [".jpg", ".png"] # Possible extensions - -# Mock environment setup for OCR and GPT -api_endpoint_ocr = os.getenv("AZURE_API_ENDPOINT") -api_key_ocr = os.getenv("AZURE_API_KEY") -api_endpoint_gpt = os.getenv("AZURE_OPENAI_ENDPOINT") -api_key_gpt = os.getenv("AZURE_OPENAI_KEY") -api_deployment_gpt = os.getenv("AZURE_OPENAI_DEPLOYMENT") - -# Initialize OCR and GPT objects (reusable) -ocr = OCR(api_endpoint=api_endpoint_ocr, api_key=api_key_ocr) -gpt = GPT(api_endpoint=api_endpoint_gpt, api_key=api_key_gpt, deployment_id=api_deployment_gpt) - -# Dictionary to store inspection results for all labels -all_inspections = {} - -# Loop through each label folder -for label_num in label_folders: - label_folder = f"test_data/labels/label_{label_num:03d}" # Format as label_008, label_011, etc. - label_storage = LabelStorage() # Initialize a new LabelStorage for each label folder - - # Add relevant images to the label storage - for image_filename in image_filenames: - for ext in image_extensions: - image_path = os.path.join(label_folder, f"{image_filename}{ext}") - if os.path.exists(image_path): - print("Adding image:", image_path) - label_storage.add_image(image_path) - - # Run the analyze function - inspection = analyze(label_storage, ocr, gpt) - - # Store the result in the dictionary with the label number as the key - all_inspections[f"label_{label_num:03d}"] = inspection - -# Pickle all the results in a single file -pickle.dump(all_inspections, open("all_inspections.pkl", "wb")) - -print("All inspections have been processed and saved to all_inspections.pkl") - - -# Load the pickled data -with open("all_inspections.pkl", "rb") as f: - all_inspections: dict[str, FertilizerInspection] = pickle.load(f) - -for label, inspection in all_inspections.items(): - print(f"Label: {label}") - print(f" Company Phone Number: {inspection.company_phone_number}") - print(f" Manufacturer Phone Number: {inspection.manufacturer_phone_number}") - print() diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index b64fa7b..e6a39a7 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -1,41 +1,50 @@ import os import unittest +from datetime import datetime -from tests import curl_file from dotenv import load_dotenv -from datetime import datetime -from tests import levenshtein_similarity + +from pipeline import GPT, OCR, LabelStorage, analyze from pipeline.inspection import FertilizerInspection, Value -from pipeline import LabelStorage, OCR, GPT, analyze +from tests import curl_file, levenshtein_similarity -class TestPipeline(unittest.TestCase): +class TestPipeline(unittest.TestCase): @classmethod def setUpClass(self): load_dotenv() # Set up the required objects - self.log_dir_path = './test_logs' - self.image_path = f'{self.log_dir_path}/test_image.jpg' # Path to your test image - + self.log_dir_path = "./test_logs" + self.image_path = ( + f"{self.log_dir_path}/test_image.jpg" # Path to your test image + ) + # Ensure the log directory exists if not os.path.exists(self.log_dir_path): os.mkdir(self.log_dir_path) - + # Download the test image - curl_file(url='https://tlhort.com/cdn/shop/products/10-52-0MAP.jpg', path=self.image_path) - + curl_file( + url="https://tlhort.com/cdn/shop/products/10-52-0MAP.jpg", + path=self.image_path, + ) + # Mock environment setup for OCR and GPT - self.api_endpoint_ocr = os.getenv('AZURE_API_ENDPOINT') - self.api_key_ocr = os.getenv('AZURE_API_KEY') - self.api_endpoint_gpt = os.getenv('AZURE_OPENAI_ENDPOINT') - self.api_key_gpt = os.getenv('AZURE_OPENAI_KEY') - self.api_deployment_gpt = os.getenv('AZURE_OPENAI_DEPLOYMENT') - + self.api_endpoint_ocr = os.getenv("AZURE_API_ENDPOINT") + self.api_key_ocr = os.getenv("AZURE_API_KEY") + self.api_endpoint_gpt = os.getenv("AZURE_OPENAI_ENDPOINT") + self.api_key_gpt = os.getenv("AZURE_OPENAI_KEY") + self.api_deployment_gpt = os.getenv("AZURE_OPENAI_DEPLOYMENT") + # Initialize the objects self.label_storage = LabelStorage() self.label_storage.add_image(self.image_path) self.ocr = OCR(api_endpoint=self.api_endpoint_ocr, api_key=self.api_key_ocr) - self.gpt = GPT(api_endpoint=self.api_endpoint_gpt, api_key=self.api_key_gpt, deployment_id=self.api_deployment_gpt) + self.gpt = GPT( + api_endpoint=self.api_endpoint_gpt, + api_key=self.api_key_gpt, + deployment_id=self.api_deployment_gpt, + ) @classmethod def tearDownClass(cls): @@ -48,23 +57,95 @@ def tearDownClass(cls): def test_analyze(self): # Run the analyze function - inspection = analyze(self.label_storage, self.ocr, self.gpt, log_dir_path=self.log_dir_path) - + inspection = analyze( + self.label_storage, self.ocr, self.gpt, log_dir_path=self.log_dir_path + ) + # Perform assertions self.assertIsInstance(inspection, FertilizerInspection, inspection) - self.assertIn(Value(value='25', unit='kg'), inspection.weight, inspection) - self.assertGreater(levenshtein_similarity(inspection.manufacturer_name, "TerraLink Horticulture Inc."), 0.95, inspection) - self.assertGreater(levenshtein_similarity(inspection.npk, "10-52-0"), 0.90, inspection) + self.assertIn(Value(value="25", unit="kg"), inspection.weight, inspection) + self.assertGreater( + levenshtein_similarity( + inspection.manufacturer_name, "TerraLink Horticulture Inc." + ), + 0.95, + inspection, + ) + self.assertGreater( + levenshtein_similarity(inspection.npk, "10-52-0"), 0.90, inspection + ) # Ensure logs are created and then deleted - now = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") md_log_path = f"{self.log_dir_path}/{now}.md" json_log_path = f"{self.log_dir_path}/{now}.json" txt_log_path = f"{self.log_dir_path}/{now}.txt" - + self.assertFalse(os.path.exists(md_log_path)) self.assertFalse(os.path.exists(json_log_path)) self.assertFalse(os.path.exists(txt_log_path)) -if __name__ == '__main__': + +class TestPhoneNumbers(unittest.TestCase): + @classmethod + def setUpClass(cls): + # Load environment variables + load_dotenv() + + # Mock environment setup for OCR and GPT + cls.api_endpoint_ocr = os.getenv("AZURE_API_ENDPOINT") + cls.api_key_ocr = os.getenv("AZURE_API_KEY") + cls.api_endpoint_gpt = os.getenv("AZURE_OPENAI_ENDPOINT") + cls.api_key_gpt = os.getenv("AZURE_OPENAI_KEY") + cls.api_deployment_gpt = os.getenv("AZURE_OPENAI_DEPLOYMENT") + + # Initialize OCR and GPT objects (real instances) + cls.ocr = OCR(api_endpoint=cls.api_endpoint_ocr, api_key=cls.api_key_ocr) + cls.gpt = GPT( + api_endpoint=cls.api_endpoint_gpt, + api_key=cls.api_key_gpt, + deployment_id=cls.api_deployment_gpt, + ) + + # Define possible image filenames and extensions + cls.image_filenames = ["img_001", "img_002"] + cls.image_extensions = [".jpg", ".png"] + + def add_images_to_storage(self, label_folder, label_storage): + for image_filename in self.image_filenames: + for ext in self.image_extensions: + image_path = os.path.join(label_folder, f"{image_filename}{ext}") + if os.path.exists(image_path): + label_storage.add_image(image_path) + + def test_label_008_inspection(self): + label_folder = "test_data/labels/label_008" + label_storage = LabelStorage() + + # Add images using the helper function + self.add_images_to_storage(label_folder, label_storage) + + # Run the analyze function + inspection = analyze(label_storage, self.ocr, self.gpt) + + # assertions + self.assertEqual(inspection.company_phone_number, "+18003279462") + self.assertIsNone(inspection.manufacturer_phone_number) + + def test_label_024_inspection(self): + label_folder = "test_data/labels/label_024" + label_storage = LabelStorage() + + # Add images using the helper function + self.add_images_to_storage(label_folder, label_storage) + + # Run the analyze function + inspection = analyze(label_storage, self.ocr, self.gpt) + + # assertions + self.assertEqual(inspection.company_phone_number, "+14506556147") + self.assertIsNone(inspection.manufacturer_phone_number) + + +if __name__ == "__main__": unittest.main() From 6c19eae7302b974c8664eee20a8121c68f323883 Mon Sep 17 00:00:00 2001 From: "K. Allagbe" Date: Mon, 21 Oct 2024 15:59:56 -0400 Subject: [PATCH 12/12] issue #41: test manufacturer or company name in `test_analyze` --- tests/test_pipeline.py | 53 ++++++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 15 deletions(-) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index e6a39a7..67fb6e0 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -1,4 +1,6 @@ import os +import shutil +import tempfile import unittest from datetime import datetime @@ -64,9 +66,13 @@ def test_analyze(self): # Perform assertions self.assertIsInstance(inspection, FertilizerInspection, inspection) self.assertIn(Value(value="25", unit="kg"), inspection.weight, inspection) + manufacturer_or_company = ( + inspection.manufacturer_name or inspection.company_name + ) + self.assertIsNotNone(manufacturer_or_company, inspection) self.assertGreater( levenshtein_similarity( - inspection.manufacturer_name, "TerraLink Horticulture Inc." + manufacturer_or_company, "TerraLink Horticulture Inc." ), 0.95, inspection, @@ -107,28 +113,44 @@ def setUpClass(cls): deployment_id=cls.api_deployment_gpt, ) - # Define possible image filenames and extensions - cls.image_filenames = ["img_001", "img_002"] + # Supported image extensions cls.image_extensions = [".jpg", ".png"] - def add_images_to_storage(self, label_folder, label_storage): - for image_filename in self.image_filenames: - for ext in self.image_extensions: - image_path = os.path.join(label_folder, f"{image_filename}{ext}") - if os.path.exists(image_path): - label_storage.add_image(image_path) + # Create a temporary directory for image copies + cls.temp_dir = tempfile.mkdtemp() + + @classmethod + def tearDownClass(cls): + # Clean up the temporary directory after tests are done + shutil.rmtree(cls.temp_dir) + + def copy_images_to_temp_dir(self, label_folder): + copied_files = [] + for file_name in os.listdir(label_folder): + _, ext = os.path.splitext(file_name) + if ext.lower() in self.image_extensions: + image_path = os.path.join(label_folder, file_name) + temp_image_path = os.path.join(self.temp_dir, file_name) + shutil.copy(image_path, temp_image_path) + copied_files.append(temp_image_path) + return copied_files + + def add_images_to_storage(self, image_paths, label_storage): + for image_path in image_paths: + label_storage.add_image(image_path) def test_label_008_inspection(self): label_folder = "test_data/labels/label_008" label_storage = LabelStorage() - # Add images using the helper function - self.add_images_to_storage(label_folder, label_storage) + # Copy images to temporary directory and add to storage + image_paths = self.copy_images_to_temp_dir(label_folder) + self.add_images_to_storage(image_paths, label_storage) # Run the analyze function inspection = analyze(label_storage, self.ocr, self.gpt) - # assertions + # Assertions self.assertEqual(inspection.company_phone_number, "+18003279462") self.assertIsNone(inspection.manufacturer_phone_number) @@ -136,13 +158,14 @@ def test_label_024_inspection(self): label_folder = "test_data/labels/label_024" label_storage = LabelStorage() - # Add images using the helper function - self.add_images_to_storage(label_folder, label_storage) + # Copy images to temporary directory and add to storage + image_paths = self.copy_images_to_temp_dir(label_folder) + self.add_images_to_storage(image_paths, label_storage) # Run the analyze function inspection = analyze(label_storage, self.ocr, self.gpt) - # assertions + # Assertions self.assertEqual(inspection.company_phone_number, "+14506556147") self.assertIsNone(inspection.manufacturer_phone_number)