Merge pull request #23 from CAUSOLDOUTMEN/feat/17-fuzzywuzzy

Feat: fuzzywuzzy 도입
CAUSOLDOUTMEN · Nov 12, 2023 · 1e358a8 · 1e358a8
2 parents ed63e5d + 4ad1303
commit 1e358a8
Show file tree

Hide file tree

Showing 5 changed files with 21 additions and 7 deletions.
diff --git a/image_test.py b/image_test.py
@@ -17,9 +17,7 @@ class TestOCRAccuracy(unittest.TestCase):
         'test7.jpg': {'칼로리': 505, '탄수화물': 84, '지방': 15, '단백질': 9},
         'test8.jpeg': { '칼로리': 200, '탄수화물': 43, '지방': 11, '단백질': 4},
         'test9.jpeg': {'칼로리': 525, '탄수화물': 69, '지방': 25, '단백질': 6},
-        'test10.jpeg': {'칼로리': 505, '탄수화물': 78, '지방': 17, '단백질':10},
-        'test11.jpeg': {'칼로리': 510, '탄수화물': 52, '지방': 32, '단백질': 3},
-        'test12.jpeg': {'칼로리': 395, '탄수화물': 46, '지방': 21, '단백질': 6},
+        'test10.jpg': {'칼로리': 505, '탄수화물': 78, '지방': 17, '단백질':10},
         'test13.jpeg': {'칼로리': 230, '탄수화물': 24, '지방': 14, '단백질': 2},
     }
 

diff --git a/test_image/input/test12.jpeg b/test_image/input/test12.jpeg
diff --git a/test_image/output/cropped_table_enhanced.jpg b/test_image/output/cropped_table_enhanced.jpg
diff --git a/utils/nutrition_parser.py b/utils/nutrition_parser.py
@@ -1,7 +1,22 @@
 import re
+from fuzzywuzzy import process
+
+def correct_ocr_text(text):
+    target_words = ["칼로리", "탄수화물", "단백질", "지방"]
+    corrected_text = text
+    for word in target_words:
+        extracted_words = process.extractBests(word, text.split(), score_cutoff=75, limit=10)
+        for extracted_word, score in extracted_words:
+            if abs(len(extracted_word) - len(word)) <= 1:
+                corrected_text = corrected_text.replace(extracted_word, word)
+                break
+
+    print(f'after correcting: ${corrected_text}')
+    return corrected_text
+
 
 def parse_nutrients_from_text(text):
-    nutrient_pattern = r'(물|질|방|류)\s*(\d+(?:\.\d+)?)(?:\s*g)?'
+    nutrient_pattern = r'(물|질|방|류)\s*(\d+(?:\.\d+)?)(?:\s*g)?' # 영양성분표는 정해진 규격이 있어, 네 영양소를 구분 짓는 글자로 파싱
     # nutrient_pattern = r'(\W?(탄수화물|단백질|(?<![가-힣])지방)\W?)\s*([\d.]+)\s*([a-zA-Z]+)'
     kcal_pattern = r'(\d+)\s*kcal'
 

diff --git a/utils/nutrition_runner.py b/utils/nutrition_runner.py
@@ -2,7 +2,7 @@
 
 from hanspell import spell_checker
 from utils.image_preprocess import PreProcessor
-from utils.nutrition_parser import parse_nutrients_from_text
+from utils.nutrition_parser import parse_nutrients_from_text, correct_ocr_text
 from utils.pororo_ocr import PororoOcr
 from fastapi import HTTPException
 
@@ -21,15 +21,16 @@ def nutrition_run(image):
     realdata = ""
     for d in text:
         realdata += d
-    print('target string for parsing: ', realdata)
+    print('before correcting: ', realdata)
 
     final_key = {'칼로리', '탄수화물', '단백질', '지방'}
     final_dict = {key: -1 for key in final_key}
 
     if not realdata:
         return False
     else:
-        nutrient_dict = parse_nutrients_from_text(realdata)
+        correct_text = correct_ocr_text(realdata)
+        nutrient_dict = parse_nutrients_from_text(correct_text)
         for key in final_key:
             final_dict[key] = nutrient_dict.get(key, -1)