Skip to content

Commit

Permalink
Merge pull request #23 from CAUSOLDOUTMEN/feat/17-fuzzywuzzy
Browse files Browse the repository at this point in the history
Feat: fuzzywuzzy 도입
  • Loading branch information
synoti21 authored Nov 12, 2023
2 parents ed63e5d + 4ad1303 commit 1e358a8
Show file tree
Hide file tree
Showing 5 changed files with 21 additions and 7 deletions.
4 changes: 1 addition & 3 deletions image_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,7 @@ class TestOCRAccuracy(unittest.TestCase):
'test7.jpg': {'칼로리': 505, '탄수화물': 84, '지방': 15, '단백질': 9},
'test8.jpeg': { '칼로리': 200, '탄수화물': 43, '지방': 11, '단백질': 4},
'test9.jpeg': {'칼로리': 525, '탄수화물': 69, '지방': 25, '단백질': 6},
'test10.jpeg': {'칼로리': 505, '탄수화물': 78, '지방': 17, '단백질':10},
'test11.jpeg': {'칼로리': 510, '탄수화물': 52, '지방': 32, '단백질': 3},
'test12.jpeg': {'칼로리': 395, '탄수화물': 46, '지방': 21, '단백질': 6},
'test10.jpg': {'칼로리': 505, '탄수화물': 78, '지방': 17, '단백질':10},
'test13.jpeg': {'칼로리': 230, '탄수화물': 24, '지방': 14, '단백질': 2},
}

Expand Down
Binary file removed test_image/input/test12.jpeg
Binary file not shown.
Binary file modified test_image/output/cropped_table_enhanced.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
17 changes: 16 additions & 1 deletion utils/nutrition_parser.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,22 @@
import re
from fuzzywuzzy import process

def correct_ocr_text(text):
    """Snap OCR-misread nutrient labels in ``text`` to their canonical spelling.

    For each canonical label (칼로리, 탄수화물, 단백질, 지방) the
    whitespace-separated tokens of ``text`` are fuzzy-matched against it.
    The best-scoring token whose length differs from the label by at most
    one character is rewritten to the label; at most one distinct token is
    corrected per label.

    Args:
        text: Raw OCR output as a single string.

    Returns:
        A copy of ``text`` with the matched tokens replaced by the canonical
        labels. All other characters, including whitespace, are unchanged.
    """
    target_words = ["칼로리", "탄수화물", "단백질", "지방"]
    corrected_text = text

    # Candidates always come from the original text; split once instead of
    # once per label.
    tokens = text.split()

    # With no tokens there is nothing to match against, so skip the matcher.
    if tokens:
        for word in target_words:
            # extractBests yields (token, score) pairs, best score first,
            # keeping only those scoring at least 75.
            extracted_words = process.extractBests(
                word, tokens, score_cutoff=75, limit=10
            )
            for extracted_word, _score in extracted_words:
                # The length guard rejects tokens that merely contain the
                # label (e.g. a label glued to a number or unit).
                if abs(len(extracted_word) - len(word)) <= 1:
                    # Replace whole tokens only. A plain substring replace
                    # would also rewrite the token where it occurs inside
                    # other words and corrupt text that was already correct.
                    token_pattern = (
                        r'(?<!\S)' + re.escape(extracted_word) + r'(?!\S)'
                    )
                    corrected_text = re.sub(
                        token_pattern,
                        lambda _match, label=word: label,
                        corrected_text,
                    )
                    break

    print(f'after correcting: {corrected_text}')
    return corrected_text


def parse_nutrients_from_text(text):
nutrient_pattern = r'(물|질|방|류)\s*(\d+(?:\.\d+)?)(?:\s*g)?'
nutrient_pattern = r'(물|질|방|류)\s*(\d+(?:\.\d+)?)(?:\s*g)?' # 영양성분표는 정해진 규격이 있어, 네 영양소를 구분 짓는 글자로 파싱
# nutrient_pattern = r'(\W?(탄수화물|단백질|(?<![가-힣])지방)\W?)\s*([\d.]+)\s*([a-zA-Z]+)'
kcal_pattern = r'(\d+)\s*kcal'

Expand Down
7 changes: 4 additions & 3 deletions utils/nutrition_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from hanspell import spell_checker
from utils.image_preprocess import PreProcessor
from utils.nutrition_parser import parse_nutrients_from_text
from utils.nutrition_parser import parse_nutrients_from_text, correct_ocr_text
from utils.pororo_ocr import PororoOcr
from fastapi import HTTPException

Expand All @@ -21,15 +21,16 @@ def nutrition_run(image):
realdata = ""
for d in text:
realdata += d
print('target string for parsing: ', realdata)
print('before correcting: ', realdata)

final_key = {'칼로리', '탄수화물', '단백질', '지방'}
final_dict = {key: -1 for key in final_key}

if not realdata:
return False
else:
nutrient_dict = parse_nutrients_from_text(realdata)
correct_text = correct_ocr_text(realdata)
nutrient_dict = parse_nutrients_from_text(correct_text)
for key in final_key:
final_dict[key] = nutrient_dict.get(key, -1)

Expand Down

0 comments on commit 1e358a8

Please sign in to comment.