forked from jpsanchezg/My-heroes-gene-challenge
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkeyword_extraction.py
57 lines (43 loc) · 1.5 KB
/
keyword_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from PIL import Image
from pytesseract import pytesseract
import os
import spacy
# Location of the Tesseract executable handed to pytesseract. This is a
# hard-coded Windows install path and must be adjusted on other machines.
pytesseract.tesseract_cmd = r"C:/Program Files/Tesseract-OCR/tesseract.exe"
# Image processed by this script; read by img_to_txt() and gene_interest().
# NOTE(review): absolute, user-specific path -- confirm before running elsewhere.
image_path = r"C:\Users\Paula\UofTHacks\dataset\VarientxUofTHacks Gene Resource\ARSE.png"
def img_to_txt():
    """Run OCR on the image at ``image_path`` and return the recognised text.

    The image is first opened with PIL only as a check that the file can be
    opened as an image; the handle is closed again before OCR runs, because
    ``pytesseract.image_to_string`` is given the file path rather than the
    opened image.

    Returns:
        The value returned by ``pytesseract.image_to_string(image_path)``.

    Raises:
        Whatever ``Image.open`` or ``pytesseract.image_to_string`` raise;
        nothing is caught here.
    """
    # Use a context manager so the file handle is released immediately.
    # Previously the opened image was bound to an unused local and never
    # closed.
    with Image.open(image_path):
        print('image is loaded')
    return pytesseract.image_to_string(image_path)
def gene_interest(path=None):
    """Print and return the name encoded at the start of an image file name.

    The name is the file's base name cut at the first ``.`` and then at the
    first ``_`` if one is present, so ``ARSE_2.png`` yields ``ARSE``.

    Args:
        path: Path of the image file. When omitted, the module-level
            ``image_path`` is used, which matches the original no-argument
            behaviour.

    Returns:
        The extracted name. It is also printed, as before.
    """
    if path is None:
        path = image_path
    base_name = os.path.basename(path)
    # partition() returns the whole string when the separator is absent, so
    # this covers both the "has underscore" and "no underscore" cases.
    gene_name = base_name.partition('.')[0].partition('_')[0]
    print(gene_name)
    return gene_name
def extract_keywords(text):
    """Return the unique named-entity strings found in the report text.

    The text is split into paragraphs on blank lines (``"\\n\\n"``).
    Everything from the first paragraph that mentions "result" or "summary"
    (case-insensitive) onward is run through the spaCy
    ``en_ner_bionlp13cg_md`` pipeline. If no paragraph matches, the whole
    text is processed.

    Args:
        text: Text to analyse, with paragraphs separated by blank lines.

    Returns:
        A list of entity strings in order of first appearance, with trailing
        whitespace stripped and duplicates removed.
    """
    # Called for its side effect: prints the name derived from the image
    # file name.
    gene_interest()

    paragraphs = text.split('\n\n')

    # The previous test, `('result' or 'summary') in ...`, only ever looked
    # for 'result' because `or` yields its first truthy operand. Check both
    # markers explicitly.
    markers = ('result', 'summary')
    start = 0
    for position, paragraph in enumerate(paragraphs):
        lowered = paragraph.lower()
        if any(marker in lowered for marker in markers):
            start = position
            break

    # With start == 0 this is the full text again, so no separate fallback
    # is needed (the old `!= []` comparison on a string was always true).
    section = '\n\n'.join(paragraphs[start:])

    nlp = spacy.load("en_ner_bionlp13cg_md")
    doc = nlp(section)

    # dict.fromkeys removes duplicates while keeping first-seen order.
    stripped_entities = (str(entity).rstrip() for entity in doc.ents)
    return list(dict.fromkeys(stripped_entities))
if __name__ == "__main__":
    # Script entry point: OCR the configured image and feed the recognised
    # text straight into keyword extraction. The returned list is not used.
    extract_keywords(img_to_txt())