-
Notifications
You must be signed in to change notification settings - Fork 9
/
utils.py
137 lines (114 loc) · 4.22 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import re
import json
import numpy as np
import functools
# Feature specifications, loaded once at import time from the current working
# directory. extract_features reads each value's 'mode' and 'label_sources'.
# JSON text is UTF-8 by specification, so the encoding is pinned instead of
# being left to the platform's locale default.
with open("per_section.json", encoding="utf-8") as f:
    json_data = json.load(f)

# Phrase templates, indexed below as all_phrases[section][group] and expected
# to yield a list of phrase strings for each group.
with open("all_phr.json", encoding="utf-8") as f:
    all_phrases = json.load(f)
# Coarse view category names. Not referenced by any other code visible in this
# file; presumably consumed by importers of this module (confirm with callers).
COARSE_VIEWS = [
    "A2C",
    "A3C",
    "A4C",
    "A5C",
    "Apical_Doppler",
    "Doppler_Parasternal_Long",
    "Doppler_Parasternal_Short",
    "Parasternal_Long",
    "Parasternal_Short",
    "SSN",
    "Subcostal",
]
# Report section headers. structure_rep walks this list in order, so the order
# here is the order of sections in the structured report it produces.
ALL_SECTIONS = [
    "Left Ventricle",
    "Resting Segmental Wall Motion Analysis",
    "Right Ventricle",
    "Left Atrium",
    "Right Atrium",
    "Atrial Septum",
    "Mitral Valve",
    "Aortic Valve",
    "Tricuspid Valve",
    "Pulmonic Valve",
    "Pericardium",
    "Aorta",
    "IVC",
    "Pulmonary Artery",
    "Pulmonary Veins",
    "Postoperative Findings",
]
# Section name -> list of that section's phrase groups (group keys dropped,
# group order kept). Despite its name, t_list is a dict.
t_list = {
    section: list(groups.values())
    for section, groups in all_phrases.items()
}

# Section name -> one flat list holding every group's phrases, in order.
# A flattening comprehension replaces reduce(a + b) for two reasons:
#   * a section with no groups yields [] instead of raising TypeError
#     (reduce without an initializer fails on an empty sequence);
#   * the result is always a new list, never the very list object stored in
#     all_phrases (reduce returns its single element unchanged).
phrases_per_section_list = {
    section: [phrase for group in groups for phrase in group]
    for section, groups in t_list.items()
}
# These two strings are *replacement templates* passed to re.sub by
# make_it_regex; they are not compiled as patterns themselves. re.sub turns
# each doubled backslash of a replacement into a single one, so the regex text
# that ends up inserted is:
#   <numerical>  ->  (\d+(\.\d+)?)     an integer or a decimal number
#   <string>     ->  \b\w+.*?(?=\.)    from a word start, lazily up to the next '.'
numerical_pattern = r'(\\d+(\\.\\d+)?)'
string_pattern = r'\\b\\w+.*?(?=\\.)'
def isin(phrase, text):
    """Return True if *phrase* occurs anywhere in *text*, ignoring case.

    Both strings are lower-cased before a plain substring test.
    """
    needle = phrase.lower()
    haystack = text.lower()
    return needle in haystack
def extract_section(report, section_header):
    """Return one section of a report, from its header up to the next ``[SEP]``.

    Args:
        report: Report text in which sections end with the literal marker
            ``[SEP]``.
        section_header: Header text to look for. It is matched literally and
            case-sensitively; regex metacharacters in it are escaped.

    Returns:
        ``section_header`` followed by the shortest text that leads to a
        ``[SEP]`` and then ``"[SEP]"`` itself, taken from the first place the
        header occurs with a ``[SEP]`` later on the same line ('.' does not
        match a newline). If there is no such place, the string
        ``"Section not found."`` is returned.
    """
    # Escaping keeps headers such as "LV (basal)" or "A+B" from being read as
    # regex syntax, which would miss the section or shift the capture group.
    pattern = rf"{re.escape(section_header)}(.*?)(?=\[SEP\])"
    match = re.search(pattern, report)
    if match is None:
        return "Section not found."
    # The lookahead leaves [SEP] unconsumed, so it is appended back here.
    return f"{section_header}{match.group(1)}[SEP]"
def _first_numeric_value(report, label_sources):
    """Return the first number found after any label-source prefix, else NaN."""
    for phrase in label_sources:
        # Only the text before the "<#>" placeholder is used as the prefix.
        prefix = phrase.split("<#>")[0]
        # The number is captured by name so that a capturing group inside the
        # prefix cannot be mistaken for it (group(1) would be the prefix's).
        pattern = re.compile(
            prefix + r"(?P<value>\d{1,3}(?:\.\d{1,2})?)", re.IGNORECASE
        )
        found = pattern.search(report)
        if found:
            return float(found.group("value"))
    return np.nan


def extract_features(report: str) -> list:
    """Extract one feature per entry of ``json_data`` from a report.

    Entries are visited in ``json_data`` order and handled by their 'mode':

    * ``"regression"``: each 'label_sources' phrase is cut at ``"<#>"`` and
      used as a case-insensitive regex prefix. The number that directly
      follows it (1-3 digits, optionally a '.' and 1-2 decimals) becomes a
      float. The first phrase that matches wins; ``np.nan`` if none does.
    * ``"binary"``: 1 if any 'label_sources' phrase occurs in the report
      according to ``isin``, otherwise 0.

    Entries with any other mode contribute nothing to the result.

    Args:
        report: Report text to scan.

    Returns:
        List of floats (regression) and ints (binary). The original note says
        21 features; the actual count depends on per_section.json.
    """
    features = []
    for spec in json_data.values():
        mode = spec['mode']
        if mode == "regression":
            features.append(_first_numeric_value(report, spec['label_sources']))
        elif mode == "binary":
            present = any(isin(phrase, report) for phrase in spec['label_sources'])
            features.append(1 if present else 0)
    return features
def make_it_regex(sec):
    """Compile a list of phrase templates into one case-insensitive regex.

    Each phrase has '(', ')' and '+' escaped, then its ``<numerical>`` and
    ``<string>`` placeholders replaced by the regex fragments derived from
    ``numerical_pattern`` and ``string_pattern``. Other regex metacharacters
    in a phrase (for example '.') are left as they are and keep their regex
    meaning. The resulting phrases are joined with '|'.

    Args:
        sec: Iterable of phrase template strings. It is not modified, so the
            function can be called repeatedly on the same list without
            escaping the phrases twice.

    Returns:
        A compiled pattern matching any one of the phrases.
    """
    alternatives = []
    for phrase in sec:
        # Raw strings: '\(' in a normal literal is an invalid escape sequence.
        escaped = phrase.replace('(', r'\(').replace(')', r'\)').replace('+', r'\+')
        # The pattern constants are re.sub replacement templates; re.sub
        # collapses their doubled backslashes into single ones.
        escaped = re.sub(r'<numerical>', numerical_pattern, escaped)
        escaped = re.sub(r'<string>', string_pattern, escaped)
        alternatives.append(escaped)
    return re.compile('|'.join(alternatives), flags=re.IGNORECASE)
# One compiled alternation regex per section, built at import time from that
# section's flat phrase list and keyed by the same section name.
regex_per_section = {
    section: make_it_regex(phrases)
    for section, phrases in phrases_per_section_list.items()
}
def remove_subsets(strings):
    """Drop every string that is contained in a string kept before it.

    The input is scanned once, in order. A string is kept unless it is a
    substring of (or equal to) one that has already been kept. The check only
    looks backwards: a short string that precedes a longer one containing it
    is kept, and the first element is always kept.

    Args:
        strings: Iterable of strings.

    Returns:
        A new list with the kept strings in their original order.
    """
    kept = []
    for candidate in strings:
        for earlier in kept:
            if candidate in earlier:
                break
        else:
            # No earlier kept string contains this one.
            kept.append(candidate)
    return kept
def structure_rep(rep):
    """Rewrite a report as the known phrases found in each of its sections.

    Runs of two or more whitespace characters are first collapsed to a single
    space. Then, for every header in ``ALL_SECTIONS`` (in that order), the
    section is cut out with ``extract_section`` and scanned with that
    section's regex from ``regex_per_section``. If anything matched, the
    output receives ``"<header>:"``, the matched texts (filtered through
    ``remove_subsets``) and ``"[SEP]"``.

    Sections that are absent from the report, or in which no phrase matched,
    are left out entirely.

    Args:
        rep: Raw report text with ``[SEP]``-terminated sections.

    Returns:
        The structured report: all collected parts joined by single spaces
        (an empty string if nothing matched).
    """
    # Collapse whitespace runs so phrase regexes see single spaces.
    rep = re.sub(r'\s{2,}', ' ', rep)
    parts = []
    for sec in ALL_SECTIONS:
        cur_section = extract_section(rep, sec)
        # extract_section signals a missing section with this sentinel. It is
        # not report text, so it must not be scanned: a <string> phrase would
        # otherwise match "Section not found" and emit it as a finding.
        if cur_section == "Section not found.":
            continue
        new_section = [sec + ":"]
        new_section.extend(
            found.group(0) for found in regex_per_section[sec].finditer(cur_section)
        )
        if len(new_section) > 1:
            # Drop phrases contained in an earlier entry (header included).
            new_section = remove_subsets(new_section)
            new_section.append("[SEP]")
            parts += new_section
    return ' '.join(parts)