-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain_Fusion.py
275 lines (220 loc) · 12 KB
/
main_Fusion.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
import os
import re

from dotenv import load_dotenv
from flask import Flask, request, jsonify
from openpyxl import load_workbook, Workbook
# Importing necessary modules from llama_index
from llama_index.core import VectorStoreIndex, get_response_synthesizer, Document, StorageContext
from llama_index.core import PromptTemplate
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.query_engine import CustomQueryEngine
from llama_index.core.response.pprint_utils import pprint_response
from llama_index.core.response_synthesizers import BaseSynthesizer
from llama_index.core.retrievers import BaseRetriever, VectorIndexRetriever, QueryFusionRetriever
from llama_index.llms.openai import OpenAI
from llama_index.retrievers.bm25 import BM25Retriever
# Module-level Flask application object; route handlers in this file are
# registered on it through the @app.route decorator.
app = Flask(__name__)
def extractor(file_path):
    """
    Extract the text columns of an Excel file as a list of dictionaries.

    The first row of the active sheet is treated as the header. Every later
    row that contains at least one non-empty cell becomes one dictionary
    with the keys 'title', 'abstract', 'description' and 'claims', read from
    the columns headed 'Title', 'Abstract', 'English description' and
    'Claims' respectively.

    :param file_path: Path to the Excel file.
    :return: List of dictionaries, one per non-empty data row. An empty list
        is returned when the sheet has no rows at all.
    :raises KeyError: If a data row is processed and one of the required
        column headers is not present in the first row.
    """
    # Output key -> header text expected in the first row of the sheet
    columns = {
        'title': 'Title',
        'abstract': 'Abstract',
        'description': 'English description',
        'claims': 'Claims',
    }

    # Load the workbook in read-only mode
    wb = load_workbook(filename=file_path, read_only=True)
    try:
        ws = wb.active  # Select the active sheet
        rows = ws.iter_rows(values_only=True)

        # Get the header row; a completely empty sheet yields no rows
        header = next(rows, None)
        if header is None:
            return []
        # Map column names to indices
        col_index = {name: index for index, name in enumerate(header)}

        data = []
        # The iterator has consumed the header, so this starts at row 2
        for row in rows:
            if all(cell is None for cell in row):  # Skip empty rows
                continue
            record = {}
            for key, header_name in columns.items():
                position = col_index[header_name]
                # A row may hold fewer cells than the header; treat the
                # missing trailing cells as empty instead of failing.
                record[key] = row[position] if position < len(row) else None
            data.append(record)
        return data
    finally:
        # Read-only workbooks keep the file open until closed explicitly
        wb.close()
def newFileSaver(relevancy, file_path):
    """
    Write relevancy results into the Excel file and export the relevant rows.

    Two adjacent columns, 'Relevancy predicted' and 'Comments made', are
    added to the active sheet of ``file_path`` and the file is saved in
    place. A second workbook containing the header row plus every row whose
    relevancy value is 'R' is saved next to it as ``<name>_filtered.xlsx``.

    The n-th entry of ``relevancy`` is written to sheet row ``n + 1`` (the
    first entry goes to row 2), so the caller must pass the entries in the
    same order as the data rows, with no gaps.

    :param relevancy: Iterable of ``(relevancy, comment)`` tuples.
    :param file_path: Path to the Excel file.
    :return: Tuple ``(file_path, new_file_path)``: the updated file and the
        filtered copy.
    """
    workbook = load_workbook(filename=file_path)
    sheet = workbook.active  # Select the active worksheet
    relevancy_header = 'Relevancy predicted'
    comments_header = 'Comments made'

    # Two neighbouring columns are written, so pick the first header cell
    # that is empty AND whose right-hand neighbour is empty (or absent).
    # Checking only a single cell could overwrite an existing column when
    # the header row has a one-cell gap.
    header_values = [cell.value for cell in sheet[1]]
    empty_column = sheet.max_column + 1
    for position, value in enumerate(header_values, start=1):
        if position < len(header_values):
            neighbour = header_values[position]
        else:
            neighbour = None
        if value is None and neighbour is None:
            empty_column = position
            break

    # Add headers to the first row of the new columns
    sheet.cell(row=1, column=empty_column, value=relevancy_header)
    sheet.cell(row=1, column=empty_column + 1, value=comments_header)

    # Add the values from the relevancy list to the new columns
    for row_idx, (status, comment) in enumerate(relevancy, start=2):
        sheet.cell(row=row_idx, column=empty_column, value=status)
        sheet.cell(row=row_idx, column=empty_column + 1, value=comment)

    # Save the updated workbook
    workbook.save(filename=file_path)

    # Create a new workbook for entries with 'R'
    new_workbook = Workbook()
    new_sheet = new_workbook.active

    # Copy headers to the new workbook
    for col_num, cell in enumerate(sheet[1], 1):
        new_sheet.cell(row=1, column=col_num, value=cell.value)

    # Filter rows with 'R' and copy them to the new workbook
    new_row_idx = 2
    for row in sheet.iter_rows(min_row=2, values_only=False):
        if row[empty_column - 1].value != 'R':
            continue
        for col_num, cell in enumerate(row, 1):
            new_sheet.cell(row=new_row_idx, column=col_num, value=cell.value)
        new_row_idx += 1

    # Save the new workbook
    new_file_path = os.path.splitext(file_path)[0] + '_filtered.xlsx'
    new_workbook.save(filename=new_file_path)
    return file_path, new_file_path
def extract_reason(text):
    """
    Extract the reason from the response text.

    Everything after the first ``Reason:`` marker is returned with
    surrounding whitespace removed. Whitespace between ``Reason`` and the
    colon is tolerated, so both ``Reason: ...`` and ``Reason : ...`` are
    recognised.

    :param text: Text to extract the reason from.
    :return: Extracted reason, or an empty string when no marker is present.
    """
    parts = re.split(r"Reason\s*:\s*", text, maxsplit=1)
    return parts[1].strip() if len(parts) > 1 else ""
def extract_related(text):
    """
    Report whether the response text carries the '1R1' relevancy marker.

    :param text: Text to check for relation.
    :return: True if the marker occurs anywhere in the text, otherwise False.
    """
    marker = '1R1'
    if marker in text:
        return True
    return False
def backend(dict_item, user_query):
    """
    Process the given dictionary and user query to predict relevancy.

    Each key/value pair of ``dict_item`` becomes one Document. The documents
    are split into chunks and searched with a vector retriever and a BM25
    retriever combined through a QueryFusionRetriever. The retrieved text is
    inserted into a few-shot prompt and sent to the OpenAI model; the answer
    is parsed with ``extract_related`` and ``extract_reason``.

    :param dict_item: Dictionary containing document data.
    :param user_query: User query string (the 'Statement' in the prompt).
    :return: Tuple ``(related, reason)``: whether the answer contains the
        '1R1' marker, and the reason text extracted from the answer.
    :raises ValueError: If LLAMA_CLOUD_API_KEY is not set in the environment.
    """
    load_dotenv()  # Load environment variables from .env file
    # NOTE(review): the key is only checked for presence; nothing in this
    # function passes it on. Presumably the OpenAI client reads its own key
    # from the environment - confirm which keys are really required.
    if os.getenv('LLAMA_CLOUD_API_KEY') is None:
        raise ValueError("LLAMA_CLOUD_API_KEY not found in environment variables")

    # Create Document objects from dictionary items
    documents = [Document(text=f"{key}: {val}") for key, val in dict_item.items()]

    # Create and configure the VectorStoreIndex
    splitter = SentenceSplitter(chunk_size=128, chunk_overlap=5)
    index = VectorStoreIndex.from_documents(documents, transformations=[splitter])

    class RAGStringQueryEngine(CustomQueryEngine):
        """
        Custom Query Engine using RAG (Retrieval-Augmented Generation) approach.
        """
        retriever: BaseRetriever
        response_synthesizer: BaseSynthesizer
        llm: OpenAI
        qa_prompt: PromptTemplate

        def custom_query(self, query_str: str):
            """
            Retrieve context for the query and ask the LLM for a verdict.

            :param query_str: Query string.
            :return: Response string.
            """
            nodes = self.retriever.retrieve(query_str)
            context_str = "\n\n".join([n.node.get_content() for n in nodes])
            # Use the prompt stored on the engine, not a variable captured
            # from the enclosing function, so the declared field is honoured.
            response = self.llm.complete(
                self.qa_prompt.format(context_str=context_str, query_str=query_str)
            )
            return str(response)

    # Configure retrievers
    retriever1 = VectorIndexRetriever(index=index, similarity_top_k=5)
    nodes = splitter.get_nodes_from_documents(documents)
    retriever2 = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=5)
    retriever = QueryFusionRetriever(
        [retriever1, retriever2],
        similarity_top_k=5,
        num_queries=4,
        mode="reciprocal_rerank",
        use_async=True,
        verbose=True,
    )
    response_synthesizer1 = get_response_synthesizer(response_mode="tree_summarize")

    # Few-shot prompt. Three formatting defects of the earlier text are
    # fixed here: example 2 now uses "Reason:" like the other examples (the
    # answer parser looks for that marker), and the final instruction,
    # "Document:" and "Output:"/"Reason:" cues are each on their own line
    # instead of being run together.
    qa_prompt = PromptTemplate(
        "You are an AI assistant that predicts relevancy of a 'Document' with a certain 'Statement'. If it is even a little relevant then return output as '1R1', otherwise '0R0'. If output is '1R1', then state the 'Reason' which makes it relevant with the help of information present in 'Document'. \n"
        "For example 1:\n"
        "Document:" + ''' title: Composition, application of the composition, cosmetic preparation hydrogel bio-mask in the form of a compress, method of manufacturing the preparation
Background of the invention.
hydrogel bio-mask composed of natural materials and active ingredients, designed for cosmetic applications to enhance skin health. The hydrogel matrix provides a natural and effective medium for delivering active ingredients to the skin. the composition of the hydrogel bio-mask and its natural active ingredients. The following are the key points regarding the specific ingredients mentioned
Hydrogel Matrix: The document emphasizes the use of a hydrogel matrix obtained from natural sources. Natural Active Ingredients: The hydrogel bio-mask includes various natural active ingredients intended for cosmetic use.''' + "\n"
        "Statement:" + "Mannuronic acid or avocado or alginate from algae should be used in a cosmetic formulation for any skin claim" + "\n"
        "Output: '0R0' \n"
        "Reason: It is not mentioning the use of Mannuronic acid, alginate, or avocado but having skin claim for cosmetics \n"
        "For example 2:\n"
        "Document:" + ''' the use of mannuronic acid derivatives and alginate from algae in cosmetic formulations aimed at improving skin health by providing anti-photoaging benefits, moisture retention, antioxidant protection, and enzyme inhibition. The derivatives form an invisible film on the skin, protecting against UV damage and maintaining a moist environment. They exhibit strong antioxidant capabilities and inhibit enzymes like tyrosinase and elastase, reducing melanin production and collagen degradation.
The primary focus of the patent is on alginate oligosaccharide derivatives derived from brown algae. These are used for their moisture absorption, antioxidation, and enzyme inhibition properties in skincare products. ''' + "\n"
        "Statement:" + "Mannuronic acid or avocado or alginate from algae should be used in a cosmetic formulation for any skin claim" + "\n"
        "Output: '1R1' \n"
        "Reason: Mannuronic acid and alginate from algae is used for different skin claims in a cosmetic product \n"
        "For example 3:\n"
        "Document:" + ''' title: Use of brown algae water extract for preparing blue light resistant skin external product
Background of the invention.
using brown algae extract containing fucoidan for preparing topical skin care products that protect against blue light exposure. These products aim to improve skin health by reducing wrinkles and enhancing brightness, particularly for individuals frequently exposed to blue light. The invention emphasizes the benefits of fucoidan in long-term skin care.
The present invention provides a use of a brown algae extract for preparing a skin topical product for anti-blue light, wherein the product is provided to a subject exposed to blue light, and the brown algae extract contains fucoidan.''' + "\n"
        "Statement:" + "Mannuronic acid or avocado or alginate from algae should be used in a cosmetic formulation for any skin claim" + "\n"
        "Output: '1R1' \n"
        "Reason: Alginate from Brown Algae is used for protecting against blue light in skincare products \n"
        "Using the below given Document and Statement , provide the Output and Reason\n"
        "Document: {context_str}\n"
        "Statement: {query_str}\n"
        "Output: \n"
        "Reason: "
    )

    llm = OpenAI(model="gpt-3.5-turbo")
    query_engine = RAGStringQueryEngine(
        retriever=retriever,
        response_synthesizer=response_synthesizer1,
        llm=llm,
        qa_prompt=qa_prompt,
    )
    response = query_engine.query(user_query)
    pprint_response(response, show_source=True)

    response_str = str(response)
    related = extract_related(response_str)
    reason = extract_reason(response_str)
    return related, reason
@app.route("/", methods=['GET', 'POST'])
def process_file():
    """
    Endpoint to process an Excel file against a user query.

    Expects a JSON object body with the keys 'query' (the statement to test
    relevancy against) and 'file_path' (path of the Excel file to process).
    Every extracted row is scored with ``backend``; the results are written
    back to the file and a filtered copy is produced by ``newFileSaver``.

    :return: JSON response with 'Path' (the updated file) and 'FilteredPath'
        (the filtered copy). A JSON 'error' message is returned with status
        400 when the body or one of the two fields is missing or invalid,
        and with status 404 when the file does not exist.
    """
    # silent=True turns a missing or malformed JSON body into None so it can
    # be reported as a client error instead of failing on data.get below.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': "Request body must be a JSON object with 'query' and 'file_path'"}), 400

    query = data.get('query')
    file_path = data.get('file_path')
    if not isinstance(query, str) or not query:
        return jsonify({'error': "'query' must be a non-empty string"}), 400
    if not isinstance(file_path, str) or not file_path:
        return jsonify({'error': "'file_path' must be a non-empty string"}), 400

    # NOTE(security): file_path comes straight from the request and the file
    # is read and overwritten as-is. If this endpoint is reachable by
    # untrusted clients, restrict the path to a dedicated directory.
    if not os.path.isfile(file_path):
        return jsonify({'error': f"File not found: {file_path}"}), 404

    datalist = extractor(file_path)
    if datalist and datalist[-1]['title'] is None:
        datalist.pop()  # Remove the last item if it has no title

    relevancy = []  # (status, reason) per row, in row order
    # Process each dictionary item
    for dict_item in datalist:
        related, reason = backend(dict_item, query)
        status = "R" if related else "NR"
        relevancy.append((status, reason))

    outputFilePath, newFilePath = newFileSaver(relevancy, file_path)
    return jsonify({'Path': outputFilePath, 'FilteredPath': newFilePath})
if __name__ == "__main__":
    # The server listens on all interfaces (0.0.0.0). Flask's debug mode
    # enables an interactive debugger that can execute arbitrary code, so it
    # is off unless explicitly requested with FLASK_DEBUG=1 (or true/yes/on).
    debug_enabled = os.getenv('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    app.run(host='0.0.0.0', port=5000, debug=debug_enabled)