"""
main.py

Flask service that reads records from an Excel workbook, asks an
OpenAI-backed llama_index RAG pipeline whether each record is relevant to a
user query, and writes the verdicts back into the workbook.
"""
# Import necessary libraries
import os
import re

import qdrant_client
from dotenv import load_dotenv
from flask import Flask, request, jsonify
from openpyxl import load_workbook, Workbook
# Import llama_index components
from llama_index.core import VectorStoreIndex, get_response_synthesizer
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core import Document
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.response.pprint_utils import pprint_response
from llama_index.core.query_engine import CustomQueryEngine
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.response_synthesizers import BaseSynthesizer
from llama_index.llms.openai import OpenAI
from llama_index.core import PromptTemplate
# Create the Flask application object. The route decorator on process_file and
# the __main__ guard at the bottom of this file both use this `app` instance.
app = Flask(__name__)
def extractor(file_path):
    """
    Extracts data from an Excel file.

    Reads the active sheet, treats its first row as the header and turns every
    row that is not completely empty into a dictionary built from the
    'Title', 'Abstract', 'English description' and 'Claims' columns.

    Args:
        file_path (str): The path to the Excel file.
    Returns:
        list: A list of dictionaries with the keys 'title', 'abstract',
            'description' and 'claims'. Empty if the sheet yields no rows.
    Raises:
        KeyError: If any of the required columns is missing from the header.
    """
    # Output key -> header text expected in the first row of the sheet.
    wanted = {
        'title': 'Title',
        'abstract': 'Abstract',
        'description': 'English description',
        'claims': 'Claims',
    }

    # Read-only workbooks keep the underlying file open until close() is
    # called, so make sure it is released on every exit path.
    wb = load_workbook(filename=file_path, read_only=True)
    try:
        ws = wb.active
        # A single pass over the sheet: the first row is the header, the
        # remaining rows are data.
        rows = ws.iter_rows(values_only=True)
        header = next(rows, None)
        if header is None:
            return []

        col_index = {name: index for index, name in enumerate(header)}
        missing = [name for name in wanted.values() if name not in col_index]
        if missing:
            raise KeyError(f"Missing required column(s): {', '.join(missing)}")
        positions = {key: col_index[name] for key, name in wanted.items()}

        data = []
        for row in rows:
            if all(cell is None for cell in row):
                continue  # Skip rows with all None values
            # Rows can be shorter than the header when the sheet carries no
            # dimension information; treat the absent cells as empty.
            data.append({
                key: (row[pos] if pos < len(row) else None)
                for key, pos in positions.items()
            })
        return data
    finally:
        wb.close()
def _first_free_column_pair(header_values):
    """Return the 1-based index of the first empty header cell whose right
    neighbour is also empty (or absent), or None when there is no such cell."""
    width = len(header_values)
    for position, value in enumerate(header_values):
        if value is not None:
            continue
        right_is_free = position + 1 >= width or header_values[position + 1] is None
        if right_is_free:
            return position + 1
    return None


def newFileSaver(relevancy, file_path):
    """
    Saves relevancy data to two new columns in the Excel file and writes a
    second workbook containing only the rows marked 'R'.

    Args:
        relevancy (list): A list of (status, comment) tuples, one per data
            row, written in order starting at sheet row 2.
        file_path (str): The path to the Excel file. It is overwritten in place.
    Returns:
        tuple: (file_path, new_file_path) where new_file_path is file_path
            with its extension replaced by '_filtered.xlsx'.
    """
    workbook = load_workbook(filename=file_path)
    sheet = workbook.active
    relevancy_header = 'Relevancy predicted'
    comments_header = 'Comments made'

    # Two adjacent columns are written, so the slot must have two free header
    # cells; a lone gap in the header would otherwise cause the comments
    # column to overwrite the existing column to its right.
    header_values = [cell.value for cell in sheet[1]]
    empty_column = _first_free_column_pair(header_values)
    if empty_column is None:
        empty_column = sheet.max_column + 1

    # Add headers to the new columns
    sheet.cell(row=1, column=empty_column, value=relevancy_header)
    sheet.cell(row=1, column=empty_column + 1, value=comments_header)

    # Add relevancy data to the new columns
    # NOTE(review): entries are written to consecutive rows from row 2. The
    # extractor in this file skips rows whose cells are all None, so a blank
    # row in the middle of the sheet would shift the results - confirm the
    # input sheets never contain such rows.
    for row_idx, (status, comment) in enumerate(relevancy, start=2):
        sheet.cell(row=row_idx, column=empty_column, value=status)
        sheet.cell(row=row_idx, column=empty_column + 1, value=comment)
    workbook.save(filename=file_path)

    # Create a new workbook for entries with 'R'
    new_workbook = Workbook()
    new_sheet = new_workbook.active

    # Copy headers to the new workbook
    for col_num, cell in enumerate(sheet[1], 1):
        new_sheet.cell(row=1, column=col_num, value=cell.value)

    # Filter rows with 'R' and copy to new workbook
    new_row_idx = 2
    for row in sheet.iter_rows(min_row=2, values_only=False):
        if row[empty_column - 1].value != 'R':
            continue
        for col_num, cell in enumerate(row, 1):
            new_sheet.cell(row=new_row_idx, column=col_num, value=cell.value)
        new_row_idx += 1

    # Save the new workbook
    new_file_path = os.path.splitext(file_path)[0] + '_filtered.xlsx'
    new_workbook.save(filename=new_file_path)
    return file_path, new_file_path
def extract_reason(text):
    """
    Extracts reason text from a given string.

    Everything after the first 'Reason:' marker is returned with surrounding
    whitespace removed. Whitespace around the colon is tolerated, so
    'Reason:', 'Reason: ' and 'Reason : ' are all recognised.

    Args:
        text (str): The input text.
    Returns:
        str: Extracted reason text, or an empty string if no marker is found.
    """
    parts = re.split(r"Reason\s*:\s*", text, maxsplit=1)
    return parts[1].strip() if len(parts) > 1 else ""
def extract_related(text):
    """
    Reports whether the relevancy marker '1R1' occurs anywhere in the text.

    Args:
        text (str): The input text.
    Returns:
        bool: True if '1R1' is found, False otherwise.
    """
    marker = '1R1'
    position = text.find(marker)
    return position != -1
# Few-shot prompt asking the model for a '1R1' / '0R0' verdict plus a reason.
# {context_str} and {query_str} are filled in by RAGStringQueryEngine.
_QA_PROMPT = PromptTemplate(
    "You are an AI assistant that predicts relevancy of a 'Document' with a certain 'Statement'. If it is Relevant then return output as '1R1', otherwise '0R0'. If output is '1R1', then state the 'Reason' which makes it relevant with the help of information present in 'Document'. \n"
    "For example 1:\n"
    "Document:" + ''' title: Composition, application of the composition, cosmetic preparation hydrogel bio-mask in the form of a compress, method of manufacturing the preparation
Background of the invention.
hydrogel bio-mask composed of natural materials and active ingredients, designed for cosmetic applications to enhance skin health. The hydrogel matrix provides a natural and effective medium for delivering active ingredients to the skin. the composition of the hydrogel bio-mask and its natural active ingredients. The following are the key points regarding the specific ingredients mentioned
Hydrogel Matrix: The document emphasizes the use of a hydrogel matrix obtained from natural sources. Natural Active Ingredients: The hydrogel bio-mask includes various natural active ingredients intended for cosmetic use.''' + "\n"
    "Statement:" + "Mannuronic acid or avocado or alginate from algae should be used in a cosmetic formulation for any skin claim" + "\n"
    "Output: '0R0' \n"
    "Reason: It is not mentioning the use of Mannuronic acid, alginate, or avocado but having skin claim for cosmetics \n"
    "For example 2:\n"
    "Document:" + ''' the use of mannuronic acid derivatives and alginate from algae in cosmetic formulations aimed at improving skin health by providing anti-photoaging benefits, moisture retention, antioxidant protection, and enzyme inhibition. The derivatives form an invisible film on the skin, protecting against UV damage and maintaining a moist environment. They exhibit strong antioxidant capabilities and inhibit enzymes like tyrosinase and elastase, reducing melanin production and collagen degradation.
The primary focus of the patent is on alginate oligosaccharide derivatives derived from brown algae. These are used for their moisture absorption, antioxidation, and enzyme inhibition properties in skincare products. ''' + "\n"
    "Statement:" + "Mannuronic acid or avocado or alginate from algae should be used in a cosmetic formulation for any skin claim" + "\n"
    "Output: '1R1' \n"
    # "Reason:" is spelled exactly like the other examples so that the model
    # is shown one consistent marker to reproduce.
    "Reason: Mannuronic acid and alginate from algae is used for different skin claims in a cosmetic product \n"
    "For example 3:\n"
    "Document:" + ''' title: Use of brown algae water extract for preparing blue light resistant skin external product
Background of the invention.
using brown algae extract containing fucoidan for preparing topical skin care products that protect against blue light exposure. These products aim to improve skin health by reducing wrinkles and enhancing brightness, particularly for individuals frequently exposed to blue light. The invention emphasizes the benefits of fucoidan in long-term skin care.
The present invention provides a use of a brown algae extract for preparing a skin topical product for anti-blue light, wherein the product is provided to a subject exposed to blue light, and the brown algae extract contains fucoidan.''' + "\n"
    "Statement:" + "Mannuronic acid or avocado or alginate from algae should be used in a cosmetic formulation for any skin claim" + "\n"
    "Output: '1R1' \n"
    "Reason: Alginate from Brown Algae is used for protecting against blue light in skincare products \n"
    # The trailing newline keeps this instruction from running into "Document:".
    "Using the below given Document and Statement , provide the Output and Reason\n"
    "Document: {context_str}\n"
    "Statement: {query_str}\n"
    "Output: "
    "Reason: "
)


class RAGStringQueryEngine(CustomQueryEngine):
    """
    Custom Query Engine for RAG (Retrieval-Augmented Generation).

    Retrieves nodes for the query, joins their content into one context
    string and sends the formatted prompt to the LLM.
    """

    retriever: BaseRetriever
    # Required field; custom_query below does not use it.
    response_synthesizer: BaseSynthesizer
    llm: OpenAI
    qa_prompt: PromptTemplate

    def custom_query(self, query_str: str):
        """
        Perform a custom query.

        Args:
            query_str (str): The query string.
        Returns:
            str: The response from the query.
        """
        nodes = self.retriever.retrieve(query_str)
        context_str = "\n\n".join(n.node.get_content() for n in nodes)
        # Use the prompt configured on this engine instance rather than a
        # variable that merely happens to be in an enclosing scope.
        prompt = self.qa_prompt.format(context_str=context_str, query_str=query_str)
        response = self.llm.complete(prompt)
        return str(response)


def backend(dict_item, user_query):
    """
    Backend function to process a document and a query.

    Each key/value pair of dict_item becomes one document, the documents are
    split and embedded into an in-memory Qdrant collection, and the query is
    answered by RAGStringQueryEngine over the five most similar chunks.

    Args:
        dict_item (dict): A dictionary representing the document.
        user_query (str): The user's query.
    Returns:
        tuple: (related, reason) - related is True when the model's answer
            contains '1R1'; reason is the text following the 'Reason' marker
            (empty if none was found).
    Raises:
        ValueError: If LLAMA_CLOUD_API_KEY is not set in the environment.
    """
    load_dotenv()  # Load environment variables
    # NOTE(review): nothing below reads this key - the pipeline uses
    # OpenAIEmbedding and the OpenAI LLM. Confirm whether this check should
    # target the OpenAI credentials instead. Kept as-is to preserve behavior.
    if os.getenv('LLAMA_CLOUD_API_KEY') is None:
        raise ValueError("LLAMA_CLOUD_API_KEY not found in environment variables")

    # Create documents from the dictionary item
    documents = [Document(text=f"{key}: {val}") for key, val in dict_item.items()]

    # A fresh in-memory store per call keeps records isolated from each other.
    client = qdrant_client.QdrantClient(location=":memory:")
    vector_store = QdrantVectorStore(client=client, collection_name="test_store")

    # Ingest documents into the vector store
    pipeline = IngestionPipeline(
        transformations=[
            SentenceSplitter(chunk_size=128, chunk_overlap=5),
            OpenAIEmbedding(),
        ],
        vector_store=vector_store,
    )
    pipeline.run(documents=documents)
    index = VectorStoreIndex.from_vector_store(vector_store)

    query_engine = RAGStringQueryEngine(
        retriever=VectorIndexRetriever(index=index, similarity_top_k=5),
        response_synthesizer=get_response_synthesizer(response_mode="tree_summarize"),
        llm=OpenAI(model="gpt-3.5-turbo"),
        qa_prompt=_QA_PROMPT,
    )
    response = query_engine.query(user_query)

    # Print each response on the terminal for inspection.
    pprint_response(response)

    response_str = str(response)
    related = extract_related(response_str)
    reason = extract_reason(response_str)
    return related, reason
@app.route("/", methods=['GET', 'POST'])
def process_file():
    """
    Flask route to process the uploaded file and query.

    Expects a JSON object with the keys 'query' and 'file_path'. Every record
    extracted from the workbook is scored against the query, and the results
    are written back to the workbook and to a filtered copy.

    Returns:
        Response: JSON with 'Path' (the updated file) and 'FilteredPath' (the
            workbook holding only relevant rows). A JSON 'error' message with
            status 400 is returned for a malformed request, and with status
            404 when file_path does not point to an existing file.
    """
    # silent=True yields None instead of raising when the body is missing or
    # is not JSON (for example on a plain GET request).
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return jsonify({'error': "Request body must be a JSON object"}), 400

    query = payload.get('query')
    file_path = payload.get('file_path')
    if not query or not file_path:
        return jsonify({'error': "Both 'query' and 'file_path' are required"}), 400

    # NOTE(review): file_path comes straight from the client and is opened and
    # overwritten on the server. Restrict it to a trusted directory before
    # exposing this endpoint to untrusted callers.
    if not os.path.isfile(file_path):
        return jsonify({'error': f"File not found: {file_path}"}), 404

    datalist = extractor(file_path)
    # Drop a trailing record without a title; guard against an empty sheet.
    if datalist and datalist[-1]['title'] is None:
        datalist.pop()

    # Score every record: 'R' when the backend reports it as related, else 'NR'.
    relevancy = []
    for dict_item in datalist:
        related, reason = backend(dict_item, query)
        relevancy.append(("R" if related else "NR", reason))

    outputFilePath, newFilePath = newFileSaver(relevancy, file_path)
    return jsonify({'Path': outputFilePath, 'FilteredPath': newFilePath})
if __name__ == "__main__":
    # The server listens on all interfaces, and Flask's debug mode exposes an
    # interactive debugger that can execute arbitrary code. Keep it off unless
    # explicitly requested with FLASK_DEBUG=1 (or "true") in the environment.
    debug_enabled = os.getenv('FLASK_DEBUG', '').lower() in ('1', 'true')
    app.run(host='0.0.0.0', port=5000, debug=debug_enabled)