-
Notifications
You must be signed in to change notification settings - Fork 82
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
neo4j GraphRag implementation group 17 #45
Changes from 40 commits
618085d
26b15be
d7e6b7b
d056d0a
2708f2d
2b39d08
0c71380
28a1b03
daacbbb
26f1073
dd21af7
394496c
ce6285d
f165902
52a7e3a
6addb82
1afecb5
24d4179
df3c1c2
bb04dee
c5d683e
dd83948
347d76c
d6101fd
9b7292b
1a3476d
16c81ba
c06deb0
f6206fa
f0460bc
cd03c68
25eb165
d343817
df25d9f
ea02852
bd296f9
3fc6de1
3ef6de4
7152105
ed55e79
fa90a94
6c5012c
ed84166
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
ngrok_token='Your_ngrok_token' | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you remove this? |
||
neo4j_location='your_neo4j_location_uri' | ||
neo4j_user='your_neo4j_user_name' | ||
neo4j_password='your_neo4j_password_here' |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
neo4j | ||
flask | ||
pyngrok | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you remove this? |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,196 @@ | ||
import csv | ||
from neo4j import GraphDatabase | ||
from flask import Flask, jsonify, request | ||
from pyngrok import ngrok | ||
import os | ||
|
||
# Define the Graph_whisperer class to interact with Neo4j | ||
class Graph_whisperer:
    """Thin wrapper around a Neo4j driver used by the Flask endpoints.

    It creates one driver in ``__init__`` and exposes helpers to run
    parameterised Cypher statements, load a CSV file of quotes and topics,
    inspect the graph schema, and execute ad-hoc Cypher queries.
    """

    def __init__(self, uri, user, password):
        """Create the Neo4j driver for ``uri`` using basic ``(user, password)`` auth."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def create_instance(self, payload):
        """Execute every statement in ``payload`` via ``session.execute_write``.

        Args:
            payload: Iterable of dicts, each holding a ``"query"`` entry (Cypher
                text) and a ``"parameters"`` entry (its parameters).

        Returns:
            The last entry of ``payload`` that was executed, or ``None`` when
            ``payload`` is empty.
        """
        with self.driver.session() as session:
            return session.execute_write(self._create_instance, payload)

    def add_document(self, payload):
        """Load the CSV file located at ``payload`` via ``session.execute_write``.

        Args:
            payload: Path of the CSV file to load.

        Returns:
            dict: ``{"message": ...}`` on success or ``{"error": ...}`` when the
            file cannot be read.
        """
        with self.driver.session() as session:
            return session.execute_write(self._add_document, payload)

    def get_meta_schema(self):
        """
        Retrieve detailed schema information, including node labels, properties, and relationship types.

        Returns:
            dict: ``{"nodes": {...}, "relationships": {...}}`` where each inner
            dict maps a label / relationship type to the sorted list of
            property names seen on it.
        """
        with self.driver.session() as session:
            # Retrieve node labels and their properties
            nodes_query = """
            MATCH (n)
            UNWIND labels(n) AS label
            RETURN label, collect(DISTINCT keys(n)) AS properties
            """
            nodes = self._merge_property_keys(session.run(nodes_query), "label")

            # Retrieve relationship types and their properties
            rels_query = """
            MATCH ()-[r]->()
            RETURN type(r) AS type, collect(DISTINCT keys(r)) AS properties
            """
            relationships = self._merge_property_keys(session.run(rels_query), "type")

            return {"nodes": nodes, "relationships": relationships}

    def run_query(self, query):
        """
        Executes a Cypher query against the Neo4j database.

        Args:
            query (str): The Cypher query to execute.

        Returns:
            list: A list of query results, where each result is a dictionary.
        """
        with self.driver.session() as session:
            result = session.run(query)
            return [record.data() for record in result]

    @staticmethod
    def _merge_property_keys(records, name_field):
        """Map each record's ``name_field`` value to the union of its key lists.

        Every record carries a ``"properties"`` entry holding several lists of
        property names; they are flattened into one sorted list so the output
        order is deterministic.
        """
        merged = {}
        for record in records:
            names = set()
            for key_list in record["properties"]:
                names.update(key_list)
            merged[record[name_field]] = sorted(names)
        return merged

    @staticmethod
    def _create_instance(tx, payload):
        """Run each ``{"query", "parameters"}`` entry of ``payload`` on ``tx``.

        Returns:
            The last entry executed, or ``None`` for an empty payload (the
            loop variable would otherwise be unbound).
        """
        last_instance = None
        for instance in payload:
            tx.run(instance["query"], instance["parameters"])
            last_instance = instance
        return last_instance

    @staticmethod
    def _add_document(tx, csv_file_path):
        """
        Loads a CSV file into Neo4j by constructing and executing queries for each row.

        The first argument is the transaction handed over by
        ``session.execute_write``; it is named ``tx`` because this is a static
        method and never receives an instance.

        Args:
            tx: Transaction object exposing ``run(query, parameters)``.
            csv_file_path (str): The path to the CSV file to be loaded. Rows are
                read by header name; the ``quoteText`` and ``topicName``
                columns are used.

        Returns:
            dict: A summary of the import process, including the number of records processed,
            or ``{"error": ...}`` when the file cannot be opened or parsed.
        """
        merge_query = (
            "MERGE (q:Quote {text: $quoteText}) "
            "MERGE (t:Topic {name: $topicName}) "
            "MERGE (q)-[:IS_PART_OF]->(t)"
        )
        try:
            # newline="" lets the csv module handle line endings inside quoted fields.
            with open(csv_file_path, mode="r", encoding="utf-8", newline="") as csvfile:
                payloads = [
                    {
                        "query": merge_query,
                        "parameters": {
                            "quoteText": row.get("quoteText"),
                            "topicName": row.get("topicName"),
                        },
                    }
                    for row in csv.DictReader(csvfile)
                ]
        except (OSError, TypeError, ValueError, csv.Error) as e:
            # Only file/parse problems are reported as a result; database
            # errors below must propagate so the transaction is not treated
            # as successful.
            return {"error": str(e)}

        # Execute all queries in the payload
        Graph_whisperer._create_instance(tx, payloads)
        return {
            "message": f"Successfully loaded {len(payloads)} records into Neo4j."
        }
|
||
|
||
# Flask application object that the route decorators below attach to
app = Flask(__name__)

# Neo4j connection settings come from environment variables of the same name
neo4j_location, neo4j_user, neo4j_password = (
    os.getenv(setting)
    for setting in ('neo4j_location', 'neo4j_user', 'neo4j_password')
)

# Module-level Neo4j wrapper used by the request handlers
neo4j_db = Graph_whisperer(neo4j_location, neo4j_user, neo4j_password)
|
||
|
||
@app.route("/add_instances", methods=["POST"])
def add_instance():
    """Insert the statements from the JSON request body into Neo4j.

    The parsed body is handed unchanged to ``neo4j_db.create_instance``.
    Responds with the value it returns under ``"last inserted instance"``,
    or with an ``"error"`` message and status 500 if anything raises.
    """
    statements = request.get_json()
    try:
        response = jsonify(
            {"last inserted instance": neo4j_db.create_instance(statements)}
        )
    except Exception as exc:
        return jsonify({"error": str(exc)}), 500
    return response
|
||
|
||
@app.route("/add_csv", methods=["POST"])
def add_csv():
    """Load a CSV document into Neo4j.

    The parsed JSON request body is forwarded unchanged to
    ``neo4j_db.add_document``. Responds with the value it returns under
    ``"last inserted instance"``, or with an ``"error"`` message and status
    500 if anything raises.
    """
    document = request.get_json()
    try:
        response = jsonify(
            {"last inserted instance": neo4j_db.add_document(document)}
        )
    except Exception as exc:
        return jsonify({"error": str(exc)}), 500
    return response
|
||
@app.route("/close_db")
def close_db():
    """Close the shared Neo4j connection and report the outcome as JSON."""
    try:
        neo4j_db.close()
        confirmation = jsonify({"message": "Database connection closed."})
    except Exception as exc:
        return jsonify({"error": str(exc)}), 500
    return confirmation
|
||
@app.route("/schema", methods=["GET"])
def get_meta_schema():
    """Return the graph schema reported by ``neo4j_db.get_meta_schema`` as JSON.

    The schema is logged at info level on success; on failure the error is
    logged and returned with status 500.
    """
    try:
        graph_schema = neo4j_db.get_meta_schema()
        app.logger.info(f"Retrieved schema: {graph_schema}")
        response = jsonify(graph_schema)
    except Exception as exc:
        app.logger.error(f"Error retrieving schema: {exc}")
        return jsonify({"error": str(exc)}), 500
    return response
|
||
@app.route("/run_query", methods=["POST"])
def run_query():
    """Run the Cypher query supplied in the request body.

    Reads the ``"query"`` field of the JSON body. A missing or empty query
    yields status 400; otherwise the rows returned by ``neo4j_db.run_query``
    are sent back under ``"results"``. Any exception yields status 500.
    """
    try:
        cypher = request.json.get("query")
        if cypher:
            # Hand the query to the database wrapper and return its rows
            return jsonify({"results": neo4j_db.run_query(cypher)})
        return jsonify({"error": "No query provided"}), 400
    except Exception as exc:
        return jsonify({"error": str(exc)}), 500
|
||
|
||
|
||
if __name__ == "__main__":
    # The ngrok tunnel and the Flask server must use the same port, so it is
    # defined once instead of repeating the literal.
    port = 4000

    # Set the ngrok auth token (read from the environment) and expose the app
    ngrok_token = os.getenv('ngrok_token')
    ngrok.set_auth_token(ngrok_token)
    public_url = ngrok.connect(port)
    print(f"ngrok tunnel available at: {public_url}")

    # Start Flask app
    app.run(host="0.0.0.0", port=port)
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,18 +23,22 @@ The source document that you need to score is the following: | |
{context}" | ||
|
||
data_directory='data' | ||
file_types="pdf,json,docx,pptx,xslx,csv,xml" | ||
file_types="pdf,json,docx,pptx,xslx,csv,xml,txt" | ||
json_schema="." | ||
json_text_content=False | ||
xml_xpath="//" | ||
|
||
max_document_limit=10 | ||
neo4j_location='URL_to_neo4j_server' | ||
vector_store=milvus | ||
vector_store_uri='data.db' | ||
vector_store_collection=ragmeup_documents | ||
vector_store_sparse_uri=bm25_db.pickle | ||
vector_store_initial_load=True | ||
vector_store_k=10 | ||
document_chunks_pickle=rag_chunks.pickle | ||
file_upload_using_llm=True | ||
dynamic_neo4j_schema=False | ||
rerank=True | ||
rerank_k=3 | ||
rerank_model=flashrank | ||
|
@@ -76,8 +80,9 @@ number_of_chunks=None | |
|
||
use_openai=False | ||
openai_model_name='gpt-4o-mini' | ||
use_gemini=False | ||
use_gemini=True | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you undo this? |
||
gemini_model_name='gemini-pro' | ||
GOOGLE_API_KEY='Your_API_key' | ||
use_azure=False | ||
use_ollama=False | ||
ollama_model='llama3.1' | ||
|
@@ -95,4 +100,50 @@ ragas_answer_instruction="You are a digital librarian and need to answer questio | |
{context}" | ||
ragas_answer_query="Answer the following question, never give any explanation or other output than the generated article itself: | ||
|
||
{question}" | ||
{question}" | ||
|
||
rag_retrieval_instruction="Instruction: You are a graph database query assistant. Based on the graph schema below, generate a Cypher query to search for the answer to the user's question. If the schema does not support the query, respond with 'None'. | ||
Schema: | ||
{schema}" | ||
retrieval_few_shot="Few-shot examples: | ||
Example 1: | ||
User query: \'What topics are available?\' | ||
Output: MATCH (t:Topic) RETURN t.name | ||
|
||
Example 2: | ||
User query: \'What is the size of an elephant?\' | ||
Output: None" | ||
rag_retrieval_question="The user question is: | ||
|
||
{question} | ||
|
||
Please generate a Cypher question to answer, or return None if it does not fit the Schema" | ||
|
||
neo4j_insert_instruction= "You are a Neo4j database assistant. Your task is to generate Cypher queries for inserting data into the Neo4j graph database. Use only the nodes, properties, and relationships specified in the provided schema. Ensure that all generated queries are valid Cypher and JSON format, and conform to the schema. Make a maximum 9 additions. If the input data cannot be mapped to the schema, return 'None' and do not generate any invalid query. " | ||
neo4j_insert_schema= "Instruction: You are tasked with generating Cypher queries to insert data into the Neo4j graph database. Use only the nodes, properties, and relationships defined in the following schema. Ensure the queries are valid and align with the schema. If the input data cannot be mapped to the schema, return 'None'. | ||
|
||
Schema: | ||
{schema} | ||
|
||
Input data: | ||
{data} | ||
|
||
Output: " | ||
|
||
|
||
neo4j_insert_data_only= "Instruction: You are tasked with generating Cypher queries to insert data into the Neo4j graph database. Use only the nodes, properties, and relationships defined in the following schema. Ensure the queries are valid and align with the schema. If the input data cannot be mapped to the schema, return 'None'. | ||
|
||
Schema: | ||
Nodes: | ||
- Topic: name | ||
- Fact: name | ||
Relationships: | ||
- IS_PART_OF: No properties | ||
|
||
Input data: | ||
{data} | ||
|
||
Output: " | ||
|
||
neo4j_insert_few_shot="Few-shot examples:Example 1: Schema: Nodes: - Quote: text - Topic: name Relationships: - IS_PART_OF: None Input data:Course block 4 Pitching Tools you need (all available on Canvas > Files): • A series of short videos on pitching by Nathalie Mangelaars (links available on Canvas) • Pitch Toolkit by Pitch Academy • Example Pitch Deck by Horseplay Ventures Expected deliverables: • Pitch script • Slide deck • A Minimum Viable Product (MVP) (also see here and here) Notes: • You are strongly encouraged to already draft a pitch script and create a preliminary slide deck before the pitch training takes place (i.e., on Wednesday November 22). If you come prepared, then Cyrille van Hoof and Nathalie Mangelaars can focus on important opportunities for improvement instead of starting from scratch, which saves us valuable time. In case you do so, include both your draft and final versions to your portfolio. Output:[{\"query\": \"MERGE (q:Quote {text: $quoteText}) MERGE (t:Topic {name: $topicName}) MERGE (q)-[:IS_PART_OF]->(t)\",\"parameters\": { \"quoteText\": \"Pitch Toolkit by Pitch Academy\",\"topicName\": \"Needed tools\"}},{\"query\": \"MERGE (q:Quote {text: $quoteText}) MERGE (t:Topic {name: $topicName}) MERGE (q)-[:IS_PART_OF]->(t)\",\"parameters\": {\"quoteText\": \"A Minimum Viable Product (MVP) (also see here and here) \",\"topicName\": \"Deliverables\"}}]" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you remove this file from the PR? And/or make a section on just how GraphRAG works.