Skip to content

Commit

Permalink
Merge pull request #46 from AI4WA/feature/i-37-neo4j-connector
Browse files Browse the repository at this point in the history
update and the connector created
  • Loading branch information
PascalSun authored May 31, 2024
2 parents fd30b8e + a2665f5 commit 9ba65ca
Show file tree
Hide file tree
Showing 10 changed files with 239 additions and 13 deletions.
62 changes: 53 additions & 9 deletions Docs2KG/kg/json2triplets.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,12 @@ def transform_node(self, node: dict, parent_uuid: str = None):
Returns:
"""
labels = [node["node_type"]]
labels = [node["node_type"].upper()]
uuid = node["uuid"]
properties = node["node_properties"]
entity = {"uuid": uuid, "labels": labels, "properties": properties}
# shallow-copy the properties, dropping the nested "text2kg" entry
copied_properties = self.clean_nested_properties(properties)
entity = {"uuid": uuid, "labels": labels, "properties": copied_properties}
self.triplets_json["nodes"].append(entity)
rel = {
"start_node": parent_uuid,
Expand All @@ -89,8 +91,26 @@ def transform_node(self, node: dict, parent_uuid: str = None):
}
self.triplets_json["relationships"].append(rel)
for child in node["children"]:
# skip text_block children; they are not recursed into here
if child["node_type"] == "text_block":
continue
self.transform_node(child, parent_uuid=uuid)

@staticmethod
def clean_nested_properties(properties: dict):
"""
Clean the nested properties
Args:
properties:
Returns:
"""
copied_properties = properties.copy()
if "text2kg" in copied_properties:
copied_properties.pop("text2kg")
return copied_properties

def transform_images(self):
"""
Connect the image to nearby text
Expand All @@ -115,11 +135,14 @@ def transform_images(self):
for child in node["children"]:
if child["node_type"] == "text_block":
text_block_uuid = child["uuid"]
copied_properties = self.clean_nested_properties(
child["node_properties"]
)
self.triplets_json["nodes"].append(
{
"uuid": text_block_uuid,
"labels": ["text_block"],
"properties": child["node_properties"],
"labels": ["TEXT_BLOCK"],
"properties": copied_properties,
}
)

Expand Down Expand Up @@ -166,11 +189,14 @@ def transform_tables(self):
for child in node["children"]:
if child["node_type"] == "text_block":
text_block_uuid = child["uuid"]
copied_properties = self.clean_nested_properties(
child["node_properties"]
)
self.triplets_json["nodes"].append(
{
"uuid": text_block_uuid,
"labels": ["text_block"],
"properties": child["node_properties"],
"labels": ["TEXT_BLOCK"],
"properties": copied_properties,
}
)

Expand Down Expand Up @@ -208,7 +234,7 @@ def transform_text2kg(self, node: dict):
if len(text2kg_list) == 0:
continue
for text2kg in text2kg_list:
logger.info(f"Text2KG: {text2kg}")

subject = text2kg.get("subject", None)
subject_ner_type = text2kg.get("subject_ner_type", None)
predicate = text2kg.get("predicate", None)
Expand All @@ -227,6 +253,24 @@ def transform_text2kg(self, node: dict):
]
):
continue
# strip the text and then clean again
subject = subject.strip()
object_ent = object_ent.strip()
predicate = predicate.strip()
subject_ner_type = subject_ner_type.strip()
object_ner_type = object_ner_type.strip()

if any(
[
subject == "",
object_ent == "",
predicate == "",
subject_ner_type == "",
object_ner_type == "",
]
):
continue
logger.info(f"Text2KG: {text2kg}")
# check if the subject is in the entities_mapping
if subject not in self.entities_mapping:
self.entities_mapping[subject] = str(uuid4())
Expand All @@ -238,15 +282,15 @@ def transform_text2kg(self, node: dict):
self.triplets_json["nodes"].append(
{
"uuid": subject_uuid,
"labels": ["entity", subject_ner_type, "TEXT2KG"],
"labels": ["ENTITY", subject_ner_type.upper(), "TEXT2KG"],
"properties": {"text": subject},
}
)
# add the object
self.triplets_json["nodes"].append(
{
"uuid": object_uuid,
"labels": ["entity", object_ner_type, "TEXT2KG"],
"labels": ["ENTITY", object_ner_type.upper(), "TEXT2KG"],
"properties": {"text": object_ent},
}
)
Expand Down
Empty file added Docs2KG/kg/utils/__init__.py
Empty file.
166 changes: 166 additions & 0 deletions Docs2KG/kg/utils/neo4j_connector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
import json
from pathlib import Path

from neo4j import GraphDatabase
from tqdm import tqdm

from Docs2KG.utils.get_logger import get_logger

logger = get_logger(__name__)


class Neo4jLoader:
    """
    Load a triplets JSON file (``nodes`` + ``relationships``) into Neo4j.

    The loader can be used directly (call :meth:`close` when done) or as a
    context manager, which closes the driver on exit::

        with Neo4jLoader(uri, username, password, json_file_path) as loader:
            loader.load_data()
    """

    def __init__(
        self,
        uri: str,
        username: str,
        password: str,
        json_file_path: Path,
        clean: bool = False,
    ):
        """
        Args:
            uri (str): URI of the Neo4j database
            username (str): Username of the Neo4j database
            password (str): Password of the Neo4j database
            json_file_path (Path): Path to the JSON file containing the data
            clean (bool): Whether to clean the database before loading the data
        """
        self.uri = uri
        self.username = username
        self.password = password
        self.json_file_path = json_file_path
        self.driver = GraphDatabase.driver(
            self.uri, auth=(self.username, self.password)
        )
        self.clean = clean

        if self.clean:
            self.clean_database()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    @staticmethod
    def _quote_identifier(name) -> str:
        """Backtick-quote a label / relationship type for use in Cypher.

        Labels and relationship types cannot be passed as query parameters,
        so they are interpolated into the query text. Quoting keeps names
        with spaces or punctuation (e.g. ``was issued in``) valid; embedded
        backticks are doubled so the name cannot terminate the quoting.
        """
        return "`" + str(name).replace("`", "``") + "`"

    @staticmethod
    def _sanitize_keys(properties: dict) -> dict:
        """Return a new dict whose keys have ``.`` replaced by ``_``."""
        return {
            str(key).replace(".", "_"): value for key, value in properties.items()
        }

    @classmethod
    def _build_node_query(cls, labels) -> str:
        """Build the CREATE statement for a node with the given labels."""
        label_clause = "".join(f":{cls._quote_identifier(label)}" for label in labels)
        return f"CREATE (n{label_clause} $props)"

    @classmethod
    def _build_relationship_query(cls, relationship_type) -> str:
        """Build the MATCH/MERGE statement for one relationship type."""
        quoted_type = cls._quote_identifier(relationship_type)
        return "\n".join(
            [
                "MATCH (start_node {uuid: $start_node}), (end_node {uuid: $end_node})",
                f"MERGE (start_node)-[r:{quoted_type}]->(end_node)",
                "ON CREATE SET r += $props",
            ]
        )

    def clean_database(self):
        """Delete every node (and its relationships) from the database."""
        with self.driver.session() as session:
            session.run("MATCH (n) DETACH DELETE n")
            logger.info("Database cleaned successfully")

    def close(self):
        """Close the underlying Neo4j driver."""
        self.driver.close()

    def load_json_data(self):
        """Read and parse the JSON file at ``self.json_file_path``."""
        with open(self.json_file_path, "r", encoding="utf-8") as file:
            return json.load(file)

    def load_nodes(self, nodes):
        """
        Create one Neo4j node per entry; it can be any type of node.

        One example node is like this:
        ```
        {
            "uuid": "6cedef4a-52d1-4ff2-8fc8-644ad5de8c49",
            "labels": ["text_block"],
            "properties": {
                "content": "B.Sc., Geol, Grad Dip (GIS) ",
                "position": "right",
                "text_block_number": 9
            }
        }
        ```
        The node's ``uuid`` and its labels (joined with ``:``) are stored as
        additional properties. The input dicts are not modified.

        Args:
            nodes: iterable of node dicts with "uuid", "labels", "properties"

        Returns:
            None
        """
        with self.driver.session() as session:
            for node in tqdm(nodes, desc="Loading Nodes"):
                labels = node["labels"]
                # Work on a copy so the caller's data is left untouched.
                properties = self._sanitize_keys(node["properties"])
                properties["uuid"] = node["uuid"]
                properties["labels"] = ":".join(labels)

                node_query = self._build_node_query(labels)
                logger.debug(node_query)
                logger.debug(properties)

                # A single map parameter avoids building placeholders from
                # property names, which breaks on dots/spaces and can clash
                # with run()'s own keyword arguments.
                session.run(node_query, {"props": properties})

    def load_relationships(self, relationships):
        """
        Merge one relationship per entry between already-loaded nodes.

        Example like this:
        ```
        {
            "start_node": "49547ed0-0f86-418e-8dea-a269f7b002f6",
            "end_node": "d9efb3d3-7b5c-49af-83b6-1d39b3f63912",
            "type": "was issued in",
            "properties": {
                "source": "TEXT2KG"
            }
        }
        ```
        Start and end nodes are matched by their ``uuid`` property; the
        relationship properties are only set when the relationship is created.

        Args:
            relationships: iterable of relationship dicts

        Returns:
            None
        """
        with self.driver.session() as session:
            for relationship in tqdm(relationships, desc="Loading Relationships"):
                properties = relationship.get("properties") or {}
                relationship_query = self._build_relationship_query(
                    relationship["type"]
                )
                session.run(
                    relationship_query,
                    {
                        "start_node": relationship["start_node"],
                        "end_node": relationship["end_node"],
                        "props": dict(properties),
                    },
                )

    def load_data(self):
        """Load the JSON file and push its nodes, then its relationships."""
        data = self.load_json_data()
        nodes = data["nodes"]
        relationships = data["relationships"]
        self.load_nodes(nodes)
        self.load_relationships(relationships)
        logger.info("Data loaded successfully to Neo4j")
8 changes: 4 additions & 4 deletions Docs2KG/parser/pdf/pdf2metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
PDF_TYPE_EXPORTED,
PDF_TYPE_SCANNED,
)
from Docs2KG.utils.count_tokens import count_tokens
from Docs2KG.utils.estimate_price import estimate_price
from Docs2KG.utils.get_logger import get_logger
from Docs2KG.utils.llm.count_tokens import count_tokens
from Docs2KG.utils.llm.estimate_price import estimate_price

logger = get_logger(__name__)

Expand Down Expand Up @@ -69,8 +69,8 @@ def get_meda_for_file(pdf_file: Path) -> dict:
texts.append(page.get_text())
metadata["text_token"] = count_tokens(" ".join(texts))
# estimate the price
metadata["estimated_price_3.5"] = estimate_price(metadata["text_token"])
metadata["estimated_price_4o"] = estimate_price(
metadata["estimated_price_gpt35"] = estimate_price(metadata["text_token"])
metadata["estimated_price_gpt4o"] = estimate_price(
metadata["text_token"], model_name="gpt-4o"
)
metadata["estimated_price_4_turbo"] = estimate_price(
Expand Down
File renamed without changes.
File renamed without changes.
3 changes: 3 additions & 0 deletions examples/compose/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ services:
container_name: neo4j
environment:
- NEO4J_AUTH=neo4j/testpassword
- NEO4JLABS_PLUGINS=["apoc"]
- NEO4J_apoc_import_file_enabled=true
- NEO4J_dbms_security_procedures_unrestricted=apoc.*
ports:
- 7474:7474
- 7687:7687
Expand Down
Empty file added examples/kg/utils/__init__.py
Empty file.
12 changes: 12 additions & 0 deletions examples/kg/utils/neo4j_connector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from Docs2KG.kg.utils.neo4j_connector import Neo4jLoader
from Docs2KG.utils.constants import DATA_OUTPUT_DIR

if __name__ == "__main__":
    # Local development settings for a Neo4j instance reachable over Bolt.
    uri = "bolt://localhost:7687"
    username = "neo4j"
    password = "testpassword"
    json_file_path = DATA_OUTPUT_DIR / "4.pdf" / "kg" / "triplets_kg.json"

    # clean=True wipes the database before the triplets are loaded.
    neo4j_loader = Neo4jLoader(uri, username, password, json_file_path, clean=True)
    try:
        neo4j_loader.load_data()
    finally:
        # Always release the driver, even when loading fails part-way.
        neo4j_loader.close()
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ transformers
torch
sentence-transformers
openpyxl
neo4j

0 comments on commit 9ba65ca

Please sign in to comment.