Merge pull request #46 from AI4WA/feature/i-37-neo4j-connector

Update the triplet transformation and add the Neo4j connector
AI4WA · May 31, 2024 · 9ba65ca · 9ba65ca
2 parents fd30b8e + a2665f5
commit 9ba65ca
Show file tree

Hide file tree

Showing 10 changed files with 239 additions and 13 deletions.
diff --git a/Docs2KG/kg/json2triplets.py b/Docs2KG/kg/json2triplets.py
@@ -77,10 +77,12 @@ def transform_node(self, node: dict, parent_uuid: str = None):
         Returns:
 
         """
-        labels = [node["node_type"]]
+        labels = [node["node_type"].upper()]
         uuid = node["uuid"]
         properties = node["node_properties"]
-        entity = {"uuid": uuid, "labels": labels, "properties": properties}
+        # deep copy the properties
+        copied_properties = self.clean_nested_properties(properties)
+        entity = {"uuid": uuid, "labels": labels, "properties": copied_properties}
         self.triplets_json["nodes"].append(entity)
         rel = {
             "start_node": parent_uuid,
@@ -89,8 +91,26 @@ def transform_node(self, node: dict, parent_uuid: str = None):
         }
         self.triplets_json["relationships"].append(rel)
         for child in node["children"]:
+            # if the children is text_block, then stop here
+            if child["node_type"] == "text_block":
+                continue
             self.transform_node(child, parent_uuid=uuid)
 
+    @staticmethod
+    def clean_nested_properties(properties: dict):
+        """
+        Return a shallow copy of the properties without the "text2kg" entry
+
+        The dict passed in is left untouched; only the returned copy lacks
+        the key.
+
+        Args:
+            properties: node properties, possibly holding a "text2kg" key
+
+        Returns:
+            a new dict with every item of `properties` except "text2kg"
+        """
+        return {
+            name: value for name, value in properties.items() if name != "text2kg"
+        }
+
     def transform_images(self):
         """
         Connect the image to nearby text
@@ -115,11 +135,14 @@ def transform_images(self):
                     for child in node["children"]:
                         if child["node_type"] == "text_block":
                             text_block_uuid = child["uuid"]
+                            copied_properties = self.clean_nested_properties(
+                                child["node_properties"]
+                            )
                             self.triplets_json["nodes"].append(
                                 {
                                     "uuid": text_block_uuid,
-                                    "labels": ["text_block"],
-                                    "properties": child["node_properties"],
+                                    "labels": ["TEXT_BLOCK"],
+                                    "properties": copied_properties,
                                 }
                             )
 
@@ -166,11 +189,14 @@ def transform_tables(self):
                     for child in node["children"]:
                         if child["node_type"] == "text_block":
                             text_block_uuid = child["uuid"]
+                            copied_properties = self.clean_nested_properties(
+                                child["node_properties"]
+                            )
                             self.triplets_json["nodes"].append(
                                 {
                                     "uuid": text_block_uuid,
-                                    "labels": ["text_block"],
-                                    "properties": child["node_properties"],
+                                    "labels": ["TEXT_BLOCK"],
+                                    "properties": copied_properties,
                                 }
                             )
 
@@ -208,7 +234,7 @@ def transform_text2kg(self, node: dict):
             if len(text2kg_list) == 0:
                 continue
             for text2kg in text2kg_list:
-                logger.info(f"Text2KG: {text2kg}")
+
                 subject = text2kg.get("subject", None)
                 subject_ner_type = text2kg.get("subject_ner_type", None)
                 predicate = text2kg.get("predicate", None)
@@ -227,6 +253,24 @@ def transform_text2kg(self, node: dict):
                     ]
                 ):
                     continue
+                # strip the text and then clean again
+                subject = subject.strip()
+                object_ent = object_ent.strip()
+                predicate = predicate.strip()
+                subject_ner_type = subject_ner_type.strip()
+                object_ner_type = object_ner_type.strip()
+
+                if any(
+                    [
+                        subject == "",
+                        object_ent == "",
+                        predicate == "",
+                        subject_ner_type == "",
+                        object_ner_type == "",
+                    ]
+                ):
+                    continue
+                logger.info(f"Text2KG: {text2kg}")
                 # check if the subject is in the entities_mapping
                 if subject not in self.entities_mapping:
                     self.entities_mapping[subject] = str(uuid4())
@@ -238,15 +282,15 @@ def transform_text2kg(self, node: dict):
                 self.triplets_json["nodes"].append(
                     {
                         "uuid": subject_uuid,
-                        "labels": ["entity", subject_ner_type, "TEXT2KG"],
+                        "labels": ["ENTITY", subject_ner_type.upper(), "TEXT2KG"],
                         "properties": {"text": subject},
                     }
                 )
                 # add the object
                 self.triplets_json["nodes"].append(
                     {
                         "uuid": object_uuid,
-                        "labels": ["entity", object_ner_type, "TEXT2KG"],
+                        "labels": ["ENTITY", object_ner_type.upper(), "TEXT2KG"],
                         "properties": {"text": object_ent},
                     }
                 )

diff --git a/Docs2KG/kg/utils/__init__.py b/Docs2KG/kg/utils/__init__.py
diff --git a/Docs2KG/kg/utils/neo4j_connector.py b/Docs2KG/kg/utils/neo4j_connector.py
@@ -0,0 +1,166 @@
+import json
+from pathlib import Path
+
+from neo4j import GraphDatabase
+from tqdm import tqdm
+
+from Docs2KG.utils.get_logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class Neo4jLoader:
+    """
+    Load a triplets JSON file (a "nodes" list and a "relationships" list)
+    into a Neo4j database.
+
+    The instance owns a driver that is opened in the constructor; call
+    `close()` when done, or use the instance as a context manager.
+    """
+
+    def __init__(
+        self,
+        uri: str,
+        username: str,
+        password: str,
+        json_file_path: Path,
+        clean: bool = False,
+    ):
+        """
+        Open the driver and, when requested, wipe the database
+
+        Args:
+            uri (str): URI of the Neo4j database
+            username (str): Username of the Neo4j database
+            password (str): Password of the Neo4j database
+            json_file_path (Path): Path to the JSON file containing the data
+            clean (bool): Whether to clean the database before loading the data
+        """
+        self.uri = uri
+        self.username = username
+        self.password = password
+        self.json_file_path = json_file_path
+        self.driver = GraphDatabase.driver(
+            self.uri, auth=(self.username, self.password)
+        )
+        self.clean = clean
+
+        if self.clean:
+            self.clean_database()
+
+    def __enter__(self):
+        """Allow `with Neo4jLoader(...) as loader:` usage."""
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        """Close the driver on leaving the `with` block; errors propagate."""
+        self.close()
+
+    def clean_database(self):
+        """Delete every node, together with its relationships."""
+        with self.driver.session() as session:
+            session.run("MATCH (n) DETACH DELETE n")
+            logger.info("Database cleaned successfully")
+
+    def close(self):
+        """Close the underlying driver."""
+        self.driver.close()
+
+    def load_json_data(self):
+        """
+        Read and parse the JSON file at `json_file_path`
+
+        Returns:
+            the parsed JSON content
+        """
+        # explicit encoding so the result does not depend on the platform default
+        with open(self.json_file_path, "r", encoding="utf-8") as file:
+            return json.load(file)
+
+    @staticmethod
+    def _quote_identifier(name) -> str:
+        """Backtick-quote a label / relationship type for use in Cypher."""
+        # a literal backtick inside a quoted identifier is escaped by doubling
+        return "`" + str(name).replace("`", "``") + "`"
+
+    @classmethod
+    def _build_node_statement(cls, node: dict):
+        """Return the (query, parameters) pair that creates one node."""
+        labels = node["labels"]
+        # work on a fresh dict so the caller's node is not modified;
+        # "." in a key is replaced by "_" for the stored property name
+        props = {
+            key.replace(".", "_"): value
+            for key, value in node["properties"].items()
+        }
+        props["uuid"] = node["uuid"]
+        props["labels"] = ":".join(labels)
+        # quoting keeps labels with spaces or punctuation valid;
+        # an empty label list simply yields an unlabelled node
+        label_clause = "".join(
+            f":{cls._quote_identifier(label)}" for label in labels
+        )
+        return f"CREATE (n{label_clause} $props)", {"props": props}
+
+    @classmethod
+    def _build_relationship_statement(cls, relationship: dict):
+        """Return the (query, parameters) pair that merges one relationship."""
+        relationship_type = cls._quote_identifier(relationship["type"])
+        relationship_query = (
+            "MATCH (start_node {uuid: $start_node}), (end_node {uuid: $end_node})\n"
+            f"MERGE (start_node)-[r:{relationship_type}]->(end_node)\n"
+            "ON CREATE SET r += $properties"
+        )
+        parameters = {
+            "start_node": relationship["start_node"],
+            "end_node": relationship["end_node"],
+            "properties": relationship.get("properties") or {},
+        }
+        return relationship_query, parameters
+
+    def load_nodes(self, nodes):
+        """
+        Create one Neo4j node per item
+
+        It can be any type of node, not just Person
+
+        One example node is like this:
+
+        ```
+           {
+            "uuid": "6cedef4a-52d1-4ff2-8fc8-644ad5de8c49",
+            "labels": [
+                "TEXT_BLOCK"
+            ],
+            "properties": {
+                "text_block_bbox": "(373.5598449707031, 667.95703125, 490.0483093261719, 679.9588623046875)",
+                "content": "B.Sc., Geol, Grad Dip (GIS) ",
+                "position": "right",
+                "text_block_number": 9
+            }
+        }
+        ```
+
+        Every entry of "properties" is stored on the node, plus "uuid" and
+        "labels" (the labels joined with ":"). The input items are not
+        modified.
+
+        Args:
+            nodes: iterable of node dicts with "uuid", "labels", "properties"
+
+        Returns:
+
+        """
+        # one session for the whole batch instead of one per node
+        with self.driver.session() as session:
+            for node in tqdm(nodes, desc="Loading Nodes"):
+                node_query, parameters = self._build_node_statement(node)
+                logger.debug(node_query)
+                logger.debug(parameters)
+                # values travel as a parameter map, never as query text or
+                # **kwargs; consume() surfaces a failure on the offending node
+                session.run(node_query, parameters).consume()
+
+    def load_relationships(self, relationships):
+        """
+        Merge one relationship per item between already loaded nodes
+
+        Example like this:
+
+        ```
+        {
+            "start_node": "49547ed0-0f86-418e-8dea-a269f7b002f6",
+            "end_node": "d9efb3d3-7b5c-49af-83b6-1d39b3f63912",
+            "type": "was issued in",
+            "properties": {
+                "source": "TEXT2KG"
+            }
+        }
+        ```
+
+        The two ends are looked up by their "uuid" property; when either
+        end is not found nothing is created. Properties are only set when
+        the relationship is newly created.
+
+        Args:
+            relationships: iterable of relationship dicts with "start_node",
+                "end_node", "type" and optionally "properties"
+
+        Returns:
+
+        """
+        # one session for the whole batch instead of one per relationship
+        with self.driver.session() as session:
+            for relationship in tqdm(relationships, desc="Loading Relationships"):
+                relationship_query, parameters = self._build_relationship_statement(
+                    relationship
+                )
+                session.run(relationship_query, parameters).consume()
+
+    def load_data(self):
+        """Read the JSON file, then load its nodes followed by its relationships."""
+        data = self.load_json_data()
+        nodes = data["nodes"]
+        relationships = data["relationships"]
+        self.load_nodes(nodes)
+        self.load_relationships(relationships)
+        logger.info("Data loaded successfully to Neo4j")
diff --git a/Docs2KG/parser/pdf/pdf2metadata.py b/Docs2KG/parser/pdf/pdf2metadata.py
@@ -8,9 +8,9 @@
     PDF_TYPE_EXPORTED,
     PDF_TYPE_SCANNED,
 )
-from Docs2KG.utils.count_tokens import count_tokens
-from Docs2KG.utils.estimate_price import estimate_price
 from Docs2KG.utils.get_logger import get_logger
+from Docs2KG.utils.llm.count_tokens import count_tokens
+from Docs2KG.utils.llm.estimate_price import estimate_price
 
 logger = get_logger(__name__)
 
@@ -69,8 +69,8 @@ def get_meda_for_file(pdf_file: Path) -> dict:
         texts.append(page.get_text())
     metadata["text_token"] = count_tokens(" ".join(texts))
     # estimate the price
-    metadata["estimated_price_3.5"] = estimate_price(metadata["text_token"])
-    metadata["estimated_price_4o"] = estimate_price(
+    metadata["estimated_price_gpt35"] = estimate_price(metadata["text_token"])
+    metadata["estimated_price_gpt4o"] = estimate_price(
         metadata["text_token"], model_name="gpt-4o"
     )
     metadata["estimated_price_4_turbo"] = estimate_price(

diff --git a/Docs2KG/utils/count_tokens.py → Docs2KG/utils/llm/count_tokens.py b/Docs2KG/utils/count_tokens.py → Docs2KG/utils/llm/count_tokens.py
diff --git a/Docs2KG/utils/estimate_price.py → Docs2KG/utils/llm/estimate_price.py b/Docs2KG/utils/estimate_price.py → Docs2KG/utils/llm/estimate_price.py
diff --git a/examples/compose/docker-compose.yml b/examples/compose/docker-compose.yml
@@ -4,6 +4,9 @@ services:
     container_name: neo4j
     environment:
       - NEO4J_AUTH=neo4j/testpassword
+      - NEO4JLABS_PLUGINS=["apoc"]
+      - NEO4J_apoc_import_file_enabled=true
+      - NEO4J_dbms_security_procedures_unrestricted=apoc.*
     ports:
       - 7474:7474
       - 7687:7687

diff --git a/examples/kg/utils/__init__.py b/examples/kg/utils/__init__.py
diff --git a/examples/kg/utils/neo4j_connector.py b/examples/kg/utils/neo4j_connector.py
@@ -0,0 +1,12 @@
+from Docs2KG.kg.utils.neo4j_connector import Neo4jLoader
+from Docs2KG.utils.constants import DATA_OUTPUT_DIR
+
+if __name__ == "__main__":
+    # Connection settings for a local Neo4j; they match the credentials and
+    # bolt port declared in examples/compose/docker-compose.yml
+    uri = "bolt://localhost:7687"
+    username = "neo4j"
+    password = "testpassword"
+    # Triplets JSON to import, located under the output directory for "4.pdf"
+    json_file_path = DATA_OUTPUT_DIR / "4.pdf" / "kg" / "triplets_kg.json"
+
+    # clean=True wipes the database before the import
+    neo4j_loader = Neo4jLoader(uri, username, password, json_file_path, clean=True)
+    try:
+        neo4j_loader.load_data()
+    finally:
+        # always release the driver, even when the import fails
+        neo4j_loader.close()
diff --git a/requirements.txt b/requirements.txt
@@ -16,3 +16,4 @@ transformers
 torch
 sentence-transformers
 openpyxl
+neo4j
-Original file line number
+Diff line change
@@ Expand Up / @@ -16,3 +16,4 @@ transformers @@
     torch
     sentence-transformers
     openpyxl
+    neo4j