Relations are now split up before prompting Llama

Knox-AAU · Dec 7, 2023 · d5c38ab
1 parent cb829da
commit d5c38ab
Show file tree

Hide file tree

Showing 2 changed files with 32 additions and 27 deletions.
diff --git a/relation_extraction/multilingual/llm_messenger.py b/relation_extraction/multilingual/llm_messenger.py
@@ -6,7 +6,7 @@
 class LLMMessenger(APIHandler):
 
     def API_endpoint():
-        return ""
+        return "http://knox-func01.srv.aau.dk:5004/llama"
 
     def send_request(request):
 
@@ -16,24 +16,6 @@ def send_request(request):
         # Create a llama model
         model = Llama(model_path=model_path, n_ctx=4096)
 
-        # Prompt creation
-        # system_message = """### Instruction ###
-        # When given a sentence and the entity mentions in the sentence, you should perform relation extraction.  This includes marking an entity mention as subject, marking another entity mention as object, and identifying the relation between the subject and object. You should only use entity mentions specified in the prompt. You should only use relations from the list of relations given in the context.
-
-        # ### Context ###
-        # List of relations: [location, birthPlace, deathPlace, owns, sibling, child, parent, title, employer, age, residence, headquarter, deathCause, member, foundedBy, religion]
-
-        # ### Input Data ###
-        # You should perform relation extraction when prompted with input on the following format:
-        # "sentence", [comma_separated_list_of_entity_mentions]
-
-        # ### Output Indicator ###
-        # If no relation can be found in the sentence, or the entity mentions have not been specified in the user prompt, you should respond with "undefined". In all other cases, your output should be a list of triples on the following format:
-        # <subject, relation, object>
-
-        # """
-        # user_message = '"Casper and Rytter has the same mother", [Casper, Rytter]'
-
         prompt = f"""<s>[INST] <<SYS>>
         {request["system_message"]}
         <</SYS>>
@@ -51,17 +33,18 @@ def send_request(request):
         #     # Write content to the file
         #     file.write(output["choices"][0]["text"])
 
-        #response = requests.post(url=LLMMessenger.API_endpoint)
+        #response = requests.post(url=LLMMessenger.API_endpoint, json=request)
         return output
 
     def process_message(response):
         print("Recieved response from Llama2...")
         triples = []
+        print(response)
         answer = re.split("/INST]", response["choices"][0]["text"])[1]
         print(response["choices"][0]["text"])
-        llama_triples = re.findall("<[\s\w\d]*,[\s\w\d]*,[\s\w\d]*>|\[[\s\w\d]*,[\s\w\d]*,[\s\w\d]*\]", answer)
+        llama_triples = re.findall('<["\s\w\d,"]*,[\s\w\d]*,["\s\w\d,"]*>|\[["\s\w\d,"]*,[\s\w\d]*,["\s\w\d,"]*\]', answer)
         for llama_triple in llama_triples:
-            triple = re.split(",", llama_triple.replace("<", "").replace(">", "").replace("]", "").replace("[", ""))
+            triple = re.split('"."', llama_triple.replace("<", "").replace(">", "").replace("]", "").replace("[", ""))
             if len(triple) == 3:
                 triple_object = {}
                 for i, entry in enumerate(triple):
@@ -79,13 +62,11 @@ def check_validity_of_response(sentence, response, relations):
 
     def prompt_llm(data, relations):
         triples = []
-        relations_test = ["spouse", "location", "birthPlace", "deathPlace", "owns", "sibling", "child", "parent", "title", "employer", "age", "residence", "headquarter", "deathCause", "member", "foundedBy", "religion"]
-        relations_text = "[" + ", ".join(["location", "birthPlace", "deathPlace", "owns", "sibling", "child", "parent", "title", "employer", "age", "residence", "headquarter", "deathCause", "member", "foundedBy", "religion"]) + "]"
         system_message = f"""### Instruction ###
 When given a sentence in either danish or english and the entity mentions in the sentence, you should find triples by performing relation extraction.  This includes marking an entity mention as subject, marking another entity mention as object, and identifying the relation between the subject and object. You should only use entity mentions specified in the prompt. You should only use relations from the list of relations given in the context. You should provide reasoning for why each of the triples you find is correct. 
 S
 ### Context ###
-List of relations: [spouse, location, birthPlace, deathPlace, owns, sibling, child, parent, title, employer, age, residence, headquarter, deathCause, member, foundedBy, religion]
+List of relations: [{", ".join(relations)}]
 Here is a transcript with you. You are called Llama.
 User: Sentence: "Aalborg is in Denmark" Entity mentions: ["Aalborg", "Denmark"]
 Llama: The relation "is in" is not in the list of relations but "location" is in the list of relations. "Aalborg is in Denmark" implies that Aalborg is located in Denmark. Therefore, the triple <"Aalborg", location, "Denmark"> is correct.
@@ -103,7 +84,7 @@ def prompt_llm(data, relations):
 
         """
 
-        request = {"system_message": system_message, "user_message": ""}
+        request = {"system_message": system_message, "user_message": "", "max_tokens": 4096}
 
         for file in data:
             for sentence in file["sentences"]:

diff --git a/relation_extraction/multilingual/main.py b/relation_extraction/multilingual/main.py
@@ -29,7 +29,11 @@ def begin_relation_extraction(data):
         raise Exception("Incorrectly formatted input. Exception during parsing")
 
     try:
-        triples = LLMMessenger.prompt_llm(parsed_data, relations)
+        triples = []
+        chunk_size = 250
+        split_relations = [relations[i:i + chunk_size] for i in range(0, len(relations), chunk_size)] #Split the relations into lists of size chunk_size
+        for split_relation in split_relations:
+            triples.append(LLMMessenger.prompt_llm(parsed_data, split_relation))
     except Exception as E:
         print(f"Exception during prompt to Llama 2: {str(E)}")
         raise Exception("Exception during prompt to Llama 2")
@@ -39,3 +43,23 @@ def begin_relation_extraction(data):
     except Exception as E:
         print(f"Exception during request to database. {str(E)}")
         raise Exception("Data was not sent to database due to connection error")
+
+begin_relation_extraction([
+    {
+        "filename": "path/to/Artikel.txt",
+        "language": "en",
+        "sentences": [
+            {
+                "sentence": "Barrack Obama is married to Michelle Obama.",
+                "sentenceStartIndex": 20,
+                "sentenceEndIndex": 62,
+                "entityMentions": 
+                [
+                    { "name": "Barrack Obama", "startIndex": 0, "endIndex": 12, "iri": "knox-kb01.srv.aau.dk/Barack_Obama" },
+                    { "name": "Michelle Obama", "startIndex": 27, "endIndex": 40, "iri": "knox-kb01.srv.aau.dk/Michele_Obama" }
+                ]
+            }
+        ]
+    }
+]
+)