Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main'
Browse files Browse the repository at this point in the history
  • Loading branch information
Gamm0 committed Dec 11, 2023
2 parents 547ee72 + 8ba66b1 commit 90cdc86
Show file tree
Hide file tree
Showing 12 changed files with 459 additions and 95 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ Note that the ports map to the ports used in the ssh command given in "your port"

Deployment is normally handled by Watchtower on push to main. However, in case of the need of manual deployment, run:

`sudo docker run -p 0.0.0.0:4444:<your_port> --add-host=host.docker.internal:host-gateway -e API_SECRET=*** -e ACCESS_SECRET=*** -d ghcr.io/knox-aau/preprocessinglayer_tripleconstruction:main`
`docker run --name tc_api -p 0.0.0.0:4444:<your_port> --add-host=host.docker.internal:host-gateway -e API_SECRET=*** -e ACCESS_SECRET=*** -d ghcr.io/knox-aau/preprocessinglayer_tripleconstruction:main`

### Access through access API endpoint

Expand Down
12 changes: 9 additions & 3 deletions relation_extraction/evaluation/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@
from relation_extraction.ontology_messenger import OntologyMessenger
from relation_extraction.LessNaive.lessNaive import do_relation_extraction
from relation_extraction.NaiveMVP.main import parse_data
from relation_extraction.multilingual.llm_messenger import LLMMessenger
import re
import datetime
import json



def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 3, length = 100, fill = '█', printEnd = "\r"):
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 3, length = 100, fill = '█', printEnd = "\n"):
percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
filledLength = int(length * iteration // total)
bar = fill * filledLength + '-' * (length - filledLength)
Expand Down Expand Up @@ -60,7 +61,8 @@ def main():

solutions_to_test = {
# "less_naive": do_relation_extraction
"naive": parse_data
# "naive": parse_data
"multilingual": LLMMessenger.prompt_llm
}
evaluation_results = dict() #dictionary to hold results of tests
for name, solution in solutions_to_test.items():
Expand Down Expand Up @@ -93,7 +95,11 @@ def main():
]
}]

res = solution(input_obj, ontology_relations)
chunk_size = 650
split_relations = [ontology_relations[i:i + chunk_size] for i in range(0, len(ontology_relations), chunk_size)] #Split the relations into lists of size chunk_size
res = []
for split_relation in split_relations:
res.append(solution(input_obj, split_relation, ontology_relations))
res_hits = 0
for triple in res:
if triple in expected_triples:
Expand Down
130 changes: 66 additions & 64 deletions relation_extraction/multilingual/llm_messenger.py
Original file line number Diff line number Diff line change
@@ -1,88 +1,90 @@
from relation_extraction.API_handler import APIHandler
import requests
from llama_cpp import Llama
import re
import os

class LLMMessenger(APIHandler):

def API_endpoint():
return ""
return "http://knox-proxy01.srv.aau.dk/llama-api/llama"

def send_request(request):
HEADERS = {"Access-Authorization": os.getenv("ACCESS_SECRET")}
response = requests.post(url=LLMMessenger.API_endpoint(), json=request, headers=HEADERS)

# Put the location of to the GGUF model that you've download from HuggingFace here
model_path = "llama-2-7b-chat.Q2_K.gguf"
# # Put the location of to the GGUF model that you've download from HuggingFace here
# model_path = "./relation_extraction/multilingual/llama-2-7b-chat.Q2_K.gguf"

# Create a llama model
#model = Llama(model_path=model_path, n_ctx=4092)
# # Create a llama model
# model = Llama(model_path=model_path, n_ctx=4096)

# Prompt creation
system_message = """### Instruction ###
When given a sentence and the entity mentions in the sentence, you should perform relation extraction. This includes marking an entity mention as subject, marking another entity mention as object, and identifying the relation between the subject and object. You should only use entity mentions specified in the prompt. You should only use relations from the list of relations given in the context.
# prompt = f"""<s>[INST] <<SYS>>
# {request["system_message"]}
# <</SYS>>
# {request["user_message"]} [/INST]"""

### Context ###
List of relations: [location, birthPlace, deathPlace, owns, sibling, child, parent, title, employer, age, residence, headquarter, deathCause, member, foundedBy, religion]
### Input Data ###
You should perform relation extraction when prompted with input on the following format:
"sentence", [comma_separated_list_of_entity_mentions]
### Output Indicator ###
If no relation can be found in the sentence, or the entity mentions have not been specified in the user prompt, you should respond with "undefined". In all other cases, your output should be a list of triples on the following format:
<subject, relation, object>
"""
user_message = '"Casper and Rytter has the same mother", [Casper, Rytter]'

prompt = f"""<s>[INST] <<SYS>>
{system_message}
<</SYS>>
{user_message} [/INST]"""

# Model parameters
max_tokens = 4092

# Run the model
output = model(prompt, max_tokens=max_tokens, echo=True)

# Print the model output
# print(output["choices"][0]["text"])
# with open("LlamaResponse.txt", "w") as file:
# # Write content to the file
# file.write(output["choices"][0]["text"])

#response = requests.post(url=LLMMessenger.API_endpoint)
return output
# # Run the model
# output = model(prompt, max_tokens=request["max_tokens"], echo=True)

return response

def process_message(response):
print("Recieved response from Llama2...")
print(response)


def costruct_prompt_message(data):
system_message = """### Instruction ###
When given a sentence and the entity mentions in the sentence, you should perform relation extraction. This includes marking an entity mention as subject, marking another entity mention as object, and identifying the relation between the subject and object. You should only use entity mentions specified in the prompt. You should only use relations from the list of relations given in the context.
### Context ###
List of relations: [location, birthPlace, deathPlace, owns, sibling, child, parent, title, employer, age, residence, headquarter, deathCause, member, foundedBy, religion]
### Input Data ###
You should perform relation extraction when prompted with input on the following format:
"sentence", [comma_separated_list_of_entity_mentions]
### Output Indicator ###
If no relation can be found in the sentence, or the entity mentions have not been specified in the user prompt, you should respond with "undefined". In all other cases, your output should be a list of triples on the following format:
<subject, relation, object>
triples = []
answer = re.split("/INST]", response["choices"][0]["text"])[1]
llama_triples = re.findall('<["\s\w\d,"]*,[\s\w\d]*,["\s\w\d,"]*>|\[["\s\w\d,"]*,[\s\w\d]*,["\s\w\d,"]*\]', answer)
for llama_triple in llama_triples:
triple = re.split('"', llama_triple.replace("<", "").replace(">", "").replace("]", "").replace("[", ""))[1:-1]
if len(triple) == 3:
triple_object = {}
for i, entry in enumerate(triple):
triple_object[i.__str__()] = entry.strip(' ,')
triples.append(triple_object)
return triples

def check_validity_of_response(sentence, response, relations):
    """Keep only the candidate triples that are grounded in the sentence.

    A candidate is valid when its subject ("0") and object ("2") are entity
    mentions of *sentence* and its predicate ("1") is one of *relations*.
    Valid triples are rewritten as [subject_iri, predicate_uri, object_iri],
    where the predicate is namespaced under the DBpedia ontology.
    """
    # Map each mention name to its IRI; keep the FIRST occurrence so duplicate
    # names resolve exactly like the original first-match lookup.
    iri_by_name = {}
    for mention in sentence["entityMentions"]:
        iri_by_name.setdefault(mention["name"], mention["iri"])

    accepted = []
    for candidate in response:  # "0" = subject, "1" = predicate, "2" = object
        subject, predicate, obj = candidate["0"], candidate["1"], candidate["2"]
        if subject in iri_by_name and obj in iri_by_name and predicate in relations:
            accepted.append([
                iri_by_name[subject],
                f'http://dbpedia.org/ontology/{predicate}',
                iri_by_name[obj],
            ])
    return accepted

def prompt_llm(data, split_relations, relations):
triples = []
system_message = f"""### Instruction ###
When given a sentence in either danish or english and the entity mentions in the sentence, you should find triples by performing relation extraction. This includes marking an entity mention as subject, marking another entity mention as object, and identifying the relation between the subject and object. You should only use entity mentions specified in the prompt. You should only use relations from the list of relations given in the context. You should provide reasoning for why each of the triples you find is correct.
S
### Context ###
List of relations: [{", ".join(split_relations)}]
Here is a transcript with you. You are called Llama.
User: Sentence: "Aalborg is in Denmark" Entity mentions: ["Aalborg", "Denmark"]
Llama: The relation "is in" is not in the list of relations but "location" is in the list of relations. "Aalborg is in Denmark" implies that Aalborg is located in Denmark. Therefore, the triple <"Aalborg", location, "Denmark"> is correct.
User: Sentence: "Peter has a subscription to Pure Gym" Entity mentions: ["Peter", "Pure Gym"]
Llama: The relation "subscription" is not in the list of relations, but "member" is in the list of relations. "Peter has a subscription to Pure Gym" implies that Peter is a member of Pure Gym. Therefore, the triple <"Peter", member, "Pure Gym"> is correct.
User: Sentence: "Martin Eberhard and Marc Tarpenning are the original founders of Tesla" Entity mentions: ["Martin Eberhard", "Marc Tarpenning", "Tesla"]
Llama: The sentence states that Tesla was founded by both Martin Eberhard and Marc Tarpenning. The relation "foundedBy" is in the list of relations. Therefore, the two triples <"Tesla", foundedBy, "Martin Eberhard"> and <"Tesla", foundedBy, "Marc Tarpenning"> are correct.
User: Sentence: "Sofie was born in Kolding" Entity mentions: ["Sofie", "Kolding"]
Llama: The relation "born in" is not in the list of relations. But "born in" implies a place of birth, and "birthPlace" is in the list of relations. Therefore, the triple <"Sofie", birthPlace, "Kolding"> is correct.
User: Sentence: "Frederik is the father of Christian" Entity mentions: ["Frederik", "Christian"]
Llama: The relation "father" is not in the list of relations. However, a father is a parent and "parent" is in the list of relations. Therefore, the triple <"Frederik", parent, "Christian"> is correct.
### Output Indicator ###
Before answering with a triple, you should explain why it is correct. If no relation can be found in the sentence, or the entity mentions have not been specified in the user prompt, you should respond with “undefined”. In all other cases, your output should be triples on the format <subject, relation, object> and an explanation for each triple.
"""

request = {"system_message": system_message, "user_message": ""}
request = {"system_message": system_message, "user_message": "", "max_tokens": 4096}

for file in data:
for sentence in file["sentences"]:
user_message = f'"{sentence["sentence"]}", ['
user_message = f'Sentence: "{sentence["sentence"]}" Entity mentions: ['
for em in sentence["entityMentions"]:
user_message += f"{em['name']}, "
user_message = user_message[:-2] + ']' #Remove comma and space after last entity mention
user_message += f'"{em["name"]}", '
user_message = user_message[:-2] + ']' #Remove comma and space after last entity mention in message
request["user_message"] = user_message
response = LLMMessenger.send_request(request)
LLMMessenger.process_message(response)
process_response = LLMMessenger.process_message(response)
triples = LLMMessenger.check_validity_of_response(sentence, process_response, relations)
return triples

33 changes: 6 additions & 27 deletions relation_extraction/multilingual/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from relation_extraction.multilingual.llm_messenger import LLMMessenger

def parse_data(data):
"Parses JSON data and converts it into a dictionary with information on sentence, tokens, and entity mentions"
"Removes entity mentions with no iri and sentences with less than two entity mentions"

for file in data:
for i, sentence in enumerate(file["sentences"]):
Expand All @@ -29,7 +29,11 @@ def begin_relation_extraction(data):
raise Exception("Incorrectly formatted input. Exception during parsing")

try:
triples = LLMMessenger.costruct_prompt_message(parsed_data)
triples = []
chunk_size = 650
split_relations = [relations[i:i + chunk_size] for i in range(0, len(relations), chunk_size)] #Split the relations into lists of size chunk_size
for split_relation in split_relations:
triples.extend(LLMMessenger.prompt_llm(parsed_data, split_relation, relations))
except Exception as E:
print(f"Exception during prompt to Llama 2: {str(E)}")
raise Exception("Exception during prompt to Llama 2")
Expand All @@ -39,28 +43,3 @@ def begin_relation_extraction(data):
except Exception as E:
print(f"Exception during request to database. {str(E)}")
raise Exception("Data was not sent to database due to connection error")


def test():
    """Smoke-test the extraction pipeline with one hand-written input file.

    Builds a single-sentence payload in the expected request shape and feeds
    it through begin_relation_extraction; useful for manual end-to-end runs.
    """
    sample = [
        {
            "filename": "path/to/Artikel.txt",
            "language": "en",
            "sentences": [
                {
                    "sentence": "Barrack Obama is married to Michelle Obama.",
                    "sentenceStartIndex": 20,
                    "sentenceEndIndex": 62,
                    "entityMentions": [
                        {"name": "Barrack Obama", "startIndex": 0, "endIndex": 12, "iri": "knox-kb01.srv.aau.dk/Barack_Obama"},
                        {"name": "Michelle Obama", "startIndex": 27, "endIndex": 40, "iri": "knox-kb01.srv.aau.dk/Michele_Obama"},
                    ],
                }
            ],
        }
    ]
    begin_relation_extraction(data=sample)


test()
2 changes: 2 additions & 0 deletions relation_extraction/relation_extractor.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from relation_extraction.NaiveMVP.main import handle_relation_post_request
from relation_extraction.multilingual.main import begin_relation_extraction


class RelationExtractor():
    """Facade that fans one extraction request out to every registered pipeline."""

    @classmethod
    def begin_extraction(cls, data):
        """Run all relation-extraction pipelines on *data*.

        Currently dispatches to the naive pipeline (handle_relation_post_request)
        and the multilingual LLM pipeline (begin_relation_extraction), in that
        order. Exceptions raised by either pipeline propagate to the caller.

        Note: first parameter renamed self -> cls; this is a @classmethod, so
        Python binds the class object here, and calling it `self` was misleading.
        """
        handle_relation_post_request(data)
        begin_relation_extraction(data)
Empty file.
Empty file.
File renamed without changes.
Loading

0 comments on commit 90cdc86

Please sign in to comment.