Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/concept-linking' into concept-li…
Browse files Browse the repository at this point in the history
…nking
  • Loading branch information
Vi Thien Le authored and Vi Thien Le committed Dec 15, 2023
2 parents 4cae81a + d276902 commit 3fb3b7b
Show file tree
Hide file tree
Showing 14 changed files with 146 additions and 157 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,4 @@ jobs:
- name: Run tests
run: |
echo "Testing..."
python -m unittest || exit 1
python -m unittest -b || exit 1
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ FROM python:3.11-slim
WORKDIR /code

COPY . .
RUN pip install --no-cache-dir -r requirements_docker.txt

RUN pip install --no-cache-dir -r requirements_docker.txt

CMD ["python", "-u", "-m", "server.server", "--host", "0.0.0.0", "--port", "4444", "--reload"]
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,9 @@ The triples will be data stored in the form of a subject (entity IRI), predicate
- Rasmus Rytter Sørensen <[email protected]>

**Group D (Concept Linking)**
- Caspar Emil Jensen <[email protected]>
- Gamma Ishimwe Ntakiyimana <[email protected]>
- Lucas Pedersen <[email protected]>
- Mikkel Wissing <[email protected]>
- Rune Eberhardt <[email protected]>
- Vi Thien Le <[email protected]>
126 changes: 0 additions & 126 deletions concept_linking/README.md

This file was deleted.

4 changes: 2 additions & 2 deletions concept_linking/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@ def entity_type_classification(input_data):
#stringComparisonSolution(input_data)

# Untrained Spacy
untrainedSpacySolution(input_data)
#untrainedSpacySolution(input_data)

# PromptEngineering
#perform_entity_type_classification(input_data)
perform_entity_type_classification(input_data)

# Machine Learning
#predict(input_data)
Expand Down
10 changes: 9 additions & 1 deletion concept_linking/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
# Other solutions
# Tools
#Requirements for LlamaServer
#-r tools/LlamaServer/requirements.txt

#Requirements for OntologyGraphBuilder
#-r tools/OntologyGraphBuilder/requirements.txt

# Solutions
#Requirements for MachineLearning
-r solutions/MachineLearning/requirements.txt

Expand All @@ -11,3 +18,4 @@

#Requirements for UntrainedSpacy
-r solutions/UntrainedSpacy/requirements.txt

17 changes: 13 additions & 4 deletions concept_linking/solutions/PromptEngineering/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,16 @@
from relation_extraction.knowledge_graph_messenger import KnowledgeGraphMessenger
from concept_linking.tools.triple_helper import *


# Local API url python
api_url = "http://127.0.0.1:5000/llama"
headers = {"Content-Type": "application/json"}

# Local API url docker
# api_url = "http://llama-cpu-server:5000/llama"

# Remote API url
# api_url = "http://knox-proxy01.srv.aau.dk/llama-api/llama"

headers = {"Access-Authorization": os.getenv("ACCESS_SECRET"), "Content-Type": "application/json"}

PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))

Expand Down Expand Up @@ -58,6 +65,7 @@ def classify_entity_mentions(input_data, output_sentence_test_run):

outer_while_retry_count = 0
while outer_while_retry_count < max_outer_retries: # Run until entity is mapped to a provided ontology class
found_classification = False
outer_while_retry_count += 1
print(f'--- RUN Count #{outer_while_retry_count} (Outer While loop) ---')
prompt = {key: value.format(
Expand Down Expand Up @@ -96,14 +104,15 @@ def classify_entity_mentions(input_data, output_sentence_test_run):
classification = match.group(1) if match and match.group(1) in ontology_classes_list else None

if classification:
found_classification = True
# Generate triples if an entity was successfully classified with the ontology
if output_sentence_test_run:
triples.append({sentence_key: (content_iri, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "http://dbpedia.org/ontology/" + classification)})
else:
triples.append((content_iri, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "http://dbpedia.org/ontology/" + classification))

break # Exit the while loop if entity is mapped to a provided ontology class
if outer_while_retry_count > max_outer_retries:
if not found_classification:
if output_sentence_test_run:
triples.append({sentence_key: (content_iri, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
"http://dbpedia.org/ontology/unknown")})
Expand Down Expand Up @@ -175,7 +184,7 @@ def perform_entity_type_classification(post_json, output_file_path=None, output_


if __name__ == '__main__':
input_file = os.path.join(PROJECT_ROOT, "data/files/EvaluationData/evaluationSet_EN.json")
input_file = os.path.join(PROJECT_ROOT, "data/files/EvaluationData/evaluationSet_EN_small.json")
output_file = os.path.join(PROJECT_ROOT, "data/files/PromptEngineering/output.json")

f = open(input_file, encoding="utf-8")
Expand Down
29 changes: 15 additions & 14 deletions concept_linking/tools/LlamaServer/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,25 +1,26 @@
# Before you create the docker image from this file, you must have either downloaded the llama-2-7b-chat.Q2_K.gguf file
# manually yourself, or run the server at least one time.

# Use python as base image
FROM python

FROM python:3.11-slim
# Set the working directory in the container
WORKDIR /app

# Copy the current directory contents into the container at /app
COPY ./llama_cpu_server.py /app/llama_cpu_server.py
COPY ./llama-2-7b-chat.Q2_K.gguf /app/llama-2-7b-chat.Q2_K.gguf
COPY ./requirements.txt /app/requirements.txt
# Copy only the necessary files
COPY llama_cpu_server.py .
COPY requirements.txt .

#Install necessary build tools and dependencies for running C++(llama_cpp)
# This can be removed when app is in production and remote llama api server is reliable and used instead of local llama
# Install dependencies and curl
RUN apt-get update && apt-get install -y build-essential cmake curl && rm -rf /var/lib/apt/lists/*


# Install dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Download the model file from the URL if it doesn't exist
RUN test -e /app/llama-2-7b-chat.Q2_K.gguf || curl -o llama-2-7b-chat.Q2_K.gguf -LJO 'https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q2_K.gguf?download=true'

# Expose port 5000 outside of the container
EXPOSE 5000

# Run llama_cpu_server.py when the container launches
CMD ["python", "llama_cpu_server.py"]


#run to build image
#docker build -t llama-cpu-server .
CMD ["python", "-u", "-m", "llama_cpu_server", "--host", "0.0.0.0", "--port", "5000", "--reload"]
9 changes: 9 additions & 0 deletions concept_linking/tools/LlamaServer/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
services:
llama-cpu-server:
build: .
container_name: llama-server
command: python -u -m llama_cpu_server --host 0.0.0.0 --port 5000 --reload
volumes:
- ./concept_linking/tools/LlamaServer/llama-2-7b-chat.Q2_K.gguf:/app/concept_linking/tools/LlamaServer/llama-2-7b-chat.Q2_K.gguf
ports:
- "5000:5000"
2 changes: 1 addition & 1 deletion relation_extraction/multilingual/llm_messenger.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def send_request(request):
HEADERS = {"Access-Authorization": os.getenv("ACCESS_SECRET")}
response = requests.post(url=LLMMessenger.API_endpoint(), json=request, headers=HEADERS)

# # Put the location of to the GGUF model that you've download from HuggingFace here
# # Put the location of the GGUF model that you've downloaded from HuggingFace (https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q2_K.gguf?download=true) here
# model_path = "./relation_extraction/multilingual/llama-2-7b-chat.Q2_K.gguf"

# # Create a llama model
Expand Down
6 changes: 3 additions & 3 deletions server/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ def do_triple_construction():
post_data = request.get_data().decode('utf-8')
post_json = json.loads(post_data)

RelationExtractor.begin_extraction(post_json)
entity_type_classification(post_json)
RelationExtractor.begin_extraction(post_json) # Relation Extraction
entity_type_classification(post_json) # Concept Linking

message = "Post request was successfully processed. Relation extraction and concept linking completed."
message = "Post request was successfully processed. Relation Extraction and Concept Linking completed."
return jsonify(message=message), 200

except Exception as e:
Expand Down
13 changes: 10 additions & 3 deletions test/test_concept_linking/test_machineLearning.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@
from concept_linking.solutions.MachineLearning.src.training_dataset import TrainingDataset
from concept_linking.solutions.MachineLearning.src.config import TrainingConfig, ModelConfig
from concept_linking.solutions.MachineLearning.src.data_preprocessing import split_data, load_data, extract_sentences
from concept_linking.solutions.MachineLearning.main import predict
from concept_linking.solutions.MachineLearning.src.prediction_dataset import PredictionDataset
from concept_linking.solutions.MachineLearning.src.data_preprocessing import extract_sentences, load_data


import json
from sklearn.model_selection import train_test_split

Expand All @@ -16,7 +21,7 @@ class TestMachineLearning(unittest.TestCase):
def setUp(self):
self.data = [{"sentences": ["sentence " + str(i)]} for i in range(100)]

def test_correct_split_ratio(self):
def test_split_ratio(self):
train_data, val_data, test_data = split_data(self.data, test_size=0.2, val_size=0.5, random_state=42)

# Check if the split ratios are correct
Expand All @@ -25,7 +30,6 @@ def test_correct_split_ratio(self):
self.assertEqual(len(test_data), 10) # 10% for testing

def test_error_on_insufficient_samples(self):
# Test with insufficient data
small_data = [{"sentences": ["sentence 1", "sentence 2"]}]

test_size = 0.5 # This will take 1 sentence for testing, leaving 1 for training and validation
Expand All @@ -34,7 +38,7 @@ def test_error_on_insufficient_samples(self):
with self.assertRaises(ValueError):
split_data(small_data, test_size=test_size, val_size=val_size, random_state=42)

def test_reproducibility_with_random_state(self):
def test_with_random_state(self):
train_data1, val_data1, test_data1 = split_data(self.data, test_size=0.2, val_size=0.5, random_state=42)
train_data2, val_data2, test_data2 = split_data(self.data, test_size=0.2, val_size=0.5, random_state=42)

Expand All @@ -61,6 +65,9 @@ def test_extract_sentences(self):
result = extract_sentences(mock_data)
self.assertEqual(result, expected_sentences)




#class TestTrainModel(unittest.TestCase):
#
# def setUp(self):
Expand Down
Loading

0 comments on commit 3fb3b7b

Please sign in to comment.