Merge remote-tracking branch 'origin/concept-linking' into concept-li…

…nking
Knox-AAU · Dec 15, 2023 · 3fb3b7b · 3fb3b7b
2 parents 4cae81a + d276902
commit 3fb3b7b
Show file tree

Hide file tree

Showing 14 changed files with 146 additions and 157 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -26,4 +26,4 @@ jobs:
       - name: Run tests
         run: |
           echo "Testing..."
-          python -m unittest || exit 1
+          python -m unittest -b || exit 1
diff --git a/Dockerfile b/Dockerfile
@@ -3,7 +3,7 @@ FROM python:3.11-slim
 WORKDIR /code
 
 COPY . .
-RUN pip install --no-cache-dir -r requirements_docker.txt
 
+RUN pip install --no-cache-dir -r requirements_docker.txt
 
 CMD ["python", "-u", "-m", "server.server", "--host", "0.0.0.0", "--port", "4444", "--reload"]
diff --git a/README.md b/README.md
@@ -22,3 +22,9 @@ The triples will be data stored in the form of a subject (entity IRI), predicate
 - Rasmus Rytter Sørensen <[email protected]>
 
 **Group D (Concept Linking)**
+- Caspar Emil Jensen <[email protected]>
+- Gamma Ishimwe Ntakiyimana <[email protected]>
+- Lucas Pedersen <[email protected]>
+- Mikkel Wissing <[email protected]>
+- Rune Eberhardt <[email protected]>
+- Vi Thien Le <[email protected]>
diff --git a/concept_linking/README.md b/concept_linking/README.md
diff --git a/concept_linking/main.py b/concept_linking/main.py
@@ -14,10 +14,10 @@ def entity_type_classification(input_data):
     #stringComparisonSolution(input_data)
 
     # Untrained Spacy
-    untrainedSpacySolution(input_data)
+    #untrainedSpacySolution(input_data)
 
     # PromptEngineering
-    #perform_entity_type_classification(input_data)
+    perform_entity_type_classification(input_data)
 
     # Machine Learning
     #predict(input_data)

diff --git a/concept_linking/requirements.txt b/concept_linking/requirements.txt
@@ -1,4 +1,11 @@
-# Other solutions
+# Tools
+#Requirements for LlamaServer
+#-r tools/LlamaServer/requirements.txt
+
+#Requirements for OntologyGraphBuilder
+#-r tools/OntologyGraphBuilder/requirements.txt
+
+# Solutions
 #Requirements for MachineLearning
 -r solutions/MachineLearning/requirements.txt
 
@@ -11,3 +18,4 @@
 
 #Requirements for UntrainedSpacy
 -r solutions/UntrainedSpacy/requirements.txt
+
diff --git a/concept_linking/solutions/PromptEngineering/main.py b/concept_linking/solutions/PromptEngineering/main.py
@@ -8,9 +8,16 @@
 from relation_extraction.knowledge_graph_messenger import KnowledgeGraphMessenger
 from concept_linking.tools.triple_helper import *
 
-
+# Local API url python
 api_url = "http://127.0.0.1:5000/llama"
-headers = {"Content-Type": "application/json"}
+
+# Local API url docker
+# api_url = "http://llama-cpu-server:5000/llama"
+
+# Remote API url
+# api_url = "http://knox-proxy01.srv.aau.dk/llama-api/llama"
+
+headers = {"Access-Authorization": os.getenv("ACCESS_SECRET"), "Content-Type": "application/json"}
 
 PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
 
@@ -58,6 +65,7 @@ def classify_entity_mentions(input_data, output_sentence_test_run):
 
             outer_while_retry_count = 0
             while outer_while_retry_count < max_outer_retries:  # Run until entity is mapped to a provided ontology class
+                found_classification = False
                 outer_while_retry_count += 1
                 print(f'--- RUN Count #{outer_while_retry_count} (Outer While loop) ---')
                 prompt = {key: value.format(
@@ -96,14 +104,15 @@ def classify_entity_mentions(input_data, output_sentence_test_run):
                     classification = match.group(1) if match and match.group(1) in ontology_classes_list else None
 
                     if classification:
+                        found_classification = True
                         # Generate triples if an entity was successfully classified with the ontology
                         if output_sentence_test_run:
                             triples.append({sentence_key: (content_iri, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "http://dbpedia.org/ontology/" + classification)})
                         else:
                             triples.append((content_iri, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "http://dbpedia.org/ontology/" + classification))
 
                         break  # Exit the while loop if entity is mapped to a provided ontology class
-            if outer_while_retry_count > max_outer_retries:
+            if not found_classification:
                 if output_sentence_test_run:
                     triples.append({sentence_key: (content_iri, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
                                                    "http://dbpedia.org/ontology/unknown")})
@@ -175,7 +184,7 @@ def perform_entity_type_classification(post_json, output_file_path=None, output_
 
 
 if __name__ == '__main__':
-    input_file = os.path.join(PROJECT_ROOT, "data/files/EvaluationData/evaluationSet_EN.json")
+    input_file = os.path.join(PROJECT_ROOT, "data/files/EvaluationData/evaluationSet_EN_small.json")
     output_file = os.path.join(PROJECT_ROOT, "data/files/PromptEngineering/output.json")
 
     f = open(input_file,  encoding="utf-8")

diff --git a/concept_linking/tools/LlamaServer/Dockerfile b/concept_linking/tools/LlamaServer/Dockerfile
@@ -1,25 +1,26 @@
-# Before you create the docker image from this file, you must have either downloaded the llama-2-7b-chat.Q2_K.gguf file
-# manually yourself, or run the server at least one time.
-
 # Use python as base image
-FROM python
-
+FROM python:3.11-slim
 # Set the working directory in the container
 WORKDIR /app
 
-# Copy the current directory contents into the container at /app
-COPY ./llama_cpu_server.py /app/llama_cpu_server.py
-COPY ./llama-2-7b-chat.Q2_K.gguf /app/llama-2-7b-chat.Q2_K.gguf
-COPY ./requirements.txt /app/requirements.txt
+# Copy only the necessary files
+COPY llama_cpu_server.py .
+COPY requirements.txt .
+
+# Install the build tools (build-essential, cmake) needed for the C++ based llama_cpp, plus curl for the model download.
+# This can be removed when app is in production and remote llama api server is reliable and used instead of local llama
+# --no-install-recommends keeps the image small; ca-certificates is listed explicitly so the HTTPS download can verify TLS.
+RUN apt-get update && apt-get install -y --no-install-recommends build-essential ca-certificates cmake curl && rm -rf /var/lib/apt/lists/*
+
 
+# Install dependencies
 RUN pip install --no-cache-dir -r requirements.txt
 
+# Download the model file if it is not already present; -f makes an HTTP error fail the build instead of saving the error page as the model
+RUN test -e /app/llama-2-7b-chat.Q2_K.gguf || curl -fL -o /app/llama-2-7b-chat.Q2_K.gguf 'https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q2_K.gguf?download=true'
+
 # Expose port 5000 outside of the container
 EXPOSE 5000
 
 # Run llama_cpu_server.py when the container launches
-CMD ["python", "llama_cpu_server.py"]
-
-
-#run to build image
-#docker build -t llama-cpu-server .
+CMD ["python", "-u", "-m", "llama_cpu_server", "--host", "0.0.0.0", "--port", "5000", "--reload"]
diff --git a/concept_linking/tools/LlamaServer/docker-compose.yml b/concept_linking/tools/LlamaServer/docker-compose.yml
@@ -0,0 +1,9 @@
+services:
+  llama-cpu-server:
+    build: .
+    container_name: llama-server
+    command: python -u -m llama_cpu_server --host 0.0.0.0 --port 5000 --reload
+    volumes:
+      - ./concept_linking/tools/LlamaServer/llama-2-7b-chat.Q2_K.gguf:/app/concept_linking/tools/LlamaServer/llama-2-7b-chat.Q2_K.gguf
+    ports:
+      - "5000:5000"
diff --git a/relation_extraction/multilingual/llm_messenger.py b/relation_extraction/multilingual/llm_messenger.py
@@ -12,7 +12,7 @@ def send_request(request):
         HEADERS = {"Access-Authorization": os.getenv("ACCESS_SECRET")}
         response = requests.post(url=LLMMessenger.API_endpoint(), json=request, headers=HEADERS)
 
-        #  # Put the location of to the GGUF model that you've download from HuggingFace here
+        #  # Put the location of the GGUF model that you've downloaded from HuggingFace (https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q2_K.gguf?download=true) here
         # model_path = "./relation_extraction/multilingual/llama-2-7b-chat.Q2_K.gguf"	
 
         # # Create a llama model	

diff --git a/server/server.py b/server/server.py
@@ -21,10 +21,10 @@ def do_triple_construction():
         post_data = request.get_data().decode('utf-8')
         post_json = json.loads(post_data)
 
-        RelationExtractor.begin_extraction(post_json)
-        entity_type_classification(post_json)
+        RelationExtractor.begin_extraction(post_json)   # Relation Extraction
+        entity_type_classification(post_json)           # Concept Linking
 
-        message = "Post request was successfully processed. Relation extraction and concept linking completed."
+        message = "Post request was successfully processed. Relation Extraction and Concept Linking completed."
         return jsonify(message=message), 200
 
     except Exception as e:

diff --git a/test/test_concept_linking/test_machineLearning.py b/test/test_concept_linking/test_machineLearning.py
@@ -7,6 +7,11 @@
 from concept_linking.solutions.MachineLearning.src.training_dataset import TrainingDataset
 from concept_linking.solutions.MachineLearning.src.config import TrainingConfig, ModelConfig
 from concept_linking.solutions.MachineLearning.src.data_preprocessing import split_data, load_data, extract_sentences
+from concept_linking.solutions.MachineLearning.main import predict
+from concept_linking.solutions.MachineLearning.src.prediction_dataset import PredictionDataset
+from concept_linking.solutions.MachineLearning.src.data_preprocessing import extract_sentences, load_data
+
+
 import json
 from sklearn.model_selection import train_test_split
 
@@ -16,7 +21,7 @@ class TestMachineLearning(unittest.TestCase):
     def setUp(self):
         self.data = [{"sentences": ["sentence " + str(i)]} for i in range(100)]
 
-    def test_correct_split_ratio(self):
+    def test_split_ratio(self):
         train_data, val_data, test_data = split_data(self.data, test_size=0.2, val_size=0.5, random_state=42)
 
         # Check if the split ratios are correct
@@ -25,7 +30,6 @@ def test_correct_split_ratio(self):
         self.assertEqual(len(test_data), 10)   # 10% for testing
 
     def test_error_on_insufficient_samples(self):
-        # Test with insufficient data
         small_data = [{"sentences": ["sentence 1", "sentence 2"]}]
 
         test_size = 0.5  # This will take 1 sentence for testing, leaving 1 for training and validation
@@ -34,7 +38,7 @@ def test_error_on_insufficient_samples(self):
         with self.assertRaises(ValueError):
             split_data(small_data, test_size=test_size, val_size=val_size, random_state=42)
 
-    def test_reproducibility_with_random_state(self):
+    def test_with_random_state(self):
         train_data1, val_data1, test_data1 = split_data(self.data, test_size=0.2, val_size=0.5, random_state=42)
         train_data2, val_data2, test_data2 = split_data(self.data, test_size=0.2, val_size=0.5, random_state=42)
 
@@ -61,6 +65,9 @@ def test_extract_sentences(self):
         result = extract_sentences(mock_data)
         self.assertEqual(result, expected_sentences)
 
+
+
+
 #class TestTrainModel(unittest.TestCase):
 #
 #    def setUp(self):