diff --git a/__pycache__/brain_processing.cpython-39.pyc b/__pycache__/brain_processing.cpython-39.pyc
new file mode 100644
index 0000000..e1883c3
Binary files /dev/null and b/__pycache__/brain_processing.cpython-39.pyc differ
diff --git a/__pycache__/named_entity_linking.cpython-39.pyc b/__pycache__/named_entity_linking.cpython-39.pyc
index 1f96bc0..4109922 100644
Binary files a/__pycache__/named_entity_linking.cpython-39.pyc and b/__pycache__/named_entity_linking.cpython-39.pyc differ
diff --git a/__pycache__/simple_ner.cpython-39.pyc b/__pycache__/simple_ner.cpython-39.pyc
new file mode 100644
index 0000000..367f746
Binary files /dev/null and b/__pycache__/simple_ner.cpython-39.pyc differ
diff --git a/brain_processing.py b/brain_processing.py
new file mode 100644
index 0000000..73f3698
--- /dev/null
+++ b/brain_processing.py
@@ -0,0 +1,89 @@
+from emissor.representation.scenario import Modality, ImageSignal, TextSignal, Mention, Annotation, Scenario
+from cltl.combot.backend.api.discrete import UtteranceType
+from cltl.brain.infrastructure.rdf_builder import RdfBuilder
+from rdflib import RDFS
+from datetime import date
+from random import getrandbits
+import requests
+
+
def seq_to_text(seq):
    """Concatenate an iterable of characters (e.g. a TextSignal.seq) into one string.

    Uses str.join instead of repeated ``+=``, which is quadratic on long sequences.
    """
    return "".join(seq)
+
+
def scenario_utterance_to_capsule(scenario: Scenario, signal: TextSignal, author: str, perspective: dict, subj: str, pred: str, obj: str):
    """Build a STATEMENT capsule for a TextSignal carrying a (subj, pred, obj) triple.

    Fixes: the ``author`` parameter was accepted but ignored — the author and the
    single ``people`` entry were hard-coded to "carl"/"Carl". Placeholder
    objects/people now come from generate_obl_object_json, consistent with
    scenario_utterance_and_triple_to_capsule.

    NOTE(review): performs a live network call to ipinfo.io for location context.
    """
    place_id = getrandbits(8)
    location = requests.get("https://ipinfo.io").json()

    placeholders = generate_obl_object_json(author)
    capsule = {"chat": scenario.id,
               "turn": signal.id,
               "author": author,  # was hard-coded "carl"
               "utterance": seq_to_text(signal.seq),
               "utterance_type": UtteranceType.STATEMENT,
               "position": "0-" + str(len(signal.seq)),  # TODO generate the true offset range
               "subject": {"label": subj, "type": "person"},
               "predicate": {"type": pred},
               "object": {"label": obj, "type": "object"},
               "perspective": perspective,
               # NOTE(review): scenario.scenario.context looks like a wrapper attribute
               # (same access in scenario_utterance_and_triple_to_capsule) — confirm.
               "context_id": scenario.scenario.context,
               "date": date.today(),
               "place": location['city'],
               "place_id": place_id,
               "country": location['country'],
               "region": location['region'],
               "city": location['city'],
               "objects": placeholders['objects'],
               "people": placeholders['people']
               }
    return capsule
+
+
+# create a capsule for a TextSignal with a triple and perspective string
def scenario_utterance_and_triple_to_capsule(scenario: Scenario,
                                             place_id: str,
                                             location: str,
                                             signal: TextSignal,
                                             author: str,
                                             utterance_type: UtteranceType,
                                             perspective: dict,
                                             triple: dict):
    """Assemble a brain capsule for ``signal`` from an already-extracted triple.

    Couples the utterance text with its triple and perspective, the scenario
    context (place/date), and the placeholder objects/people entries that the
    brain update requires (see generate_obl_object_json).
    """
    placeholders = generate_obl_object_json(author)
    subj = triple['subject']
    pred = triple['predicate']
    obj = triple['object']

    capsule = {
        "chat": scenario.id,
        "turn": signal.id,
        "author": author,
        "utterance": seq_to_text(signal.seq),
        "utterance_type": utterance_type,
        "position": "0-" + str(len(signal.seq)),  # TODO generate the true offset range
        "subject": {'label': subj['label'], 'type': subj['type']},
        "predicate": {'type': pred['label']},
        "object": {'label': obj['label'], 'type': obj['type']},
        "perspective": perspective,
        "context_id": scenario.scenario.context,
        ##### standard elements
        "date": date.today(),
        "place": location['city'],
        "place_id": place_id,
        "country": location['country'],
        "region": location['region'],
        "city": location['city'],
        "objects": placeholders['objects'],
        "people": placeholders['people'],
    }

    return capsule
+
+
+# Function to generate bogus elements for capsules. Without these, the update function fails
def generate_obl_object_json(human: str):
    """Return placeholder context entries (objects + one person) for a capsule.

    The brain's update function fails without these bogus elements, so every
    capsule gets the same dummy objects plus ``human`` as the only person.
    """
    dummy_objects = [
        {'type': 'chair', 'confidence': 0.59, 'id': 1},
        {'type': 'table', 'confidence': 0.73, 'id': 1},
        {'type': 'pillbox', 'confidence': 0.32, 'id': 1},
    ]
    return {
        "objects": dummy_objects,
        "people": [{'name': human, 'confidence': 0.98, 'id': 1}],
    }
\ No newline at end of file
diff --git a/data/test_scenario/test_scenario.json b/data/test_scenario/test_scenario.json
new file mode 100644
index 0000000..2ee9be2
--- /dev/null
+++ b/data/test_scenario/test_scenario.json
@@ -0,0 +1,33 @@
+{
+ "context": "AGENT",
+ "id": "test_scenario",
+ "signals": {
+ "image": "./image.json",
+ "text": "./text.json"
+ },
+ "@context": {
+ "Scenario": "https://emissor.org#Scenario",
+ "id": "@id",
+ "context": "https://emissor.org#context",
+ "signals": "https://emissor.org#signals",
+ "ruler": "https://emissor.org#ruler"
+ },
+ "@type": "Scenario",
+ "ruler": {
+ "container_id": "test_scenario",
+ "start": 662994,
+ "end": 662999,
+ "@context": {
+ "TemporalRuler": "https://emissor.org#TemporalRuler",
+ "id": "@id",
+ "start": "https://emissor.org#start",
+ "end": "https://emissor.org#end",
+ "container_id": {
+ "@id": "https://emissor.org#container_id",
+ "@type": "@id"
+ }
+ },
+ "@type": "TemporalRuler",
+ "_py_type": "emissor.representation.container-TemporalRuler"
+ }
+}
\ No newline at end of file
diff --git a/logs/2021-10-15-10-21/brain_log_2021-10-15-10-21-29.trig b/logs/2021-10-15-10-21/brain_log_2021-10-15-10-21-29.trig
new file mode 100644
index 0000000..443cd3b
--- /dev/null
+++ b/logs/2021-10-15-10-21/brain_log_2021-10-15-10-21-29.trig
@@ -0,0 +1,11 @@
+@prefix leolaniTalk: <http://cltl.nl/leolani/talk/> .
+@prefix leolaniWorld: <http://cltl.nl/leolani/world/> .
+@prefix n2mu: <http://cltl.nl/leolani/n2mu/> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+
+leolaniWorld:Instances {
+ n2mu:jaap_1 rdfs:label "PhD",
+ "hij",
+ "jaap" .
+}
+
diff --git a/logs/2021-10-15-10-22/brain_log_2021-10-15-10-22-10.trig b/logs/2021-10-15-10-22/brain_log_2021-10-15-10-22-10.trig
new file mode 100644
index 0000000..443cd3b
--- /dev/null
+++ b/logs/2021-10-15-10-22/brain_log_2021-10-15-10-22-10.trig
@@ -0,0 +1,11 @@
+@prefix leolaniTalk: <http://cltl.nl/leolani/talk/> .
+@prefix leolaniWorld: <http://cltl.nl/leolani/world/> .
+@prefix n2mu: <http://cltl.nl/leolani/n2mu/> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+
+leolaniWorld:Instances {
+ n2mu:jaap_1 rdfs:label "PhD",
+ "hij",
+ "jaap" .
+}
+
diff --git a/logs/2021-10-15-10-22/brain_log_2021-10-15-10-22-32.trig b/logs/2021-10-15-10-22/brain_log_2021-10-15-10-22-32.trig
new file mode 100644
index 0000000..25cfe5c
--- /dev/null
+++ b/logs/2021-10-15-10-22/brain_log_2021-10-15-10-22-32.trig
@@ -0,0 +1,10 @@
+@prefix leolaniTalk: <http://cltl.nl/leolani/talk/> .
+@prefix leolaniWorld: <http://cltl.nl/leolani/world/> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+
+leolaniWorld:Instances {
+ leolaniWorld:jaap_1 rdfs:label "PhD",
+ "hij",
+ "jaap" .
+}
+
diff --git a/logs/2021-11-22-17-46/brain_log_2021-11-22-17-46-01.trig b/logs/2021-11-22-17-46/brain_log_2021-11-22-17-46-01.trig
new file mode 100644
index 0000000..25cfe5c
--- /dev/null
+++ b/logs/2021-11-22-17-46/brain_log_2021-11-22-17-46-01.trig
@@ -0,0 +1,10 @@
+@prefix leolaniTalk: <http://cltl.nl/leolani/talk/> .
+@prefix leolaniWorld: <http://cltl.nl/leolani/world/> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+
+leolaniWorld:Instances {
+ leolaniWorld:jaap_1 rdfs:label "PhD",
+ "hij",
+ "jaap" .
+}
+
diff --git a/named_entity_linking.py b/named_entity_linking.py
index 05b716c..e995449 100644
--- a/named_entity_linking.py
+++ b/named_entity_linking.py
@@ -16,25 +16,28 @@
from cltl.brain.utils import base_cases
from cltl.brain.basic_brain import BasicBrain
+from cltl.brain.long_term_memory import LongTermMemory
from cltl.brain.utils.helper_functions import read_query
+from cltl.brain.infrastructure.rdf_builder import RdfBuilder
+from rdflib import RDFS, Literal
-class NamedEntityLinker(BasicBrain):
+class NamedEntityLinker(LongTermMemory):
    def __init__(self, address, log_dir, clear_all=False):
        # Delegate to the LongTermMemory superclass: connects to the triple store
        # at `address` and writes .trig logs under `log_dir`.
        # NOTE(review): presumably clear_all wipes the store on startup — confirm
        # against LongTermMemory's signature.
        super(NamedEntityLinker, self).__init__(address, log_dir, clear_all)
- # Problem: How are uri's defined right now in the brain? Is ambiguity taken into account? -->
- # Otherwise uri's are the same
- # E.g. if labels are firstname-lastname then query needs to be RE only looking at part before hyphen
-
- def link_entities(self, ne_text, baseline='popularity'):
- if baseline == 'popularity':
- uri = self._get_most_popular(ne_text)
- elif baseline == 'recency':
- uri = self._get_most_recent(ne_text)
- return uri
+ def link_entities(self, ne_list, baseline='popularity'):
+ uri_list = []
+ for ne_text in ne_list:
+ if baseline == 'popularity':
+ uri = self._get_most_popular(ne_text)
+ uri_list.append((uri, ne_text))
+ elif baseline == 'recency':
+ uri = self._get_most_recent(ne_text)
+ uri_list.append((uri, ne_text))
+ return uri_list
def _get_most_popular(self, ne_text):
query = read_query('/Users/jaapkruijt/Documents/GitHub/NEL-coreference/popularity') % ne_text
@@ -42,19 +45,46 @@ def _get_most_popular(self, ne_text):
# print(response)
pop_ordered = []
for row in response:
+ print(row)
uri = row['ent']['value']
occurrences = row['num_mentions']['value']
pop_ordered.append((uri, occurrences))
if pop_ordered:
uri, popularity = pop_ordered[0]
- # else:
- #
- # # TODO add functionality to add entity to graph
+ else:
+ uri_name = f'{ne_text}_1'
+ uri = self._rdf_builder.create_resource_uri('LW', uri_name)
return uri
    def _get_most_recent(self, ne_text):
        # TODO(review): unimplemented stub — always returns None, so
        # link_entities(..., baseline='recency') yields (None, ne_text) pairs.
        # Implement using recency.rq, or raise NotImplementedError until then.
        pass
+ def add_labels(self, capsule, uri=None):
+ ent_uri = self._rdf_builder.create_resource_uri('LW', capsule['subject']['id']) if not uri else uri
+ for label in capsule['labels']:
+ self.instance_graph.add((ent_uri, RDFS.label, Literal(label)))
+
+ def add_labels_2(self, identity, labels, uri=None):
+ ent_uri = self._rdf_builder.create_resource_uri('LW', identity) if not uri else uri
+ for label in labels:
+ self.instance_graph.add((ent_uri, RDFS.label, Literal(label)))
+
+ def update_brain(self):
+
+ data = self._serialize(self._brain_log())
+ code = self._upload_to_brain(data)
+
+
if __name__ == "__main__":
    import pathlib

    # Smoke test against a local GraphDB repository: attach three labels to the
    # entity 'jaap_1' and push the resulting graph to the store.
    log_path = pathlib.Path('./logs')
    # (removed leftover debug `print(type(log_path))`)
    nel = NamedEntityLinker(address="http://localhost:7200/repositories/sandbox",
                            log_dir=log_path)
    nel.add_labels_2('jaap_1', ['jaap', 'PhD', 'hij'])
    nel.update_brain()
+
diff --git a/recency.rq b/recency.rq
new file mode 100644
index 0000000..fc3fd5b
--- /dev/null
+++ b/recency.rq
@@ -0,0 +1,11 @@
+prefix gaf: <http://groundedannotationframework.org/gaf#>
+PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
+
+select ?ent (COUNT(DISTINCT ?e) as ?num_mentions) where{
+ ?ent rdfs:label "%s".
+
+ ?ent gaf:denotedIn ?e.
+ }
+
+group by ?ent
+ order by DESC(COUNT(DISTINCT ?e))
\ No newline at end of file
diff --git a/simple_ner.py b/simple_ner.py
index cbbd847..f1695e8 100644
--- a/simple_ner.py
+++ b/simple_ner.py
@@ -2,44 +2,83 @@
from tempfile import TemporaryDirectory
from pathlib import Path
from named_entity_linking import NamedEntityLinker
+from brain_processing import scenario_utterance_to_capsule
-utt = "Hi this is my friend #123 and his supervisor Piek"
+# These modules are imported for the added let's-chat stuff
+from emissor.persistence import ScenarioStorage
+from emissor.representation.annotation import AnnotationType, Token, NER
+from emissor.representation.container import Index
+from emissor.representation.scenario import Modality, ImageSignal, TextSignal, Mention, Annotation, Scenario
+import uuid
+import time
+from datetime import datetime
+
+# These modules are not included in NEL-coreference at the moment!! Won't work outside this machine
+from src.chatbots.util import driver_util, capsule_util
+from src.chatbots.dummies import text_to_triple as ttt
+
+from rdflib.namespace import RDFS
+
+utt = "Carl likes Bart"
# Idea: can the system search for NP's in the surroundings of a NE, and remember those
-# Testing linking separate from NER (by e.g. using hashes and a dict) (NamedEntityRecognizer)
-# Make linking independent from ner function (so not nested inside the ner function but use its output)
-# Updating the brain: a lot of it is already done automatically in the LTM update() function
-# If I do it here as well then it is done twice; what is the right way to approach this?
-# Using dummy triples that don't require an utterance?
+# TODO Testing linking separate from NER (by e.g. using hashes and a dict) (NamedEntityRecognizer)
+# Using dummy triples that don't require an utterance?
-def named_entity_recognition(utterance, nel: NamedEntityLinker):
def add_ner_annotation(signal: TextSignal):
    """Run spaCy over the signal's text and attach the results as mentions.

    Adds one token-level annotation per spaCy token and one NER annotation per
    recognized entity to ``signal.mentions``, then returns the list of entity
    surface strings (lowercased).

    NOTE(review): relies on a module-global ``nlp`` pipeline that is only
    created under ``if __name__ == "__main__"`` — calling this after importing
    the module will fail; consider passing ``nlp`` as a parameter.
    """
    processor_name = "spaCy"
    utterance = ''.join(signal.seq)
    doc = nlp(utterance)
    if not len(doc):
        # Guard: zip(*[]) below raises ValueError on an empty document.
        return []
    offsets, tokens = zip(*[(Index(signal.id, token.idx, token.idx + len(token)),
                             Token.for_string(token.text))
                            for token in doc])
    ents = [NER.for_string(ent.label_) for ent in doc.ents]
    entity_list = [ent.text.lower() for ent in doc.ents]
    # BUG FIX: entity_list is lowercased but token.value keeps the original
    # casing, so capitalized entities (e.g. "Carl") never produced a segment;
    # compare case-insensitively. (Multi-word entities still match only their
    # individual tokens — TODO confirm that is the intended granularity.)
    segments = [token.ruler for token in tokens if token.value.lower() in entity_list]

    annotations = [Annotation(AnnotationType.TOKEN.name.lower(), token, processor_name, int(time.time()))
                   for token in tokens]
    ner_annotations = [Annotation(AnnotationType.NER.name.lower(), ent, processor_name, int(time.time()))
                       for ent in ents]

    signal.mentions.extend([Mention(str(uuid.uuid4()), [offset], [annotation])
                            for offset, annotation in zip(offsets, annotations)])
    signal.mentions.extend([Mention(str(uuid.uuid4()), [segment], [annotation])
                            for segment, annotation in zip(segments, ner_annotations)])
    return entity_list
def utterance_processor(utterance, scenario, brain, author):
    # Wrap the raw utterance in a TextSignal, annotate it with NER mentions, and
    # register the signal on the scenario. Returns the list of lowercased
    # entity surface strings found in the utterance.
    # NOTE(review): `brain` and `author` are currently unused — confirm whether
    # capsule creation / author attribution was meant to happen here.
    text_signal = driver_util.create_text_signal(scenario, utterance)
    entity_text = add_ner_annotation(text_signal)
    scenario.append_signal(text_signal)

    return entity_text
+
def main(log_path, utterance):
    # End-to-end driver: set up the entity linker and a test scenario, run NER
    # annotation over `utterance`, then link the found entities against the brain.
    nel = NamedEntityLinker(address="http://localhost:7200/repositories/sandbox",  # local GraphDB repo
                            log_dir=log_path)
    scenario_path = './data'
    scenario_id = 'test_scenario'
    scenario_storage = driver_util.create_scenario(scenario_path, scenario_id)
    # NOTE(review): datetime.now().microsecond is used for both scenario start and
    # end — presumably a placeholder; verify the intended timestamp unit.
    scen = scenario_storage.create_scenario(scenario_id, datetime.now().microsecond, datetime.now().microsecond, 'AGENT')
    entity_text = utterance_processor(utterance, scen, nel, 'Jaap')

    # link_entities expects a list with all entities in one
    # but the new ner gives a list with a single entity per utterance(?)

    entities = nel.link_entities(entity_text)

    return entities
if __name__ == "__main__":
    # The spaCy pipeline is created as a module global; add_ner_annotation reads it.
    nlp = spacy.load('en_core_web_sm')
    with TemporaryDirectory(prefix="brain-log") as log_path:
        res = main(Path(log_path), utt)
        print(res)
\ No newline at end of file