def seq_to_text(seq):
    """Concatenate a character sequence (e.g. a TextSignal's ``seq``) into a string."""
    return "".join(seq)


def scenario_utterance_to_capsule(scenario: "Scenario", signal: "TextSignal", author: str,
                                  perspective: dict, subj: str, pred: str, obj: str):
    """Build a brain 'capsule' dict for one utterance carrying a (subj, pred, obj) triple.

    Performs a network call to ipinfo.io to fill the location context fields;
    ``objects``/``people`` are placeholder observations (the brain update fails
    without them).

    :param scenario: the EMISSOR scenario this utterance belongs to
    :param signal: the text signal holding the utterance characters
    :param author: speaker of the utterance
    :param perspective: perspective dict attached verbatim to the capsule
    :param subj/pred/obj: triple components (subject typed 'person', object 'object')
    :return: capsule dict ready for brain ingestion
    """
    from datetime import date
    from random import getrandbits
    import requests  # local import: only needed for the geo lookup below

    place_id = getrandbits(8)
    location = requests.get("https://ipinfo.io").json()

    capsule = {"chat": scenario.id,
               "turn": signal.id,
               # BUG FIX: was hardcoded "carl", silently ignoring the author argument
               "author": author,
               "utterance": seq_to_text(signal.seq),
               "utterance_type": UtteranceType.STATEMENT,  # imported at module top in the real file
               "position": "0-" + str(len(signal.seq)),  # TODO generate the true offset range
               "subject": {"label": subj, "type": "person"},
               "predicate": {"type": pred},
               "object": {"label": obj, "type": "object"},
               "perspective": perspective,
               # NOTE(review): scenario.scenario.context looks odd for a Scenario
               # argument (self-reference?) — confirm against the emissor API
               "context_id": scenario.scenario.context,
               "date": date.today(),
               "place": location['city'],
               "place_id": place_id,
               "country": location['country'],
               "region": location['region'],
               "city": location['city'],
               # placeholder observations; see generate_obl_object_json
               "objects": [{'type': 'chair', 'confidence': 0.59, 'id': 1},
                           {'type': 'table', 'confidence': 0.73, 'id': 1},
                           {'type': 'pillbox', 'confidence': 0.32, 'id': 1}],
               "people": [{'name': 'Carl', 'confidence': 0.98, 'id': 1}]
               }
    return capsule


def scenario_utterance_and_triple_to_capsule(scenario: "Scenario",
                                             place_id: str,
                                             location: dict,
                                             signal: "TextSignal",
                                             author: str,
                                             utterance_type: "UtteranceType",
                                             perspective: dict,
                                             triple: dict):
    """Build a capsule for a TextSignal from an already-extracted triple.

    Unlike :func:`scenario_utterance_to_capsule` this takes the location dict and
    place_id as arguments (no network call) and expects a structured ``triple``
    with 'subject'/'predicate'/'object' entries carrying 'label'/'type' keys.
    """
    from datetime import date

    # bogus objects/people; the brain update function fails without them
    value = generate_obl_object_json(author)
    capsule = {"chat": scenario.id,
               "turn": signal.id,
               "author": author,
               "utterance": seq_to_text(signal.seq),
               "utterance_type": utterance_type,
               "position": "0-" + str(len(signal.seq)),  # TODO generate the true offset range
               "subject": {'label': triple['subject']['label'], 'type': triple['subject']['type']},
               "predicate": {'type': triple['predicate']['label']},
               "object": {'label': triple['object']['label'], 'type': triple['object']['type']},
               "perspective": perspective,
               # NOTE(review): same scenario.scenario.context oddity as above — confirm
               "context_id": scenario.scenario.context,
               ##### standard elements
               "date": date.today(),
               "place": location['city'],
               "place_id": place_id,
               "country": location['country'],
               "region": location['region'],
               "city": location['city'],
               "objects": value['objects'],
               "people": value['people']
               }

    return capsule
def generate_obl_object_json(human: str):
    """Return placeholder context observations for a capsule.

    The brain's update function rejects capsules without ``objects``/``people``
    entries, so these bogus-but-well-formed observations are supplied.

    :param human: name to record as the (only) observed person
    :return: dict with 'objects' and 'people' observation lists
    """
    placeholder_objects = [
        {'type': 'chair', 'confidence': 0.59, 'id': 1},
        {'type': 'table', 'confidence': 0.73, 'id': 1},
        {'type': 'pillbox', 'confidence': 0.32, 'id': 1},
    ]
    return {
        "objects": placeholder_objects,
        "people": [{'name': human, 'confidence': 0.98, 'id': 1}],
    }
+} + diff --git a/logs/2021-10-15-10-22/brain_log_2021-10-15-10-22-10.trig b/logs/2021-10-15-10-22/brain_log_2021-10-15-10-22-10.trig new file mode 100644 index 0000000..443cd3b --- /dev/null +++ b/logs/2021-10-15-10-22/brain_log_2021-10-15-10-22-10.trig @@ -0,0 +1,11 @@ +@prefix leolaniTalk: . +@prefix leolaniWorld: . +@prefix n2mu: . +@prefix rdfs: . + +leolaniWorld:Instances { + n2mu:jaap_1 rdfs:label "PhD", + "hij", + "jaap" . +} + diff --git a/logs/2021-10-15-10-22/brain_log_2021-10-15-10-22-32.trig b/logs/2021-10-15-10-22/brain_log_2021-10-15-10-22-32.trig new file mode 100644 index 0000000..25cfe5c --- /dev/null +++ b/logs/2021-10-15-10-22/brain_log_2021-10-15-10-22-32.trig @@ -0,0 +1,10 @@ +@prefix leolaniTalk: . +@prefix leolaniWorld: . +@prefix rdfs: . + +leolaniWorld:Instances { + leolaniWorld:jaap_1 rdfs:label "PhD", + "hij", + "jaap" . +} + diff --git a/logs/2021-11-22-17-46/brain_log_2021-11-22-17-46-01.trig b/logs/2021-11-22-17-46/brain_log_2021-11-22-17-46-01.trig new file mode 100644 index 0000000..25cfe5c --- /dev/null +++ b/logs/2021-11-22-17-46/brain_log_2021-11-22-17-46-01.trig @@ -0,0 +1,10 @@ +@prefix leolaniTalk: . +@prefix leolaniWorld: . +@prefix rdfs: . + +leolaniWorld:Instances { + leolaniWorld:jaap_1 rdfs:label "PhD", + "hij", + "jaap" . 
class NamedEntityLinker(LongTermMemory):
    """Links recognized named-entity strings to URIs in the brain (GraphDB).

    Extends LongTermMemory so label triples can be written directly to the
    instance graph and the log can be uploaded to the triple store.
    """

    # Directory holding the SPARQL query templates (popularity / recency).
    # TODO(review): absolute user path — make configurable or package-relative.
    QUERY_DIR = '/Users/jaapkruijt/Documents/GitHub/NEL-coreference'

    def __init__(self, address, log_dir, clear_all=False):
        """
        :param address: SPARQL endpoint of the brain repository
        :param log_dir: directory for brain .trig logs
        :param clear_all: if True, wipe the repository on startup
        """
        super(NamedEntityLinker, self).__init__(address, log_dir, clear_all)

    def link_entities(self, ne_list, baseline='popularity'):
        """Resolve each entity string in ``ne_list`` to a URI.

        :param ne_list: iterable of entity surface strings
        :param baseline: 'popularity' (most-mentioned candidate) or 'recency'
        :return: list of (uri, entity_text) pairs, in input order
        :raises ValueError: on an unknown baseline (previously entities were
            silently dropped and an empty list returned)
        """
        resolvers = {'popularity': self._get_most_popular,
                     'recency': self._get_most_recent}
        try:
            resolve = resolvers[baseline]
        except KeyError:
            raise ValueError(f"unknown baseline: {baseline!r}")
        return [(resolve(ne_text), ne_text) for ne_text in ne_list]

    def _get_most_popular(self, ne_text):
        """Return the URI whose label matches ``ne_text`` with the most mentions.

        Falls back to minting a fresh '<text>_1' URI in the world graph when the
        entity is unknown to the brain.
        """
        query = read_query(f'{self.QUERY_DIR}/popularity') % ne_text
        # NOTE(review): this line sits between diff hunks and is not visible in
        # the patch; _submit_query is the BasicBrain convention — confirm.
        response = self._submit_query(query)
        pop_ordered = []
        for row in response:  # debug print(row) removed
            uri = row['ent']['value']
            occurrences = row['num_mentions']['value']
            pop_ordered.append((uri, occurrences))
        if pop_ordered:
            # rows arrive ordered by mention count (see popularity query)
            uri, popularity = pop_ordered[0]
        else:
            # unseen entity: mint a new URI in the world ('LW') graph
            uri = self._rdf_builder.create_resource_uri('LW', f'{ne_text}_1')
        return uri

    def _get_most_recent(self, ne_text):
        # TODO: not implemented — see recency.rq for the intended query
        pass

    def add_labels(self, capsule, uri=None):
        """Add rdfs:label triples for capsule['labels'] to the instance graph.

        :param capsule: capsule dict with ['subject']['id'] and ['labels']
        :param uri: explicit entity URI; minted from the subject id when None
        """
        self.add_labels_2(capsule['subject']['id'], capsule['labels'], uri)

    def add_labels_2(self, identity, labels, uri=None):
        """Add rdfs:label triples for ``labels`` to the entity ``identity``.

        :param identity: local name used to mint the URI when ``uri`` is None
        :param labels: iterable of label strings
        :param uri: explicit entity URI, overrides ``identity`` when given
        """
        ent_uri = uri if uri else self._rdf_builder.create_resource_uri('LW', identity)
        for label in labels:
            self.instance_graph.add((ent_uri, RDFS.label, Literal(label)))

    def update_brain(self):
        """Serialize the current brain log and upload it to the triple store."""
        data = self._serialize(self._brain_log())
        self._upload_to_brain(data)  # return code was assigned to a dead variable


if __name__ == "__main__":
    import pathlib

    log_path = pathlib.Path('./logs')  # debug print(type(log_path)) removed
    nel = NamedEntityLinker(address="http://localhost:7200/repositories/sandbox",
                            log_dir=log_path)
    nel.add_labels_2('jaap_1', ['jaap', 'PhD', 'hij'])
    nel.update_brain()
+ } + +group by ?ent + order by DESC(COUNT(DISTINCT ?e)) \ No newline at end of file diff --git a/simple_ner.py b/simple_ner.py index cbbd847..f1695e8 100644 --- a/simple_ner.py +++ b/simple_ner.py @@ -2,44 +2,83 @@ from tempfile import TemporaryDirectory from pathlib import Path from named_entity_linking import NamedEntityLinker +from brain_processing import scenario_utterance_to_capsule -utt = "Hi this is my friend #123 and his supervisor Piek" +# These modules are imported for the added let's-chat stuff +from emissor.persistence import ScenarioStorage +from emissor.representation.annotation import AnnotationType, Token, NER +from emissor.representation.container import Index +from emissor.representation.scenario import Modality, ImageSignal, TextSignal, Mention, Annotation, Scenario +import uuid +import time +from datetime import datetime + +# These modules are not included in NEL-coreference at the moment!! Won't work outside this machine +from src.chatbots.util import driver_util, capsule_util +from src.chatbots.dummies import text_to_triple as ttt + +from rdflib.namespace import RDFS + +utt = "Carl likes Bart" # Idea: can the system search for NP's in the surroundings of a NE, and remember those -# Testing linking separate from NER (by e.g. using hashes and a dict) (NamedEntityRecognizer) -# Make linking independent from ner function (so not nested inside the ner function but use its output) -# Updating the brain: a lot of it is already done automatically in the LTM update() function -# If I do it here as well then it is done twice; what is the right way to approach this? -# Using dummy triples that don't require an utterance? +# TODO Testing linking separate from NER (by e.g. using hashes and a dict) (NamedEntityRecognizer) +# Using dummy triples that don't require an utterance? 
def add_ner_annotation(signal: "TextSignal"):
    """Run spaCy over the signal text and attach token + NER mentions to it.

    Uses the module-level ``nlp`` pipeline (loaded in ``__main__``).

    :param signal: TextSignal whose ``seq`` holds the utterance characters;
        its ``mentions`` list is extended in place
    :return: list of lower-cased entity surface strings found by spaCy
    """
    processor_name = "spaCy"
    utterance = ''.join(signal.seq)
    doc = nlp(utterance)

    # character-offset ruler plus a Token annotation per spaCy token
    offsets, tokens = zip(*[(Index(signal.id, token.idx, token.idx + len(token)), Token.for_string(token.text))
                            for token in doc])

    ents = [NER.for_string(ent.label_) for ent in doc.ents]
    entity_list = [ent.text.lower() for ent in doc.ents]
    # NOTE(review): this matches single tokens against whole entity strings, so
    # a multi-word entity ("New York") never matches a token value and gets no
    # NER mention — confirm whether that is intended.
    segments = [token.ruler for token in tokens if token.value in entity_list]

    # one timestamp for the whole pass so token and NER annotations agree
    now = int(time.time())
    annotations = [Annotation(AnnotationType.TOKEN.name.lower(), token, processor_name, now)
                   for token in tokens]
    ner_annotations = [Annotation(AnnotationType.NER.name.lower(), ent, processor_name, now)
                       for ent in ents]

    signal.mentions.extend([Mention(str(uuid.uuid4()), [offset], [annotation])
                            for offset, annotation in zip(offsets, annotations)])
    signal.mentions.extend([Mention(str(uuid.uuid4()), [segment], [annotation])
                            for segment, annotation in zip(segments, ner_annotations)])

    return entity_list


def utterance_processor(utterance, scenario, brain, author):
    """Wrap an utterance in a TextSignal, annotate it, and append it to the scenario.

    :param utterance: raw utterance text
    :param scenario: EMISSOR scenario to append the signal to
    :param brain: currently unused; kept for interface stability
    :param author: currently unused; kept for interface stability
    :return: list of entity strings recognized in the utterance
    """
    text_signal = driver_util.create_text_signal(scenario, utterance)
    entity_text = add_ner_annotation(text_signal)
    scenario.append_signal(text_signal)
    return entity_text


def main(log_path, utterance):
    """Create a test scenario, NER-annotate one utterance, and link its entities.

    :param log_path: directory for the brain's .trig logs
    :param utterance: utterance text to process
    :return: list of (uri, entity_text) pairs from the linker
    """
    nel = NamedEntityLinker(address="http://localhost:7200/repositories/sandbox",
                            log_dir=log_path)
    scenario_path = './data'
    scenario_id = 'test_scenario'
    scenario_storage = driver_util.create_scenario(scenario_path, scenario_id)
    # NOTE(review): microsecond as start/end looks like a placeholder timestamp
    scen = scenario_storage.create_scenario(scenario_id, datetime.now().microsecond,
                                            datetime.now().microsecond, 'AGENT')
    entity_text = utterance_processor(utterance, scen, nel, 'Jaap')

    # link_entities expects a list with all entities in one
    # but the new ner gives a list with a single entity per utterance(?)
    entities = nel.link_entities(entity_text)

    return entities


if __name__ == "__main__":
    nlp = spacy.load('en_core_web_sm')
    with TemporaryDirectory(prefix="brain-log") as log_path:
        res = main(Path(log_path), utt)
        print(res)