Add capsule functions, integration with emissor #1

Open · wants to merge 3 commits into base: main

Binary file added __pycache__/brain_processing.cpython-39.pyc
Binary file not shown.
Binary file modified __pycache__/named_entity_linking.cpython-39.pyc
Binary file not shown.
Binary file added __pycache__/simple_ner.cpython-39.pyc
Binary file not shown.
89 changes: 89 additions & 0 deletions brain_processing.py
@@ -0,0 +1,89 @@
from emissor.representation.scenario import Scenario, TextSignal
from cltl.combot.backend.api.discrete import UtteranceType
from datetime import date
from random import getrandbits
import requests


def seq_to_text(seq):
    # Join a signal's character sequence into a plain string
    return ''.join(seq)


def scenario_utterance_to_capsule(scenario: Scenario, signal: TextSignal, author: str, perspective: dict,
                                  subj: str, pred: str, obj: str):
    # Build a brain capsule for a TextSignal from a bare subject/predicate/object triple
    place_id = getrandbits(8)
    location = requests.get("https://ipinfo.io").json()

    capsule = {"chat": scenario.id,
               "turn": signal.id,
               "author": author,
               "utterance": seq_to_text(signal.seq),
               "utterance_type": UtteranceType.STATEMENT,
               "position": "0-" + str(len(signal.seq)),  # TODO generate the true offset range
               "subject": {"label": subj, "type": "person"},
               "predicate": {"type": pred},
               "object": {"label": obj, "type": "object"},
               "perspective": perspective,
               "context_id": scenario.scenario.context,
               "date": date.today(),
               "place": location['city'],
               "place_id": place_id,
               "country": location['country'],
               "region": location['region'],
               "city": location['city'],
               # placeholder context elements; see generate_obl_object_json below
               "objects": [{'type': 'chair', 'confidence': 0.59, 'id': 1},
                           {'type': 'table', 'confidence': 0.73, 'id': 1},
                           {'type': 'pillbox', 'confidence': 0.32, 'id': 1}],
               "people": [{'name': 'Carl', 'confidence': 0.98, 'id': 1}]
               }
    return capsule


# Create a capsule for a TextSignal from a triple and a perspective dict
def scenario_utterance_and_triple_to_capsule(scenario: Scenario,
                                             place_id: str,
                                             location: dict,
                                             signal: TextSignal,
                                             author: str,
                                             utterance_type: UtteranceType,
                                             perspective: dict,
                                             triple: dict):
    value = generate_obl_object_json(author)
    capsule = {"chat": scenario.id,
               "turn": signal.id,
               "author": author,
               "utterance": seq_to_text(signal.seq),
               "utterance_type": utterance_type,
               "position": "0-" + str(len(signal.seq)),  # TODO generate the true offset range
               "subject": {'label': triple['subject']['label'], 'type': triple['subject']['type']},
               "predicate": {'type': triple['predicate']['label']},
               "object": {'label': triple['object']['label'], 'type': triple['object']['type']},
               "perspective": perspective,
               "context_id": scenario.scenario.context,
               ##### standard elements
               "date": date.today(),
               "place": location['city'],
               "place_id": place_id,
               "country": location['country'],
               "region": location['region'],
               "city": location['city'],
               "objects": value['objects'],
               "people": value['people']
               }

    return capsule


# Generate bogus object/people elements for capsules; without these, the brain update function fails.
def generate_obl_object_json(human: str):
    json_string = {
        "objects": [{'type': 'chair', 'confidence': 0.59, 'id': 1},
                    {'type': 'table', 'confidence': 0.73, 'id': 1},
                    {'type': 'pillbox', 'confidence': 0.32, 'id': 1}],
        "people": [{'name': human, 'confidence': 0.98, 'id': 1}]
    }
    return json_string
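
For reference, a minimal sketch of how these helpers compose. All values here are hypothetical; `scenario` and `signal` are assumed to come from an emissor scenario as set up in simple_ner.py further down:

    triple = {"subject": {"label": "jaap", "type": "person"},
              "predicate": {"label": "work-as"},
              "object": {"label": "PhD", "type": "occupation"}}
    location = {"city": "Amsterdam", "region": "North Holland", "country": "NL"}  # stand-in for the ipinfo.io lookup
    perspective = {"certainty": 1, "polarity": 1, "sentiment": 0}
    capsule = scenario_utterance_and_triple_to_capsule(scenario, str(getrandbits(8)), location, signal,
                                                       "jaap", UtteranceType.STATEMENT, perspective, triple)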
33 changes: 33 additions & 0 deletions data/test_scenario/test_scenario.json
@@ -0,0 +1,33 @@
{
  "context": "AGENT",
  "id": "test_scenario",
  "signals": {
    "image": "./image.json",
    "text": "./text.json"
  },
  "@context": {
    "Scenario": "https://emissor.org#Scenario",
    "id": "@id",
    "context": "https://emissor.org#context",
    "signals": "https://emissor.org#signals",
    "ruler": "https://emissor.org#ruler"
  },
  "@type": "Scenario",
  "ruler": {
    "container_id": "test_scenario",
    "start": 662994,
    "end": 662999,
    "@context": {
      "TemporalRuler": "https://emissor.org#TemporalRuler",
      "id": "@id",
      "start": "https://emissor.org#start",
      "end": "https://emissor.org#end",
      "container_id": {
        "@id": "https://emissor.org#container_id",
        "@type": "@id"
      }
    },
    "@type": "TemporalRuler",
    "_py_type": "emissor.representation.container-TemporalRuler"
  }
}
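
A quick way to sanity-check this scenario file without emissor, using plain json. Note that start/end hold the microsecond field produced in simple_ner.py below, not a full timestamp:

    import json

    with open('data/test_scenario/test_scenario.json') as f:
        scenario = json.load(f)
    print(scenario['ruler']['start'], scenario['ruler']['end'])  # 662994 662999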
11 changes: 11 additions & 0 deletions logs/2021-10-15-10-21/brain_log_2021-10-15-10-21-29.trig
@@ -0,0 +1,11 @@
@prefix leolaniTalk: <http://cltl.nl/leolani/talk/> .
@prefix leolaniWorld: <http://cltl.nl/leolani/world/> .
@prefix n2mu: <http://cltl.nl/leolani/n2mu/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

leolaniWorld:Instances {
    n2mu:jaap_1 rdfs:label "PhD",
        "hij",
        "jaap" .
}

11 changes: 11 additions & 0 deletions logs/2021-10-15-10-22/brain_log_2021-10-15-10-22-10.trig
@@ -0,0 +1,11 @@
@prefix leolaniTalk: <http://cltl.nl/leolani/talk/> .
@prefix leolaniWorld: <http://cltl.nl/leolani/world/> .
@prefix n2mu: <http://cltl.nl/leolani/n2mu/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

leolaniWorld:Instances {
    n2mu:jaap_1 rdfs:label "PhD",
        "hij",
        "jaap" .
}

10 changes: 10 additions & 0 deletions logs/2021-10-15-10-22/brain_log_2021-10-15-10-22-32.trig
@@ -0,0 +1,10 @@
@prefix leolaniTalk: <http://cltl.nl/leolani/talk/> .
@prefix leolaniWorld: <http://cltl.nl/leolani/world/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

leolaniWorld:Instances {
    leolaniWorld:jaap_1 rdfs:label "PhD",
        "hij",
        "jaap" .
}

10 changes: 10 additions & 0 deletions logs/2021-11-22-17-46/brain_log_2021-11-22-17-46-01.trig
@@ -0,0 +1,10 @@
@prefix leolaniTalk: <http://cltl.nl/leolani/talk/> .
@prefix leolaniWorld: <http://cltl.nl/leolani/world/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

leolaniWorld:Instances {
    leolaniWorld:jaap_1 rdfs:label "PhD",
        "hij",
        "jaap" .
}
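These four logs record the same labels written by add_labels_2 in named_entity_linking.py below. A minimal sketch for inspecting one with rdflib:

    from rdflib import ConjunctiveGraph

    g = ConjunctiveGraph()
    g.parse('logs/2021-11-22-17-46/brain_log_2021-11-22-17-46-01.trig', format='trig')
    for s, p, o in g:
        print(s, p, o)  # jaap_1 with rdfs:label "PhD", "hij", "jaap"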

60 changes: 45 additions & 15 deletions named_entity_linking.py
@@ -16,45 +16,75 @@

from cltl.brain.utils import base_cases
from cltl.brain.basic_brain import BasicBrain
from cltl.brain.long_term_memory import LongTermMemory
from cltl.brain.utils.helper_functions import read_query

from cltl.brain.infrastructure.rdf_builder import RdfBuilder
from rdflib import RDFS, Literal

-class NamedEntityLinker(BasicBrain):
class NamedEntityLinker(LongTermMemory):

    def __init__(self, address, log_dir, clear_all=False):

-        super(NamedEntityLinker, self).__init__(address, log_dir, clear_all, is_submodule=True)
        super(NamedEntityLinker, self).__init__(address, log_dir, clear_all)

    # Problem: how are URIs currently defined in the brain? Is ambiguity taken into account?
    # Otherwise URIs end up identical.
    # E.g. if labels are firstname-lastname, the query needs a regex that only looks at the part before the hyphen.

-    def link_entities(self, ne_text, baseline='popularity'):
-        if baseline == 'popularity':
-            uri = self._get_most_popular(ne_text)
-        elif baseline == 'recency':
-            uri = self._get_most_recent(ne_text)
-        return uri
    def link_entities(self, ne_list, baseline='popularity'):
        # Link each entity mention to a URI, returning (uri, mention) pairs
        uri_list = []
        for ne_text in ne_list:
            if baseline == 'popularity':
                uri = self._get_most_popular(ne_text)
                uri_list.append((uri, ne_text))
            elif baseline == 'recency':
                uri = self._get_most_recent(ne_text)
                uri_list.append((uri, ne_text))
        return uri_list

    def _get_most_popular(self, ne_text):
        # FIXME machine-specific absolute path; make the query path relative to the repository
        query = read_query('/Users/jaapkruijt/Documents/GitHub/NEL-coreference/popularity') % ne_text
        response = self._submit_query(query)
        pop_ordered = []
        for row in response:
            print(row)
            uri = row['ent']['value']
            occurrences = row['num_mentions']['value']
            pop_ordered.append((uri, occurrences))
        if pop_ordered:
            uri, popularity = pop_ordered[0]
        # else:
        #
        #     # TODO add functionality to add entity to graph
        else:
            # Unknown entity: mint a new URI in the world namespace
            uri_name = f'{ne_text}_1'
            uri = self._rdf_builder.create_resource_uri('LW', uri_name)
        return uri

    def _get_most_recent(self, ne_text):
        # TODO implement recency-based linking (see recency.rq)
        pass

    def add_labels(self, capsule, uri=None):
        # Attach each label from the capsule as an rdfs:label of the (given or constructed) entity URI
        ent_uri = self._rdf_builder.create_resource_uri('LW', capsule['subject']['id']) if not uri else uri
        for label in capsule['labels']:
            self.instance_graph.add((ent_uri, RDFS.label, Literal(label)))

    def add_labels_2(self, identity, labels, uri=None):
        # Variant of add_labels that takes the identity and label list directly
        ent_uri = self._rdf_builder.create_resource_uri('LW', identity) if not uri else uri
        for label in labels:
            self.instance_graph.add((ent_uri, RDFS.label, Literal(label)))

    def update_brain(self):
        # Serialize the current brain log and upload it to the triple store
        data = self._serialize(self._brain_log())
        return self._upload_to_brain(data)


if __name__ == "__main__":
    import pathlib

    log_path = pathlib.Path('./logs')
    nel = NamedEntityLinker(address="http://localhost:7200/repositories/sandbox",
                            log_dir=log_path)
    nel.add_labels_2('jaap_1', ['jaap', 'PhD', 'hij'])
    nel.update_brain()




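For reference, a minimal usage sketch of the linker (assuming a running GraphDB repository at the address above, populated with gaf:denotedIn mentions):

    nel = NamedEntityLinker(address="http://localhost:7200/repositories/sandbox",
                            log_dir=pathlib.Path('./logs'))
    pairs = nel.link_entities(['jaap', 'piek'], baseline='popularity')
    # each element is a (uri, mention_text) tuple; unknown labels get a fresh 'LW' URI
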
11 changes: 11 additions & 0 deletions recency.rq
@@ -0,0 +1,11 @@
PREFIX gaf: <http://groundedannotationframework.org/gaf#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?ent (COUNT(DISTINCT ?e) AS ?num_mentions) WHERE {
    ?ent rdfs:label "%s" .
    ?ent gaf:denotedIn ?e .
}
GROUP BY ?ent
ORDER BY DESC(COUNT(DISTINCT ?e))
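
The %s slot is filled with an entity label before the query is submitted; read_query in named_entity_linking.py resolves the query file, and a plain-Python equivalent for illustration would be:

    from pathlib import Path

    query = Path('recency.rq').read_text() % 'jaap'  # binds ?ent via rdfs:label "jaap"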
77 changes: 58 additions & 19 deletions simple_ner.py
@@ -2,44 +2,83 @@
from tempfile import TemporaryDirectory
from pathlib import Path
from named_entity_linking import NamedEntityLinker
from brain_processing import scenario_utterance_to_capsule

-utt = "Hi this is my friend #123 and his supervisor Piek"
# These modules are imported for the added let's-chat stuff
from emissor.persistence import ScenarioStorage
from emissor.representation.annotation import AnnotationType, Token, NER
from emissor.representation.container import Index
from emissor.representation.scenario import Modality, ImageSignal, TextSignal, Mention, Annotation, Scenario
import uuid
import time
from datetime import datetime

# These modules are not included in NEL-coreference at the moment, so this won't work outside this machine!
from src.chatbots.util import driver_util, capsule_util
from src.chatbots.dummies import text_to_triple as ttt

from rdflib.namespace import RDFS

utt = "Carl likes Bart"
# Idea: can the system search for NP's in the surroundings of a NE, and remember those

-# Testing linking separate from NER (by e.g. using hashes and a dict) (NamedEntityRecognizer)
-# Make linking independent from ner function (so not nested inside the ner function but use its output)
-# Updating the brain: a lot of it is already done automatically in the LTM update() function
-# If I do it here as well then it is done twice; what is the right way to approach this?
-# Using dummy triples that don't require an utterance?
# TODO Testing linking separate from NER (by e.g. using hashes and a dict) (NamedEntityRecognizer)

# Using dummy triples that don't require an utterance?

-def named_entity_recognition(utterance, nel: NamedEntityLinker):
def add_ner_annotation(signal: TextSignal):
    processor_name = "spaCy"
    utterance = ''.join(signal.seq)

    doc = nlp(utterance)

-    tokens = [token.text for token in doc]
    offsets, tokens = zip(*[(Index(signal.id, token.idx, token.idx + len(token)), Token.for_string(token.text))
                            for token in doc])

-    entity_label = [ent.label_ for ent in doc.ents]
-    entity_text = [ent.text.lower() for ent in doc.ents]
    ents = [NER.for_string(ent.label_) for ent in doc.ents]
    entity_list = [ent.text.lower() for ent in doc.ents]
    segments = [token.ruler for token in tokens if token.value in entity_list]

-    entities = []
-    for ent_text in entity_text:
-        name = nel.link_entities(ent_text)
-        entities.append(name)
    annotations = [Annotation(AnnotationType.TOKEN.name.lower(), token, processor_name, int(time.time()))
                   for token in tokens]
    ner_annotations = [Annotation(AnnotationType.NER.name.lower(), ent, processor_name, int(time.time()))
                       for ent in ents]

    signal.mentions.extend([Mention(str(uuid.uuid4()), [offset], [annotation])
                            for offset, annotation in zip(offsets, annotations)])
    signal.mentions.extend([Mention(str(uuid.uuid4()), [segment], [annotation])
                            for segment, annotation in zip(segments, ner_annotations)])
    return entity_list

-    return entities

def utterance_processor(utterance, scenario, brain, author):
    # NB: the brain and author arguments are currently unused in this function
    text_signal = driver_util.create_text_signal(scenario, utterance)

-def main(log_path):
    entity_text = add_ner_annotation(text_signal)
    scenario.append_signal(text_signal)

    return entity_text


def main(log_path, utterance):
    nel = NamedEntityLinker(address="http://localhost:7200/repositories/sandbox",
                            log_dir=log_path)
-    results = named_entity_recognition(utt, nel)
-    return results
    scenario_path = './data'
    scenario_id = 'test_scenario'
    scenario_storage = driver_util.create_scenario(scenario_path, scenario_id)
    # NB: datetime.now().microsecond is the sub-second field, not a full timestamp
    scen = scenario_storage.create_scenario(scenario_id, datetime.now().microsecond, datetime.now().microsecond, 'AGENT')
    entity_text = utterance_processor(utterance, scen, nel, 'Jaap')

    # link_entities expects a list with all the entities at once,
    # but the new NER gives a list with a single entity per utterance(?)
    # (see the sketch at the end of this diff)
    entities = nel.link_entities(entity_text)

    return entities


if __name__ == "__main__":
    nlp = spacy.load('en_core_web_sm')
    with TemporaryDirectory(prefix="brain-log") as log_path:
-        res = main(Path(log_path))
        res = main(Path(log_path), utt)
        print(res)
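
Regarding the open question in main above: a sketch of collecting mentions over several utterances and linking them in a single call (hypothetical usage, under the same assumptions as main):

    all_entities = []
    for utterance in ["Carl likes Bart", "Piek supervises Jaap"]:
        all_entities.extend(utterance_processor(utterance, scen, nel, 'Jaap'))
    linked = nel.link_entities(all_entities, baseline='popularity')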