-
Notifications
You must be signed in to change notification settings - Fork 0
/
detector.py
184 lines (160 loc) · 7.46 KB
/
detector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# Imports
import spacy
import pandas as pd
from src.language_models.albert import AlbertNER
from src.extractors import *
from src.checker import Checker
from src.filter import Filter
from .endpoint_config import ASSETS_PATH, MODELS_PATH
from .logging_handler import init_logger, logger
from typing import List, Tuple
import logging
import string
import re
import os
# Role names for the kinds of people that can be associated with a movie.
ACTOR = "actor"
DIRECTOR = "director"
CHARACTER = "character"
# Backward-compatible alias: the constant was originally published under this
# misspelled name, so keep it available for any existing importers.
CHARACETR = CHARACTER
class Detector:
    """ Detector class to detect entities and movies in text """

    # (kwargs key, BIO label) pairs, listed in the order in which
    # `parse_entities` concatenates the parsed groups. Later groups are merged
    # last by `merge_entities` and therefore overwrite earlier labels on the
    # same token.
    _ENTITY_LABELS = (
        ("titles", "TITLE"),
        ("years", "YEAR"),
        ("rate_avg", "RATINGS_AVERAGE"),
        ("trailers", "TRAILER"),
        ("rate", "RATING"),
        ("genres", "GENRE"),
        ("directors", "DIRECTOR"),
        ("actors", "ACTOR"),
        ("characters", "CHARACTER"),
        ("songs", "SONG"),
        ("awards", "AWARD"),
    )

    def __init__(self):
        """ Init the Detector. Will set extractors, NER models, filter, checker and dataframe

        Loads the spaCy ``en_core_web_lg`` pipeline and the AlbertNER model
        stored under ``MODELS_PATH/conll03``, then reads ``movies.csv`` from
        ``ASSETS_PATH`` and keeps only the rows whose ``actors`` column is set.
        """
        # NER Models
        self.model = spacy.load("en_core_web_lg")
        self.ner = AlbertNER(os.path.join(MODELS_PATH, "conll03"))

        # Check data with movie database
        df_movies = pd.read_csv(os.path.join(ASSETS_PATH, "movies.csv"))
        df_movies = df_movies.loc[df_movies.actors.notna()]
        self.df_movies = df_movies

        # Extractors
        self.award_extractor = AwardsExtractor(df_movies)
        self.genre_extractor = GenreExtractor(df_movies)
        self.person_extractor = PersonExtractor(df_movies)
        self.rate_extractor = RateExtractor(df_movies)
        self.song_extractor = SongExtractor(df_movies)
        self.title_extractor = TitleExtractor(df_movies)
        self.trailer_extractor = TrailerExtractor(df_movies)
        self.year_extractor = YearExtractor(df_movies)

        # `extract` runs the extractors in exactly this order, each one
        # receiving the kwargs returned by the previous one.
        self.extractors = [
            self.award_extractor,
            self.genre_extractor,
            self.person_extractor,
            self.rate_extractor,
            self.song_extractor,
            self.title_extractor,
            self.trailer_extractor,
            self.year_extractor
        ]

        # Filter
        self.filter = Filter()

        # Checker
        self.checker = Checker(self.filter, df_movies)

    @staticmethod
    def _normalize_token(token: str) -> str:
        """ Strip surrounding whitespace and punctuation from a single token """
        return token.strip().strip(string.punctuation)

    def get_entities(self, **kwargs: dict) -> dict:
        """ Get Named Entities from text. Will take text from kwargs and will update them
        with entities_spacy and entities_albert, extracted from the NER models

        Keyword Arguments:
            :param text: Text to run both NER models on (read from kwargs['text'])
            :return: (dict) The received kwargs plus 'entities_spacy', a list of
                (entity text, entity label) tuples, and 'entities_albert', the
                value returned by the AlbertNER model
        """
        text = kwargs['text']
        doc = self.model(text)
        kwargs['entities_spacy'] = [(ent.text, ent.label_) for ent in doc.ents]
        kwargs['entities_albert'] = self.ner.extract(text)
        return kwargs

    def parse_entity(self, entity_text: str, label: str) -> List[Tuple[str, str]]:
        """ Parse an entity to BIO format

        Keyword Arguments:
            :param entity_text: Entity to parse
            :param label: Label to add to the entity
            :return: (List[Tuple[str, str]]) One (word, tag) tuple per
                space-separated word; the first word is tagged ``B-<label>``
                and every following word ``I-<label>``
        """
        # Split on single spaces (not on arbitrary whitespace) so the words
        # line up with the tokens that `extract` builds from the text.
        first, *rest = entity_text.split(" ")
        return [(first, f"B-{label}")] + [(w, f"I-{label}") for w in rest]

    def merge_entities(self, entities: List[Tuple[str, str]],
                       new_entities: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        """ Merge entities to get the whole text labeled

        Every occurrence of each new entity (compared word by word, ignoring
        surrounding whitespace and punctuation) is located in the original
        token sequence and the matching positions are overwritten with the
        entity's (word, tag) tuples. Entities are applied in the given order,
        so later entities overwrite earlier ones on shared tokens.

        Keyword Arguments:
            :param entities: Original entities to get original words from.
                This list is modified in place
            :param new_entities: Entities that have been labeled; each item is
                the list of (word, tag) tuples of one entity
            :return: The same ``entities`` list, updated with the new labels
        """
        # Snapshot of the normalized original words. Matching always uses this
        # snapshot so it is independent of punctuation in the input tokens and
        # of words already replaced by a previously merged entity.
        original_words = [self._normalize_token(ent_[0]) for ent_ in entities]
        total = len(original_words)

        for ent in new_entities:
            words = [self._normalize_token(x[0]) for x in ent]
            if not words:
                # Nothing to match for an empty entity
                continue
            span = len(words)

            # Only start positions where the whole entity fits are considered,
            # which avoids reading past the end of the text.
            idxs = []
            for start in range(total - span + 1):
                if original_words[start:start + span] == words:
                    idxs.extend(zip(ent, range(start, start + span)))

            for ent_, i in idxs:
                tag = ent_[1]
                if not tag.startswith("I"):
                    entities[i] = ent_
                    continue
                # An inside tag is only valid when the previous token already
                # carries the begin or inside tag of the same label.
                if i == 0:
                    continue
                previous_tag = entities[i - 1][1]
                if previous_tag == tag or previous_tag == tag.replace("I-", "B-"):
                    entities[i] = ent_
        return entities

    def parse_entities(self, **kwargs: dict) -> List[List[Tuple[str, str]]]:
        """ Parse Entities from the text. Will take all entities from kwargs and
        will return the text labeled in BIO format

        Keyword Arguments:
            :param kwargs: Must contain the keys 'titles', 'years', 'rate_avg',
                'trailers', 'rate', 'genres', 'directors', 'actors',
                'characters', 'songs' and 'awards', each one an iterable of
                strings
            :return: One BIO-labeled entity (list of (word, tag) tuples) per
                input string, grouped in the key order listed above
            :raises KeyError: If any of the expected keys is missing
        """
        new_entities = []
        for key, label in self._ENTITY_LABELS:
            new_entities.extend(
                self.parse_entity(value.strip(), label) for value in kwargs[key]
            )
        return new_entities

    def extract(self, text: str) -> Tuple[List[Tuple[str, str]], pd.DataFrame]:
        """ Extract entities from text and return the dataframe of those movies matched.

        Keyword Arguments:
            :param text: Text to extract entities from
            :return: Entities extracted and movies matched. The first item has
                one (word, tag) tuple per space-separated token of the text;
                the second item is the checker's dataframe when it holds
                exactly one movie and None otherwise
        """
        kwargs = {'text': text}
        # Every token starts as "O" (outside any entity)
        entities = [(self._normalize_token(w), "O") for w in text.split(" ")]

        kwargs = self.get_entities(**kwargs)
        for extractor in self.extractors:
            kwargs = extractor.run(**kwargs)
        kwargs, df = self.checker.run(**kwargs)

        # The checker's contract is not visible from here, so a missing
        # dataframe is treated the same as "no unique match".
        single_match = df is not None and len(df) == 1
        if single_match:
            # With a unique movie, refine the entities using that movie's row
            kwargs['titles'] = self.title_extractor.get_titles_from_df(text, df.original_title.values[0])
            kwargs['genres'] = self.genre_extractor.get_genres_from_df(text, df.genre.values[0])
            kwargs['actors'] = list(set(
                kwargs['actors'] + self.person_extractor.get_actors_from_df(text, df.actors.values[0])
            ))
            kwargs['directors'] = self.person_extractor.get_directors_from_df(text, df.director.values[0])

        new_entities = self.parse_entities(**kwargs)
        entities = self.merge_entities(entities, new_entities)
        return entities, (df if single_match else None)