From 47938853f1fb1278c6fc7be5e5d66e094c3c2108 Mon Sep 17 00:00:00 2001 From: kojosbk <97066032+kojosbk@users.noreply.github.com> Date: Fri, 23 Sep 2022 15:19:39 +0000 Subject: [PATCH 1/3] location and organization functions added --- conf/base/catalog.yml | 13 +++++ .../pipelines/text_comprehension/nodes.py | 54 ++++++++++++++++++- .../pipelines/text_comprehension/pipeline.py | 13 +++-- src/requirements.txt | 3 +- 4 files changed, 78 insertions(+), 5 deletions(-) diff --git a/conf/base/catalog.yml b/conf/base/catalog.yml index 28087b5..ea776a5 100644 --- a/conf/base/catalog.yml +++ b/conf/base/catalog.yml @@ -44,3 +44,16 @@ text_classification.dummy_data: # save_args: # sep: '\t' # overwrite: True + +sdg_text_data: + type: pandas.CSVDataSet + filepath: data/01_raw/train.csv + layer: raw + load_args: + # sep: '\t' + +locations_and_org_data: + type: pandas.CSVDataSet + filepath: data/02_intermediate/qes_and_ans_data.csv + layer: intermediate + \ No newline at end of file diff --git a/src/nlp_sdg/pipelines/text_comprehension/nodes.py b/src/nlp_sdg/pipelines/text_comprehension/nodes.py index 8872f41..92b5b7c 100644 --- a/src/nlp_sdg/pipelines/text_comprehension/nodes.py +++ b/src/nlp_sdg/pipelines/text_comprehension/nodes.py @@ -2,6 +2,58 @@ This is a boilerplate pipeline 'text_comprehension' generated using Kedro 0.18.2 """ +import numpy as np +import pandas as pd + +from allennlp.predictors import Predictor +from allennlp_models.pretrained import load_predictor + def dummy_node(data): print("Text Comprehension dummy node completed") - return 5 \ No newline at end of file + return 5 + +nlp_models = [ + { 'name' : 'ner-model', + 'url': 'https://storage.googleapis.com/allennlp-public-models/ner-elmo.2021-02-12.tar.gz' + }, +] + +for nlp_model in nlp_models: + nlp_model['model'] = Predictor.from_path(nlp_model['url']) + +def locationOrganization(data): + data= data.head(5) + def entity_recognition (sentence): + location = [] + for nlp_model in nlp_models: + results = nlp_model['model'].predict(sentence=sentence) + for word, tag in zip(results["words"], results["tags"]): + if tag != 'U-LOC': + continue + else: + # print([word])#(f"{word}") + location.append(word) + # print() + return location + + def entity_recognition_pe(sentence): + organisation = [] + for nlp_model in nlp_models: + results = nlp_model['model'].predict(sentence=sentence) + for word, tag in zip(results["words"], results["tags"]): + if tag != 'U-ORG': + continue + else: + # print([word])#(f"{word}") + organisation.append(word) + # print() + return organisation + result = [] + for i in range(len(data["text"])): + result.append(list(set(entity_recognition(data["text"][i])))) + re1 = [] + for i in range(len(data["text"])): + re1.append(list(set(entity_recognition_pe(data["text"][i])))) + data["location"]=result + data["organisation"]=re1 + return data[["text","location","organisation"]] \ No newline at end of file diff --git a/src/nlp_sdg/pipelines/text_comprehension/pipeline.py b/src/nlp_sdg/pipelines/text_comprehension/pipeline.py index aa88f6e..2e686f6 100644 --- a/src/nlp_sdg/pipelines/text_comprehension/pipeline.py +++ b/src/nlp_sdg/pipelines/text_comprehension/pipeline.py @@ -5,7 +5,7 @@ from kedro.pipeline import Pipeline, node, pipeline -from nlp_sdg.pipelines.text_comprehension.nodes import dummy_node +from nlp_sdg.pipelines.text_comprehension.nodes import dummy_node,locationOrganization @@ -19,11 +19,18 @@ def create_pipeline(**kwargs) -> Pipeline: outputs="comprehension_output", name="dummy_node", ), + node( + func=locationOrganization, + inputs="sdg_text_data", + outputs="locations_and_org_data", + name="locationOrganization_node", + ), ] ) text_comprehension = pipeline( pipe=pipeline_instance, - inputs="model_input_data", - namespace = "text_comprehension" + inputs=["model_input_data","sdg_text_data"], + namespace = "text_comprehension", + outputs="locations_and_org_data" ) return text_comprehension \ No newline at end of file diff --git a/src/requirements.txt b/src/requirements.txt index 297f237..dfe2508 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -24,4 +24,5 @@ joblib numexpr sklearn scipy - +allennlp +allennlp-models From b2cf696c8631b60596688f2ca6d4788cf361c49b Mon Sep 17 00:00:00 2001 From: lista Date: Sun, 16 Oct 2022 02:12:32 +0000 Subject: [PATCH 2/3] Refactored NER code --- conf/base/catalog.yml | 122 ++++++++-------- .../pipelines/text_comprehension/nodes.py | 138 ++++++++++-------- .../pipelines/text_comprehension/pipeline.py | 76 +++++----- 3 files changed, 183 insertions(+), 153 deletions(-) diff --git a/conf/base/catalog.yml b/conf/base/catalog.yml index ea776a5..e41da48 100644 --- a/conf/base/catalog.yml +++ b/conf/base/catalog.yml @@ -1,59 +1,63 @@ -# Here you can define all your data sets by using simple YAML syntax. -# -# Documentation for this file format can be found in "The Data Catalog" -# Link: https://kedro.readthedocs.io/en/stable/data/data_catalog.html -# -# We support interacting with a variety of data stores including local file systems, cloud, network and HDFS -# -# The Data Catalog supports being able to reference the same file using two different DataSet implementations -# (transcoding), templating and a way to reuse arguments that are frequently repeated. See more here: -# https://kedro.readthedocs.io/en/stable/data/data_catalog.html -# -sdg_data: - type: spark.SparkDataSet - filepath: data/01_raw/train.csv - file_format: csv - #credentials: dev_s3 - load_args: - header: True - inferSchema: True - save_args: - sep: '\t' - -model_input_data: - type: MemoryDataSet - -twitter_analytics.dummy_data: - type: MemoryDataSet - -text_comprehension.dummy_data: - type: MemoryDataSet - -text_classification.dummy_data: - type: MemoryDataSet - - -# dummy_data: -# type: spark.SparkDataSet - # filepath: data/01_raw/osdg-dummy_data.csv - # file_format: csv - # #credentials: dev_s3 - # load_args: - # header: True - # inferSchema: True - # save_args: - # sep: '\t' - # overwrite: True - -sdg_text_data: - type: pandas.CSVDataSet - filepath: data/01_raw/train.csv - layer: raw - load_args: - # sep: '\t' - -locations_and_org_data: - type: pandas.CSVDataSet - filepath: data/02_intermediate/qes_and_ans_data.csv - layer: intermediate - \ No newline at end of file +# Here you can define all your data sets by using simple YAML syntax. +# +# Documentation for this file format can be found in "The Data Catalog" +# Link: https://kedro.readthedocs.io/en/stable/data/data_catalog.html +# +# We support interacting with a variety of data stores including local file systems, cloud, network and HDFS +# +# The Data Catalog supports being able to reference the same file using two different DataSet implementations +# (transcoding), templating and a way to reuse arguments that are frequently repeated. See more here: +# https://kedro.readthedocs.io/en/stable/data/data_catalog.html +# +sdg_data: + type: spark.SparkDataSet + filepath: data/01_raw/train.csv + file_format: csv + #credentials: dev_s3 + load_args: + header: True + inferSchema: True + save_args: + sep: '\t' + +model_input_data: + type: MemoryDataSet + +twitter_analytics.dummy_data: + type: MemoryDataSet + +text_comprehension.dummy_data: + type: MemoryDataSet + +text_classification.dummy_data: + type: MemoryDataSet + + +# dummy_data: +# type: spark.SparkDataSet + # filepath: data/01_raw/osdg-dummy_data.csv + # file_format: csv + # #credentials: dev_s3 + # load_args: + # header: True + # inferSchema: True + # save_args: + # sep: '\t' + # overwrite: True + +sdg_text_data: + type: pandas.CSVDataSet + filepath: data/01_raw/train.csv + layer: raw + load_args: + # sep: '\t' + +organization_data: + type: pandas.CSVDataSet + filepath: data/02_intermediate/organization_data.csv + layer: intermediate + +organization_data: + type: pandas.CSVDataSet + filepath: data/02_intermediate/organization_data.csv + layer: intermediate \ No newline at end of file diff --git a/src/nlp_sdg/pipelines/text_comprehension/nodes.py b/src/nlp_sdg/pipelines/text_comprehension/nodes.py index 92b5b7c..66cebfa 100644 --- a/src/nlp_sdg/pipelines/text_comprehension/nodes.py +++ b/src/nlp_sdg/pipelines/text_comprehension/nodes.py @@ -1,59 +1,79 @@ -""" -This is a boilerplate pipeline 'text_comprehension' -generated using Kedro 0.18.2 -""" -import numpy as np -import pandas as pd - -from allennlp.predictors import Predictor -from allennlp_models.pretrained import load_predictor - -def dummy_node(data): - print("Text Comprehension dummy node completed") - return 5 - -nlp_models = [ - { 'name' : 'ner-model', - 'url': 'https://storage.googleapis.com/allennlp-public-models/ner-elmo.2021-02-12.tar.gz' - }, -] - -for nlp_model in nlp_models: - nlp_model['model'] = Predictor.from_path(nlp_model['url']) - -def locationOrganization(data): - data= data.head(5) - def entity_recognition (sentence): - location = [] - for nlp_model in nlp_models: - results = nlp_model['model'].predict(sentence=sentence) - for word, tag in zip(results["words"], results["tags"]): - if tag != 'U-LOC': - continue - else: - # print([word])#(f"{word}") - location.append(word) - # print() - return location - - def entity_recognition_pe(sentence): - organisation = [] - for nlp_model in nlp_models: - results = nlp_model['model'].predict(sentence=sentence) - for word, tag in zip(results["words"], results["tags"]): - if tag != 'U-ORG': - continue - else: - # print([word])#(f"{word}") - organisation.append(word) - # print() - return organisation - result = [] - for i in range(len(data["text"])): - result.append(list(set(entity_recognition(data["text"][i])))) - re1 = [] - for i in range(len(data["text"])): - re1.append(list(set(entity_recognition_pe(data["text"][i])))) - data["location"]=result - data["organisation"]=re1 - return data[["text","location","organisation"]] \ No newline at end of file +""" +This is a boilerplate pipeline 'text_comprehension' +generated using Kedro 0.18.2 +""" +import numpy as np +import pandas as pd + +from allennlp.data.tokenizers.sentence_splitter import SpacySentenceSplitter +from allennlp.predictors.predictor import Predictor +import allennlp_models.tagging +from tqdm import tqdm + +def dummy_node(data): + print("Text Comprehension dummy node completed") + return 5 + +nlp_model = Predictor.from_path('https://storage.googleapis.com/allennlp-public-models/ner-elmo.2021-02-12.tar.gz') + +def get_organization(data): + def get_org(df_org): + df_ent = [] + for _, row in pd.DataFrame({"beg": df_org.loc[lambda x: x["tags"] == "B-ORG"].index.values, + "end": df_org.loc[lambda x: x["tags"] == "L-ORG"].index.values + 1}).iterrows(): + df_ent.append(df_org.iloc[row["beg"]:row["end"]]["words"].str.cat(sep=" ")) + df_ent.extend(df_org.loc[lambda x: x["tags"] == "U-ORG"]["words"].to_list()) + return df_ent + + + df_edges = [] + for _, row in tqdm(list(data.iterrows())): + df_org = [] + sents = SpacySentenceSplitter().split_sentences(row["text"]) + for i, s in list(enumerate(sents)): + res = nlp_model.predict( + sentence=s + ) + df_org.append(pd.DataFrame({"tags": res["tags"], "words": res["words"], "text": row["text"]})\ + .loc[lambda x: x["tags"].str.contains("ORG")]) + + df_org = pd.concat(df_org).reset_index(drop=True) + df_ent = get_org(df_org) + df_edges.append(pd.DataFrame({"text": row["text"], "organization": df_ent})) + + df_org = pd.concat(df_edges) + df_org = pd.DataFrame(df_org.groupby(df_org["text"])["organization"].apply(lambda x: ', '.join(np.unique(x.values.ravel()))).reset_index()) + + return df_org["organization"] + + +def get_location(data): + def get_location(df_loc): + df_ent = [] + for _, row in pd.DataFrame({"beg": df_loc.loc[lambda x: x["tags"] == "B-LOC"].index.values, + "end": df_loc.loc[lambda x: x["tags"] == "L-LOC"].index.values + 1}).iterrows(): + df_ent.append(df_loc.iloc[row["beg"]:row["end"]]["words"].str.cat(sep=" ")) + df_ent.extend(df_loc.loc[lambda x: x["tags"] == "U-LOC"]["words"].to_list()) + return df_ent + + + df_edges = [] + for _, row in tqdm(list(data.iterrows())): + df_loc = [] + sents = SpacySentenceSplitter().split_sentences(row["text"]) + for i, s in list(enumerate(sents)): + res = nlp_model.predict( + sentence=s + ) + df_loc.append(pd.DataFrame({"tags": res["tags"], "words": res["words"], "text": row["text"]})\ + .loc[lambda x: x["tags"].str.contains("LOC")]) + + df_loc = pd.concat(df_loc).reset_index(drop=True) + df_ent = get_location(df_loc) + df_edges.append(pd.DataFrame({"text": row["text"], "location": df_ent})) + + + df_loc = pd.concat(df_edges) + df_loc = pd.DataFrame(df_loc.groupby(df_loc["text"])["location"].apply(lambda x: ', '.join(np.unique(x.values.ravel()))).reset_index()) + + return df_loc["location"] \ No newline at end of file diff --git a/src/nlp_sdg/pipelines/text_comprehension/pipeline.py b/src/nlp_sdg/pipelines/text_comprehension/pipeline.py index 2e686f6..4294678 100644 --- a/src/nlp_sdg/pipelines/text_comprehension/pipeline.py +++ b/src/nlp_sdg/pipelines/text_comprehension/pipeline.py @@ -1,36 +1,42 @@ -""" -This is a boilerplate pipeline 'text_comprehension' -generated using Kedro 0.18.2 -""" - -from kedro.pipeline import Pipeline, node, pipeline - -from nlp_sdg.pipelines.text_comprehension.nodes import dummy_node,locationOrganization - - - -def create_pipeline(**kwargs) -> Pipeline: - pipeline_instance = pipeline( - - [ - node( - func=dummy_node, - inputs="model_input_data", - outputs="comprehension_output", - name="dummy_node", - ), - node( - func=locationOrganization, - inputs="sdg_text_data", - outputs="locations_and_org_data", - name="locationOrganization_node", - ), - ] - ) - text_comprehension = pipeline( - pipe=pipeline_instance, - inputs=["model_input_data","sdg_text_data"], - namespace = "text_comprehension", - outputs="locations_and_org_data" - ) +""" +This is a boilerplate pipeline 'text_comprehension' +generated using Kedro 0.18.2 +""" + +from kedro.pipeline import Pipeline, node, pipeline + +from nlp_sdg.pipelines.text_comprehension.nodes import dummy_node, get_organization, get_location + + + +def create_pipeline(**kwargs) -> Pipeline: + pipeline_instance = pipeline( + + [ + node( + func=dummy_node, + inputs="model_input_data", + outputs="comprehension_output", + name="dummy_node", + ), + node( + func=get_organization, + inputs="sdg_text_data", + outputs="organization_data", + name="get_organization_node", + ), + node( + func=get_location, + inputs="sdg_text_data", + outputs="location_data", + name="get_location_node", + ), + ] + ) + text_comprehension = pipeline( + pipe=pipeline_instance, + inputs=["model_input_data","sdg_text_data"], + namespace = "text_comprehension", + outputs=["organization_data", "location_data"] + ) return text_comprehension \ No newline at end of file From 4868144a65577a89b6b08e6d41d6a7f16b4f40a9 Mon Sep 17 00:00:00 2001 From: lista Date: Sun, 16 Oct 2022 02:19:35 +0000 Subject: [PATCH 3/3] Refactored NER code --- conf/base/catalog.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/base/catalog.yml b/conf/base/catalog.yml index e41da48..13aa8ee 100644 --- a/conf/base/catalog.yml +++ b/conf/base/catalog.yml @@ -57,7 +57,7 @@ organization_data: filepath: data/02_intermediate/organization_data.csv layer: intermediate -organization_data: +location_data: type: pandas.CSVDataSet - filepath: data/02_intermediate/organization_data.csv + filepath: data/02_intermediate/location_data.csv layer: intermediate \ No newline at end of file