diff --git a/conf/base/catalog.yml b/conf/base/catalog.yml
index 28087b5..13aa8ee 100644
--- a/conf/base/catalog.yml
+++ b/conf/base/catalog.yml
@@ -1,46 +1,65 @@
-# Here you can define all your data sets by using simple YAML syntax.
-#
-# Documentation for this file format can be found in "The Data Catalog"
-# Link: https://kedro.readthedocs.io/en/stable/data/data_catalog.html
-#
-# We support interacting with a variety of data stores including local file systems, cloud, network and HDFS
-#
-# The Data Catalog supports being able to reference the same file using two different DataSet implementations
-# (transcoding), templating and a way to reuse arguments that are frequently repeated. See more here:
-# https://kedro.readthedocs.io/en/stable/data/data_catalog.html
-#
-sdg_data:
-  type: spark.SparkDataSet
-  filepath: data/01_raw/train.csv
-  file_format: csv
-  #credentials: dev_s3
-  load_args:
-    header: True
-    inferSchema: True
-  save_args:
-    sep: '\t'
-
-model_input_data:
-  type: MemoryDataSet
-
-twitter_analytics.dummy_data:
-  type: MemoryDataSet
-
-text_comprehension.dummy_data:
-  type: MemoryDataSet
-
-text_classification.dummy_data:
-  type: MemoryDataSet
-
-
-# dummy_data:
-#   type: spark.SparkDataSet
-  # filepath: data/01_raw/osdg-dummy_data.csv
-  # file_format: csv
-  # #credentials: dev_s3
-  # load_args:
-  #   header: True
-  #   inferSchema: True
-  # save_args:
-  #   sep: '\t'
-  # overwrite: True
+# Here you can define all your data sets by using simple YAML syntax.
+#
+# Documentation for this file format can be found in "The Data Catalog"
+# Link: https://kedro.readthedocs.io/en/stable/data/data_catalog.html
+#
+# We support interacting with a variety of data stores including local file systems, cloud, network and HDFS
+#
+# The Data Catalog supports being able to reference the same file using two different DataSet implementations
+# (transcoding), templating and a way to reuse arguments that are frequently repeated. See more here:
+# https://kedro.readthedocs.io/en/stable/data/data_catalog.html
+#
+sdg_data:
+  type: spark.SparkDataSet
+  filepath: data/01_raw/train.csv
+  file_format: csv
+  #credentials: dev_s3
+  load_args:
+    header: True
+    inferSchema: True
+  save_args:
+    sep: '\t'
+
+model_input_data:
+  type: MemoryDataSet
+
+twitter_analytics.dummy_data:
+  type: MemoryDataSet
+
+text_comprehension.dummy_data:
+  type: MemoryDataSet
+
+text_classification.dummy_data:
+  type: MemoryDataSet
+
+
+# dummy_data:
+#   type: spark.SparkDataSet
+  # filepath: data/01_raw/osdg-dummy_data.csv
+  # file_format: csv
+  # #credentials: dev_s3
+  # load_args:
+  #   header: True
+  #   inferSchema: True
+  # save_args:
+  #   sep: '\t'
+  # overwrite: True
+
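+# Datasets below support the text_comprehension pipeline: sdg_text_data feeds
+# its NER nodes, which save their results as the intermediate CSVs underneath.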
+sdg_text_data:
+  type: pandas.CSVDataSet
+  filepath: data/01_raw/train.csv
+  layer: raw
+  # load_args:
+  #   sep: '\t'
+
+organization_data:
+  type: pandas.CSVDataSet
+  filepath: data/02_intermediate/organization_data.csv
+  layer: intermediate
+
+location_data:
+  type: pandas.CSVDataSet
+  filepath: data/02_intermediate/location_data.csv
+  layer: intermediate
\ No newline at end of file
diff --git a/src/nlp_sdg/pipelines/text_comprehension/nodes.py b/src/nlp_sdg/pipelines/text_comprehension/nodes.py
index 8872f41..66cebfa 100644
--- a/src/nlp_sdg/pipelines/text_comprehension/nodes.py
+++ b/src/nlp_sdg/pipelines/text_comprehension/nodes.py
@@ -1,7 +1,84 @@
-"""
-This is a boilerplate pipeline 'text_comprehension'
-generated using Kedro 0.18.2
-"""
-def dummy_node(data):
-    print("Text Comprehension dummy node completed")
-    return 5
\ No newline at end of file
+"""
+This is a boilerplate pipeline 'text_comprehension'
+generated using Kedro 0.18.2
+"""
+import numpy as np
+import pandas as pd
+
+from allennlp.data.tokenizers.sentence_splitter import SpacySentenceSplitter
+from allennlp.predictors.predictor import Predictor
+import allennlp_models.tagging  # noqa: F401 -- registers tagging models so Predictor.from_path can load them
+from tqdm import tqdm
+
+def dummy_node(data):
+    print("Text Comprehension dummy node completed")
+    return 5
+
+# Loads the pretrained AllenNLP ELMo NER model at import time (downloads it on first run).
+nlp_model = Predictor.from_path('https://storage.googleapis.com/allennlp-public-models/ner-elmo.2021-02-12.tar.gz')
+
+def get_organization(data):
+    def get_org(df_org):
+        df_ent = []
+        # BILOU tag scheme: a multi-token mention runs from a B-ORG tag to the
+        # matching L-ORG tag; single-token mentions are tagged U-ORG.
+        for _, row in pd.DataFrame({"beg": df_org.loc[lambda x: x["tags"] == "B-ORG"].index.values,
+                                    "end": df_org.loc[lambda x: x["tags"] == "L-ORG"].index.values + 1}).iterrows():
+            df_ent.append(df_org.iloc[row["beg"]:row["end"]]["words"].str.cat(sep=" "))
+        df_ent.extend(df_org.loc[lambda x: x["tags"] == "U-ORG"]["words"].to_list())
+        return df_ent
+
+
+    df_edges = []
+    for _, row in tqdm(list(data.iterrows())):
+        df_org = []
+        sents = SpacySentenceSplitter().split_sentences(row["text"])
+        for s in sents:
+            res = nlp_model.predict(
+                sentence=s
+            )
+            df_org.append(pd.DataFrame({"tags": res["tags"], "words": res["words"], "text": row["text"]})\
+                .loc[lambda x: x["tags"].str.contains("ORG")])
+
+        df_org = pd.concat(df_org).reset_index(drop=True)
+        df_ent = get_org(df_org)
+        df_edges.append(pd.DataFrame({"text": row["text"], "organization": df_ent}))
+
+    df_org = pd.concat(df_edges)
+    df_org = pd.DataFrame(df_org.groupby(df_org["text"])["organization"].apply(lambda x: ', '.join(np.unique(x.values.ravel()))).reset_index())
+
+    return df_org["organization"]
+
+
+def get_location(data):
+    def get_loc(df_loc):
+        df_ent = []
+        for _, row in pd.DataFrame({"beg": df_loc.loc[lambda x: x["tags"] == "B-LOC"].index.values,
+                                    "end": df_loc.loc[lambda x: x["tags"] == "L-LOC"].index.values + 1}).iterrows():
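+            # Stitch the words between each B-LOC ... L-LOC pair back into a
+            # single mention (assumes matching counts of B-LOC and L-LOC tags).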
+            df_ent.append(df_loc.iloc[row["beg"]:row["end"]]["words"].str.cat(sep=" "))
+        df_ent.extend(df_loc.loc[lambda x: x["tags"] == "U-LOC"]["words"].to_list())
+        return df_ent
+
+
+    df_edges = []
+    for _, row in tqdm(list(data.iterrows())):
+        df_loc = []
+        sents = SpacySentenceSplitter().split_sentences(row["text"])
+        for s in sents:
+            res = nlp_model.predict(
+                sentence=s
+            )
+            df_loc.append(pd.DataFrame({"tags": res["tags"], "words": res["words"], "text": row["text"]})\
+                .loc[lambda x: x["tags"].str.contains("LOC")])
+
+        df_loc = pd.concat(df_loc).reset_index(drop=True)
+        df_ent = get_loc(df_loc)
+        df_edges.append(pd.DataFrame({"text": row["text"], "location": df_ent}))
+
+
+    df_loc = pd.concat(df_edges)
+    df_loc = pd.DataFrame(df_loc.groupby(df_loc["text"])["location"].apply(lambda x: ', '.join(np.unique(x.values.ravel()))).reset_index())
+
+    return df_loc["location"]
\ No newline at end of file
diff --git a/src/nlp_sdg/pipelines/text_comprehension/pipeline.py b/src/nlp_sdg/pipelines/text_comprehension/pipeline.py
index aa88f6e..4294678 100644
--- a/src/nlp_sdg/pipelines/text_comprehension/pipeline.py
+++ b/src/nlp_sdg/pipelines/text_comprehension/pipeline.py
@@ -1,29 +1,43 @@
-"""
-This is a boilerplate pipeline 'text_comprehension'
-generated using Kedro 0.18.2
-"""
-
-from kedro.pipeline import Pipeline, node, pipeline
-
-from nlp_sdg.pipelines.text_comprehension.nodes import dummy_node
-
-
-
-def create_pipeline(**kwargs) -> Pipeline:
-    pipeline_instance = pipeline(
-
-        [
-            node(
-                func=dummy_node,
-                inputs="model_input_data",
-                outputs="comprehension_output",
-                name="dummy_node",
-            ),
-        ]
-    )
-    text_comprehension = pipeline(
-        pipe=pipeline_instance,
-        inputs="model_input_data",
-        namespace = "text_comprehension"
-    )
+"""
+This is a boilerplate pipeline 'text_comprehension'
+generated using Kedro 0.18.2
+"""
+
+from kedro.pipeline import Pipeline, node, pipeline
+
+from nlp_sdg.pipelines.text_comprehension.nodes import dummy_node, get_organization, get_location
+
+
+
+def create_pipeline(**kwargs) -> Pipeline:
+    pipeline_instance = pipeline(
+
+        [
+            node(
+                func=dummy_node,
+                inputs="model_input_data",
+                outputs="comprehension_output",
+                name="dummy_node",
+            ),
+            node(
+                func=get_organization,
+                inputs="sdg_text_data",
+                outputs="organization_data",
+                name="get_organization_node",
+            ),
+            node(
+                func=get_location,
+                inputs="sdg_text_data",
+                outputs="location_data",
+                name="get_location_node",
+            ),
+        ]
+    )
+    text_comprehension = pipeline(
+        pipe=pipeline_instance,
+        # datasets listed in inputs/outputs keep their global (un-namespaced) names
+        inputs=["model_input_data", "sdg_text_data"],
+        namespace="text_comprehension",
+        outputs=["organization_data", "location_data"]
+    )
     return text_comprehension
\ No newline at end of file
diff --git a/src/requirements.txt b/src/requirements.txt
index 297f237..dfe2508 100644
--- a/src/requirements.txt
+++ b/src/requirements.txt
@@ -24,4 +24,5 @@ joblib
 numexpr
 sklearn
 scipy
-
+allennlp
+allennlp-models