location and organization functions added #28

Open
wants to merge 3 commits into base: development
109 changes: 63 additions & 46 deletions conf/base/catalog.yml
@@ -1,46 +1,63 @@
# Here you can define all your data sets by using simple YAML syntax.
#
# Documentation for this file format can be found in "The Data Catalog"
# Link: https://kedro.readthedocs.io/en/stable/data/data_catalog.html
#
# We support interacting with a variety of data stores including local file systems, cloud, network and HDFS
#
# The Data Catalog supports being able to reference the same file using two different DataSet implementations
# (transcoding), templating and a way to reuse arguments that are frequently repeated. See more here:
# https://kedro.readthedocs.io/en/stable/data/data_catalog.html
#
sdg_data:
  type: spark.SparkDataSet
  filepath: data/01_raw/train.csv
  file_format: csv
  #credentials: dev_s3
  load_args:
    header: True
    inferSchema: True
  save_args:
    sep: '\t'

model_input_data:
  type: MemoryDataSet

twitter_analytics.dummy_data:
  type: MemoryDataSet

text_comprehension.dummy_data:
  type: MemoryDataSet

text_classification.dummy_data:
  type: MemoryDataSet


# dummy_data:
#   type: spark.SparkDataSet
#   filepath: data/01_raw/osdg-dummy_data.csv
#   file_format: csv
#   #credentials: dev_s3
#   load_args:
#     header: True
#     inferSchema: True
#   save_args:
#     sep: '\t'
#     overwrite: True

sdg_text_data:
  type: pandas.CSVDataSet
  filepath: data/01_raw/train.csv
  layer: raw
  load_args:
    # sep: '\t'

organization_data:
  type: pandas.CSVDataSet
  filepath: data/02_intermediate/organization_data.csv
  layer: intermediate

location_data:
  type: pandas.CSVDataSet
  filepath: data/02_intermediate/location_data.csv
  layer: intermediate
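
The three new entries (sdg_text_data, organization_data, location_data) are plain pandas.CSVDataSet definitions, so they can be sanity-checked outside a full kedro run. Below is a minimal sketch, not part of the PR, assuming it is run from the project root with the project's dependencies installed (including pyspark, since the catalog also declares a spark.SparkDataSet):

import yaml
from kedro.io import DataCatalog

# Build a catalog straight from the YAML above and load the raw text data.
with open("conf/base/catalog.yml") as f:
    catalog = DataCatalog.from_config(yaml.safe_load(f))

sdg_text = catalog.load("sdg_text_data")  # pandas DataFrame read from data/01_raw/train.csv
print(sdg_text.head())

# After `kedro run`, the two node outputs are persisted to:
#   organization_data -> data/02_intermediate/organization_data.csv
#   location_data     -> data/02_intermediate/location_data.csv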
86 changes: 79 additions & 7 deletions src/nlp_sdg/pipelines/text_comprehension/nodes.py
@@ -1,7 +1,79 @@
"""
This is a boilerplate pipeline 'text_comprehension'
generated using Kedro 0.18.2
"""
def dummy_node(data):
print("Text Comprehension dummy node completed")
return 5
"""
This is a boilerplate pipeline 'text_comprehension'
generated using Kedro 0.18.2
"""
import numpy as np
import pandas as pd

from allennlp.data.tokenizers.sentence_splitter import SpacySentenceSplitter
from allennlp.predictors.predictor import Predictor
import allennlp_models.tagging
from tqdm import tqdm

def dummy_node(data):
print("Text Comprehension dummy node completed")
return 5

nlp_model = Predictor.from_path('https://storage.googleapis.com/allennlp-public-models/ner-elmo.2021-02-12.tar.gz')

def get_organization(data):
def get_org(df_org):
df_ent = []
for _, row in pd.DataFrame({"beg": df_org.loc[lambda x: x["tags"] == "B-ORG"].index.values,
"end": df_org.loc[lambda x: x["tags"] == "L-ORG"].index.values + 1}).iterrows():
df_ent.append(df_org.iloc[row["beg"]:row["end"]]["words"].str.cat(sep=" "))
df_ent.extend(df_org.loc[lambda x: x["tags"] == "U-ORG"]["words"].to_list())
return df_ent


df_edges = []
for _, row in tqdm(list(data.iterrows())):
df_org = []
sents = SpacySentenceSplitter().split_sentences(row["text"])
for i, s in list(enumerate(sents)):
res = nlp_model.predict(
sentence=s
)
df_org.append(pd.DataFrame({"tags": res["tags"], "words": res["words"], "text": row["text"]})\
.loc[lambda x: x["tags"].str.contains("ORG")])

df_org = pd.concat(df_org).reset_index(drop=True)
df_ent = get_org(df_org)
df_edges.append(pd.DataFrame({"text": row["text"], "organization": df_ent}))

df_org = pd.concat(df_edges)
df_org = pd.DataFrame(df_org.groupby(df_org["text"])["organization"].apply(lambda x: ', '.join(np.unique(x.values.ravel()))).reset_index())

return df_org["organization"]


def get_location(data):
def get_location(df_loc):
df_ent = []
for _, row in pd.DataFrame({"beg": df_loc.loc[lambda x: x["tags"] == "B-LOC"].index.values,
"end": df_loc.loc[lambda x: x["tags"] == "L-LOC"].index.values + 1}).iterrows():
df_ent.append(df_loc.iloc[row["beg"]:row["end"]]["words"].str.cat(sep=" "))
df_ent.extend(df_loc.loc[lambda x: x["tags"] == "U-LOC"]["words"].to_list())
return df_ent


df_edges = []
for _, row in tqdm(list(data.iterrows())):
df_loc = []
sents = SpacySentenceSplitter().split_sentences(row["text"])
for i, s in list(enumerate(sents)):
res = nlp_model.predict(
sentence=s
)
df_loc.append(pd.DataFrame({"tags": res["tags"], "words": res["words"], "text": row["text"]})\
.loc[lambda x: x["tags"].str.contains("LOC")])

df_loc = pd.concat(df_loc).reset_index(drop=True)
df_ent = get_location(df_loc)
df_edges.append(pd.DataFrame({"text": row["text"], "location": df_ent}))


df_loc = pd.concat(df_edges)
df_loc = pd.DataFrame(df_loc.groupby(df_loc["text"])["location"].apply(lambda x: ', '.join(np.unique(x.values.ravel()))).reset_index())

return df_loc["location"]
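
For a quick local smoke test of the two new nodes, something like the sketch below should work (not part of the PR; the sample texts are made up, and importing the module triggers a one-off download of the ELMo NER archive, so the first call is slow):

import pandas as pd

from nlp_sdg.pipelines.text_comprehension.nodes import get_organization, get_location

# Hypothetical two-row input; the only requirement the nodes rely on is a "text" column.
sample = pd.DataFrame({
    "text": [
        "The World Bank and UNICEF launched a joint sanitation programme.",
        "Researchers in Nairobi, Kenya partnered with the University of Oslo.",
    ]
})

orgs = get_organization(sample)  # Series of comma-joined unique ORG entities, one value per text
locs = get_location(sample)      # Series of comma-joined unique LOC entities, one value per text
print(orgs.tolist())
print(locs.tolist())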
69 changes: 41 additions & 28 deletions src/nlp_sdg/pipelines/text_comprehension/pipeline.py
@@ -1,29 +1,42 @@
"""
This is a boilerplate pipeline 'text_comprehension'
generated using Kedro 0.18.2
"""

from kedro.pipeline import Pipeline, node, pipeline

from nlp_sdg.pipelines.text_comprehension.nodes import dummy_node



def create_pipeline(**kwargs) -> Pipeline:
pipeline_instance = pipeline(

[
node(
func=dummy_node,
inputs="model_input_data",
outputs="comprehension_output",
name="dummy_node",
),
]
)
text_comprehension = pipeline(
pipe=pipeline_instance,
inputs="model_input_data",
namespace = "text_comprehension"
)
"""
This is a boilerplate pipeline 'text_comprehension'
generated using Kedro 0.18.2
"""

from kedro.pipeline import Pipeline, node, pipeline

from nlp_sdg.pipelines.text_comprehension.nodes import dummy_node, get_organization, get_location



def create_pipeline(**kwargs) -> Pipeline:
pipeline_instance = pipeline(

[
node(
func=dummy_node,
inputs="model_input_data",
outputs="comprehension_output",
name="dummy_node",
),
node(
func=get_organization,
inputs="sdg_text_data",
outputs="organization_data",
name="get_organization_node",
),
node(
func=get_location,
inputs="sdg_text_data",
outputs="location_data",
name="get_location_node",
),
]
)
text_comprehension = pipeline(
pipe=pipeline_instance,
inputs=["model_input_data","sdg_text_data"],
namespace = "text_comprehension",
outputs=["organization_data", "location_data"]
)
return text_comprehension
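
Because the modular pipeline is namespaced, the node names resolve to text_comprehension.get_organization_node and text_comprehension.get_location_node, while sdg_text_data, organization_data and location_data are listed in inputs/outputs and therefore keep their un-namespaced names, matching the catalog entries above. As a reference only, a minimal registration sketch follows, assuming the default Kedro 0.18.x pipeline_registry.py layout (the project's actual registry may differ):

from typing import Dict

from kedro.pipeline import Pipeline

from nlp_sdg.pipelines import text_comprehension as tc


def register_pipelines() -> Dict[str, Pipeline]:
    """Register the project's pipelines (sketch only)."""
    text_comprehension_pipeline = tc.create_pipeline()
    return {
        "text_comprehension": text_comprehension_pipeline,
        "__default__": text_comprehension_pipeline,
    }

The namespaced pipeline can then be run on its own with: kedro run --pipeline text_comprehension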
3 changes: 2 additions & 1 deletion src/requirements.txt
@@ -24,4 +24,5 @@ joblib
numexpr
sklearn
scipy

allennlp
allennlp-models