location and organization functions added #28

Open
wants to merge 3 commits into base: development
109 changes: 63 additions & 46 deletions conf/base/catalog.yml
@@ -1,46 +1,63 @@
# Here you can define all your data sets by using simple YAML syntax.
#
# Documentation for this file format can be found in "The Data Catalog"
# Link: https://kedro.readthedocs.io/en/stable/data/data_catalog.html
#
# We support interacting with a variety of data stores including local file systems, cloud, network and HDFS
#
# The Data Catalog supports being able to reference the same file using two different DataSet implementations
# (transcoding), templating and a way to reuse arguments that are frequently repeated. See more here:
# https://kedro.readthedocs.io/en/stable/data/data_catalog.html
#
sdg_data:
  type: spark.SparkDataSet
  filepath: data/01_raw/train.csv
  file_format: csv
  #credentials: dev_s3
  load_args:
    header: True
    inferSchema: True
  save_args:
    sep: '\t'

model_input_data:
  type: MemoryDataSet

twitter_analytics.dummy_data:
  type: MemoryDataSet

text_comprehension.dummy_data:
  type: MemoryDataSet

text_classification.dummy_data:
  type: MemoryDataSet


# dummy_data:
#   type: spark.SparkDataSet
#   filepath: data/01_raw/osdg-dummy_data.csv
#   file_format: csv
#   #credentials: dev_s3
#   load_args:
#     header: True
#     inferSchema: True
#   save_args:
#     sep: '\t'
#     overwrite: True

sdg_text_data:
  type: pandas.CSVDataSet
  filepath: data/01_raw/train.csv
  layer: raw
  load_args:
    # sep: '\t'

organization_data:
  type: pandas.CSVDataSet
  filepath: data/02_intermediate/organization_data.csv
  layer: intermediate

location_data:
  type: pandas.CSVDataSet
  filepath: data/02_intermediate/location_data.csv
  layer: intermediate
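
The three new entries (sdg_text_data, organization_data, location_data) are plain pandas.CSVDataSet definitions, so they can be sanity-checked outside a full kedro run. Below is a minimal sketch, not part of the PR, assuming it is run from the project root with the project's dependencies installed (including pyspark, since the catalog also declares a spark.SparkDataSet):

import yaml
from kedro.io import DataCatalog

# Build a catalog straight from the YAML above and load the raw text data.
with open("conf/base/catalog.yml") as f:
    catalog = DataCatalog.from_config(yaml.safe_load(f))

sdg_text = catalog.load("sdg_text_data")  # pandas DataFrame read from data/01_raw/train.csv
print(sdg_text.head())

# After `kedro run`, the two node outputs are persisted to:
#   organization_data -> data/02_intermediate/organization_data.csv
#   location_data     -> data/02_intermediate/location_data.csv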
86 changes: 79 additions & 7 deletions src/nlp_sdg/pipelines/text_comprehension/nodes.py
@@ -1,7 +1,79 @@
"""
This is a boilerplate pipeline 'text_comprehension'
generated using Kedro 0.18.2
"""
def dummy_node(data):
print("Text Comprehension dummy node completed")
return 5
"""
This is a boilerplate pipeline 'text_comprehension'
generated using Kedro 0.18.2
"""
import numpy as np
import pandas as pd

from allennlp.data.tokenizers.sentence_splitter import SpacySentenceSplitter
from allennlp.predictors.predictor import Predictor
import allennlp_models.tagging
from tqdm import tqdm

def dummy_node(data):
print("Text Comprehension dummy node completed")
return 5

nlp_model = Predictor.from_path('https://storage.googleapis.com/allennlp-public-models/ner-elmo.2021-02-12.tar.gz')

def get_organization(data):
def get_org(df_org):
df_ent = []
for _, row in pd.DataFrame({"beg": df_org.loc[lambda x: x["tags"] == "B-ORG"].index.values,
"end": df_org.loc[lambda x: x["tags"] == "L-ORG"].index.values + 1}).iterrows():
df_ent.append(df_org.iloc[row["beg"]:row["end"]]["words"].str.cat(sep=" "))
df_ent.extend(df_org.loc[lambda x: x["tags"] == "U-ORG"]["words"].to_list())
return df_ent


df_edges = []
for _, row in tqdm(list(data.iterrows())):
df_org = []
sents = SpacySentenceSplitter().split_sentences(row["text"])
for i, s in list(enumerate(sents)):
res = nlp_model.predict(
sentence=s
)
df_org.append(pd.DataFrame({"tags": res["tags"], "words": res["words"], "text": row["text"]})\
.loc[lambda x: x["tags"].str.contains("ORG")])

df_org = pd.concat(df_org).reset_index(drop=True)
df_ent = get_org(df_org)
df_edges.append(pd.DataFrame({"text": row["text"], "organization": df_ent}))

df_org = pd.concat(df_edges)
df_org = pd.DataFrame(df_org.groupby(df_org["text"])["organization"].apply(lambda x: ', '.join(np.unique(x.values.ravel()))).reset_index())

return df_org["organization"]


def get_location(data):
def get_location(df_loc):
df_ent = []
for _, row in pd.DataFrame({"beg": df_loc.loc[lambda x: x["tags"] == "B-LOC"].index.values,
"end": df_loc.loc[lambda x: x["tags"] == "L-LOC"].index.values + 1}).iterrows():
df_ent.append(df_loc.iloc[row["beg"]:row["end"]]["words"].str.cat(sep=" "))
df_ent.extend(df_loc.loc[lambda x: x["tags"] == "U-LOC"]["words"].to_list())
return df_ent


df_edges = []
for _, row in tqdm(list(data.iterrows())):
df_loc = []
sents = SpacySentenceSplitter().split_sentences(row["text"])
for i, s in list(enumerate(sents)):
res = nlp_model.predict(
sentence=s
)
df_loc.append(pd.DataFrame({"tags": res["tags"], "words": res["words"], "text": row["text"]})\
.loc[lambda x: x["tags"].str.contains("LOC")])

df_loc = pd.concat(df_loc).reset_index(drop=True)
df_ent = get_location(df_loc)
df_edges.append(pd.DataFrame({"text": row["text"], "location": df_ent}))


df_loc = pd.concat(df_edges)
df_loc = pd.DataFrame(df_loc.groupby(df_loc["text"])["location"].apply(lambda x: ', '.join(np.unique(x.values.ravel()))).reset_index())

return df_loc["location"]
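
For a quick local smoke test of the two new nodes, something like the sketch below should work (not part of the PR; the sample texts are made up, and importing the module triggers a one-off download of the ELMo NER archive, so the first call is slow):

import pandas as pd

from nlp_sdg.pipelines.text_comprehension.nodes import get_organization, get_location

# Hypothetical two-row input; the only requirement the nodes rely on is a "text" column.
sample = pd.DataFrame({
    "text": [
        "The World Bank and UNICEF launched a joint sanitation programme.",
        "Researchers in Nairobi, Kenya partnered with the University of Oslo.",
    ]
})

orgs = get_organization(sample)  # Series of comma-joined unique ORG entities, one value per text
locs = get_location(sample)      # Series of comma-joined unique LOC entities, one value per text
print(orgs.tolist())
print(locs.tolist())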
69 changes: 41 additions & 28 deletions src/nlp_sdg/pipelines/text_comprehension/pipeline.py
@@ -1,29 +1,42 @@
"""
This is a boilerplate pipeline 'text_comprehension'
generated using Kedro 0.18.2
"""

from kedro.pipeline import Pipeline, node, pipeline

from nlp_sdg.pipelines.text_comprehension.nodes import dummy_node



def create_pipeline(**kwargs) -> Pipeline:
pipeline_instance = pipeline(

[
node(
func=dummy_node,
inputs="model_input_data",
outputs="comprehension_output",
name="dummy_node",
),
]
)
text_comprehension = pipeline(
pipe=pipeline_instance,
inputs="model_input_data",
namespace = "text_comprehension"
)
"""
This is a boilerplate pipeline 'text_comprehension'
generated using Kedro 0.18.2
"""

from kedro.pipeline import Pipeline, node, pipeline

from nlp_sdg.pipelines.text_comprehension.nodes import dummy_node, get_organization, get_location



def create_pipeline(**kwargs) -> Pipeline:
pipeline_instance = pipeline(

[
node(
func=dummy_node,
inputs="model_input_data",
outputs="comprehension_output",
name="dummy_node",
),
node(
func=get_organization,
inputs="sdg_text_data",
outputs="organization_data",
name="get_organization_node",
),
node(
func=get_location,
inputs="sdg_text_data",
outputs="location_data",
name="get_location_node",
),
]
)
text_comprehension = pipeline(
pipe=pipeline_instance,
inputs=["model_input_data","sdg_text_data"],
namespace = "text_comprehension",
outputs=["organization_data", "location_data"]
)
return text_comprehension
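
Because the modular pipeline is namespaced, the node names resolve to text_comprehension.get_organization_node and text_comprehension.get_location_node, while sdg_text_data, organization_data and location_data are listed in inputs/outputs and therefore keep their un-namespaced names, matching the catalog entries above. As a reference only, a minimal registration sketch follows, assuming the default Kedro 0.18.x pipeline_registry.py layout (the project's actual registry may differ):

from typing import Dict

from kedro.pipeline import Pipeline

from nlp_sdg.pipelines import text_comprehension as tc


def register_pipelines() -> Dict[str, Pipeline]:
    """Register the project's pipelines (sketch only)."""
    text_comprehension_pipeline = tc.create_pipeline()
    return {
        "text_comprehension": text_comprehension_pipeline,
        "__default__": text_comprehension_pipeline,
    }

The namespaced pipeline can then be run on its own with: kedro run --pipeline text_comprehension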
3 changes: 2 additions & 1 deletion src/requirements.txt
@@ -24,4 +24,5 @@ joblib
numexpr
sklearn
scipy

allennlp
allennlp-models