From 47938853f1fb1278c6fc7be5e5d66e094c3c2108 Mon Sep 17 00:00:00 2001
From: kojosbk <97066032+kojosbk@users.noreply.github.com>
Date: Fri, 23 Sep 2022 15:19:39 +0000
Subject: [PATCH 1/3] location and organization functions added

---
 conf/base/catalog.yml                         | 13 +++++
 .../pipelines/text_comprehension/nodes.py     | 54 ++++++++++++++++++-
 .../pipelines/text_comprehension/pipeline.py  | 13 +++--
 src/requirements.txt                          |  3 +-
 4 files changed, 78 insertions(+), 5 deletions(-)

diff --git a/conf/base/catalog.yml b/conf/base/catalog.yml
index 28087b5..ea776a5 100644
--- a/conf/base/catalog.yml
+++ b/conf/base/catalog.yml
@@ -44,3 +44,16 @@ text_classification.dummy_data:
   # save_args:
   #   sep: '\t'
   #   overwrite: True
+
+sdg_text_data:
+  type: pandas.CSVDataSet
+  filepath: data/01_raw/train.csv
+  layer: raw
+  load_args:
+    # sep: '\t'
+
+locations_and_org_data:
+  type: pandas.CSVDataSet
+  filepath: data/02_intermediate/qes_and_ans_data.csv
+  layer: intermediate
+  
\ No newline at end of file
diff --git a/src/nlp_sdg/pipelines/text_comprehension/nodes.py b/src/nlp_sdg/pipelines/text_comprehension/nodes.py
index 8872f41..92b5b7c 100644
--- a/src/nlp_sdg/pipelines/text_comprehension/nodes.py
+++ b/src/nlp_sdg/pipelines/text_comprehension/nodes.py
@@ -2,6 +2,58 @@
 This is a boilerplate pipeline 'text_comprehension'
 generated using Kedro 0.18.2
 """
+import numpy as np
+import pandas as pd
+
+from allennlp.predictors import Predictor
+from allennlp_models.pretrained import load_predictor
+
 def dummy_node(data):
     print("Text Comprehension dummy node completed")
-    return 5
\ No newline at end of file
+    return 5
+
+nlp_models = [
+    { 'name' : 'ner-model',
+      'url': 'https://storage.googleapis.com/allennlp-public-models/ner-elmo.2021-02-12.tar.gz'
+    },
+]
+
+for nlp_model in nlp_models:
+    nlp_model['model'] = Predictor.from_path(nlp_model['url'])
+
+def locationOrganization(data):
+        data= data.head(5)
+        def entity_recognition (sentence):
+            location = []
+            for nlp_model in nlp_models:
+                results =  nlp_model['model'].predict(sentence=sentence)
+                for word, tag in zip(results["words"], results["tags"]):
+                    if tag != 'U-LOC':
+                        continue
+                    else:
+                        # print([word])#(f"{word}")
+                        location.append(word)
+                # print()
+                return location
+
+        def entity_recognition_pe(sentence):
+            organisation = []
+            for nlp_model in nlp_models:
+                results =  nlp_model['model'].predict(sentence=sentence)
+                for word, tag in zip(results["words"], results["tags"]):
+                    if tag != 'U-ORG':
+                        continue
+                    else:
+                        # print([word])#(f"{word}")
+                        organisation.append(word)
+                # print()
+                return organisation
+        result = []
+        for i in range(len(data["text"])):
+            result.append(list(set(entity_recognition(data["text"][i]))))
+        re1 = []
+        for i in range(len(data["text"])):
+            re1.append(list(set(entity_recognition_pe(data["text"][i]))))
+        data["location"]=result
+        data["organisation"]=re1
+        return data[["text","location","organisation"]]
\ No newline at end of file
diff --git a/src/nlp_sdg/pipelines/text_comprehension/pipeline.py b/src/nlp_sdg/pipelines/text_comprehension/pipeline.py
index aa88f6e..2e686f6 100644
--- a/src/nlp_sdg/pipelines/text_comprehension/pipeline.py
+++ b/src/nlp_sdg/pipelines/text_comprehension/pipeline.py
@@ -5,7 +5,7 @@
 
 from kedro.pipeline import Pipeline, node, pipeline
 
-from nlp_sdg.pipelines.text_comprehension.nodes import dummy_node
+from nlp_sdg.pipelines.text_comprehension.nodes import dummy_node,locationOrganization
 
 
 
@@ -19,11 +19,18 @@ def create_pipeline(**kwargs) -> Pipeline:
                 outputs="comprehension_output",
                 name="dummy_node",
             ),
+            node(
+                func=locationOrganization,
+                inputs="sdg_text_data",
+                outputs="locations_and_org_data",
+                name="locationOrganization_node",
+            ),            
         ]
     )
     text_comprehension = pipeline(
         pipe=pipeline_instance,
-        inputs="model_input_data",
-        namespace = "text_comprehension"
+        inputs=["model_input_data","sdg_text_data"],
+        namespace = "text_comprehension",
+        outputs="locations_and_org_data"
     )
     return text_comprehension
\ No newline at end of file
diff --git a/src/requirements.txt b/src/requirements.txt
index 297f237..dfe2508 100644
--- a/src/requirements.txt
+++ b/src/requirements.txt
@@ -24,4 +24,5 @@ joblib
 numexpr
 sklearn
 scipy
-
+allennlp
+allennlp-models

From b2cf696c8631b60596688f2ca6d4788cf361c49b Mon Sep 17 00:00:00 2001
From: lista <listaabutto@gmail.com>
Date: Sun, 16 Oct 2022 02:12:32 +0000
Subject: [PATCH 2/3] Refactored NER code

---
 conf/base/catalog.yml                         | 122 ++++++++--------
 .../pipelines/text_comprehension/nodes.py     | 138 ++++++++++--------
 .../pipelines/text_comprehension/pipeline.py  |  76 +++++-----
 3 files changed, 183 insertions(+), 153 deletions(-)

diff --git a/conf/base/catalog.yml b/conf/base/catalog.yml
index ea776a5..e41da48 100644
--- a/conf/base/catalog.yml
+++ b/conf/base/catalog.yml
@@ -1,59 +1,63 @@
-# Here you can define all your data sets by using simple YAML syntax.
-#
-# Documentation for this file format can be found in "The Data Catalog"
-# Link: https://kedro.readthedocs.io/en/stable/data/data_catalog.html
-#
-# We support interacting with a variety of data stores including local file systems, cloud, network and HDFS
-#
-# The Data Catalog supports being able to reference the same file using two different DataSet implementations
-# (transcoding), templating and a way to reuse arguments that are frequently repeated. See more here:
-# https://kedro.readthedocs.io/en/stable/data/data_catalog.html
-#
-sdg_data:
-  type: spark.SparkDataSet
-  filepath: data/01_raw/train.csv
-  file_format: csv
-  #credentials: dev_s3
-  load_args:
-    header: True
-    inferSchema: True
-  save_args:
-    sep: '\t'
-
-model_input_data:
-  type: MemoryDataSet
-
-twitter_analytics.dummy_data:
-  type: MemoryDataSet
-
-text_comprehension.dummy_data:
-  type: MemoryDataSet
-
-text_classification.dummy_data:
-  type: MemoryDataSet
-
-
-# dummy_data:
-#   type: spark.SparkDataSet
-  # filepath: data/01_raw/osdg-dummy_data.csv
-  # file_format: csv
-  # #credentials: dev_s3
-  # load_args:
-  #   header: True
-  #   inferSchema: True
-  # save_args:
-  #   sep: '\t'
-  #   overwrite: True
-
-sdg_text_data:
-  type: pandas.CSVDataSet
-  filepath: data/01_raw/train.csv
-  layer: raw
-  load_args:
-    # sep: '\t'
-
-locations_and_org_data:
-  type: pandas.CSVDataSet
-  filepath: data/02_intermediate/qes_and_ans_data.csv
-  layer: intermediate
-  
\ No newline at end of file
+# Here you can define all your data sets by using simple YAML syntax.
+#
+# Documentation for this file format can be found in "The Data Catalog"
+# Link: https://kedro.readthedocs.io/en/stable/data/data_catalog.html
+#
+# We support interacting with a variety of data stores including local file systems, cloud, network and HDFS
+#
+# The Data Catalog supports being able to reference the same file using two different DataSet implementations
+# (transcoding), templating and a way to reuse arguments that are frequently repeated. See more here:
+# https://kedro.readthedocs.io/en/stable/data/data_catalog.html
+#
+sdg_data:
+  type: spark.SparkDataSet
+  filepath: data/01_raw/train.csv
+  file_format: csv
+  #credentials: dev_s3
+  load_args:
+    header: True
+    inferSchema: True
+  save_args:
+    sep: '\t'
+
+model_input_data:
+  type: MemoryDataSet
+
+twitter_analytics.dummy_data:
+  type: MemoryDataSet
+
+text_comprehension.dummy_data:
+  type: MemoryDataSet
+
+text_classification.dummy_data:
+  type: MemoryDataSet
+
+
+# dummy_data:
+#   type: spark.SparkDataSet
+  # filepath: data/01_raw/osdg-dummy_data.csv
+  # file_format: csv
+  # #credentials: dev_s3
+  # load_args:
+  #   header: True
+  #   inferSchema: True
+  # save_args:
+  #   sep: '\t'
+  #   overwrite: True
+
+sdg_text_data:
+  type: pandas.CSVDataSet
+  filepath: data/01_raw/train.csv
+  layer: raw
+  load_args:
+    # sep: '\t'
+
+organization_data:
+  type: pandas.CSVDataSet
+  filepath: data/02_intermediate/organization_data.csv
+  layer: intermediate
+  
+organization_data:
+  type: pandas.CSVDataSet
+  filepath: data/02_intermediate/organization_data.csv
+  layer: intermediate
\ No newline at end of file
diff --git a/src/nlp_sdg/pipelines/text_comprehension/nodes.py b/src/nlp_sdg/pipelines/text_comprehension/nodes.py
index 92b5b7c..66cebfa 100644
--- a/src/nlp_sdg/pipelines/text_comprehension/nodes.py
+++ b/src/nlp_sdg/pipelines/text_comprehension/nodes.py
@@ -1,59 +1,79 @@
-"""
-This is a boilerplate pipeline 'text_comprehension'
-generated using Kedro 0.18.2
-"""
-import numpy as np
-import pandas as pd
-
-from allennlp.predictors import Predictor
-from allennlp_models.pretrained import load_predictor
-
-def dummy_node(data):
-    print("Text Comprehension dummy node completed")
-    return 5
-
-nlp_models = [
-    { 'name' : 'ner-model',
-      'url': 'https://storage.googleapis.com/allennlp-public-models/ner-elmo.2021-02-12.tar.gz'
-    },
-]
-
-for nlp_model in nlp_models:
-    nlp_model['model'] = Predictor.from_path(nlp_model['url'])
-
-def locationOrganization(data):
-        data= data.head(5)
-        def entity_recognition (sentence):
-            location = []
-            for nlp_model in nlp_models:
-                results =  nlp_model['model'].predict(sentence=sentence)
-                for word, tag in zip(results["words"], results["tags"]):
-                    if tag != 'U-LOC':
-                        continue
-                    else:
-                        # print([word])#(f"{word}")
-                        location.append(word)
-                # print()
-                return location
-
-        def entity_recognition_pe(sentence):
-            organisation = []
-            for nlp_model in nlp_models:
-                results =  nlp_model['model'].predict(sentence=sentence)
-                for word, tag in zip(results["words"], results["tags"]):
-                    if tag != 'U-ORG':
-                        continue
-                    else:
-                        # print([word])#(f"{word}")
-                        organisation.append(word)
-                # print()
-                return organisation
-        result = []
-        for i in range(len(data["text"])):
-            result.append(list(set(entity_recognition(data["text"][i]))))
-        re1 = []
-        for i in range(len(data["text"])):
-            re1.append(list(set(entity_recognition_pe(data["text"][i]))))
-        data["location"]=result
-        data["organisation"]=re1
-        return data[["text","location","organisation"]]
\ No newline at end of file
+"""
+This is a boilerplate pipeline 'text_comprehension'
+generated using Kedro 0.18.2
+"""
+import numpy as np
+import pandas as pd
+
+from allennlp.data.tokenizers.sentence_splitter import SpacySentenceSplitter
+from allennlp.predictors.predictor import Predictor
+import allennlp_models.tagging
+from tqdm import tqdm
+
+def dummy_node(data):  # placeholder node: ignores `data`, prints a message, returns 5
+    print("Text Comprehension dummy node completed")
+    return 5
+
+nlp_model = Predictor.from_path('https://storage.googleapis.com/allennlp-public-models/ner-elmo.2021-02-12.tar.gz')  # NOTE: loaded once, at module import time
+
+def get_organization(data):
+    """Extract organisation names from ``data["text"]`` with the module's NER predictor.
+
+    Returns a Series named "organization": one entry per distinct text that has at
+    least one match (ordered by text), holding its unique names joined by ", ".
+    """
+    def get_org(tags, words):
+        # BILOU decoding: B-ORG..L-ORG tokens form one name, a U-ORG token is a name.
+        names, span = [], []
+        for tag, word in zip(tags, words):
+            if tag == "U-ORG":
+                names.append(word)
+            elif tag.endswith("-ORG"):
+                span.append(word)
+                if tag == "L-ORG":
+                    names.append(" ".join(span))
+                    span = []
+        return names
+
+    splitter = SpacySentenceSplitter()  # built once, not once per row
+    found = {}
+    for text in tqdm(data["text"]):
+        for sent in splitter.split_sentences(text):
+            res = nlp_model.predict(sentence=sent)
+            found.setdefault(text, set()).update(get_org(res["tags"], res["words"]))
+    # NOTE(review): texts without matches are dropped, so the result is not row-aligned with `data`.
+    # Plain containers also avoid pd.concat([]) raising on empty input / sentence-less text.
+    rows = sorted((text, ", ".join(sorted(names))) for text, names in found.items() if names)
+    return pd.DataFrame(rows, columns=["text", "organization"])["organization"]
+
+
+def get_location(data):
+    """Extract location names from ``data["text"]`` with the module's NER predictor.
+
+    Returns a Series named "location": one entry per distinct text that has at
+    least one match (ordered by text), holding its unique names joined by ", ".
+    """
+    # Named `get_loc` (not `get_location`) to avoid shadowing the enclosing function.
+    def get_loc(tags, words):
+        # BILOU decoding: B-LOC..L-LOC tokens form one name, a U-LOC token is a name.
+        names, span = [], []
+        for tag, word in zip(tags, words):
+            if tag == "U-LOC":
+                names.append(word)
+            elif tag.endswith("-LOC"):
+                span.append(word)
+                if tag == "L-LOC":
+                    names.append(" ".join(span))
+                    span = []
+        return names
+
+    splitter = SpacySentenceSplitter()  # built once, not once per row
+    found = {}
+    for text in tqdm(data["text"]):
+        for sent in splitter.split_sentences(text):
+            res = nlp_model.predict(sentence=sent)
+            found.setdefault(text, set()).update(get_loc(res["tags"], res["words"]))
+    # NOTE(review): texts without matches are dropped, so the result is not row-aligned with `data`.
+    # Plain containers also avoid pd.concat([]) raising on empty input / sentence-less text.
+    rows = sorted((text, ", ".join(sorted(names))) for text, names in found.items() if names)
+    return pd.DataFrame(rows, columns=["text", "location"])["location"]
\ No newline at end of file
diff --git a/src/nlp_sdg/pipelines/text_comprehension/pipeline.py b/src/nlp_sdg/pipelines/text_comprehension/pipeline.py
index 2e686f6..4294678 100644
--- a/src/nlp_sdg/pipelines/text_comprehension/pipeline.py
+++ b/src/nlp_sdg/pipelines/text_comprehension/pipeline.py
@@ -1,36 +1,42 @@
-"""
-This is a boilerplate pipeline 'text_comprehension'
-generated using Kedro 0.18.2
-"""
-
-from kedro.pipeline import Pipeline, node, pipeline
-
-from nlp_sdg.pipelines.text_comprehension.nodes import dummy_node,locationOrganization
-
-
-
-def create_pipeline(**kwargs) -> Pipeline:
-    pipeline_instance =  pipeline(
-        
-        [
-            node(
-                func=dummy_node,
-                inputs="model_input_data",
-                outputs="comprehension_output",
-                name="dummy_node",
-            ),
-            node(
-                func=locationOrganization,
-                inputs="sdg_text_data",
-                outputs="locations_and_org_data",
-                name="locationOrganization_node",
-            ),            
-        ]
-    )
-    text_comprehension = pipeline(
-        pipe=pipeline_instance,
-        inputs=["model_input_data","sdg_text_data"],
-        namespace = "text_comprehension",
-        outputs="locations_and_org_data"
-    )
+"""
+This is a boilerplate pipeline 'text_comprehension'
+generated using Kedro 0.18.2
+"""
+
+from kedro.pipeline import Pipeline, node, pipeline
+
+from nlp_sdg.pipelines.text_comprehension.nodes import dummy_node, get_organization, get_location
+
+
+
+def create_pipeline(**kwargs) -> Pipeline:
+    """Build the namespaced 'text_comprehension' pipeline: a dummy node plus two NER nodes."""
+    # The datasets named in `inputs`/`outputs` of the wrapping call are passed
+    # explicitly so they are mapped by those names (see Kedro namespace docs).
+    steps = [
+        node(
+            func=dummy_node,
+            inputs="model_input_data",
+            outputs="comprehension_output",
+            name="dummy_node",
+        ),
+        node(
+            func=get_organization,
+            inputs="sdg_text_data",
+            outputs="organization_data",
+            name="get_organization_node",
+        ),
+        node(
+            func=get_location,
+            inputs="sdg_text_data",
+            outputs="location_data",
+            name="get_location_node",
+        ),
+    ]
+    text_comprehension = pipeline(
+        pipe=pipeline(steps),
+        inputs=["model_input_data", "sdg_text_data"],
+        namespace="text_comprehension",
+        outputs=["organization_data", "location_data"],
+    )
     return text_comprehension
\ No newline at end of file

From 4868144a65577a89b6b08e6d41d6a7f16b4f40a9 Mon Sep 17 00:00:00 2001
From: lista <listaabutto@gmail.com>
Date: Sun, 16 Oct 2022 02:19:35 +0000
Subject: [PATCH 3/3] Fix duplicated organization_data catalog entry (add location_data)

---
 conf/base/catalog.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/conf/base/catalog.yml b/conf/base/catalog.yml
index e41da48..13aa8ee 100644
--- a/conf/base/catalog.yml
+++ b/conf/base/catalog.yml
@@ -57,7 +57,7 @@ organization_data:
   filepath: data/02_intermediate/organization_data.csv
   layer: intermediate
   
-organization_data:
+location_data:
   type: pandas.CSVDataSet
-  filepath: data/02_intermediate/organization_data.csv
+  filepath: data/02_intermediate/location_data.csv
   layer: intermediate
\ No newline at end of file