feat: support for English documents Kf/english pipeline (#65)
Adding English models where necessary, as well as CLI/config options to set the language to "da" or "en"
KasperFyhn authored Feb 23, 2024
1 parent b816359 commit fa4f28b
Showing 15 changed files with 293 additions and 104 deletions.
19 changes: 0 additions & 19 deletions config/default.toml

This file was deleted.

4 changes: 0 additions & 4 deletions config/template_csv.toml → config/eschatology.toml
@@ -1,11 +1,9 @@
[base]
output_root = "output"
language = "en"

[preprocessing]
enabled = true
doc_type = "csv"
metadata_fields = ["*"]

[preprocessing.extra]
id_column = "id"
@@ -14,8 +12,6 @@ text_column = "body"
[docprocessing]
enabled = true
batch_size = 5
continue_from_last = true
triplet_extraction_method = "multi2oie"

[corpusprocessing]
enabled = true
20 changes: 16 additions & 4 deletions config/template.toml
@@ -1,20 +1,32 @@
[base]
project_name = "PROJECT_NAME" # also CLI arg
output_root = "output"
language = "da/en"
language = "da/en" # also CLI arg

[preprocessing]
enabled = true
input_path = "PATH/TO/INPUT/*" # also CLI arg
n_docs = -1 # also CLI arg
doc_type = "text/csv/tweets/infomedia"
metadata_fields = ["*"]

[preprocessing.extra]
# specific extra arguments for your preprocessor, e.g. context length for tweets.
# leave empty unless you have very specific needs
# specific extra arguments for your preprocessor, e.g. context length for tweets
# or field specification for CSVs

[docprocessing]
enabled = true
batch_size = 25
continue_from_last = true
triplet_extraction_method = "multi2oie/prompting"

[corpusprocessing]
enabled = true
enabled = true
embedding_model = "PATH_OR_NAME" # leave out for default model choice by language
dimensions = 100 # leave out to skip dimensionality reduction
n_neighbors = 15 # used for dimensionality reduction

[corpusprocessing.thresholds] # leave out for automatic estimation
min_cluster_size = 3 # unused if auto_thresholds is true
min_samples = 3 # unused if auto_thresholds is true
min_topic_size = 5 # unused if auto_thresholds is true
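
The new language option in [base] is what drives the per-language model defaults added in this commit. As a minimal illustration (not part of the changeset), a script could read such a config like this; it assumes Python 3.11+ for the standard-library tomllib, and the config path is hypothetical:

import tomllib  # standard library in Python 3.11+; older versions can use the third-party tomli package

# hypothetical config file created from config/template.toml
with open("config/my-config.toml", "rb") as f:
    config = tomllib.load(f)

language = config["base"]["language"]  # "da" or "en"
corpus_settings = config["corpusprocessing"]
embedding_model = corpus_settings.get("embedding_model")  # None -> chosen by language
thresholds = corpus_settings.get("thresholds")  # None -> automatic estimation
print(language, embedding_model, thresholds)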
11 changes: 5 additions & 6 deletions docs/faq.rst
@@ -5,21 +5,20 @@ Frequently asked questions
How do I run the pipeline?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

If you have cloned this project from git, you can run the pipeline via :code:`run.py`
and, optionally, configurations from :code:`config`.
If you have installed the package, you can run the pipeline via :code:`conspiracies.run`.

.. code-block:: bash
python3 run.py my_project_name my_input_path
python3 -m conspiracies.run my_project_name my_input_path
For a specific pipeline configuration, create one using :code:`config/template.toml` and
For fine-grained control of pipeline behavior, create a configuration file based on the `config template <https://github.com/centre-for-humanities-computing/conspiracies/blob/main/config/template.toml>`__ and
pass it with the :code:`-c` flag.


.. code-block:: bash
python3 run.py my_project_name my_input_path -c config/my-config.toml
python3 -m conspiracies.run my_project_name my_input_path -c my-config.toml
Project name and input path can also be specified in the configuration instead, in which
@@ -29,7 +28,7 @@ case you can do
python3 run.py -c config/my-config.toml
If you have installed the package via :code:`pip` and want to integrate (parts of) the
If you have installed the package and want to integrate (parts of) the
pipeline into your own workflow, you can use individual components or integrate a
:code:`Pipeline` object in your script.

39 changes: 39 additions & 0 deletions src/conspiracies/common/modelchoice.py
@@ -0,0 +1,39 @@
import logging
from typing import Any


class ModelChoice:
"""Helper class for selecting an appropriate model based on language codes.
An error will be thrown on unsupported languages. To avoid that, set a 'fallback'
model if appropriate.
If choices are given as supplier functions, they will be called and then returned.
Usage:
>>> mc1 = ModelChoice(da="danish_model", fallback="fallback_model")
>>> mc1.get_model("da") # "danish_model"
>>> mc1.get_model("de") # "fallback_model"
>>> mc2 = ModelChoice(da="danish_model")
>>> mc2.get_model("de") # throws error
>>> mc3 = ModelChoice(da=lambda: "danish_model")
>>> mc3.get_model("da") # "danish_model"
"""

def __init__(self, **choices: Any):
self.models = choices

def get_model(self, language: str):
if language not in self.models:
error = f"Language '{language}' not supported!"
if "fallback" in self.models:
logging.warning(error + " Using fallback model.")
language = "fallback"
else:
raise ValueError(error)
model = self.models[language]
if callable(model): # if supplier function
model = model()
logging.debug("Using '%s' model: %s", language, model)
return model
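
As a usage sketch (illustration only, not code from the diff), the same helper can resolve both the spaCy pipeline used for cluster labeling and the sentence embedding model, with the model names taken from this commit; it assumes the named spaCy models are installed:

import spacy
from sentence_transformers import SentenceTransformer

from conspiracies.common.modelchoice import ModelChoice

language = "en"  # or "da"

# spaCy pipeline for cluster labeling, as chosen in umap_hdb.py below
spacy_model = ModelChoice(da="da_core_news_sm", en="en_core_web_sm").get_model(language)
nlp = spacy.load(spacy_model)

# embedding model with a multilingual fallback for unsupported languages
embedding_model = ModelChoice(
    da="vesteinn/DanskBERT",
    en="all-MiniLM-L6-v2",
    fallback="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
).get_model(language)
embedder = SentenceTransformer(embedding_model)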
74 changes: 50 additions & 24 deletions src/conspiracies/corpusprocessing/umap_hdb.py
@@ -1,5 +1,5 @@
import json
from typing import Tuple, List, Dict, Optional, Union
from typing import Tuple, List, Dict, Optional, Union, Set
import os
import spacy
from umap import UMAP
@@ -12,6 +12,8 @@
import random
import argparse

from conspiracies.common.modelchoice import ModelChoice


def read_txt(path: str):
with open(path, mode="r", encoding="utf8") as f:
@@ -39,7 +41,7 @@ def triplet_from_line(line: str) -> Union[Tuple[str, str, str], None]:

def filter_triplets_with_stopwords(
triplets: List[Tuple[str, str, str]],
stopwords: List[str],
stopwords: Set[str],
soft: bool = True,
) -> List[Tuple[str, str, str]]:
"""Filters triplets that contain a stopword.
@@ -69,6 +71,7 @@ def filter_triplets_with_stopwords(

def load_triplets(
file_path: str,
language: str = "danish",
soft_filtering: bool = True,
shuffle: bool = True,
) -> Tuple[list, list, list, list]:
@@ -84,7 +87,6 @@ def load_triplets(
objects: List of objects
filtered_triplets: List of filtered triplets
"""
triplets_list: List[Tuple[str, str, str]] = []
data = read_txt(file_path)
triplets_list = [
triplet_from_line(line)
@@ -93,7 +95,7 @@
]
filtered_triplets = filter_triplets_with_stopwords(
triplets_list,
get_stop_words("danish"),
set(get_stop_words(language)),
soft=soft_filtering,
)

@@ -290,8 +292,9 @@

def embed_and_cluster(
list_to_embed: List[str],
embedding_model: str = "vesteinn/DanskBERT",
n_dimensions: int = 40,
language: str,
embedding_model: str,
n_dimensions: int = None,
n_neighbors: int = 15,
min_cluster_size: int = 5,
min_samples: int = 3,
@@ -302,7 +305,10 @@
Args:
list_to_embed: List of strings to embed and cluster
n_dimensions: Number of dimensions to reduce the embedding space to
language: language for SpaCy pipeline for cluster labeling
embedding_model: model name or path, refer to
https://www.sbert.net/docs/pretrained_models.html
n_dimensions: Number of dimensions to reduce the embedding space to, None to skip
n_neighbors: Number of neighbors to use for UMAP
min_cluster_size: Minimum cluster size for HDBscan
min_samples: Minimum number of samples for HDBscan
@@ -316,12 +322,16 @@

embedding_model = SentenceTransformer(embedding_model)

# Embed and reduce embdding space
print("Embedding and reducing embedding space")
print("Embedding")
embeddings = embedding_model.encode(list_to_embed) # type: ignore
scaled_embeddings = StandardScaler().fit_transform(embeddings)
reducer = UMAP(n_components=n_dimensions, n_neighbors=n_neighbors)
reduced_embeddings = reducer.fit_transform(scaled_embeddings)

if n_dimensions is not None:
print("Reducing embedding space")
reducer = UMAP(n_components=n_dimensions, n_neighbors=n_neighbors)
reduced_embeddings = reducer.fit_transform(scaled_embeddings)
else:
reduced_embeddings = scaled_embeddings

# Cluster with HDBscan
print("Clustering")
@@ -335,7 +345,8 @@

# Label and prune clusters
print("Labeling clusters")
nlp = spacy.load("da_core_news_sm")
model = ModelChoice(da="da_core_news_sm", en="en_core_web_sm").get_model(language)
nlp = spacy.load(model)
labeled_clusters = label_clusters(
clusters,
nlp,
@@ -420,18 +431,28 @@ def create_nodes_and_edges(

def main(
path: str,
embedding_model: str,
language: str,
embedding_model: str = None,
dim=40,
n_neighbors=15,
min_cluster_size=5,
min_samples=3,
min_topic_size=20,
save: bool = False,
):
# figure out embedding model if not given explicitly
if embedding_model is None:
embedding_model = ModelChoice(
da="vesteinn/DanskBERT",
en="all-MiniLM-L6-v2",
fallback="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
).get_model(language)

# Load triplets
print("Loading triplets")
subjects, predicates, objects, filtered_triplets = load_triplets(
path,
language,
soft_filtering=True,
shuffle=True,
)
@@ -443,12 +464,6 @@ def main(
f"_clust={min_cluster_size}_samp={min_samples}_nodes_edges.json",
) # type: ignore

model = (
"vesteinn/DanskBERT"
if embedding_model == "danskBERT"
else "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
)

print(
f"Dimensions: {dim}, neighbors: {n_neighbors}, min cluster size: "
f"{min_cluster_size}, samples: {min_samples}, min topic size: {min_topic_size}",
@@ -458,7 +473,8 @@
# For predicate, we wanna keep all clusters -> min_topic_size=1
predicate_clusters = embed_and_cluster(
list_to_embed=predicates,
embedding_model=model,
language=language,
embedding_model=embedding_model,
n_dimensions=dim,
n_neighbors=n_neighbors,
min_cluster_size=min_cluster_size,
@@ -472,7 +488,8 @@
subj_obj = subjects + objects
subj_obj_clusters = embed_and_cluster(
list_to_embed=subj_obj,
embedding_model=model,
language=language,
embedding_model=embedding_model,
n_dimensions=dim,
n_neighbors=n_neighbors,
min_cluster_size=min_cluster_size,
@@ -503,13 +520,21 @@ def main(
help="Event to cluster. Must include name of source folder (newspapers or "
"twitter) and event",
)
parser.add_argument(
"-lang",
"--language",
type=str,
default="paraphrase",
help="Choice of language for embedding model (if not specified) and stop "
"words filtering",
)
parser.add_argument(
"-emb",
"--embedding_model",
type=str,
default="paraphrase",
help="""Which embedding model to use, default is paraphrase.
The other option is danskBERT""",
default=None,
help="Which embedding model to use. Automatically determined via language if "
"not given.",
)
parser.add_argument(
"-dim",
@@ -552,6 +577,7 @@ def main(
main(
path,
embedding_model=args.embedding_model,
language=args.language,
dim=args.n_dimensions,
n_neighbors=args.n_neighbors,
min_cluster_size=args.min_cluster_size,
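For reference, a hedged example (not included in the commit) of calling the updated entry point programmatically instead of via the CLI; the triplet file path is hypothetical, and "en" is used both for stop-word filtering and for the language-based model defaults:

from conspiracies.corpusprocessing.umap_hdb import main

main(
    "output/my_project/triplets.txt",  # hypothetical path to a file of extracted triplets
    language="en",
    embedding_model=None,  # resolved from language via ModelChoice
    dim=40,
    n_neighbors=15,
    min_cluster_size=5,
    min_samples=3,
    min_topic_size=5,
    save=False,
)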
