feat: support for English documents Kf/english pipeline (#65)
Adding English models where necessary, as well as CLI/config options to set the language to "da" or "en"
KasperFyhn authored Feb 23, 2024
1 parent b816359 commit fa4f28b
Showing 15 changed files with 293 additions and 104 deletions.
19 changes: 0 additions & 19 deletions config/default.toml

This file was deleted.

4 changes: 0 additions & 4 deletions config/template_csv.toml → config/eschatology.toml
@@ -1,11 +1,9 @@
[base]
output_root = "output"
language = "en"

[preprocessing]
enabled = true
doc_type = "csv"
metadata_fields = ["*"]

[preprocessing.extra]
id_column = "id"
@@ -14,8 +12,6 @@ text_column = "body"
[docprocessing]
enabled = true
batch_size = 5
continue_from_last = true
triplet_extraction_method = "multi2oie"

[corpusprocessing]
enabled = true
20 changes: 16 additions & 4 deletions config/template.toml
@@ -1,20 +1,32 @@
[base]
project_name = "PROJECT_NAME" # also CLI arg
output_root = "output"
language = "da/en"
language = "da/en" # also CLI arg

[preprocessing]
enabled = true
input_path = "PATH/TO/INPUT/*" # also CLI arg
n_docs = -1 # also CLI arg
doc_type = "text/csv/tweets/infomedia"
metadata_fields = ["*"]

[preprocessing.extra]
# specific extra arguments for your preprocessor, e.g. context length for tweets.
# leave empty unless you have very specific needs
# specific extra arguments for your preprocessor, e.g. context length for tweets
# or field specification for CSVs

[docprocessing]
enabled = true
batch_size = 25
continue_from_last = true
triplet_extraction_method = "multi2oie/prompting"

[corpusprocessing]
enabled = true
enabled = true
embedding_model = "PATH_OR_NAME" # leave out for default model choice by language
dimensions = 100 # leave out to skip dimensionality reduction
n_neighbors = 15 # used for dimensionality reduction

[corpusprocessing.thresholds] # leave out for automatic estimation
min_cluster_size = 3 # unused if auto_thresholds is true
min_samples = 3 # unused if auto_thresholds is true
min_topic_size = 5 # unused if auto_thresholds is true
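
The new language option in [base] is what drives the per-language model defaults added in this commit. As a minimal illustration (not part of the changeset), a script could read such a config like this; it assumes Python 3.11+ for the standard-library tomllib, and the config path is hypothetical:

import tomllib  # standard library in Python 3.11+; older versions can use the third-party tomli package

# hypothetical config file created from config/template.toml
with open("config/my-config.toml", "rb") as f:
    config = tomllib.load(f)

language = config["base"]["language"]  # "da" or "en"
corpus_settings = config["corpusprocessing"]
embedding_model = corpus_settings.get("embedding_model")  # None -> chosen by language
thresholds = corpus_settings.get("thresholds")  # None -> automatic estimation
print(language, embedding_model, thresholds)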
11 changes: 5 additions & 6 deletions docs/faq.rst
@@ -5,21 +5,20 @@ Frequently asked questions
How do I run the pipeline?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

If you have cloned this project from git, you can run the pipeline via :code:`run.py`
and, optionally, configurations from :code:`config`.
If you have installed the package, you can run the pipeline via :code:`conspiracies.run`.

.. code-block:: bash
python3 run.py my_project_name my_input_path
python3 -m conspiracies.run my_project_name my_input_path
For a specific pipeline configuration, create one using :code:`config/template.toml` and
For fine-grained control of pipeline behavior, create a configuration file based on the `config template <https://github.com/centre-for-humanities-computing/conspiracies/blob/main/config/template.toml>`__ and
pass it with the :code:`-c` flag.


.. code-block:: bash
python3 run.py my_project_name my_input_path -c config/my-config.toml
python3 -m conspiracies.run my_project_name my_input_path -c my-config.toml
Project name and input path can also be specified in the configuration instead, in which
@@ -29,7 +28,7 @@ case you can do
python3 run.py -c config/my-config.toml
If you have installed the package via :code:`pip` and want to integrate (parts of) the
If you have installed the package and want to integrate (parts of) the
pipeline into your own workflow, you can use individual components or integrate a
:code:`Pipeline` object in your script.

39 changes: 39 additions & 0 deletions src/conspiracies/common/modelchoice.py
@@ -0,0 +1,39 @@
import logging
from typing import Any


class ModelChoice:
"""Helper class for selecting an appropriate model based on language codes.
An error will be thrown on unsupported languages. To avoid that, set a 'fallback'
model if appropriate.
If choices are given as supplier functions, they will be called and then returned.
Usage:
>>> mc1 = ModelChoice(da="danish_model", fallback="fallback_model")
>>> mc1.get_model("da") # "danish_model"
>>> mc1.get_model("de") # "fallback_model"
>>> mc2 = ModelChoice(da="danish_model")
>>> mc2.get_model("de") # throws error
>>> mc3 = ModelChoice(da=lambda: "danish_model")
>>> mc3.get_model("da") # "danish_model"
"""

def __init__(self, **choices: Any):
self.models = choices

def get_model(self, language: str):
if language not in self.models:
error = f"Language '{language}' not supported!"
if "fallback" in self.models:
logging.warning(error + " Using fallback model.")
language = "fallback"
else:
raise ValueError(error)
model = self.models[language]
if callable(model): # if supplier function
model = model()
logging.debug("Using '%s' model: %s", language, model)
return model
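
As a usage sketch (illustration only, not code from the diff), the same helper can resolve both the spaCy pipeline used for cluster labeling and the sentence embedding model, with the model names taken from this commit; it assumes the named spaCy models are installed:

import spacy
from sentence_transformers import SentenceTransformer

from conspiracies.common.modelchoice import ModelChoice

language = "en"  # or "da"

# spaCy pipeline for cluster labeling, as chosen in umap_hdb.py below
spacy_model = ModelChoice(da="da_core_news_sm", en="en_core_web_sm").get_model(language)
nlp = spacy.load(spacy_model)

# embedding model with a multilingual fallback for unsupported languages
embedding_model = ModelChoice(
    da="vesteinn/DanskBERT",
    en="all-MiniLM-L6-v2",
    fallback="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
).get_model(language)
embedder = SentenceTransformer(embedding_model)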
74 changes: 50 additions & 24 deletions src/conspiracies/corpusprocessing/umap_hdb.py
@@ -1,5 +1,5 @@
import json
from typing import Tuple, List, Dict, Optional, Union
from typing import Tuple, List, Dict, Optional, Union, Set
import os
import spacy
from umap import UMAP
@@ -12,6 +12,8 @@
import random
import argparse

from conspiracies.common.modelchoice import ModelChoice


def read_txt(path: str):
with open(path, mode="r", encoding="utf8") as f:
@@ -39,7 +41,7 @@ def triplet_from_line(line: str) -> Union[Tuple[str, str, str], None]:

def filter_triplets_with_stopwords(
triplets: List[Tuple[str, str, str]],
stopwords: List[str],
stopwords: Set[str],
soft: bool = True,
) -> List[Tuple[str, str, str]]:
"""Filters triplets that contain a stopword.
@@ -69,6 +71,7 @@ def filter_triplets_with_stopwords(

def load_triplets(
file_path: str,
language: str = "danish",
soft_filtering: bool = True,
shuffle: bool = True,
) -> Tuple[list, list, list, list]:
@@ -84,7 +87,6 @@ def load_triplets(
objects: List of objects
filtered_triplets: List of filtered triplets
"""
triplets_list: List[Tuple[str, str, str]] = []
data = read_txt(file_path)
triplets_list = [
triplet_from_line(line)
@@ -93,7 +95,7 @@
]
filtered_triplets = filter_triplets_with_stopwords(
triplets_list,
get_stop_words("danish"),
set(get_stop_words(language)),
soft=soft_filtering,
)

@@ -290,8 +292,9 @@

def embed_and_cluster(
list_to_embed: List[str],
embedding_model: str = "vesteinn/DanskBERT",
n_dimensions: int = 40,
language: str,
embedding_model: str,
n_dimensions: int = None,
n_neighbors: int = 15,
min_cluster_size: int = 5,
min_samples: int = 3,
@@ -302,7 +305,10 @@
Args:
list_to_embed: List of strings to embed and cluster
n_dimensions: Number of dimensions to reduce the embedding space to
language: language for SpaCy pipeline for cluster labeling
embedding_model: model name or path, refer to
https://www.sbert.net/docs/pretrained_models.html
n_dimensions: Number of dimensions to reduce the embedding space to, None to skip
n_neighbors: Number of neighbors to use for UMAP
min_cluster_size: Minimum cluster size for HDBscan
min_samples: Minimum number of samples for HDBscan
@@ -316,12 +322,16 @@

embedding_model = SentenceTransformer(embedding_model)

# Embed and reduce embdding space
print("Embedding and reducing embedding space")
print("Embedding")
embeddings = embedding_model.encode(list_to_embed) # type: ignore
scaled_embeddings = StandardScaler().fit_transform(embeddings)
reducer = UMAP(n_components=n_dimensions, n_neighbors=n_neighbors)
reduced_embeddings = reducer.fit_transform(scaled_embeddings)

if n_dimensions is not None:
print("Reducing embedding space")
reducer = UMAP(n_components=n_dimensions, n_neighbors=n_neighbors)
reduced_embeddings = reducer.fit_transform(scaled_embeddings)
else:
reduced_embeddings = scaled_embeddings

# Cluster with HDBscan
print("Clustering")
@@ -335,7 +345,8 @@

# Label and prune clusters
print("Labeling clusters")
nlp = spacy.load("da_core_news_sm")
model = ModelChoice(da="da_core_news_sm", en="en_core_web_sm").get_model(language)
nlp = spacy.load(model)
labeled_clusters = label_clusters(
clusters,
nlp,
@@ -420,18 +431,28 @@ def create_nodes_and_edges(

def main(
path: str,
embedding_model: str,
language: str,
embedding_model: str = None,
dim=40,
n_neighbors=15,
min_cluster_size=5,
min_samples=3,
min_topic_size=20,
save: bool = False,
):
# figure out embedding model if not given explicitly
if embedding_model is None:
embedding_model = ModelChoice(
da="vesteinn/DanskBERT",
en="all-MiniLM-L6-v2",
fallback="sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
).get_model(language)

# Load triplets
print("Loading triplets")
subjects, predicates, objects, filtered_triplets = load_triplets(
path,
language,
soft_filtering=True,
shuffle=True,
)
@@ -443,12 +464,6 @@ def main(
f"_clust={min_cluster_size}_samp={min_samples}_nodes_edges.json",
) # type: ignore

model = (
"vesteinn/DanskBERT"
if embedding_model == "danskBERT"
else "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
)

print(
f"Dimensions: {dim}, neighbors: {n_neighbors}, min cluster size: "
f"{min_cluster_size}, samples: {min_samples}, min topic size: {min_topic_size}",
@@ -458,7 +473,8 @@
# For predicate, we wanna keep all clusters -> min_topic_size=1
predicate_clusters = embed_and_cluster(
list_to_embed=predicates,
embedding_model=model,
language=language,
embedding_model=embedding_model,
n_dimensions=dim,
n_neighbors=n_neighbors,
min_cluster_size=min_cluster_size,
@@ -472,7 +488,8 @@
subj_obj = subjects + objects
subj_obj_clusters = embed_and_cluster(
list_to_embed=subj_obj,
embedding_model=model,
language=language,
embedding_model=embedding_model,
n_dimensions=dim,
n_neighbors=n_neighbors,
min_cluster_size=min_cluster_size,
@@ -503,13 +520,21 @@ def main(
help="Event to cluster. Must include name of source folder (newspapers or "
"twitter) and event",
)
parser.add_argument(
"-lang",
"--language",
type=str,
default="paraphrase",
help="Choice of language for embedding model (if not specified) and stop "
"words filtering",
)
parser.add_argument(
"-emb",
"--embedding_model",
type=str,
default="paraphrase",
help="""Which embedding model to use, default is paraphrase.
The other option is danskBERT""",
default=None,
help="Which embedding model to use. Automatically determined via language if "
"not given.",
)
parser.add_argument(
"-dim",
@@ -552,6 +577,7 @@ def main(
main(
path,
embedding_model=args.embedding_model,
language=args.language,
dim=args.n_dimensions,
n_neighbors=args.n_neighbors,
min_cluster_size=args.min_cluster_size,
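For reference, a hedged example (not included in the commit) of calling the updated entry point programmatically instead of via the CLI; the triplet file path is hypothetical, and "en" is used both for stop-word filtering and for the language-based model defaults:

from conspiracies.corpusprocessing.umap_hdb import main

main(
    "output/my_project/triplets.txt",  # hypothetical path to a file of extracted triplets
    language="en",
    embedding_model=None,  # resolved from language via ModelChoice
    dim=40,
    n_neighbors=15,
    min_cluster_size=5,
    min_samples=3,
    min_topic_size=5,
    save=False,
)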
