diff --git a/convokit/expected_context_framework/col_normed_tfidf.py b/convokit/expected_context_framework/col_normed_tfidf.py
index 8299ab50..532179c0 100644
--- a/convokit/expected_context_framework/col_normed_tfidf.py
+++ b/convokit/expected_context_framework/col_normed_tfidf.py
@@ -48,8 +48,7 @@ def fit(self, corpus, y=None, selector=lambda x: True):
def transform(self, corpus, selector=lambda x: True):
"""
- Computes column-normalized tf-idf representations for utterances in a corpus, stored in the corpus as `<output_field>`. Also annotates each utterance with a metadata field,
- `<output_field>__n_feats`, indicating the number of terms in the vocabulary that utterance contains.
+ Computes column-normalized tf-idf representations for utterances in a corpus, stored in the corpus as `<output_field>`. Also annotates each utterance with a metadata field, `<output_field>__n_feats`, indicating the number of terms in the vocabulary that utterance contains.
:param corpus: Corpus
@@ -119,7 +118,7 @@ class ColNormedTfidf(TransformerMixin):
"""
Model that derives tf-idf reweighted representations of utterances,
- which are normalized by column. Can be used in ConvoKit through the `ColNormedTfidfWrapper` transformer; see documentation of that transformer for further details.
+ which are normalized by column. Can be used in ConvoKit through the `ColNormedTfidfTransformer` transformer; see documentation of that transformer for further details.
"""
def __init__(self, **kwargs):
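
For orientation, a minimal usage sketch of the renamed wrapper transformer. The `input_field`/`output_field` constructor arguments and the corpus name follow ConvoKit conventions but are assumptions not confirmed by this diff:

```python
# Sketch: fit column-normalized tf-idf vectors over a corpus, then annotate utterances.
# `input_field`/`output_field` are assumed constructor arguments (ConvoKit convention).
from convokit import Corpus, download
from convokit.expected_context_framework import ColNormedTfidfTransformer

corpus = Corpus(download('subreddit-Cornell'))  # any corpus with reply structure works
tfidf = ColNormedTfidfTransformer(input_field='text', output_field='col_normed_tfidf')
tfidf.fit(corpus)
tfidf.transform(corpus)

# Per the docstring above, each utterance now carries an `<output_field>__n_feats`
# metadata field counting its in-vocabulary terms.
utt = next(corpus.iter_utterances())
print(utt.meta['col_normed_tfidf__n_feats'])
```
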
diff --git a/convokit/expected_context_framework/dual_context_wrapper.py b/convokit/expected_context_framework/dual_context_wrapper.py
index 598997b7..dfb2b339 100644
--- a/convokit/expected_context_framework/dual_context_wrapper.py
+++ b/convokit/expected_context_framework/dual_context_wrapper.py
@@ -7,39 +7,31 @@
from convokit.transformer import Transformer
class DualContextWrapper(Transformer):
+ """
+ Transformer that derives and compares characterizations of terms and utterances with respect to two different choices of conversational context. Designed in particular to contrast replies and predecessors, though other choices of context are also possible.
+
+ This is a wrapper that encompasses two instances of `ExpectedContextModelTransformer`, stored at the `ec_models` attribute.
+ It computes two particular comparative term-level statistics, orientation and shift, stored as the `term_orientations` and `term_shifts` attributes.
+ It also computes these statistics at the utterance level in the transform step.
+
+ :param context_fields: list containing the names of the utterance-level attributes containing the IDs of the context-utterances used by each of the `ExpectedContextModelTransformer` instances.
+ :param output_prefixes: list containing the names of the attributes and vectors that each `ExpectedContextModelTransformer` instance will write to in the transform step.
+ :param vect_field: the name of the vectors to use as input vector representation for utterances, as stored in a corpus.
+ :param context_vect_field: the name of the vectors to use as input vector representations for context-utterances, as stored in a corpus. by default, the transformer will use the same vector representations as utterances, specified in `vect_field`. if you expect that utterances and context-utterances will differ in some way (e.g., they come from speakers in a conversation who play clearly delineated roles), then it's a good idea to use a different input representation.
+ :param wrapper_output_prefix: the metadata fields where the utterance-level orientation and shift statistics are stored. By default, these attributes are stored as `orn` and `shift` in the metadata; if `wrapper_output_prefix` is specified, then they are stored as `<wrapper_output_prefix>_orn` (orientation) and `<wrapper_output_prefix>_shift` (shift).
+ :param n_svd_dims: the dimensionality of the representations to derive (via LSA/SVD).
+ :param snip_first_dim: whether or not to remove the first dimension of the derived representations. by default this is set to `True`, since we've found that the first dimension tends to reflect term frequency, making the output less informative. Note that if `snip_first_dim=True` then in practice, we output `n_svd_dims-1`-dimensional representations.
+ :param n_clusters: the number of clusters to infer.
+ :param cluster_on: whether to cluster on utterance or term representations (corresponding to values `'utts'` or `'terms'`). By default, we infer clusters based on representations of the utterances from the training data, and then assign term and context-utterance representations to the resultant clusters. In some cases (e.g., if utterances are highly unstructured and lengthy) it might be better to cluster term representations first.
+ :param random_state: the random seed to use in the LSA step (which calls a randomized implementation of SVD)
+ :param cluster_random_state: the random seed to use to infer clusters.
+ """
def __init__(self, context_fields, output_prefixes,
vect_field, context_vect_field=None, wrapper_output_prefix='',
n_svd_dims=25, snip_first_dim=True, n_clusters=8, cluster_on='utts',
random_state=None, cluster_random_state=None):
- """
- Transformer that derives and compares characterizations of terms and utterances with respect to two different choices of conversational context. Designed in particular to contrast replies and predecessors, though other choices of context are also possible.
-
- This is a wrapper that encompasses two instances of `ExpectedContextModelTransformer`, stored at the `ec_models` attribute.
- It computes two particular comparative term-level statistics, orientation and shift, stored as the `term_orientations` and `term_shifts` attributes.
- It also computes these statistics at the utterance level in the transform step.
-
- :param context_fields: list containing the names of the utterance-level attributes containing the IDs of the context-utterances used by each of the `ExpectedContextModelTransformer` instances.
- :param output_prefixes: list containing the name of the attributes and vectors that each `ExpectedContextModelTransformer` instances will write to in the transform step.
- :param vect_field: the name of the vectors to use as input vector representation for utterances, as stored in a corpus.
- :param context_vect_field: the name of the vectors to use as input vector representations for context-utterances, as stored in a corpus. by default,
- the transformer will use the same vector representations as utterances, specified in `vect_field`. if you expect that utterances
- and context-utterances will differ in some way (e.g., they come from speakers in a conversation who play clearly delineated roles),
- then it's a good idea to use a different input representation.
- :param wrapper_output_prefix: the metadata fields where the utterance-level orientation and shift statistics are stored. By default, these attributes are stored as `orn` and `shift` in the metadata; if `wrapper_output_prefix` is specified, then they are stored as `<wrapper_output_prefix>_orn` (orientation) and `<wrapper_output_prefix>_shift` (shift).
- :param n_svd_dims: the dimensionality of the representations to derive (via LSA/SVD).
- :param snip_first_dim: whether or not to remove the first dimension of the derived representations. by default this is set to `True`, since we've
- found that the first dimension tends to reflect term frequency, making the output less informative. Note that if `snip_first_dim=True`
- then in practice, we output `n_svd_dims-1`-dimensional representations.
- :param n_clusters: the number of clusters to infer.
- :param cluster_on: whether to cluster on utterance or term representations, (corresponding to values `'utts'` or `'terms'`). By default, we infer clusters
- based on representations of the utterances from the training data, and then assign term and context-utterance representations to the resultant clusters.
- In some cases (e.g., if utterances are highly unstructured and lengthy) it might
- be better to cluster term representations first.
- :param random_state: the random seed to use in the LSA step (which calls a randomized implementation of SVD)
- :param cluster_random_state: the random seed to use to infer clusters.
- """
self.context_fields = context_fields
self.output_prefixes = output_prefixes
self.vect_field = vect_field
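
A construction sketch matching the parameter list above; the specific context fields and output prefixes are illustrative choices, not values mandated by the source:

```python
# Sketch: one ExpectedContextModelTransformer per context --
# immediate predecessors ('reply_to') and immediate replies ('next_id').
from convokit.expected_context_framework import DualContextWrapper

dual = DualContextWrapper(
    context_fields=['reply_to', 'next_id'],  # 'next_id' must be precomputed (illustrative)
    output_prefixes=['bk', 'fwd'],           # one prefix per constituent model (illustrative)
    vect_field='col_normed_tfidf',           # e.g., output of ColNormedTfidfTransformer
    n_svd_dims=25,
    n_clusters=8,
    random_state=1000,
    cluster_random_state=1000,
)
```
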
@@ -76,10 +68,8 @@ def fit(self, corpus, y=None, selector=lambda x: True, context_selector=lambda x
Fits a transformer over training data: fits the two `ExpectedContextModelTransformer` instances, and computes term-level orientation and shift.
:param corpus: Corpus containing training data
- :param selector: a boolean function of signature `filter(utterance)` that determines which utterances
- will be considered in the fit step. defaults to using all utterances.
- :param context_selector: a boolean function of signature `filter(utterance)` that determines which context-utterances
- will be considered in the fit step. defaults to using all utterances.
+ :param selector: a boolean function of signature `filter(utterance)` that determines which utterances will be considered in the fit step. defaults to using all utterances.
+ :param context_selector: a boolean function of signature `filter(utterance)` that determines which context-utterances will be considered in the fit step. defaults to using all utterances.
:return: None
"""
@@ -94,8 +84,7 @@ def transform(self, corpus, selector=lambda x: True):
Computes vector representations, ranges, and cluster assignments for utterances in a corpus, using the two `ExpectedContextModelTransformer` instances. Also computes utterance-level orientation and shift.
:param corpus: Corpus
- :param selector: a boolean function of signature `filter(utterance)` that determines which utterances
- to transform. defaults to all utterances.
+ :param selector: a boolean function of signature `filter(utterance)` that determines which utterances to transform. defaults to all utterances.
:return: the Corpus, with per-utterance attributes.
"""
self.ec_models[0].transform(corpus, selector=selector)
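
Fit and transform then follow the standard ConvoKit pattern; afterwards the utterance-level statistics land in the default `orn` and `shift` metadata fields described above. A sketch (the selector shown is illustrative):

```python
# Sketch: fit on utterances that contain input vectors, then transform and read
# the utterance-level statistics from the default metadata fields.
dual.fit(corpus, selector=lambda utt: utt.meta.get('col_normed_tfidf__n_feats', 0) > 0)
dual.transform(corpus)

utt = next(corpus.iter_utterances())
print(utt.meta['orn'], utt.meta['shift'])  # default names when wrapper_output_prefix=''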
@@ -169,9 +158,10 @@ def get_term_df(self):
def summarize(self, k=10, max_chars=1000, corpus=None):
"""
For each constituent ExpectedContextModelTransformer, prints inferred clusters and statistics about their sizes.
+
:param k: number of examples to print out.
- :max_chars: maximum number of characters per utterance/context-utterance to print. Can be toggled to control the size of the output.
- :corpus: optional, the corpus that the transformer was trained on. if set, will print example utterances and context-utterances as well as terms.
+ :param max_chars: maximum number of characters per utterance/context-utterance to print. Can be toggled to control the size of the output.
+ :param corpus: optional, the corpus that the transformer was trained on. if set, will print example utterances and context-utterances as well as terms.
:return: None
"""
diff --git a/convokit/expected_context_framework/expected_context_model.py b/convokit/expected_context_framework/expected_context_model.py
index 5fb3cbc8..0e818eb9 100644
--- a/convokit/expected_context_framework/expected_context_model.py
+++ b/convokit/expected_context_framework/expected_context_model.py
@@ -25,16 +25,10 @@ class ExpectedContextModelTransformer(Transformer):
data consisting of pairs of utterances and context-utterances, represented as feature vectors (e.g., tf-idf reweighted
term-document matrices), specified via the `vect_field` and `context_vect_field` arguments. This model is stored as the `ec_model` attribute of the transformer, and can be accessed as such.
In the fit step, the model, which is based off of latent semantic analysis (LSA), computes the following:
- * representations of terms and utterances in the training data, with respect to the context,
- along with representations of the context (which are derived in the underlying LSA step). the dimensionality of these
- representations is specified via the `n_svd_dims` argument (see also the `snip_first_dim` and `random_state` arguments). these can
- be accessed via various `get` functions that the transformer provides.
- * a term-level statistic, "range", measuring the variation in context-utterances associated with a term. One interpretation of
- this statistic is that it quantifies the "strengths of our expectations" of what reply a term typically gets, or what predecessors
- it typically follows.
- * a clustering of utterance, term and context representations. The resultant clusters can help interpret the representations the model
- derives, by highlighting salient groupings that emerge. The number of clusters is specified via the `n_clusters` argument;
- the `print_clusters` function can be called to inspect this output. (see also the `cluster_on` and `cluster_random_state` arguments)
+
+ * representations of terms and utterances in the training data, with respect to the context, along with representations of the context (which are derived in the underlying LSA step). the dimensionality of these representations is specified via the `n_svd_dims` argument (see also the `snip_first_dim` and `random_state` arguments). these can be accessed via various `get` functions that the transformer provides.
+ * a term-level statistic, "range", measuring the variation in context-utterances associated with a term. One interpretation of this statistic is that it quantifies the "strengths of our expectations" of what reply a term typically gets, or what predecessors it typically follows.
+ * a clustering of utterance, term and context representations. The resultant clusters can help interpret the representations the model derives, by highlighting salient groupings that emerge. The number of clusters is specified via the `n_clusters` argument; the `print_clusters` function can be called to inspect this output. (see also the `cluster_on` and `cluster_random_state` arguments)
@@ -46,31 +40,20 @@ class ExpectedContextModelTransformer(Transformer):
The transformer contains various functions to access term-level characterizations. In the transform step, it outputs
vector representations of utterances, stored as `<output_prefix>_repr` in the corpus. It also outputs various attributes
of utterances (names prefixed with `<output_prefix>_`), stored as metadata fields in each transformed utterance:
- * range: the range of the utterance
- * clustering.cluster: the name of the cluster the utterance has been assigned to
- * clustering.cluster_id_: the numerical ID (0-# of clusters) of the cluster the utterance has been assigned to
- * clustering.cluster_dist: the distance between the utterance representation and the centroid of its cluster
-
- :param context_field: the name of an utterance-level attribute containing the ID of the corresponding context-utterance.
- in particular, to use immediate predecessors as context, set `context_field` to `'reply_to'`. as another example,
- to use immediate replies, provided that utterances contain an attribute `next_id` containing the ID of their reply,
- set `context_field` to `'next_id'`.
- :param output_prefix: the name of the attributes and vectors to write to in the transform step. the transformer outputs several
- fields, which will be prefixed with the given string.
+
+ * `range`: the range of the utterance
+ * `clustering.cluster`: the name of the cluster the utterance has been assigned to
+ * `clustering.cluster_id_`: the numerical ID (0-# of clusters) of the cluster the utterance has been assigned to
+ * `clustering.cluster_dist`: the distance between the utterance representation and the centroid of its cluster
+
+ :param context_field: the name of an utterance-level attribute containing the ID of the corresponding context-utterance. in particular, to use immediate predecessors as context, set `context_field` to `'reply_to'`. as another example, to use immediate replies, provided that utterances contain an attribute `next_id` containing the ID of their reply, set `context_field` to `'next_id'`.
+ :param output_prefix: the name of the attributes and vectors to write to in the transform step. the transformer outputs several fields, which will be prefixed with the given string.
:param vect_field: the name of the vectors to use as input vector representation for utterances, as stored in a corpus.
- :param context_vect_field: the name of the vectors to use as input vector representations for context-utterances, as stored in a corpus. by default,
- the transformer will use the same vector representations as utterances, specified in `vect_field`. if you expect that utterances
- and context-utterances will differ in some way (e.g., they come from speakers in a conversation who play clearly delineated roles),
- then it's a good idea to use a different input representation.
+ :param context_vect_field: the name of the vectors to use as input vector representations for context-utterances, as stored in a corpus. by default, the transformer will use the same vector representations as utterances, specified in `vect_field`. if you expect that utterances and context-utterances will differ in some way (e.g., they come from speakers in a conversation who play clearly delineated roles), then it's a good idea to use a different input representation.
:param n_svd_dims: the dimensionality of the representations to derive (via LSA/SVD).
- :param snip_first_dim: whether or not to remove the first dimension of the derived representations. by default this is set to `True`, since we've
- found that the first dimension tends to reflect term frequency, making the output less informative. Note that if `snip_first_dim=True`
- then in practice, we output `n_svd_dims-1`-dimensional representations.
+ :param snip_first_dim: whether or not to remove the first dimension of the derived representations. by default this is set to `True`, since we've found that the first dimension tends to reflect term frequency, making the output less informative. Note that if `snip_first_dim=True` then in practice, we output `n_svd_dims-1`-dimensional representations.
:param n_clusters: the number of clusters to infer.
- :param cluster_on: whether to cluster on utterance or term representations, (corresponding to values `'utts'` or `'terms'`). By default, we infer clusters
- based on representations of the utterances from the training data, and then assign term and context-utterance representations to the resultant clusters.
- In some cases (e.g., if utterances are highly unstructured and lengthy) it might
- be better to cluster term representations first.
+ :param cluster_on: whether to cluster on utterance or term representations (corresponding to values `'utts'` or `'terms'`). By default, we infer clusters based on representations of the utterances from the training data, and then assign term and context-utterance representations to the resultant clusters. In some cases (e.g., if utterances are highly unstructured and lengthy) it might be better to cluster term representations first.
:param model: an existing, fitted `ExpectedContextModelTransformer` object to initialize with (optional)
:param random_state: the random seed to use in the LSA step (which calls a randomized implementation of SVD)
:param cluster_random_state: the random seed to use to infer clusters.
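
Pulling the parameter list together, a single-context sketch that uses immediate predecessors, per the `context_field` description above; the prefix and field names are illustrative:

```python
# Sketch: characterize utterances with respect to their immediate predecessors.
from convokit.expected_context_framework import ExpectedContextModelTransformer

ec_bk = ExpectedContextModelTransformer(
    context_field='reply_to',        # immediate predecessors as context (per the docstring)
    output_prefix='bk',              # transform writes bk_repr vectors and bk_* metadata
    vect_field='col_normed_tfidf',
    n_svd_dims=25,
    snip_first_dim=True,             # drop the frequency-like first dimension
    n_clusters=8,
    cluster_on='utts',
)
ec_bk.fit(corpus)
ec_bk.transform(corpus)              # adds bk_range and bk_clustering.* fields per utterance
```
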
@@ -240,9 +223,7 @@ def transform_context_utts(self, corpus, selector=lambda x: True):
def fit_clusters(self, n_clusters='default', random_state='default'):
"""
- Infers a clustering of term or utterance representations (specified by the `cluster_on` argument used to initialize the transformer).
- on the training data originally used to fit the transformer.
- Can be called to infer a different number of clusters than what was initially specified.
+ Infers a clustering of term or utterance representations (specified by the `cluster_on` argument used to initialize the transformer) on the training data originally used to fit the transformer. Can be called to infer a different number of clusters than what was initially specified.
:param n_clusters: number of clusters to infer. defaults to the number of clusters specified when initializing the transformer.
:param random_state: random seed used to infer clusters. defaults to the random seed used to initialize the transformer.
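
Since re-clustering does not refit the underlying LSA step, a different cluster count can be tried cheaply on an already-fitted transformer:

```python
# Sketch: re-cluster the already-fitted representations with a different k.
ec_bk.fit_clusters(n_clusters=12, random_state=1000)
ec_bk.print_clusters(k=10, corpus=corpus)
```
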
@@ -305,8 +286,8 @@ def print_clusters(self, k=10, max_chars=1000, corpus=None):
and context-utterances as well.
:param k: number of examples to print out.
- :max_chars: maximum number of characters per utterance/context-utterance to print. Can be toggled to control the size of the output.
- :corpus: optional, the corpus that the transformer was trained on. if set, will print example utterances and context-utterances as well as terms.
+ :param max_chars: maximum number of characters per utterance/context-utterance to print. Can be toggled to control the size of the output.
+ :param corpus: optional, the corpus that the transformer was trained on. if set, will print example utterances and context-utterances as well as terms.
:return: None
"""
@@ -351,9 +332,10 @@ def print_cluster_stats(self):
def summarize(self, k=10, max_chars=1000, corpus=None):
"""
Wrapper function to print inferred clusters and statistics about their sizes.
+
:param k: number of examples to print out.
- :max_chars: maximum number of characters per utterance/context-utterance to print. Can be toggled to control the size of the output.
- :corpus: optional, the corpus that the transformer was trained on. if set, will print example utterances and context-utterances as well as terms.
+ :param max_chars: maximum number of characters per utterance/context-utterance to print. Can be toggled to control the size of the output.
+ :param corpus: optional, the corpus that the transformer was trained on. if set, will print example utterances and context-utterances as well as terms.
:return: None
"""
@@ -407,6 +389,7 @@ def get_context_term_reprs(self):
def get_clustering(self):
"""
Returns a dictionary containing various objects pertaining to the inferred clustering, with fields as follows:
+
* `km_obj`: the fitted KMeans object
* `utts`: a Pandas dataframe of cluster assignments for utterances from the training data
* `terms`: a dataframe of cluster assignments for terms
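
A sketch of reading that dictionary back, using the fields listed above (the hunk may omit further fields):

```python
# Sketch: unpack the clustering output.
clustering = ec_bk.get_clustering()
print(clustering['km_obj'])        # the fitted KMeans object
print(clustering['utts'].head())   # cluster assignments for training utterances
print(clustering['terms'].head())  # cluster assignments for terms
```
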
diff --git a/convokit/expected_context_framework/expected_context_model_pipeline.py b/convokit/expected_context_framework/expected_context_model_pipeline.py
index c7a94d43..1b1b3eb4 100644
--- a/convokit/expected_context_framework/expected_context_model_pipeline.py
+++ b/convokit/expected_context_framework/expected_context_model_pipeline.py
@@ -10,18 +10,16 @@
class ExpectedContextModelPipeline(Transformer):
"""
Wrapper class implementing a pipeline that derives characterizations of terms and utterances in terms of their conversational context. The pipeline handles the following steps:
+
* processing input text (via a pipeline supplied by the user in the `text_pipe` argument);
* transforming text to input representation (via `ColNormedTfidfTransformer`);
* deriving characterizations (via `ExpectedContextModelTransformer`)
- The `ColNormTfidfTransformer` components are stored as the `tfidf_model` and `context_tfidf_model` attributes of the class; the `ExpectedContextModelTransformer` is stored as the `ec_model` attribute.
+ The `ColNormedTfidfTransformer` components are stored as the `tfidf_model` and `context_tfidf_model` attributes of the class; the `ExpectedContextModelTransformer` is stored as the `ec_model` attribute.
For further details, see the `ColNormedTfidfTransformer` and `ExpectedContextModelTransformer` classes.
- :param context_field: the name of an utterance-level attribute containing the ID of the corresponding context-utterance.
- in particular, to use immediate predecessors as context, set `context_field` to `'reply_to'`. as another example,
- to use immediate replies, provided that utterances contain an attribute `next_id` containing the ID of their reply,
- set `context_field` to `'next_id'`.
+ :param context_field: the name of an utterance-level attribute containing the ID of the corresponding context-utterance. in particular, to use immediate predecessors as context, set `context_field` to `'reply_to'`. as another example, to use immediate replies, provided that utterances contain an attribute `next_id` containing the ID of their reply, set `context_field` to `'next_id'`.
:param output_prefix: the name of the attributes and vectors to write to in the transform step. the transformer outputs several fields, which will be prefixed with the given string.
:param text_field: the name of the utterance-level attribute containing the text to use as input.
:param context_text_field: the name of the utterance-level attribute containing the text to use as input for context-utterances. by default, is equivalent to `text_field`.
@@ -33,14 +31,9 @@ class ExpectedContextModelPipeline(Transformer):
:param min_terms: the minimum number of terms in the vocabulary, derived by `ColNormedTfidfTransformer`, that an utterance must contain for it to be considered in fitting and transforming the underlying `ExpectedContextModelTransformer` object. defaults to 0, meaning the transformer will consider all utterances.
:param context_min_terms: minimum number of terms in the vocabulary for a context-utterance to be considered in fitting and transforming the underlying `ExpectedContextModelTransformer` object. equivalent to `min_terms` by default.
:param n_svd_dims: the dimensionality of the representations to derive (via LSA/SVD).
- :param snip_first_dim: whether or not to remove the first dimension of the derived representations. by default this is set to `True`, since we've
- found that the first dimension tends to reflect term frequency, making the output less informative. Note that if `snip_first_dim=True`
- then in practice, we output `n_svd_dims-1`-dimensional representations.
+ :param snip_first_dim: whether or not to remove the first dimension of the derived representations. by default this is set to `True`, since we've found that the first dimension tends to reflect term frequency, making the output less informative. Note that if `snip_first_dim=True` then in practice, we output `n_svd_dims-1`-dimensional representations.
:param n_clusters: the number of clusters to infer.
- :param cluster_on: whether to cluster on utterance or term representations, (corresponding to values `'utts'` or `'terms'`). By default, we infer clusters
- based on representations of the utterances from the training data, and then assign term and context-utterance representations to the resultant clusters.
- In some cases (e.g., if utterances are highly unstructured and lengthy) it might
- be better to cluster term representations first.
+ :param cluster_on: whether to cluster on utterance or term representations (corresponding to values `'utts'` or `'terms'`). By default, we infer clusters based on representations of the utterances from the training data, and then assign term and context-utterance representations to the resultant clusters. In some cases (e.g., if utterances are highly unstructured and lengthy) it might be better to cluster term representations first.
:param ec_model: an existing, fitted `ExpectedContextModelPipeline` object to initialize with (optional)
:param random_state: the random seed to use in the LSA step (which calls a randomized implementation of SVD)
:param cluster_random_state: the random seed to use to infer clusters.
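
A construction sketch for the pipeline. The `ConvokitPipeline`/`TextProcessor` text-preparation step is a hypothetical placeholder whose exact signatures are assumptions, as is the `text_prep` field it writes; any ConvoKit pipeline that writes the field named by `text_field` should work in its place:

```python
# Sketch: end-to-end pipeline from raw text to expected-context characterizations.
# The text_pipe below is a hypothetical placeholder.
from convokit import ConvokitPipeline
from convokit.text_processing import TextProcessor
from convokit.expected_context_framework import ExpectedContextModelPipeline

text_pipe = ConvokitPipeline([
    ('lower', TextProcessor(proc_fn=lambda text: text.lower(), output_field='text_prep')),
])
ec_pipe = ExpectedContextModelPipeline(
    context_field='reply_to',
    output_prefix='bk',
    text_field='text_prep',
    text_pipe=text_pipe,
    min_terms=3,      # skip utterances with fewer than 3 in-vocabulary terms
    n_svd_dims=25,
    n_clusters=8,
)
ec_pipe.fit(corpus)
ec_pipe.transform(corpus)
```
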
@@ -132,10 +125,8 @@ def fit(self, corpus, y=None, selector=lambda x: True, context_selector=lambda x
range statistics for terms, and a clustering of the resultant representations.
:param corpus: Corpus containing training data
- :param selector: a boolean function of signature `filter(utterance)` that determines which utterances
- will be considered in the fit step. defaults to using all utterances, subject to `min_terms` parameter passed at initialization.
- :param context_selector: a boolean function of signature `filter(utterance)` that determines which context-utterances
- will be considered in the fit step. defaults to using all utterances, subject to `context_min_terms` parameter passed at initialization.
+ :param selector: a boolean function of signature `filter(utterance)` that determines which utterances will be considered in the fit step. defaults to using all utterances, subject to `min_terms` parameter passed at initialization.
+ :param context_selector: a boolean function of signature `filter(utterance)` that determines which context-utterances will be considered in the fit step. defaults to using all utterances, subject to `context_min_terms` parameter passed at initialization.
:return: None
"""
@@ -156,8 +147,7 @@ def transform(self, corpus, y=None, selector=lambda x: True):
Computes vector representations, ranges, and cluster assignments for utterances in a corpus.
:param corpus: Corpus
- :param selector: a boolean function of signature `filter(utterance)` that determines which utterances
- to transform.
+ :param selector: a boolean function of signature `filter(utterance)` that determines which utterances to transform.
:return: the Corpus, with per-utterance representations, ranges and cluster assignments.
"""
_ = self.text_pipe.transform(corpus)
@@ -183,9 +173,10 @@ def transform_utterance(self, utt):
def summarize(self, k=10, max_chars=1000, corpus=None):
"""
Prints inferred clusters and statistics about their sizes.
+
:param k: number of examples to print out.
- :max_chars: maximum number of characters per utterance/context-utterance to print. Can be toggled to control the size of the output.
- :corpus: optional, the corpus that the transformer was trained on. if set, will print example utterances and context-utterances as well as terms.
+ :param max_chars: maximum number of characters per utterance/context-utterance to print. Can be toggled to control the size of the output.
+ :param corpus: optional, the corpus that the transformer was trained on. if set, will print example utterances and context-utterances as well as terms.
:return: None
"""
@@ -253,18 +244,16 @@ def dump(self, dirname):
class DualContextPipeline(Transformer):
"""
Wrapper class implementing a pipeline that derives characterizations of terms and utterances in terms of two choices of conversational context. The pipeline handles the following steps:
+
* processing input text (via a pipeline supplied by the user in the `text_pipe` argument);
* transforming text to input representation (via `ColNormedTfidfTransformer`);
* deriving characterizations (via `DualContextWrapper`)
- The `ColNormTfidfTransformer` components are stored as the `tfidf_model` and `context_tfidf_model` attributes of the class; the `DualContextWrapper` is stored as the `dualmodel` attribute.
+ The `ColNormedTfidfTransformer` components are stored as the `tfidf_model` and `context_tfidf_model` attributes of the class; the `DualContextWrapper` is stored as the `dualmodel` attribute.
For further details, see the `ColNormedTfidfTransformer` and `DualContextWrapper` classes.
- :param context_field: the name of an utterance-level attribute containing the ID of the corresponding context-utterance.
- in particular, to use immediate predecessors as context, set `context_field` to `'reply_to'`. as another example,
- to use immediate replies, provided that utterances contain an attribute `next_id` containing the ID of their reply,
- set `context_field` to `'next_id'`.
+ :param context_field: the name of an utterance-level attribute containing the ID of the corresponding context-utterance. in particular, to use immediate predecessors as context, set `context_field` to `'reply_to'`. as another example, to use immediate replies, provided that utterances contain an attribute `next_id` containing the ID of their reply, set `context_field` to `'next_id'`.
:param output_prefixes: list containing the name of the attributes and vectors that the `DualContextWrapper` component will write to in the transform step.
:param text_field: the name of the utterance-level attribute containing the text to use as input.
:param context_text_field: the name of the utterance-level attribute containing the text to use as input for context-utterances. by default, is equivalent to `text_field`.
@@ -277,14 +266,9 @@ class DualContextPipeline(Transformer):
:param min_terms: the minimum number of terms in the vocabulary, derived by `ColNormedTfidfTransformer`, that an utterance must contain for it to be considered in fitting and transforming the underlying `ExpectedContextModelTransformer` object. defaults to 0, meaning the transformer will consider all utterances.
:param context_min_terms: minimum number of terms in the vocabulary for a context-utterance to be considered in fitting and transforming the underlying `ExpectedContextModelTransformer` object. equivalent to `min_terms` by default.
:param n_svd_dims: the dimensionality of the representations to derive (via LSA/SVD).
- :param snip_first_dim: whether or not to remove the first dimension of the derived representations. by default this is set to `True`, since we've
- found that the first dimension tends to reflect term frequency, making the output less informative. Note that if `snip_first_dim=True`
- then in practice, we output `n_svd_dims-1`-dimensional representations.
+ :param snip_first_dim: whether or not to remove the first dimension of the derived representations. by default this is set to `True`, since we've found that the first dimension tends to reflect term frequency, making the output less informative. Note that if `snip_first_dim=True` then in practice, we output `n_svd_dims-1`-dimensional representations.
:param n_clusters: the number of clusters to infer.
- :param cluster_on: whether to cluster on utterance or term representations, (corresponding to values `'utts'` or `'terms'`). By default, we infer clusters
- based on representations of the utterances from the training data, and then assign term and context-utterance representations to the resultant clusters.
- In some cases (e.g., if utterances are highly unstructured and lengthy) it might
- be better to cluster term representations first.
+ :param cluster_on: whether to cluster on utterance or term representations (corresponding to values `'utts'` or `'terms'`). By default, we infer clusters based on representations of the utterances from the training data, and then assign term and context-utterance representations to the resultant clusters. In some cases (e.g., if utterances are highly unstructured and lengthy) it might be better to cluster term representations first.
:param random_state: the random seed to use in the LSA step (which calls a randomized implementation of SVD)
:param cluster_random_state: the random seed to use to infer clusters.
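
The dual-context pipeline mirrors this. A sketch, noting that the docstring above names a single `context_field` while the underlying `DualContextWrapper` takes a list, so the argument shape used here is an assumption:

```python
# Sketch: dual-context pipeline over predecessors and replies.
from convokit.expected_context_framework import DualContextPipeline

dual_pipe = DualContextPipeline(
    context_fields=['reply_to', 'next_id'],  # assumed to parallel DualContextWrapper
    output_prefixes=['bk', 'fwd'],
    text_field='text_prep',
    text_pipe=text_pipe,
    min_terms=3,
    n_svd_dims=25,
    n_clusters=8,
)
dual_pipe.fit(corpus)
dual_pipe.summarize(k=5, corpus=corpus)
```
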
@@ -370,10 +354,8 @@ def fit(self, corpus, y=None, selector=lambda x: True, context_selector=lambda x
Fits the model over training data.
:param corpus: Corpus containing training data
- :param selector: a boolean function of signature `filter(utterance)` that determines which utterances
- will be considered in the fit step. defaults to using all utterances, subject to `min_terms` parameter passed at initialization.
- :param context_selector: a boolean function of signature `filter(utterance)` that determines which context-utterances
- will be considered in the fit step. defaults to using all utterances, subject to `context_min_terms` parameter passed at initialization.
+ :param selector: a boolean function of signature `filter(utterance)` that determines which utterances will be considered in the fit step. defaults to using all utterances, subject to `min_terms` parameter passed at initialization.
+ :param context_selector: a boolean function of signature `filter(utterance)` that determines which context-utterances will be considered in the fit step. defaults to using all utterances, subject to `context_min_terms` parameter passed at initialization.
:return: None
"""
self.text_pipe.fit_transform(corpus)
@@ -393,8 +375,7 @@ def transform(self, corpus, y=None, selector=lambda x: True):
Computes vector representations, and statistics for utterances in a corpus, using the `DualContextWrapper` component.
:param corpus: Corpus
- :param selector: a boolean function of signature `filter(utterance)` that determines which utterances
- to transform. defaults to all utterances.
+ :param selector: a boolean function of signature `filter(utterance)` that determines which utterances to transform. defaults to all utterances.
:return: the Corpus, with per-utterance attributes.
"""
_ = self.text_pipe.transform(corpus)
@@ -421,9 +402,10 @@ def transform_utterance(self, utt):
def summarize(self, k=10, max_chars=1000, corpus=None):
"""
Prints inferred clusters and statistics about their sizes, for each component in the underlying `DualContextWrapper`.
+
:param k: number of examples to print out.
- :max_chars: maximum number of characters per utterance/context-utterance to print. Can be toggled to control the size of the output.
- :corpus: optional, the corpus that the transformer was trained on. if set, will print example utterances and context-utterances as well as terms.
+ :param max_chars: maximum number of characters per utterance/context-utterance to print. Can be toggled to control the size of the output.
+ :param corpus: optional, the corpus that the transformer was trained on. if set, will print example utterances and context-utterances as well as terms.
:return: None
"""
diff --git a/doc/source/col_normed_tfidf.rst b/doc/source/col_normed_tfidf.rst
index 44455a6c..8ebab333 100644
--- a/doc/source/col_normed_tfidf.rst
+++ b/doc/source/col_normed_tfidf.rst
@@ -4,4 +4,5 @@ Column normalized Tf-Idf
Implements a modified Tf-Idf transformer that normalizes by columns (i.e., term-wise).
.. automodule:: convokit.expected_context_framework.col_normed_tfidf
+ :members:
diff --git a/doc/source/expected_context_model.rst b/doc/source/expected_context_model.rst
index d4b4680f..05493833 100644
--- a/doc/source/expected_context_model.rst
+++ b/doc/source/expected_context_model.rst
@@ -3,9 +3,19 @@ Expected Context Framework
Implements the Expected Context Framework as described in `this dissertation <https://tisjune.github.io/research/dissertation>`_.
+Contains:
+
+* Basic `ExpectedContextModelTransformer `_
+* Wrapper `DualContextWrapper `_ that handles two choices of conversational context
+* Wrapper pipelines `ExpectedContextModelPipeline `_ and `DualContextPipeline `_
+
+
Example usage:
-`deriving question types and other characterizations in British parliamentary question periods `_,
-exploration of Switchboard dialog acts corpus `using ExpectedContextModelTransformer `_, and `using DualContextWrapper `_, `examining Wikipedia talk page discussions `_ and `computing the orientation of justice utterances in the US Supreme Court `_
+
+* `deriving question types and other characterizations in British parliamentary question periods <https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/blob/master/convokit/expected_context_framework/demos/parliament_demo.ipynb>`_
+* exploration of Switchboard dialog acts corpus `using ExpectedContextModelTransformer <https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/blob/master/convokit/expected_context_framework/demos/switchboard_exploration_demo.ipynb>`_, and `using DualContextWrapper `_
+* `examining Wikipedia talk page discussions <https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/blob/master/convokit/expected_context_framework/demos/wiki_awry_demo.ipynb>`_
+* `computing the orientation of justice utterances in the US Supreme Court <https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/blob/master/convokit/expected_context_framework/demos/scotus_orientation_demo.ipynb>`_
.. automodule:: convokit.expected_context_framework.expected_context_model
:members:
diff --git a/doc/source/featureExtraction.rst b/doc/source/featureExtraction.rst
index 7b6b949c..233c6dfa 100644
--- a/doc/source/featureExtraction.rst
+++ b/doc/source/featureExtraction.rst
@@ -7,6 +7,7 @@ These are the transformers related to extracting features from the corpus and it
:maxdepth: 2
Bag-of-words
+ Column-normalized tf-idf <col_normed_tfidf>
Hyperconvo
PhrasingMotifs
PolitenessStrategies
diff --git a/website/index.md+ b/website/index.md+
index b2fc66e9..1289e88e 100644
--- a/website/index.md+
+++ b/website/index.md+
@@ -10,7 +10,7 @@
-This toolkit contains tools to extract conversational features and analyze social phenomena in conversations, using a [single unified interface](https://convokit.cornell.edu/documentation/architecture.html) inspired by (and compatible with) scikit-learn. Several large [conversational datasets](https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit#datasets) are included together with scripts exemplifying the use of the toolkit on these datasets. The latest version is [2.4.3](https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/releases/tag/v2.4) (released 30 Oct 2020); follow the [project on GitHub](https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit) to keep track of updates.
+This toolkit contains tools to extract conversational features and analyze social phenomena in conversations, using a [single unified interface](https://convokit.cornell.edu/documentation/architecture.html) inspired by (and compatible with) scikit-learn. Several large [conversational datasets](https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit#datasets) are included together with scripts exemplifying the use of the toolkit on these datasets. The latest version is [2.5](https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/releases/tag/v2.5) (released 06 Jul 2021); follow the [project on GitHub](https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit) to keep track of updates.
Read our [documentation](https://convokit.cornell.edu/documentation) or try ConvoKit in our [interactive tutorial](https://colab.research.google.com/github/CornellNLP/Cornell-Conversational-Analysis-Toolkit/blob/master/examples/Introduction_to_ConvoKit.ipynb).
@@ -26,12 +26,18 @@ Example: [exploring the balance of power in the U.S. Supreme Court](https://gith
A set of lexical and parse-based features correlating with politeness and impoliteness.
Example: [understanding the (mis)use of politeness strategies in conversations gone awry on Wikipedia](https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/blob/master/examples/conversations-gone-awry/Conversations_Gone_Awry_Prediction.ipynb).
-### [Prompt types](http://www.cs.cornell.edu/~cristian/Asking_too_much.html) [(API)](https://convokit.cornell.edu/documentation/promptTypes.html)
+### [Expected Conversational Context Framework](https://tisjune.github.io/research/dissertation) [(API)](https://convokit.cornell.edu/documentation/expected_context_model.html)
+
+A framework for characterizing utterances and terms based on their expected conversational context, consisting of model implementations and wrapper pipelines.
+Examples: [deriving question types and other characterizations in British parliamentary question periods](https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/blob/master/convokit/expected_context_framework/demos/parliament_demo.ipynb),
+[exploration of Switchboard dialog acts corpus](https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/blob/master/convokit/expected_context_framework/demos/switchboard_exploration_demo.ipynb), [examining Wikipedia talk page discussions](https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/blob/master/convokit/expected_context_framework/demos/wiki_awry_demo.ipynb) and [computing the orientation of justice utterances in the US Supreme Court](https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/blob/master/convokit/expected_context_framework/demos/scotus_orientation_demo.ipynb)
+
+
### [Hypergraph conversation representation](http://www.cs.cornell.edu/~cristian/Patterns_of_participant_interactions.html) [(API)](https://convokit.cornell.edu/documentation/hyperconvo.html)
A method for extracting structural features of conversations through a hypergraph representation.
@@ -45,9 +51,7 @@ Example: [speaker conversation attributes and diversity example on ChangeMyView]
A neural model for forecasting future outcomes of conversations (e.g., derailment into personal attacks) as they develop.
Available as an interactive notebook: [full version (fine-tuning + inference)](https://colab.research.google.com/drive/1SH4iMEHdoH4IovN-b9QOSK4kG4DhAwmb) or [inference-only](https://colab.research.google.com/drive/1GvICZN0VwZQSWw3pJaEVY-EQGoO-L5lH).
-### [Orientation (coming soon)](https://www.cs.cornell.edu/~cristian/Orientation.html)
-A method to quantify the degree to which an utterance is intended to direct the flow of the conversation forwards or backwards.
## Datasets
ConvoKit ships with several datasets ready for use "out-of-the-box".
@@ -155,6 +159,12 @@ A conversational dataset comprising group meetings of two to four participants t
Name for download: `gap-corpus`
+### [Wikipedia Articles for Deletion Corpus](https://convokit.cornell.edu/documentation/wiki-articles-for-deletion-corpus.html)
+
+A collection of Wikipedia's Articles for Deletion editor debates that occurred between January 1, 2005 and December 31, 2018. This corpus contains about 3,200,000 contributions by approximately 150,000 Wikipedia editors across almost 400,000 debates.
+
+Name for download: `wiki-articles-for-deletion-corpus`
+
### ...And your own corpus!
In addition to the provided datasets, you may also use ConvoKit with your own custom datasets by loading them into a `convokit.Corpus` object. [This example script](https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/blob/master/examples/converting_movie_corpus.ipynb) shows how to construct a Corpus from custom data.