Skip to content

Commit

Permalink
updated documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
tisjune committed Jul 5, 2021
1 parent 19da017 commit fbf994d
Show file tree
Hide file tree
Showing 8 changed files with 100 additions and 124 deletions.
5 changes: 2 additions & 3 deletions convokit/expected_context_framework/col_normed_tfidf.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,7 @@ def fit(self, corpus, y=None, selector=lambda x: True):

def transform(self, corpus, selector=lambda x: True):
"""
Computes column-normalized tf-idf representations for utterances in a corpus, stored in the corpus as `<output_field>`. Also annotates each utterance with a metadata field,
`<output_field>__n_feats`, indicating the number of terms in the vocabulary that utterance contains.
Computes column-normalized tf-idf representations for utterances in a corpus, stored in the corpus as `<output_field>`. Also annotates each utterance with a metadata field, `<output_field>__n_feats`, indicating the number of terms in the vocabulary that utterance contains.
:param corpus: Corpus
Expand Down Expand Up @@ -119,7 +118,7 @@ class ColNormedTfidf(TransformerMixin):

"""
Model that derives tf-idf reweighted representations of utterances,
which are normalized by column. Can be used in ConvoKit through the `ColNormedTfidfWrapper` transformer; see documentation of that transformer for further details.
which are normalized by column. Can be used in ConvoKit through the `ColNormedTfidfTransformer` transformer; see documentation of that transformer for further details.
"""

def __init__(self, **kwargs):
Expand Down
60 changes: 25 additions & 35 deletions convokit/expected_context_framework/dual_context_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,39 +7,31 @@
from convokit.transformer import Transformer

class DualContextWrapper(Transformer):
"""
Transformer that derives and compares characterizations of terms and utterances with respect to two different choices of conversational context. Designed in particular to contrast replies and predecessors, though other choices of context are also possible.
This is a wrapper that encompasses two instances of `ExpectedContextModelTransformer`, stored at the `ec_models` attribute.
It computes two particular comparative term-level statistics, orientation and shift, stored as the `term_orientations` and `term_shifts` attributes.
It also computes these statistics at the utterance level in the transform step.
:param context_fields: list containing the names of the utterance-level attributes containing the IDs of the context-utterances used by each of the `ExpectedContextModelTransformer` instances.
:param output_prefixes: list containing the names of the attributes and vectors that each `ExpectedContextModelTransformer` instance will write to in the transform step.
:param vect_field: the name of the vectors to use as input vector representation for utterances, as stored in a corpus.
:param context_vect_field: the name of the vectors to use as input vector representations for context-utterances, as stored in a corpus. By default, the transformer will use the same vector representations as utterances, specified in `vect_field`. If you expect that utterances and context-utterances will differ in some way (e.g., they come from speakers in a conversation who play clearly delineated roles), then it's a good idea to use a different input representation.
:param wrapper_output_prefix: the metadata fields where the utterance-level orientation and shift statistics are stored. By default, these attributes are stored as `orn` and `shift` in the metadata; if `wrapper_output_prefix` is specified, then they are stored as `<wrapper_output_prefix>_orn` (orientation) and `<wrapper_output_prefix>_shift` (shift).
:param n_svd_dims: the dimensionality of the representations to derive (via LSA/SVD).
:param snip_first_dim: whether or not to remove the first dimension of the derived representations. By default this is set to `True`, since we've found that the first dimension tends to reflect term frequency, making the output less informative. Note that if `snip_first_dim=True` then in practice, we output `n_svd_dims-1`-dimensional representations.
:param n_clusters: the number of clusters to infer.
:param cluster_on: whether to cluster on utterance or term representations (corresponding to values `'utts'` or `'terms'`). By default, we infer clusters based on representations of the utterances from the training data, and then assign term and context-utterance representations to the resultant clusters. In some cases (e.g., if utterances are highly unstructured and lengthy) it might be better to cluster term representations first.
:param random_state: the random seed to use in the LSA step (which calls a randomized implementation of SVD)
:param cluster_random_state: the random seed to use to infer clusters.
"""
def __init__(self, context_fields, output_prefixes,
vect_field, context_vect_field=None, wrapper_output_prefix='',
n_svd_dims=25, snip_first_dim=True, n_clusters=8, cluster_on='utts',
random_state=None, cluster_random_state=None):
"""
Transformer that derives and compares characterizations of terms and utterances with respect to two different choices of conversational context. Designed in particular to contrast replies and predecessors, though other choices of context are also possible.
This is a wrapper that encompasses two instances of `ExpectedContextModelTransformer`, stored at the `ec_models` attribute.
It computes two particular comparative term-level statistics, orientation and shift, stored as the `term_orientations` and `term_shifts` attributes.
It also computes these statistics at the utterance level in the transform step.
:param context_fields: list containing the names of the utterance-level attributes containing the IDs of the context-utterances used by each of the `ExpectedContextModelTransformer` instances.
:param output_prefixes: list containing the name of the attributes and vectors that each `ExpectedContextModelTransformer` instances will write to in the transform step.
:param vect_field: the name of the vectors to use as input vector representation for utterances, as stored in a corpus.
:param context_vect_field: the name of the vectors to use as input vector representations for context-utterances, as stored in a corpus. by default,
the transformer will use the same vector representations as utterances, specified in `vect_field`. if you expect that utterances
and context-utterances will differ in some way (e.g., they come from speakers in a conversation who play clearly delineated roles),
then it's a good idea to use a different input representation.
:param wrapper_output_prefix: the metadata fields where the utterance-level orientation and shift statistics are stored. By default, these attributes are stored as `orn` and `shift` in the metadata; if `wrapper_output_prefix` is specified, then they are stored as `<wrapper_output_prefix>_orn` (orientation) and `<wrapper_output_prefix>_shift` (shift).
:param n_svd_dims: the dimensionality of the representations to derive (via LSA/SVD).
:param snip_first_dim: whether or not to remove the first dimension of the derived representations. by default this is set to `True`, since we've
found that the first dimension tends to reflect term frequency, making the output less informative. Note that if `snip_first_dim=True`
then in practice, we output `n_svd_dims-1`-dimensional representations.
:param n_clusters: the number of clusters to infer.
:param cluster_on: whether to cluster on utterance or term representations, (corresponding to values `'utts'` or `'terms'`). By default, we infer clusters
based on representations of the utterances from the training data, and then assign term and context-utterance representations to the resultant clusters.
In some cases (e.g., if utterances are highly unstructured and lengthy) it might
be better to cluster term representations first.
:param random_state: the random seed to use in the LSA step (which calls a randomized implementation of SVD)
:param cluster_random_state: the random seed to use to infer clusters.

"""
self.context_fields = context_fields
self.output_prefixes = output_prefixes
self.vect_field = vect_field
Expand Down Expand Up @@ -76,10 +68,8 @@ def fit(self, corpus, y=None, selector=lambda x: True, context_selector=lambda x
Fits a transformer over training data: fits the two `ExpectedContextModelTransformer` instances, and computes term-level orientation and shift.
:param corpus: Corpus containing training data
:param selector: a boolean function of signature `filter(utterance)` that determines which utterances
will be considered in the fit step. defaults to using all utterances.
:param context_selector: a boolean function of signature `filter(utterance)` that determines which context-utterances
will be considered in the fit step. defaults to using all utterances.
:param selector: a boolean function of signature `filter(utterance)` that determines which utterances will be considered in the fit step. Defaults to using all utterances.
:param context_selector: a boolean function of signature `filter(utterance)` that determines which context-utterances will be considered in the fit step. Defaults to using all utterances.
:return: None
"""

Expand All @@ -94,8 +84,7 @@ def transform(self, corpus, selector=lambda x: True):
Computes vector representations, ranges, and cluster assignments for utterances in a corpus, using the two `ExpectedContextModelTransformer` instances. Also computes utterance-level orientation and shift.
:param corpus: Corpus
:param selector: a boolean function of signature `filter(utterance)` that determines which utterances
to transform. defaults to all utterances.
:param selector: a boolean function of signature `filter(utterance)` that determines which utterances to transform. Defaults to all utterances.
:return: the Corpus, with per-utterance attributes.
"""
self.ec_models[0].transform(corpus, selector=selector)
Expand Down Expand Up @@ -169,9 +158,10 @@ def get_term_df(self):
def summarize(self, k=10, max_chars=1000, corpus=None):
"""
For each constituent ExpectedContextModelTransformer, prints inferred clusters and statistics about their sizes.
:param k: number of examples to print out.
:max_chars: maximum number of characters per utterance/context-utterance to print. Can be toggled to control the size of the output.
:corpus: optional, the corpus that the transformer was trained on. if set, will print example utterances and context-utterances as well as terms.
:param max_chars: maximum number of characters per utterance/context-utterance to print. Can be toggled to control the size of the output.
:param corpus: optional, the corpus that the transformer was trained on. if set, will print example utterances and context-utterances as well as terms.
:return: None
"""
Expand Down
Loading

0 comments on commit fbf994d

Please sign in to comment.