Merge pull request #113 from chartbeat-labs/vectorizer
Add `Vectorizer` class to replace `vsm.doc_term_matrix` function
bdewilde authored Jun 21, 2017
2 parents 5b91cf7 + f3daac6 commit 9616c23
Showing 4 changed files with 418 additions and 145 deletions.
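The diffs below migrate callers from the module-level `vsm.doc_term_matrix()` function to the new `Vectorizer` class; the module that actually defines `Vectorizer` (textacy.vsm) is not expanded in this view, so the before/after sketch here is pieced together from the test changes and should be read as illustrative rather than as the definitive API:

    # Hypothetical before/after sketch of the migration, based on the test updates
    # in this commit; parameter names are taken from the tests, not from the
    # (unexpanded) textacy.vsm module diff.
    from textacy import Corpus
    from textacy.vsm import Vectorizer  # previously: from textacy.vsm import doc_term_matrix

    texts = ["Mary had a little lamb. Its fleece was white as snow.",
             "Mary loves the lamb, you know, the teacher did reply."]
    corpus = Corpus('en_core_web_sm', texts=texts)
    term_lists = [doc.to_terms_list(ngrams=1, named_entities=False, as_strings=True)
                  for doc in corpus]

    # Old API (removed by this PR): a single function call returned both the
    # document-term matrix and the id-to-term mapping.
    #   doc_term_matrix, id2term = doc_term_matrix(
    #       term_lists,
    #       weighting='tf', normalize=False, sublinear_tf=False, smooth_idf=True,
    #       min_df=1, max_df=1.0, min_ic=0.0, max_n_terms=None)

    # New API: configuration lives on a Vectorizer instance; fit_transform()
    # returns the document-term matrix, and the learned vocabulary mappings are
    # attributes of the fitted instance.
    vectorizer = Vectorizer(
        weighting='tf', normalize=False, sublinear_tf=False, smooth_idf=True,
        min_df=1, max_df=1.0, min_ic=0.0, max_n_terms=None)
    doc_term_matrix = vectorizer.fit_transform(term_lists)
    print(vectorizer.vocabulary)     # term string -> column index
    print(vectorizer.id_to_term)     # column index -> term string
    print(vectorizer.feature_names)  # list of terms, per test_vectorizer_feature_names below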
22 changes: 11 additions & 11 deletions tests/test_topic_model.py
@@ -9,7 +9,7 @@
 import numpy as np
 from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

-from textacy.vsm import doc_term_matrix
+from textacy.vsm import Vectorizer
 from textacy import Corpus
 from textacy.tm import TopicModel

@@ -25,13 +25,13 @@ def setUp(self):
                  "It waited patiently about until Mary did appear.",
                  "Why does the lamb love Mary so? The eager children cry.",
                  "Mary loves the lamb, you know, the teacher did reply."]
-        textcorpus = Corpus('en_core_web_sm', texts=texts)
+        corpus = Corpus('en_core_web_sm', texts=texts)
         term_lists = [doc.to_terms_list(ngrams=1, named_entities=False, as_strings=True)
-                      for doc in textcorpus]
-        self.doc_term_matrix, self.id2term = doc_term_matrix(
-            term_lists,
+                      for doc in corpus]
+        self.vectorizer = Vectorizer(
             weighting='tf', normalize=False, sublinear_tf=False, smooth_idf=True,
             min_df=1, max_df=1.0, min_ic=0.0, max_n_terms=None)
+        self.doc_term_matrix = self.vectorizer.fit_transform(term_lists)
         self.model = TopicModel('nmf', n_topics=5)
         self.model.fit(self.doc_term_matrix)
         self.tempdir = tempfile.mkdtemp(
@@ -76,25 +76,25 @@ def test_get_doc_topic_matrix_nonnormalized(self):

     def test_top_topic_terms_topics(self):
         self.assertEqual(
-            len(list(self.model.top_topic_terms(self.id2term, topics=-1))),
+            len(list(self.model.top_topic_terms(self.vectorizer.id_to_term, topics=-1))),
             self.model.n_topics)
         self.assertEqual(
-            len(list(self.model.top_topic_terms(self.id2term, topics=0))), 1)
+            len(list(self.model.top_topic_terms(self.vectorizer.id_to_term, topics=0))), 1)
         self.assertEqual(
            [topic_idx for topic_idx, _
-            in self.model.top_topic_terms(self.id2term, topics=(1, 2, 3))],
+            in self.model.top_topic_terms(self.vectorizer.id_to_term, topics=(1, 2, 3))],
            [1, 2, 3])

     def test_top_topic_terms_top_n(self):
         self.assertEqual(
-            len(list(self.model.top_topic_terms(self.id2term, topics=0, top_n=10))[0][1]),
+            len(list(self.model.top_topic_terms(self.vectorizer.id_to_term, topics=0, top_n=10))[0][1]),
             10)
         self.assertEqual(
-            len(list(self.model.top_topic_terms(self.id2term, topics=0, top_n=5))[0][1]),
+            len(list(self.model.top_topic_terms(self.vectorizer.id_to_term, topics=0, top_n=5))[0][1]),
             5)

     def test_top_topic_terms_weights(self):
-        observed = list(self.model.top_topic_terms(self.id2term, topics=-1,
+        observed = list(self.model.top_topic_terms(self.vectorizer.id_to_term, topics=-1,
                         top_n=10, weights=True))
         self.assertTrue(isinstance(observed[0][1][0], tuple))
         for topic_idx, term_weights in observed:
62 changes: 42 additions & 20 deletions tests/test_vsm.py
@@ -23,12 +23,34 @@ def setUp(self):
         corpus = Corpus('en_core_web_sm', texts=texts)
         term_lists = [doc.to_terms_list(ngrams=1, named_entities=False, as_strings=True)
                       for doc in corpus]
-        self.doc_term_matrix, self.id_to_term = vsm.doc_term_matrix(
-            term_lists,
+        self.vectorizer = vsm.Vectorizer(
             weighting='tf', normalize=False, sublinear_tf=False, smooth_idf=True,
             min_df=1, max_df=1.0, min_ic=0.0, max_n_terms=None)
-        self.idx_lamb = [k for k, v in self.id_to_term.items() if v == 'lamb'][0]
-        self.idx_child = [k for k, v in self.id_to_term.items() if v == 'child'][0]
+        self.doc_term_matrix = self.vectorizer.fit_transform(term_lists)
+        self.idx_lamb = [
+            id_ for term, id_ in self.vectorizer.vocabulary.items() if term == 'lamb'][0]
+        self.idx_child = [
+            id_ for term, id_ in self.vectorizer.vocabulary.items() if term == 'child'][0]

+    def test_vectorizer_feature_names(self):
+        expected = [
+            'mary', 'little', 'lamb', 'fleece', 'white', 'snow', 'go', 'sure',
+            'follow', 'school', 'day', 'rule', 'child', 'laugh', 'play', 'teacher',
+            'turn', 'linger', 'near', 'wait', 'patiently', 'appear', 'love',
+            'eager', 'cry', 'know', 'reply']
+        self.assertEqual(self.vectorizer.feature_names, expected)
+
+    def test_vectorizer_bad_init_params(self):
+        bad_init_params = (
+            {'min_df': -1},
+            {'max_df': -1},
+            {'max_n_terms': -1},
+            {'min_ic': -1.0},
+            {'vocabulary': 'foo bar bat baz'},
+        )
+        for bad_init_param in bad_init_params:
+            with self.assertRaises(ValueError):
+                vsm.Vectorizer(**bad_init_param)
+
     def test_get_term_freqs(self):
         term_freqs = vsm.get_term_freqs(self.doc_term_matrix, normalized=False)
@@ -79,38 +101,38 @@ def test_get_information_content(self):
         self.assertAlmostEqual(ics[self.idx_child], 0.81127, places=4)

     def test_filter_terms_by_df_identity(self):
-        dtm, i2t = vsm.filter_terms_by_df(self.doc_term_matrix, self.id_to_term,
-                                          max_df=1.0, min_df=1, max_n_terms=None)
+        dtm, vocab = vsm.filter_terms_by_df(self.doc_term_matrix, self.vectorizer.vocabulary,
+                                            max_df=1.0, min_df=1, max_n_terms=None)
         self.assertEqual(dtm.shape, self.doc_term_matrix.shape)
-        self.assertEqual(i2t, self.id_to_term)
+        self.assertEqual(vocab, self.vectorizer.vocabulary)

     def test_filter_terms_by_df_max_n_terms(self):
-        dtm, i2t = vsm.filter_terms_by_df(self.doc_term_matrix, self.id_to_term,
-                                          max_df=1.0, min_df=1, max_n_terms=2)
+        dtm, vocab = vsm.filter_terms_by_df(self.doc_term_matrix, self.vectorizer.vocabulary,
+                                            max_df=1.0, min_df=1, max_n_terms=2)
         self.assertEqual(dtm.shape, (8, 2))
-        self.assertEqual(sorted(i2t.values()), ['lamb', 'mary'])
+        self.assertEqual(sorted(vocab.keys()), ['lamb', 'mary'])

     def test_filter_terms_by_df_min_df(self):
-        dtm, i2t = vsm.filter_terms_by_df(self.doc_term_matrix, self.id_to_term,
-                                          max_df=1.0, min_df=2, max_n_terms=None)
+        dtm, vocab = vsm.filter_terms_by_df(self.doc_term_matrix, self.vectorizer.vocabulary,
+                                            max_df=1.0, min_df=2, max_n_terms=None)
         self.assertEqual(dtm.shape, (8, 6))
         self.assertEqual(
-            sorted(i2t.values()),
+            sorted(vocab.keys()),
            ['child', 'lamb', 'love', 'mary', 'school', 'teacher'])

     def test_filter_terms_by_df_exception(self):
         self.assertRaises(ValueError, vsm.filter_terms_by_df,
-                          self.doc_term_matrix, self.id_to_term,
+                          self.doc_term_matrix, self.vectorizer.vocabulary,
                           max_df=1.0, min_df=6, max_n_terms=None)

     def test_filter_terms_by_ic_identity(self):
-        dtm, i2t = vsm.filter_terms_by_ic(self.doc_term_matrix, self.id_to_term,
-                                          min_ic=0.0, max_n_terms=None)
+        dtm, vocab = vsm.filter_terms_by_ic(self.doc_term_matrix, self.vectorizer.vocabulary,
+                                            min_ic=0.0, max_n_terms=None)
         self.assertEqual(dtm.shape, self.doc_term_matrix.shape)
-        self.assertEqual(i2t, self.id_to_term)
+        self.assertEqual(vocab, self.vectorizer.vocabulary)

     def test_filter_terms_by_ic_max_n_terms(self):
-        dtm, i2t = vsm.filter_terms_by_ic(self.doc_term_matrix, self.id_to_term,
-                                          min_ic=0.0, max_n_terms=3)
+        dtm, vocab = vsm.filter_terms_by_ic(self.doc_term_matrix, self.vectorizer.vocabulary,
+                                            min_ic=0.0, max_n_terms=3)
         self.assertEqual(dtm.shape, (8, 3))
-        self.assertEqual(len(i2t), 3)
+        self.assertEqual(len(vocab), 3)
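Two patterns recur in the updated tests above: term indices are now looked up through `vectorizer.vocabulary` (term string -> column index) rather than by inverting the old standalone `id_to_term` dict, and the `filter_terms_by_df` / `filter_terms_by_ic` helpers now receive that same vocabulary mapping. A minimal sketch, assuming a fitted `vectorizer` and `doc_term_matrix` built as in the setUp above:

    # Minimal sketch of the new lookup/filtering pattern; assumes `vectorizer`
    # and `doc_term_matrix` were built as in the setUp above.
    from textacy import vsm

    # vocabulary maps term string -> column index, so the list comprehensions in
    # setUp amount to a direct dict lookup:
    idx_lamb = vectorizer.vocabulary['lamb']

    # The filtering helpers take the vocabulary mapping (where they previously
    # took the id_to_term dict) and return a filtered (matrix, vocabulary) pair.
    dtm, vocab = vsm.filter_terms_by_df(
        doc_term_matrix, vectorizer.vocabulary,
        max_df=1.0, min_df=2, max_n_terms=None)
    print(sorted(vocab.keys()))
    # -> ['child', 'lamb', 'love', 'mary', 'school', 'teacher'] on the test corpus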
27 changes: 8 additions & 19 deletions textacy/tm/topic_model.py
@@ -18,25 +18,14 @@ class TopicModel(object):
     implementations of LSA, LDA, and NMF models. Inspect and visualize results.
     Save and load trained models to and from disk.

-    Stream a corpus with metadata from disk::
+    Prepare a vectorized corpus (i.e. document-term matrix) and corresponding
+    vocabulary (i.e. mapping of term strings to column indices in the matrix).
+    See :class:`textacy.vsm.Vectorizer` for details. In short::

-        >>> cw = textacy.datasets.CapitolWords()
-        >>> text_stream, metadata_stream = textacy.fileio.split_record_fields(
-        ...     cw.records(limit=1000), 'text', itemwise=False)
-        >>> corpus = textacy.Corpus('en', texts=text_stream, metadatas=metadata_stream)
-        >>> corpus
-        Corpus(1000 docs; 537742 tokens)
-
-    Tokenize and vectorize the corpus::
-
-        >>> terms_lists = (doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
-        ...                for doc in corpus)
-        >>> doc_term_matrix, id2term = textacy.vsm.doc_term_matrix(
-        ...     terms_lists, weighting='tfidf', normalize=True, smooth_idf=True,
+        >>> vectorizer = Vectorizer(
+        ...     weighting='tfidf', normalize=True, smooth_idf=True,
         ...     min_df=3, max_df=0.95, max_n_terms=100000)
-        >>> doc_term_matrix
-        <1000x5579 sparse matrix of type '<class 'numpy.float64'>'
-         with 105632 stored elements in Compressed Sparse Row format>
+        >>> doc_term_matrix = vectorizer.fit_transform(terms_list)

     Initialize and train a topic model::
@@ -48,7 +37,7 @@
     Transform the corpus and interpret our model::

         >>> doc_topic_matrix = model.transform(doc_term_matrix)
-        >>> for topic_idx, top_terms in model.top_topic_terms(id2term, topics=[0,1]):
+        >>> for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, topics=[0,1]):
         ...     print('topic', topic_idx, ':', ' '.join(top_terms))
         topic 0 : people american go year work think $ today money america
         topic 1 : rescind quorum order unanimous consent ask president mr. madam absence
@@ -84,7 +73,7 @@ class TopicModel(object):
     Visualize the model::

-        >>> model.termite_plot(doc_term_matrix, id2term,
+        >>> model.termite_plot(doc_term_matrix, vectorizer.id_to_term,
         ...     topics=-1, n_terms=25, sort_terms_by='seriation')

     Persist our topic model to disk::
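Taken together, the docstring changes describe roughly this end-to-end workflow after the PR. The sketch below is assembled from the updated docstring and the tests, with a placeholder corpus; it is not copied verbatim from the library, and the weighting/filtering values simply mirror the docstring example:

    # Rough end-to-end sketch of the post-PR workflow, assembled from the updated
    # docstring and tests above.
    import textacy
    from textacy.vsm import Vectorizer
    from textacy.tm import TopicModel

    # Placeholder: substitute a real document collection large enough for min_df=3.
    my_documents = ["..."]
    corpus = textacy.Corpus('en_core_web_sm', texts=my_documents)
    terms_lists = (doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
                   for doc in corpus)

    # Vectorize: weighting and vocabulary-filtering options are set up front.
    vectorizer = Vectorizer(
        weighting='tfidf', normalize=True, smooth_idf=True,
        min_df=3, max_df=0.95, max_n_terms=100000)
    doc_term_matrix = vectorizer.fit_transform(terms_lists)

    # Train and interpret a topic model, passing vectorizer.id_to_term wherever
    # the pre-PR code took the standalone id2term dict.
    model = TopicModel('nmf', n_topics=10)
    model.fit(doc_term_matrix)
    doc_topic_matrix = model.transform(doc_term_matrix)
    for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, topics=[0, 1]):
        print('topic', topic_idx, ':', ' '.join(top_terms))
    model.termite_plot(doc_term_matrix, vectorizer.id_to_term,
                       topics=-1, n_terms=25, sort_terms_by='seriation')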