Merge pull request #113 from chartbeat-labs/vectorizer
Add `Vectorizer` class to replace `vsm.doc_term_matrix` function
bdewilde authored Jun 21, 2017
2 parents 5b91cf7 + f3daac6 commit 9616c23
Showing 4 changed files with 418 additions and 145 deletions.
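The diffs below migrate callers from the module-level `vsm.doc_term_matrix()` function to the new `Vectorizer` class; the module that actually defines `Vectorizer` (textacy.vsm) is not expanded in this view, so the before/after sketch here is pieced together from the test changes and should be read as illustrative rather than as the definitive API:

    # Hypothetical before/after sketch of the migration, based on the test updates
    # in this commit; parameter names are taken from the tests, not from the
    # (unexpanded) textacy.vsm module diff.
    from textacy import Corpus
    from textacy.vsm import Vectorizer  # previously: from textacy.vsm import doc_term_matrix

    texts = ["Mary had a little lamb. Its fleece was white as snow.",
             "Mary loves the lamb, you know, the teacher did reply."]
    corpus = Corpus('en_core_web_sm', texts=texts)
    term_lists = [doc.to_terms_list(ngrams=1, named_entities=False, as_strings=True)
                  for doc in corpus]

    # Old API (removed by this PR): a single function call returned both the
    # document-term matrix and the id-to-term mapping.
    #   doc_term_matrix, id2term = doc_term_matrix(
    #       term_lists,
    #       weighting='tf', normalize=False, sublinear_tf=False, smooth_idf=True,
    #       min_df=1, max_df=1.0, min_ic=0.0, max_n_terms=None)

    # New API: configuration lives on a Vectorizer instance; fit_transform()
    # returns the document-term matrix, and the learned vocabulary mappings are
    # attributes of the fitted instance.
    vectorizer = Vectorizer(
        weighting='tf', normalize=False, sublinear_tf=False, smooth_idf=True,
        min_df=1, max_df=1.0, min_ic=0.0, max_n_terms=None)
    doc_term_matrix = vectorizer.fit_transform(term_lists)
    print(vectorizer.vocabulary)     # term string -> column index
    print(vectorizer.id_to_term)     # column index -> term string
    print(vectorizer.feature_names)  # list of terms, per test_vectorizer_feature_names below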
22 changes: 11 additions & 11 deletions tests/test_topic_model.py
@@ -9,7 +9,7 @@
 import numpy as np
 from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

-from textacy.vsm import doc_term_matrix
+from textacy.vsm import Vectorizer
 from textacy import Corpus
 from textacy.tm import TopicModel

@@ -25,13 +25,13 @@ def setUp(self):
                  "It waited patiently about until Mary did appear.",
                  "Why does the lamb love Mary so? The eager children cry.",
                  "Mary loves the lamb, you know, the teacher did reply."]
-        textcorpus = Corpus('en_core_web_sm', texts=texts)
+        corpus = Corpus('en_core_web_sm', texts=texts)
         term_lists = [doc.to_terms_list(ngrams=1, named_entities=False, as_strings=True)
-                      for doc in textcorpus]
-        self.doc_term_matrix, self.id2term = doc_term_matrix(
-            term_lists,
+                      for doc in corpus]
+        self.vectorizer = Vectorizer(
             weighting='tf', normalize=False, sublinear_tf=False, smooth_idf=True,
             min_df=1, max_df=1.0, min_ic=0.0, max_n_terms=None)
+        self.doc_term_matrix = self.vectorizer.fit_transform(term_lists)
         self.model = TopicModel('nmf', n_topics=5)
         self.model.fit(self.doc_term_matrix)
         self.tempdir = tempfile.mkdtemp(
@@ -76,25 +76,25 @@ def test_get_doc_topic_matrix_nonnormalized(self):

     def test_top_topic_terms_topics(self):
         self.assertEqual(
-            len(list(self.model.top_topic_terms(self.id2term, topics=-1))),
+            len(list(self.model.top_topic_terms(self.vectorizer.id_to_term, topics=-1))),
             self.model.n_topics)
         self.assertEqual(
-            len(list(self.model.top_topic_terms(self.id2term, topics=0))), 1)
+            len(list(self.model.top_topic_terms(self.vectorizer.id_to_term, topics=0))), 1)
         self.assertEqual(
            [topic_idx for topic_idx, _
-            in self.model.top_topic_terms(self.id2term, topics=(1, 2, 3))],
+            in self.model.top_topic_terms(self.vectorizer.id_to_term, topics=(1, 2, 3))],
            [1, 2, 3])

     def test_top_topic_terms_top_n(self):
         self.assertEqual(
-            len(list(self.model.top_topic_terms(self.id2term, topics=0, top_n=10))[0][1]),
+            len(list(self.model.top_topic_terms(self.vectorizer.id_to_term, topics=0, top_n=10))[0][1]),
             10)
         self.assertEqual(
-            len(list(self.model.top_topic_terms(self.id2term, topics=0, top_n=5))[0][1]),
+            len(list(self.model.top_topic_terms(self.vectorizer.id_to_term, topics=0, top_n=5))[0][1]),
             5)

     def test_top_topic_terms_weights(self):
-        observed = list(self.model.top_topic_terms(self.id2term, topics=-1,
+        observed = list(self.model.top_topic_terms(self.vectorizer.id_to_term, topics=-1,
                         top_n=10, weights=True))
         self.assertTrue(isinstance(observed[0][1][0], tuple))
         for topic_idx, term_weights in observed:
62 changes: 42 additions & 20 deletions tests/test_vsm.py
@@ -23,12 +23,34 @@ def setUp(self):
         corpus = Corpus('en_core_web_sm', texts=texts)
         term_lists = [doc.to_terms_list(ngrams=1, named_entities=False, as_strings=True)
                       for doc in corpus]
-        self.doc_term_matrix, self.id_to_term = vsm.doc_term_matrix(
-            term_lists,
+        self.vectorizer = vsm.Vectorizer(
             weighting='tf', normalize=False, sublinear_tf=False, smooth_idf=True,
             min_df=1, max_df=1.0, min_ic=0.0, max_n_terms=None)
-        self.idx_lamb = [k for k, v in self.id_to_term.items() if v == 'lamb'][0]
-        self.idx_child = [k for k, v in self.id_to_term.items() if v == 'child'][0]
+        self.doc_term_matrix = self.vectorizer.fit_transform(term_lists)
+        self.idx_lamb = [
+            id_ for term, id_ in self.vectorizer.vocabulary.items() if term == 'lamb'][0]
+        self.idx_child = [
+            id_ for term, id_ in self.vectorizer.vocabulary.items() if term == 'child'][0]

+    def test_vectorizer_feature_names(self):
+        expected = [
+            'mary', 'little', 'lamb', 'fleece', 'white', 'snow', 'go', 'sure',
+            'follow', 'school', 'day', 'rule', 'child', 'laugh', 'play', 'teacher',
+            'turn', 'linger', 'near', 'wait', 'patiently', 'appear', 'love',
+            'eager', 'cry', 'know', 'reply']
+        self.assertEqual(self.vectorizer.feature_names, expected)
+
+    def test_vectorizer_bad_init_params(self):
+        bad_init_params = (
+            {'min_df': -1},
+            {'max_df': -1},
+            {'max_n_terms': -1},
+            {'min_ic': -1.0},
+            {'vocabulary': 'foo bar bat baz'},
+        )
+        for bad_init_param in bad_init_params:
+            with self.assertRaises(ValueError):
+                vsm.Vectorizer(**bad_init_param)
+
     def test_get_term_freqs(self):
         term_freqs = vsm.get_term_freqs(self.doc_term_matrix, normalized=False)
@@ -79,38 +101,38 @@ def test_get_information_content(self):
         self.assertAlmostEqual(ics[self.idx_child], 0.81127, places=4)

     def test_filter_terms_by_df_identity(self):
-        dtm, i2t = vsm.filter_terms_by_df(self.doc_term_matrix, self.id_to_term,
-                                          max_df=1.0, min_df=1, max_n_terms=None)
+        dtm, vocab = vsm.filter_terms_by_df(self.doc_term_matrix, self.vectorizer.vocabulary,
+                                            max_df=1.0, min_df=1, max_n_terms=None)
         self.assertEqual(dtm.shape, self.doc_term_matrix.shape)
-        self.assertEqual(i2t, self.id_to_term)
+        self.assertEqual(vocab, self.vectorizer.vocabulary)

     def test_filter_terms_by_df_max_n_terms(self):
-        dtm, i2t = vsm.filter_terms_by_df(self.doc_term_matrix, self.id_to_term,
-                                          max_df=1.0, min_df=1, max_n_terms=2)
+        dtm, vocab = vsm.filter_terms_by_df(self.doc_term_matrix, self.vectorizer.vocabulary,
+                                            max_df=1.0, min_df=1, max_n_terms=2)
         self.assertEqual(dtm.shape, (8, 2))
-        self.assertEqual(sorted(i2t.values()), ['lamb', 'mary'])
+        self.assertEqual(sorted(vocab.keys()), ['lamb', 'mary'])

     def test_filter_terms_by_df_min_df(self):
-        dtm, i2t = vsm.filter_terms_by_df(self.doc_term_matrix, self.id_to_term,
-                                          max_df=1.0, min_df=2, max_n_terms=None)
+        dtm, vocab = vsm.filter_terms_by_df(self.doc_term_matrix, self.vectorizer.vocabulary,
+                                            max_df=1.0, min_df=2, max_n_terms=None)
         self.assertEqual(dtm.shape, (8, 6))
         self.assertEqual(
-            sorted(i2t.values()),
+            sorted(vocab.keys()),
            ['child', 'lamb', 'love', 'mary', 'school', 'teacher'])

     def test_filter_terms_by_df_exception(self):
         self.assertRaises(ValueError, vsm.filter_terms_by_df,
-                          self.doc_term_matrix, self.id_to_term,
+                          self.doc_term_matrix, self.vectorizer.vocabulary,
                           max_df=1.0, min_df=6, max_n_terms=None)

     def test_filter_terms_by_ic_identity(self):
-        dtm, i2t = vsm.filter_terms_by_ic(self.doc_term_matrix, self.id_to_term,
-                                          min_ic=0.0, max_n_terms=None)
+        dtm, vocab = vsm.filter_terms_by_ic(self.doc_term_matrix, self.vectorizer.vocabulary,
+                                            min_ic=0.0, max_n_terms=None)
         self.assertEqual(dtm.shape, self.doc_term_matrix.shape)
-        self.assertEqual(i2t, self.id_to_term)
+        self.assertEqual(vocab, self.vectorizer.vocabulary)

     def test_filter_terms_by_ic_max_n_terms(self):
-        dtm, i2t = vsm.filter_terms_by_ic(self.doc_term_matrix, self.id_to_term,
-                                          min_ic=0.0, max_n_terms=3)
+        dtm, vocab = vsm.filter_terms_by_ic(self.doc_term_matrix, self.vectorizer.vocabulary,
+                                            min_ic=0.0, max_n_terms=3)
         self.assertEqual(dtm.shape, (8, 3))
-        self.assertEqual(len(i2t), 3)
+        self.assertEqual(len(vocab), 3)
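Two patterns recur in the updated tests above: term indices are now looked up through `vectorizer.vocabulary` (term string -> column index) rather than by inverting the old standalone `id_to_term` dict, and the `filter_terms_by_df` / `filter_terms_by_ic` helpers now receive that same vocabulary mapping. A minimal sketch, assuming a fitted `vectorizer` and `doc_term_matrix` built as in the setUp above:

    # Minimal sketch of the new lookup/filtering pattern; assumes `vectorizer`
    # and `doc_term_matrix` were built as in the setUp above.
    from textacy import vsm

    # vocabulary maps term string -> column index, so the list comprehensions in
    # setUp amount to a direct dict lookup:
    idx_lamb = vectorizer.vocabulary['lamb']

    # The filtering helpers take the vocabulary mapping (where they previously
    # took the id_to_term dict) and return a filtered (matrix, vocabulary) pair.
    dtm, vocab = vsm.filter_terms_by_df(
        doc_term_matrix, vectorizer.vocabulary,
        max_df=1.0, min_df=2, max_n_terms=None)
    print(sorted(vocab.keys()))
    # -> ['child', 'lamb', 'love', 'mary', 'school', 'teacher'] on the test corpus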
27 changes: 8 additions & 19 deletions textacy/tm/topic_model.py
@@ -18,25 +18,14 @@ class TopicModel(object):
     implementations of LSA, LDA, and NMF models. Inspect and visualize results.
     Save and load trained models to and from disk.

-    Stream a corpus with metadata from disk::
+    Prepare a vectorized corpus (i.e. document-term matrix) and corresponding
+    vocabulary (i.e. mapping of term strings to column indices in the matrix).
+    See :class:`textacy.vsm.Vectorizer` for details. In short::

-        >>> cw = textacy.datasets.CapitolWords()
-        >>> text_stream, metadata_stream = textacy.fileio.split_record_fields(
-        ...     cw.records(limit=1000), 'text', itemwise=False)
-        >>> corpus = textacy.Corpus('en', texts=text_stream, metadatas=metadata_stream)
-        >>> corpus
-        Corpus(1000 docs; 537742 tokens)
-
-    Tokenize and vectorize the corpus::
-
-        >>> terms_lists = (doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
-        ...                for doc in corpus)
-        >>> doc_term_matrix, id2term = textacy.vsm.doc_term_matrix(
-        ...     terms_lists, weighting='tfidf', normalize=True, smooth_idf=True,
+        >>> vectorizer = Vectorizer(
+        ...     weighting='tfidf', normalize=True, smooth_idf=True,
         ...     min_df=3, max_df=0.95, max_n_terms=100000)
-        >>> doc_term_matrix
-        <1000x5579 sparse matrix of type '<class 'numpy.float64'>'
-         with 105632 stored elements in Compressed Sparse Row format>
+        >>> doc_term_matrix = vectorizer.fit_transform(terms_list)

     Initialize and train a topic model::
@@ -48,7 +37,7 @@
     Transform the corpus and interpret our model::

         >>> doc_topic_matrix = model.transform(doc_term_matrix)
-        >>> for topic_idx, top_terms in model.top_topic_terms(id2term, topics=[0,1]):
+        >>> for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, topics=[0,1]):
         ...     print('topic', topic_idx, ':', ' '.join(top_terms))
         topic 0 : people american go year work think $ today money america
         topic 1 : rescind quorum order unanimous consent ask president mr. madam absence
@@ -84,7 +73,7 @@ class TopicModel(object):
     Visualize the model::

-        >>> model.termite_plot(doc_term_matrix, id2term,
+        >>> model.termite_plot(doc_term_matrix, vectorizer.id_to_term,
         ...     topics=-1, n_terms=25, sort_terms_by='seriation')

     Persist our topic model to disk::
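Taken together, the docstring changes describe roughly this end-to-end workflow after the PR. The sketch below is assembled from the updated docstring and the tests, with a placeholder corpus; it is not copied verbatim from the library, and the weighting/filtering values simply mirror the docstring example:

    # Rough end-to-end sketch of the post-PR workflow, assembled from the updated
    # docstring and tests above.
    import textacy
    from textacy.vsm import Vectorizer
    from textacy.tm import TopicModel

    # Placeholder: substitute a real document collection large enough for min_df=3.
    my_documents = ["..."]
    corpus = textacy.Corpus('en_core_web_sm', texts=my_documents)
    terms_lists = (doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
                   for doc in corpus)

    # Vectorize: weighting and vocabulary-filtering options are set up front.
    vectorizer = Vectorizer(
        weighting='tfidf', normalize=True, smooth_idf=True,
        min_df=3, max_df=0.95, max_n_terms=100000)
    doc_term_matrix = vectorizer.fit_transform(terms_lists)

    # Train and interpret a topic model, passing vectorizer.id_to_term wherever
    # the pre-PR code took the standalone id2term dict.
    model = TopicModel('nmf', n_topics=10)
    model.fit(doc_term_matrix)
    doc_topic_matrix = model.transform(doc_term_matrix)
    for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, topics=[0, 1]):
        print('topic', topic_idx, ':', ' '.join(top_terms))
    model.termite_plot(doc_term_matrix, vectorizer.id_to_term,
                       topics=-1, n_terms=25, sort_terms_by='seriation')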