rebuild in nbdev framework cemoody#103 (comment)
JiaxiangBU committed Jul 12, 2020
1 parent b7f4642 commit 0e6abb2
Showing 16 changed files with 254 additions and 56 deletions.
9 changes: 9 additions & 0 deletions .gitignore
@@ -13,3 +13,12 @@ build
*.egg-info/
dist/
htmlcov/
.Rproj.user
lda2vec.Rproj
Makefile
code
commit.Rmd
tmp
lda2vec/*.md
*.ipynb
.Rhistory
Empty file added README.md
Empty file.
16 changes: 16 additions & 0 deletions docs/.gitignore
@@ -0,0 +1,16 @@
_site/
Gemfile
Gemfile.lock
_config.yml
_data
_includes
_layouts
css
feed.xml
fonts
images
js
licenses
sidebar.json
sitemap.xml
tooltips.json
1 change: 1 addition & 0 deletions lda2vec/__init__.py
@@ -1,3 +1,4 @@
__version__ = "0.1.0"
import dirichlet_likelihood
import embed_mixture
import tracking
44 changes: 44 additions & 0 deletions lda2vec/_nbdev.py
@@ -0,0 +1,44 @@
# AUTOGENERATED BY NBDEV! DO NOT EDIT!

__all__ = ["index", "modules", "custom_doc_links", "git_url"]

index = {"Corpus": "corpus.ipynb",
"fast_replace": "corpus.ipynb",
"dirichlet_likelihood": "dirichlet_likelihood.ipynb",
"EmbedMixture": "embed_mixture.ipynb",
"orthogonal_matrix": "fake_data.ipynb",
"softmax": "fake_data.ipynb",
"sample": "fake_data.ipynb",
"fake_data": "fake_data.ipynb",
"NegativeSamplingFunction": "negative_sampling.ipynb",
"negative_sampling": "negative_sampling.ipynb",
"negative_sampling.patched": "negative_sampling.ipynb",
"L.NegativeSampling.negative_sampling": "negative_sampling.ipynb",
"F.negative_sampling": "negative_sampling.ipynb",
"tokenize": "preprocess.ipynb",
"prob_words": "topics.ipynb",
"prepare_topics": "topics.ipynb",
"print_top_words_per_topic": "topics.ipynb",
"get_request": "topics.ipynb",
"topic_coherence": "topics.ipynb",
"Tracking": "tracking.ipynb",
"move": "utils.ipynb",
"most_similar": "utils.ipynb",
"chunks": "utils.ipynb",
"MovingAverage": "utils.ipynb"}

modules = ["corpus.py",
"dirichlet_likelihood.py",
"embed_mixture.py",
"fake_data.py",
"negative_sampling.py",
"preprocess.py",
"topics.py",
"tracking.py",
"utils.py"]

doc_url = "https://JiaxiangBU.github.io/lda2vec/"

git_url = "https://github.com/JiaxiangBU/lda2vec/tree/master/"

def custom_doc_links(name): return None
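This `_nbdev.py` index is what nbdev regenerates from the notebooks: it maps every exported symbol to the notebook it was defined in, and `custom_doc_links` returning None tells nbdev to fall back to its default link resolution against `doc_url`. As a rough illustration of how such a docs link can be derived (the `doc_link` helper below is hypothetical, not an nbdev API, and the one-page-per-notebook URL scheme is an assumption):

    # hypothetical helper built on the index/doc_url values generated above
    index = {"Corpus": "corpus.ipynb", "fast_replace": "corpus.ipynb"}
    doc_url = "https://JiaxiangBU.github.io/lda2vec/"

    def doc_link(name):
        nb = index.get(name)
        # assume each source notebook becomes one docs page named after it
        return None if nb is None else doc_url + nb.replace(".ipynb", "")

    print(doc_link("Corpus"))  # e.g. https://JiaxiangBU.github.io/lda2vec/corpus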
14 changes: 10 additions & 4 deletions lda2vec/corpus.py
@@ -1,14 +1,20 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: corpus.ipynb (unless otherwise specified).

__all__ = ['Corpus', 'fast_replace']

# Cell
from collections import defaultdict
import numpy as np
import difflib
import pandas as pd

# Cell
try:
from pyxdameraulevenshtein import damerau_levenshtein_distance_withNPArray
except ImportError:
pass


# Cell
class Corpus():
_keys_frequency = None

@@ -576,7 +582,7 @@ def compact_word_vectors(self, vocab, filename=None, array=None,
choice = np.array(keys_raw)[idx][np.argmin(d)]
# choice = difflib.get_close_matches(word, choices)[0]
vector = model[choice]
print compact, word, ' --> ', choice
print(compact, word, ' --> ', choice)
except IndexError:
pass
if vector is None:
@@ -702,7 +708,7 @@ def compact_to_coocurrence(self, word_compact, indices, window_size=10):
.rename(columns=dict(frame='counts')))
return counts


# Cell
def fast_replace(data, keys, values, skip_checks=False):
""" Do a search-and-replace in array `data`.
@@ -730,4 +736,4 @@ def fast_replace(data, keys, values, skip_checks=False):
keys, values = keys[sdx], values[sdx]
idx = np.digitize(data, keys, right=True)
new_data = values[idx]
return new_data
return new_data
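The `fast_replace` helper at the end of this file vectorises a dictionary-style lookup with `np.digitize`. A minimal standalone sketch of the same idea (assuming every value in `data` appears in `keys`):

    import numpy as np

    data   = np.array([3, 1, 2, 1, 3])
    keys   = np.array([1, 2, 3])
    values = np.array([10, 20, 30])

    # sort keys/values together, then digitize maps each data value to its key's slot
    sdx = np.argsort(keys)
    keys, values = keys[sdx], values[sdx]
    idx = np.digitize(data, keys, right=True)
    print(values[idx])  # [30 10 20 10 30]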
9 changes: 7 additions & 2 deletions lda2vec/dirichlet_likelihood.py
@@ -1,7 +1,12 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: dirichlet_likelihood.ipynb (unless otherwise specified).

__all__ = ['dirichlet_likelihood']

# Cell
import chainer.functions as F
from chainer import Variable


# Cell
def dirichlet_likelihood(weights, alpha=None):
""" Calculate the log likelihood of the observed topic proportions.
A negative likelihood is more likely than a negative likelihood.
@@ -31,4 +36,4 @@ def dirichlet_likelihood(weights, alpha=None):
else:
log_proportions = F.log_softmax(weights.W)
loss = (alpha - 1.0) * log_proportions
return -F.sum(loss)
return -F.sum(loss)
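Up to an additive constant, the quantity computed in this module is the log density of a symmetric Dirichlet prior over the per-document topic proportions theta (the row-wise softmax of the weights), returned negated so it can be used as a loss term:

    \log p(\theta_d \mid \alpha) = \sum_k (\alpha - 1)\,\log \theta_{dk} + \text{const},
    \qquad
    \text{loss} = -\sum_d \sum_k (\alpha - 1)\,\log \theta_{dk}

Here alpha is the scalar concentration parameter accepted by `dirichlet_likelihood`.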
12 changes: 9 additions & 3 deletions lda2vec/embed_mixture.py
@@ -1,11 +1,17 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: embed_mixture.ipynb (unless otherwise specified).

__all__ = ['EmbedMixture']

# Cell
import numpy as np

# Cell
import chainer
import chainer.links as L
import chainer.functions as F
from chainer import Variable


# Cell
def _orthogonal_matrix(shape):
# Stolen from blocks:
# github.com/mila-udem/blocks/blob/master/blocks/initialization.py
@@ -22,7 +28,7 @@ def _orthogonal_matrix(shape):
n_min = min(shape[0], shape[1])
return np.dot(Q1[:, :n_min], Q2[:n_min, :])


# Cell
class EmbedMixture(chainer.Chain):
r""" A single document is encoded as a multinomial mixture of latent topics.
The mixture is defined on simplex, so that mixture weights always sum
@@ -114,4 +120,4 @@ def proportions(self, doc_ids, softmax=False):
norm, y = F.broadcast(F.expand_dims(F.sum(y, axis=1), 1), y)
return y / (norm + 1e-7)
else:
return w
return w
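In formula form, the mixture the docstring describes gives each document j unnormalised topic weights, softmaxes them into proportions on the simplex, and averages the topic factor vectors with those proportions (a sketch of the intent, not of every branch of `proportions`):

    p_{jk} = \mathrm{softmax}(w_j)_k, \qquad d_j = \sum_k p_{jk}\, f_k

where w_j are the document's topic weights and f_k are the latent topic factor vectors.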
15 changes: 10 additions & 5 deletions lda2vec/fake_data.py
@@ -1,7 +1,12 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: fake_data.ipynb (unless otherwise specified).

__all__ = ['orthogonal_matrix', 'softmax', 'sample', 'fake_data']

# Cell
import numpy as np
from numpy.random import random_sample


# Cell
def orthogonal_matrix(shape):
# Stolen from blocks:
# github.com/mila-udem/blocks/blob/master/blocks/initialization.py
@@ -18,7 +23,7 @@ def orthogonal_matrix(shape):
n_min = min(shape[0], shape[1])
return np.dot(Q1[:, :n_min], Q2[:n_min, :])


# Cell
def softmax(w):
# https://gist.github.com/stober/1946926
w = np.array(w)
@@ -28,13 +33,13 @@ def softmax(w):
dist = e / np.sum(e, axis=1)[:, None]
return dist


# Cell
def sample(values, probabilities, size):
assert np.allclose(np.sum(probabilities, axis=-1), 1.0)
bins = np.add.accumulate(probabilities)
return values[np.digitize(random_sample(size), bins)]


# Cell
def fake_data(n_docs, n_words, n_sent_length, n_topics):
""" Generate latent topic vectors for words and documents
and then for each document, draw a sentence. Draw each word
@@ -70,4 +75,4 @@ def fake_data(n_docs, n_words, n_sent_length, n_topics):
words = sample(indices, doc_to_wrd, n_sent_length)
sentences.append(words)
sentences = np.array(sentences)
return sentences.astype('int32')
return sentences.astype('int32')
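A usage sketch for the generator above (the argument values are arbitrary, and the expected shape follows the docstring, so treat it as an assumption; running it requires the package to import in its chainer-era environment):

    from lda2vec.fake_data import fake_data

    # 100 documents, a 50-word vocabulary, 8 tokens per sentence, 5 latent topics
    sentences = fake_data(n_docs=100, n_words=50, n_sent_length=8, n_topics=5)
    print(sentences.shape, sentences.dtype)  # expected: (100, 8) int32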
14 changes: 10 additions & 4 deletions lda2vec/negative_sampling.py
@@ -1,11 +1,17 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: negative_sampling.ipynb (unless otherwise specified).

__all__ = ['NegativeSamplingFunction', 'negative_sampling']

# Cell
import numpy
import six

# Cell
from chainer import cuda
from chainer import function
from chainer.utils import type_check


# Cell
class NegativeSamplingFunction(function.Function):

ignore_label = -1
@@ -170,7 +176,7 @@ def backward_gpu(self, inputs, grads):
self.sample_size + 1, gW)
return gx, None, gW


# Cell
def negative_sampling(x, t, W, sampler, sample_size):
"""Negative sampling loss function.
@@ -223,11 +229,11 @@ def negative_sampling(x, t, W, sampler, sample_size):
"""
return NegativeSamplingFunction(sampler, sample_size)(x, t, W)


# Cell
# Monkey-patch the chainer code to replace the negative sampling
# with the one used here
import chainer.links as L
import chainer.functions as F
negative_sampling.patched = True
L.NegativeSampling.negative_sampling = negative_sampling
F.negative_sampling = negative_sampling
F.negative_sampling = negative_sampling
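Because the monkey-patch runs at import time, merely importing this module swaps chainer's built-in negative sampling for the local implementation. A minimal check of that effect (assumes chainer and the rest of the package import cleanly):

    import chainer.functions as F
    import lda2vec.negative_sampling  # importing applies the monkey-patch

    # the patched function carries the `patched` flag set above
    assert getattr(F.negative_sampling, "patched", False)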
12 changes: 9 additions & 3 deletions lda2vec/preprocess.py
@@ -1,9 +1,15 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: preprocess.ipynb (unless otherwise specified).

__all__ = ['tokenize']

# Cell
from spacy.en import English
from spacy.attrs import LOWER, LIKE_URL, LIKE_EMAIL

# Cell
import numpy as np


# Cell
def tokenize(texts, max_length, skip=-2, attr=LOWER, merge=False, nlp=None,
**kwargs):
""" Uses spaCy to quickly tokenize text and return an array
@@ -101,7 +107,7 @@ def tokenize(texts, max_length, skip=-2, attr=LOWER, merge=False, nlp=None,
vocab[skip] = '<SKIP>'
return data, vocab


# Cell
if __name__ == "__main__":
import doctest
doctest.testmod()
doctest.testmod()
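A usage sketch for `tokenize` (this module targets the old spaCy 1.x API, hence `spacy.en.English`; the texts and the shape comments below are illustrative assumptions):

    from lda2vec.preprocess import tokenize

    texts = [u"Topic models meet word vectors.", u"Hello lda2vec."]
    data, vocab = tokenize(texts, max_length=10)
    # data: integer array of shape (len(texts), max_length)
    # vocab: dict mapping those ids back to strings, including the '<SKIP>' entry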
25 changes: 15 additions & 10 deletions lda2vec/topics.py
@@ -1,21 +1,26 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: topics.ipynb (unless otherwise specified).

__all__ = ['prob_words', 'prepare_topics', 'print_top_words_per_topic', 'get_request', 'topic_coherence']

# Cell
import numpy as np
import requests
import multiprocessing


# Cell
def _softmax(x):
e_x = np.exp(x - np.max(x))
out = e_x / e_x.sum()
return out


# Cell
def _softmax_2d(x):
y = x - x.max(axis=1, keepdims=True)
np.exp(y, out=y)
y /= y.sum(axis=1, keepdims=True)
return y


# Cell
def prob_words(context, vocab, temperature=1.0):
""" This calculates a softmax over the vocabulary as a function
of the dot product of context and word.
@@ -24,7 +29,7 @@ def prob_words(context, vocab, temperature=1.0):
prob = _softmax(dot / temperature)
return prob


# Cell
def prepare_topics(weights, factors, word_vectors, vocab, temperature=1.0,
doc_lengths=None, term_frequency=None, normalize=False):
""" Collects a dictionary of word, document and topic distributions.
@@ -85,9 +90,9 @@ def prepare_topics(weights, factors, word_vectors, vocab, temperature=1.0,
'term_frequency': term_frequency}
return data


# Cell
def print_top_words_per_topic(data, top_n=10, do_print=True):
""" Given a pyLDAvis data array, print out the top words in every topic.
""" Given a pyLDAvis data array, print(out the top words in every topic.)
Arguments
---------
@@ -103,11 +108,11 @@ def print_top_words_per_topic(data, top_n=10, do_print=True):
top_words = [data['vocab'][i].strip().replace(' ', '_') for i in top]
msg = ' '.join(top_words)
if do_print:
print prefix + msg
print(prefix + msg)
lists.append(top_words)
return lists


# Cell
def get_request(url):
for _ in range(5):
try:
@@ -116,7 +121,7 @@ def get_request(url):
pass
return None


# Cell
def topic_coherence(lists, services=['ca', 'cp', 'cv', 'npmi', 'uci',
'umass']):
""" Requests the topic coherence from AKSW Palmetto
@@ -142,4 +147,4 @@ def topic_coherence(lists, services=['ca', 'cp', 'cv', 'npmi', 'uci',
ans = {}
for ((j, s, t), tc) in zip(args, coherences):
ans[(j, s)] = tc
return ans
return ans
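For reference, `prob_words` above is a temperature-scaled softmax of the context-word dot products over the whole vocabulary:

    p(w \mid c) = \frac{\exp\!\big((c \cdot v_w)/T\big)}{\sum_{w'} \exp\!\big((c \cdot v_{w'})/T\big)}

with T the `temperature` argument and v_w the word vectors passed in as `vocab`.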