rebuild in nbdev framework cemoody#103 (comment)
JiaxiangBU committed Jul 12, 2020
1 parent b7f4642 commit 0e6abb2
Showing 16 changed files with 254 additions and 56 deletions.
9 changes: 9 additions & 0 deletions .gitignore
@@ -13,3 +13,12 @@ build
*.egg-info/
dist/
htmlcov/
.Rproj.user
lda2vec.Rproj
Makefile
code
commit.Rmd
tmp
lda2vec/*.md
*.ipynb
.Rhistory
Empty file added README.md
Empty file.
16 changes: 16 additions & 0 deletions docs/.gitignore
@@ -0,0 +1,16 @@
_site/
Gemfile
Gemfile.lock
_config.yml
_data
_includes
_layouts
css
feed.xml
fonts
images
js
licenses
sidebar.json
sitemap.xml
tooltips.json
1 change: 1 addition & 0 deletions lda2vec/__init__.py
@@ -1,3 +1,4 @@
__version__ = "0.1.0"
import dirichlet_likelihood
import embed_mixture
import tracking
44 changes: 44 additions & 0 deletions lda2vec/_nbdev.py
@@ -0,0 +1,44 @@
# AUTOGENERATED BY NBDEV! DO NOT EDIT!

__all__ = ["index", "modules", "custom_doc_links", "git_url"]

index = {"Corpus": "corpus.ipynb",
"fast_replace": "corpus.ipynb",
"dirichlet_likelihood": "dirichlet_likelihood.ipynb",
"EmbedMixture": "embed_mixture.ipynb",
"orthogonal_matrix": "fake_data.ipynb",
"softmax": "fake_data.ipynb",
"sample": "fake_data.ipynb",
"fake_data": "fake_data.ipynb",
"NegativeSamplingFunction": "negative_sampling.ipynb",
"negative_sampling": "negative_sampling.ipynb",
"negative_sampling.patched": "negative_sampling.ipynb",
"L.NegativeSampling.negative_sampling": "negative_sampling.ipynb",
"F.negative_sampling": "negative_sampling.ipynb",
"tokenize": "preprocess.ipynb",
"prob_words": "topics.ipynb",
"prepare_topics": "topics.ipynb",
"print_top_words_per_topic": "topics.ipynb",
"get_request": "topics.ipynb",
"topic_coherence": "topics.ipynb",
"Tracking": "tracking.ipynb",
"move": "utils.ipynb",
"most_similar": "utils.ipynb",
"chunks": "utils.ipynb",
"MovingAverage": "utils.ipynb"}

modules = ["corpus.py",
"dirichlet_likelihood.py",
"embed_mixture.py",
"fake_data.py",
"negative_sampling.py",
"preprocess.py",
"topics.py",
"tracking.py",
"utils.py"]

doc_url = "https://JiaxiangBU.github.io/lda2vec/"

git_url = "https://github.com/JiaxiangBU/lda2vec/tree/master/"

def custom_doc_links(name): return None
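This `_nbdev.py` index is what nbdev regenerates from the notebooks: it maps every exported symbol to the notebook it was defined in, and `custom_doc_links` returning None tells nbdev to fall back to its default link resolution against `doc_url`. As a rough illustration of how such a docs link can be derived (the `doc_link` helper below is hypothetical, not an nbdev API, and the one-page-per-notebook URL scheme is an assumption):

    # hypothetical helper built on the index/doc_url values generated above
    index = {"Corpus": "corpus.ipynb", "fast_replace": "corpus.ipynb"}
    doc_url = "https://JiaxiangBU.github.io/lda2vec/"

    def doc_link(name):
        nb = index.get(name)
        # assume each source notebook becomes one docs page named after it
        return None if nb is None else doc_url + nb.replace(".ipynb", "")

    print(doc_link("Corpus"))  # e.g. https://JiaxiangBU.github.io/lda2vec/corpus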
14 changes: 10 additions & 4 deletions lda2vec/corpus.py
@@ -1,14 +1,20 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: corpus.ipynb (unless otherwise specified).

__all__ = ['Corpus', 'fast_replace']

# Cell
from collections import defaultdict
import numpy as np
import difflib
import pandas as pd

# Cell
try:
from pyxdameraulevenshtein import damerau_levenshtein_distance_withNPArray
except ImportError:
pass


# Cell
class Corpus():
_keys_frequency = None

@@ -576,7 +582,7 @@ def compact_word_vectors(self, vocab, filename=None, array=None,
choice = np.array(keys_raw)[idx][np.argmin(d)]
# choice = difflib.get_close_matches(word, choices)[0]
vector = model[choice]
print compact, word, ' --> ', choice
print(compact, word, ' --> ', choice)
except IndexError:
pass
if vector is None:
@@ -702,7 +708,7 @@ def compact_to_coocurrence(self, word_compact, indices, window_size=10):
.rename(columns=dict(frame='counts')))
return counts


# Cell
def fast_replace(data, keys, values, skip_checks=False):
""" Do a search-and-replace in array `data`.
@@ -730,4 +736,4 @@ def fast_replace(data, keys, values, skip_checks=False):
keys, values = keys[sdx], values[sdx]
idx = np.digitize(data, keys, right=True)
new_data = values[idx]
return new_data
return new_data
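The `fast_replace` helper at the end of this file vectorises a dictionary-style lookup with `np.digitize`. A minimal standalone sketch of the same idea (assuming every value in `data` appears in `keys`):

    import numpy as np

    data   = np.array([3, 1, 2, 1, 3])
    keys   = np.array([1, 2, 3])
    values = np.array([10, 20, 30])

    # sort keys/values together, then digitize maps each data value to its key's slot
    sdx = np.argsort(keys)
    keys, values = keys[sdx], values[sdx]
    idx = np.digitize(data, keys, right=True)
    print(values[idx])  # [30 10 20 10 30]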
9 changes: 7 additions & 2 deletions lda2vec/dirichlet_likelihood.py
@@ -1,7 +1,12 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: dirichlet_likelihood.ipynb (unless otherwise specified).

__all__ = ['dirichlet_likelihood']

# Cell
import chainer.functions as F
from chainer import Variable


# Cell
def dirichlet_likelihood(weights, alpha=None):
""" Calculate the log likelihood of the observed topic proportions.
A negative likelihood is more likely than a negative likelihood.
@@ -31,4 +36,4 @@ def dirichlet_likelihood(weights, alpha=None):
else:
log_proportions = F.log_softmax(weights.W)
loss = (alpha - 1.0) * log_proportions
return -F.sum(loss)
return -F.sum(loss)
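Up to an additive constant, the quantity computed in this module is the log density of a symmetric Dirichlet prior over the per-document topic proportions theta (the row-wise softmax of the weights), returned negated so it can be used as a loss term:

    \log p(\theta_d \mid \alpha) = \sum_k (\alpha - 1)\,\log \theta_{dk} + \text{const},
    \qquad
    \text{loss} = -\sum_d \sum_k (\alpha - 1)\,\log \theta_{dk}

Here alpha is the scalar concentration parameter accepted by `dirichlet_likelihood`.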
12 changes: 9 additions & 3 deletions lda2vec/embed_mixture.py
@@ -1,11 +1,17 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: embed_mixture.ipynb (unless otherwise specified).

__all__ = ['EmbedMixture']

# Cell
import numpy as np

# Cell
import chainer
import chainer.links as L
import chainer.functions as F
from chainer import Variable


# Cell
def _orthogonal_matrix(shape):
# Stolen from blocks:
# github.com/mila-udem/blocks/blob/master/blocks/initialization.py
@@ -22,7 +28,7 @@ def _orthogonal_matrix(shape):
n_min = min(shape[0], shape[1])
return np.dot(Q1[:, :n_min], Q2[:n_min, :])


# Cell
class EmbedMixture(chainer.Chain):
r""" A single document is encoded as a multinomial mixture of latent topics.
The mixture is defined on simplex, so that mixture weights always sum
@@ -114,4 +120,4 @@ def proportions(self, doc_ids, softmax=False):
norm, y = F.broadcast(F.expand_dims(F.sum(y, axis=1), 1), y)
return y / (norm + 1e-7)
else:
return w
return w
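In formula form, the mixture the docstring describes gives each document j unnormalised topic weights, softmaxes them into proportions on the simplex, and averages the topic factor vectors with those proportions (a sketch of the intent, not of every branch of `proportions`):

    p_{jk} = \mathrm{softmax}(w_j)_k, \qquad d_j = \sum_k p_{jk}\, f_k

where w_j are the document's topic weights and f_k are the latent topic factor vectors.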
15 changes: 10 additions & 5 deletions lda2vec/fake_data.py
@@ -1,7 +1,12 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: fake_data.ipynb (unless otherwise specified).

__all__ = ['orthogonal_matrix', 'softmax', 'sample', 'fake_data']

# Cell
import numpy as np
from numpy.random import random_sample


# Cell
def orthogonal_matrix(shape):
# Stolen from blocks:
# github.com/mila-udem/blocks/blob/master/blocks/initialization.py
@@ -18,7 +23,7 @@ def orthogonal_matrix(shape):
n_min = min(shape[0], shape[1])
return np.dot(Q1[:, :n_min], Q2[:n_min, :])


# Cell
def softmax(w):
# https://gist.github.com/stober/1946926
w = np.array(w)
@@ -28,13 +33,13 @@ def softmax(w):
dist = e / np.sum(e, axis=1)[:, None]
return dist


# Cell
def sample(values, probabilities, size):
assert np.allclose(np.sum(probabilities, axis=-1), 1.0)
bins = np.add.accumulate(probabilities)
return values[np.digitize(random_sample(size), bins)]


# Cell
def fake_data(n_docs, n_words, n_sent_length, n_topics):
""" Generate latent topic vectors for words and documents
and then for each document, draw a sentence. Draw each word
@@ -70,4 +75,4 @@ def fake_data(n_docs, n_words, n_sent_length, n_topics):
words = sample(indices, doc_to_wrd, n_sent_length)
sentences.append(words)
sentences = np.array(sentences)
return sentences.astype('int32')
return sentences.astype('int32')
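A usage sketch for the generator above (the argument values are arbitrary, and the expected shape follows the docstring, so treat it as an assumption; running it requires the package to import in its chainer-era environment):

    from lda2vec.fake_data import fake_data

    # 100 documents, a 50-word vocabulary, 8 tokens per sentence, 5 latent topics
    sentences = fake_data(n_docs=100, n_words=50, n_sent_length=8, n_topics=5)
    print(sentences.shape, sentences.dtype)  # expected: (100, 8) int32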
14 changes: 10 additions & 4 deletions lda2vec/negative_sampling.py
@@ -1,11 +1,17 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: negative_sampling.ipynb (unless otherwise specified).

__all__ = ['NegativeSamplingFunction', 'negative_sampling']

# Cell
import numpy
import six

# Cell
from chainer import cuda
from chainer import function
from chainer.utils import type_check


# Cell
class NegativeSamplingFunction(function.Function):

ignore_label = -1
@@ -170,7 +176,7 @@ def backward_gpu(self, inputs, grads):
self.sample_size + 1, gW)
return gx, None, gW


# Cell
def negative_sampling(x, t, W, sampler, sample_size):
"""Negative sampling loss function.
@@ -223,11 +229,11 @@ def negative_sampling(x, t, W, sampler, sample_size):
"""
return NegativeSamplingFunction(sampler, sample_size)(x, t, W)


# Cell
# Monkey-patch the chainer code to replace the negative sampling
# with the one used here
import chainer.links as L
import chainer.functions as F
negative_sampling.patched = True
L.NegativeSampling.negative_sampling = negative_sampling
F.negative_sampling = negative_sampling
F.negative_sampling = negative_sampling
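Because the monkey-patch runs at import time, merely importing this module swaps chainer's built-in negative sampling for the local implementation. A minimal check of that effect (assumes chainer and the rest of the package import cleanly):

    import chainer.functions as F
    import lda2vec.negative_sampling  # importing applies the monkey-patch

    # the patched function carries the `patched` flag set above
    assert getattr(F.negative_sampling, "patched", False)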
12 changes: 9 additions & 3 deletions lda2vec/preprocess.py
@@ -1,9 +1,15 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: preprocess.ipynb (unless otherwise specified).

__all__ = ['tokenize']

# Cell
from spacy.en import English
from spacy.attrs import LOWER, LIKE_URL, LIKE_EMAIL

# Cell
import numpy as np


# Cell
def tokenize(texts, max_length, skip=-2, attr=LOWER, merge=False, nlp=None,
**kwargs):
""" Uses spaCy to quickly tokenize text and return an array
@@ -101,7 +107,7 @@ def tokenize(texts, max_length, skip=-2, attr=LOWER, merge=False, nlp=None,
vocab[skip] = '<SKIP>'
return data, vocab


# Cell
if __name__ == "__main__":
import doctest
doctest.testmod()
doctest.testmod()
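A usage sketch for `tokenize` (this module targets the old spaCy 1.x API, hence `spacy.en.English`; the texts and the shape comments below are illustrative assumptions):

    from lda2vec.preprocess import tokenize

    texts = [u"Topic models meet word vectors.", u"Hello lda2vec."]
    data, vocab = tokenize(texts, max_length=10)
    # data: integer array of shape (len(texts), max_length)
    # vocab: dict mapping those ids back to strings, including the '<SKIP>' entry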
25 changes: 15 additions & 10 deletions lda2vec/topics.py
@@ -1,21 +1,26 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: topics.ipynb (unless otherwise specified).

__all__ = ['prob_words', 'prepare_topics', 'print_top_words_per_topic', 'get_request', 'topic_coherence']

# Cell
import numpy as np
import requests
import multiprocessing


# Cell
def _softmax(x):
e_x = np.exp(x - np.max(x))
out = e_x / e_x.sum()
return out


# Cell
def _softmax_2d(x):
y = x - x.max(axis=1, keepdims=True)
np.exp(y, out=y)
y /= y.sum(axis=1, keepdims=True)
return y


# Cell
def prob_words(context, vocab, temperature=1.0):
""" This calculates a softmax over the vocabulary as a function
of the dot product of context and word.
@@ -24,7 +29,7 @@ def prob_words(context, vocab, temperature=1.0):
prob = _softmax(dot / temperature)
return prob


# Cell
def prepare_topics(weights, factors, word_vectors, vocab, temperature=1.0,
doc_lengths=None, term_frequency=None, normalize=False):
""" Collects a dictionary of word, document and topic distributions.
@@ -85,9 +90,9 @@ def prepare_topics(weights, factors, word_vectors, vocab, temperature=1.0,
'term_frequency': term_frequency}
return data


# Cell
def print_top_words_per_topic(data, top_n=10, do_print=True):
""" Given a pyLDAvis data array, print out the top words in every topic.
""" Given a pyLDAvis data array, print(out the top words in every topic.)
Arguments
---------
@@ -103,11 +108,11 @@ def print_top_words_per_topic(data, top_n=10, do_print=True):
top_words = [data['vocab'][i].strip().replace(' ', '_') for i in top]
msg = ' '.join(top_words)
if do_print:
print prefix + msg
print(prefix + msg)
lists.append(top_words)
return lists


# Cell
def get_request(url):
for _ in range(5):
try:
@@ -116,7 +121,7 @@ def get_request(url):
pass
return None


# Cell
def topic_coherence(lists, services=['ca', 'cp', 'cv', 'npmi', 'uci',
'umass']):
""" Requests the topic coherence from AKSW Palmetto
@@ -142,4 +147,4 @@ def topic_coherence(lists, services=['ca', 'cp', 'cv', 'npmi', 'uci',
ans = {}
for ((j, s, t), tc) in zip(args, coherences):
ans[(j, s)] = tc
return ans
return ans
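For reference, `prob_words` above is a temperature-scaled softmax of the context-word dot products over the whole vocabulary:

    p(w \mid c) = \frac{\exp\!\big((c \cdot v_w)/T\big)}{\sum_{w'} \exp\!\big((c \cdot v_{w'})/T\big)}

with T the `temperature` argument and v_w the word vectors passed in as `vocab`.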