Revert "PEP8 Fixes for topic_coherence and corpora" #1008

Merged: 1 commit, Nov 13, 2016
2 changes: 1 addition & 1 deletion gensim/corpora/__init__.py
@@ -3,7 +3,7 @@
"""

# bring corpus classes directly into package namespace, to save some typing
from .indexedcorpus import IndexedCorpus # must appear before the other classes
from .indexedcorpus import IndexedCorpus # must appear before the other classes

from .mmcorpus import MmCorpus
from .bleicorpus import BleiCorpus
2 changes: 2 additions & 0 deletions gensim/corpora/dictionary.py
@@ -31,6 +31,7 @@
from six.moves import xrange
from six.moves import zip as izip


logger = logging.getLogger('gensim.corpora.dictionary')


@@ -221,6 +222,7 @@ def filter_n_most_frequent(self, remove_n):
# do the actual filtering, then rebuild dictionary to remove gaps in ids
most_frequent_words = [(self[id], self.dfs.get(id, 0)) for id in most_frequent_ids]
logger.info("discarding %i tokens: %s...", len(most_frequent_ids), most_frequent_words[:10])

self.filter_tokens(bad_ids=most_frequent_ids)
logger.info("resulting dictionary: %s" % self)

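
For context, a minimal usage sketch of the filter_n_most_frequent method touched by this hunk; the toy texts and the cutoff of 2 are made up for illustration:

from gensim.corpora import Dictionary

texts = [
    ['human', 'interface', 'computer'],
    ['survey', 'user', 'computer', 'system', 'response'],
    ['system', 'human', 'system', 'eps'],
]
dictionary = Dictionary(texts)

# Discard the 2 tokens that appear in the most documents, then rebuild
# the id mapping so there are no gaps -- the filtering the hunk above logs.
dictionary.filter_n_most_frequent(2)
print(dictionary.token2id)
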
2 changes: 2 additions & 0 deletions gensim/corpora/indexedcorpus.py
@@ -128,4 +128,6 @@ def __getitem__(self, docno):
else:
raise ValueError('Unrecognised value for docno, use either a single integer, a slice or a numpy.ndarray')



# endclass IndexedCorpus
22 changes: 11 additions & 11 deletions gensim/corpora/lowcorpus.py
@@ -65,27 +65,27 @@ def __init__(self, fname, id2word=None, line2words=split_on_space):
IndexedCorpus.__init__(self, fname)
logger.info("loading corpus from %s" % fname)

self.fname = fname # input file, see class doc for format
self.line2words = line2words # how to translate lines into words (simply split on space by default)
self.fname = fname # input file, see class doc for format
self.line2words = line2words # how to translate lines into words (simply split on space by default)
self.num_docs = self._calculate_num_docs()

if not id2word:
# build a list of all word types in the corpus (distinct words)
logger.info("extracting vocabulary from the corpus")
all_terms = set()
self.use_wordids = False # return documents as (word, wordCount) 2-tuples
self.use_wordids = False # return documents as (word, wordCount) 2-tuples
for doc in self:
all_terms.update(word for word, wordCnt in doc)
all_terms = sorted(all_terms) # sort the list of all words; rank in that list = word's integer id
self.id2word = dict(izip(xrange(len(all_terms)), all_terms)) # build a mapping of word id(int) -> word (string)
all_terms = sorted(all_terms) # sort the list of all words; rank in that list = word's integer id
self.id2word = dict(izip(xrange(len(all_terms)), all_terms)) # build a mapping of word id(int) -> word (string)
else:
logger.info("using provided word mapping (%i ids)" % len(id2word))
self.id2word = id2word
self.num_terms = len(self.word2id)
self.use_wordids = True # return documents as (wordIndex, wordCount) 2-tuples
self.use_wordids = True # return documents as (wordIndex, wordCount) 2-tuples

logger.info("loaded corpus with %i documents and %i terms from %s" %
(self.num_docs, self.num_terms, fname))
(self.num_docs, self.num_terms, fname))

def _calculate_num_docs(self):
# the first line in input data is the number of documents (integer). throws exception on bad input.
@@ -119,7 +119,7 @@ def line2doc(self, line):
marker.add(word)
# construct a list of (wordIndex, wordFrequency) 2-tuples
doc = list(zip(map(self.word2id.get, use_words),
map(words.count, use_words)))
map(words.count, use_words)))
else:
uniq_words = set(words)
# construct a list of (word, wordFrequency) 2-tuples
@@ -135,7 +135,7 @@ def __iter__(self):
"""
with utils.smart_open(self.fname) as fin:
for lineno, line in enumerate(fin):
if lineno > 0: # ignore the first line = number of documents
if lineno > 0: # ignore the first line = number of documents
yield self.line2doc(line)

@staticmethod
@@ -166,8 +166,8 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):

if truncated:
logger.warning("List-of-words format can only save vectors with "
"integer elements; %i float entries were truncated to integer value"
, truncated)
"integer elements; %i float entries were truncated to integer value" %
truncated)
return offsets

def docbyoffset(self, offset):
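
The save_corpus hunk above toggles logger.warning between the logging module's lazy argument passing and eager %-interpolation. Both print the same message; a minimal sketch of the difference, with a made-up count:

import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)
truncated = 3  # made-up count

# Eager: the message string is built before warning() is even called,
# whether or not the record ends up being emitted.
logger.warning("%i float entries were truncated to integer value" % truncated)

# Lazy: logging interpolates the arguments itself, and only if the
# record passes the level and filter checks.
logger.warning("%i float entries were truncated to integer value", truncated)
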
4 changes: 2 additions & 2 deletions gensim/corpora/malletcorpus.py
@@ -107,8 +107,8 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):

if truncated:
logger.warning("Mallet format can only save vectors with "
"integer elements; %i float entries were truncated to integer value" %
truncated)
"integer elements; %i float entries were truncated to integer value" %
truncated)

return offsets

47 changes: 23 additions & 24 deletions gensim/corpora/sharded_corpus.py
@@ -44,7 +44,6 @@
from gensim.interfaces import TransformedCorpus



class ShardedCorpus(IndexedCorpus):
"""
This corpus is designed for situations where you need to train a model
@@ -237,7 +236,7 @@ def __init__(self, output_prefix, corpus, dim=None,
# corresponds to index 0 of current shard

logger.info('Initializing sharded corpus with prefix '
'{0}'.format(output_prefix))
'{0}'.format(output_prefix))
if (not os.path.isfile(output_prefix)) or overwrite:
logger.info('Building from corpus...')
self.init_shards(output_prefix, corpus, shardsize)
@@ -266,9 +265,9 @@ def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtyp
'{0}'.format(proposed_dim))
else:
logger.warn('Dataset dimension derived from input corpus diffe'
'rs from initialization argument, using corpus.'
'(corpus {0}, init arg {1})'.format(proposed_dim,
self.dim))
'rs from initialization argument, using corpus.'
'(corpus {0}, init arg {1})'.format(proposed_dim,
self.dim))

self.dim = proposed_dim
self.offsets = [0]
@@ -282,7 +281,7 @@ def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtyp

current_shard = numpy.zeros((len(doc_chunk), self.dim), dtype=dtype)
logger.debug('Current chunk dimension: '
'{0} x {1}'.format(len(doc_chunk), self.dim))
'{0} x {1}'.format(len(doc_chunk), self.dim))

for i, doc in enumerate(doc_chunk):
doc = dict(doc)
@@ -313,8 +312,8 @@ def init_by_clone(self):
logger.info('Loaded dataset dimension: {0}'.format(temp.dim))
else:
logger.warn('Loaded dataset dimension differs from init arg '
'dimension, using loaded dim. '
'(loaded {0}, init {1})'.format(temp.dim, self.dim))
'dimension, using loaded dim. '
'(loaded {0}, init {1})'.format(temp.dim, self.dim))

self.dim = temp.dim # To be consistent with the loaded data!

@@ -329,7 +328,7 @@ def save_shard(self, shard, n=None, filename=None):
"""
new_shard = False
if n is None:
n = self.n_shards # Saving the *next* one by default.
n = self.n_shards # Saving the *next* one by default.
new_shard = True

if not filename:
@@ -345,7 +344,7 @@ def load_shard(self, n):
"""
Load (unpickle) the n-th shard as the "live" part of the dataset
into the Dataset object."""
# logger.debug('ShardedCorpus loading shard {0}, '
#logger.debug('ShardedCorpus loading shard {0}, '
# 'current shard: {1}'.format(n, self.current_shard_n))

# No-op if the shard is already open.
@@ -414,9 +413,9 @@ def in_next(self, offset):

"""
if self.current_shard_n == self.n_shards:
return False # There's no next shard.
return False # There's no next shard.
return (self.offsets[self.current_shard_n + 1] <= offset) \
and (offset < self.offsets[self.current_shard_n + 2])
and (offset < self.offsets[self.current_shard_n + 2])

def resize_shards(self, shardsize):
"""
@@ -472,8 +471,8 @@ def resize_shards(self, shardsize):
os.remove(old_shard_name)
except Exception as e:
logger.error('Exception occurred during old shard no. {0} '
'removal: {1}.\nAttempting to at least move '
'new shards in.'.format(old_shard_n, str(e)))
'removal: {1}.\nAttempting to at least move '
'new shards in.'.format(old_shard_n, str(e)))
finally:
# If something happens with cleaning up - try to at least get the
# new guys in.
@@ -529,17 +528,17 @@ def _guess_n_features(self, corpus):
else:
if not self.dim:
raise TypeError('Couldn\'t find number of features, '
'refusing to guess (dimension set to {0},'
'type of corpus: {1}).'.format(self.dim, type(corpus)))
'refusing to guess (dimension set to {0},'
'type of corpus: {1}).'.format(self.dim, type(corpus)))
else:
logger.warn('Couldn\'t find number of features, trusting '
'supplied dimension ({0})'.format(self.dim))
'supplied dimension ({0})'.format(self.dim))
n_features = self.dim

if self.dim and n_features != self.dim:
logger.warn('Discovered inconsistent dataset dim ({0}) and '
'feature count from corpus ({1}). Coercing to dimension'
' given by argument.'.format(self.dim, n_features))
'feature count from corpus ({1}). Coercing to dimension'
' given by argument.'.format(self.dim, n_features))

return n_features

@@ -605,7 +604,7 @@ def __getitem__(self, offset):
# This fails on one-past
# slice indexing; that's why there's a code branch here.

# logger.debug('ShardedCorpus: Retrieving slice {0}: '
#logger.debug('ShardedCorpus: Retrieving slice {0}: '
# 'shard {1}'.format((offset.start, offset.stop),
# (first_shard, last_shard)))

@@ -614,7 +613,7 @@ def __getitem__(self, offset):
# The easy case: both in one shard.
if first_shard == last_shard:
s_result = self.current_shard[start - self.current_offset:
stop - self.current_offset]
stop - self.current_offset]
# Handle different sparsity settings:
s_result = self._getitem_format(s_result)

@@ -650,13 +649,13 @@ def __getitem__(self, offset):
shard_stop = self.offsets[self.current_shard_n + 1] - \
self.current_offset

# s_result[result_start:result_stop] = self.current_shard[
#s_result[result_start:result_stop] = self.current_shard[
# shard_start:shard_stop]
s_result = self.__add_to_slice(s_result, result_start, result_stop,
shard_start, shard_stop)

# First and last get special treatment, these are in between
for shard_n in xrange(first_shard + 1, last_shard):
for shard_n in xrange(first_shard+1, last_shard):
self.load_shard(shard_n)

result_start = result_stop
@@ -747,7 +746,7 @@ def _getitem_sparse2gensim(self, result):

"""
def row_sparse2gensim(row_idx, csr_matrix):
indices = csr_matrix.indices[csr_matrix.indptr[row_idx]:csr_matrix.indptr[row_idx + 1]]
indices = csr_matrix.indices[csr_matrix.indptr[row_idx]:csr_matrix.indptr[row_idx+1]]
g_row = [(col_idx, csr_matrix[row_idx, col_idx]) for col_idx in indices]
return g_row

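
The in_next hunk above checks a global document offset against the cumulative offsets list that ShardedCorpus maintains (offsets[k] is the index of the first document in shard k, with the grand total appended last). A hypothetical stand-alone sketch of that bookkeeping; the helper and the sizes are invented for illustration, not gensim API:

import bisect

def shard_for_offset(offsets, offset):
    """Return the index of the shard holding global document index `offset`."""
    if offset < 0 or offset >= offsets[-1]:
        raise IndexError('offset %d out of range' % offset)
    # The first boundary strictly greater than offset, minus one, is the shard.
    return bisect.bisect_right(offsets, offset) - 1

offsets = [0, 4096, 8192, 10000]        # three shards, 10000 documents total
print(shard_for_offset(offsets, 5000))  # -> 1
print(shard_for_offset(offsets, 4096))  # -> 1 (first document of shard 1)
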
10 changes: 5 additions & 5 deletions gensim/corpora/svmlightcorpus.py
@@ -58,7 +58,7 @@ def __init__(self, fname, store_labels=True):
IndexedCorpus.__init__(self, fname)
logger.info("loading corpus from %s" % fname)

self.fname = fname # input file, see class doc for format
self.fname = fname # input file, see class doc for format
self.length = None
self.store_labels = store_labels
self.labels = []
@@ -94,7 +94,7 @@ def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False):
offsets = []
with utils.smart_open(fname, 'wb') as fout:
for docno, doc in enumerate(corpus):
label = labels[docno] if labels else 0 # target class is 0 by default
label = labels[docno] if labels else 0 # target class is 0 by default
offsets.append(fout.tell())
fout.write(utils.to_utf8(SvmLightCorpus.doc2line(doc, label)))
return offsets
@@ -114,20 +114,20 @@ def line2doc(self, line):
line = utils.to_unicode(line)
line = line[: line.find('#')].strip()
if not line:
return None # ignore comments and empty lines
return None # ignore comments and empty lines
parts = line.split()
if not parts:
raise ValueError('invalid line format in %s' % self.fname)
target, fields = parts[0], [part.rsplit(':', 1) for part in parts[1:]]
doc = [(int(p1) - 1, float(p2)) for p1, p2 in fields if p1 != 'qid'] # ignore 'qid' features, convert 1-based feature ids to 0-based
doc = [(int(p1) - 1, float(p2)) for p1, p2 in fields if p1 != 'qid'] # ignore 'qid' features, convert 1-based feature ids to 0-based
return doc, target

@staticmethod
def doc2line(doc, label=0):
"""
Output the document in SVMlight format, as a string. Inverse function to `line2doc`.
"""
pairs = ' '.join("%i:%s" % (termid + 1, termval) for termid, termval in doc) # +1 to convert 0-base to 1-base
pairs = ' '.join("%i:%s" % (termid + 1, termval) for termid, termval in doc) # +1 to convert 0-base to 1-base
return "%s %s\n" % (label, pairs)

# endclass SvmLightCorpus
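
The line2doc and doc2line hunks above both handle the off-by-one between SVMlight's 1-based feature ids and gensim's 0-based ones. A short usage sketch with a made-up document:

from gensim.corpora.svmlightcorpus import SvmLightCorpus

# A document as a gensim bag-of-words vector: (0-based feature id, weight).
doc = [(0, 1.0), (3, 2.5)]

# doc2line shifts ids to the 1-based convention SVMlight expects;
# line2doc applies the inverse shift when parsing a line back.
print(SvmLightCorpus.doc2line(doc, label=1))  # "1 1:1.0 4:2.5"
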
14 changes: 7 additions & 7 deletions gensim/corpora/ucicorpus.py
@@ -51,7 +51,7 @@ def __init__(self, input):
pass

logger.info('accepted corpus with %i documents, %i features, %i non-zero entries' %
(self.num_docs, self.num_terms, self.num_nnz))
(self.num_docs, self.num_terms, self.num_nnz))

def skip_headers(self, input_file):
for lineno, _ in enumerate(input_file):
@@ -118,18 +118,18 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False):
offsets.append(posnow)
poslast = posnow

vector = [(x, int(y)) for (x, y) in bow if int(y) != 0] # integer count, not floating weights
vector = [(x, int(y)) for (x, y) in bow if int(y) != 0] # integer count, not floating weights
max_id, veclen = writer.write_vector(docno, vector)
num_terms = max(num_terms, 1 + max_id)
num_nnz += veclen
num_docs = docno + 1

if num_docs * num_terms != 0:
logger.info("saved %ix%i matrix, density=%.3f%% (%i/%i)" %
(num_docs, num_terms,
100.0 * num_nnz / (num_docs * num_terms),
num_nnz,
num_docs * num_terms))
(num_docs, num_terms,
100.0 * num_nnz / (num_docs * num_terms),
num_nnz,
num_docs * num_terms))

# now write proper headers, by seeking and overwriting the spaces written earlier
writer.update_headers(num_docs, num_terms, num_nnz)
@@ -165,7 +165,7 @@ def __iter__(self):
(yielding one document at a time).
"""
for docId, doc in super(UciCorpus, self).__iter__():
yield doc # get rid of docId, return the sparse vector only
yield doc # get rid of docId, return the sparse vector only

def create_dictionary(self):
"""
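
The density figure that write_corpus logs above is just the percentage of non-zero entries in the docs x terms matrix; worked through with made-up sizes:

# Made-up corpus sizes, only to run the logged formula end to end.
num_docs, num_terms, num_nnz = 9, 12, 28

density = 100.0 * num_nnz / (num_docs * num_terms)
print("saved %ix%i matrix, density=%.3f%% (%i/%i)"
      % (num_docs, num_terms, density, num_nnz, num_docs * num_terms))
# -> saved 9x12 matrix, density=25.926% (28/108)
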
1 change: 0 additions & 1 deletion gensim/topic_coherence/aggregation.py
@@ -14,7 +14,6 @@

logger = logging.getLogger(__name__)


def arithmetic_mean(confirmed_measures):
"""
This function performs the arithmetic mean aggregation on the output obtained from
2 changes: 0 additions & 2 deletions gensim/topic_coherence/direct_confirmation_measure.py
@@ -15,7 +15,6 @@

EPSILON = 1e-12 # Should be small. Value as suggested in paper.


def log_conditional_probability(segmented_topics, per_topic_postings, num_docs):
"""
This function calculates the log-conditional-probability measure
@@ -43,7 +42,6 @@ def log_ratio_measure(segmented_topics, per_topic_postings, num_docs, normalize=False):

return m_lc


def log_ratio_measure(segmented_topics, per_topic_postings, num_docs, normalize=False):
"""
If normalize=False:
2 changes: 0 additions & 2 deletions gensim/topic_coherence/indirect_confirmation_measure.py
@@ -48,7 +48,6 @@ def _present(w_prime_star, w, w_backtrack):
return -1
return index


def _make_seg(w_prime, w, per_topic_postings, measure, gamma, backtrack, num_docs):
"""
Internal helper function to return context vectors for segmentations.
@@ -70,7 +69,6 @@ def _make_seg(w_prime, w, per_topic_postings, measure, gamma, backtrack, num_doc
context_vectors[w_j] = backtrack[(w_prime, w_j)] ** gamma
return (context_vectors, backtrack)


def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gamma, num_docs):
"""
This function calculates the indirect cosine measure. Given context vectors
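
The topic_coherence hunks change only the number of blank lines around top-level definitions, the spacing governed by PEP8's E302 rule (two blank lines before a top-level def). Schematically, in the PEP8 form the reverted commit had introduced; the function body here is a stand-in for illustration, not gensim's implementation:

import logging

logger = logging.getLogger(__name__)


def arithmetic_mean(confirmed_measures):
    # E302-compliant: two blank lines separate this def from module-level code.
    return sum(confirmed_measures) / float(len(confirmed_measures))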