
Commit

Revert "PEP8 Fixes for topic_coherence and corpora" (#1008)
tmylk authored Nov 13, 2016
1 parent 0c8dba6 commit 031b731
Showing 13 changed files with 53 additions and 61 deletions.
2 changes: 1 addition & 1 deletion gensim/corpora/__init__.py
@@ -3,7 +3,7 @@
"""

# bring corpus classes directly into package namespace, to save some typing
from .indexedcorpus import IndexedCorpus # must appear before the other classes
from .indexedcorpus import IndexedCorpus # must appear before the other classes

from .mmcorpus import MmCorpus
from .bleicorpus import BleiCorpus
2 changes: 2 additions & 0 deletions gensim/corpora/dictionary.py
@@ -31,6 +31,7 @@
from six.moves import xrange
from six.moves import zip as izip


logger = logging.getLogger('gensim.corpora.dictionary')


@@ -221,6 +222,7 @@ def filter_n_most_frequent(self, remove_n):
# do the actual filtering, then rebuild dictionary to remove gaps in ids
most_frequent_words = [(self[id], self.dfs.get(id, 0)) for id in most_frequent_ids]
logger.info("discarding %i tokens: %s...", len(most_frequent_ids), most_frequent_words[:10])

self.filter_tokens(bad_ids=most_frequent_ids)
logger.info("resulting dictionary: %s" % self)

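For context on the filter_n_most_frequent hunk above: the method discards the remove_n tokens with the highest document frequency and then rebuilds the id mapping. A rough standalone sketch of that idea (the dfs dict and helper name are made up for illustration; this is not the gensim implementation):

    import heapq

    def drop_n_most_frequent(dfs, remove_n):
        # pick the remove_n ids with the highest document frequency, then keep the rest
        most_frequent_ids = set(heapq.nlargest(remove_n, dfs, key=dfs.get))
        return {token_id: df for token_id, df in dfs.items() if token_id not in most_frequent_ids}

    dfs = {0: 120, 1: 7, 2: 95, 3: 3}             # token id -> document frequency (toy numbers)
    print(drop_n_most_frequent(dfs, remove_n=2))  # {1: 7, 3: 3}
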
2 changes: 2 additions & 0 deletions gensim/corpora/indexedcorpus.py
@@ -128,4 +128,6 @@ def __getitem__(self, docno):
else:
raise ValueError('Unrecognised value for docno, use either a single integer, a slice or a numpy.ndarray')



# endclass IndexedCorpus
22 changes: 11 additions & 11 deletions gensim/corpora/lowcorpus.py
@@ -65,27 +65,27 @@ def __init__(self, fname, id2word=None, line2words=split_on_space):
IndexedCorpus.__init__(self, fname)
logger.info("loading corpus from %s" % fname)

self.fname = fname # input file, see class doc for format
self.line2words = line2words # how to translate lines into words (simply split on space by default)
self.fname = fname # input file, see class doc for format
self.line2words = line2words # how to translate lines into words (simply split on space by default)
self.num_docs = self._calculate_num_docs()

if not id2word:
# build a list of all word types in the corpus (distinct words)
logger.info("extracting vocabulary from the corpus")
all_terms = set()
self.use_wordids = False # return documents as (word, wordCount) 2-tuples
self.use_wordids = False # return documents as (word, wordCount) 2-tuples
for doc in self:
all_terms.update(word for word, wordCnt in doc)
all_terms = sorted(all_terms) # sort the list of all words; rank in that list = word's integer id
self.id2word = dict(izip(xrange(len(all_terms)), all_terms)) # build a mapping of word id(int) -> word (string)
all_terms = sorted(all_terms) # sort the list of all words; rank in that list = word's integer id
self.id2word = dict(izip(xrange(len(all_terms)), all_terms)) # build a mapping of word id(int) -> word (string)
else:
logger.info("using provided word mapping (%i ids)" % len(id2word))
self.id2word = id2word
self.num_terms = len(self.word2id)
self.use_wordids = True # return documents as (wordIndex, wordCount) 2-tuples
self.use_wordids = True # return documents as (wordIndex, wordCount) 2-tuples

logger.info("loaded corpus with %i documents and %i terms from %s" %
(self.num_docs, self.num_terms, fname))
(self.num_docs, self.num_terms, fname))

def _calculate_num_docs(self):
# the first line in input data is the number of documents (integer). throws exception on bad input.
@@ -119,7 +119,7 @@ def line2doc(self, line):
marker.add(word)
# construct a list of (wordIndex, wordFrequency) 2-tuples
doc = list(zip(map(self.word2id.get, use_words),
map(words.count, use_words)))
map(words.count, use_words)))
else:
uniq_words = set(words)
# construct a list of (word, wordFrequency) 2-tuples
@@ -135,7 +135,7 @@ def __iter__(self):
"""
with utils.smart_open(self.fname) as fin:
for lineno, line in enumerate(fin):
if lineno > 0: # ignore the first line = number of documents
if lineno > 0: # ignore the first line = number of documents
yield self.line2doc(line)

@staticmethod
@@ -166,8 +166,8 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):

if truncated:
logger.warning("List-of-words format can only save vectors with "
"integer elements; %i float entries were truncated to integer value"
, truncated)
"integer elements; %i float entries were truncated to integer value" %
truncated)
return offsets

def docbyoffset(self, offset):
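
The line2doc hunk above pairs each distinct in-vocabulary word with its count via zip(map(word2id.get, ...), map(words.count, ...)). A minimal sketch of that construction, assuming a hypothetical word2id mapping (illustrative only, not the LowCorpus code path):

    def line2doc_sketch(line, word2id):
        words = line.split()                      # roughly what split_on_space does
        use_words, marker = [], set()
        for word in words:                        # keep the first occurrence of each known word
            if word in word2id and word not in marker:
                use_words.append(word)
                marker.add(word)
        # (wordIndex, wordFrequency) 2-tuples, as in the hunk above
        return list(zip(map(word2id.get, use_words), map(words.count, use_words)))

    print(line2doc_sketch('cat dog cat fish', {'cat': 0, 'dog': 1}))  # [(0, 2), (1, 1)]
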
4 changes: 2 additions & 2 deletions gensim/corpora/malletcorpus.py
@@ -107,8 +107,8 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):

if truncated:
logger.warning("Mallet format can only save vectors with "
"integer elements; %i float entries were truncated to integer value" %
truncated)
"integer elements; %i float entries were truncated to integer value" %
truncated)

return offsets

47 changes: 23 additions & 24 deletions gensim/corpora/sharded_corpus.py
@@ -44,7 +44,6 @@
from gensim.interfaces import TransformedCorpus



class ShardedCorpus(IndexedCorpus):
"""
This corpus is designed for situations where you need to train a model
@@ -237,7 +236,7 @@ def __init__(self, output_prefix, corpus, dim=None,
# corresponds to index 0 of current shard

logger.info('Initializing sharded corpus with prefix '
'{0}'.format(output_prefix))
'{0}'.format(output_prefix))
if (not os.path.isfile(output_prefix)) or overwrite:
logger.info('Building from corpus...')
self.init_shards(output_prefix, corpus, shardsize)
@@ -266,9 +265,9 @@ def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtyp
'{0}'.format(proposed_dim))
else:
logger.warn('Dataset dimension derived from input corpus diffe'
'rs from initialization argument, using corpus.'
'(corpus {0}, init arg {1})'.format(proposed_dim,
self.dim))
'rs from initialization argument, using corpus.'
'(corpus {0}, init arg {1})'.format(proposed_dim,
self.dim))

self.dim = proposed_dim
self.offsets = [0]
@@ -282,7 +281,7 @@ def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtyp

current_shard = numpy.zeros((len(doc_chunk), self.dim), dtype=dtype)
logger.debug('Current chunk dimension: '
'{0} x {1}'.format(len(doc_chunk), self.dim))
'{0} x {1}'.format(len(doc_chunk), self.dim))

for i, doc in enumerate(doc_chunk):
doc = dict(doc)
@@ -313,8 +312,8 @@ def init_by_clone(self):
logger.info('Loaded dataset dimension: {0}'.format(temp.dim))
else:
logger.warn('Loaded dataset dimension differs from init arg '
'dimension, using loaded dim. '
'(loaded {0}, init {1})'.format(temp.dim, self.dim))
'dimension, using loaded dim. '
'(loaded {0}, init {1})'.format(temp.dim, self.dim))

self.dim = temp.dim # To be consistent with the loaded data!

Expand All @@ -329,7 +328,7 @@ def save_shard(self, shard, n=None, filename=None):
"""
new_shard = False
if n is None:
n = self.n_shards # Saving the *next* one by default.
n = self.n_shards # Saving the *next* one by default.
new_shard = True

if not filename:
Expand All @@ -345,7 +344,7 @@ def load_shard(self, n):
"""
Load (unpickle) the n-th shard as the "live" part of the dataset
into the Dataset object."""
# logger.debug('ShardedCorpus loading shard {0}, '
#logger.debug('ShardedCorpus loading shard {0}, '
# 'current shard: {1}'.format(n, self.current_shard_n))

# No-op if the shard is already open.
Expand Down Expand Up @@ -414,9 +413,9 @@ def in_next(self, offset):
"""
if self.current_shard_n == self.n_shards:
return False # There's no next shard.
return False # There's no next shard.
return (self.offsets[self.current_shard_n + 1] <= offset) \
and (offset < self.offsets[self.current_shard_n + 2])
and (offset < self.offsets[self.current_shard_n + 2])

def resize_shards(self, shardsize):
"""
Expand Down Expand Up @@ -472,8 +471,8 @@ def resize_shards(self, shardsize):
os.remove(old_shard_name)
except Exception as e:
logger.error('Exception occurred during old shard no. {0} '
'removal: {1}.\nAttempting to at least move '
'new shards in.'.format(old_shard_n, str(e)))
'removal: {1}.\nAttempting to at least move '
'new shards in.'.format(old_shard_n, str(e)))
finally:
# If something happens with cleaning up - try to at least get the
# new guys in.
Expand Down Expand Up @@ -529,17 +528,17 @@ def _guess_n_features(self, corpus):
else:
if not self.dim:
raise TypeError('Couldn\'t find number of features, '
'refusing to guess (dimension set to {0},'
'type of corpus: {1}).'.format(self.dim, type(corpus)))
'refusing to guess (dimension set to {0},'
'type of corpus: {1}).'.format(self.dim, type(corpus)))
else:
logger.warn('Couldn\'t find number of features, trusting '
'supplied dimension ({0})'.format(self.dim))
'supplied dimension ({0})'.format(self.dim))
n_features = self.dim

if self.dim and n_features != self.dim:
logger.warn('Discovered inconsistent dataset dim ({0}) and '
'feature count from corpus ({1}). Coercing to dimension'
' given by argument.'.format(self.dim, n_features))
'feature count from corpus ({1}). Coercing to dimension'
' given by argument.'.format(self.dim, n_features))

return n_features

Expand Down Expand Up @@ -605,7 +604,7 @@ def __getitem__(self, offset):
# This fails on one-past
# slice indexing; that's why there's a code branch here.

# logger.debug('ShardedCorpus: Retrieving slice {0}: '
#logger.debug('ShardedCorpus: Retrieving slice {0}: '
# 'shard {1}'.format((offset.start, offset.stop),
# (first_shard, last_shard)))

Expand All @@ -614,7 +613,7 @@ def __getitem__(self, offset):
# The easy case: both in one shard.
if first_shard == last_shard:
s_result = self.current_shard[start - self.current_offset:
stop - self.current_offset]
stop - self.current_offset]
# Handle different sparsity settings:
s_result = self._getitem_format(s_result)

Expand Down Expand Up @@ -650,13 +649,13 @@ def __getitem__(self, offset):
shard_stop = self.offsets[self.current_shard_n + 1] - \
self.current_offset

# s_result[result_start:result_stop] = self.current_shard[
#s_result[result_start:result_stop] = self.current_shard[
# shard_start:shard_stop]
s_result = self.__add_to_slice(s_result, result_start, result_stop,
shard_start, shard_stop)

# First and last get special treatment, these are in between
for shard_n in xrange(first_shard + 1, last_shard):
for shard_n in xrange(first_shard+1, last_shard):
self.load_shard(shard_n)

result_start = result_stop
Expand Down Expand Up @@ -747,7 +746,7 @@ def _getitem_sparse2gensim(self, result):
"""
def row_sparse2gensim(row_idx, csr_matrix):
indices = csr_matrix.indices[csr_matrix.indptr[row_idx]:csr_matrix.indptr[row_idx + 1]]
indices = csr_matrix.indices[csr_matrix.indptr[row_idx]:csr_matrix.indptr[row_idx+1]]
g_row = [(col_idx, csr_matrix[row_idx, col_idx]) for col_idx in indices]
return g_row

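ShardedCorpus keeps a list of shard boundary offsets and tests membership with comparisons like offsets[n + 1] <= offset < offsets[n + 2] (see the in_next hunk above). A small sketch of the same boundary lookup using bisect; the offsets values are hypothetical and the helper is not part of gensim:

    import bisect

    offsets = [0, 4096, 8192, 10000]   # hypothetical shard boundaries: shard n covers [offsets[n], offsets[n+1])

    def shard_for(offset, offsets):
        # index of the shard whose half-open interval contains the global document offset
        return bisect.bisect_right(offsets, offset) - 1

    print(shard_for(5000, offsets))    # 1 -> document 5000 lives in the second shard
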
10 changes: 5 additions & 5 deletions gensim/corpora/svmlightcorpus.py
@@ -58,7 +58,7 @@ def __init__(self, fname, store_labels=True):
IndexedCorpus.__init__(self, fname)
logger.info("loading corpus from %s" % fname)

self.fname = fname # input file, see class doc for format
self.fname = fname # input file, see class doc for format
self.length = None
self.store_labels = store_labels
self.labels = []
@@ -94,7 +94,7 @@ def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False):
offsets = []
with utils.smart_open(fname, 'wb') as fout:
for docno, doc in enumerate(corpus):
label = labels[docno] if labels else 0 # target class is 0 by default
label = labels[docno] if labels else 0 # target class is 0 by default
offsets.append(fout.tell())
fout.write(utils.to_utf8(SvmLightCorpus.doc2line(doc, label)))
return offsets
@@ -114,20 +114,20 @@ def line2doc(self, line):
line = utils.to_unicode(line)
line = line[: line.find('#')].strip()
if not line:
return None # ignore comments and empty lines
return None # ignore comments and empty lines
parts = line.split()
if not parts:
raise ValueError('invalid line format in %s' % self.fname)
target, fields = parts[0], [part.rsplit(':', 1) for part in parts[1:]]
doc = [(int(p1) - 1, float(p2)) for p1, p2 in fields if p1 != 'qid'] # ignore 'qid' features, convert 1-based feature ids to 0-based
doc = [(int(p1) - 1, float(p2)) for p1, p2 in fields if p1 != 'qid'] # ignore 'qid' features, convert 1-based feature ids to 0-based
return doc, target

@staticmethod
def doc2line(doc, label=0):
"""
Output the document in SVMlight format, as a string. Inverse function to `line2doc`.
"""
pairs = ' '.join("%i:%s" % (termid + 1, termval) for termid, termval in doc) # +1 to convert 0-base to 1-base
pairs = ' '.join("%i:%s" % (termid + 1, termval) for termid, termval in doc) # +1 to convert 0-base to 1-base
return "%s %s\n" % (label, pairs)

# endclass SvmLightCorpus
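
The svmlightcorpus hunks above note the off-by-one convention: gensim documents use 0-based term ids while SVMlight lines are 1-based. A self-contained sketch mirroring that round trip (function names are illustrative, not the SvmLightCorpus API):

    def doc2line_sketch(doc, label=0):
        # +1 converts 0-based term ids to SVMlight's 1-based feature ids
        pairs = ' '.join("%i:%s" % (termid + 1, termval) for termid, termval in doc)
        return "%s %s\n" % (label, pairs)

    def line2doc_sketch(line):
        parts = line.split()
        target, fields = parts[0], [part.rsplit(':', 1) for part in parts[1:]]
        # -1 converts back to 0-based ids; 'qid' features are ignored, as in the hunk above
        return [(int(p1) - 1, float(p2)) for p1, p2 in fields if p1 != 'qid'], target

    line = doc2line_sketch([(0, 2.0), (3, 1.0)], label=1)
    print(repr(line))             # '1 1:2.0 4:1.0\n'
    print(line2doc_sketch(line))  # ([(0, 2.0), (3, 1.0)], '1')
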
14 changes: 7 additions & 7 deletions gensim/corpora/ucicorpus.py
@@ -51,7 +51,7 @@ def __init__(self, input):
pass

logger.info('accepted corpus with %i documents, %i features, %i non-zero entries' %
(self.num_docs, self.num_terms, self.num_nnz))
(self.num_docs, self.num_terms, self.num_nnz))

def skip_headers(self, input_file):
for lineno, _ in enumerate(input_file):
@@ -118,18 +118,18 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False):
offsets.append(posnow)
poslast = posnow

vector = [(x, int(y)) for (x, y) in bow if int(y) != 0] # integer count, not floating weights
vector = [(x, int(y)) for (x, y) in bow if int(y) != 0] # integer count, not floating weights
max_id, veclen = writer.write_vector(docno, vector)
num_terms = max(num_terms, 1 + max_id)
num_nnz += veclen
num_docs = docno + 1

if num_docs * num_terms != 0:
logger.info("saved %ix%i matrix, density=%.3f%% (%i/%i)" %
(num_docs, num_terms,
100.0 * num_nnz / (num_docs * num_terms),
num_nnz,
num_docs * num_terms))
(num_docs, num_terms,
100.0 * num_nnz / (num_docs * num_terms),
num_nnz,
num_docs * num_terms))

# now write proper headers, by seeking and overwriting the spaces written earlier
writer.update_headers(num_docs, num_terms, num_nnz)
@@ -165,7 +165,7 @@ def __iter__(self):
(yielding one document at a time).
"""
for docId, doc in super(UciCorpus, self).__iter__():
yield doc # get rid of docId, return the sparse vector only
yield doc # get rid of docId, return the sparse vector only

def create_dictionary(self):
"""
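
The write_corpus hunk above logs matrix density as 100 * num_nnz / (num_docs * num_terms). A tiny worked example with made-up counts, just to show what that log line reports:

    num_docs, num_terms, num_nnz = 3, 5, 6   # hypothetical corpus statistics
    density = 100.0 * num_nnz / (num_docs * num_terms)
    print("saved %ix%i matrix, density=%.3f%% (%i/%i)" %
          (num_docs, num_terms, density, num_nnz, num_docs * num_terms))
    # saved 3x5 matrix, density=40.000% (6/15)
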
1 change: 0 additions & 1 deletion gensim/topic_coherence/aggregation.py
@@ -14,7 +14,6 @@

logger = logging.getLogger(__name__)


def arithmetic_mean(confirmed_measures):
"""
This functoin performs the arithmetic mean aggregation on the output obtained from
2 changes: 0 additions & 2 deletions gensim/topic_coherence/direct_confirmation_measure.py
@@ -15,7 +15,6 @@

EPSILON = 1e-12 # Should be small. Value as suggested in paper.


def log_conditional_probability(segmented_topics, per_topic_postings, num_docs):
"""
This function calculates the log-conditional-probability measure
@@ -43,7 +42,6 @@ def log_conditional_probability(segmented_topics, per_topic_postings, num_docs):

return m_lc


def log_ratio_measure(segmented_topics, per_topic_postings, num_docs, normalize=False):
"""
If normalize=False:
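
The body of log_conditional_probability is collapsed in this view; the measure is commonly defined as m_lc = log((P(w', w*) + eps) / P(w*)), with probabilities estimated from document counts and EPSILON as the smoothing constant declared above. A sketch under that assumption, not necessarily gensim's exact code:

    import math

    EPSILON = 1e-12  # same role as the constant in the hunk above

    def log_conditional_probability_sketch(co_doc_count, w_star_doc_count, num_docs):
        joint = co_doc_count / num_docs            # estimate of P(w', w*)
        marginal = w_star_doc_count / num_docs     # estimate of P(w*)
        return math.log((joint + EPSILON) / marginal)

    print(log_conditional_probability_sketch(10, 20, 100))  # log(0.1 / 0.2) ~ -0.693
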
2 changes: 0 additions & 2 deletions gensim/topic_coherence/indirect_confirmation_measure.py
@@ -48,7 +48,6 @@ def _present(w_prime_star, w, w_backtrack):
return -1
return index


def _make_seg(w_prime, w, per_topic_postings, measure, gamma, backtrack, num_docs):
"""
Internal helper function to return context vectors for segmentations.
@@ -70,7 +69,6 @@ def _make_seg(w_prime, w, per_topic_postings, measure, gamma, backtrack, num_doc
context_vectors[w_j] = backtrack[(w_prime, w_j)] ** gamma
return (context_vectors, backtrack)


def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gamma, num_docs):
"""
This function calculates the indirect cosine measure. Given context vectors
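
Only the signature of cosine_similarity is visible here; at its core the indirect measure compares context vectors with a plain cosine. A generic sketch of that inner operation (it does not build the context vectors that _make_seg handles):

    import math

    def cosine_similarity_sketch(u, v):
        dot = sum(a * b for a, b in zip(u, v))
        norm = math.sqrt(sum(a * a for a in u)) * math.sqrt(sum(b * b for b in v))
        return dot / norm

    print(cosine_similarity_sketch([1.0, 2.0, 0.0], [2.0, 4.0, 0.0]))  # ~1.0 (parallel vectors)
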
