Revert "PEP8 Fixes for topic_coherence and corpora" #1008

Merged: 1 commit, Nov 13, 2016
2 changes: 1 addition & 1 deletion gensim/corpora/__init__.py
@@ -3,7 +3,7 @@
"""

# bring corpus classes directly into package namespace, to save some typing
from .indexedcorpus import IndexedCorpus # must appear before the other classes
from .indexedcorpus import IndexedCorpus # must appear before the other classes

from .mmcorpus import MmCorpus
from .bleicorpus import BleiCorpus
2 changes: 2 additions & 0 deletions gensim/corpora/dictionary.py
@@ -31,6 +31,7 @@
from six.moves import xrange
from six.moves import zip as izip


logger = logging.getLogger('gensim.corpora.dictionary')


@@ -221,6 +222,7 @@ def filter_n_most_frequent(self, remove_n):
# do the actual filtering, then rebuild dictionary to remove gaps in ids
most_frequent_words = [(self[id], self.dfs.get(id, 0)) for id in most_frequent_ids]
logger.info("discarding %i tokens: %s...", len(most_frequent_ids), most_frequent_words[:10])

self.filter_tokens(bad_ids=most_frequent_ids)
logger.info("resulting dictionary: %s" % self)

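
For context, a minimal usage sketch of the filter_n_most_frequent method touched by this hunk; the toy texts and the cutoff of 2 are made up for illustration:

from gensim.corpora import Dictionary

texts = [
    ['human', 'interface', 'computer'],
    ['survey', 'user', 'computer', 'system', 'response'],
    ['system', 'human', 'system', 'eps'],
]
dictionary = Dictionary(texts)

# Discard the 2 tokens that appear in the most documents, then rebuild
# the id mapping so there are no gaps -- the filtering the hunk above logs.
dictionary.filter_n_most_frequent(2)
print(dictionary.token2id)
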
2 changes: 2 additions & 0 deletions gensim/corpora/indexedcorpus.py
@@ -128,4 +128,6 @@ def __getitem__(self, docno):
else:
raise ValueError('Unrecognised value for docno, use either a single integer, a slice or a numpy.ndarray')



# endclass IndexedCorpus
22 changes: 11 additions & 11 deletions gensim/corpora/lowcorpus.py
@@ -65,27 +65,27 @@ def __init__(self, fname, id2word=None, line2words=split_on_space):
IndexedCorpus.__init__(self, fname)
logger.info("loading corpus from %s" % fname)

self.fname = fname # input file, see class doc for format
self.line2words = line2words # how to translate lines into words (simply split on space by default)
self.fname = fname # input file, see class doc for format
self.line2words = line2words # how to translate lines into words (simply split on space by default)
self.num_docs = self._calculate_num_docs()

if not id2word:
# build a list of all word types in the corpus (distinct words)
logger.info("extracting vocabulary from the corpus")
all_terms = set()
self.use_wordids = False # return documents as (word, wordCount) 2-tuples
self.use_wordids = False # return documents as (word, wordCount) 2-tuples
for doc in self:
all_terms.update(word for word, wordCnt in doc)
all_terms = sorted(all_terms) # sort the list of all words; rank in that list = word's integer id
self.id2word = dict(izip(xrange(len(all_terms)), all_terms)) # build a mapping of word id(int) -> word (string)
all_terms = sorted(all_terms) # sort the list of all words; rank in that list = word's integer id
self.id2word = dict(izip(xrange(len(all_terms)), all_terms)) # build a mapping of word id(int) -> word (string)
else:
logger.info("using provided word mapping (%i ids)" % len(id2word))
self.id2word = id2word
self.num_terms = len(self.word2id)
self.use_wordids = True # return documents as (wordIndex, wordCount) 2-tuples
self.use_wordids = True # return documents as (wordIndex, wordCount) 2-tuples

logger.info("loaded corpus with %i documents and %i terms from %s" %
(self.num_docs, self.num_terms, fname))
(self.num_docs, self.num_terms, fname))

def _calculate_num_docs(self):
# the first line in input data is the number of documents (integer). throws exception on bad input.
@@ -119,7 +119,7 @@ def line2doc(self, line):
marker.add(word)
# construct a list of (wordIndex, wordFrequency) 2-tuples
doc = list(zip(map(self.word2id.get, use_words),
map(words.count, use_words)))
map(words.count, use_words)))
else:
uniq_words = set(words)
# construct a list of (word, wordFrequency) 2-tuples
@@ -135,7 +135,7 @@ def __iter__(self):
"""
with utils.smart_open(self.fname) as fin:
for lineno, line in enumerate(fin):
if lineno > 0: # ignore the first line = number of documents
if lineno > 0: # ignore the first line = number of documents
yield self.line2doc(line)

@staticmethod
@@ -166,8 +166,8 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):

if truncated:
logger.warning("List-of-words format can only save vectors with "
"integer elements; %i float entries were truncated to integer value"
, truncated)
"integer elements; %i float entries were truncated to integer value" %
truncated)
return offsets

def docbyoffset(self, offset):
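
The save_corpus hunk above toggles logger.warning between the logging module's lazy argument passing and eager %-interpolation. Both print the same message; a minimal sketch of the difference, with a made-up count:

import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)
truncated = 3  # made-up count

# Eager: the message string is built before warning() is even called,
# whether or not the record ends up being emitted.
logger.warning("%i float entries were truncated to integer value" % truncated)

# Lazy: logging interpolates the arguments itself, and only if the
# record passes the level and filter checks.
logger.warning("%i float entries were truncated to integer value", truncated)
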
4 changes: 2 additions & 2 deletions gensim/corpora/malletcorpus.py
@@ -107,8 +107,8 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):

if truncated:
logger.warning("Mallet format can only save vectors with "
"integer elements; %i float entries were truncated to integer value" %
truncated)
"integer elements; %i float entries were truncated to integer value" %
truncated)

return offsets

47 changes: 23 additions & 24 deletions gensim/corpora/sharded_corpus.py
@@ -44,7 +44,6 @@
from gensim.interfaces import TransformedCorpus



class ShardedCorpus(IndexedCorpus):
"""
This corpus is designed for situations where you need to train a model
@@ -237,7 +236,7 @@ def __init__(self, output_prefix, corpus, dim=None,
# corresponds to index 0 of current shard

logger.info('Initializing sharded corpus with prefix '
'{0}'.format(output_prefix))
'{0}'.format(output_prefix))
if (not os.path.isfile(output_prefix)) or overwrite:
logger.info('Building from corpus...')
self.init_shards(output_prefix, corpus, shardsize)
@@ -266,9 +265,9 @@ def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtyp
'{0}'.format(proposed_dim))
else:
logger.warn('Dataset dimension derived from input corpus diffe'
'rs from initialization argument, using corpus.'
'(corpus {0}, init arg {1})'.format(proposed_dim,
self.dim))
'rs from initialization argument, using corpus.'
'(corpus {0}, init arg {1})'.format(proposed_dim,
self.dim))

self.dim = proposed_dim
self.offsets = [0]
@@ -282,7 +281,7 @@ def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtyp

current_shard = numpy.zeros((len(doc_chunk), self.dim), dtype=dtype)
logger.debug('Current chunk dimension: '
'{0} x {1}'.format(len(doc_chunk), self.dim))
'{0} x {1}'.format(len(doc_chunk), self.dim))

for i, doc in enumerate(doc_chunk):
doc = dict(doc)
@@ -313,8 +312,8 @@ def init_by_clone(self):
logger.info('Loaded dataset dimension: {0}'.format(temp.dim))
else:
logger.warn('Loaded dataset dimension differs from init arg '
'dimension, using loaded dim. '
'(loaded {0}, init {1})'.format(temp.dim, self.dim))
'dimension, using loaded dim. '
'(loaded {0}, init {1})'.format(temp.dim, self.dim))

self.dim = temp.dim # To be consistent with the loaded data!

@@ -329,7 +328,7 @@ def save_shard(self, shard, n=None, filename=None):
"""
new_shard = False
if n is None:
n = self.n_shards # Saving the *next* one by default.
n = self.n_shards # Saving the *next* one by default.
new_shard = True

if not filename:
@@ -345,7 +344,7 @@ def load_shard(self, n):
"""
Load (unpickle) the n-th shard as the "live" part of the dataset
into the Dataset object."""
# logger.debug('ShardedCorpus loading shard {0}, '
#logger.debug('ShardedCorpus loading shard {0}, '
# 'current shard: {1}'.format(n, self.current_shard_n))

# No-op if the shard is already open.
@@ -414,9 +413,9 @@ def in_next(self, offset):

"""
if self.current_shard_n == self.n_shards:
return False # There's no next shard.
return False # There's no next shard.
return (self.offsets[self.current_shard_n + 1] <= offset) \
and (offset < self.offsets[self.current_shard_n + 2])
and (offset < self.offsets[self.current_shard_n + 2])

def resize_shards(self, shardsize):
"""
@@ -472,8 +471,8 @@ def resize_shards(self, shardsize):
os.remove(old_shard_name)
except Exception as e:
logger.error('Exception occurred during old shard no. {0} '
'removal: {1}.\nAttempting to at least move '
'new shards in.'.format(old_shard_n, str(e)))
'removal: {1}.\nAttempting to at least move '
'new shards in.'.format(old_shard_n, str(e)))
finally:
# If something happens with cleaning up - try to at least get the
# new guys in.
@@ -529,17 +528,17 @@ def _guess_n_features(self, corpus):
else:
if not self.dim:
raise TypeError('Couldn\'t find number of features, '
'refusing to guess (dimension set to {0},'
'type of corpus: {1}).'.format(self.dim, type(corpus)))
'refusing to guess (dimension set to {0},'
'type of corpus: {1}).'.format(self.dim, type(corpus)))
else:
logger.warn('Couldn\'t find number of features, trusting '
'supplied dimension ({0})'.format(self.dim))
'supplied dimension ({0})'.format(self.dim))
n_features = self.dim

if self.dim and n_features != self.dim:
logger.warn('Discovered inconsistent dataset dim ({0}) and '
'feature count from corpus ({1}). Coercing to dimension'
' given by argument.'.format(self.dim, n_features))
'feature count from corpus ({1}). Coercing to dimension'
' given by argument.'.format(self.dim, n_features))

return n_features

@@ -605,7 +604,7 @@ def __getitem__(self, offset):
# This fails on one-past
# slice indexing; that's why there's a code branch here.

# logger.debug('ShardedCorpus: Retrieving slice {0}: '
#logger.debug('ShardedCorpus: Retrieving slice {0}: '
# 'shard {1}'.format((offset.start, offset.stop),
# (first_shard, last_shard)))

@@ -614,7 +613,7 @@ def __getitem__(self, offset):
# The easy case: both in one shard.
if first_shard == last_shard:
s_result = self.current_shard[start - self.current_offset:
stop - self.current_offset]
stop - self.current_offset]
# Handle different sparsity settings:
s_result = self._getitem_format(s_result)

@@ -650,13 +649,13 @@ def __getitem__(self, offset):
shard_stop = self.offsets[self.current_shard_n + 1] - \
self.current_offset

# s_result[result_start:result_stop] = self.current_shard[
#s_result[result_start:result_stop] = self.current_shard[
# shard_start:shard_stop]
s_result = self.__add_to_slice(s_result, result_start, result_stop,
shard_start, shard_stop)

# First and last get special treatment, these are in between
for shard_n in xrange(first_shard + 1, last_shard):
for shard_n in xrange(first_shard+1, last_shard):
self.load_shard(shard_n)

result_start = result_stop
@@ -747,7 +746,7 @@ def _getitem_sparse2gensim(self, result):

"""
def row_sparse2gensim(row_idx, csr_matrix):
indices = csr_matrix.indices[csr_matrix.indptr[row_idx]:csr_matrix.indptr[row_idx + 1]]
indices = csr_matrix.indices[csr_matrix.indptr[row_idx]:csr_matrix.indptr[row_idx+1]]
g_row = [(col_idx, csr_matrix[row_idx, col_idx]) for col_idx in indices]
return g_row

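
The in_next hunk above checks a global document offset against the cumulative offsets list that ShardedCorpus maintains (offsets[k] is the index of the first document in shard k, with the grand total appended last). A hypothetical stand-alone sketch of that bookkeeping; the helper and the sizes are invented for illustration, not gensim API:

import bisect

def shard_for_offset(offsets, offset):
    """Return the index of the shard holding global document index `offset`."""
    if offset < 0 or offset >= offsets[-1]:
        raise IndexError('offset %d out of range' % offset)
    # The first boundary strictly greater than offset, minus one, is the shard.
    return bisect.bisect_right(offsets, offset) - 1

offsets = [0, 4096, 8192, 10000]        # three shards, 10000 documents total
print(shard_for_offset(offsets, 5000))  # -> 1
print(shard_for_offset(offsets, 4096))  # -> 1 (first document of shard 1)
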
10 changes: 5 additions & 5 deletions gensim/corpora/svmlightcorpus.py
@@ -58,7 +58,7 @@ def __init__(self, fname, store_labels=True):
IndexedCorpus.__init__(self, fname)
logger.info("loading corpus from %s" % fname)

self.fname = fname # input file, see class doc for format
self.fname = fname # input file, see class doc for format
self.length = None
self.store_labels = store_labels
self.labels = []
@@ -94,7 +94,7 @@ def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False):
offsets = []
with utils.smart_open(fname, 'wb') as fout:
for docno, doc in enumerate(corpus):
label = labels[docno] if labels else 0 # target class is 0 by default
label = labels[docno] if labels else 0 # target class is 0 by default
offsets.append(fout.tell())
fout.write(utils.to_utf8(SvmLightCorpus.doc2line(doc, label)))
return offsets
@@ -114,20 +114,20 @@ def line2doc(self, line):
line = utils.to_unicode(line)
line = line[: line.find('#')].strip()
if not line:
return None # ignore comments and empty lines
return None # ignore comments and empty lines
parts = line.split()
if not parts:
raise ValueError('invalid line format in %s' % self.fname)
target, fields = parts[0], [part.rsplit(':', 1) for part in parts[1:]]
doc = [(int(p1) - 1, float(p2)) for p1, p2 in fields if p1 != 'qid'] # ignore 'qid' features, convert 1-based feature ids to 0-based
doc = [(int(p1) - 1, float(p2)) for p1, p2 in fields if p1 != 'qid'] # ignore 'qid' features, convert 1-based feature ids to 0-based
return doc, target

@staticmethod
def doc2line(doc, label=0):
"""
Output the document in SVMlight format, as a string. Inverse function to `line2doc`.
"""
pairs = ' '.join("%i:%s" % (termid + 1, termval) for termid, termval in doc) # +1 to convert 0-base to 1-base
pairs = ' '.join("%i:%s" % (termid + 1, termval) for termid, termval in doc) # +1 to convert 0-base to 1-base
return "%s %s\n" % (label, pairs)

# endclass SvmLightCorpus
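
The line2doc and doc2line hunks above both handle the off-by-one between SVMlight's 1-based feature ids and gensim's 0-based ones. A short usage sketch with a made-up document:

from gensim.corpora.svmlightcorpus import SvmLightCorpus

# A document as a gensim bag-of-words vector: (0-based feature id, weight).
doc = [(0, 1.0), (3, 2.5)]

# doc2line shifts ids to the 1-based convention SVMlight expects;
# line2doc applies the inverse shift when parsing a line back.
print(SvmLightCorpus.doc2line(doc, label=1))  # "1 1:1.0 4:2.5"
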
14 changes: 7 additions & 7 deletions gensim/corpora/ucicorpus.py
@@ -51,7 +51,7 @@ def __init__(self, input):
pass

logger.info('accepted corpus with %i documents, %i features, %i non-zero entries' %
(self.num_docs, self.num_terms, self.num_nnz))
(self.num_docs, self.num_terms, self.num_nnz))

def skip_headers(self, input_file):
for lineno, _ in enumerate(input_file):
@@ -118,18 +118,18 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False):
offsets.append(posnow)
poslast = posnow

vector = [(x, int(y)) for (x, y) in bow if int(y) != 0] # integer count, not floating weights
vector = [(x, int(y)) for (x, y) in bow if int(y) != 0] # integer count, not floating weights
max_id, veclen = writer.write_vector(docno, vector)
num_terms = max(num_terms, 1 + max_id)
num_nnz += veclen
num_docs = docno + 1

if num_docs * num_terms != 0:
logger.info("saved %ix%i matrix, density=%.3f%% (%i/%i)" %
(num_docs, num_terms,
100.0 * num_nnz / (num_docs * num_terms),
num_nnz,
num_docs * num_terms))
(num_docs, num_terms,
100.0 * num_nnz / (num_docs * num_terms),
num_nnz,
num_docs * num_terms))

# now write proper headers, by seeking and overwriting the spaces written earlier
writer.update_headers(num_docs, num_terms, num_nnz)
@@ -165,7 +165,7 @@ def __iter__(self):
(yielding one document at a time).
"""
for docId, doc in super(UciCorpus, self).__iter__():
yield doc # get rid of docId, return the sparse vector only
yield doc # get rid of docId, return the sparse vector only

def create_dictionary(self):
"""
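
The density figure that write_corpus logs above is just the percentage of non-zero entries in the docs x terms matrix; worked through with made-up sizes:

# Made-up corpus sizes, only to run the logged formula end to end.
num_docs, num_terms, num_nnz = 9, 12, 28

density = 100.0 * num_nnz / (num_docs * num_terms)
print("saved %ix%i matrix, density=%.3f%% (%i/%i)"
      % (num_docs, num_terms, density, num_nnz, num_docs * num_terms))
# -> saved 9x12 matrix, density=25.926% (28/108)
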
1 change: 0 additions & 1 deletion gensim/topic_coherence/aggregation.py
@@ -14,7 +14,6 @@

logger = logging.getLogger(__name__)


def arithmetic_mean(confirmed_measures):
"""
This function performs the arithmetic mean aggregation on the output obtained from
2 changes: 0 additions & 2 deletions gensim/topic_coherence/direct_confirmation_measure.py
@@ -15,7 +15,6 @@

EPSILON = 1e-12 # Should be small. Value as suggested in paper.


def log_conditional_probability(segmented_topics, per_topic_postings, num_docs):
"""
This function calculates the log-conditional-probability measure
@@ -43,7 +42,6 @@ def log_ratio_measure(segmented_topics, per_topic_postings, num_docs, normalize=False):

return m_lc


def log_ratio_measure(segmented_topics, per_topic_postings, num_docs, normalize=False):
"""
If normalize=False:
2 changes: 0 additions & 2 deletions gensim/topic_coherence/indirect_confirmation_measure.py
@@ -48,7 +48,6 @@ def _present(w_prime_star, w, w_backtrack):
return -1
return index


def _make_seg(w_prime, w, per_topic_postings, measure, gamma, backtrack, num_docs):
"""
Internal helper function to return context vectors for segmentations.
@@ -70,7 +69,6 @@ def _make_seg(w_prime, w, per_topic_postings, measure, gamma, backtrack, num_doc
context_vectors[w_j] = backtrack[(w_prime, w_j)] ** gamma
return (context_vectors, backtrack)


def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gamma, num_docs):
"""
This function calculates the indirect cosine measure. Given context vectors
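
The topic_coherence hunks change only the number of blank lines around top-level definitions, the spacing governed by PEP8's E302 rule (two blank lines before a top-level def). Schematically, in the PEP8 form the reverted commit had introduced; the function body here is a stand-in for illustration, not gensim's implementation:

import logging

logger = logging.getLogger(__name__)


def arithmetic_mean(confirmed_measures):
    # E302-compliant: two blank lines separate this def from module-level code.
    return sum(confirmed_measures) / float(len(confirmed_measures))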