diff --git a/gensim/corpora/__init__.py b/gensim/corpora/__init__.py index 093bde7674..a11a0df229 100644 --- a/gensim/corpora/__init__.py +++ b/gensim/corpora/__init__.py @@ -3,7 +3,7 @@ """ # bring corpus classes directly into package namespace, to save some typing -from .indexedcorpus import IndexedCorpus # must appear before the other classes +from .indexedcorpus import IndexedCorpus # must appear before the other classes from .mmcorpus import MmCorpus from .bleicorpus import BleiCorpus diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py index ea4b92ff24..1fd7e31e61 100644 --- a/gensim/corpora/dictionary.py +++ b/gensim/corpora/dictionary.py @@ -31,6 +31,7 @@ from six.moves import xrange from six.moves import zip as izip + logger = logging.getLogger('gensim.corpora.dictionary') @@ -221,6 +222,7 @@ def filter_n_most_frequent(self, remove_n): # do the actual filtering, then rebuild dictionary to remove gaps in ids most_frequent_words = [(self[id], self.dfs.get(id, 0)) for id in most_frequent_ids] logger.info("discarding %i tokens: %s...", len(most_frequent_ids), most_frequent_words[:10]) + self.filter_tokens(bad_ids=most_frequent_ids) logger.info("resulting dictionary: %s" % self) diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py index 703bbd199a..dd3f703899 100644 --- a/gensim/corpora/indexedcorpus.py +++ b/gensim/corpora/indexedcorpus.py @@ -128,4 +128,6 @@ def __getitem__(self, docno): else: raise ValueError('Unrecognised value for docno, use either a single integer, a slice or a numpy.ndarray') + + # endclass IndexedCorpus diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py index d33f897e9e..b87f1108a2 100644 --- a/gensim/corpora/lowcorpus.py +++ b/gensim/corpora/lowcorpus.py @@ -65,27 +65,27 @@ def __init__(self, fname, id2word=None, line2words=split_on_space): IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s" % fname) - self.fname = fname # input file, see class doc for format - self.line2words = line2words # how to translate lines into words (simply split on space by default) + self.fname = fname # input file, see class doc for format + self.line2words = line2words # how to translate lines into words (simply split on space by default) self.num_docs = self._calculate_num_docs() if not id2word: # build a list of all word types in the corpus (distinct words) logger.info("extracting vocabulary from the corpus") all_terms = set() - self.use_wordids = False # return documents as (word, wordCount) 2-tuples + self.use_wordids = False # return documents as (word, wordCount) 2-tuples for doc in self: all_terms.update(word for word, wordCnt in doc) - all_terms = sorted(all_terms) # sort the list of all words; rank in that list = word's integer id - self.id2word = dict(izip(xrange(len(all_terms)), all_terms)) # build a mapping of word id(int) -> word (string) + all_terms = sorted(all_terms) # sort the list of all words; rank in that list = word's integer id + self.id2word = dict(izip(xrange(len(all_terms)), all_terms)) # build a mapping of word id(int) -> word (string) else: logger.info("using provided word mapping (%i ids)" % len(id2word)) self.id2word = id2word self.num_terms = len(self.word2id) - self.use_wordids = True # return documents as (wordIndex, wordCount) 2-tuples + self.use_wordids = True # return documents as (wordIndex, wordCount) 2-tuples logger.info("loaded corpus with %i documents and %i terms from %s" % - (self.num_docs, self.num_terms, fname)) + (self.num_docs, self.num_terms, 
fname)) def _calculate_num_docs(self): # the first line in input data is the number of documents (integer). throws exception on bad input. @@ -119,7 +119,7 @@ def line2doc(self, line): marker.add(word) # construct a list of (wordIndex, wordFrequency) 2-tuples doc = list(zip(map(self.word2id.get, use_words), - map(words.count, use_words))) + map(words.count, use_words))) else: uniq_words = set(words) # construct a list of (word, wordFrequency) 2-tuples @@ -135,7 +135,7 @@ def __iter__(self): """ with utils.smart_open(self.fname) as fin: for lineno, line in enumerate(fin): - if lineno > 0: # ignore the first line = number of documents + if lineno > 0: # ignore the first line = number of documents yield self.line2doc(line) @staticmethod @@ -166,8 +166,8 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): if truncated: logger.warning("List-of-words format can only save vectors with " - "integer elements; %i float entries were truncated to integer value" - , truncated) + "integer elements; %i float entries were truncated to integer value" % + truncated) return offsets def docbyoffset(self, offset): diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py index 4efd4503ef..f8410845e6 100644 --- a/gensim/corpora/malletcorpus.py +++ b/gensim/corpora/malletcorpus.py @@ -107,8 +107,8 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): if truncated: logger.warning("Mallet format can only save vectors with " - "integer elements; %i float entries were truncated to integer value" % - truncated) + "integer elements; %i float entries were truncated to integer value" % + truncated) return offsets diff --git a/gensim/corpora/sharded_corpus.py b/gensim/corpora/sharded_corpus.py index db4da9f361..d2d8301019 100644 --- a/gensim/corpora/sharded_corpus.py +++ b/gensim/corpora/sharded_corpus.py @@ -44,7 +44,6 @@ from gensim.interfaces import TransformedCorpus - class ShardedCorpus(IndexedCorpus): """ This corpus is designed for situations where you need to train a model @@ -237,7 +236,7 @@ def __init__(self, output_prefix, corpus, dim=None, # corresponds to index 0 of current shard logger.info('Initializing sharded corpus with prefix ' - '{0}'.format(output_prefix)) + '{0}'.format(output_prefix)) if (not os.path.isfile(output_prefix)) or overwrite: logger.info('Building from corpus...') self.init_shards(output_prefix, corpus, shardsize) @@ -266,9 +265,9 @@ def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtyp '{0}'.format(proposed_dim)) else: logger.warn('Dataset dimension derived from input corpus diffe' - 'rs from initialization argument, using corpus.' - '(corpus {0}, init arg {1})'.format(proposed_dim, - self.dim)) + 'rs from initialization argument, using corpus.' + '(corpus {0}, init arg {1})'.format(proposed_dim, + self.dim)) self.dim = proposed_dim self.offsets = [0] @@ -282,7 +281,7 @@ def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtyp current_shard = numpy.zeros((len(doc_chunk), self.dim), dtype=dtype) logger.debug('Current chunk dimension: ' - '{0} x {1}'.format(len(doc_chunk), self.dim)) + '{0} x {1}'.format(len(doc_chunk), self.dim)) for i, doc in enumerate(doc_chunk): doc = dict(doc) @@ -313,8 +312,8 @@ def init_by_clone(self): logger.info('Loaded dataset dimension: {0}'.format(temp.dim)) else: logger.warn('Loaded dataset dimension differs from init arg ' - 'dimension, using loaded dim. ' - '(loaded {0}, init {1})'.format(temp.dim, self.dim)) + 'dimension, using loaded dim. 
' + '(loaded {0}, init {1})'.format(temp.dim, self.dim)) self.dim = temp.dim # To be consistent with the loaded data! @@ -329,7 +328,7 @@ def save_shard(self, shard, n=None, filename=None): """ new_shard = False if n is None: - n = self.n_shards # Saving the *next* one by default. + n = self.n_shards # Saving the *next* one by default. new_shard = True if not filename: @@ -345,7 +344,7 @@ def load_shard(self, n): """ Load (unpickle) the n-th shard as the "live" part of the dataset into the Dataset object.""" - # logger.debug('ShardedCorpus loading shard {0}, ' + #logger.debug('ShardedCorpus loading shard {0}, ' # 'current shard: {1}'.format(n, self.current_shard_n)) # No-op if the shard is already open. @@ -414,9 +413,9 @@ def in_next(self, offset): """ if self.current_shard_n == self.n_shards: - return False # There's no next shard. + return False # There's no next shard. return (self.offsets[self.current_shard_n + 1] <= offset) \ - and (offset < self.offsets[self.current_shard_n + 2]) + and (offset < self.offsets[self.current_shard_n + 2]) def resize_shards(self, shardsize): """ @@ -472,8 +471,8 @@ def resize_shards(self, shardsize): os.remove(old_shard_name) except Exception as e: logger.error('Exception occurred during old shard no. {0} ' - 'removal: {1}.\nAttempting to at least move ' - 'new shards in.'.format(old_shard_n, str(e))) + 'removal: {1}.\nAttempting to at least move ' + 'new shards in.'.format(old_shard_n, str(e))) finally: # If something happens with cleaning up - try to at least get the # new guys in. @@ -529,17 +528,17 @@ def _guess_n_features(self, corpus): else: if not self.dim: raise TypeError('Couldn\'t find number of features, ' - 'refusing to guess (dimension set to {0},' - 'type of corpus: {1}).'.format(self.dim, type(corpus))) + 'refusing to guess (dimension set to {0},' + 'type of corpus: {1}).'.format(self.dim, type(corpus))) else: logger.warn('Couldn\'t find number of features, trusting ' - 'supplied dimension ({0})'.format(self.dim)) + 'supplied dimension ({0})'.format(self.dim)) n_features = self.dim if self.dim and n_features != self.dim: logger.warn('Discovered inconsistent dataset dim ({0}) and ' - 'feature count from corpus ({1}). Coercing to dimension' - ' given by argument.'.format(self.dim, n_features)) + 'feature count from corpus ({1}). Coercing to dimension' + ' given by argument.'.format(self.dim, n_features)) return n_features @@ -605,7 +604,7 @@ def __getitem__(self, offset): # This fails on one-past # slice indexing; that's why there's a code branch here. - # logger.debug('ShardedCorpus: Retrieving slice {0}: ' + #logger.debug('ShardedCorpus: Retrieving slice {0}: ' # 'shard {1}'.format((offset.start, offset.stop), # (first_shard, last_shard))) @@ -614,7 +613,7 @@ def __getitem__(self, offset): # The easy case: both in one shard. 
if first_shard == last_shard: s_result = self.current_shard[start - self.current_offset: - stop - self.current_offset] + stop - self.current_offset] # Handle different sparsity settings: s_result = self._getitem_format(s_result) @@ -650,13 +649,13 @@ def __getitem__(self, offset): shard_stop = self.offsets[self.current_shard_n + 1] - \ self.current_offset - # s_result[result_start:result_stop] = self.current_shard[ + #s_result[result_start:result_stop] = self.current_shard[ # shard_start:shard_stop] s_result = self.__add_to_slice(s_result, result_start, result_stop, shard_start, shard_stop) # First and last get special treatment, these are in between - for shard_n in xrange(first_shard + 1, last_shard): + for shard_n in xrange(first_shard+1, last_shard): self.load_shard(shard_n) result_start = result_stop @@ -747,7 +746,7 @@ def _getitem_sparse2gensim(self, result): """ def row_sparse2gensim(row_idx, csr_matrix): - indices = csr_matrix.indices[csr_matrix.indptr[row_idx]:csr_matrix.indptr[row_idx + 1]] + indices = csr_matrix.indices[csr_matrix.indptr[row_idx]:csr_matrix.indptr[row_idx+1]] g_row = [(col_idx, csr_matrix[row_idx, col_idx]) for col_idx in indices] return g_row diff --git a/gensim/corpora/svmlightcorpus.py b/gensim/corpora/svmlightcorpus.py index 5e24419421..4fdc764b16 100644 --- a/gensim/corpora/svmlightcorpus.py +++ b/gensim/corpora/svmlightcorpus.py @@ -58,7 +58,7 @@ def __init__(self, fname, store_labels=True): IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s" % fname) - self.fname = fname # input file, see class doc for format + self.fname = fname # input file, see class doc for format self.length = None self.store_labels = store_labels self.labels = [] @@ -94,7 +94,7 @@ def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): offsets = [] with utils.smart_open(fname, 'wb') as fout: for docno, doc in enumerate(corpus): - label = labels[docno] if labels else 0 # target class is 0 by default + label = labels[docno] if labels else 0 # target class is 0 by default offsets.append(fout.tell()) fout.write(utils.to_utf8(SvmLightCorpus.doc2line(doc, label))) return offsets @@ -114,12 +114,12 @@ def line2doc(self, line): line = utils.to_unicode(line) line = line[: line.find('#')].strip() if not line: - return None # ignore comments and empty lines + return None # ignore comments and empty lines parts = line.split() if not parts: raise ValueError('invalid line format in %s' % self.fname) target, fields = parts[0], [part.rsplit(':', 1) for part in parts[1:]] - doc = [(int(p1) - 1, float(p2)) for p1, p2 in fields if p1 != 'qid'] # ignore 'qid' features, convert 1-based feature ids to 0-based + doc = [(int(p1) - 1, float(p2)) for p1, p2 in fields if p1 != 'qid'] # ignore 'qid' features, convert 1-based feature ids to 0-based return doc, target @staticmethod @@ -127,7 +127,7 @@ def doc2line(doc, label=0): """ Output the document in SVMlight format, as a string. Inverse function to `line2doc`. 
""" - pairs = ' '.join("%i:%s" % (termid + 1, termval) for termid, termval in doc) # +1 to convert 0-base to 1-base + pairs = ' '.join("%i:%s" % (termid + 1, termval) for termid, termval in doc) # +1 to convert 0-base to 1-base return "%s %s\n" % (label, pairs) # endclass SvmLightCorpus diff --git a/gensim/corpora/ucicorpus.py b/gensim/corpora/ucicorpus.py index 62f6213052..44b2a772d9 100644 --- a/gensim/corpora/ucicorpus.py +++ b/gensim/corpora/ucicorpus.py @@ -51,7 +51,7 @@ def __init__(self, input): pass logger.info('accepted corpus with %i documents, %i features, %i non-zero entries' % - (self.num_docs, self.num_terms, self.num_nnz)) + (self.num_docs, self.num_terms, self.num_nnz)) def skip_headers(self, input_file): for lineno, _ in enumerate(input_file): @@ -118,7 +118,7 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False): offsets.append(posnow) poslast = posnow - vector = [(x, int(y)) for (x, y) in bow if int(y) != 0] # integer count, not floating weights + vector = [(x, int(y)) for (x, y) in bow if int(y) != 0] # integer count, not floating weights max_id, veclen = writer.write_vector(docno, vector) num_terms = max(num_terms, 1 + max_id) num_nnz += veclen @@ -126,10 +126,10 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False): if num_docs * num_terms != 0: logger.info("saved %ix%i matrix, density=%.3f%% (%i/%i)" % - (num_docs, num_terms, - 100.0 * num_nnz / (num_docs * num_terms), - num_nnz, - num_docs * num_terms)) + (num_docs, num_terms, + 100.0 * num_nnz / (num_docs * num_terms), + num_nnz, + num_docs * num_terms)) # now write proper headers, by seeking and overwriting the spaces written earlier writer.update_headers(num_docs, num_terms, num_nnz) @@ -165,7 +165,7 @@ def __iter__(self): (yielding one document at a time). """ for docId, doc in super(UciCorpus, self).__iter__(): - yield doc # get rid of docId, return the sparse vector only + yield doc # get rid of docId, return the sparse vector only def create_dictionary(self): """ diff --git a/gensim/topic_coherence/aggregation.py b/gensim/topic_coherence/aggregation.py index 2338c47cc2..7c345d8812 100644 --- a/gensim/topic_coherence/aggregation.py +++ b/gensim/topic_coherence/aggregation.py @@ -14,7 +14,6 @@ logger = logging.getLogger(__name__) - def arithmetic_mean(confirmed_measures): """ This functoin performs the arithmetic mean aggregation on the output obtained from diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py index ca3e8a0b6d..83227822e9 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -15,7 +15,6 @@ EPSILON = 1e-12 # Should be small. Value as suggested in paper. 
- def log_conditional_probability(segmented_topics, per_topic_postings, num_docs): """ This function calculates the log-conditional-probability measure @@ -43,7 +42,6 @@ def log_conditional_probability(segmented_topics, per_topic_postings, num_docs): return m_lc - def log_ratio_measure(segmented_topics, per_topic_postings, num_docs, normalize=False): """ If normalize=False: diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index a3071c7ca6..c68206a372 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -48,7 +48,6 @@ def _present(w_prime_star, w, w_backtrack): return -1 return index - def _make_seg(w_prime, w, per_topic_postings, measure, gamma, backtrack, num_docs): """ Internal helper function to return context vectors for segmentations. @@ -70,7 +69,6 @@ def _make_seg(w_prime, w, per_topic_postings, measure, gamma, backtrack, num_doc context_vectors[w_j] = backtrack[(w_prime, w_j)] ** gamma return (context_vectors, backtrack) - def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gamma, num_docs): """ This function calculates the indirect cosine measure. Given context vectors diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index ac0932d60d..a76f40db4c 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -17,7 +17,6 @@ logger = logging.getLogger(__name__) - def _ret_top_ids(segmented_topics): """ Helper function to return a set of all the unique topic ids in segmented topics. @@ -32,7 +31,6 @@ def _ret_top_ids(segmented_topics): top_ids.add(id) return top_ids - def p_boolean_document(corpus, segmented_topics): """ This function performs the boolean document probability estimation. Boolean document estimates the probability @@ -60,7 +58,6 @@ def p_boolean_document(corpus, segmented_topics): num_docs = len(corpus) return (per_topic_postings, num_docs) - def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): """ This function performs the boolean sliding window probability estimation. Boolean sliding window diff --git a/gensim/topic_coherence/segmentation.py b/gensim/topic_coherence/segmentation.py index 1b4b05c6b4..9a2a58b060 100644 --- a/gensim/topic_coherence/segmentation.py +++ b/gensim/topic_coherence/segmentation.py @@ -13,7 +13,6 @@ logger = logging.getLogger(__name__) - def s_one_pre(topics): """ This function performs s_one_pre segmentation on a list of topics. @@ -44,7 +43,6 @@ def s_one_pre(topics): return s_one_pre - def s_one_one(topics): """ This function performs s_one_one segmentation on a list of topics. @@ -78,7 +76,6 @@ def s_one_one(topics): return s_one_one - def s_one_set(topics): """ This function performs s_one_set segmentation on a list of topics.
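
Aside from whitespace, comment-spacing and logging-format cleanups, the one substantive change in this patch is in `gensim/corpora/dictionary.py`: `Dictionary.filter_n_most_frequent()` previously computed and logged the most frequent token ids but never removed them; the added `self.filter_tokens(bad_ids=most_frequent_ids)` call performs the actual filtering (and `filter_tokens` compacts the ids afterwards, as the surrounding comment notes). Below is a minimal sketch of the resulting behaviour; the toy corpus and the printed values are illustrative only and are not part of the patch.

```python
# Minimal sketch: with this patch, filter_n_most_frequent() actually drops
# the top-N tokens by document frequency instead of only logging them.
from gensim.corpora import Dictionary

# Hypothetical toy corpus (list of tokenized documents), for illustration only.
texts = [
    ['human', 'interface', 'computer'],
    ['survey', 'user', 'computer', 'system', 'response', 'time'],
    ['eps', 'user', 'interface', 'system'],
    ['system', 'human', 'system', 'eps'],
]

dictionary = Dictionary(texts)
print(len(dictionary))      # vocabulary size before filtering

# Remove the 2 tokens with the highest document frequency.
dictionary.filter_n_most_frequent(2)

print(len(dictionary))      # with the patch: 2 tokens fewer
print(dictionary.token2id)  # remaining ids are compacted, frequent tokens gone
```

Before this patch the two `print(len(dictionary))` calls would report the same size, since the method only logged the candidates for removal.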