Commit 5687b24

added documentation to textdoc methods; stripped _distance from all distance functions; updated changelog

Burton DeWilde committed Jul 14, 2016
1 parent 03dab53 commit 5687b24
Showing 4 changed files with 156 additions and 43 deletions.
8 changes: 5 additions & 3 deletions CHANGELOG.rst
@@ -6,16 +6,18 @@ dev

Changes:

- Added `.save()` methods and `.load()` classmethods to both `TextDoc` and `TextCorpus` classes, which allows for fast serialization of parsed documents and associated metadata to/from disk.
- caveat: if `spacy.Vocab` object used to serialize and deserialize is not the same, there will be problems, making this format useful as short-term but not long-term storage
- `TextCorpus` may now be instantiated with an already-loaded spaCy pipeline, which may or may not have all models loaded; it can still be instantiated using a language code string ('en', 'de') to load a spaCy pipeline that includes all models by default
- New features for `TextDoc` and `TextCorpus` classes
- added `.save()` methods and `.load()` classmethods, which allow for fast serialization of parsed documents/corpora and associated metadata to/from disk (see the sketch just below this list of changes). One important caveat: if the `spacy.Vocab` object used to serialize and deserialize is not the same, there will be problems, making this format useful for short-term but not long-term storage
- `TextCorpus` may now be instantiated with an already-loaded spaCy pipeline, which may or may not have all models loaded; it can still be instantiated using a language code string ('en', 'de') to load a spaCy pipeline that includes all models by default
- `TextDoc` methods wrapping `extract` and `keyterms` functions now have full documentation rather than forwarding users to the wrapped functions themselves; more irritating on the dev side, but much less irritating on the user side :)
- Added a `distance.py` module containing several document, set, and string distance metrics
- word movers: document distance as distance between individual words represented by word2vec vectors, normalized
- "word2vec": token, span, or document distance as cosine distance between (average) word2vec representations, normalized
- jaccard: string or set(string) distance as intersection / overlap, normalized, with optional fuzzy-matching across set members
- hamming: distance between two strings as number of substitutions, optionally normalized
- levenshtein: distance between two strings as number of substitutions, deletions, and insertions, optionally normalized (and removed a redundant function from the still-orphaned `math_utils.py` module)
- jaro-winkler: distance between two strings with variable prefix weighting, normalized
- Added `most_discriminating_terms()` function to `keyterms` module to take a collection of documents split into two exclusive groups and compute the most discriminating terms for group1-and-not-group2 as well as group2-and-not-group1
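
To make the new serialization feature concrete, here is a minimal sketch of the save/load round-trip. The exact `.save()`/`.load()` signatures are not shown in this diff, so the target path argument below is an assumption; the `spacy.Vocab` caveat noted above still applies.

```python
# Sketch only: the target argument for .save()/.load() is assumed, not taken from this diff.
import textacy

doc = textacy.TextDoc("Burton packed up the parser and shipped the release.", lang='en')
doc.save('saved_doc')  # hypothetical target path

# Per the changelog caveat: deserialize with the same spacy.Vocab that serialized the doc,
# otherwise the stored token attributes will not line up correctly.
same_doc = textacy.TextDoc.load('saved_doc')
```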

Bugfixes:

38 changes: 19 additions & 19 deletions tests/test_distance.py
@@ -13,68 +13,68 @@ def setUp(self):
self.doc1 = textacy.TextDoc(self.text1, lang='en')
self.doc2 = textacy.TextDoc(self.text2, lang='en')

def test_word_movers_distance(self):
def test_word_movers(self):
metrics = ('cosine', 'l1', 'manhattan', 'l2', 'euclidean')
expected_values = (0.467695, 0.655712, 0.655712, 0.668999, 0.668999)
for metric, expected_value in zip(metrics, expected_values):
self.assertAlmostEqual(
textacy.distance.word_movers_distance(self.doc1, self.doc2, metric=metric),
textacy.distance.word_movers(self.doc1, self.doc2, metric=metric),
expected_value,
places=4)

def test_word2vec_distance(self):
def test_word2vec(self):
pairs = ((self.doc1, self.doc2),
(self.doc1[-2:], self.doc2[-2:]),
(self.doc1[-1], self.doc2[-1]))
expected_values = (0.089036, 0.238299, 0.500000)
for pair, expected_value in zip(pairs, expected_values):
self.assertAlmostEqual(
textacy.distance.word2vec_distance(pair[0], pair[1]),
textacy.distance.word2vec(pair[0], pair[1]),
expected_value,
places=4)

def test_jaccard_distance(self):
def test_jaccard(self):
pairs = ((self.text1, self.text2),
(self.text1.split(), self.text2.split()))
expected_values = (0.541666, 0.909090)
for pair, expected_value in zip(pairs, expected_values):
self.assertAlmostEqual(
textacy.distance.jaccard_distance(pair[0], pair[1]),
textacy.distance.jaccard(pair[0], pair[1]),
expected_value,
places=4)

def test_jaccard_distance_exception(self):
def test_jaccard_exception(self):
self.assertRaises(
ValueError, textacy.distance.jaccard_distance,
ValueError, textacy.distance.jaccard,
self.text1, self.text2, True)

def test_jaccard_distance_fuzzy_match(self):
def test_jaccard_fuzzy_match(self):
thresholds = (50, 70, 90)
expected_values = (0.545454, 0.727272, 0.909090)
for thresh, expected_value in zip(thresholds, expected_values):
self.assertAlmostEqual(
textacy.distance.jaccard_distance(self.text1.split(), self.text2.split(),
fuzzy_match=True, match_threshold=thresh),
textacy.distance.jaccard(self.text1.split(), self.text2.split(),
fuzzy_match=True, match_threshold=thresh),
expected_value,
places=4)

def test_hamming_distance(self):
def test_hamming(self):
self.assertEqual(
textacy.distance.hamming_distance(self.text1, self.text2),
textacy.distance.hamming(self.text1, self.text2),
34)
self.assertEqual(
textacy.distance.hamming_distance(self.text1, self.text2, normalize=True),
textacy.distance.hamming(self.text1, self.text2, normalize=True),
0.8717948717948718)

def test_levenshtein_distance(self):
def test_levenshtein(self):
self.assertEqual(
textacy.distance.levenshtein_distance(self.text1, self.text2),
textacy.distance.levenshtein(self.text1, self.text2),
25)
self.assertEqual(
textacy.distance.levenshtein_distance(self.text1, self.text2, normalize=True),
textacy.distance.levenshtein(self.text1, self.text2, normalize=True),
0.6410256410256411)

def test_jaro_winkler_distance(self):
def test_jaro_winkler(self):
self.assertEqual(
textacy.distance.jaro_winkler_distance(self.text1, self.text2),
textacy.distance.jaro_winkler(self.text1, self.text2),
0.4281995781995781)
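
The tests above exercise `word_movers` across several underlying metrics. A standalone sketch of the same call pattern, assuming an English spaCy model with word vectors (and the `emd` dependency used by `distance.py`) is installed; no output values are claimed here:

```python
# Sketch of the call pattern from test_word_movers; values depend on the installed word vectors.
import textacy

doc1 = textacy.TextDoc("Mary spent all afternoon reading by the window.", lang='en')
doc2 = textacy.TextDoc("She read novels at the window until evening.", lang='en')

# metric names taken from the test above
for metric in ('cosine', 'l1', 'manhattan', 'l2', 'euclidean'):
    print(metric, round(textacy.distance.word_movers(doc1, doc2, metric=metric), 4))
```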
12 changes: 6 additions & 6 deletions textacy/distance.py
@@ -20,7 +20,7 @@
from textacy.compat import str


def word_movers_distance(doc1, doc2, metric='cosine'):
def word_movers(doc1, doc2, metric='cosine'):
"""
Measure the semantic distance between two documents using Word Movers Distance.
@@ -71,7 +71,7 @@ def word_movers_distance(doc1, doc2, metric='cosine'):
return emd(vec1, vec2, distance_mat)


def word2vec_distance(obj1, obj2):
def word2vec(obj1, obj2):
"""
Measure the semantic similarity between one TextDoc or spacy Doc, Span, Token,
or Lexeme and another like object as the cosine distance between the objects'
@@ -91,7 +91,7 @@ def word2vec_distance(obj1, obj2):
return 1.0 - obj1.similarity(obj2)


def jaccard_distance(obj1, obj2, fuzzy_match=False, match_threshold=80):
def jaccard(obj1, obj2, fuzzy_match=False, match_threshold=80):
"""
Measure the semantic distance between two strings or sequences of strings
using Jaccard distance, with optional fuzzy matching of non-identical pairs
@@ -130,7 +130,7 @@ def jaccard_distance(obj1, obj2, fuzzy_match=False, match_threshold=80):
return 1.0 - (intersection / union)


def hamming_distance(str1, str2, normalize=False):
def hamming(str1, str2, normalize=False):
"""
Measure the distance between two strings using Hamming distance, which simply
gives the number of characters in the strings that are different, i.e. the
@@ -167,7 +167,7 @@ def hamming_distance(str1, str2, normalize=False):
return distance


def levenshtein_distance(str1, str2, normalize=False):
def levenshtein(str1, str2, normalize=False):
"""
Measure the distance between two strings using Levenshtein distance, which
gives the minimum number of character insertions, deletions, and substitutions
@@ -190,7 +190,7 @@ def levenshtein_distance(str1, str2, normalize=False):
return distance


def jaro_winkler_distance(str1, str2, prefix_weight=0.1):
def jaro_winkler(str1, str2, prefix_weight=0.1):
"""
Measure the distance between two strings using the Jaro-Winkler similarity metric,
a modification of the Jaro metric that gives more weight to a shared prefix.
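
With the `_distance` suffix stripped, call sites read more naturally, e.g. `textacy.distance.hamming(...)` rather than `textacy.distance.hamming_distance(...)`. A short sketch of the string-based metrics, using the signatures shown in the diff above; the vector-based functions (`word2vec`, `word_movers`) additionally require a spaCy model with word vectors, so they are omitted here. Printed values are not claimed.

```python
# Sketch of the renamed string-distance functions; argument defaults as shown in the diff above.
import textacy

print(textacy.distance.levenshtein('kitten', 'sitting'))                  # number of edits
print(textacy.distance.levenshtein('kitten', 'sitting', normalize=True))  # scaled by string length
print(textacy.distance.hamming('karolin', 'kathrin'))                     # differing positions (equal-length strings)
print(textacy.distance.hamming('karolin', 'kathrin', normalize=True))
print(textacy.distance.jaro_winkler('kitten', 'sitting'))                 # prefix-weighted, normalized

# jaccard accepts strings or sequences of strings, optionally with fuzzy matching of set members
tokens1 = 'the quick brown fox'.split()
tokens2 = 'the quick brown dog'.split()
print(textacy.distance.jaccard(tokens1, tokens2))
print(textacy.distance.jaccard(tokens1, tokens2, fuzzy_match=True, match_threshold=80))
```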
141 changes: 126 additions & 15 deletions textacy/texts.py
@@ -372,9 +372,27 @@ def words(self, **kwargs):
Extract an ordered sequence of words from a spacy-parsed doc, optionally
filtering words by part-of-speech (etc.) and frequency.
.. seealso:: :func:`extract.words() <textacy.extract.words>` for all function kwargs.
Args:
**kwargs:
filter_stops (bool, optional): if True, remove stop words from word list
filter_punct (bool, optional): if True, remove punctuation from word list
filter_nums (bool, optional): if True, remove number-like words
(e.g. 10, 'ten') from word list
good_pos_tags (set[str], optional): remove words whose part-of-speech tag
is NOT in the specified tags, drawn from the universal POS tagset
bad_pos_tags (set[str], optional): remove words whose part-of-speech tag
IS in the specified tags, drawn from the universal POS tagset
min_freq (int, optional): remove words that occur in `doc` fewer than
`min_freq` times
Yields:
``spacy.Token``: the next token passing all specified filters,
in order of appearance in the document
.. seealso:: :func:`extract.words() <textacy.extract.words>`
"""
return extract.words(self.spacy_doc, **kwargs)
for word in extract.words(self.spacy_doc, **kwargs):
yield word

def ngrams(self, n, **kwargs):
"""
@@ -385,30 +403,75 @@ def ngrams(self, n, **kwargs):
Args:
n (int): number of tokens to include in n-grams;
1 => unigrams, 2 => bigrams
**kwargs:
filter_stops (bool, optional): if True, remove ngrams that start or end
with a stop word
filter_punct (bool, optional): if True, remove ngrams that contain
any punctuation-only tokens
filter_nums (bool, optional): if True, remove ngrams that contain
any numbers or number-like tokens (e.g. 10, 'ten')
good_pos_tags (set[str], optional): remove ngrams whose constituent
tokens' part-of-speech tags are NOT all in the specified tags,
using the universal POS tagset
bad_pos_tags (set[str], optional): remove ngrams if any of their constituent
tokens' part-of-speech tags are in the specified tags,
using the universal POS tagset
min_freq (int, optional): remove ngrams that occur in `doc` fewer than
`min_freq` times
.. seealso:: :func:`extract.ngrams() <textacy.extract.ngrams>` for all function kwargs.
Yields:
``spacy.Span``: the next ngram passing all specified filters,
in order of appearance in the document
.. seealso:: :func:`extract.ngrams() <textacy.extract.ngrams>`
"""
return extract.ngrams(self.spacy_doc, n, **kwargs)
for ngram in extract.ngrams(self.spacy_doc, n, **kwargs):
yield ngram

def named_entities(self, **kwargs):
"""
Extract an ordered sequence of named entities (PERSON, ORG, LOC, etc.) from
doc, optionally filtering by the entity types and frequencies.
Args:
**kwargs:
good_ne_types (set[str] or 'numeric', optional): named entity types to
include; if "numeric", all numeric entity types are included
bad_ne_types (set[str] or 'numeric', optional): named entity types to
exclude; if "numeric", all numeric entity types are excluded
min_freq (int, optional): remove named entities that occur in `doc` fewer
than `min_freq` times
drop_determiners (bool, optional): remove leading determiners (e.g. "the")
from named entities (e.g. "the United States" => "United States")
Yields:
``spacy.Span``: the next named entity passing all specified filters,
in order of appearance in the document
.. seealso:: :func:`extract.named_entities() <textacy.extract.named_entities>`
for all function kwargs.
"""
return extract.named_entities(self.spacy_doc, **kwargs)
for ne in extract.named_entities(self.spacy_doc, **kwargs):
yield ne

def noun_chunks(self, **kwargs):
"""
Extract an ordered sequence of noun phrases from doc, optionally
filtering by frequency and dropping leading determiners.
Args:
**kwargs:
drop_determiners (bool, optional): remove leading determiners (e.g. "the")
from phrases (e.g. "the quick brown fox" => "quick brown fox")
min_freq (int, optional): remove chunks that occur in `doc` fewer than
`min_freq` times
Yields:
``spacy.Span``: the next noun chunk, in order of appearance in the document
.. seealso:: :func:`extract.noun_chunks() <textacy.extract.noun_chunks>`
for all function kwargs.
"""
return extract.noun_chunks(self.spacy_doc, **kwargs)
for nc in extract.noun_chunks(self.spacy_doc, **kwargs):
yield nc

def pos_regex_matches(self, pattern):
"""
@@ -429,26 +492,44 @@ def pos_regex_matches(self, pattern):
* compound nouns: r'<NOUN>+'
* verb phrase: r'<VERB>?<ADV>*<VERB>+'
* prepositional phrase: r'<PREP> <DET>? (<NOUN>+<ADP>)* <NOUN>+'
Yields:
``spacy.Span``: the next span of consecutive tokens whose parts-of-speech
match ``pattern``, in order of appearance in the document
"""
return extract.pos_regex_matches(self.spacy_doc, pattern)
for match in extract.pos_regex_matches(self.spacy_doc, pattern):
yield match

def subject_verb_object_triples(self):
"""
Extract an *un*ordered sequence of distinct subject-verb-object (SVO) triples
from doc.
Yields:
(``spacy.Span``, ``spacy.Span``, ``spacy.Span``): the next 3-tuple
representing a (subject, verb, object) triple, in order of appearance
"""
return extract.subject_verb_object_triples(self.spacy_doc)
for svo in extract.subject_verb_object_triples(self.spacy_doc):
yield svo

def acronyms_and_definitions(self, **kwargs):
def acronyms_and_definitions(self, known_acro_defs=None):
"""
Extract a collection of acronyms and their most likely definitions,
if available, from doc. If multiple definitions are found for a given acronym,
only the most frequently occurring definition is returned.
Args:
known_acro_defs (dict, optional): if certain acronym/definition pairs
are known, pass them in as {acronym (str): definition (str)};
algorithm will not attempt to find new definitions
Returns:
dict: unique acronyms (keys) with matched definitions (values)
.. seealso:: :func:`extract.acronyms_and_definitions() <textacy.extract.acronyms_and_definitions>`
for all function kwargs.
"""
return extract.acronyms_and_definitions(self.spacy_doc, **kwargs)
return extract.acronyms_and_definitions(self.spacy_doc, known_acro_defs=known_acro_defs)

def semistructured_statements(self, entity, **kwargs):
"""
@@ -458,18 +539,40 @@ def semistructured_statements(self, entity, **kwargs):
Args:
entity (str): a noun or noun phrase of some sort (e.g. "President Obama",
"global warming", "Python")
**kwargs:
cue (str, optional): verb lemma with which `entity` is associated
(e.g. "talk about", "have", "write")
ignore_entity_case (bool, optional): if True, entity matching is
case-independent
min_n_words (int, optional): min number of tokens allowed in a
matching fragment
max_n_words (int, optional): max number of tokens allowed in a
matching fragment
Yields:
(``spacy.Span`` or ``spacy.Token``, ``spacy.Span`` or ``spacy.Token``, ``spacy.Span``):
where each element is a matching (entity, cue, fragment) triple
.. seealso:: :func:`extract.semistructured_statements() <textacy.extract.semistructured_statements>`
for all function kwargs.
"""
return extract.semistructured_statements(self.spacy_doc, entity, **kwargs)
for sss in extract.semistructured_statements(self.spacy_doc, entity, **kwargs):
yield sss

def direct_quotations(self):
"""
Baseline, not-great attempt at direct quotation extraction (no indirect
or mixed quotations) using rules and patterns. English only.
Yields:
(``spacy.Span``, ``spacy.Token``, ``spacy.Span``): next quotation
represented as a (speaker, reporting verb, quotation) 3-tuple
.. seealso:: :func:`extract.direct_quotations() <textacy.extract.direct_quotations>`
"""
return extract.direct_quotations(self.spacy_doc)
if self.lang != 'en':
raise NotImplementedError('sorry, English-language texts only :(')
for dq in extract.direct_quotations(self.spacy_doc):
yield dq

def key_terms(self, algorithm='sgrank', n=10):
"""
@@ -482,8 +585,16 @@ def key_terms(self, algorithm='sgrank', n=10):
as keyterms; if float, must be in the open interval (0.0, 1.0),
representing the fraction of top-ranked terms to return as keyterms
Returns:
list[(str, float)]: sorted list of top `n` key terms and their
corresponding scores
Raises:
ValueError: if ``algorithm`` not in {'sgrank', 'textrank', 'singlerank'}
.. seealso:: :func:`keyterms.sgrank() <textacy.keyterms.sgrank>`
.. seealso:: :func:`keyterms.textrank() <textacy.keyterms.textrank>`
.. seealso:: :func:`keyterms.singlerank() <textacy.keyterms.singlerank>`
"""
if algorithm == 'sgrank':
return keyterms.sgrank(self.spacy_doc, window_width=1500, n_keyterms=n)
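
To round out the new documentation, here is a sketch of how the documented `TextDoc` methods might be called, using only the method names and keyword arguments shown in the diff above. It assumes an English spaCy model is installed; no specific output is claimed.

```python
# Sketch of the newly documented TextDoc methods; kwargs come from the docstrings above.
import textacy

text = ("President Obama talked about climate change at the summit. "
        "The quick brown fox jumped over the lazy dog.")
doc = textacy.TextDoc(text, lang='en')

words = list(doc.words(filter_stops=True, filter_punct=True))
bigrams = list(doc.ngrams(2, filter_stops=True, min_freq=1))
entities = list(doc.named_entities(drop_determiners=True))
chunks = list(doc.noun_chunks(drop_determiners=True))
compounds = list(doc.pos_regex_matches(r'<NOUN>+'))   # compound-noun pattern from the docstring
svos = list(doc.subject_verb_object_triples())
acros = doc.acronyms_and_definitions()                 # dict of {acronym: definition}
top_terms = doc.key_terms(algorithm='textrank', n=5)   # list of (term, score) pairs

for entity, cue, fragment in doc.semistructured_statements('President Obama', cue='talk about'):
    print(entity, cue, fragment)
```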
