Commit 5687b24

added documentation to textdoc methods; stripped _distance from all distance functions; updated changelog

Burton DeWilde committed Jul 14, 2016
1 parent 03dab53 commit 5687b24
Showing 4 changed files with 156 additions and 43 deletions.
8 changes: 5 additions & 3 deletions CHANGELOG.rst
@@ -6,16 +6,18 @@ dev

Changes:

- Added `.save()` methods and `.load()` classmethods to both `TextDoc` and `TextCorpus` classes, which allows for fast serialization of parsed documents and associated metadata to/from disk.
- caveat: if `spacy.Vocab` object used to serialize and deserialize is not the same, there will be problems, making this format useful as short-term but not long-term storage
- `TextCorpus` may now be instantiated with an already-loaded spaCy pipeline, which may or may not have all models loaded; it can still be instantiated using a language code string ('en', 'de') to load a spaCy pipeline that includes all models by default
- New features for `TextDoc` and `TextCorpus` classes
- added `.save()` methods and `.load()` classmethods, which allow for fast serialization of parsed documents/corpora and associated metadata to/from disk (see the sketch just below this list of changes). One important caveat: if the `spacy.Vocab` object used to serialize and deserialize is not the same, there will be problems, making this format useful for short-term but not long-term storage
- `TextCorpus` may now be instantiated with an already-loaded spaCy pipeline, which may or may not have all models loaded; it can still be instantiated using a language code string ('en', 'de') to load a spaCy pipeline that includes all models by default
- `TextDoc` methods wrapping `extract` and `keyterms` functions now have full documentation rather than forwarding users to the wrapped functions themselves; more irritating on the dev side, but much less irritating on the user side :)
- Added a `distance.py` module containing several document, set, and string distance metrics
- word movers: document distance as distance between individual words represented by word2vec vectors, normalized
- "word2vec": token, span, or document distance as cosine distance between (average) word2vec representations, normalized
- jaccard: string or set(string) distance as intersection / overlap, normalized, with optional fuzzy-matching across set members
- hamming: distance between two strings as number of substitutions, optionally normalized
- levenshtein: distance between two strings as number of substitutions, deletions, and insertions, optionally normalized (and removed a redundant function from the still-orphaned `math_utils.py` module)
- jaro-winkler: distance between two strings with variable prefix weighting, normalized
- Added `most_discriminating_terms()` function to `keyterms` module to take a collection of documents split into two exclusive groups and compute the most discriminating terms for group1-and-not-group2 as well as group2-and-not-group1
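
To make the new serialization feature concrete, here is a minimal sketch of the save/load round-trip. The exact `.save()`/`.load()` signatures are not shown in this diff, so the target path argument below is an assumption; the `spacy.Vocab` caveat noted above still applies.

```python
# Sketch only: the target argument for .save()/.load() is assumed, not taken from this diff.
import textacy

doc = textacy.TextDoc("Burton packed up the parser and shipped the release.", lang='en')
doc.save('saved_doc')  # hypothetical target path

# Per the changelog caveat: deserialize with the same spacy.Vocab that serialized the doc,
# otherwise the stored token attributes will not line up correctly.
same_doc = textacy.TextDoc.load('saved_doc')
```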

Bugfixes:

38 changes: 19 additions & 19 deletions tests/test_distance.py
@@ -13,68 +13,68 @@ def setUp(self):
self.doc1 = textacy.TextDoc(self.text1, lang='en')
self.doc2 = textacy.TextDoc(self.text2, lang='en')

def test_word_movers_distance(self):
def test_word_movers(self):
metrics = ('cosine', 'l1', 'manhattan', 'l2', 'euclidean')
expected_values = (0.467695, 0.655712, 0.655712, 0.668999, 0.668999)
for metric, expected_value in zip(metrics, expected_values):
self.assertAlmostEqual(
textacy.distance.word_movers_distance(self.doc1, self.doc2, metric=metric),
textacy.distance.word_movers(self.doc1, self.doc2, metric=metric),
expected_value,
places=4)

def test_word2vec_distance(self):
def test_word2vec(self):
pairs = ((self.doc1, self.doc2),
(self.doc1[-2:], self.doc2[-2:]),
(self.doc1[-1], self.doc2[-1]))
expected_values = (0.089036, 0.238299, 0.500000)
for pair, expected_value in zip(pairs, expected_values):
self.assertAlmostEqual(
textacy.distance.word2vec_distance(pair[0], pair[1]),
textacy.distance.word2vec(pair[0], pair[1]),
expected_value,
places=4)

def test_jaccard_distance(self):
def test_jaccard(self):
pairs = ((self.text1, self.text2),
(self.text1.split(), self.text2.split()))
expected_values = (0.541666, 0.909090)
for pair, expected_value in zip(pairs, expected_values):
self.assertAlmostEqual(
textacy.distance.jaccard_distance(pair[0], pair[1]),
textacy.distance.jaccard(pair[0], pair[1]),
expected_value,
places=4)

def test_jaccard_distance_exception(self):
def test_jaccard_exception(self):
self.assertRaises(
ValueError, textacy.distance.jaccard_distance,
ValueError, textacy.distance.jaccard,
self.text1, self.text2, True)

def test_jaccard_distance_fuzzy_match(self):
def test_jaccard_fuzzy_match(self):
thresholds = (50, 70, 90)
expected_values = (0.545454, 0.727272, 0.909090)
for thresh, expected_value in zip(thresholds, expected_values):
self.assertAlmostEqual(
textacy.distance.jaccard_distance(self.text1.split(), self.text2.split(),
fuzzy_match=True, match_threshold=thresh),
textacy.distance.jaccard(self.text1.split(), self.text2.split(),
fuzzy_match=True, match_threshold=thresh),
expected_value,
places=4)

def test_hamming_distance(self):
def test_hamming(self):
self.assertEqual(
textacy.distance.hamming_distance(self.text1, self.text2),
textacy.distance.hamming(self.text1, self.text2),
34)
self.assertEqual(
textacy.distance.hamming_distance(self.text1, self.text2, normalize=True),
textacy.distance.hamming(self.text1, self.text2, normalize=True),
0.8717948717948718)

def test_levenshtein_distance(self):
def test_levenshtein(self):
self.assertEqual(
textacy.distance.levenshtein_distance(self.text1, self.text2),
textacy.distance.levenshtein(self.text1, self.text2),
25)
self.assertEqual(
textacy.distance.levenshtein_distance(self.text1, self.text2, normalize=True),
textacy.distance.levenshtein(self.text1, self.text2, normalize=True),
0.6410256410256411)

def test_jaro_winkler_distance(self):
def test_jaro_winkler(self):
self.assertEqual(
textacy.distance.jaro_winkler_distance(self.text1, self.text2),
textacy.distance.jaro_winkler(self.text1, self.text2),
0.4281995781995781)
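
The tests above exercise `word_movers` across several underlying metrics. A standalone sketch of the same call pattern, assuming an English spaCy model with word vectors (and the `emd` dependency used by `distance.py`) is installed; no output values are claimed here:

```python
# Sketch of the call pattern from test_word_movers; values depend on the installed word vectors.
import textacy

doc1 = textacy.TextDoc("Mary spent all afternoon reading by the window.", lang='en')
doc2 = textacy.TextDoc("She read novels at the window until evening.", lang='en')

# metric names taken from the test above
for metric in ('cosine', 'l1', 'manhattan', 'l2', 'euclidean'):
    print(metric, round(textacy.distance.word_movers(doc1, doc2, metric=metric), 4))
```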
12 changes: 6 additions & 6 deletions textacy/distance.py
@@ -20,7 +20,7 @@
from textacy.compat import str


def word_movers_distance(doc1, doc2, metric='cosine'):
def word_movers(doc1, doc2, metric='cosine'):
"""
Measure the semantic distance between two documents using Word Movers Distance.
@@ -71,7 +71,7 @@ def word_movers_distance(doc1, doc2, metric='cosine'):
return emd(vec1, vec2, distance_mat)


def word2vec_distance(obj1, obj2):
def word2vec(obj1, obj2):
"""
Measure the semantic similarity between one TextDoc or spacy Doc, Span, Token,
or Lexeme and another like object as the cosine distance between the objects'
@@ -91,7 +91,7 @@ def word2vec_distance(obj1, obj2):
return 1.0 - obj1.similarity(obj2)


def jaccard_distance(obj1, obj2, fuzzy_match=False, match_threshold=80):
def jaccard(obj1, obj2, fuzzy_match=False, match_threshold=80):
"""
Measure the semantic distance between two strings or sequences of strings
using Jaccard distance, with optional fuzzy matching of non-identical pairs
@@ -130,7 +130,7 @@ def jaccard_distance(obj1, obj2, fuzzy_match=False, match_threshold=80):
return 1.0 - (intersection / union)


def hamming_distance(str1, str2, normalize=False):
def hamming(str1, str2, normalize=False):
"""
Measure the distance between two strings using Hamming distance, which simply
gives the number of characters in the strings that are different, i.e. the
@@ -167,7 +167,7 @@ def hamming_distance(str1, str2, normalize=False):
return distance


def levenshtein_distance(str1, str2, normalize=False):
def levenshtein(str1, str2, normalize=False):
"""
Measure the distance between two strings using Levenshtein distance, which
gives the minimum number of character insertions, deletions, and substitutions
@@ -190,7 +190,7 @@ def levenshtein_distance(str1, str2, normalize=False):
return distance


def jaro_winkler_distance(str1, str2, prefix_weight=0.1):
def jaro_winkler(str1, str2, prefix_weight=0.1):
"""
Measure the distance between two strings using the Jaro-Winkler similarity metric,
a modification of the Jaro metric that gives more weight to a shared prefix.
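
With the `_distance` suffix stripped, call sites read more naturally, e.g. `textacy.distance.hamming(...)` rather than `textacy.distance.hamming_distance(...)`. A short sketch of the string-based metrics, using the signatures shown in the diff above; the vector-based functions (`word2vec`, `word_movers`) additionally require a spaCy model with word vectors, so they are omitted here. Printed values are not claimed.

```python
# Sketch of the renamed string-distance functions; argument defaults as shown in the diff above.
import textacy

print(textacy.distance.levenshtein('kitten', 'sitting'))                  # number of edits
print(textacy.distance.levenshtein('kitten', 'sitting', normalize=True))  # scaled by string length
print(textacy.distance.hamming('karolin', 'kathrin'))                     # differing positions (equal-length strings)
print(textacy.distance.hamming('karolin', 'kathrin', normalize=True))
print(textacy.distance.jaro_winkler('kitten', 'sitting'))                 # prefix-weighted, normalized

# jaccard accepts strings or sequences of strings, optionally with fuzzy matching of set members
tokens1 = 'the quick brown fox'.split()
tokens2 = 'the quick brown dog'.split()
print(textacy.distance.jaccard(tokens1, tokens2))
print(textacy.distance.jaccard(tokens1, tokens2, fuzzy_match=True, match_threshold=80))
```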
141 changes: 126 additions & 15 deletions textacy/texts.py
@@ -372,9 +372,27 @@ def words(self, **kwargs):
Extract an ordered sequence of words from a spacy-parsed doc, optionally
filtering words by part-of-speech (etc.) and frequency.
.. seealso:: :func:`extract.words() <textacy.extract.words>` for all function kwargs.
Args:
**kwargs:
filter_stops (bool, optional): if True, remove stop words from word list
filter_punct (bool, optional): if True, remove punctuation from word list
filter_nums (bool, optional): if True, remove number-like words
(e.g. 10, 'ten') from word list
good_pos_tags (set[str], optional): remove words whose part-of-speech tag
is NOT in the specified tags, drawn from the universal POS tagset
bad_pos_tags (set[str], optional): remove words whose part-of-speech tag
IS in the specified tags, drawn from the universal POS tagset
min_freq (int, optional): remove words that occur in `doc` fewer than
`min_freq` times
Yields:
``spacy.Token``: the next token passing all specified filters,
in order of appearance in the document
.. seealso:: :func:`extract.words() <textacy.extract.words>`
"""
return extract.words(self.spacy_doc, **kwargs)
for word in extract.words(self.spacy_doc, **kwargs):
yield word

def ngrams(self, n, **kwargs):
"""
@@ -385,30 +403,75 @@ def ngrams(self, n, **kwargs):
Args:
n (int): number of tokens to include in n-grams;
1 => unigrams, 2 => bigrams
**kwargs:
filter_stops (bool, optional): if True, remove ngrams that start or end
with a stop word
filter_punct (bool, optional): if True, remove ngrams that contain
any punctuation-only tokens
filter_nums (bool, optional): if True, remove ngrams that contain
any numbers or number-like tokens (e.g. 10, 'ten')
good_pos_tags (set[str], optional): remove ngrams whose constituent
tokens' part-of-speech tags are NOT all in the specified tags,
using the universal POS tagset
bad_pos_tags (set[str], optional): remove ngrams if any of their constituent
tokens' part-of-speech tags are in the specified tags,
using the universal POS tagset
min_freq (int, optional): remove ngrams that occur in `doc` fewer than
`min_freq` times
.. seealso:: :func:`extract.ngrams() <textacy.extract.ngrams>` for all function kwargs.
Yields:
``spacy.Span``: the next ngram passing all specified filters,
in order of appearance in the document
.. seealso:: :func:`extract.ngrams() <textacy.extract.ngrams>`
"""
return extract.ngrams(self.spacy_doc, n, **kwargs)
for ngram in extract.ngrams(self.spacy_doc, n, **kwargs):
yield ngram

def named_entities(self, **kwargs):
"""
Extract an ordered sequence of named entities (PERSON, ORG, LOC, etc.) from
doc, optionally filtering by the entity types and frequencies.
Args:
**kwargs:
good_ne_types (set[str] or 'numeric', optional): named entity types to
include; if "numeric", all numeric entity types are included
bad_ne_types (set[str] or 'numeric', optional): named entity types to
exclude; if "numeric", all numeric entity types are excluded
min_freq (int, optional): remove named entities that occur in `doc` fewer
than `min_freq` times
drop_determiners (bool, optional): remove leading determiners (e.g. "the")
from named entities (e.g. "the United States" => "United States")
Yields:
``spacy.Span``: the next named entity passing all specified filters,
in order of appearance in the document
.. seealso:: :func:`extract.named_entities() <textacy.extract.named_entities>`
for all function kwargs.
"""
return extract.named_entities(self.spacy_doc, **kwargs)
for ne in extract.named_entities(self.spacy_doc, **kwargs):
yield ne

def noun_chunks(self, **kwargs):
"""
Extract an ordered sequence of noun phrases from doc, optionally
filtering by frequency and dropping leading determiners.
Args:
**kwargs:
drop_determiners (bool, optional): remove leading determiners (e.g. "the")
from phrases (e.g. "the quick brown fox" => "quick brown fox")
min_freq (int, optional): remove chunks that occur in `doc` fewer than
`min_freq` times
Yields:
``spacy.Span``: the next noun chunk, in order of appearance in the document
.. seealso:: :func:`extract.noun_chunks() <textacy.extract.noun_chunks>`
for all function kwargs.
"""
return extract.noun_chunks(self.spacy_doc, **kwargs)
for nc in extract.noun_chunks(self.spacy_doc, **kwargs):
yield nc

def pos_regex_matches(self, pattern):
"""
@@ -429,26 +492,44 @@ def pos_regex_matches(self, pattern):
* compound nouns: r'<NOUN>+'
* verb phrase: r'<VERB>?<ADV>*<VERB>+'
* prepositional phrase: r'<PREP> <DET>? (<NOUN>+<ADP>)* <NOUN>+'
Yields:
``spacy.Span``: the next span of consecutive tokens whose parts-of-speech
match ``pattern``, in order of appearance in the document
"""
return extract.pos_regex_matches(self.spacy_doc, pattern)
for match in extract.pos_regex_matches(self.spacy_doc, pattern):
yield match

def subject_verb_object_triples(self):
"""
Extract an *un*ordered sequence of distinct subject-verb-object (SVO) triples
from doc.
Yields:
(``spacy.Span``, ``spacy.Span``, ``spacy.Span``): the next 3-tuple
representing a (subject, verb, object) triple, in order of appearance
"""
return extract.subject_verb_object_triples(self.spacy_doc)
for svo in extract.subject_verb_object_triples(self.spacy_doc):
yield svo

def acronyms_and_definitions(self, **kwargs):
def acronyms_and_definitions(self, known_acro_defs=None):
"""
Extract a collection of acronyms and their most likely definitions,
if available, from doc. If multiple definitions are found for a given acronym,
only the most frequently occurring definition is returned.
Args:
known_acro_defs (dict, optional): if certain acronym/definition pairs
are known, pass them in as {acronym (str): definition (str)};
algorithm will not attempt to find new definitions
Returns:
dict: unique acronyms (keys) with matched definitions (values)
.. seealso:: :func:`extract.acronyms_and_definitions() <textacy.extract.acronyms_and_definitions>`
for all function kwargs.
"""
return extract.acronyms_and_definitions(self.spacy_doc, **kwargs)
return extract.acronyms_and_definitions(self.spacy_doc, known_acro_defs=known_acro_defs)

def semistructured_statements(self, entity, **kwargs):
"""
@@ -458,18 +539,40 @@ def semistructured_statements(self, entity, **kwargs):
Args:
entity (str): a noun or noun phrase of some sort (e.g. "President Obama",
"global warming", "Python")
**kwargs:
cue (str, optional): verb lemma with which `entity` is associated
(e.g. "talk about", "have", "write")
ignore_entity_case (bool, optional): if True, entity matching is
case-independent
min_n_words (int, optional): min number of tokens allowed in a
matching fragment
max_n_words (int, optional): max number of tokens allowed in a
matching fragment
Yields:
(``spacy.Span`` or ``spacy.Token``, ``spacy.Span`` or ``spacy.Token``, ``spacy.Span``):
where each element is a matching (entity, cue, fragment) triple
.. seealso:: :func:`extract.semistructured_statements() <textacy.extract.semistructured_statements>`
for all function kwargs.
"""
return extract.semistructured_statements(self.spacy_doc, entity, **kwargs)
for sss in extract.semistructured_statements(self.spacy_doc, entity, **kwargs):
yield sss

def direct_quotations(self):
"""
Baseline, not-great attempt at direct quotation extraction (no indirect
or mixed quotations) using rules and patterns. English only.
Yields:
(``spacy.Span``, ``spacy.Token``, ``spacy.Span``): next quotation
represented as a (speaker, reporting verb, quotation) 3-tuple
.. seealso:: :func:`extract.direct_quotations() <textacy.extract.direct_quotations>`
"""
return extract.direct_quotations(self.spacy_doc)
if self.lang != 'en':
raise NotImplementedError('sorry, English-language texts only :(')
for dq in extract.direct_quotations(self.spacy_doc):
yield dq

def key_terms(self, algorithm='sgrank', n=10):
"""
@@ -482,8 +585,16 @@ def key_terms(self, algorithm='sgrank', n=10):
as keyterms; if float, must be in the open interval (0.0, 1.0),
representing the fraction of top-ranked terms to return as keyterms
Returns:
list[(str, float)]: sorted list of top `n` key terms and their
corresponding scores
Raises:
ValueError: if ``algorithm`` not in {'sgrank', 'textrank', 'singlerank'}
.. seealso:: :func:`keyterms.sgrank() <textacy.keyterms.sgrank>`
.. seealso:: :func:`keyterms.textrank() <textacy.keyterms.textrank>`
.. seealso:: :func:`keyterms.singlerank() <textacy.keyterms.singlerank>`
"""
if algorithm == 'sgrank':
return keyterms.sgrank(self.spacy_doc, window_width=1500, n_keyterms=n)
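
To round out the new documentation, here is a sketch of how the documented `TextDoc` methods might be called, using only the method names and keyword arguments shown in the diff above. It assumes an English spaCy model is installed; no specific output is claimed.

```python
# Sketch of the newly documented TextDoc methods; kwargs come from the docstrings above.
import textacy

text = ("President Obama talked about climate change at the summit. "
        "The quick brown fox jumped over the lazy dog.")
doc = textacy.TextDoc(text, lang='en')

words = list(doc.words(filter_stops=True, filter_punct=True))
bigrams = list(doc.ngrams(2, filter_stops=True, min_freq=1))
entities = list(doc.named_entities(drop_determiners=True))
chunks = list(doc.noun_chunks(drop_determiners=True))
compounds = list(doc.pos_regex_matches(r'<NOUN>+'))   # compound-noun pattern from the docstring
svos = list(doc.subject_verb_object_triples())
acros = doc.acronyms_and_definitions()                 # dict of {acronym: definition}
top_terms = doc.key_terms(algorithm='textrank', n=5)   # list of (term, score) pairs

for entity, cue, fragment in doc.semistructured_statements('President Obama', cue='talk about'):
    print(entity, cue, fragment)
```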
