Updated changelog, fixed spacy version in setup.py
- Renamed Corpus.vector => Corpus.vectors to avoid misinterpretation
- Py2 bugfixes for wiki_reader re: str/unicode, as usual
Burton DeWilde committed Nov 15, 2016
1 parent 7fc320a commit a5e1c89
Showing 4 changed files with 32 additions and 11 deletions.
19 changes: 19 additions & 0 deletions CHANGELOG.rst
@@ -1,6 +1,25 @@
Changelog
=========

0.3.2 (2016-11-15)
------------------

Changes:

- Preliminary inclusion of custom spaCy pipelines

  - updated ``load_spacy()`` to include explicit ``path`` and ``create_pipeline`` kwargs, and removed the already-deprecated ``load_spacy_pipeline()`` function to avoid confusion around spaCy languages and pipelines
  - added a ``spacy_pipelines`` module to hold implementations of custom spaCy pipelines, including a basic one that merges entities into single tokens
  - note: this necessarily bumped the minimum spaCy version to 1.1.0+
  - see the announcement here: https://explosion.ai/blog/spacy-deep-learning-keras

- To reduce code bloat, made the ``matplotlib`` dependency optional and dropped the ``gensim`` dependency

  - to install ``matplotlib`` at the same time as textacy, do ``$ pip install textacy[viz]``
  - bonus: ``backports.csv`` is now installed only for Py2 users
  - thanks to @mbatchkarov for the request

- Improved performance of ``textacy.corpora.WikiReader().texts()``; results should stream faster and have cleaner plaintext content than when they were produced by ``gensim``

  - this *should* also fix a bug reported in Issue #51 by @baisk

- Added a ``Corpus.vectors`` property that returns a matrix of shape (# documents, vector dim) containing the average word2vec-style vector representation of constituent tokens for all ``Doc`` s


0.3.1 (2016-10-19)
------------------

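The ``load_spacy()`` changes described above are easiest to see in use. Here is a minimal sketch; it assumes ``load_spacy`` is importable from the top-level ``textacy`` package and that spaCy 1.1's ``create_pipeline`` convention applies (a callable that receives the ``Language`` instance and returns the ordered list of pipeline components). Neither detail is shown in this commit::

    # Sketch only: import path and pipeline-factory behavior are assumptions.
    import textacy

    # load a model from an explicit on-disk location via the new `path` kwarg
    nlp = textacy.load_spacy('en', path='/path/to/spacy/data')

    # or pass a `create_pipeline` callable, per spaCy 1.1+
    def create_pipeline(nlp):
        return [nlp.tagger, nlp.entity, nlp.parser]

    nlp = textacy.load_spacy('en', create_pipeline=create_pipeline)
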
3 changes: 1 addition & 2 deletions setup.py
@@ -12,7 +12,6 @@
     'cytoolz>=0.8.0',
     'ftfy>=4.2.0',
     'fuzzywuzzy>=0.12.0',
-    'gensim>=0.13.2',
     'ijson>=2.3',
     'networkx>=1.11',
     'numpy>=1.8.0',
@@ -22,7 +21,7 @@
     'requests>=2.10.0',
     'scipy>=0.17.0',
     'scikit-learn>=0.17.0',
-    'spacy>=1.0.1',
+    'spacy>=1.1.0',
     'unidecode>=0.04.19',
 ]
 EXTRAS_REQUIRE = {
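
The hunk above shows only ``INSTALL_REQUIRES``; per the changelog, ``matplotlib`` moved into an optional ``viz`` extra and ``backports.csv`` became Py2-only. A plausible sketch of the ``EXTRAS_REQUIRE`` dict that follows; its actual contents are not shown in this diff, and the pinned version here is an assumption::

    # Hypothetical sketch; not taken from the commit. The ':python_version<"3"'
    # key is standard setuptools syntax for a conditional dependency.
    EXTRAS_REQUIRE = {
        'viz': ['matplotlib>=1.5.0'],              # $ pip install textacy[viz]
        ':python_version<"3"': ['backports.csv'],  # installed only on Py2
    }
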
18 changes: 10 additions & 8 deletions textacy/corpora/wiki_reader.py
@@ -163,13 +163,14 @@ def _parse_content(self, content, parser):
         wikicode = parser.parse(content)
         parsed_page = {'sections': []}

-        wikilinks = [str(wc.title) for wc in wikicode.ifilter_wikilinks()]
+        wikilinks = [unicode_type(wc.title) for wc in wikicode.ifilter_wikilinks()]
         parsed_page['categories'] = [wc for wc in wikilinks if wc.startswith('Category:')]
         parsed_page['wiki_links'] = [wc for wc in wikilinks
                                      if not wc.startswith('Category:') and
                                      not wc.startswith('File:') and
                                      not wc.startswith('Image:')]
-        parsed_page['ext_links'] = [str(wc.url) for wc in wikicode.ifilter_external_links()]
+        parsed_page['ext_links'] = [
+            unicode_type(wc.url) for wc in wikicode.ifilter_external_links()]

         def _filter_tags(obj):
             return obj.tag == 'ref' or obj.tag == 'table'
@@ -183,7 +184,7 @@ def _filter_tags(obj):

             if section_idx == 0 or len(headings) == 1:
                 try:
-                    sec_title = str(headings[0].title)
+                    sec_title = unicode_type(headings[0].title)
                     if sec_title.lower() in bad_section_titles:
                         continue
                     sec['title'] = sec_title
@@ -199,23 +200,24 @@ def _filter_tags(obj):
                     continue
                 for obj in section.ifilter_wikilinks(recursive=True):
                     try:
-                        obj_title = str(obj.title)
+                        obj_title = unicode_type(obj.title)
                         if obj_title.startswith('File:') or obj_title.startswith('Image:'):
                             section.remove(obj)
                     except Exception:
                         pass
-                sec['text'] = str(section.strip_code(normalize=True, collapse=True)).strip()
+                sec['text'] = unicode_type(section.strip_code(normalize=True, collapse=True)).strip()
                 if sec.get('title'):
                     sec['text'] = re.sub(r'^' + re.escape(sec['title']) + r'\s*', '', sec['text'])
                 parsed_page['sections'].append(sec)
                 section_idx += 1

             # dammit! the parser has failed us; let's handle it as best we can
             elif len(headings) > 1:
-                titles = [str(h.title).strip() for h in headings]
+                titles = [unicode_type(h.title).strip() for h in headings]
                 levels = [int(h.level) for h in headings]
-                sub_sections = [str(ss) for ss in
-                                re.split(r'\s*' + '|'.join(re.escape(str(h)) for h in headings) + r'\s*', str(section))]
+                sub_sections = [
+                    unicode_type(ss) for ss in
+                    re.split(r'\s*' + '|'.join(re.escape(unicode_type(h)) for h in headings) + r'\s*', unicode_type(section))]
                 # re.split leaves an empty string result up front :shrug:
                 if sub_sections[0] == '':
                     del sub_sections[0]
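
The substance of this diff is swapping ``str()`` for ``unicode_type()`` wherever parsed titles, links, and section text are coerced, so Py2 gets real unicode instead of failing: on Py2, ``str()`` of unicode content raises ``UnicodeEncodeError`` for any non-ASCII character, which is routine in Wikipedia markup (likely the bug behind Issue #51). textacy imports ``unicode_type`` from its compat module; a minimal sketch of such a shim, with the exact definition assumed rather than shown in this commit::

    # Compat shim of the kind `unicode_type` refers to; textacy's real
    # definition lives in its compat module and may differ in detail.
    import sys

    if sys.version_info[0] < 3:
        unicode_type = unicode  # noqa: F821 -- Py2 builtin: true unicode text
    else:
        unicode_type = str      # Py3: str is already unicode
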
3 changes: 2 additions & 1 deletion textacy/corpus.py
@@ -173,7 +173,8 @@ def __delitem__(self, idx_or_slice):
             raise ValueError(msg)

     @property
-    def vector(self):
+    def vectors(self):
+        """Constituent docs' word vectors stacked together in a matrix."""
         return np.vstack((doc.spacy_doc.vector for doc in self))

     ##########
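
For context, a hypothetical usage sketch of the renamed property; the ``Corpus`` constructor signature is assumed from contemporaneous textacy releases, and meaningful output requires a spaCy model with word vectors loaded::

    # Hypothetical usage; constructor args are assumptions, not from this commit.
    import textacy

    corpus = textacy.Corpus('en', texts=['Burton ate a burger.', 'Dora rode the bus.'])
    mat = corpus.vectors   # one row per Doc: the average of its tokens' vectors
    print(mat.shape)       # (# documents, vector dim), e.g. (2, 300)

The plural name fits what the property returns: a stacked matrix of per-document vectors rather than a single vector.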
