Merge pull request #6 from autonomio/add_arxiv_api

add arxiv API
autonomio · Jan 29, 2022 · 82273de · 82273de
2 parents e3ad9ca + e149758
commit 82273de
Show file tree

Hide file tree

Showing 6 changed files with 53 additions and 4 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,5 @@
 .DS_Store
 *.pyc
 __pycache__
+dedomena/*.pyc
+dedomena/*/*.pyc
diff --git a/dedomena/apis/__init__.py b/dedomena/apis/__init__.py
@@ -1,2 +1,3 @@
 from .twitter import twitter
 from .pubmed import pubmed
+from .arxiv import arxiv
diff --git a/dedomena/apis/arxiv.py b/dedomena/apis/arxiv.py
@@ -0,0 +1,38 @@
+def arxiv(keyword, n):
+
+    '''Get articles with meta-data from ArXiv
+
+    articles = arxiv('tibetan', 50)
+
+    keyword | str | The string to be searched for in all fields of the articles.
+    n | int | Maximum number of articles to return.
+
+    Returns a pandas DataFrame with the columns 'title',
+    'publication_date' and 'abstract'; it is empty when nothing matches.
+
+    '''
+
+    import urllib.parse
+    import urllib.request
+    import xmltodict
+    import pandas as pd
+
+    url = 'http://export.arxiv.org/api/query?'
+    # urlencode escapes spaces and reserved characters in the keyword
+    query = urllib.parse.urlencode({'search_query': 'all:' + keyword,
+                                    'start': 0,
+                                    'max_results': n})
+
+    # the context manager closes the connection once the feed is read
+    with urllib.request.urlopen(url + query) as data:
+        results_xml = data.read().decode('utf-8')
+
+    # force_list keeps 'entry' a list even when there is a single match
+    results_dict = xmltodict.parse(results_xml, force_list=('entry',))
+    results_list = results_dict['feed'].get('entry', [])
+
+    out = [[entry['title'], entry['published'], entry['summary']]
+           for entry in results_list]
+
+    return pd.DataFrame(out,
+                        columns=['title', 'publication_date', 'abstract'])
diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,5 @@
 pandas
 pymed
 twintel
-pmlb
+pmlb
+xmltodict
diff --git a/setup.py b/setup.py
@@ -15,7 +15,7 @@
 URL = 'http://autonom.io'
 LICENSE = 'MIT'
 DOWNLOAD_URL = 'https://github.com/autonomio/dedomena/'
-VERSION = '0.0.7'
+VERSION = '0.1.0'
 
 try:
     from setuptools import setup
@@ -26,7 +26,8 @@
 install_requires = ['pandas',
                     'pymed',
                     'twintel',
-                    'pmlb']
+                    'pmlb',
+                    'xmltodict']
 
 if __name__ == "__main__":
 

diff --git a/test_script.py b/test_script.py
@@ -13,4 +13,10 @@
     _null = da.datasets.pmlb(dataset)
 
 # test apis.twitter
-_null = da.apis.twitter('cars', 200)
+_null = da.apis.twitter('cars', 50)
+
+# test apis.pubmed
+_null = da.apis.pubmed('COVID', 50)
+
+# test apis.arxiv
+_null = da.apis.arxiv('nlp', 50)