From e1497587536a0a66e1b84c0f7cd34fe9d2db9a3c Mon Sep 17 00:00:00 2001 From: Mikko Kotila Date: Sun, 30 Jan 2022 00:47:52 +0200 Subject: [PATCH] add arxiv API --- .gitignore | 2 ++ dedomena/apis/__init__.py | 1 + dedomena/apis/arxiv.py | 38 ++++++++++++++++++++++++++++++++++++++ requirements.txt | 3 ++- setup.py | 5 +++-- test_script.py | 8 +++++++- 6 files changed, 53 insertions(+), 4 deletions(-) create mode 100644 dedomena/apis/arxiv.py diff --git a/.gitignore b/.gitignore index 9ab76a1..b04d7df 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ .DS_Store *.pyc __pycache__ +dedomena/*.pyc +dedomena/*/*.pyc diff --git a/dedomena/apis/__init__.py b/dedomena/apis/__init__.py index 9dd7a7c..2d77cf7 100644 --- a/dedomena/apis/__init__.py +++ b/dedomena/apis/__init__.py @@ -1,2 +1,3 @@ from .twitter import twitter from .pubmed import pubmed +from .arxiv import arxiv diff --git a/dedomena/apis/arxiv.py b/dedomena/apis/arxiv.py new file mode 100644 index 0000000..61d797e --- /dev/null +++ b/dedomena/apis/arxiv.py @@ -0,0 +1,38 @@ +def arxiv(keyword, n): + + '''Get articles with meta-data from ArXiv + + articles = arxiv('tibetan', 50) + + keyword | str | The string to be searched for in the title of the articles. + n | int | Number of articles to return. + + ''' + + import urllib + import xmltodict + import pandas as pd + + url = 'http://export.arxiv.org/api/query?search_query=all:' + query = keyword + '&start=0&max_results=' + str(n) + data = urllib.request.urlopen(url + query) + results_xml = data.read().decode('utf-8') + + out = [] + + results_dict = xmltodict.parse(results_xml) + + results_list = results_dict['feed']['entry'] + + for i in range(len(results_list)): + + title = results_list[i]['title'] + publication_date = results_list[i]['published'] + abstract = results_list[i]['summary'] + + out.append([title, publication_date, abstract]) + + out = pd.DataFrame(out) + out.columns = ['title', 'publication_date', 'abstract'] + + return out \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index eb43ed7..d696842 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ pandas pymed twintel -pmlb \ No newline at end of file +pmlb +xmltodict \ No newline at end of file diff --git a/setup.py b/setup.py index e2e9936..2f2ca86 100755 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ URL = 'http://autonom.io' LICENSE = 'MIT' DOWNLOAD_URL = 'https://github.com/autonomio/dedomena/' -VERSION = '0.0.7' +VERSION = '0.1.0' try: from setuptools import setup @@ -26,7 +26,8 @@ install_requires = ['pandas', 'pymed', 'twintel', - 'pmlb'] + 'pmlb', + 'xmltodict'] if __name__ == "__main__": diff --git a/test_script.py b/test_script.py index 0009f3a..aa98a83 100644 --- a/test_script.py +++ b/test_script.py @@ -13,4 +13,10 @@ _null = da.datasets.pmlb(dataset) # test apis.twitter -_null = da.apis.twitter('cars', 200) +_null = da.apis.twitter('cars', 50) + +# test apis.pubmed +_null = da.apis.pubmed('COVID', 50) + +# test apis.arxiv +_null = da.apis.arxiv('nlp', 50)