diff --git a/examples/wellcome2013/apc-oag.xlsx b/examples/wellcome2013/apc-oag.xlsx new file mode 100644 index 0000000..5eab744 Binary files /dev/null and b/examples/wellcome2013/apc-oag.xlsx differ diff --git a/examples/wellcome2013/oacensus.yaml b/examples/wellcome2013/oacensus.yaml new file mode 100644 index 0000000..9e11f68 --- /dev/null +++ b/examples/wellcome2013/oacensus.yaml @@ -0,0 +1,33 @@ +# Load standard licenses. +- licenses + +# Create articles from xlsx file. +- excelarticles: { + location: "wellcome20.xlsx", + list-name: Wellcome 2012-13, + source: wellcome, + period: 2012-13, + column-mapping : { + publisher : publisher.name, + journal : journal.title, + title : title, + doi : doi + } + } + +# Get publication date and standardize journal title (based on article DOI). +- crossref + +# Get ISSNs based on journal titles. +- crossrefjournals + +# Get openness information from OAG (based on article DOI). +- oag + +# Check pubmed, get external identifiers including PMC (based on article DOI). 
class OpenAIRE(ArticleScraper):
    """
    Gets accessibility information for articles with DOIs in the database.

    Each article that has a DOI is looked up in the OpenAIRE publications
    API. Every ``instance`` element in the XML response becomes one Instance
    row, attached to a Repository named after the hosting repository.
    """
    aliases = ['openaire']

    _settings = {
        'base-url' : ("Base url of OpenAIRE API", "http://api.openaire.eu/search/publications"),
    }

    def scrape(self):
        # don't use scrape method since our query depends on db state, so
        # caching will not be accurate
        pass

    def process(self):
        """
        Query OpenAIRE for every article with a DOI and record the results.

        An ``instance`` element that has no ``hostedby`` or no
        ``webresource/url`` child is skipped, because neither a repository
        nor an info URL can be recorded for it. A missing ``licence``
        element is treated like an unrecognised access status, i.e. not
        free to read.
        """
        # Only 'Open Access' counts as free to read; any status not listed
        # here falls back to False as well.
        free_to_read_by_status = {
            'Open Access' : True,
            'Closed Access' : False,
            'Embargo' : False,
            'Restricted' : False,
        }

        articles = Article.select().where(~(Article.doi >> None))
        for article in articles:
            response = requests.get(self.setting('base-url'), params={'doi' : article.doi})
            # Hand the raw bytes to the XML parser so it applies the encoding
            # declared by the document itself. Going through response.text
            # relies on requests' charset guess and can garble non-ASCII
            # repository names.
            openaire_response = ET.fromstring(response.content)

            for inst in openaire_response.iter('instance'):
                hosted_by = inst.find('hostedby')
                url_element = inst.find('webresource/url')
                if hosted_by is None or url_element is None:
                    # Incomplete record: skip it instead of failing the run.
                    continue

                reponame = hosted_by.get('name')
                repository = Repository.find_or_create_by_name({'name': reponame,
                                                                'source': 'openaire',
                                                                'log' : 'Created by openaire plugin'})

                licence = inst.find('licence')
                status = licence.get('classname') if licence is not None else None
                ftr = free_to_read_by_status.get(status, False)

                Instance.create(article=article,
                                repository=repository,
                                free_to_read=ftr,
                                info_url=url_element.text,
                                source=self.db_source(),
                                log='OpenAIRE response obtained from %s repository' % reponame)
'10.5555/12345678'


def test_openaire_scraper():
    """
    End-to-end check of the 'openaire' scraper.

    Loads five DOIs with the 'doilist' scraper, runs the 'openaire' scraper,
    then verifies the repositories that were created and the free_to_read
    flag recorded on each article's instances.
    """
    dois_in_openaire = [
        test_doi_open,
        test_doi_embargoed,
        test_doi_closed,
        test_doi_restricted,
    ]
    dois = dois_in_openaire + [test_doi_no_response]

    def instances_for(doi):
        # First article stored with this DOI, and all instances attached to it.
        article = Article.select().where(Article.doi == doi)[0]
        return list(article.instances)

    doilist = Scraper.create_instance("doilist")
    doilist.update_settings({"doi-list" : dois })
    doilist.run()

    # Test cases #
    ##############
    # Does the scraper run properly?
    # Are the relevant repositories created?
    # Are all DOIs that should be returned?

    # Scraper runs successfully
    scraper = Scraper.create_instance("openaire")
    scraper.run()

    # Repositories created properly
    names = [repo.name for repo in Repository.select()]
    assert len(names) > 3
    assert 'Oxford University Research Archive' in names
    assert 'Europe PubMed Central' in names
    assert 'Surrey Research Insight' in names
    assert 'DSpace at VSB Technical University of Ostrava' in names

    # All appropriate DOIs returned
    for doi in dois_in_openaire:
        instances = instances_for(doi)
        assert len(instances) > 0
        for inst in instances:
            assert inst.free_to_read is not None

    # Nonexistent doi not returned
    assert len(instances_for(test_doi_no_response)) == 0

    # Test correct answer for 'open' case
    for inst in instances_for(test_doi_open):
        assert inst.free_to_read

    # Test correct answer for non-open cases
    for doi in [test_doi_restricted, test_doi_closed, test_doi_embargoed]:
        for inst in instances_for(doi):
            # Single pre-formatted argument: prints the same text whether
            # print is a statement (Python 2) or a function (Python 3).
            print("%s %s" % (inst.repository.name, inst.free_to_read))
            assert not inst.free_to_read