Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added OpenAIRE plugin #20

Open
wants to merge 5 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added examples/wellcome2013/apc-oag.xlsx
Binary file not shown.
33 changes: 33 additions & 0 deletions examples/wellcome2013/oacensus.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Load standard licenses.
- licenses

# Create articles from xlsx file.
- excelarticles: {
location: "wellcome20.xlsx",
list-name: Wellcome 2012-13,
source: wellcome,
period: 2012-13,
column-mapping : {
publisher : publisher.name,
journal : journal.title,
title : title,
doi : doi
}
}

# Get publication date and standardize journal title (based on article DOI).
- crossref

# Get ISSNs based on journal titles.
- crossrefjournals

# Get openness information from OAG (based on article DOI).
- oag

# Check pubmed, get external identifiers including PMC (based on article DOI).
- pubmed-update-repositories

# Look up journal info in DOAJ.
- doaj

# TODO: look up license in PMC.
1 change: 1 addition & 0 deletions examples/wellcome2013/run-report.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Run oacensus with progress output enabled and generate the
# "openness-excel" report.
# NOTE(review): presumably reads the oacensus.yaml in the current
# directory -- confirm before running from elsewhere.
oacensus run --progress --reports "openness-excel"
Binary file added examples/wellcome2013/wellcome.xlsx
Binary file not shown.
Binary file added examples/wellcome2013/wellcome20.xlsx
Binary file not shown.
2 changes: 2 additions & 0 deletions oacensus/load_plugins.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,11 @@
import oacensus.scrapers.rcukgtr
import oacensus.scrapers.scimago
import oacensus.scrapers.wiley
import oacensus.scrapers.openaire

import oacensus.reports.excel_dump
import oacensus.reports.oa_excel
import oacensus.reports.oa_excel_article
import oacensus.reports.text_dump
import oacensus.reports.personal_openness
import oacensus.reports.institution
50 changes: 50 additions & 0 deletions oacensus/scrapers/openaire.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
__author__ = 'cneylon'
from oacensus.scraper import ArticleScraper
from oacensus.models import Article
from oacensus.models import Repository
from oacensus.models import Instance
import json
import requests
import xml.etree.ElementTree as ET

class OpenAIRE(ArticleScraper):
    """
    Gets accessibility information for articles with DOIs in the database.

    For every article that has a DOI, the OpenAIRE publications search API
    is queried with that DOI. Each <instance> element in the XML response
    is stored as an Instance linked to the article and to a Repository
    named after the instance's <hostedby> element.
    """
    aliases = ['openaire']

    _settings = {
        'base-url' : ("Base url of OpenAIRE API", "http://api.openaire.eu/search/publications"),
    }

    def scrape(self):
        # don't use scrape method since our query depends on db state, so
        # caching will not be accurate
        pass

    def process(self):
        """
        Query OpenAIRE for each article with a DOI and store the results.

        Raises whatever requests raises for a failed HTTP request
        (including an HTTP error status) and whatever ElementTree raises
        for a response body that is not well-formed XML.
        """
        articles = Article.select().where(~(Article.doi >> None))
        for article in articles:
            response = requests.get(self.setting('base-url'),
                                    params={'doi': article.doi})
            # Fail loudly on 4xx/5xx instead of trying to parse an error
            # page as if it were a search result.
            response.raise_for_status()

            # Hand the raw bytes to the parser so it honours the encoding
            # declared by the XML document itself; decoding with a guessed
            # charset and re-encoding can corrupt non-ASCII text.
            openaire_response = ET.fromstring(response.content)

            for inst in openaire_response.iter('instance'):
                hostedby = inst.find('hostedby')
                reponame = hostedby.get('name') if hostedby is not None else None
                if not reponame:
                    # Without a repository name there is nothing to attach
                    # the instance to; skip it rather than abort the run.
                    continue

                repository = Repository.find_or_create_by_name({
                    'name': reponame,
                    'source': 'openaire',
                    'log': 'Created by openaire plugin'})

                # Only 'Open Access' counts as free to read. 'Closed Access',
                # 'Embargo', 'Restricted', any other value and a missing
                # <licence> element are all treated as not free to read.
                licence = inst.find('licence')
                status = licence.get('classname') if licence is not None else None
                ftr = (status == 'Open Access')

                # url is None when the instance carries no <webresource>/<url>.
                url_element = inst.find('webresource/url')
                url = url_element.text if url_element is not None else None

                Instance.create(article=article,
                                repository=repository,
                                free_to_read=ftr,
                                info_url=url,
                                source=self.db_source(),
                                log='OpenAIRE response obtained from %s repository' % reponame)
81 changes: 81 additions & 0 deletions tests/test_openaire_scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
__author__ = 'cneylon'
from oacensus.commands import defaults
from oacensus.models import Article
from oacensus.scraper import Scraper
from oacensus.models import Repository
from oacensus.models import Instance
from tests.utils import setup_db
setup_db()

# DOIs used as fixtures by test_openaire_scraper. The test expects every
# instance found for test_doi_open to be free to read, and every instance
# found for the embargoed, closed and restricted DOIs not to be.
test_doi_open = '10.1371/journal.pone.0001164'
test_doi_embargoed = '10.1007/s00024-004-0394-3'
test_doi_closed = '10.1063/1.3663569'
test_doi_restricted = '10.1111/j.1365-2125.2009.03481.x'

# This is a DOI from the Crossref Labs 'Journal of Psychoceramics', which
# should never appear in OpenAIRE; the test expects no instances for it.
test_doi_no_response = '10.5555/12345678'

def test_openaire_scraper():
    """
    Run the doilist and openaire scrapers and check what they stored.

    Checks that the expected repositories were created, that every DOI
    expected to be known to OpenAIRE has at least one instance, that the
    unknown DOI has none, and that only the open DOI is free to read.

    NOTE(review): nothing here stubs the HTTP layer, so this presumably
    queries the live OpenAIRE API -- confirm.
    """
    # Named groups instead of positional slices, so reordering the list
    # cannot silently change which DOIs a check applies to.
    non_open_dois = [test_doi_embargoed, test_doi_closed, test_doi_restricted]
    known_dois = [test_doi_open] + non_open_dois
    dois = known_dois + [test_doi_no_response]

    def instances_for(doi):
        # Instances recorded for the first article carrying this DOI.
        article = Article.select().where(Article.doi == doi)[0]
        return list(article.instances)

    doilist = Scraper.create_instance("doilist")
    doilist.update_settings({"doi-list" : dois })
    doilist.run()

    # Test cases #
    ##############
    # Does the scraper run properly?
    # Are the relevant repositories created?
    # Are all DOIs that should be returned?

    # Scraper runs successfully
    scraper = Scraper.create_instance("openaire")
    scraper.run()

    # Repositories created properly
    names = [repo.name for repo in Repository.select()]
    assert len(names) > 3

    assert 'Oxford University Research Archive' in names
    assert 'Europe PubMed Central' in names
    assert 'Surrey Research Insight' in names
    assert 'DSpace at VSB Technical University of Ostrava' in names

    # All appropriate DOI's returned
    for doi in known_dois:
        instances = instances_for(doi)
        assert len(instances) > 0, "no instances stored for %s" % doi
        for inst in instances:
            assert inst.free_to_read is not None

    # Nonexistent doi not returned
    assert len(instances_for(test_doi_no_response)) == 0

    # Test correct answer for 'open' case
    for inst in instances_for(test_doi_open):
        assert inst.free_to_read, (
            "%s not reported as free to read for %s"
            % (inst.repository.name, test_doi_open))

    # Test correct answer for non-open cases
    for doi in non_open_dois:
        for inst in instances_for(doi):
            # The repository name goes in the assertion message (instead
            # of a print) so a failure identifies the offending instance.
            assert not inst.free_to_read, (
                "%s reported as free to read for %s"
                % (inst.repository.name, doi))