Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pub speed #21

Open
wants to merge 16 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added examples/pubspeed/2012/.DS_Store
Binary file not shown.
40 changes: 40 additions & 0 deletions examples/pubspeed/2012/oacensus.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Get articles from PubMed for the journals listed below (start-period 2012-01, end-period 2012-12).
- pubmed:
search: '"journal article"[Publication Type]'
journals :
- "Current Biology : CB"
- "Nature Neuroscience"
- "Genes & development"
- "Bioinformatics"
- "Cell"
- "Cell host & microbe"
- "Nature genetics"
- "Molecular systems biology"
- "Nature"
- "PLoS neglected tropical diseases"
- "PLoS medicine"
- "PLoS biology"
- "PLoS computational biology"
- "PLoS pathogens"
- "The Journal of infectious diseases"
- "Nature medicine"
- "Genome research"
- "BMC infectious diseases"
- "PLoS genetics"
- "BMC public health"
- "Nature cell biology"
- "The American journal of tropical medicine and hygiene"
- "PeerJ"
- "SpringerPlus"
- "eLife"
- "Nature communications"
- "Scientific reports"
- "PLoS one"
- "BMJ open"
- "British medical journal"
- "Journal of virology"
- "Proceedings of the national academy of sciences of the united states of america"
- "Science (New York, N.Y.)"
start-period : 2012-01
end-period : 2012-12
ret-max : 1000
Binary file added examples/pubspeed/2012/pubspeed2012.xls
Binary file not shown.
1 change: 1 addition & 0 deletions examples/pubspeed/2012/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#!/bin/sh
# Run oacensus against the oacensus.yaml in this directory and produce the
# "pubspeed-excel" report.
oacensus run --config oacensus.yaml --reports pubspeed-excel
40 changes: 40 additions & 0 deletions examples/pubspeed/2013/oacensus.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Get articles from PubMed for the journals listed below (start-period 2013-01, end-period 2013-12).
- pubmed:
search: '"journal article"[Publication Type]'
journals :
- "Current Biology : CB"
- "Nature Neuroscience"
- "Genes & development"
- "Bioinformatics"
- "Cell"
- "Cell host & microbe"
- "Nature genetics"
- "Molecular systems biology"
- "Nature"
- "PLoS neglected tropical diseases"
- "PLoS medicine"
- "PLoS biology"
- "PLoS computational biology"
- "PLoS pathogens"
- "The Journal of infectious diseases"
- "Nature medicine"
- "Genome research"
- "BMC infectious diseases"
- "PLoS genetics"
- "BMC public health"
- "Nature cell biology"
- "The American journal of tropical medicine and hygiene"
- "PeerJ"
- "SpringerPlus"
- "eLife"
- "Nature communications"
- "Scientific reports"
- "PLoS one"
- "BMJ open"
- "British medical journal"
- "Journal of virology"
- "Proceedings of the national academy of sciences of the united states of america"
- "Science (New York, N.Y.)"
start-period : 2013-01
end-period : 2013-12
ret-max : 1000
Binary file added examples/pubspeed/2013/pubspeed2013.xls
Binary file not shown.
1 change: 1 addition & 0 deletions examples/pubspeed/2013/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#!/bin/sh
# Run oacensus against the oacensus.yaml in this directory and produce the
# "pubspeed-excel" report.
oacensus run --config oacensus.yaml --reports pubspeed-excel
40 changes: 40 additions & 0 deletions examples/pubspeed/2014/oacensus.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Get articles from PubMed for the journals listed below (start-period 2014-01, end-period 2014-06).
- pubmed:
search: '"journal article"[Publication Type]'
journals :
- "Current Biology : CB"
- "Nature Neuroscience"
- "Genes & development"
- "Bioinformatics"
- "Cell"
- "Cell host & microbe"
- "Nature genetics"
- "Molecular systems biology"
- "Nature"
- "PLoS neglected tropical diseases"
- "PLoS medicine"
- "PLoS biology"
- "PLoS computational biology"
- "PLoS pathogens"
- "The Journal of infectious diseases"
- "Nature medicine"
- "Genome research"
- "BMC infectious diseases"
- "PLoS genetics"
- "BMC public health"
- "Nature cell biology"
- "The American journal of tropical medicine and hygiene"
- "PeerJ"
- "SpringerPlus"
- "eLife"
- "Nature communications"
- "Scientific reports"
- "PLoS one"
- "BMJ open"
- "British medical journal"
- "Journal of virology"
- "Proceedings of the national academy of sciences of the united states of america"
- "Science (New York, N.Y.)"
start-period : 2014-01
end-period : 2014-06
ret-max : 1000
Binary file added examples/pubspeed/2014/pubspeed2014.xls
Binary file not shown.
1 change: 1 addition & 0 deletions examples/pubspeed/2014/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#!/bin/sh
# Run oacensus against the oacensus.yaml in this directory and produce the
# "pubspeed-excel" report.
oacensus run --config oacensus.yaml --reports pubspeed-excel
41 changes: 41 additions & 0 deletions examples/pubspeed/execute_pone.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
__author__ = 'cneylon'

import os
import os.path
import subprocess

# Half-year windows to harvest.  Only the YYYY-MM prefix of each date is
# written to the oacensus config; the day component is ignored.
DATE_PERIODS = [('2012-01-02', '2012-06-30'),
                ('2012-07-01', '2012-12-31'),
                ('2013-01-02', '2013-06-30'),
                ('2013-07-01', '2013-12-31'),
                ('2014-01-02', '2014-06-30')]

# PubMed query, already wrapped in single quotes so that it is emitted as a
# single-quoted YAML scalar (the query itself contains double quotes).
SEARCH_TERM = """'"PLoS one"[Journal] AND "journal article"[Publication Type]'"""

# oacensus config written for each period.  The nesting under "- pubmed:" is
# significant to YAML.
CONFIG_TEMPLATE = """
- pubmed:
    search: %s
    start-period: %s
    end-period: %s
"""

OACENSUS_COMMAND = ['oacensus', 'run',
                    '--config', 'oacensus.yaml',
                    '--reports', 'pubspeed-excel']


def build_config(start_date, end_date):
    """
    Return the oacensus YAML config text for one harvesting period.

    start_date and end_date are 'YYYY-MM-DD' strings; they are truncated to
    'YYYY-MM' for the start-period / end-period entries.
    """
    return CONFIG_TEMPLATE % (SEARCH_TERM, start_date[0:7], end_date[0:7])


def main():
    """
    Create run/pone_<start date>/ for every period, write its oacensus.yaml
    and run oacensus there to produce the pubspeed-excel report.
    """
    import sys

    for start_date, end_date in DATE_PERIODS:
        path = os.path.join('run', 'pone_' + start_date)
        if not os.path.isdir(path):
            os.makedirs(path)

        with open(os.path.join(path, 'oacensus.yaml'), 'w') as f:
            f.write(build_config(start_date, end_date))

        # Run inside the period directory via cwd= instead of a
        # chdir()/chdir('../..') pair, so this process never changes (or
        # fails to restore) its own working directory.
        status = subprocess.call(OACENSUS_COMMAND, cwd=path)
        if status != 0:
            # Keep going with the remaining periods, but do not hide failures.
            sys.stderr.write("oacensus exited with status %d in %s\n" % (status, path))


if __name__ == '__main__':
    main()



2 changes: 2 additions & 0 deletions examples/pubspeed/run-report.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/sh
# Run oacensus with the --progress flag and generate the "pubspeed-excel" report.
# NOTE(review): no --config is passed here, unlike the per-year run.sh scripts;
# presumably oacensus falls back to a default config in the current directory - confirm.
oacensus run --progress --reports "pubspeed-excel"
Binary file added examples/wellcome2013/apc-oag.xlsx
Binary file not shown.
33 changes: 33 additions & 0 deletions examples/wellcome2013/oacensus.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Load standard licenses.
- licenses

# Create articles from xlsx file.
- excelarticles: {
location: "wellcome20.xlsx",
list-name: Wellcome 2012-13,
source: wellcome,
period: 2012-13,
column-mapping : {
publisher : publisher.name,
journal : journal.title,
title : title,
doi : doi
}
}

# Get publication date and standardize journal title (based on article DOI).
- crossref

# Get ISSNs based on journal titles.
- crossrefjournals

# Get openness information from OAG (based on article DOI).
- oag

# Check pubmed, get external identifiers including PMC (based on article DOI).
- pubmed-update-repositories

# Look up journal info in DOAJ
- doaj

# TODO look up license in PMC
1 change: 1 addition & 0 deletions examples/wellcome2013/run-report.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#!/bin/sh
# Run oacensus with the --progress flag and generate the "openness-excel" report.
oacensus run --progress --reports "openness-excel"
Binary file added examples/wellcome2013/wellcome.xlsx
Binary file not shown.
Binary file added examples/wellcome2013/wellcome20.xlsx
Binary file not shown.
2 changes: 1 addition & 1 deletion oacensus/licenses.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ cc-by-nd:
cc-by-nc-sa:
title: Creative Commons Attribution NonCommercial ShareAlike
url: https://creativecommons.org/licenses/by-nc-sa/4.0
aliases: []
aliases: [cc-BY-NC-SA]

cc-zero:
title: No Rights Reserved
Expand Down
2 changes: 2 additions & 0 deletions oacensus/load_plugins.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,11 @@
import oacensus.scrapers.rcukgtr
import oacensus.scrapers.scimago
import oacensus.scrapers.wiley
import oacensus.scrapers.openaire

import oacensus.reports.excel_dump
import oacensus.reports.oa_excel
import oacensus.reports.pubspeed_excel
import oacensus.reports.text_dump
import oacensus.reports.personal_openness
import oacensus.reports.institution
6 changes: 6 additions & 0 deletions oacensus/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,12 @@ class Article(ModelBase):
help_text="Digital object identifier for article.")
date_published = CharField(null=True,
help_text="When article was published, in YYYY(-MM(-DD)) format.")
date_submitted = CharField(null=True,
help_text="When article was submitted for peer review, in YYYY(-MM(-DD)) format.")
date_accepted = CharField(null=True,
help_text="When article was accepted for publication, in YYYY(-MM(-DD)) format.")
date_aheadofprint = CharField(null=True,
help_text="Ahead of print publication date, generally AOP, in YYYY(-MM(-DD)) format.")
period = CharField(
help_text="Name of date-based period in which this article was scraped.")
url = CharField(null=True,
Expand Down
82 changes: 82 additions & 0 deletions oacensus/reports/pubspeed_excel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from oacensus.models import Article
from oacensus.models import ModelBase
from oacensus.report import Report
import datetime
import inflection
import inspect
import os
import xlwt

class PubspeedExcel(Report):
    """
    An excel-based publication speed report.

    Writes one worksheet row per Article containing the configured fields
    (by default the DOI, title, the published / submitted / accepted /
    ahead-of-print dates and the journal title and ISSN).
    """

    aliases = ['pubspeed-excel']

    _settings = {
        'filename' : ("Name of file to write excel dump to.", "pubspeed.xls"),
        "sheet-name" : ("Name of worksheet.", "Articles"),
        'date-format-string' : ( "Excel style date format string.", "D-MMM-YYYY"),
        "fields" : ("Fields to include in report.", [
            "doi", "title", "date_published",
            "date_submitted", "date_accepted", "date_aheadofprint",
            "journal.title", "journal.issn"
            ])
    }

    def _field_value(self, article, key):
        """
        Resolve one entry of the 'fields' setting against an article.

        Keys prefixed with "journal." are read from article.journal, keys
        prefixed with "publisher." from article.journal.publisher, and
        anything else from the article itself.  Returns None when the
        journal (or publisher) needed for the lookup is missing.
        """
        if key.startswith("journal."):
            journal = article.journal
            if journal is None:
                return None
            return getattr(journal, key[len("journal."):])

        if key.startswith("publisher."):
            journal = article.journal
            publisher = journal.publisher if journal is not None else None
            if publisher is None:
                return None
            return getattr(publisher, key[len("publisher."):])

        return getattr(article, key)

    def run(self):
        """
        Write the report to the file named by the 'filename' setting,
        replacing any existing file of that name.
        """
        date_style = xlwt.XFStyle()
        date_style.num_format_str = self.setting('date-format-string')

        bold_font = xlwt.Font()
        bold_font.bold = True

        bold_style = xlwt.XFStyle()
        bold_style.font = bold_font

        filename = self.setting('filename')
        if os.path.exists(filename):
            os.remove(filename)

        workbook = xlwt.Workbook()
        # Honour the 'sheet-name' setting rather than a hard-coded name.
        ws = workbook.add_sheet(self.setting('sheet-name'))

        keys = self.setting('fields')

        # Header row, in bold.
        for col, key in enumerate(keys):
            heading = inflection.titleize(key.replace("_id", "_identifier"))
            ws.write(0, col, heading, bold_style)

        # One row per article, starting below the header.
        for i, article in enumerate(Article.select()):
            row = i + 1
            for col, key in enumerate(keys):
                value = self._field_value(article, key)
                fmt = None

                if isinstance(value, ModelBase):
                    # Related model objects are written as their text form.
                    value = unicode(value)
                elif isinstance(value, datetime.date):
                    # Only genuine date objects get the Excel date format;
                    # string values are written unchanged.
                    fmt = date_style
                elif inspect.ismethod(value):
                    value = value()

                if fmt:
                    ws.write(row, col, value, fmt)
                else:
                    ws.write(row, col, value)

        workbook.save(filename)
        print(" pubspeed report written to %s" % filename)
50 changes: 50 additions & 0 deletions oacensus/scrapers/openaire.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
__author__ = 'cneylon'
from oacensus.scraper import ArticleScraper
from oacensus.models import Article
from oacensus.models import Repository
from oacensus.models import Instance
import json
import requests
import xml.etree.ElementTree as ET

class OpenAIRE(ArticleScraper):
    """
    Gets accessibility information for articles with DOIs in the database.

    For every article that has a DOI, the OpenAIRE publications API is
    queried and one Instance is created per <instance> element in the
    response, linked to a Repository named after the hosting data source.
    """
    aliases = ['openaire']

    _settings = {
        'base-url' : ("Base url of OpenAIRE API", "http://api.openaire.eu/search/publications"),
    }

    def scrape(self):
        # don't use scrape method since our query depends on db state, so
        # caching will not be accurate
        pass

    def process(self):
        """
        Query OpenAIRE for each article with a DOI and record the instances
        it reports.

        An <instance> without a <hostedby> element is skipped, since there
        is no repository to attach it to.  A missing <licence> is treated as
        not free to read and a missing <webresource>/<url> as no URL.
        """
        base_url = self.setting('base-url')
        articles = Article.select().where(~(Article.doi >> None))

        for article in articles:
            # NOTE(review): the HTTP status is not checked and no timeout is
            # set; an error page that is not well-formed XML will surface as
            # a parse error from ET.fromstring below.
            response = requests.get(base_url, params={'doi': article.doi})
            openaire_response = ET.fromstring(response.text.encode('utf-8'))

            for inst in openaire_response.iter('instance'):
                hostedby = inst.find('hostedby')
                if hostedby is None:
                    continue
                reponame = hostedby.get('name')
                repository = Repository.find_or_create_by_name({
                    'name': reponame,
                    'source': 'openaire',
                    'log': 'Created by openaire plugin'})

                licence = inst.find('licence')
                status = licence.get('classname') if licence is not None else None
                # Only 'Open Access' counts as free to read; 'Closed Access',
                # 'Embargo', 'Restricted' and anything unrecognised do not.
                ftr = (status == 'Open Access')

                url_element = inst.find('webresource/url')
                url = url_element.text if url_element is not None else None

                Instance.create(article=article,
                                repository=repository,
                                free_to_read=ftr,
                                info_url=url,
                                source=self.db_source(),
                                log='OpenAIRE response obtained from %s repository' % reponame)
Loading