diff --git a/examples/pubspeed/2012/.DS_Store b/examples/pubspeed/2012/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/examples/pubspeed/2012/.DS_Store differ diff --git a/examples/pubspeed/2012/oacensus.yaml b/examples/pubspeed/2012/oacensus.yaml new file mode 100644 index 0000000..07da46b --- /dev/null +++ b/examples/pubspeed/2012/oacensus.yaml @@ -0,0 +1,40 @@ +# Get articles from Pubmed +- pubmed: + search: '"journal article"[Publication Type]' + journals : + - "Current Biology : CB" + - "Nature Neuroscience" + - "Genes & development" + - "Bioinformatics" + - "Cell" + - "Cell host & microbe" + - "Nature genetics" + - "Molecular systems biology" + - "Nature" + - "PLoS neglected tropical diseases" + - "PLoS medicine" + - "PLoS biology" + - "PLoS computational biology" + - "PLoS pathogens" + - "The Journal of infectious diseases" + - "Nature medicine" + - "Genome research" + - "BMC infectious diseases" + - "PLoS genetics" + - "BMC public health" + - "Nature cell biology" + - "The American journal of tropical medicine and hygiene" + - "PeerJ" + - "SpringerPlus" + - "eLife" + - "Nature communications" + - "Scientific reports" + - "PLoS one" + - "BMJ open" + - "British medical journal" + - "Journal of virology" + - "Proceedings of the national academy of sciences of the united states of america" + - "Science (New York, N.Y.)" + start-period : 2012-01 + end-period : 2012-12 + ret-max : 1000 \ No newline at end of file diff --git a/examples/pubspeed/2012/pubspeed2012.xls b/examples/pubspeed/2012/pubspeed2012.xls new file mode 100644 index 0000000..705d557 Binary files /dev/null and b/examples/pubspeed/2012/pubspeed2012.xls differ diff --git a/examples/pubspeed/2012/run.sh b/examples/pubspeed/2012/run.sh new file mode 100644 index 0000000..4ccacb3 --- /dev/null +++ b/examples/pubspeed/2012/run.sh @@ -0,0 +1 @@ +oacensus run --config oacensus.yaml --reports pubspeed-excel \ No newline at end of file diff --git 
a/examples/pubspeed/2013/oacensus.yaml b/examples/pubspeed/2013/oacensus.yaml new file mode 100644 index 0000000..b5d5deb --- /dev/null +++ b/examples/pubspeed/2013/oacensus.yaml @@ -0,0 +1,40 @@ +# Get articles from Pubmed +- pubmed: + search: '"journal article"[Publication Type]' + journals : + - "Current Biology : CB" + - "Nature Neuroscience" + - "Genes & development" + - "Bioinformatics" + - "Cell" + - "Cell host & microbe" + - "Nature genetics" + - "Molecular systems biology" + - "Nature" + - "PLoS neglected tropical diseases" + - "PLoS medicine" + - "PLoS biology" + - "PLoS computational biology" + - "PLoS pathogens" + - "The Journal of infectious diseases" + - "Nature medicine" + - "Genome research" + - "BMC infectious diseases" + - "PLoS genetics" + - "BMC public health" + - "Nature cell biology" + - "The American journal of tropical medicine and hygiene" + - "PeerJ" + - "SpringerPlus" + - "eLife" + - "Nature communications" + - "Scientific reports" + - "PLoS one" + - "BMJ open" + - "British medical journal" + - "Journal of virology" + - "Proceedings of the national academy of sciences of the united states of america" + - "Science (New York, N.Y.)" + start-period : 2013-01 + end-period : 2013-12 + ret-max : 1000 \ No newline at end of file diff --git a/examples/pubspeed/2013/pubspeed2013.xls b/examples/pubspeed/2013/pubspeed2013.xls new file mode 100644 index 0000000..14ec479 Binary files /dev/null and b/examples/pubspeed/2013/pubspeed2013.xls differ diff --git a/examples/pubspeed/2013/run.sh b/examples/pubspeed/2013/run.sh new file mode 100644 index 0000000..4ccacb3 --- /dev/null +++ b/examples/pubspeed/2013/run.sh @@ -0,0 +1 @@ +oacensus run --config oacensus.yaml --reports pubspeed-excel \ No newline at end of file diff --git a/examples/pubspeed/2014/oacensus.yaml b/examples/pubspeed/2014/oacensus.yaml new file mode 100644 index 0000000..dec6a82 --- /dev/null +++ b/examples/pubspeed/2014/oacensus.yaml @@ -0,0 +1,40 @@ +# Get articles from Pubmed +- 
pubmed: + search: '"journal article"[Publication Type]' + journals : + - "Current Biology : CB" + - "Nature Neuroscience" + - "Genes & development" + - "Bioinformatics" + - "Cell" + - "Cell host & microbe" + - "Nature genetics" + - "Molecular systems biology" + - "Nature" + - "PLoS neglected tropical diseases" + - "PLoS medicine" + - "PLoS biology" + - "PLoS computational biology" + - "PLoS pathogens" + - "The Journal of infectious diseases" + - "Nature medicine" + - "Genome research" + - "BMC infectious diseases" + - "PLoS genetics" + - "BMC public health" + - "Nature cell biology" + - "The American journal of tropical medicine and hygiene" + - "PeerJ" + - "SpringerPlus" + - "eLife" + - "Nature communications" + - "Scientific reports" + - "PLoS one" + - "BMJ open" + - "British medical journal" + - "Journal of virology" + - "Proceedings of the national academy of sciences of the united states of america" + - "Science (New York, N.Y.)" + start-period : 2014-01 + end-period : 2014-06 + ret-max : 1000 \ No newline at end of file diff --git a/examples/pubspeed/2014/pubspeed2014.xls b/examples/pubspeed/2014/pubspeed2014.xls new file mode 100644 index 0000000..40be1b3 Binary files /dev/null and b/examples/pubspeed/2014/pubspeed2014.xls differ diff --git a/examples/pubspeed/2014/run.sh b/examples/pubspeed/2014/run.sh new file mode 100644 index 0000000..4ccacb3 --- /dev/null +++ b/examples/pubspeed/2014/run.sh @@ -0,0 +1 @@ +oacensus run --config oacensus.yaml --reports pubspeed-excel \ No newline at end of file diff --git a/examples/pubspeed/execute_pone.py b/examples/pubspeed/execute_pone.py new file mode 100644 index 0000000..642ad73 --- /dev/null +++ b/examples/pubspeed/execute_pone.py @@ -0,0 +1,41 @@ +__author__ = 'cneylon' + +import os +import os.path +import subprocess + +DATE_PERIODS = [('2012-01-02', '2012-06-30'), + ('2012-07-01', '2012-12-31'), + ('2013-01-02', '2013-06-30'), + ('2013-07-01', '2013-12-31'), + ('2014-01-02', '2014-06-30')] + +for period in 
DATE_PERIODS: + date_term = 'AND "journal article"[Publication Type]' + search_term = """'"PLoS one"[Journal] %s'""" % (date_term) + path = 'run/pone_' + period[0] + if not os.path.isdir(path): + os.makedirs(path) + + yelems = ( + search_term, + period[0][0:7], + period[1][0:7] + ) + yaml = """ +- pubmed: + search: %s + start-period: %s + end-period: %s +""" % yelems + + yaml_path = os.path.join(path, 'oacensus.yaml') + with open(yaml_path, 'w') as f: + f.write(yaml) + + os.chdir(path) + subprocess.call(['oacensus', 'run', '--config', 'oacensus.yaml', '--reports', 'pubspeed-excel']) + os.chdir('../..') + + + \ No newline at end of file diff --git a/examples/pubspeed/run-report.sh b/examples/pubspeed/run-report.sh new file mode 100644 index 0000000..5322d67 --- /dev/null +++ b/examples/pubspeed/run-report.sh @@ -0,0 +1,2 @@ +#!/bin/sh +oacensus run --progress --reports "pubspeed-excel" diff --git a/examples/wellcome2013/apc-oag.xlsx b/examples/wellcome2013/apc-oag.xlsx new file mode 100644 index 0000000..5eab744 Binary files /dev/null and b/examples/wellcome2013/apc-oag.xlsx differ diff --git a/examples/wellcome2013/oacensus.yaml b/examples/wellcome2013/oacensus.yaml new file mode 100644 index 0000000..9e11f68 --- /dev/null +++ b/examples/wellcome2013/oacensus.yaml @@ -0,0 +1,33 @@ +# Load standard licenses. +- licenses + +# Create articles from xlsx file. +- excelarticles: { + location: "wellcome20.xlsx", + list-name: Wellcome 2012-13, + source: wellcome, + period: 2012-13, + column-mapping : { + publisher : publisher.name, + journal : journal.title, + title : title, + doi : doi + } + } + +# Get publication date and standardize journal title (based on article DOI). +- crossref + +# Get ISSNs based on journal titles. +- crossrefjournals + +# Get openness information from OAG (based on article DOI). +- oag + +# Check pubmed, get external identifiers including PMC (based on article DOI). 
+- pubmed-update-repositories + +# Look up journal info in DOAJ +- doaj + +# TODO look up license in PMC diff --git a/examples/wellcome2013/run-report.sh b/examples/wellcome2013/run-report.sh new file mode 100644 index 0000000..58562d9 --- /dev/null +++ b/examples/wellcome2013/run-report.sh @@ -0,0 +1 @@ +oacensus run --progress --reports "openness-excel" diff --git a/examples/wellcome2013/wellcome.xlsx b/examples/wellcome2013/wellcome.xlsx new file mode 100644 index 0000000..5bb9828 Binary files /dev/null and b/examples/wellcome2013/wellcome.xlsx differ diff --git a/examples/wellcome2013/wellcome20.xlsx b/examples/wellcome2013/wellcome20.xlsx new file mode 100644 index 0000000..427e129 Binary files /dev/null and b/examples/wellcome2013/wellcome20.xlsx differ diff --git a/oacensus/licenses.yaml b/oacensus/licenses.yaml index 898133b..835d037 100644 --- a/oacensus/licenses.yaml +++ b/oacensus/licenses.yaml @@ -26,7 +26,7 @@ cc-by-nd: cc-by-nc-sa: title: Creative Commons Attribution NonCommercial ShareAlike url: https://creativecommons.org/licenses/by-nc-nd-sa/4.0 - aliases: [] + aliases: [cc-BY-NC-SA] cc-zero: title: No Rights Reserved diff --git a/oacensus/load_plugins.py b/oacensus/load_plugins.py index 178c19d..972eb29 100644 --- a/oacensus/load_plugins.py +++ b/oacensus/load_plugins.py @@ -14,9 +14,11 @@ import oacensus.scrapers.rcukgtr import oacensus.scrapers.scimago import oacensus.scrapers.wiley +import oacensus.scrapers.openaire import oacensus.reports.excel_dump import oacensus.reports.oa_excel +import oacensus.reports.pubspeed_excel import oacensus.reports.text_dump import oacensus.reports.personal_openness import oacensus.reports.institution diff --git a/oacensus/models.py b/oacensus/models.py index 78df803..03fecce 100644 --- a/oacensus/models.py +++ b/oacensus/models.py @@ -222,6 +222,12 @@ class Article(ModelBase): help_text="Digital object identifier for article.") date_published = CharField(null=True, help_text="When article was published, in 
YYYY(-MM(-DD)) format.") + date_submitted = CharField(null=True, + help_text="When article was submitted for peer review, in YYYY(-MM(-DD)) format.") + date_accepted = CharField(null=True, + help_text="When article was accepted for publication, in YYYY(-MM(-DD)) format.") + date_aheadofprint = CharField(null=True, + help_text="Ahead of print publication date, generally AOP, in YYYY(-MM(-DD)) format.") period = CharField( help_text="Name of date-based period in which this article was scraped.") url = CharField(null=True, diff --git a/oacensus/reports/pubspeed_excel.py b/oacensus/reports/pubspeed_excel.py new file mode 100644 index 0000000..9b0715e --- /dev/null +++ b/oacensus/reports/pubspeed_excel.py @@ -0,0 +1,82 @@ +from oacensus.models import Article +from oacensus.models import ModelBase +from oacensus.report import Report +import datetime +import inflection +import inspect +import os +import xlwt + +class PubspeedExcel(Report): + """ + An excel report of article submitted, accepted, ahead-of-print and published dates. + """ + + aliases = ['pubspeed-excel'] + + _settings = { + 'filename' : ("Name of file to write excel dump to.", "pubspeed.xls"), + "sheet-name" : ("Name of worksheet.", "Articles"), + 'date-format-string' : ( "Excel style date format string.", "D-MMM-YYYY"), + "fields" : ("Fields to include in report.", [ + "doi", "title", "date_published", + "date_submitted", "date_accepted", "date_aheadofprint", + "journal.title", "journal.issn" + ]) + } + + def run(self): + date_style = xlwt.XFStyle() + date_style.num_format_str = self.setting('date-format-string') + + bold_font = xlwt.Font() + bold_font.bold = True + + bold_style = xlwt.XFStyle() + bold_style.font = bold_font + + filename = self.setting('filename') + if os.path.exists(filename): + os.remove(filename) + + workbook = xlwt.Workbook() + ws = workbook.add_sheet(self.setting('sheet-name')) + + keys = self.setting('fields') + + # Write Headers + for j, k in enumerate(keys): + heading = inflection.titleize(k.replace("_id", "_identifier")) + ws.write(0, j, heading, 
bold_style) + + for i, article in enumerate(Article.select()): + print article + for j, key in enumerate(keys): + if key.startswith("journal."): + value = getattr(article.journal, key.replace("journal.", "")) + elif key.startswith("publisher."): + if article.journal.publisher is not None: + value = getattr(article.journal.publisher, key.replace("publisher.", "")) + else: + value = None + else: + value = getattr(article, key) + + fmt = None + + if isinstance(value, ModelBase): + value = unicode(value) + elif isinstance(value, datetime.date): + fmt = date_style + elif inspect.ismethod(value): + value = value() + else: + pass + + if fmt: + ws.write(i+1, j, value, fmt) + else: + ws.write(i+1, j, value) + + workbook.save(filename) + print " pubspeed report written to %s" % filename diff --git a/oacensus/scrapers/openaire.py b/oacensus/scrapers/openaire.py new file mode 100644 index 0000000..11c4e4d --- /dev/null +++ b/oacensus/scrapers/openaire.py @@ -0,0 +1,50 @@ +__author__ = 'cneylon' +from oacensus.scraper import ArticleScraper +from oacensus.models import Article +from oacensus.models import Repository +from oacensus.models import Instance +import json +import requests +import xml.etree.ElementTree as ET + +class OpenAIRE(ArticleScraper): + """ + Gets accessibility information for articles with DOIs in the database. 
+ """ + aliases = ['openaire'] + + _settings = { + 'base-url' : ("Base url of OpenAIRE API", "http://api.openaire.eu/search/publications"), + } + + def scrape(self): + # don't use scrape method since our query depends on db state, so + # caching will not be accurate + pass + + def process(self): + articles = Article.select().where(~(Article.doi >> None)) + for article in articles: + response = requests.get(self.setting('base-url'), params = {'doi' : article.doi}) + openaire_response = ET.fromstring(response.text.encode('utf-8')) + + for inst in openaire_response.iter('instance'): + reponame = inst.find('hostedby').get('name') + repository = Repository.find_or_create_by_name({'name':reponame, + 'source': 'openaire', + 'log' : 'Created by openaire plugin'}) + + + status = inst.find('licence').get('classname') + ftr = {'Open Access' : True, + 'Closed Access' : False, + 'Embargo' : False, + 'Restricted' : False}.get(status, False) + + url = inst.find('webresource').find('url').text + Instance.create(article = article, + repository = repository, + free_to_read = ftr, + info_url=url, + source=self.db_source(), + log='OpenAIRE response obtained from %s repository' % reponame) \ No newline at end of file diff --git a/oacensus/scrapers/pubmed.py b/oacensus/scrapers/pubmed.py index 9c3e9f5..e342006 100644 --- a/oacensus/scrapers/pubmed.py +++ b/oacensus/scrapers/pubmed.py @@ -97,7 +97,17 @@ def initial_search(self, search_term, start_date=None, end_date=None): 'retMax' : self.setting('initial-ret-max') } + if start_date and end_date: + # There is a bug in Pubmed searches that start on Jan 1. As nothing should + # be published on Jan 1 we can change the start date to Jan 2. 
+ if start_date.month == 1 and start_date.day == 1: + start_date = start_date.replace(day=2) + if end_date.month == 1 and end_date.day == 1: + end_date = end_date.replace(year = end_date.year-1, + month = 12, + day=31) + params.update({ 'datetype' : self.setting('datetype'), 'mindate' : start_date.strftime("%Y/%m/%d"), @@ -124,7 +134,7 @@ def initial_search(self, search_term, start_date=None, end_date=None): return (count, web_env, query_key) - def fetch_batch(self, i, retstart, retmax, web_env, query_key, start_date=None, batch_prefix=None): + def fetch_batch(self, i, retstart, retmax, web_env, query_key, start_date=None, end_date=None, batch_prefix=None): self.print_progress("waiting requested delay time...") time.sleep(self.setting('delay')) msg = "fetching values %s through %s..." % (retstart, retstart+retmax-1) @@ -138,6 +148,14 @@ def fetch_batch(self, i, retstart, retmax, web_env, query_key, start_date=None, 'retmode' : 'xml' } + if start_date and end_date: + params.update({ + 'mindate' : start_date.strftime("%Y/%m/%d"), + 'maxdate' : end_date.strftime("%Y/%m/%d") # TODO is this right? 
+ }) + elif start_date or end_date: + raise Exception("Both start and end date must be provided if either is.") + result = requests.get( self.fetch_url(), params=self.search_params(params), @@ -160,7 +178,7 @@ def search_and_fetch(self, term, start_date = None, end_date = None): retmax = self.setting('ret-max') while retstart < count: - self.fetch_batch(i, retstart, retmax, web_env, query_key, start_date) + self.fetch_batch(i, retstart, retmax, web_env, query_key, start_date, end_date) retstart += retmax i += 1 @@ -183,7 +201,10 @@ def parse_date(self, entry): if year is not None: datestring = '%s %s %s' % (year, month, day) - date = dateutil.parser.parse(datestring) + try: + date = dateutil.parser.parse(datestring) + except ValueError: + return "Unknown" if month_is_none: return date.strftime("%Y") @@ -384,6 +405,9 @@ def process_period(self, start_date, end_date): article_entry = medline_citation.find("Article") journal_entry = article_entry.find("Journal") + pubmed_data = pubmed_article.find("PubmedData") + pubmed_history = pubmed_data.find("History") + # Parse journal info journal_title = journal_entry.findtext("Title") journal_iso = journal_entry.findtext("ISOAbbreviation") @@ -409,12 +433,31 @@ def process_period(self, start_date, end_date): # Parse date info date_published = None journal_pubdate_entry = journal_entry.find("JournalIssue").find("PubDate") + article_date_entry = article_entry.find("ArticleDate") - if journal_pubdate_entry is not None: + if article_date_entry is not None: + date_published = self.parse_date(article_date_entry) + elif journal_pubdate_entry is not None: date_published = self.parse_date(journal_pubdate_entry) + + # Attempt to parse submission and acceptance date + submitted_date_entry = pubmed_history.find("PubMedPubDate[@PubStatus='received']") + if submitted_date_entry is not None: + date_submitted = self.parse_date(submitted_date_entry) else: - print " no journal pub date, skipping article", title - continue + date_submitted = 
'Unknown' + + accepted_date_entry = pubmed_history.find("PubMedPubDate[@PubStatus='accepted']") + if accepted_date_entry is not None: + date_accepted = self.parse_date(accepted_date_entry) + else: + date_accepted = 'Unknown' + + aheadofprint_date_entry = pubmed_history.find("PubMedPubDate[@PubStatus='aheadofprint']") + if aheadofprint_date_entry is not None: + date_aheadofprint = self.parse_date(aheadofprint_date_entry) + else: + date_aheadofprint = 'Unknown' doi_entry = article_entry.find("ELocationID") @@ -439,15 +482,33 @@ def process_period(self, start_date, end_date): assert title is not None - article = Article.create( + # TODO Check if downstream issues created here + if doi is not None: + article = Article.create_or_update_by_doi({ + 'doi' : doi, + 'title' : title, + 'journal' : journal, + 'period' : start_date.strftime("%Y-%m"), + 'date_published' : date_published, + 'date_submitted' : date_submitted, + 'date_accepted' : date_accepted, + 'date_aheadofprint' : date_aheadofprint, + 'source' : self.db_source(), + 'log' : self.db_source()}) + + else: + article = Article.create( title = title, doi = doi, journal = journal, period = start_date.strftime("%Y-%m"), date_published = date_published, + date_submitted = date_submitted, + date_accepted = date_accepted, + date_aheadofprint = date_aheadofprint, source = self.db_source(), log = self.db_source()) - + if nihm_id is not None: Instance.create( article=article, diff --git a/tests/test_openaire_scraper.py b/tests/test_openaire_scraper.py new file mode 100644 index 0000000..da4a5ca --- /dev/null +++ b/tests/test_openaire_scraper.py @@ -0,0 +1,81 @@ +__author__ = 'cneylon' +from oacensus.commands import defaults +from oacensus.models import Article +from oacensus.scraper import Scraper +from oacensus.models import Repository +from oacensus.models import Instance +from tests.utils import setup_db +setup_db() + +test_doi_open = '10.1371/journal.pone.0001164' +test_doi_embargoed = '10.1007/s00024-004-0394-3' +test_doi_closed = '10.1063/1.3663569' 
+test_doi_restricted = '10.1111/j.1365-2125.2009.03481.x' + +# This is a DOI from the Crossref Labs 'Journal of Psychoceramics' which should never appear in OpenAIRE +test_doi_no_response = '10.5555/12345678' + +def test_openaire_scraper(): + dois = [ + test_doi_open, + test_doi_embargoed, + test_doi_closed, + test_doi_restricted, + test_doi_no_response + ] + + doilist = Scraper.create_instance("doilist") + doilist.update_settings({"doi-list" : dois }) + doilist.run() + + # Test cases # + ############## + # Does the scraper run properly? + # Are the relevant repositories created? + # Are all DOIs that should be returned? + + # Scraper runs successfully + scraper = Scraper.create_instance("openaire") + scraper.run() + + # Repositories created properly + r1 = Repository.select() + repos = [r for r in r1] + assert len(repos) > 3 + + names = [r.name for r in r1] + assert 'Oxford University Research Archive' in names + assert 'Europe PubMed Central' in names + assert 'Surrey Research Insight' in names + assert 'DSpace at VSB Technical University of Ostrava' in names + + # All appropriate DOIs returned + for d in dois[0:4]: + a = Article.select().where(Article.doi == d)[0] + instances = [inst for inst in a.instances] + assert len(instances) > 0 + for inst in instances: + assert inst.free_to_read is not None + + # Nonexistent doi not returned + a = Article.select().where(Article.doi == test_doi_no_response)[0] + instances = [inst for inst in a.instances] + assert len(instances) == 0 + + # Test correct answer for 'open' case + a = Article.select().where(Article.doi == test_doi_open)[0] + instances = [inst for inst in a.instances] + for inst in instances: + assert inst.free_to_read + + # Test correct answer for non-open cases + for doi in [test_doi_restricted, test_doi_closed, test_doi_embargoed]: + a = Article.select().where(Article.doi == doi)[0] + instances = [inst for inst in a.instances] + for inst in instances: + print inst.repository.name, inst.free_to_read + assert 
not inst.free_to_read + + + +