From 775b991de10cc076a0d4b4dbbb8e73475050ccd7 Mon Sep 17 00:00:00 2001 From: Steve Majewski Date: Fri, 12 Apr 2019 16:01:52 -0400 Subject: [PATCH 01/15] test using (local) updated pyoai module --- oaiharvest/harvest.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/oaiharvest/harvest.py b/oaiharvest/harvest.py index 68194aa..d74f3f7 100644 --- a/oaiharvest/harvest.py +++ b/oaiharvest/harvest.py @@ -109,7 +109,7 @@ def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs): # Generator to yield records from baseUrl in the given metadataPrefix # Add metatdataPrefix to args kwargs['metadataPrefix'] = metadataPrefix - client = Client(baseUrl, self._mdRegistry) + client = Client(baseUrl, self._mdRegistry, recover=True) incremental_range = kwargs.pop('between', None) # Check that baseUrl actually represents an OAI-PMH target try: @@ -508,8 +508,10 @@ def parse_time(argument): ch.setLevel(logging.DEBUG) formatter = logging.Formatter('%(levelname)-8s %(message)s') ch.setFormatter(formatter) -logging.getLogger(__name__).addHandler(ch) +logging.getLogger().addHandler(ch) +from lxml import etree +etree.use_global_python_log(etree.PyErrorLog()) if __name__ == "__main__": sys.exit(main()) From d9631394643f74979e76b77bb871439c78de2390 Mon Sep 17 00:00:00 2001 From: Steve Majewski Date: Fri, 12 Apr 2019 17:38:10 -0400 Subject: [PATCH 02/15] swap streams in basicConfig --- oaiharvest/harvest.py | 23 +++++++++++++++-------- oaiharvest/registry.py | 15 ++++++++++----- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/oaiharvest/harvest.py b/oaiharvest/harvest.py index d74f3f7..995419d 100644 --- a/oaiharvest/harvest.py +++ b/oaiharvest/harvest.py @@ -500,18 +500,25 @@ def parse_time(argument): # Set up logger logging.basicConfig( level=logging.DEBUG, - format='%(asctime)s %(name)-16s %(levelname)-8s %(message)s', - datefmt='[%Y-%m-%d %H:%M:%S]', - filename=os.path.join(appdir, 'harvest.log')) - -ch = logging.StreamHandler() + 
format='%(levelname)-8s %(message)s', + # format='%(asctime)s %(name)-16s %(levelname)-8s %(message)s', + # datefmt='[%Y-%m-%d %H:%M:%S]', + # filename=os.path.join(appdir, 'harvest.log') + ) + +#ch = logging.StreamHandler() +ch = logging.FileHandler( os.path.join( appdir, 'harvest.log')) ch.setLevel(logging.DEBUG) -formatter = logging.Formatter('%(levelname)-8s %(message)s') +formatter = logging.Formatter( + '%(asctime)s %(name)-16s %(levelname)-8s %(message)s', + '[%Y-%m-%d %H:%M:%S]') +#formatter = logging.Formatter('%(levelname)-8s %(message)s') ch.setFormatter(formatter) -logging.getLogger().addHandler(ch) +logging.getLogger(__name__).addHandler(ch) from lxml import etree -etree.use_global_python_log(etree.PyErrorLog()) + +etree.use_global_python_log(etree.PyErrorLog(logger=logging.getLogger(__name__).getChild('XMLParser'))) if __name__ == "__main__": sys.exit(main()) diff --git a/oaiharvest/registry.py b/oaiharvest/registry.py index 7932db0..226c16b 100644 --- a/oaiharvest/registry.py +++ b/oaiharvest/registry.py @@ -340,14 +340,19 @@ def main(argv=None): # Set up logger logging.basicConfig( level=logging.DEBUG, - format='%(asctime)s %(name)-16s %(levelname)-8s %(message)s', - datefmt='[%Y-%m-%d %H:%M:%S]', - filename=os.path.join(appdir, 'registry.log') + format='%(levelname)-8s %(message)s' + # format='%(asctime)s %(name)-16s %(levelname)-8s %(message)s', + # datefmt='[%Y-%m-%d %H:%M:%S]', + # filename=os.path.join(appdir, 'registry.log') ) logger = logging.getLogger(__name__) -ch = logging.StreamHandler() +# ch = logging.StreamHandler() +ch = logging.FileHandler( os.path.join( appdir, 'registry.log')) ch.setLevel(logging.DEBUG) -formatter = logging.Formatter('%(levelname)-8s %(message)s') +# formatter = logging.Formatter('%(levelname)-8s %(message)s') +formatter = logging.Formatter( + '%(asctime)s %(name)-16s %(levelname)-8s %(message)s', + '[%Y-%m-%d %H:%M:%S]' ) ch.setFormatter(formatter) logger.addHandler(ch) From 
4880252867126e705a4219439b2bff4422c2c217 Mon Sep 17 00:00:00 2001 From: Steve Majewski Date: Wed, 17 Apr 2019 15:48:01 -0400 Subject: [PATCH 03/15] finally think the logging is working correctly from PyErrorLog subclass --- oaiharvest/harvest.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/oaiharvest/harvest.py b/oaiharvest/harvest.py index 995419d..eef2612 100644 --- a/oaiharvest/harvest.py +++ b/oaiharvest/harvest.py @@ -129,6 +129,9 @@ def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs): if isinstance(metadata, str) and metadata.startswith("b'"): metadata = ast.literal_eval(metadata).decode("utf-8") yield (header, metadata, about) + if client.XMLParser.error_log : + logging.getLogger(__name__).getChild('XMLParser').warning( + 'Recoverable XMLParser error on: %s', header.identifier() ) self.maybe_pause_if_incremental(incremental_range) def harvest(self, baseUrl, metadataPrefix, **kwargs): @@ -518,7 +521,19 @@ def parse_time(argument): from lxml import etree -etree.use_global_python_log(etree.PyErrorLog(logger=logging.getLogger(__name__).getChild('XMLParser'))) +class XMLErrorLog( etree.PyErrorLog ): + new_map = { 1: 30, 2: 30, 3: 30 } + def __init__( self, *args, **kwargs ): + etree.PyErrorLog.__init__( self, *args, **kwargs ) + self.level_map.update( self.new_map ) + def receive(self, log_entry ): + logrepr = "%s:%d:%d:%s%s.%s:[%s]" % ( + log_entry.filename, log_entry.line, log_entry.column, "", + log_entry.domain_name, log_entry.type_name, log_entry.message) + self.log( log_entry, logrepr ) + +etree.use_global_python_log(XMLErrorLog(logger=logging.getLogger(__name__).getChild('XMLParser'))) + if __name__ == "__main__": sys.exit(main()) From f2d389b126784379cc19fbc2dbbff35d9c99c2fb Mon Sep 17 00:00:00 2001 From: Steve Majewski Date: Thu, 18 Apr 2019 16:46:42 -0400 Subject: [PATCH 04/15] use local version of client instead of oaipmh version until/unless it gets updated --- oaiharvest/client.py | 425 
++++++++++++++++++++++++++++++++++++++++++ oaiharvest/harvest.py | 2 +- 2 files changed, 426 insertions(+), 1 deletion(-) create mode 100644 oaiharvest/client.py diff --git a/oaiharvest/client.py b/oaiharvest/client.py new file mode 100644 index 0000000..138e12c --- /dev/null +++ b/oaiharvest/client.py @@ -0,0 +1,425 @@ +# Copyright 2003, 2004, 2005 Infrae +# Released under the BSD license (see LICENSE.txt) +from __future__ import nested_scopes +from __future__ import absolute_import + +import six + +try: + import urllib.request as urllib2 + from urllib.parse import urlencode +except ImportError: + import urllib2 + from urllib import urlencode + +import sys +import base64 +from lxml import etree +import time +import codecs + +from oaipmh import common, metadata, validation, error +from oaipmh.datestamp import datestamp_to_datetime, datetime_to_datestamp + +WAIT_DEFAULT = 120 # two minutes +WAIT_MAX = 5 + +class Error(Exception): + pass + + +class BaseClient(common.OAIPMH): + # retry policy on error. Default is to retry request `WAIT_MAX` times + # on HTTP 503 errors, waiting `WAIT_DEFAULT` before each retry + default_retry_policy = { + # how many seconds should we wait before each retry + 'wait-default': WAIT_DEFAULT, + # how many times should we retry + 'retry': WAIT_MAX, + # which HTTP codes are expected + 'expected-errcodes': {503}, + } + + def __init__(self, metadata_registry=None, custom_retry_policy=None, recover=False): + self._metadata_registry = ( + metadata_registry or metadata.global_metadata_registry) + self._ignore_bad_character_hack = 0 + self._day_granularity = False + self.retry_policy = self.default_retry_policy.copy() + if custom_retry_policy is not None: + self.retry_policy.update(custom_retry_policy) + self.XMLParser = etree.XMLParser(recover=recover) + + def updateGranularity(self): + """Update the granularity setting dependent on that the server says. 
+ """ + identify = self.identify() + granularity = identify.granularity() + if granularity == 'YYYY-MM-DD': + self._day_granularity = True + elif granularity == 'YYYY-MM-DDThh:mm:ssZ': + self._day_granularity= False + else: + raise Error("Non-standard granularity on server: %s" % granularity) + + def handleVerb(self, verb, kw): + # validate kw first + validation.validateArguments(verb, kw) + # encode datetimes as datestamps + from_ = kw.get('from_') + if from_ is not None: + # turn it into 'from', not 'from_' before doing actual request + kw['from'] = datetime_to_datestamp(from_, + self._day_granularity) + if 'from_' in kw: + # always remove it from the kw, no matter whether it be None or not + del kw['from_'] + + until = kw.get('until') + if until is not None: + kw['until'] = datetime_to_datestamp(until, + self._day_granularity) + elif 'until' in kw: + # until is None but is explicitly in kw, remove it + del kw['until'] + + # now call underlying implementation + method_name = verb + '_impl' + return getattr(self, method_name)( + kw, self.makeRequestErrorHandling(verb=verb, **kw)) + + def getNamespaces(self): + """Get OAI namespaces. + """ + return {'oai': 'http://www.openarchives.org/OAI/2.0/'} + + def getMetadataRegistry(self): + """Return the metadata registry in use. + + Do we want to allow the returning of the global registry? + """ + return self._metadata_registry + + def ignoreBadCharacters(self, true_or_false): + """Set to ignore bad characters in UTF-8 input. + This is a hack to get around well-formedness errors of + input sources which *should* be in UTF-8 but for some reason + aren't completely. + """ + self._ignore_bad_character_hack = true_or_false + + def parse(self, xml): + """Parse the XML to a lxml tree. 
+ """ + # XXX this is only safe for UTF-8 encoded content, + # and we're basically hacking around non-wellformedness anyway, + # but oh well + if self._ignore_bad_character_hack: + xml = six.text_type(xml, 'UTF-8', 'replace') + # also get rid of character code 12 + xml = xml.replace(chr(12), '?') + xml = xml.encode('UTF-8') + if six.PY3: + if hasattr(xml, "encode"): + xml = xml.encode("utf-8") + # xml = xml.encode("utf-8") + return etree.XML(xml, parser=self.XMLParser) + + # implementation of the various methods, delegated here by + # handleVerb method + + def GetRecord_impl(self, args, tree): + records, token = self.buildRecords( + args['metadataPrefix'], + self.getNamespaces(), + self._metadata_registry, + tree + ) + assert token is None + return records[0] + + def GetMetadata_impl(self, args, tree): + return tree + + def Identify_impl(self, args, tree): + namespaces = self.getNamespaces() + evaluator = etree.XPathEvaluator(tree, namespaces=namespaces) + identify_node = evaluator.evaluate( + '/oai:OAI-PMH/oai:Identify')[0] + identify_evaluator = etree.XPathEvaluator(identify_node, + namespaces=namespaces) + e = identify_evaluator.evaluate + + repositoryName = e('string(oai:repositoryName/text())') + baseURL = e('string(oai:baseURL/text())') + protocolVersion = e('string(oai:protocolVersion/text())') + adminEmails = e('oai:adminEmail/text()') + earliestDatestamp = datestamp_to_datetime( + e('string(oai:earliestDatestamp/text())')) + deletedRecord = e('string(oai:deletedRecord/text())') + granularity = e('string(oai:granularity/text())') + compression = e('oai:compression/text()') + # XXX description + identify = common.Identify( + repositoryName, baseURL, protocolVersion, + adminEmails, earliestDatestamp, + deletedRecord, granularity, compression) + return identify + + def ListIdentifiers_impl(self, args, tree): + namespaces = self.getNamespaces() + def firstBatch(): + return self.buildIdentifiers(namespaces, tree) + def nextBatch(token): + tree = 
self.makeRequestErrorHandling(verb='ListIdentifiers', + resumptionToken=token) + return self.buildIdentifiers(namespaces, tree) + return ResumptionListGenerator(firstBatch, nextBatch) + + def ListMetadataFormats_impl(self, args, tree): + namespaces = self.getNamespaces() + evaluator = etree.XPathEvaluator(tree, + namespaces=namespaces) + + metadataFormat_nodes = evaluator.evaluate( + '/oai:OAI-PMH/oai:ListMetadataFormats/oai:metadataFormat') + metadataFormats = [] + for metadataFormat_node in metadataFormat_nodes: + e = etree.XPathEvaluator(metadataFormat_node, + namespaces=namespaces).evaluate + metadataPrefix = e('string(oai:metadataPrefix/text())') + schema = e('string(oai:schema/text())') + metadataNamespace = e('string(oai:metadataNamespace/text())') + metadataFormat = (metadataPrefix, schema, metadataNamespace) + metadataFormats.append(metadataFormat) + + return metadataFormats + + def ListRecords_impl(self, args, tree): + namespaces = self.getNamespaces() + metadata_prefix = args['metadataPrefix'] + metadata_registry = self._metadata_registry + def firstBatch(): + return self.buildRecords( + metadata_prefix, namespaces, + metadata_registry, tree) + def nextBatch(token): + tree = self.makeRequestErrorHandling( + verb='ListRecords', + resumptionToken=token) + return self.buildRecords( + metadata_prefix, namespaces, + metadata_registry, tree) + return ResumptionListGenerator(firstBatch, nextBatch) + + def ListSets_impl(self, args, tree): + namespaces = self.getNamespaces() + def firstBatch(): + return self.buildSets(namespaces, tree) + def nextBatch(token): + tree = self.makeRequestErrorHandling( + verb='ListSets', + resumptionToken=token) + return self.buildSets(namespaces, tree) + return ResumptionListGenerator(firstBatch, nextBatch) + + # various helper methods + + def buildRecords(self, + metadata_prefix, namespaces, metadata_registry, tree): + # first find resumption token if available + evaluator = etree.XPathEvaluator(tree, + namespaces=namespaces) + 
token = evaluator.evaluate( + 'string(/oai:OAI-PMH/*/oai:resumptionToken/text())') + if token.strip() == '': + token = None + record_nodes = evaluator.evaluate( + '/oai:OAI-PMH/*/oai:record') + result = [] + for record_node in record_nodes: + record_evaluator = etree.XPathEvaluator(record_node, + namespaces=namespaces) + e = record_evaluator.evaluate + # find header node + header_node = e('oai:header')[0] + # create header + header = buildHeader(header_node, namespaces) + # find metadata node + metadata_list = e('oai:metadata') + if metadata_list: + metadata_node = metadata_list[0] + # create metadata + metadata = metadata_registry.readMetadata(metadata_prefix, + metadata_node) + else: + metadata = None + # XXX TODO: about, should be third element of tuple + result.append((header, metadata, None)) + return result, token + + def buildIdentifiers(self, namespaces, tree): + evaluator = etree.XPathEvaluator(tree, + namespaces=namespaces) + # first find resumption token is available + token = evaluator.evaluate( + 'string(/oai:OAI-PMH/*/oai:resumptionToken/text())') + #'string(/oai:OAI-PMH/oai:ListIdentifiers/oai:resumptionToken/text())') + if token.strip() == '': + token = None + header_nodes = evaluator.evaluate( + '/oai:OAI-PMH/oai:ListIdentifiers/oai:header') + result = [] + for header_node in header_nodes: + header = buildHeader(header_node, namespaces) + result.append(header) + return result, token + + def buildSets(self, namespaces, tree): + evaluator = etree.XPathEvaluator(tree, + namespaces=namespaces) + # first find resumption token if available + token = evaluator.evaluate( + 'string(/oai:OAI-PMH/oai:ListSets/oai:resumptionToken/text())') + if token.strip() == '': + token = None + set_nodes = evaluator.evaluate( + '/oai:OAI-PMH/oai:ListSets/oai:set') + sets = [] + for set_node in set_nodes: + e = etree.XPathEvaluator(set_node, + namespaces=namespaces).evaluate + # make sure we get back unicode strings instead + # of lxml.etree._ElementUnicodeResult objects. 
+ setSpec = six.text_type(e('string(oai:setSpec/text())')) + setName = six.text_type(e('string(oai:setName/text())')) + # XXX setDescription nodes + sets.append((setSpec, setName, None)) + return sets, token + + def makeRequestErrorHandling(self, **kw): + xml = self.makeRequest(**kw) + try: + tree = self.parse(xml) + except SyntaxError: + raise error.XMLSyntaxError(kw) + # check whether there are errors first + e_errors = tree.xpath('/oai:OAI-PMH/oai:error', + namespaces=self.getNamespaces()) + if e_errors: + # XXX right now only raise first error found, does not + # collect error info + for e_error in e_errors: + code = e_error.get('code') + msg = e_error.text + if code not in ['badArgument', 'badResumptionToken', + 'badVerb', 'cannotDisseminateFormat', + 'idDoesNotExist', 'noRecordsMatch', + 'noMetadataFormats', 'noSetHierarchy']: + raise error.UnknownError( + "Unknown error code from server: %s, message: %s" % ( + code, msg)) + # find exception in error module and raise with msg + raise getattr(error, code[0].upper() + code[1:] + 'Error')(msg) + return tree + + def makeRequest(self, **kw): + raise NotImplementedError + +class Client(BaseClient): + + def __init__(self, base_url, metadata_registry=None, credentials=None, + local_file=False, force_http_get=False, custom_retry_policy=None, + recover=False): + BaseClient.__init__(self, metadata_registry, + custom_retry_policy=custom_retry_policy, recover=recover) + self._base_url = base_url + self._local_file = local_file + self._force_http_get = force_http_get + if credentials is not None: + self._credentials = base64.encodestring('%s:%s' % credentials) + else: + self._credentials = None + + def makeRequest(self, **kw): + """Either load a local XML file or actually retrieve XML from a server. + """ + if self._local_file: + with codecs.open(self._base_url, 'r', 'utf-8') as xmlfile: + text = xmlfile.read() + return text.encode('ascii', 'replace') + else: + # XXX include From header? 
+ headers = {'User-Agent': 'pyoai'} + if self._credentials is not None: + headers['Authorization'] = 'Basic ' + self._credentials.strip() + if self._force_http_get: + request_url = '%s?%s' % (self._base_url, urlencode(kw)) + request = urllib2.Request(request_url, headers=headers) + else: + binary_data = urlencode(kw).encode('utf-8') + request = urllib2.Request( + self._base_url, data=binary_data, headers=headers) + + return retrieveFromUrlWaiting( + request, + wait_max=self.retry_policy['retry'], + wait_default=self.retry_policy['wait-default'], + expected_errcodes=self.retry_policy['expected-errcodes'] + ) + +def buildHeader(header_node, namespaces): + e = etree.XPathEvaluator(header_node, + namespaces=namespaces).evaluate + identifier = e('string(oai:identifier/text())') + datestamp = datestamp_to_datetime( + str(e('string(oai:datestamp/text())'))) + setspec = [str(s) for s in e('oai:setSpec/text()')] + deleted = e("@status = 'deleted'") + return common.Header(header_node, identifier, datestamp, setspec, deleted) + +def ResumptionListGenerator(firstBatch, nextBatch): + result, token = firstBatch() + while 1: + for item in result: + yield item + if token is None: + break + result, token = nextBatch(token) + +def retrieveFromUrlWaiting(request, + wait_max=WAIT_MAX, wait_default=WAIT_DEFAULT, + expected_errcodes={503}): + """Get text from URL, handling 503 Retry-After. 
+ """ + for i in list(range(wait_max)): + try: + f = urllib2.urlopen(request) + text = f.read() + f.close() + # we successfully opened without having to wait + break + except urllib2.HTTPError as e: + if e.code in expected_errcodes: + try: + retryAfter = int(e.hdrs.get('Retry-After')) + except TypeError: + retryAfter = None + if retryAfter is None: + time.sleep(wait_default) + else: + time.sleep(retryAfter) + else: + # reraise any other HTTP error + raise + else: + raise Error("Waited too often (more than %s times)" % wait_max) + return text + +class ServerClient(BaseClient): + def __init__(self, server, metadata_registry=None): + BaseClient.__init__(self, metadata_registry) + self._server = server + + def makeRequest(self, **kw): + return self._server.handleRequest(kw) diff --git a/oaiharvest/harvest.py b/oaiharvest/harvest.py index eef2612..0c1591a 100644 --- a/oaiharvest/harvest.py +++ b/oaiharvest/harvest.py @@ -65,7 +65,7 @@ from time import sleep import six.moves.urllib.parse as urllib -from oaipmh.client import Client +from .client import Client from oaipmh.error import NoRecordsMatchError from six import string_types From f76e75a58c24e946f5391c4101e3b095a3179071 Mon Sep 17 00:00:00 2001 From: Steve Majewski Date: Thu, 18 Apr 2019 17:02:25 -0400 Subject: [PATCH 05/15] only use recover=True parser option if that is passed as keyword to harvester --- oaiharvest/harvest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/oaiharvest/harvest.py b/oaiharvest/harvest.py index 0c1591a..8c8ebbf 100644 --- a/oaiharvest/harvest.py +++ b/oaiharvest/harvest.py @@ -109,7 +109,7 @@ def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs): # Generator to yield records from baseUrl in the given metadataPrefix # Add metatdataPrefix to args kwargs['metadataPrefix'] = metadataPrefix - client = Client(baseUrl, self._mdRegistry, recover=True) + client = Client(baseUrl, self._mdRegistry, recover=kwargs.pop('recover', False)) incremental_range = 
kwargs.pop('between', None) # Check that baseUrl actually represents an OAI-PMH target try: @@ -339,6 +339,7 @@ def main(argv=None): try: completed = harvester.harvest(baseUrl, args.metadataPrefix, + recover=True, **kwargs ) except NoRecordsMatchError: From df558d9a95fdd33cf777249f4ad3903a69531164 Mon Sep 17 00:00:00 2001 From: Steve Majewski Date: Thu, 18 Apr 2019 18:27:42 -0400 Subject: [PATCH 06/15] use level symbolic names; add command line option to control recover parser --- oaiharvest/harvest.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/oaiharvest/harvest.py b/oaiharvest/harvest.py index 8c8ebbf..4742179 100644 --- a/oaiharvest/harvest.py +++ b/oaiharvest/harvest.py @@ -339,7 +339,7 @@ def main(argv=None): try: completed = harvester.harvest(baseUrl, args.metadataPrefix, - recover=True, + recover=args.recover, **kwargs ) except NoRecordsMatchError: @@ -491,6 +491,19 @@ def parse_time(argument): help=("create target subdirs based on occurrences of the given character" "in identifiers")) +# XMLParser( recover=? 
) +group = argparser.add_mutually_exclusive_group(required=False) +group.set_defaults(recover=True) +group.add_argument( + '--recover', + action='store_true', + dest='recover', + ) +group.add_argument( + '--no-recover', + action='store_false', + dest='recover', + ) # Set up metadata registry xmlReader = XMLMetadataReader() @@ -523,7 +536,11 @@ def parse_time(argument): from lxml import etree class XMLErrorLog( etree.PyErrorLog ): - new_map = { 1: 30, 2: 30, 3: 30 } + new_map = { + etree.ErrorLevels.WARNING : logging.WARNING, + etree.ErrorLevels.ERROR : logging.WARNING, + etree.ErrorLevels.FATAL : logging.WARNING, + } def __init__( self, *args, **kwargs ): etree.PyErrorLog.__init__( self, *args, **kwargs ) self.level_map.update( self.new_map ) From e2ae4d4cf4c494fef124859b965aa26a2e1ee866 Mon Sep 17 00:00:00 2001 From: Steve Majewski Date: Thu, 18 Apr 2019 18:37:26 -0400 Subject: [PATCH 07/15] add help text for recover --- oaiharvest/harvest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/oaiharvest/harvest.py b/oaiharvest/harvest.py index 4742179..39a33cf 100644 --- a/oaiharvest/harvest.py +++ b/oaiharvest/harvest.py @@ -498,6 +498,8 @@ def parse_time(argument): '--recover', action='store_true', dest='recover', + help=("create XMLParser with (recover=True) option: " + "parser will try to continue to parse broken XML payloads") ) group.add_argument( '--no-recover', From 1018ebb3b222cc2efeee78104fbdc54187edc761 Mon Sep 17 00:00:00 2001 From: Steve Majewski Date: Fri, 19 Apr 2019 14:42:59 -0400 Subject: [PATCH 08/15] check len as well, otherwise mock tests were firing off this log message --- oaiharvest/harvest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/oaiharvest/harvest.py b/oaiharvest/harvest.py index 39a33cf..d100bb6 100644 --- a/oaiharvest/harvest.py +++ b/oaiharvest/harvest.py @@ -129,7 +129,7 @@ def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs): if isinstance(metadata, str) and metadata.startswith("b'"): metadata 
= ast.literal_eval(metadata).decode("utf-8") yield (header, metadata, about) - if client.XMLParser.error_log : + if client.XMLParser.error_log and len(client.XMLParser.error_log) > 0 : logging.getLogger(__name__).getChild('XMLParser').warning( 'Recoverable XMLParser error on: %s', header.identifier() ) self.maybe_pause_if_incremental(incremental_range) From 51dd080af6d713e4fec47cf1f1d36c584db787f4 Mon Sep 17 00:00:00 2001 From: Steve Majewski Date: Fri, 19 Apr 2019 14:54:28 -0400 Subject: [PATCH 09/15] add missing whitespace in help strings --- oaiharvest/harvest.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/oaiharvest/harvest.py b/oaiharvest/harvest.py index d100bb6..ffb4434 100644 --- a/oaiharvest/harvest.py +++ b/oaiharvest/harvest.py @@ -450,7 +450,7 @@ def parse_time(argument): '-d', '--dir', dest='dir', - help=("where to output files for harvested records." + help=("where to output files for harvested records. " "default: current working path")) # What to do about deletions group = argparser.add_mutually_exclusive_group() @@ -480,7 +480,7 @@ def parse_time(argument): "--create-subdirs", action='store_true', dest='subdirs', - help=("create target subdirs (based on / characters in identifiers) if" + help=("create target subdirs (based on / characters in identifiers) if " "they don't exist. To use something other than /, use the newer" "--subdirs-on option") ) @@ -493,7 +493,7 @@ def parse_time(argument): # XMLParser( recover=? 
) group = argparser.add_mutually_exclusive_group(required=False) -group.set_defaults(recover=True) +group.set_defaults(recover=False) group.add_argument( '--recover', action='store_true', @@ -505,6 +505,7 @@ def parse_time(argument): '--no-recover', action='store_false', dest='recover', + help=( "default is --no-recover" ) ) # Set up metadata registry From d3d020e19eac3fbc99a5b9de3ffa841f796ee10d Mon Sep 17 00:00:00 2001 From: Steve Majewski Date: Fri, 26 Apr 2019 12:29:36 -0400 Subject: [PATCH 10/15] remove commented code, add to doc at top --- oaiharvest/harvest.py | 10 +++++----- oaiharvest/registry.py | 19 +++++++------------ 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/oaiharvest/harvest.py b/oaiharvest/harvest.py index ffb4434..2588366 100644 --- a/oaiharvest/harvest.py +++ b/oaiharvest/harvest.py @@ -45,6 +45,10 @@ other than /, use the newer--subdirs-on option --subdirs-on SUBDIRS create target subdirs based on occurrences of the given characterin identifiers + --recover create XMLParser with (recover=True) option: parser + will try to continue to parse broken XML payloads + --no-recover default is --no-recover + Copyright (c) 2013, the University of Liverpool . All rights reserved. 
@@ -129,7 +133,7 @@ def _listRecords(self, baseUrl, metadataPrefix="oai_dc", **kwargs): if isinstance(metadata, str) and metadata.startswith("b'"): metadata = ast.literal_eval(metadata).decode("utf-8") yield (header, metadata, about) - if client.XMLParser.error_log and len(client.XMLParser.error_log) > 0 : + if client.XMLParser.error_log and len(client.XMLParser.error_log) > 0: logging.getLogger(__name__).getChild('XMLParser').warning( 'Recoverable XMLParser error on: %s', header.identifier() ) self.maybe_pause_if_incremental(incremental_range) @@ -521,9 +525,6 @@ def parse_time(argument): logging.basicConfig( level=logging.DEBUG, format='%(levelname)-8s %(message)s', - # format='%(asctime)s %(name)-16s %(levelname)-8s %(message)s', - # datefmt='[%Y-%m-%d %H:%M:%S]', - # filename=os.path.join(appdir, 'harvest.log') ) #ch = logging.StreamHandler() @@ -532,7 +533,6 @@ def parse_time(argument): formatter = logging.Formatter( '%(asctime)s %(name)-16s %(levelname)-8s %(message)s', '[%Y-%m-%d %H:%M:%S]') -#formatter = logging.Formatter('%(levelname)-8s %(message)s') ch.setFormatter(formatter) logging.getLogger(__name__).addHandler(ch) diff --git a/oaiharvest/registry.py b/oaiharvest/registry.py index 226c16b..690e8f1 100644 --- a/oaiharvest/registry.py +++ b/oaiharvest/registry.py @@ -45,10 +45,10 @@ def add_provider(cxn, args): """Add a new provider to the registry database. - + Process ``args`` to add a new provider to the registry database. Return 0 for success, 1 for failure (error message should be logged). - + ``cxn`` => instance of ``sqlite3.Connection`` ``args`` => instance of ``argparse.Namespace`` """ @@ -148,10 +148,10 @@ def add_provider(cxn, args): def rm_provider(cxn, args): """Remove existing provider(s) from the registry database. - + Process ``args`` to remove provider(s) to the registry database. Return 0 for success, 1 for failure (error message should be logged). 
- + ``cxn`` => instance of ``sqlite3.Connection`` ``args`` => instance of ``argparse.Namespace`` """ @@ -174,10 +174,10 @@ def rm_provider(cxn, args): def list_providers(cxn, args): """List provider(s) currently in the registry database. - + Process ``args`` to remove provider(s) to the registry database. Return 0 for success, 1 for failure (error message should be logged). - + ``cxn`` => instance of ``sqlite3.Connection`` ``args`` => instance of ``argparse.Namespace`` """ @@ -228,7 +228,7 @@ def verify_database(path): return 1 # Verify that table exists try: - + cxn.execute('SELECT name FROM providers') except sqlite3.OperationalError: # Create the table @@ -341,15 +341,10 @@ def main(argv=None): logging.basicConfig( level=logging.DEBUG, format='%(levelname)-8s %(message)s' - # format='%(asctime)s %(name)-16s %(levelname)-8s %(message)s', - # datefmt='[%Y-%m-%d %H:%M:%S]', - # filename=os.path.join(appdir, 'registry.log') ) logger = logging.getLogger(__name__) -# ch = logging.StreamHandler() ch = logging.FileHandler( os.path.join( appdir, 'registry.log')) ch.setLevel(logging.DEBUG) -# formatter = logging.Formatter('%(levelname)-8s %(message)s') formatter = logging.Formatter( '%(asctime)s %(name)-16s %(levelname)-8s %(message)s', '[%Y-%m-%d %H:%M:%S]' ) From 57287e635b56db9d9683b391311cf8e392d2ecb1 Mon Sep 17 00:00:00 2001 From: Steve Majewski Date: Thu, 2 May 2019 15:17:40 -0400 Subject: [PATCH 11/15] if recover=True, also need to be less strict with xpaths in OAI wrapper; sometimes mismatched tag problems can propagate outwards to container and resumptionToken not found in exact path. 
--- oaiharvest/client.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/oaiharvest/client.py b/oaiharvest/client.py index 138e12c..591dba1 100644 --- a/oaiharvest/client.py +++ b/oaiharvest/client.py @@ -49,6 +49,7 @@ def __init__(self, metadata_registry=None, custom_retry_policy=None, recover=Fal if custom_retry_policy is not None: self.retry_policy.update(custom_retry_policy) self.XMLParser = etree.XMLParser(recover=recover) + self.recover = recover def updateGranularity(self): """Update the granularity setting dependent on that the server says. @@ -230,10 +231,15 @@ def buildRecords(self, # first find resumption token if available evaluator = etree.XPathEvaluator(tree, namespaces=namespaces) - token = evaluator.evaluate( - 'string(/oai:OAI-PMH/*/oai:resumptionToken/text())') + if self.recover : + token = evaluator.evaluate( + 'string(//oai:resumptionToken/text())') + else: + token = evaluator.evaluate( + 'string(/oai:OAI-PMH/*/oai:resumptionToken/text())') if token.strip() == '': token = None + record_nodes = evaluator.evaluate( '/oai:OAI-PMH/*/oai:record') result = [] From 7e73b33b94f37888de5dcc63c8cb5ebd47fa07fe Mon Sep 17 00:00:00 2001 From: Steve Majewski Date: Fri, 24 Apr 2020 17:11:08 -0400 Subject: [PATCH 12/15] otherwise files are being written as us-ascii w numerical character entities use unicode, not utf-8, so that it works under python 2.x as well as 3 --- oaiharvest/metadata.py | 1 + 1 file changed, 1 insertion(+) diff --git a/oaiharvest/metadata.py b/oaiharvest/metadata.py index 5ccb6ad..cbfc20b 100644 --- a/oaiharvest/metadata.py +++ b/oaiharvest/metadata.py @@ -50,6 +50,7 @@ def __call__(self, metadata_element): [six.text_type( tostring(rec_element, method="xml", + encoding="unicode", pretty_print=True)) for rec_element in metadata_element]) From c162d1594e589b07fc05623cd52d58d86b5e0bae Mon Sep 17 00:00:00 2001 From: Steve Majewski Date: Mon, 4 Apr 2022 14:24:37 -0400 Subject: [PATCH 13/15] Had been getting SSL 
certificate problems on VH harvest: SSL:CERTIFICATE_VERIFY_FAILED I don't know what needs to be fixed on the server ends (and I have no control over that end) but I found this fix googling the problem (several fixes suggested) https://www.howtouselinux.com/post/ssl-certificate_verify_failed-in-python --- oaiharvest/client.py | 3 +++ oaiharvest/registry.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/oaiharvest/client.py b/oaiharvest/client.py index 591dba1..c6d779e 100644 --- a/oaiharvest/client.py +++ b/oaiharvest/client.py @@ -18,6 +18,9 @@ import time import codecs +import ssl +ssl._create_default_https_context = ssl._create_unverified_context + from oaipmh import common, metadata, validation, error from oaipmh.datestamp import datestamp_to_datetime, datetime_to_datestamp diff --git a/oaiharvest/registry.py b/oaiharvest/registry.py index 690e8f1..d076100 100644 --- a/oaiharvest/registry.py +++ b/oaiharvest/registry.py @@ -34,7 +34,7 @@ from datetime import datetime # Import oaipmh for validation purposes -from oaipmh.client import Client +from .client import Client from oaipmh.metadata import MetadataRegistry, oai_dc_reader from oaipmh.error import XMLSyntaxError from six.moves.urllib.error import HTTPError From 11ee8b3713797127eb5174c51b341fa08de76c9b Mon Sep 17 00:00:00 2001 From: Steve Majewski Date: Fri, 22 Apr 2022 13:57:12 -0400 Subject: [PATCH 14/15] add 502:Bad Gateway to expected error codes for proxy timeouts --- oaiharvest/client.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/oaiharvest/client.py b/oaiharvest/client.py index c6d779e..8d0a25c 100644 --- a/oaiharvest/client.py +++ b/oaiharvest/client.py @@ -39,8 +39,9 @@ class BaseClient(common.OAIPMH): 'wait-default': WAIT_DEFAULT, # how many times should we retry 'retry': WAIT_MAX, - # which HTTP codes are expected - 'expected-errcodes': {503}, + # which HTTP codes are expected: 503 is Service Unavailable + # added 502:Bad Gateway, which is what we 
get from Proxy Errors + 'expected-errcodes': {503,502}, } def __init__(self, metadata_registry=None, custom_retry_policy=None, recover=False): @@ -410,6 +411,8 @@ def retrieveFromUrlWaiting(request, break except urllib2.HTTPError as e: if e.code in expected_errcodes: + print(e) + print('Retry:', request.get_selector()) try: retryAfter = int(e.hdrs.get('Retry-After')) except TypeError: From 8920dad678d5f7197a73373633a21e83e9960c66 Mon Sep 17 00:00:00 2001 From: Steve Majewski Date: Thu, 4 Aug 2022 10:45:38 -0400 Subject: [PATCH 15/15] timeout increased due to proxy problems --- README.rst | 7 +++++++ oaiharvest/client.py | 8 ++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index 4a1e8ab..de2af95 100644 --- a/README.rst +++ b/README.rst @@ -13,6 +13,13 @@ OAI-PMH Harvest :alt: license:BSD +Modified Fork +============= + +--recover option uses recovery option on XML parser to try to parse invalid payloads + +pip3 install git+https://github.com/sdm7g/oai-harvest.git@fix-pyoai + Contents -------- diff --git a/oaiharvest/client.py b/oaiharvest/client.py index 8d0a25c..a41a68a 100644 --- a/oaiharvest/client.py +++ b/oaiharvest/client.py @@ -24,8 +24,8 @@ from oaipmh import common, metadata, validation, error from oaipmh.datestamp import datestamp_to_datetime, datetime_to_datestamp -WAIT_DEFAULT = 120 # two minutes -WAIT_MAX = 5 +WAIT_DEFAULT = 300 # five minutes +WAIT_MAX = 3 class Error(Exception): pass @@ -40,8 +40,8 @@ class BaseClient(common.OAIPMH): # how many times should we retry 'retry': WAIT_MAX, # which HTTP codes are expected: 503 is Service Unavailable - # added 502:Bad Gateway, which is what we get from Proxy Errors - 'expected-errcodes': {503,502}, + # added 504:Gateway Timeout, 502:Bad Gateway to catch proxy timeouts + 'expected-errcodes': {504,503,502}, } def __init__(self, metadata_registry=None, custom_retry_policy=None, recover=False):