From 5bb76ecf78ce6884d16796582d42b95bf9d0bf7b Mon Sep 17 00:00:00 2001 From: Heinz-Alexander Fuetterer Date: Thu, 25 Jan 2024 13:44:36 +0100 Subject: [PATCH 01/16] feat!: add xsdata models --- .pre-commit-config.yaml | 12 +- README.md | 8 +- docs/src/customizing.md | 23 +- docs/src/tutorial.md | 39 +- pyproject.toml | 17 +- src/oaipmh_scythe/__init__.py | 7 +- src/oaipmh_scythe/client.py | 90 +- src/oaipmh_scythe/exceptions.py | 4 +- src/oaipmh_scythe/iterator.py | 119 +- src/oaipmh_scythe/models.py | 280 --- src/oaipmh_scythe/models/__init__.py | 35 + src/oaipmh_scythe/models/datacite/.xsdata.xml | 32 + src/oaipmh_scythe/models/datacite/README.md | 7 + src/oaipmh_scythe/models/datacite/__init__.py | 43 + src/oaipmh_scythe/models/datacite/models.py | 1675 +++++++++++++++++ src/oaipmh_scythe/models/marcxml/.xsdata.xml | 32 + src/oaipmh_scythe/models/marcxml/README.md | 6 + src/oaipmh_scythe/models/marcxml/__init__.py | 29 + src/oaipmh_scythe/models/marcxml/models.py | 235 +++ src/oaipmh_scythe/models/mixins.py | 59 + src/oaipmh_scythe/models/oai_dc/.xsdata.xml | 32 + src/oaipmh_scythe/models/oai_dc/README.md | 6 + src/oaipmh_scythe/models/oai_dc/__init__.py | 49 + src/oaipmh_scythe/models/oai_dc/models.py | 261 +++ src/oaipmh_scythe/models/oai_pmh/.xsdata.xml | 37 + src/oaipmh_scythe/models/oai_pmh/README.md | 6 + src/oaipmh_scythe/models/oai_pmh/__init__.py | 59 + src/oaipmh_scythe/models/oai_pmh/models.py | 649 +++++++ src/oaipmh_scythe/response.py | 138 +- src/oaipmh_scythe/utils.py | 70 +- tests/cassettes/get_record.yaml | 111 ++ tests/conftest.py | 22 + tests/integration/test_get_record.py | 30 +- tests/integration/test_identify.py | 17 +- tests/integration/test_list_identifiers.py | 30 +- .../integration/test_list_metadata_formats.py | 9 +- tests/integration/test_list_records.py | 53 +- tests/integration/test_list_sets.py | 7 +- tests/unit/test_client.py | 154 +- tests/unit/test_iterator.py | 16 +- tests/unit/test_models.py | 321 +--- 
tests/unit/test_response.py | 57 +- tests/unit/test_utils.py | 58 +- 43 files changed, 3983 insertions(+), 961 deletions(-) delete mode 100644 src/oaipmh_scythe/models.py create mode 100644 src/oaipmh_scythe/models/__init__.py create mode 100644 src/oaipmh_scythe/models/datacite/.xsdata.xml create mode 100644 src/oaipmh_scythe/models/datacite/README.md create mode 100644 src/oaipmh_scythe/models/datacite/__init__.py create mode 100644 src/oaipmh_scythe/models/datacite/models.py create mode 100644 src/oaipmh_scythe/models/marcxml/.xsdata.xml create mode 100644 src/oaipmh_scythe/models/marcxml/README.md create mode 100644 src/oaipmh_scythe/models/marcxml/__init__.py create mode 100644 src/oaipmh_scythe/models/marcxml/models.py create mode 100644 src/oaipmh_scythe/models/mixins.py create mode 100644 src/oaipmh_scythe/models/oai_dc/.xsdata.xml create mode 100644 src/oaipmh_scythe/models/oai_dc/README.md create mode 100644 src/oaipmh_scythe/models/oai_dc/__init__.py create mode 100644 src/oaipmh_scythe/models/oai_dc/models.py create mode 100644 src/oaipmh_scythe/models/oai_pmh/.xsdata.xml create mode 100644 src/oaipmh_scythe/models/oai_pmh/README.md create mode 100644 src/oaipmh_scythe/models/oai_pmh/__init__.py create mode 100644 src/oaipmh_scythe/models/oai_pmh/models.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 273d93b..2d95a50 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -72,9 +72,11 @@ repos: - id: mypy args: [--config-file=pyproject.toml] additional_dependencies: - - httpx==0.27.0 - - lxml-stubs==0.5.1 - exclude: tests + - httpx>=0.27 + - lxml-stubs>=0.5 + - pytest>=8.1 + - xsdata>=24.4 + exclude: tests # TODO: remove this exclusion - repo: https://github.com/scientific-python/cookie rev: 35368e874265d105e1ca3355df7ef51bbca8eba6 # frozen: 2024.08.19 @@ -86,7 +88,9 @@ repos: hooks: - id: typos args: [--force-exclude] - exclude: CHANGELOG.md # the commit hashes in changelog trigger the spell checker + # 
CHANGELOG.md: the commit hashes in changelog trigger the spell checker + # src/oaipmh_scythe/models: autogenerated python modules by xsdata + exclude: ^CHANGELOG.md|^src/oaipmh_scythe/models/.* - repo: https://github.com/FHPythonUtils/LicenseCheck/ rev: b2b50f4d40c95b15478279a7a00553a1dc2925ef # frozen: 2024.2 diff --git a/README.md b/README.md index f4bc559..6266bd5 100644 --- a/README.md +++ b/README.md @@ -18,15 +18,15 @@ from oaipmh_scythe import Scythe with Scythe("https://zenodo.org/oai2d") as scythe: records = scythe.list_records() next(records) -# +# Record(header=Header(identifier='oai:zenodo.org:10654826', datestamp='2024-02-13T15:38:50Z', set_spec=['software'], status=None), metadata=Metadata(other_element=Dc(title=[Title(value='Research Data Management Organiser (RDMO)', lang=None)], creator=[Creator(value='Klar, Jochen', lang=None), Creator(value='Michaelis, Olaf', lang=None), Creator(value='Wallace, David', lang=None), Creator(value='Schröder, Max', lang=None), Creator(value='Fütterer, Heinz-Alexander', lang=None), Creator(value='Lanza, Giacomo', lang=None), Creator(value='Martínez Muñoz, David', lang=None), Creator(value='Pilori, Dario', lang=None), Creator(value='Harry, Enke', lang=None)], subject=[], description=[Description(value='<h2><a href="https://github.com/rdmorganiser/rdmo/compare/2.1.2...2.1.3">RDMO 2.1.3</a> (Feb 13, 2024)</h2>\n<ul>\n<li>Fix the migration of options with additional_input (#912)</li>\n<li>Fix export urls in management when using BASE_PATH (#915)</li>\n</ul>\n<h2>How to upgrade</h2>\n<p>In case you are upgrading from an RDMO version below 2.0.0 please read these <a href="https://rdmo.readthedocs.io/en/latest/upgrade/index.html#upgrade-to-version-2-0-0">upgrade instructions</a> before you proceed.</p>\n<pre><code>pip install --upgrade rdmo\npython manage.py upgrade\n</code></pre>', lang=None), Description(value='If you refer to this software in a publication, please cite it as below.', lang=None)], 
publisher=[Publisher(value='Zenodo', lang=None)], contributor=[], date=[Date(value='2024-02-13', lang=None)], type_value=[TypeType(value='info:eu-repo/semantics/other', lang=None)], format=[], identifier=[Identifier(value='https://doi.org/10.5281/zenodo.10654826', lang=None), Identifier(value='oai:zenodo.org:10654826', lang=None)], source=[], language=[], relation=[Relation(value='https://github.com/rdmorganiser/rdmo/tree/2.1.3', lang=None), Relation(value='https://doi.org/10.5281/zenodo.596581', lang=None)], coverage=[], rights=[Rights(value='info:eu-repo/semantics/openAccess', lang=None), Rights(value='Apache License 2.0', lang=None), Rights(value='http://www.apache.org/licenses/LICENSE-2.0', lang=None)])), about=[]) ``` ## Features - Easy harvesting of OAI-compliant interfaces - Support for all six OAI verbs -- Convenient object representations of OAI items (records, headers, sets, ...) -- Automatic de-serialization of Dublin Core-encoded metadata payloads to Python dictionaries +- Convenient object representations of OAI items (records, headers, sets, ...) 
as dataclasses +- Automatic de-serialization of metadata payloads to dataclasses for Dublin Core, DataCite, and MARCXML - Option for ignoring deleted items ## Requirements @@ -36,7 +36,7 @@ with Scythe("https://zenodo.org/oai2d") as scythe: `oaipmh-scythe` is built with: - [httpx](https://github.com/encode/httpx) for issuing HTTP requests -- [lxml](https://github.com/lxml/lxml) for parsing XML responses +- [xsdata](https://github.com/tefra/xsdata) for parsing XML responses ## Installation diff --git a/docs/src/customizing.md b/docs/src/customizing.md index ecc707b..eeb8bab 100644 --- a/docs/src/customizing.md +++ b/docs/src/customizing.md @@ -7,7 +7,9 @@ SPDX-License-Identifier: BSD-3-Clause # Harvesting other Metadata Formats than OAI-DC -By default, oaipmh-scythe's mapping of the record XML into Python dictionaries is tailored to work only with +# TODO + +By default, `oaipmh-scythe`'s mapping of the record XML into Python dataclasses is tailored to work only with Dublin-Core-encoded metadata payloads. Other formats most probably won't be mapped correctly, especially if they are more hierarchically structured than Dublin Core. @@ -29,22 +31,3 @@ class MyRecord(Record): Next, associate your implementation with OAI verbs in the [oaipmh_scythe.client.Scythe][] object. In this case, we want the [oaipmh_scythe.client.Scythe][] object to use our implementation to represent items returned by ListRecords and GetRecord responses: - -```python -scythe = Scythe("http://...") -scythe.class_mapping["ListRecords"] = MyRecord -scythe.class_mapping["GetRecord"] = MyRecord -``` - -If you need to rewrite *all* item implementations, you can also provide a complete mapping to the -[oaipmh_scythe.client.Scythe][] object at instantiation: - -```python -my_mapping = { - "ListRecords": MyRecord, - "GetRecord": MyRecord, - # ...
-} - -scythe = Scythe("https://...", class_mapping=my_mapping) -``` diff --git a/docs/src/tutorial.md b/docs/src/tutorial.md index aefc15e..f3b4271 100644 --- a/docs/src/tutorial.md +++ b/docs/src/tutorial.md @@ -77,7 +77,7 @@ Example: Fetching Records Published On or After a Specific Date ```python records = scythe.list_records(from_="2024-01-16") next(records) -# +# Record(header=Header(identifier='oai:zenodo.org:10654826', datestamp='2024-02-13T15:38:50Z', set_spec=['software'], status=None), metadata=Metadata(other_element=Dc(title=[Title(value='Research Data Management Organiser (RDMO)', lang=None)], creator=[Creator(value='Klar, Jochen', lang=None), Creator(value='Michaelis, Olaf', lang=None), Creator(value='Wallace, David', lang=None), Creator(value='Schröder, Max', lang=None), Creator(value='Fütterer, Heinz-Alexander', lang=None), Creator(value='Lanza, Giacomo', lang=None), Creator(value='Martínez Muñoz, David', lang=None), Creator(value='Pilori, Dario', lang=None), Creator(value='Harry, Enke', lang=None)], subject=[], description=[Description(value='<h2><a href="https://github.com/rdmorganiser/rdmo/compare/2.1.2...2.1.3">RDMO 2.1.3</a> (Feb 13, 2024)</h2>\n<ul>\n<li>Fix the migration of options with additional_input (#912)</li>\n<li>Fix export urls in management when using BASE_PATH (#915)</li>\n</ul>\n<h2>How to upgrade</h2>\n<p>In case you are upgrading from an RDMO version below 2.0.0 please read these <a href="https://rdmo.readthedocs.io/en/latest/upgrade/index.html#upgrade-to-version-2-0-0">upgrade instructions</a> before you proceed.</p>\n<pre><code>pip install --upgrade rdmo\npython manage.py upgrade\n</code></pre>', lang=None), Description(value='If you refer to this software in a publication, please cite it as below.', lang=None)], publisher=[Publisher(value='Zenodo', lang=None)], contributor=[], date=[Date(value='2024-02-13', lang=None)], type_value=[TypeType(value='info:eu-repo/semantics/other', lang=None)], format=[], 
identifier=[Identifier(value='https://doi.org/10.5281/zenodo.10654826', lang=None), Identifier(value='oai:zenodo.org:10654826', lang=None)], source=[], language=[], relation=[Relation(value='https://github.com/rdmorganiser/rdmo/tree/2.1.3', lang=None), Relation(value='https://doi.org/10.5281/zenodo.596581', lang=None)], coverage=[], rights=[Rights(value='info:eu-repo/semantics/openAccess', lang=None), Rights(value='Apache License 2.0', lang=None), Rights(value='http://www.apache.org/licenses/LICENSE-2.0', lang=None)])), about=[]) ``` In this example, `scythe.list_records(from_="2024-01-16")` retrieves records published on or after January 16, 2024. @@ -92,7 +92,7 @@ Example: Fetching records published until a specific date ```python records = scythe.list_records(until="2024-01-17") next(records) -# +# Record(header=Header(identifier='oai:zenodo.org:10654826', datestamp='2024-02-13T15:38:50Z', set_spec=['software'], status=None), metadata=Metadata(other_element=Dc(title=[Title(value='Research Data Management Organiser (RDMO)', lang=None)], creator=[Creator(value='Klar, Jochen', lang=None), Creator(value='Michaelis, Olaf', lang=None), Creator(value='Wallace, David', lang=None), Creator(value='Schröder, Max', lang=None), Creator(value='Fütterer, Heinz-Alexander', lang=None), Creator(value='Lanza, Giacomo', lang=None), Creator(value='Martínez Muñoz, David', lang=None), Creator(value='Pilori, Dario', lang=None), Creator(value='Harry, Enke', lang=None)], subject=[], description=[Description(value='<h2><a href="https://github.com/rdmorganiser/rdmo/compare/2.1.2...2.1.3">RDMO 2.1.3</a> (Feb 13, 2024)</h2>\n<ul>\n<li>Fix the migration of options with additional_input (#912)</li>\n<li>Fix export urls in management when using BASE_PATH (#915)</li>\n</ul>\n<h2>How to upgrade</h2>\n<p>In case you are upgrading from an RDMO version below 2.0.0 please read these <a href="https://rdmo.readthedocs.io/en/latest/upgrade/index.html#upgrade-to-version-2-0-0">upgrade instructions</a> 
before you proceed.</p>\n<pre><code>pip install --upgrade rdmo\npython manage.py upgrade\n</code></pre>', lang=None), Description(value='If you refer to this software in a publication, please cite it as below.', lang=None)], publisher=[Publisher(value='Zenodo', lang=None)], contributor=[], date=[Date(value='2024-02-13', lang=None)], type_value=[TypeType(value='info:eu-repo/semantics/other', lang=None)], format=[], identifier=[Identifier(value='https://doi.org/10.5281/zenodo.10654826', lang=None), Identifier(value='oai:zenodo.org:10654826', lang=None)], source=[], language=[], relation=[Relation(value='https://github.com/rdmorganiser/rdmo/tree/2.1.3', lang=None), Relation(value='https://doi.org/10.5281/zenodo.596581', lang=None)], coverage=[], rights=[Rights(value='info:eu-repo/semantics/openAccess', lang=None), Rights(value='Apache License 2.0', lang=None), Rights(value='http://www.apache.org/licenses/LICENSE-2.0', lang=None)])), about=[]) ``` This line will harvest records published up to and including January 17, 2024. 
@@ -130,7 +130,7 @@ Example: Fetching records from a specific set ```python records = scythe.list_records(set_="software") next(records) -# +# Record(header=Header(identifier='oai:zenodo.org:10654826', datestamp='2024-02-13T15:38:50Z', set_spec=['software'], status=None), metadata=Metadata(other_element=Dc(title=[Title(value='Research Data Management Organiser (RDMO)', lang=None)], creator=[Creator(value='Klar, Jochen', lang=None), Creator(value='Michaelis, Olaf', lang=None), Creator(value='Wallace, David', lang=None), Creator(value='Schröder, Max', lang=None), Creator(value='Fütterer, Heinz-Alexander', lang=None), Creator(value='Lanza, Giacomo', lang=None), Creator(value='Martínez Muñoz, David', lang=None), Creator(value='Pilori, Dario', lang=None), Creator(value='Harry, Enke', lang=None)], subject=[], description=[Description(value='<h2><a href="https://github.com/rdmorganiser/rdmo/compare/2.1.2...2.1.3">RDMO 2.1.3</a> (Feb 13, 2024)</h2>\n<ul>\n<li>Fix the migration of options with additional_input (#912)</li>\n<li>Fix export urls in management when using BASE_PATH (#915)</li>\n</ul>\n<h2>How to upgrade</h2>\n<p>In case you are upgrading from an RDMO version below 2.0.0 please read these <a href="https://rdmo.readthedocs.io/en/latest/upgrade/index.html#upgrade-to-version-2-0-0">upgrade instructions</a> before you proceed.</p>\n<pre><code>pip install --upgrade rdmo\npython manage.py upgrade\n</code></pre>', lang=None), Description(value='If you refer to this software in a publication, please cite it as below.', lang=None)], publisher=[Publisher(value='Zenodo', lang=None)], contributor=[], date=[Date(value='2024-02-13', lang=None)], type_value=[TypeType(value='info:eu-repo/semantics/other', lang=None)], format=[], identifier=[Identifier(value='https://doi.org/10.5281/zenodo.10654826', lang=None), Identifier(value='oai:zenodo.org:10654826', lang=None)], source=[], language=[], relation=[Relation(value='https://github.com/rdmorganiser/rdmo/tree/2.1.3', lang=None), 
Relation(value='https://doi.org/10.5281/zenodo.596581', lang=None)], coverage=[], rights=[Rights(value='info:eu-repo/semantics/openAccess', lang=None), Rights(value='Apache License 2.0', lang=None), Rights(value='http://www.apache.org/licenses/LICENSE-2.0', lang=None)])), about=[]) ``` In this example, `scythe.list_records(set_="software")` retrieves records that are part of the 'software' set. The call @@ -149,7 +149,7 @@ Example: Combining `set_` with Date Filters ```python records = scythe.list_records(set_="software", from_="2024-01-01", until="2024-01-31") next(records) -# +# Record(header=Header(identifier='oai:zenodo.org:10654826', datestamp='2024-02-13T15:38:50Z', set_spec=['software'], status=None), metadata=Metadata(other_element=Dc(title=[Title(value='Research Data Management Organiser (RDMO)', lang=None)], creator=[Creator(value='Klar, Jochen', lang=None), Creator(value='Michaelis, Olaf', lang=None), Creator(value='Wallace, David', lang=None), Creator(value='Schröder, Max', lang=None), Creator(value='Fütterer, Heinz-Alexander', lang=None), Creator(value='Lanza, Giacomo', lang=None), Creator(value='Martínez Muñoz, David', lang=None), Creator(value='Pilori, Dario', lang=None), Creator(value='Harry, Enke', lang=None)], subject=[], description=[Description(value='<h2><a href="https://github.com/rdmorganiser/rdmo/compare/2.1.2...2.1.3">RDMO 2.1.3</a> (Feb 13, 2024)</h2>\n<ul>\n<li>Fix the migration of options with additional_input (#912)</li>\n<li>Fix export urls in management when using BASE_PATH (#915)</li>\n</ul>\n<h2>How to upgrade</h2>\n<p>In case you are upgrading from an RDMO version below 2.0.0 please read these <a href="https://rdmo.readthedocs.io/en/latest/upgrade/index.html#upgrade-to-version-2-0-0">upgrade instructions</a> before you proceed.</p>\n<pre><code>pip install --upgrade rdmo\npython manage.py upgrade\n</code></pre>', lang=None), Description(value='If you refer to this software in a publication, please cite it as below.', lang=None)], 
publisher=[Publisher(value='Zenodo', lang=None)], contributor=[], date=[Date(value='2024-02-13', lang=None)], type_value=[TypeType(value='info:eu-repo/semantics/other', lang=None)], format=[], identifier=[Identifier(value='https://doi.org/10.5281/zenodo.10654826', lang=None), Identifier(value='oai:zenodo.org:10654826', lang=None)], source=[], language=[], relation=[Relation(value='https://github.com/rdmorganiser/rdmo/tree/2.1.3', lang=None), Relation(value='https://doi.org/10.5281/zenodo.596581', lang=None)], coverage=[], rights=[Rights(value='info:eu-repo/semantics/openAccess', lang=None), Rights(value='Apache License 2.0', lang=None), Rights(value='http://www.apache.org/licenses/LICENSE-2.0', lang=None)])), about=[]) ``` This code will harvest records from the 'software' set that were published in January 2024. @@ -210,7 +210,7 @@ used to iterate over the records of a repository: ```python records = scythe.list_records() next(records) -# +# Record(header=Header(identifier='oai:zenodo.org:10654826', datestamp='2024-02-13T15:38:50Z', set_spec=['software'], status=None), metadata=Metadata(other_element=Dc(title=[Title(value='Research Data Management Organiser (RDMO)', lang=None)], creator=[Creator(value='Klar, Jochen', lang=None), Creator(value='Michaelis, Olaf', lang=None), Creator(value='Wallace, David', lang=None), Creator(value='Schröder, Max', lang=None), Creator(value='Fütterer, Heinz-Alexander', lang=None), Creator(value='Lanza, Giacomo', lang=None), Creator(value='Martínez Muñoz, David', lang=None), Creator(value='Pilori, Dario', lang=None), Creator(value='Harry, Enke', lang=None)], subject=[], description=[Description(value='<h2><a href="https://github.com/rdmorganiser/rdmo/compare/2.1.2...2.1.3">RDMO 2.1.3</a> (Feb 13, 2024)</h2>\n<ul>\n<li>Fix the migration of options with additional_input (#912)</li>\n<li>Fix export urls in management when using BASE_PATH (#915)</li>\n</ul>\n<h2>How to upgrade</h2>\n<p>In case you are upgrading from an RDMO version below 
2.0.0 please read these <a href="https://rdmo.readthedocs.io/en/latest/upgrade/index.html#upgrade-to-version-2-0-0">upgrade instructions</a> before you proceed.</p>\n<pre><code>pip install --upgrade rdmo\npython manage.py upgrade\n</code></pre>', lang=None), Description(value='If you refer to this software in a publication, please cite it as below.', lang=None)], publisher=[Publisher(value='Zenodo', lang=None)], contributor=[], date=[Date(value='2024-02-13', lang=None)], type_value=[TypeType(value='info:eu-repo/semantics/other', lang=None)], format=[], identifier=[Identifier(value='https://doi.org/10.5281/zenodo.10654826', lang=None), Identifier(value='oai:zenodo.org:10654826', lang=None)], source=[], language=[], relation=[Relation(value='https://github.com/rdmorganiser/rdmo/tree/2.1.3', lang=None), Relation(value='https://doi.org/10.5281/zenodo.596581', lang=None)], coverage=[], rights=[Rights(value='info:eu-repo/semantics/openAccess', lang=None), Rights(value='Apache License 2.0', lang=None), Rights(value='http://www.apache.org/licenses/LICENSE-2.0', lang=None)])), about=[]) ``` Note that this works with all verbs that return more than one element. These are: @@ -224,7 +224,7 @@ The following example shows how to iterate over the headers returned by `list_id ```python headers = scythe.list_identifiers() next(headers) -#
+# Header(identifier='oai:zenodo.org:8434385', datestamp='2023-10-12T09:03:02Z', set_spec=[], status=None) ``` Iterating over the sets returned by `list_sets()` works similarly: @@ -232,7 +232,7 @@ Iterating over the sets returned by `list_sets()` works similarly: ```python sets = scythe.list_sets() next(sets) -# +# Set(set_spec='user-emi', set_name='European Middleware Initiative', set_description=[Description(other_element=Dc(title=[], creator=[], subject=[], description=[Description(value='<p>\n\t </p>\n<p>\n\tThe European Middleware Initiative (EMI) is a close collaboration of the three major middleware providers, ARC, gLite and UNICORE, and other specialized software providers like dCache.</p>\n<p>\n\tThe project's mission is to</p>\n<ol>\n\t<li>\n\t\tdeliver a consolidated set of middleware components for deployment in EGI (as part of the Unified Middleware Distribution - UMD), PRACE and other DCIs,</li>\n\t<li>\n\t\textend the interoperability and integration with emerging computing models,</li>\n\t<li>\n\t\tstrengthen the reliability and manageability of the services and establish a sustainable model to support,</li>\n\t<li>\n\t\tharmonise and evolve the middleware, ensuring it responds effectively to the requirements of the scientific communities relying on it.</li>\n</ol>\n', lang=None)], publisher=[], contributor=[], date=[], type_value=[], format=[], identifier=[], source=[], language=[], relation=[], coverage=[], rights=[]))]) ``` To explore all the metadata formats supported by the repository, you can iterate through the formats returned by the @@ -241,7 +241,7 @@ To explore all the metadata formats supported by the repository, you can iterate ```python metadata_formats = scythe.list_metadata_formats() next(metadata_formats) -# +# MetadataFormat(metadata_prefix='marcxml', schema='https://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd', metadata_namespace='https://www.loc.gov/standards/marcxml/') ``` ## Getting a Single Record @@ -249,25 +249,24 @@ next(metadata_formats) OAI-PMH allows you to get a single record by using the `GetRecord` verb: ```python -scythe.get_record(identifier="oai:zenodo.org:4574771") -# +scythe.get_record(identifier="oai:zenodo.org:10654826") +# Record(header=Header(identifier='oai:zenodo.org:10654826', datestamp='2024-02-13T15:38:50Z', set_spec=['software'], status=None), metadata=Metadata(other_element=Dc(title=[Title(value='Research Data Management Organiser (RDMO)', lang=None)], creator=[Creator(value='Klar, Jochen', lang=None), Creator(value='Michaelis, Olaf', lang=None), Creator(value='Wallace, David', lang=None), Creator(value='Schröder, Max', lang=None), Creator(value='Fütterer, Heinz-Alexander', lang=None), Creator(value='Lanza, Giacomo', lang=None), Creator(value='Martínez Muñoz, David', lang=None), Creator(value='Pilori, Dario', lang=None), Creator(value='Harry, Enke', lang=None)], subject=[], description=[Description(value='<h2><a href="https://github.com/rdmorganiser/rdmo/compare/2.1.2...2.1.3">RDMO 2.1.3</a> (Feb 13, 2024)</h2>\n<ul>\n<li>Fix the migration of options with additional_input (#912)</li>\n<li>Fix export urls in management when using BASE_PATH (#915)</li>\n</ul>\n<h2>How to upgrade</h2>\n<p>In case you are upgrading from an RDMO version below 2.0.0 please read these <a
href="https://rdmo.readthedocs.io/en/latest/upgrade/index.html#upgrade-to-version-2-0-0">upgrade instructions</a> before you proceed.</p>\n<pre><code>pip install --upgrade rdmo\npython manage.py upgrade\n</code></pre>', lang=None), Description(value='If you refer to this software in a publication, please cite it as below.', lang=None)], publisher=[Publisher(value='Zenodo', lang=None)], contributor=[], date=[Date(value='2024-02-13', lang=None)], type_value=[TypeType(value='info:eu-repo/semantics/other', lang=None)], format=[], identifier=[Identifier(value='https://doi.org/10.5281/zenodo.10654826', lang=None), Identifier(value='oai:zenodo.org:10654826', lang=None)], source=[], language=[], relation=[Relation(value='https://github.com/rdmorganiser/rdmo/tree/2.1.3', lang=None), Relation(value='https://doi.org/10.5281/zenodo.596581', lang=None)], coverage=[], rights=[Rights(value='info:eu-repo/semantics/openAccess', lang=None), Rights(value='Apache License 2.0', lang=None), Rights(value='http://www.apache.org/licenses/LICENSE-2.0', lang=None)])), about=[]) ``` ## Harvesting OAI Items vs. OAI Responses Scythe supports two harvesting modes that differ in the type of the returned objects. The default mode returns OAI-specific *items* (records, headers etc.) encoded as Python objects as seen earlier. 
If you want to save the whole -XML response returned by the server, you have to pass the -[OAIResponseIterator][oaipmh_scythe.iterator.OAIResponseIterator] during the instantiation of the -[Scythe][oaipmh_scythe.client.Scythe] object: +XML response returned by the server, you have to pass the [ResponseIterator][oaipmh_scythe.iterator.ResponseIterator] +during the instantiation of the [Scythe][oaipmh_scythe.client.Scythe] object: ```python -from oaipmh_scythe.iterator import OAIResponseIterator +from oaipmh_scythe.iterator import ResponseIterator -scythe = Scythe("https://zenodo.org/oai2d", iterator=OAIResponseIterator) +scythe = Scythe("https://zenodo.org/oai2d", iterator=ResponseIterator) responses = scythe.list_records() next(responses) -# +# Response(url=URL('https://zenodo.org/oai2d?verb=ListRecords&metadataPrefix=oai_dc'), status_code=) ``` You could then save the returned responses to disk: @@ -281,16 +280,16 @@ with open("response.xml", "w") as f: The [list_records()][oaipmh_scythe.client.Scythe.list_records] and [list_identifiers()][oaipmh_scythe.client.Scythe.list_identifiers] methods accept an optional parameter -`ignore_deleted`. If set to `True`, the returned [OAIItemIterator][oaipmh_scythe.iterator.OAIItemIterator] will skip -deleted records/headers: +`ignore_deleted`. If set to `True`, the returned [ItemIterator][oaipmh_scythe.iterator.ItemIterator] will skip deleted +records/headers: ```python records = scythe.list_records(ignore_deleted=True) ``` !!! note - This works only using the [oaipmh_scythe.iterator.OAIItemIterator][]. If you use the - [oaipmh_scythe.iterator.OAIResponseIterator][], the resulting OAI responses will still contain the deleted records. + This works only using the [oaipmh_scythe.iterator.ItemIterator][]. If you use the + [oaipmh_scythe.iterator.ResponseIterator][], the resulting OAI responses will still contain the deleted records. 
## Authentication diff --git a/pyproject.toml b/pyproject.toml index 527b1d3..495c2be 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ dynamic = [ ] dependencies = [ "httpx>=0.27", - "lxml>=5.3", + "xsdata[cli,lxml]", # TODO: remove cli extra ] optional-dependencies.dev = [ "pre-commit~=3.8", @@ -72,6 +72,10 @@ include = [ "src", "CHANGELOG.md", ] +exclude = [ + "src/oaipmh_scythe/models/**/*.md", + "src/oaipmh_scythe/models/**/*.xml", +] [tool.hatch.build.targets.wheel] only-packages = true @@ -166,9 +170,17 @@ lint.per-file-ignores."src/oaipmh_scythe/__about__.py" = [ lint.per-file-ignores."src/oaipmh_scythe/client.py" = [ "PLR0913", # too-many-arguments ] +lint.per-file-ignores."src/oaipmh_scythe/models/**.py" = [ + "D101", # undocumented-public-class + "D106", # undocumented-public-nested-class + "D205", # blank-line-after-summary + "D415", # ends-in-punctuation + "RUF002", # ambiguous-unicode-character-docstring + "TCH002", # typing-only-third-party-import +] lint.per-file-ignores."tests/**.py" = [ "D", # pydocstyle - "PLR2004", # magic-value-comparison + "PLR2004", # magic-value-comparison ] lint.unfixable = [ "F401", # unused-import @@ -201,6 +213,7 @@ parallel = true source = [ "oaipmh_scythe" ] omit = [ "__about__.py", + "src/oaipmh_scythe/models/datacite.py", ] [tool.coverage.report] diff --git a/src/oaipmh_scythe/__init__.py b/src/oaipmh_scythe/__init__.py index cc2d213..655cfbb 100644 --- a/src/oaipmh_scythe/__init__.py +++ b/src/oaipmh_scythe/__init__.py @@ -11,14 +11,14 @@ BadResumptionToken, BadVerb, CannotDisseminateFormat, - GeneralOAIPMHError, IdDoesNotExist, NoMetadataFormats, NoRecordsMatch, NoSetHierarchy, OAIPMHException, + UndefinedError, ) -from oaipmh_scythe.response import OAIResponse +from oaipmh_scythe.response import Response __all__ = [ "BadArgument", @@ -31,6 +31,7 @@ "NoRecordsMatch", "NoSetHierarchy", "OAIPMHException", - "OAIResponse", + "Response", "Scythe", + "UndefinedError", ] diff --git 
a/src/oaipmh_scythe/client.py b/src/oaipmh_scythe/client.py index 5d2ad67..4f1805e 100644 --- a/src/oaipmh_scythe/client.py +++ b/src/oaipmh_scythe/client.py @@ -20,9 +20,9 @@ import httpx from oaipmh_scythe.__about__ import __version__ -from oaipmh_scythe.iterator import BaseOAIIterator, OAIItemIterator -from oaipmh_scythe.models import Header, Identify, MetadataFormat, OAIItem, Record, Set -from oaipmh_scythe.response import OAIResponse +from oaipmh_scythe.iterator import BaseOAIIterator, ItemIterator +from oaipmh_scythe.models import Header, Identify, MetadataFormat, Record, Set, Verb +from oaipmh_scythe.response import Response, _build_response from oaipmh_scythe.utils import filter_dict_except_resumption_token, log_response, remove_none_values if TYPE_CHECKING: @@ -34,18 +34,6 @@ logger = logging.getLogger(__name__) USER_AGENT: str = f"oaipmh-scythe/{__version__}" -OAI_NAMESPACE: str = "{http://www.openarchives.org/OAI/2.0/}" - - -# Map OAI verbs to class representations -DEFAULT_CLASS_MAP = { - "GetRecord": Record, - "ListRecords": Record, - "ListIdentifiers": Header, - "ListSets": Set, - "ListMetadataFormats": MetadataFormat, - "Identify": Identify, -} class Scythe: @@ -78,11 +66,10 @@ def __init__( self, endpoint: str, http_method: str = "GET", - iterator: type[BaseOAIIterator] = OAIItemIterator, + iterator: type[BaseOAIIterator] = ItemIterator, max_retries: int = 0, retry_status_codes: Iterable[int] | None = None, default_retry_after: int | float = 60, - class_mapping: dict[str, type[OAIItem]] | None = None, encoding: str = "utf-8", auth: AuthTypes | None = None, timeout: int | float = 60, @@ -102,8 +89,6 @@ def __init__( f"Invalid value for 'default_retry_after': {default_retry_after}. default_retry_after must be positive int or float." 
) self.default_retry_after = default_retry_after - self.oai_namespace = OAI_NAMESPACE - self.class_mapping = class_mapping or DEFAULT_CLASS_MAP self.encoding = encoding self.auth = auth if timeout <= 0: @@ -155,7 +140,7 @@ def __exit__( ) -> None: self.close() - def harvest(self, query: dict[str, str]) -> OAIResponse: + def harvest(self, query: dict[str, str]) -> Response: """Perform an HTTP request to the OAI server with the given parameters. Send an OAI-PMH request to the server using the specified parameters. Handle retry logic @@ -165,7 +150,7 @@ def harvest(self, query: dict[str, str]) -> OAIResponse: query: A dictionary containing the request parameters. Returns: - An OAIResponse object encapsulating the server's response. + A Response object encapsulating the server's response. Raises: httpx.HTTPError: If the HTTP request fails after the maximum number of retries. @@ -177,8 +162,8 @@ def harvest(self, query: dict[str, str]) -> OAIResponse: logger.warning("HTTP %d! Retrying after %d seconds...", http_response.status_code, retry_after) time.sleep(retry_after) http_response = self._request(query) - http_response.raise_for_status() - return OAIResponse(http_response, params=query) + metadata_prefix = query.get("metadataPrefix") # TODO + return _build_response(http_response, metadata_prefix) def _request(self, query: dict[str, str]) -> httpx.Response: """Send an HTTP request to the OAI server using the configured HTTP method and given query parameters. @@ -201,7 +186,7 @@ def list_records( set_: str | None = None, resumption_token: str | None = None, ignore_deleted: bool = False, - ) -> Iterator[OAIResponse | Record]: + ) -> Iterator[Response | Record]: """Issue a ListRecords request to the OAI server. Send a request to list records from the OAI server, allowing for selective harvesting based on date range, @@ -219,8 +204,7 @@ def list_records( ignore_deleted: If True, skip records flagged as deleted in the response. 
Yields: - An iterator over OAIResponse or Record objects, each representing an individual record or response - from the server. + An iterator over Response or Record objects, each representing an individual record or response from the server. Raises: BadArgument: If the arguments provided do not conform to the expectations of the OAI server. @@ -231,7 +215,7 @@ def list_records( """ _query = { - "verb": "ListRecords", + "verb": Verb.LIST_RECORDS.value, "from": from_, "until": until, "metadataPrefix": metadata_prefix, @@ -249,7 +233,7 @@ def list_identifiers( set_: str | None = None, resumption_token: str | None = None, ignore_deleted: bool = False, - ) -> Iterator[OAIResponse | Header]: + ) -> Iterator[Response | Header]: """Issue a ListIdentifiers request to the OAI server. Send a request to list record identifiers from the OAI server. This method allows filtering records based on @@ -268,7 +252,7 @@ def list_identifiers( ignore_deleted: If True, skip records flagged as deleted in the response. Yields: - An iterator over OAIResponse or Header objects, each representing an individual record identifier + An iterator over Response or Header objects, each representing an individual record identifier or response from the server. Raises: @@ -279,7 +263,7 @@ def list_identifiers( """ _query = { - "verb": "ListIdentifiers", + "verb": Verb.LIST_IDENTIFIERS.value, "from": from_, "until": until, "metadataPrefix": metadata_prefix, @@ -290,7 +274,7 @@ def list_identifiers( query = remove_none_values(filter_dict_except_resumption_token(_query)) yield from self.iterator(self, query, ignore_deleted=ignore_deleted) - def list_sets(self, resumption_token: str | None = None) -> Iterator[OAIResponse | Set]: + def list_sets(self, resumption_token: str | None = None) -> Iterator[Response | Set]: """Issue a ListSets request to the OAI server. Send a request to list all sets defined in the OAI server. 
Sets are used to categorize records in the OAI @@ -303,7 +287,7 @@ def list_sets(self, resumption_token: str | None = None) -> Iterator[OAIResponse resumption_token: An optional token for pagination, used to continue a request for the next batch of sets. Yields: - An iterator over OAIResponse or Set objects, representing an individual set or response from the server. + An iterator over Response or Set objects, representing an individual set or response from the server. Raises: BadResumptionToken: If the provided resumption token is invalid or expired. @@ -311,18 +295,18 @@ def list_sets(self, resumption_token: str | None = None) -> Iterator[OAIResponse """ _query = { - "verb": "ListSets", + "verb": Verb.LIST_SETS.value, "resumptionToken": resumption_token, } query = remove_none_values(filter_dict_except_resumption_token(_query)) yield from self.iterator(self, query) - def identify(self) -> Identify: + def identify(self) -> Response | Identify: """Issue an Identify request to the OAI server. - Send a request to identify the OAI server and retrieve its information. This includes details such as the repository name, - the base URL, the protocol version, and other relevant data about the OAI server. It's useful for understanding the - capabilities and configuration of the server. + Send a request to identify the OAI server and retrieve its information. This includes details such as the + repository name, the base URL, the protocol version, and other relevant data about the OAI server. It's useful + for understanding the capabilities and configuration of the server. Ref: @@ -331,16 +315,17 @@ def identify(self) -> Identify: the OAI server. 
""" - query = {"verb": "Identify"} - return Identify(self.harvest(query)) + query = {"verb": Verb.IDENTIFY.value} + response = self.harvest(query) + if issubclass(self.iterator, ItemIterator) and response.parsed.identify: + return response.parsed.identify + return response - def get_record(self, identifier: str, metadata_prefix: str = "oai_dc") -> OAIResponse | Record: + def get_record(self, identifier: str, metadata_prefix: str = "oai_dc") -> Response | Record: """Issue a GetRecord request to the OAI server. Send a request to the OAI server to retrieve a specific record. The request is constructed with the provided - identifier and metadata prefix. The method then processes and returns the relevant OAIResponse or Record object - using an iterator. - + identifier and metadata prefix. The method then processes and returns the relevant Response or Record object. Ref: @@ -349,7 +334,7 @@ def get_record(self, identifier: str, metadata_prefix: str = "oai_dc") -> OAIRes metadata_prefix: The metadata format to be returned for the record. Defaults to "oai_dc". Returns: - An OAIResponse or Record object representing the requested record. + A Response or Record object representing the requested record. 
Raises: CannotDisseminateFormat: If the specified metadata_prefix is not supported by the OAI server for @@ -358,18 +343,21 @@ def get_record(self, identifier: str, metadata_prefix: str = "oai_dc") -> OAIRes """ query = { - "verb": "GetRecord", + "verb": Verb.GET_RECORD.value, "identifier": identifier, "metadataPrefix": metadata_prefix, } - return next(iter(self.iterator(self, query))) + response = self.harvest(query) + if issubclass(self.iterator, ItemIterator) and response.parsed.get_record and response.parsed.get_record.record: + return response.parsed.get_record.record + return response - def list_metadata_formats(self, identifier: str | None = None) -> Iterator[OAIResponse | MetadataFormat]: + def list_metadata_formats(self, identifier: str | None = None) -> Iterator[Response | MetadataFormat]: """Issue a ListMetadataFormats request to the OAI server. Send a request to list the metadata formats available from the OAI server. This can be done for the entire repository or for a specific record if an identifier is provided. The method constructs a query and yields an - iterator over OAIResponse or MetadataFormat objects, each representing a different metadata format or response + iterator over Response or MetadataFormat objects, each representing a different metadata format or response from the server. Ref: @@ -379,8 +367,8 @@ def list_metadata_formats(self, identifier: str | None = None) -> Iterator[OAIRe If None, all metadata formats available in the repository are listed. Yields: - An iterator over OAIResponse or MetadataFormat objects, each representing an individual metadata format - or response from the server. + An iterator over Response or MetadataFormat objects, each representing an individual metadata format + or response from the server. Raises: IdDoesNotExist: If the specified identifier does not correspond to any record in the OAI server. 
@@ -388,7 +376,7 @@ def list_metadata_formats(self, identifier: str | None = None) -> Iterator[OAIRe """ _query = { - "verb": "ListMetadataFormats", + "verb": Verb.LIST_METADATA_FORMATS.value, "identifier": identifier, } query = remove_none_values(_query) diff --git a/src/oaipmh_scythe/exceptions.py b/src/oaipmh_scythe/exceptions.py index e49f5df..8ef3e31 100644 --- a/src/oaipmh_scythe/exceptions.py +++ b/src/oaipmh_scythe/exceptions.py @@ -11,7 +11,6 @@ Classes: OAIPMHException: The base exception class for all OAI-PMH related errors. - GeneralOAIPMHError: A general exception class for OAI-PMH errors not specifically covered by other classes. BadArgument: Raised when a request contains illegal, missing, or improperly formatted arguments. BadVerb: Raised when the verb argument in a request is invalid or improperly used. BadResumptionToken: Raised when a resumption token is invalid or expired. @@ -20,6 +19,7 @@ NoSetHierarchy: Raised when a repository does not support set hierarchies. NoMetadataFormats: Raised when no metadata formats are available for an item. NoRecordsMatch: Raised when a query yields no results due to specific argument combinations. + UndefinedError: A general exception class for OAI-PMH errors not specifically covered by other classes. These custom exceptions enhance the robustness and clarity of error handling in OAI-PMH client implementations, aligning closely with the protocol's standard error conditions. @@ -93,7 +93,7 @@ class NoRecordsMatch(OAIPMHException): """ -class GeneralOAIPMHError(OAIPMHException): +class UndefinedError(OAIPMHException): """General exception for context-specific OAI-PMH errors not covered by the other specific classes. 
This class is used for OAI-PMH errors that do not fall into the predefined categories diff --git a/src/oaipmh_scythe/iterator.py b/src/oaipmh_scythe/iterator.py index 18cc6a5..3342d84 100644 --- a/src/oaipmh_scythe/iterator.py +++ b/src/oaipmh_scythe/iterator.py @@ -12,32 +12,32 @@ Classes: BaseOAIIterator: An abstract base class for creating iterators over OAI-PMH data. - OAIResponseIterator: Iterates over OAI responses, handling pagination and resumption tokens. - OAIItemIterator: Provides iteration over specific OAI items like records, identifiers, and sets. + ResponseIterator: Iterates over OAI responses, handling pagination and resumption tokens. + ItemIterator: Provides iteration over specific OAI items like records, identifiers, and sets. """ from __future__ import annotations from abc import ABC, abstractmethod +from operator import attrgetter from typing import TYPE_CHECKING -from oaipmh_scythe import exceptions -from oaipmh_scythe.models import ResumptionToken +from oaipmh_scythe.models import Verb if TYPE_CHECKING: from collections.abc import Iterator from oaipmh_scythe import Scythe - from oaipmh_scythe.models import OAIItem - from oaipmh_scythe.response import OAIResponse - -VERBS_ELEMENTS: dict[str, str] = { - "GetRecord": "record", - "ListRecords": "record", - "ListIdentifiers": "header", - "ListSets": "set", - "ListMetadataFormats": "metadataFormat", - "Identify": "Identify", + from oaipmh_scythe.models import Item, ResumptionToken + from oaipmh_scythe.response import Response + + +MAPPING: dict[str, tuple[str, str]] = { + Verb.LIST_IDENTIFIERS.value: ("list_identifiers", "header"), + Verb.GET_RECORD.value: ("get_record", "record"), + Verb.LIST_RECORDS.value: ("list_records", "record"), + Verb.LIST_SETS.value: ("list_sets", "set"), + Verb.LIST_METADATA_FORMATS.value: ("list_metadata_formats", "metadata_format"), } @@ -58,7 +58,7 @@ class BaseOAIIterator(ABC): query: The parameters for OAI-PMH requests. 
ignore_deleted: Indicates whether deleted records should be ignored. verb: The OAI-PMH verb (e.g., 'ListRecords', 'ListIdentifiers') used in the request. - oai_response: The most recent OAIResponse received from the OAI server. + response: The most recent Response received from the OAI server. resumption_token: The current resumption token, if any, for paginated results. """ @@ -66,8 +66,8 @@ def __init__(self, scythe: Scythe, query: dict[str, str], ignore_deleted: bool = self.scythe = scythe self.query = query self.ignore_deleted = ignore_deleted - self.verb: str = self.query["verb"] - self.oai_response: OAIResponse | None = None + self.verb = self.query["verb"] + self.response: Response | None = None self.resumption_token: ResumptionToken | None = None self._next_response() @@ -87,70 +87,57 @@ def _get_resumption_token(self) -> ResumptionToken | None: Returns: A ResumptionToken instance if a token is found in the response, otherwise None. """ - ns = self.scythe.oai_namespace - if ( - self.oai_response is not None - and (token_element := self.oai_response.xml.find(f".//{ns}resumptionToken")) is not None - ): - return ResumptionToken( - token=token_element.text, - cursor=token_element.attrib.get("cursor"), # type: ignore [arg-type] - complete_list_size=token_element.attrib.get("completeListSize"), # type: ignore [arg-type] - expiration_date=token_element.attrib.get("expirationDate"), # type: ignore [arg-type] - ) - return None + if self.response is None: + return None + try: + lookup_attribute = MAPPING[self.verb][0] + parsed_data = getattr(self.response.parsed, lookup_attribute) + return parsed_data.resumption_token + except AttributeError: + return None def _next_response(self) -> None: """Request the next batch of data from the OAI server using the current resumption token. - This method is used internally to handle the pagination of OAI-PMH responses. It updates the `oai_response` + This method is used internally to handle the pagination of OAI-PMH responses. 
It updates the `response` attribute with the next batch of data from the server. If an error is encountered in the OAI response, an appropriate exception is raised. """ - if self.resumption_token and self.resumption_token.token: - self.query = {"verb": self.verb, "resumptionToken": self.resumption_token.token} - self.oai_response = self.scythe.harvest(self.query) - - if (error := self.oai_response.xml.find(f".//{self.scythe.oai_namespace}error")) is not None: - code = str(error.attrib.get("code", "UNKNOWN")) - description = error.text or "" - try: - exception_name = code[0].upper() + code[1:] - raise getattr(exceptions, exception_name)(description) - except AttributeError as exc: - raise exceptions.GeneralOAIPMHError(description) from exc + if self.resumption_token is not None: + self.query = {"verb": self.verb, "resumptionToken": self.resumption_token.value} + self.response = self.scythe.harvest(self.query) self.resumption_token = self._get_resumption_token() -class OAIResponseIterator(BaseOAIIterator): +class ResponseIterator(BaseOAIIterator): """An iterator class for iterating over OAI responses obtained via the OAI-PMH protocol. - This iterator specifically handles the iteration of OAIResponse objects, allowing for seamless - navigation through a sequence of responses returned by an OAI-PMH request. It utilizes the - underlying mechanisms of the BaseOAIIterator, including handling of resumption tokens for paginated data. + This iterator specifically handles the iteration of Response objects, allowing for seamless navigation through + a sequence of responses returned by an OAI-PMH request. It utilizes the underlying mechanisms of the + BaseOAIIterator, including handling of resumption tokens for paginated data. """ - def __iter__(self) -> Iterator[OAIResponse]: - """Yield the next OAIResponse object from the server response sequence. + def __iter__(self) -> Iterator[Response]: + """Yield the next Response object from the server response sequence. 
- Enable the OAIResponseIterator to iterate over a series of OAIResponse objects, managing pagination - with resumption tokens. Continue yielding responses until no more data is available from the server. + Enable the ResponseIterator to iterate over a series of Response objects, managing pagination with + resumption tokens. Continue yielding responses until no more data is available from the server. Yields: - OAIResponse: The next available OAIResponse object in the sequence. + The next available Response object in the sequence. """ while True: - if self.oai_response: - yield self.oai_response - self.oai_response = None - elif self.resumption_token and self.resumption_token.token: + if self.response: + yield self.response + self.response = None + elif self.resumption_token: self._next_response() else: return -class OAIItemIterator(BaseOAIIterator): +class ItemIterator(BaseOAIIterator): """An iterator class for iterating over various types of OAI items aggregated via OAI-PMH. This iterator is designed to handle the iteration of specific OAI items, such as records or sets, from a repository. @@ -164,8 +151,9 @@ class OAIItemIterator(BaseOAIIterator): def __init__(self, scythe: Scythe, query: dict[str, str], ignore_deleted: bool = False) -> None: self.verb = query["verb"] - self.mapper = scythe.class_mapping[self.verb] - self.element = VERBS_ELEMENTS[self.verb] + lookup_attribute = MAPPING[query["verb"]][0] + lookup_element = MAPPING[query["verb"]][1] + self.items_getter = attrgetter(f"{lookup_attribute}.{lookup_element}") super().__init__(scythe, query, ignore_deleted) def _next_response(self) -> None: @@ -175,27 +163,26 @@ def _next_response(self) -> None: for the specific elements (e.g. records, headers) based on the current resumption token. 
""" super()._next_response() - if self.oai_response is not None: - self._items = self.oai_response.xml.iterfind(f".//{self.scythe.oai_namespace}{self.element}") + if self.response is not None: + self._items = self.items_getter(self.response.parsed) else: self._items = iter(()) - def __iter__(self) -> Iterator[OAIItem]: + def __iter__(self) -> Iterator[Item]: """Iterate over individual OAI items from the response. Go through the items in the OAI-PMH response, applying any necessary mapping and handling the exclusion of deleted records if specified. Automatically handle pagination through resumption tokens. Yields: - OAIItem: The next OAI item (e.g., record, identifier, set) from the response. + The next OAI item (e.g., record, identifier, set) from the response. """ while True: for item in self._items: - mapped = self.mapper(item) - if self.ignore_deleted and mapped.deleted: + if self.ignore_deleted and item.deleted: continue - yield mapped - if self.resumption_token and self.resumption_token.token: + yield item + if self.resumption_token: self._next_response() else: return diff --git a/src/oaipmh_scythe/models.py b/src/oaipmh_scythe/models.py deleted file mode 100644 index f2464d9..0000000 --- a/src/oaipmh_scythe/models.py +++ /dev/null @@ -1,280 +0,0 @@ -# SPDX-FileCopyrightText: 2015 Mathias Loesch -# SPDX-FileCopyrightText: 2023 Heinz-Alexander Fütterer -# -# SPDX-License-Identifier: BSD-3-Clause - -"""The models module defines data structures for representing various components of the OAI-PMH protocol. - -This module includes classes that encapsulate different entities in OAI-PMH, such as resumption tokens and -various types of OAI items. These classes provide structured representations of OAI-PMH elements, -facilitating their manipulation and processing in client applications. - -Classes: - ResumptionToken: Represents a resumption token used in OAI-PMH for paginated data retrieval. - OAIItem: A base class for generic OAI items. 
- Identify: Represents an Identify response in OAI-PMH. - Header: Represents an OAI Header element. - Record: Represents an OAI Record element. - Set: Represents an OAI Set element. - MetadataFormat: Represents an OAI MetadataFormat element. -""" - -from __future__ import annotations - -from dataclasses import dataclass -from typing import TYPE_CHECKING - -from lxml import etree - -from oaipmh_scythe.utils import get_namespace, xml_to_dict - -if TYPE_CHECKING: - from collections.abc import Iterator - - from oaipmh_scythe.response import OAIResponse - - -@dataclass -class ResumptionToken: - """A data class representing a resumption token in the OAI-PMH protocol. - - Resumption tokens are used for iterating over multiple sets of results in OAI-PMH - harvest requests. This class encapsulates the typical components of a resumption token, - including the token itself, cursor, complete list size, and an expiration date. - - Attributes: - token: The actual resumption token used for continuing the iteration in subsequent OAI-PMH requests. - Default is None. - cursor: A marker indicating the current position in the list of results. Default is None. - complete_list_size: The total number of records in the complete list of results. Default is None. - expiration_date: The date and time when the resumption token expires. Default is None. - """ - - token: str | None = None - cursor: str | None = None - complete_list_size: str | None = None - expiration_date: str | None = None - - def __repr__(self) -> str: - return f"" - - -class OAIItem: - """A base class representing a generic item in the OAI-PMH protocol. - - This class provides a common structure for handling and manipulating XML data - associated with different types of OAI-PMH items, such as records, headers, or sets. - - Attributes: - xml: The parsed XML element representing the OAI item. - _strip_ns: A flag indicating whether to remove the namespaces from the element names - in the dictionary representation. 
- _oai_namespace: The namespace URI extracted from the XML element. - """ - - def __init__(self, xml: etree._Element, strip_ns: bool = True) -> None: - super().__init__() - self.xml = xml - self._strip_ns = strip_ns - self._oai_namespace = get_namespace(self.xml) - - def __bytes__(self) -> bytes: - return etree.tostring(self.xml, encoding="utf-8") - - def __str__(self) -> str: - return etree.tostring(self.xml, encoding="unicode") - - @property - def raw(self) -> str: - """Return the original XML as a unicode string.""" - return etree.tostring(self.xml, encoding="unicode") - - -class Identify(OAIItem): - """A class representing an Identify container in the OAI-PMH protocol. - - This class is specifically used for handling the response of an Identify request in OAI-PMH. - It differs from other OAI entities in that it is initialized with an OAIResponse object - rather than a direct XML element. The class parses the Identify information from the - response and provides access to its individual components. - - Args: - identify_response: The response object from an Identify request. - It should contain the XML representation of the Identify response. - - Attributes: - xml: The XML element representing the Identify response. - _identify_dict: A dictionary containing the parsed Identify information. - Dynamic Attributes: Based on the content of the Identify response, additional attributes - are dynamically set on this object. These can include attributes like - repository name, base URL, protocol version, etc. - - Raises: - ValueError: If the Identify element is not found in the provided XML. 
- """ - - def __init__(self, identify_response: OAIResponse) -> None: - super().__init__(identify_response.xml, strip_ns=True) - identify_element = self.xml.find(f".//{self._oai_namespace}Identify") - if identify_element is None: - raise ValueError("Identify element not found in the XML.") - self.xml = identify_element - self._identify_dict = xml_to_dict(self.xml, strip_ns=True) - for k, v in self._identify_dict.items(): - setattr(self, k.replace("-", "_"), v[0]) - - def __repr__(self) -> str: - return "" - - def __iter__(self) -> Iterator: - """Iterate over the Identify information, yielding key-value pairs.""" - return iter(self._identify_dict.items()) - - -class Header(OAIItem): - """A class representing an OAI Header in the OAI-PMH protocol. - - The header contains essential information about a record, such as its identifier, datestamp, - and set specifications. This class parses these details from the provided XML header element - and makes them easily accessible as attributes. - - Args: - header_element: The XML element representing the OAI header. - - Attributes: - deleted: Indicates whether the record is marked as deleted in the OAI-PMH repository. - identifier: The unique identifier of the record in the OAI-PMH repository. - datestamp: The datestamp of the record, indicating when it was last updated. - setSpecs: A list of set specifications that the record belongs to. - """ - - def __init__(self, header_element: etree._Element) -> None: - super().__init__(header_element, strip_ns=True) - self.deleted = self.xml.attrib.get("status") == "deleted" - _identifier_element = self.xml.find(f"{self._oai_namespace}identifier") - _datestamp_element = self.xml.find(f"{self._oai_namespace}datestamp") - - self.identifier = getattr(_identifier_element, "text", None) - self.datestamp = getattr(_datestamp_element, "text", None) - self.setSpecs = [setSpec.text for setSpec in self.xml.findall(f"{self._oai_namespace}setSpec")] - - def __repr__(self) -> str: - return f"
" - - def __iter__(self) -> Iterator: - """Iterate over the header information, yielding key-value pairs.""" - return iter( - [ - ("identifier", self.identifier), - ("datestamp", self.datestamp), - ("setSpecs", self.setSpecs), - ] - ) - - -class Record(OAIItem): - """A class representing an OAI record in the OAI-PMH protocol. - - This class encapsulates a record element from an OAI-PMH response, handling its parsing, and providing - structured access to its details, such as header information and metadata. It checks for the presence of - the header and metadata elements and raises an error if the header is not found. - - Args: - record_element: The XML element representing the OAI record. - strip_ns: If True, namespaces are removed from the element names in the parsed metadata. Defaults to True. - - Attributes: - header: An instance of the Header class representing the header information of the record. - deleted: Indicates whether the record is marked as deleted. - metadata: A dictionary representation of the record's metadata, if available and not deleted. - - Raises: - ValueError: If the header element is not found in the provided XML. - """ - - def __init__(self, record_element: etree._Element, strip_ns: bool = True) -> None: - super().__init__(record_element, strip_ns=strip_ns) - header_element = self.xml.find(f".//{self._oai_namespace}header") - if header_element is None: - raise ValueError("Header element not found in the XML.") - self.header = Header(header_element) - self.deleted = self.header.deleted - if not self.deleted: - self.metadata = self.get_metadata() - - def __repr__(self) -> str: - return f"" - - def __iter__(self) -> Iterator: - """Iterate over the record's metadata, yielding key-value pairs.""" - return iter(self.metadata.items()) - - def get_metadata(self): - """Extract and return the record's metadata as a dictionary.""" - # We want to get record/metadata//* - # would be the element ``dc`` - # in the ``oai_dc`` case. 
- return xml_to_dict( - self.xml.find(".//" + self._oai_namespace + "metadata").getchildren()[0], - strip_ns=self._strip_ns, - ) - - -class Set(OAIItem): - """A class representing a set in the OAI-PMH protocol. - - This class encapsulates a set element from an OAI-PMH response and provides structured access to its details. - It parses the set information from the provided XML element and dynamically sets attributes - based on the parsed content. - - Args: - set_element: The XML element representing the OAI set. The element is parsed to extract set details. - - Attributes: - setName: The name of the set, extracted from the set's XML element. - _set_dict: A dictionary containing the parsed set information. - """ - - def __init__(self, set_element: etree._Element) -> None: - super().__init__(set_element, strip_ns=True) - self._set_dict = xml_to_dict(self.xml, strip_ns=True) - self.setName: str | None = None - for k, v in self._set_dict.items(): - setattr(self, k.replace("-", "_"), v[0]) - - def __repr__(self) -> str: - return f"" - - def __iter__(self) -> Iterator: - """Iterate over the set information, yielding key-value pairs.""" - return iter(self._set_dict.items()) - - -class MetadataFormat(OAIItem): - """A class representing a metadata format in the OAI-PMH protocol. - - This class handles the representation of a metadata format, which is an essential part of the OAI-PMH protocol. - It parses the provided XML element to extract and store metadata format details such as the metadata prefix. - - Args: - mdf_element: The XML element representing the metadata format. This element is parsed - to extract metadata format details. - - Attributes: - metadataPrefix: The prefix of the metadata format, extracted from the XML element. - _mdf_dict: A dictionary containing the parsed metadata format details. 
- """ - - def __init__(self, mdf_element: etree._Element) -> None: - super().__init__(mdf_element, strip_ns=True) - self._mdf_dict = xml_to_dict(self.xml, strip_ns=True) - self.metadataPrefix: str | None = None - for k, v in self._mdf_dict.items(): - setattr(self, k.replace("-", "_"), v[0]) - - def __repr__(self) -> str: - return f"" - - def __iter__(self) -> Iterator: - """Iterate over the metadata format information, yielding key-value pairs.""" - return iter(self._mdf_dict.items()) diff --git a/src/oaipmh_scythe/models/__init__.py b/src/oaipmh_scythe/models/__init__.py new file mode 100644 index 0000000..0e38302 --- /dev/null +++ b/src/oaipmh_scythe/models/__init__.py @@ -0,0 +1,35 @@ +# SPDX-FileCopyrightText: 2024 Heinz-Alexander Fütterer +# +# SPDX-License-Identifier: BSD-3-Clause + +"""TODO.""" + +from typing import TypeAlias + +from oaipmh_scythe.models.oai_dc import Dc # TODO +from oaipmh_scythe.models.oai_pmh import ( + Header, + Identify, + MetadataFormat, + OaiPmh, + Record, + ResumptionToken, + Set, + Verb, +) + +# `Item` can be used for type annotations +Item: TypeAlias = Header | Record | Set | MetadataFormat + +__all__ = [ + "Header", + "Identify", + "MetadataFormat", + "OaiPmh", + "Record", + "ResumptionToken", + "Set", + "Verb", + "Item", + "Dc", # TODO +] diff --git a/src/oaipmh_scythe/models/datacite/.xsdata.xml b/src/oaipmh_scythe/models/datacite/.xsdata.xml new file mode 100644 index 0000000..43d3b91 --- /dev/null +++ b/src/oaipmh_scythe/models/datacite/.xsdata.xml @@ -0,0 +1,32 @@ + + + + src.oaipmh_scythe.models.datacite.models + dataclasses + single-package + Google + false + false + false + true + false + false + true + + + + + + + + + + + + + + + + + + diff --git a/src/oaipmh_scythe/models/datacite/README.md b/src/oaipmh_scythe/models/datacite/README.md new file mode 100644 index 0000000..e299ed6 --- /dev/null +++ b/src/oaipmh_scythe/models/datacite/README.md @@ -0,0 +1,7 @@ +Generate the models with: + +```console +python -m pip install 
xsdata[cli] +xsdata generate --config src/oaipmh_scythe/models/datacite/.xsdata.xml +http://schema.datacite.org/meta/kernel-4.5/metadata.xsd +``` diff --git a/src/oaipmh_scythe/models/datacite/__init__.py b/src/oaipmh_scythe/models/datacite/__init__.py new file mode 100644 index 0000000..94c58d0 --- /dev/null +++ b/src/oaipmh_scythe/models/datacite/__init__.py @@ -0,0 +1,43 @@ +"""This file was generated by xsdata, v24.4, on 2024-04-21 17:40:55 + +Generator: DataclassGenerator +See: https://xsdata.readthedocs.io/ +""" + +from oaipmh_scythe.models.datacite.models import ( + Affiliation, + Box, + ContributorType, + DateType, + DescriptionType, + FunderIdentifierType, + LangValue, + NameIdentifier, + NameType, + NumberType, + Point, + RelatedIdentifierType, + RelationType, + Resource, + ResourceType, + TitleType, +) + +__all__ = [ + "Affiliation", + "Box", + "ContributorType", + "DateType", + "DescriptionType", + "FunderIdentifierType", + "LangValue", + "NameIdentifier", + "NameType", + "NumberType", + "Point", + "RelatedIdentifierType", + "RelationType", + "Resource", + "ResourceType", + "TitleType", +] diff --git a/src/oaipmh_scythe/models/datacite/models.py b/src/oaipmh_scythe/models/datacite/models.py new file mode 100644 index 0000000..ea3a9e5 --- /dev/null +++ b/src/oaipmh_scythe/models/datacite/models.py @@ -0,0 +1,1675 @@ +"""This file was generated by xsdata, v24.4, on 2024-04-21 17:40:55 + +Generator: DataclassGenerator +See: https://xsdata.readthedocs.io/ +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from enum import Enum + + +@dataclass(slots=True) +class Affiliation: + """Uniquely identifies an affiliation, according to various identifier schemes.""" + + class Meta: + name = "affiliation" + target_namespace = "http://datacite.org/schema/kernel-4" + + value: str = field( + default="", + metadata={ + "required": True, + "min_length": 1, + }, + ) + affiliation_identifier: None | str = field( + default=None, + 
metadata={ + "name": "affiliationIdentifier", + "type": "Attribute", + }, + ) + affiliation_identifier_scheme: None | str = field( + default=None, + metadata={ + "name": "affiliationIdentifierScheme", + "type": "Attribute", + }, + ) + scheme_uri: None | str = field( + default=None, + metadata={ + "name": "schemeURI", + "type": "Attribute", + }, + ) + + +@dataclass(slots=True) +class Box: + class Meta: + name = "box" + target_namespace = "http://datacite.org/schema/kernel-4" + + west_bound_longitude: None | float = field( + default=None, + metadata={ + "name": "westBoundLongitude", + "type": "Element", + "namespace": "http://datacite.org/schema/kernel-4", + "required": True, + "min_inclusive": -180.0, + "max_inclusive": 180.0, + }, + ) + east_bound_longitude: None | float = field( + default=None, + metadata={ + "name": "eastBoundLongitude", + "type": "Element", + "namespace": "http://datacite.org/schema/kernel-4", + "required": True, + "min_inclusive": -180.0, + "max_inclusive": 180.0, + }, + ) + south_bound_latitude: None | float = field( + default=None, + metadata={ + "name": "southBoundLatitude", + "type": "Element", + "namespace": "http://datacite.org/schema/kernel-4", + "required": True, + "min_inclusive": -90.0, + "max_inclusive": 90.0, + }, + ) + north_bound_latitude: None | float = field( + default=None, + metadata={ + "name": "northBoundLatitude", + "type": "Element", + "namespace": "http://datacite.org/schema/kernel-4", + "required": True, + "min_inclusive": -90.0, + "max_inclusive": 90.0, + }, + ) + + +class ContributorType(Enum): + """The type of contributor of the resource.""" + + CONTACT_PERSON = "ContactPerson" + DATA_COLLECTOR = "DataCollector" + DATA_CURATOR = "DataCurator" + DATA_MANAGER = "DataManager" + DISTRIBUTOR = "Distributor" + EDITOR = "Editor" + HOSTING_INSTITUTION = "HostingInstitution" + OTHER = "Other" + PRODUCER = "Producer" + PROJECT_LEADER = "ProjectLeader" + PROJECT_MANAGER = "ProjectManager" + PROJECT_MEMBER = "ProjectMember" + 
REGISTRATION_AGENCY = "RegistrationAgency" + REGISTRATION_AUTHORITY = "RegistrationAuthority" + RELATED_PERSON = "RelatedPerson" + RESEARCH_GROUP = "ResearchGroup" + RIGHTS_HOLDER = "RightsHolder" + RESEARCHER = "Researcher" + SPONSOR = "Sponsor" + SUPERVISOR = "Supervisor" + WORK_PACKAGE_LEADER = "WorkPackageLeader" + + +class DateType(Enum): + """The type of date. + + Use RKMS‐ISO8601 standard for depicting date ranges. To indicate the end of an embargo period, use Available. + To indicate the start of an embargo period, use Submitted or Accepted, as appropriate. + """ + + ACCEPTED = "Accepted" + AVAILABLE = "Available" + COLLECTED = "Collected" + COPYRIGHTED = "Copyrighted" + CREATED = "Created" + ISSUED = "Issued" + OTHER = "Other" + SUBMITTED = "Submitted" + UPDATED = "Updated" + VALID = "Valid" + WITHDRAWN = "Withdrawn" + + +class DescriptionType(Enum): + """The type of the description.""" + + ABSTRACT = "Abstract" + METHODS = "Methods" + SERIES_INFORMATION = "SeriesInformation" + TABLE_OF_CONTENTS = "TableOfContents" + TECHNICAL_INFO = "TechnicalInfo" + OTHER = "Other" + + +class FunderIdentifierType(Enum): + """The type of the funderIdentifier.""" + + ISNI = "ISNI" + GRID = "GRID" + ROR = "ROR" + CROSSREF_FUNDER_ID = "Crossref Funder ID" + OTHER = "Other" + + +@dataclass(slots=True) +class NameIdentifier: + """Uniquely identifies a creator or contributor, according to various identifier + schemes.
+ """ + + class Meta: + name = "nameIdentifier" + target_namespace = "http://datacite.org/schema/kernel-4" + + value: str = field( + default="", + metadata={ + "required": True, + "min_length": 1, + }, + ) + name_identifier_scheme: None | str = field( + default=None, + metadata={ + "name": "nameIdentifierScheme", + "type": "Attribute", + "required": True, + }, + ) + scheme_uri: None | str = field( + default=None, + metadata={ + "name": "schemeURI", + "type": "Attribute", + }, + ) + + +class NameType(Enum): + ORGANIZATIONAL = "Organizational" + PERSONAL = "Personal" + + +class NumberType(Enum): + ARTICLE = "Article" + CHAPTER = "Chapter" + REPORT = "Report" + OTHER = "Other" + + +@dataclass(slots=True) +class Point: + class Meta: + name = "point" + target_namespace = "http://datacite.org/schema/kernel-4" + + point_longitude: None | float = field( + default=None, + metadata={ + "name": "pointLongitude", + "type": "Element", + "namespace": "http://datacite.org/schema/kernel-4", + "required": True, + "min_inclusive": -180.0, + "max_inclusive": 180.0, + }, + ) + point_latitude: None | float = field( + default=None, + metadata={ + "name": "pointLatitude", + "type": "Element", + "namespace": "http://datacite.org/schema/kernel-4", + "required": True, + "min_inclusive": -90.0, + "max_inclusive": 90.0, + }, + ) + + +class RelatedIdentifierType(Enum): + """The type of the RelatedIdentifier.""" + + ARK = "ARK" + AR_XIV = "arXiv" + BIBCODE = "bibcode" + DOI = "DOI" + EAN13 = "EAN13" + EISSN = "EISSN" + HANDLE = "Handle" + IGSN = "IGSN" + ISBN = "ISBN" + ISSN = "ISSN" + ISTC = "ISTC" + LISSN = "LISSN" + LSID = "LSID" + PMID = "PMID" + PURL = "PURL" + UPC = "UPC" + URL = "URL" + URN = "URN" + W3ID = "w3id" + + +class RelationType(Enum): + """Description of the relationship of the resource being registered (A) and the + related resource (B). 
+ """ + + IS_CITED_BY = "IsCitedBy" + CITES = "Cites" + IS_SUPPLEMENT_TO = "IsSupplementTo" + IS_SUPPLEMENTED_BY = "IsSupplementedBy" + IS_CONTINUED_BY = "IsContinuedBy" + CONTINUES = "Continues" + IS_NEW_VERSION_OF = "IsNewVersionOf" + IS_PREVIOUS_VERSION_OF = "IsPreviousVersionOf" + IS_PART_OF = "IsPartOf" + HAS_PART = "HasPart" + IS_PUBLISHED_IN = "IsPublishedIn" + IS_REFERENCED_BY = "IsReferencedBy" + REFERENCES = "References" + IS_DOCUMENTED_BY = "IsDocumentedBy" + DOCUMENTS = "Documents" + IS_COMPILED_BY = "IsCompiledBy" + COMPILES = "Compiles" + IS_VARIANT_FORM_OF = "IsVariantFormOf" + IS_ORIGINAL_FORM_OF = "IsOriginalFormOf" + IS_IDENTICAL_TO = "IsIdenticalTo" + HAS_METADATA = "HasMetadata" + IS_METADATA_FOR = "IsMetadataFor" + REVIEWS = "Reviews" + IS_REVIEWED_BY = "IsReviewedBy" + IS_DERIVED_FROM = "IsDerivedFrom" + IS_SOURCE_OF = "IsSourceOf" + DESCRIBES = "Describes" + IS_DESCRIBED_BY = "IsDescribedBy" + HAS_VERSION = "HasVersion" + IS_VERSION_OF = "IsVersionOf" + REQUIRES = "Requires" + IS_REQUIRED_BY = "IsRequiredBy" + OBSOLETES = "Obsoletes" + IS_OBSOLETED_BY = "IsObsoletedBy" + COLLECTS = "Collects" + IS_COLLECTED_BY = "IsCollectedBy" + + +class ResourceType(Enum): + """The general type of a resource.""" + + AUDIOVISUAL = "Audiovisual" + BOOK = "Book" + BOOK_CHAPTER = "BookChapter" + COLLECTION = "Collection" + COMPUTATIONAL_NOTEBOOK = "ComputationalNotebook" + CONFERENCE_PAPER = "ConferencePaper" + CONFERENCE_PROCEEDING = "ConferenceProceeding" + DATA_PAPER = "DataPaper" + DATASET = "Dataset" + DISSERTATION = "Dissertation" + EVENT = "Event" + IMAGE = "Image" + INSTRUMENT = "Instrument" + INTERACTIVE_RESOURCE = "InteractiveResource" + JOURNAL = "Journal" + JOURNAL_ARTICLE = "JournalArticle" + MODEL = "Model" + OUTPUT_MANAGEMENT_PLAN = "OutputManagementPlan" + PEER_REVIEW = "PeerReview" + PHYSICAL_OBJECT = "PhysicalObject" + PREPRINT = "Preprint" + REPORT = "Report" + SERVICE = "Service" + SOFTWARE = "Software" + SOUND = "Sound" + STANDARD = 
"Standard" + STUDY_REGISTRATION = "StudyRegistration" + TEXT = "Text" + WORKFLOW = "Workflow" + OTHER = "Other" + + +class TitleType(Enum): + ALTERNATIVE_TITLE = "AlternativeTitle" + SUBTITLE = "Subtitle" + TRANSLATED_TITLE = "TranslatedTitle" + OTHER = "Other" + + +class LangValue(Enum): + VALUE = "" + + +@dataclass(slots=True) +class Resource: + """Root element of a single record. + + This wrapper element is for XML implementation only and is not defined in the DataCite DOI standard. + Note: This is the case for all wrapper elements within this schema. + No content in this wrapper element. + + Attributes: + identifier: A persistent identifier that identifies a resource. + creators: + titles: + publisher: The name of the entity that holds, archives, publishes prints, distributes, releases, issues, + or produces the resource. This property will be used to formulate the citation, so consider the + prominence of the role. In the case of datasets, "publish" is understood to mean making the data + available to the community of researchers. + publication_year: Year when the data is made publicly available. If an embargo period has been in effect, + use the date when the embargo period ends. In the case of datasets, "publish" is understood to mean + making the data available on a specific date to the community of researchers. If there is no standard + publication year value, use the date that would be preferred from a citation perspective. YYYY + resource_type: The type of a resource. You may enter an additional free text description. The format is + open, but the preferred format is a single term of some detail so that a pair can be formed with the + sub-property. + subjects: + contributors: + dates: + language: Primary language of the resource. Allowed values are taken from IETF BCP 47, ISO 639-1 + language codes. + alternate_identifiers: + related_identifiers: + sizes: + formats: + version: Version number of the resource. 
If the primary resource has changed the version number + increases. Register a new identifier for a major version change. Individual stewards need to + determine which are major vs. minor versions. May be used in conjunction with properties 11 and 12 + (AlternateIdentifier and RelatedIdentifier) to indicate various information updates. May be used in + conjunction with property 17 (Description) to indicate the nature and file/record range of version. + rights_list: + descriptions: + geo_locations: + funding_references: + related_items: + """ + + class Meta: + name = "resource" + namespace = "http://datacite.org/schema/kernel-4" + + identifier: None | Resource.Identifier = field( + default=None, + metadata={ + "type": "Element", + "required": True, + }, + ) + creators: None | Resource.Creators = field( + default=None, + metadata={ + "type": "Element", + "required": True, + }, + ) + titles: None | Resource.Titles = field( + default=None, + metadata={ + "type": "Element", + "required": True, + }, + ) + publisher: None | Resource.Publisher = field( + default=None, + metadata={ + "type": "Element", + "required": True, + }, + ) + publication_year: None | str = field( + default=None, + metadata={ + "name": "publicationYear", + "type": "Element", + "required": True, + "pattern": r"[\d]{4}", + }, + ) + resource_type: None | Resource.ResourceType = field( + default=None, + metadata={ + "name": "resourceType", + "type": "Element", + "required": True, + }, + ) + subjects: None | Resource.Subjects = field( + default=None, + metadata={ + "type": "Element", + }, + ) + contributors: None | Resource.Contributors = field( + default=None, + metadata={ + "type": "Element", + }, + ) + dates: None | Resource.Dates = field( + default=None, + metadata={ + "type": "Element", + }, + ) + language: None | str = field( + default=None, + metadata={ + "type": "Element", + }, + ) + alternate_identifiers: None | Resource.AlternateIdentifiers = field( + default=None, + metadata={ + "name": 
"alternateIdentifiers", + "type": "Element", + }, + ) + related_identifiers: None | Resource.RelatedIdentifiers = field( + default=None, + metadata={ + "name": "relatedIdentifiers", + "type": "Element", + }, + ) + sizes: None | Resource.Sizes = field( + default=None, + metadata={ + "type": "Element", + }, + ) + formats: None | Resource.Formats = field( + default=None, + metadata={ + "type": "Element", + }, + ) + version: None | str = field( + default=None, + metadata={ + "type": "Element", + }, + ) + rights_list: None | Resource.RightsList = field( + default=None, + metadata={ + "name": "rightsList", + "type": "Element", + }, + ) + descriptions: None | Resource.Descriptions = field( + default=None, + metadata={ + "type": "Element", + }, + ) + geo_locations: None | Resource.GeoLocations = field( + default=None, + metadata={ + "name": "geoLocations", + "type": "Element", + }, + ) + funding_references: None | Resource.FundingReferences = field( + default=None, + metadata={ + "name": "fundingReferences", + "type": "Element", + }, + ) + related_items: None | Resource.RelatedItems = field( + default=None, + metadata={ + "name": "relatedItems", + "type": "Element", + }, + ) + + @dataclass(slots=True) + class Identifier: + value: str = field( + default="", + metadata={ + "required": True, + "min_length": 1, + }, + ) + identifier_type: None | object = field( + default=None, + metadata={ + "name": "identifierType", + "type": "Attribute", + "required": True, + }, + ) + + @dataclass(slots=True) + class Creators: + """Attributes: + creator: The main researchers involved working on the data, or the authors of the publication in + priority order. May be a corporate/institutional or personal name. Format: Family, Given. + Personal names can be further specified using givenName and familyName. 
+ """ + + creator: list[Resource.Creators.Creator] = field( + default_factory=list, + metadata={ + "type": "Element", + "min_occurs": 1, + }, + ) + + @dataclass(slots=True) + class Creator: + creator_name: None | Resource.Creators.Creator.CreatorName = field( + default=None, + metadata={ + "name": "creatorName", + "type": "Element", + "required": True, + }, + ) + given_name: None | object = field( + default=None, + metadata={ + "name": "givenName", + "type": "Element", + }, + ) + family_name: None | object = field( + default=None, + metadata={ + "name": "familyName", + "type": "Element", + }, + ) + name_identifier: list[object] = field( + default_factory=list, + metadata={ + "name": "nameIdentifier", + "type": "Element", + }, + ) + affiliation: list[object] = field( + default_factory=list, + metadata={ + "type": "Element", + }, + ) + + @dataclass(slots=True) + class CreatorName: + value: str = field( + default="", + metadata={ + "required": True, + }, + ) + name_type: None | NameType = field( + default=None, + metadata={ + "name": "nameType", + "type": "Attribute", + }, + ) + lang: None | str | LangValue = field( + default=None, + metadata={ + "type": "Attribute", + "namespace": "http://www.w3.org/XML/1998/namespace", + }, + ) + + @dataclass(slots=True) + class Titles: + """Attributes: + title: A name or title by which a resource is known. 
+ """ + + title: list[Resource.Titles.Title] = field( + default_factory=list, + metadata={ + "type": "Element", + "min_occurs": 1, + }, + ) + + @dataclass(slots=True) + class Title: + value: str = field( + default="", + metadata={ + "required": True, + }, + ) + title_type: None | TitleType = field( + default=None, + metadata={ + "name": "titleType", + "type": "Attribute", + }, + ) + lang: None | str | LangValue = field( + default=None, + metadata={ + "type": "Attribute", + "namespace": "http://www.w3.org/XML/1998/namespace", + }, + ) + + @dataclass(slots=True) + class Publisher: + value: str = field( + default="", + metadata={ + "required": True, + "min_length": 1, + }, + ) + publisher_identifier: None | str = field( + default=None, + metadata={ + "name": "publisherIdentifier", + "type": "Attribute", + }, + ) + publisher_identifier_scheme: None | str = field( + default=None, + metadata={ + "name": "publisherIdentifierScheme", + "type": "Attribute", + }, + ) + scheme_uri: None | str = field( + default=None, + metadata={ + "name": "schemeURI", + "type": "Attribute", + }, + ) + lang: None | str | LangValue = field( + default=None, + metadata={ + "type": "Attribute", + "namespace": "http://www.w3.org/XML/1998/namespace", + }, + ) + + @dataclass(slots=True) + class ResourceType: + value: str = field( + default="", + metadata={ + "required": True, + }, + ) + resource_type_general: None | ResourceType = field( + default=None, + metadata={ + "name": "resourceTypeGeneral", + "type": "Attribute", + "required": True, + }, + ) + + @dataclass(slots=True) + class Subjects: + """Attributes: + subject: Subject, keywords, classification codes, or key phrases describing the resource. 
+ """ + + subject: list[Resource.Subjects.Subject] = field( + default_factory=list, + metadata={ + "type": "Element", + }, + ) + + @dataclass(slots=True) + class Subject: + value: str = field( + default="", + metadata={ + "required": True, + }, + ) + subject_scheme: None | object = field( + default=None, + metadata={ + "name": "subjectScheme", + "type": "Attribute", + }, + ) + scheme_uri: None | str = field( + default=None, + metadata={ + "name": "schemeURI", + "type": "Attribute", + }, + ) + value_uri: None | str = field( + default=None, + metadata={ + "name": "valueURI", + "type": "Attribute", + }, + ) + classification_code: None | str = field( + default=None, + metadata={ + "name": "classificationCode", + "type": "Attribute", + }, + ) + lang: None | str | LangValue = field( + default=None, + metadata={ + "type": "Attribute", + "namespace": "http://www.w3.org/XML/1998/namespace", + }, + ) + + @dataclass(slots=True) + class Contributors: + """Attributes: + contributor: The institution or person responsible for collecting, creating, or otherwise + contributing to the development of the dataset. The personal name format should be: Family, + Given. 
+ """ + + contributor: list[Resource.Contributors.Contributor] = field( + default_factory=list, + metadata={ + "type": "Element", + }, + ) + + @dataclass(slots=True) + class Contributor: + contributor_name: None | Resource.Contributors.Contributor.ContributorName = field( + default=None, + metadata={ + "name": "contributorName", + "type": "Element", + "required": True, + }, + ) + given_name: None | object = field( + default=None, + metadata={ + "name": "givenName", + "type": "Element", + }, + ) + family_name: None | object = field( + default=None, + metadata={ + "name": "familyName", + "type": "Element", + }, + ) + name_identifier: list[object] = field( + default_factory=list, + metadata={ + "name": "nameIdentifier", + "type": "Element", + }, + ) + affiliation: list[object] = field( + default_factory=list, + metadata={ + "type": "Element", + }, + ) + contributor_type: None | ContributorType = field( + default=None, + metadata={ + "name": "contributorType", + "type": "Attribute", + "required": True, + }, + ) + + @dataclass(slots=True) + class ContributorName: + value: str = field( + default="", + metadata={ + "required": True, + "min_length": 1, + }, + ) + name_type: None | NameType = field( + default=None, + metadata={ + "name": "nameType", + "type": "Attribute", + }, + ) + lang: None | str | LangValue = field( + default=None, + metadata={ + "type": "Attribute", + "namespace": "http://www.w3.org/XML/1998/namespace", + }, + ) + + @dataclass(slots=True) + class Dates: + """Attributes: + date: Different dates relevant to the work. YYYY,YYYY-MM-DD, YYYY-MM-DDThh:mm:ssTZD or any other + format or level of granularity described in W3CDTF. Use RKMS-ISO8601 standard for depicting date + ranges. 
+ """ + + date: list[Resource.Dates.Date] = field( + default_factory=list, + metadata={ + "type": "Element", + }, + ) + + @dataclass(slots=True) + class Date: + value: str = field( + default="", + metadata={ + "required": True, + }, + ) + date_type: None | DateType = field( + default=None, + metadata={ + "name": "dateType", + "type": "Attribute", + "required": True, + }, + ) + date_information: None | object = field( + default=None, + metadata={ + "name": "dateInformation", + "type": "Attribute", + }, + ) + + @dataclass(slots=True) + class AlternateIdentifiers: + """Attributes: + alternate_identifier: An identifier or identifiers other than the primary Identifier applied to the + resource being registered. This may be any alphanumeric string which is unique within its domain + of issue. May be used for local identifiers. AlternateIdentifier should be used for another + identifier of the same instance (same location, same file). + """ + + alternate_identifier: list[Resource.AlternateIdentifiers.AlternateIdentifier] = field( + default_factory=list, + metadata={ + "name": "alternateIdentifier", + "type": "Element", + }, + ) + + @dataclass(slots=True) + class AlternateIdentifier: + value: str = field( + default="", + metadata={ + "required": True, + }, + ) + alternate_identifier_type: None | object = field( + default=None, + metadata={ + "name": "alternateIdentifierType", + "type": "Attribute", + "required": True, + }, + ) + + @dataclass(slots=True) + class RelatedIdentifiers: + """Attributes: + related_identifier: Identifiers of related resources. Use this property to indicate subsets of + properties, as appropriate. 
+ """ + + related_identifier: list[Resource.RelatedIdentifiers.RelatedIdentifier] = field( + default_factory=list, + metadata={ + "name": "relatedIdentifier", + "type": "Element", + }, + ) + + @dataclass(slots=True) + class RelatedIdentifier: + value: str = field( + default="", + metadata={ + "required": True, + }, + ) + resource_type_general: None | ResourceType = field( + default=None, + metadata={ + "name": "resourceTypeGeneral", + "type": "Attribute", + }, + ) + related_identifier_type: None | RelatedIdentifierType = field( + default=None, + metadata={ + "name": "relatedIdentifierType", + "type": "Attribute", + "required": True, + }, + ) + relation_type: None | RelationType = field( + default=None, + metadata={ + "name": "relationType", + "type": "Attribute", + "required": True, + }, + ) + related_metadata_scheme: None | object = field( + default=None, + metadata={ + "name": "relatedMetadataScheme", + "type": "Attribute", + }, + ) + scheme_uri: None | str = field( + default=None, + metadata={ + "name": "schemeURI", + "type": "Attribute", + }, + ) + scheme_type: None | object = field( + default=None, + metadata={ + "name": "schemeType", + "type": "Attribute", + }, + ) + + @dataclass(slots=True) + class Sizes: + """Attributes: + size: Unstructured size information about the resource. + """ + + size: list[str] = field( + default_factory=list, + metadata={ + "type": "Element", + }, + ) + + @dataclass(slots=True) + class Formats: + """Attributes: + format: Technical format of the resource. Use file extension or MIME type where possible. + """ + + format: list[str] = field( + default_factory=list, + metadata={ + "type": "Element", + }, + ) + + @dataclass(slots=True) + class RightsList: + """Attributes: + rights: Any rights information for this resource. Provide a rights management statement for the + resource or reference a service providing such information. Include embargo information if + applicable.
Use the complete title of a license and include version information if applicable. + """ + + rights: list[Resource.RightsList.Rights] = field( + default_factory=list, + metadata={ + "type": "Element", + }, + ) + + @dataclass(slots=True) + class Rights: + value: str = field( + default="", + metadata={ + "required": True, + }, + ) + rights_uri: None | str = field( + default=None, + metadata={ + "name": "rightsURI", + "type": "Attribute", + }, + ) + rights_identifier: None | object = field( + default=None, + metadata={ + "name": "rightsIdentifier", + "type": "Attribute", + }, + ) + rights_identifier_scheme: None | object = field( + default=None, + metadata={ + "name": "rightsIdentifierScheme", + "type": "Attribute", + }, + ) + scheme_uri: None | str = field( + default=None, + metadata={ + "name": "schemeURI", + "type": "Attribute", + }, + ) + lang: None | str | LangValue = field( + default=None, + metadata={ + "type": "Attribute", + "namespace": "http://www.w3.org/XML/1998/namespace", + }, + ) + + @dataclass(slots=True) + class Descriptions: + """Attributes: + description: All additional information that does not fit in any of the other categories. May be used + for technical information. It is a best practice to supply a description. 
+ """ + + description: list[Resource.Descriptions.Description] = field( + default_factory=list, + metadata={ + "type": "Element", + }, + ) + + @dataclass(slots=True) + class Description: + description_type: None | DescriptionType = field( + default=None, + metadata={ + "name": "descriptionType", + "type": "Attribute", + "required": True, + }, + ) + lang: None | str | LangValue = field( + default=None, + metadata={ + "type": "Attribute", + "namespace": "http://www.w3.org/XML/1998/namespace", + }, + ) + content: list[object] = field( + default_factory=list, + metadata={ + "type": "Wildcard", + "namespace": "##any", + "mixed": True, + "choices": ( + { + "name": "br", + "type": object, + }, + ), + }, + ) + + @dataclass(slots=True) + class GeoLocations: + geo_location: list[Resource.GeoLocations.GeoLocation] = field( + default_factory=list, + metadata={ + "name": "geoLocation", + "type": "Element", + }, + ) + + @dataclass(slots=True) + class GeoLocation: + """Attributes: + geo_location_place: Spatial region or named place where the data was gathered or about which the + data is focused. + geo_location_point: A point contains a single latitude-longitude pair. + geo_location_box: A box contains two white space separated latitude-longitude pairs, with each + pair separated by whitespace. The first pair is the lower corner, the second is the upper + corner. + geo_location_polygon: A drawn polygon area, defined by a set of points and lines connecting the + points in a closed chain. 
+ """ + + geo_location_place: list[object] = field( + default_factory=list, + metadata={ + "name": "geoLocationPlace", + "type": "Element", + }, + ) + geo_location_point: list[Point] = field( + default_factory=list, + metadata={ + "name": "geoLocationPoint", + "type": "Element", + }, + ) + geo_location_box: list[Box] = field( + default_factory=list, + metadata={ + "name": "geoLocationBox", + "type": "Element", + }, + ) + geo_location_polygon: list[Resource.GeoLocations.GeoLocation.GeoLocationPolygon] = field( + default_factory=list, + metadata={ + "name": "geoLocationPolygon", + "type": "Element", + }, + ) + + @dataclass(slots=True) + class GeoLocationPolygon: + polygon_point: list[Point] = field( + default_factory=list, + metadata={ + "name": "polygonPoint", + "type": "Element", + "min_occurs": 4, + }, + ) + in_polygon_point: None | Point = field( + default=None, + metadata={ + "name": "inPolygonPoint", + "type": "Element", + }, + ) + + @dataclass(slots=True) + class FundingReferences: + """Attributes: + funding_reference: Information about financial support (funding) for the resource being registered. + """ + + funding_reference: list[Resource.FundingReferences.FundingReference] = field( + default_factory=list, + metadata={ + "name": "fundingReference", + "type": "Element", + }, + ) + + @dataclass(slots=True) + class FundingReference: + """Attributes: + funder_name: Name of the funding provider. + funder_identifier: Uniquely identifies a funding entity, according to various types. + award_number: The code assigned by the funder to a sponsored award (grant). + award_title: The human readable title of the award (grant). 
+ """ + + funder_name: None | str = field( + default=None, + metadata={ + "name": "funderName", + "type": "Element", + "required": True, + "min_length": 1, + }, + ) + funder_identifier: None | Resource.FundingReferences.FundingReference.FunderIdentifier = field( + default=None, + metadata={ + "name": "funderIdentifier", + "type": "Element", + }, + ) + award_number: None | Resource.FundingReferences.FundingReference.AwardNumber = field( + default=None, + metadata={ + "name": "awardNumber", + "type": "Element", + }, + ) + award_title: None | object = field( + default=None, + metadata={ + "name": "awardTitle", + "type": "Element", + }, + ) + + @dataclass(slots=True) + class FunderIdentifier: + value: str = field( + default="", + metadata={ + "required": True, + }, + ) + funder_identifier_type: None | FunderIdentifierType = field( + default=None, + metadata={ + "name": "funderIdentifierType", + "type": "Attribute", + "required": True, + }, + ) + scheme_uri: None | str = field( + default=None, + metadata={ + "name": "schemeURI", + "type": "Attribute", + }, + ) + + @dataclass(slots=True) + class AwardNumber: + value: str = field( + default="", + metadata={ + "required": True, + }, + ) + award_uri: None | str = field( + default=None, + metadata={ + "name": "awardURI", + "type": "Attribute", + }, + ) + + @dataclass(slots=True) + class RelatedItems: + """Attributes: + related_item: Information about a resource related to the one being registered e.g. a journal or book + of which the article or chapter is part. + """ + + related_item: list[Resource.RelatedItems.RelatedItem] = field( + default_factory=list, + metadata={ + "name": "relatedItem", + "type": "Element", + }, + ) + + @dataclass(slots=True) + class RelatedItem: + """Attributes: + related_item_identifier: The identifier for the related item. + creators: + titles: + publication_year: The year when the item was or will be made publicly available. + volume: Volume of the related item. 
+ issue: Issue number or name of the related item. + number: Number of the related item e.g. report number or article number. + first_page: First page of the related item e.g. of the chapter, article, or conference paper. + last_page: Last page of the related item e.g. of the chapter, article, or conference paper. + publisher: The name of the entity that holds, archives, publishes prints, distributes, releases, + issues, or produces the resource. This property will be used to formulate the citation, so + consider the prominence of the role. + edition: Edition or version of the related item. + contributors: + related_item_type: The type of the related item, e.g. journal article, book or chapter. + relation_type: Description of the relationship of the resource being registered (A) and the + related resource (B). + """ + + related_item_identifier: None | Resource.RelatedItems.RelatedItem.RelatedItemIdentifier = field( + default=None, + metadata={ + "name": "relatedItemIdentifier", + "type": "Element", + }, + ) + creators: None | Resource.RelatedItems.RelatedItem.Creators = field( + default=None, + metadata={ + "type": "Element", + }, + ) + titles: None | Resource.RelatedItems.RelatedItem.Titles = field( + default=None, + metadata={ + "type": "Element", + }, + ) + publication_year: None | str = field( + default=None, + metadata={ + "name": "publicationYear", + "type": "Element", + "pattern": r"[\d]{4}", + }, + ) + volume: None | object = field( + default=None, + metadata={ + "type": "Element", + }, + ) + issue: None | object = field( + default=None, + metadata={ + "type": "Element", + }, + ) + number: None | Resource.RelatedItems.RelatedItem.Number = field( + default=None, + metadata={ + "type": "Element", + }, + ) + first_page: None | object = field( + default=None, + metadata={ + "name": "firstPage", + "type": "Element", + }, + ) + last_page: None | object = field( + default=None, + metadata={ + "name": "lastPage", + "type": "Element", + }, + ) + publisher: None |
object = field( + default=None, + metadata={ + "type": "Element", + }, + ) + edition: None | object = field( + default=None, + metadata={ + "type": "Element", + }, + ) + contributors: None | Resource.RelatedItems.RelatedItem.Contributors = field( + default=None, + metadata={ + "type": "Element", + }, + ) + related_item_type: None | ResourceType = field( + default=None, + metadata={ + "name": "relatedItemType", + "type": "Attribute", + "required": True, + }, + ) + relation_type: None | RelationType = field( + default=None, + metadata={ + "name": "relationType", + "type": "Attribute", + "required": True, + }, + ) + + @dataclass(slots=True) + class RelatedItemIdentifier: + """Attributes: + value: + related_item_identifier_type: The type of the Identifier for the related item e.g. DOI. + related_metadata_scheme: The name of the scheme. + scheme_uri: The URI of the relatedMetadataScheme. + scheme_type: The type of the relatedMetadataScheme, linked with the schemeURI. + """ + + value: str = field( + default="", + metadata={ + "required": True, + }, + ) + related_item_identifier_type: None | RelatedIdentifierType = field( + default=None, + metadata={ + "name": "relatedItemIdentifierType", + "type": "Attribute", + }, + ) + related_metadata_scheme: None | object = field( + default=None, + metadata={ + "name": "relatedMetadataScheme", + "type": "Attribute", + }, + ) + scheme_uri: None | str = field( + default=None, + metadata={ + "name": "schemeURI", + "type": "Attribute", + }, + ) + scheme_type: None | object = field( + default=None, + metadata={ + "name": "schemeType", + "type": "Attribute", + }, + ) + + @dataclass(slots=True) + class Creators: + """Attributes: + creator: The institution or person responsible for creating the related resource. To supply + multiple creators, repeat this property. 
+ """ + + creator: list[Resource.RelatedItems.RelatedItem.Creators.Creator] = field( + default_factory=list, + metadata={ + "type": "Element", + }, + ) + + @dataclass(slots=True) + class Creator: + creator_name: None | Resource.RelatedItems.RelatedItem.Creators.Creator.CreatorName = field( + default=None, + metadata={ + "name": "creatorName", + "type": "Element", + "required": True, + }, + ) + given_name: None | object = field( + default=None, + metadata={ + "name": "givenName", + "type": "Element", + }, + ) + family_name: None | object = field( + default=None, + metadata={ + "name": "familyName", + "type": "Element", + }, + ) + + @dataclass(slots=True) + class CreatorName: + value: str = field( + default="", + metadata={ + "required": True, + }, + ) + name_type: None | NameType = field( + default=None, + metadata={ + "name": "nameType", + "type": "Attribute", + }, + ) + lang: None | str | LangValue = field( + default=None, + metadata={ + "type": "Attribute", + "namespace": "http://www.w3.org/XML/1998/namespace", + }, + ) + + @dataclass(slots=True) + class Titles: + """Attributes: + title: Title of the related item. 
+ """ + + title: list[Resource.RelatedItems.RelatedItem.Titles.Title] = field( + default_factory=list, + metadata={ + "type": "Element", + }, + ) + + @dataclass(slots=True) + class Title: + value: str = field( + default="", + metadata={ + "required": True, + }, + ) + title_type: None | TitleType = field( + default=None, + metadata={ + "name": "titleType", + "type": "Attribute", + }, + ) + lang: None | str | LangValue = field( + default=None, + metadata={ + "type": "Attribute", + "namespace": "http://www.w3.org/XML/1998/namespace", + }, + ) + + @dataclass(slots=True) + class Number: + value: str = field( + default="", + metadata={ + "required": True, + }, + ) + number_type: None | NumberType = field( + default=None, + metadata={ + "name": "numberType", + "type": "Attribute", + }, + ) + + @dataclass(slots=True) + class Contributors: + """Attributes: + contributor: The institution or person responsible for collecting, managing, distributing, or + otherwise contributing to the development of the resource. + """ + + contributor: list[Resource.RelatedItems.RelatedItem.Contributors.Contributor] = field( + default_factory=list, + metadata={ + "type": "Element", + }, + ) + + @dataclass(slots=True) + class Contributor: + """Attributes: + contributor_name: + given_name: + family_name: + contributor_type: The type of contributor of the resource. 
+ """ + + contributor_name: ( + None | Resource.RelatedItems.RelatedItem.Contributors.Contributor.ContributorName + ) = field( + default=None, + metadata={ + "name": "contributorName", + "type": "Element", + "required": True, + }, + ) + given_name: None | object = field( + default=None, + metadata={ + "name": "givenName", + "type": "Element", + }, + ) + family_name: None | object = field( + default=None, + metadata={ + "name": "familyName", + "type": "Element", + }, + ) + contributor_type: None | ContributorType = field( + default=None, + metadata={ + "name": "contributorType", + "type": "Attribute", + "required": True, + }, + ) + + @dataclass(slots=True) + class ContributorName: + value: str = field( + default="", + metadata={ + "required": True, + }, + ) + name_type: None | NameType = field( + default=None, + metadata={ + "name": "nameType", + "type": "Attribute", + }, + ) + lang: None | str | LangValue = field( + default=None, + metadata={ + "type": "Attribute", + "namespace": "http://www.w3.org/XML/1998/namespace", + }, + ) diff --git a/src/oaipmh_scythe/models/marcxml/.xsdata.xml b/src/oaipmh_scythe/models/marcxml/.xsdata.xml new file mode 100644 index 0000000..d6bd5e7 --- /dev/null +++ b/src/oaipmh_scythe/models/marcxml/.xsdata.xml @@ -0,0 +1,32 @@ + + + + src.oaipmh_scythe.models.marcxml.models + dataclasses + single-package + Google + false + false + false + true + false + false + true + + + + + + + + + + + + + + + + + + diff --git a/src/oaipmh_scythe/models/marcxml/README.md b/src/oaipmh_scythe/models/marcxml/README.md new file mode 100644 index 0000000..a1822fc --- /dev/null +++ b/src/oaipmh_scythe/models/marcxml/README.md @@ -0,0 +1,6 @@ +Generate the models with: + +```console +python -m pip install xsdata[cli] +xsdata generate --config src/oaipmh_scythe/models/marcxml/.xsdata.xml https://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd +``` diff --git a/src/oaipmh_scythe/models/marcxml/__init__.py b/src/oaipmh_scythe/models/marcxml/__init__.py new 
file mode 100644 index 0000000..fe61c81 --- /dev/null +++ b/src/oaipmh_scythe/models/marcxml/__init__.py @@ -0,0 +1,29 @@ +"""This file was generated by xsdata, v24.4, on 2024-04-23 15:41:50 + +Generator: DataclassGenerator +See: https://xsdata.readthedocs.io/ +""" + +from oaipmh_scythe.models.marcxml.models import ( + Collection, + CollectionType, + ControlFieldType, + DataFieldType, + LeaderFieldType, + Record, + RecordType, + RecordTypeType, + SubfieldatafieldType, +) + +__all__ = [ + "Collection", + "CollectionType", + "ControlFieldType", + "DataFieldType", + "LeaderFieldType", + "Record", + "RecordType", + "RecordTypeType", + "SubfieldatafieldType", +] diff --git a/src/oaipmh_scythe/models/marcxml/models.py b/src/oaipmh_scythe/models/marcxml/models.py new file mode 100644 index 0000000..e9c93ac --- /dev/null +++ b/src/oaipmh_scythe/models/marcxml/models.py @@ -0,0 +1,235 @@ +"""This file was generated by xsdata, v24.4, on 2024-04-23 15:41:50 + +Generator: DataclassGenerator +See: https://xsdata.readthedocs.io/ +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from enum import Enum + +__NAMESPACE__ = "http://www.loc.gov/MARC21/slim" + + +@dataclass(slots=True) +class ControlFieldType: + """MARC21 Fields 001-009.""" + + class Meta: + name = "controlFieldType" + + value: str = field( + default="", + metadata={ + "required": True, + "white_space": "preserve", + }, + ) + id: None | str = field( + default=None, + metadata={ + "type": "Attribute", + }, + ) + tag: None | str = field( + default=None, + metadata={ + "type": "Attribute", + "required": True, + "white_space": "preserve", + "pattern": r"00[1-9A-Za-z]{1}", + }, + ) + + +@dataclass(slots=True) +class LeaderFieldType: + """MARC21 Leader, 24 bytes.""" + + class Meta: + name = "leaderFieldType" + + value: str = field( + default="", + metadata={ + "required": True, + "white_space": "preserve", + "pattern": r"[\d ]{5}[\dA-Za-z ]{1}[\dA-Za-z]{1}[\dA-Za-z ]{3}(2| )(2| )[\d 
]{5}[\dA-Za-z ]{3}(4500| )", + }, + ) + id: None | str = field( + default=None, + metadata={ + "type": "Attribute", + }, + ) + + +class RecordTypeType(Enum): + BIBLIOGRAPHIC = "Bibliographic" + AUTHORITY = "Authority" + HOLDINGS = "Holdings" + CLASSIFICATION = "Classification" + COMMUNITY = "Community" + + +@dataclass(slots=True) +class SubfieldatafieldType: + class Meta: + name = "subfieldatafieldType" + + value: str = field( + default="", + metadata={ + "required": True, + "white_space": "preserve", + }, + ) + id: None | str = field( + default=None, + metadata={ + "type": "Attribute", + }, + ) + code: None | str = field( + default=None, + metadata={ + "type": "Attribute", + "required": True, + "white_space": "preserve", + "pattern": r"[\dA-Za-z!\"#$%&'()*+,-./:;<=>?{}_^`~\[\]\\]{1}", + }, + ) + + +@dataclass(slots=True) +class DataFieldType: + """MARC21 Variable Data Fields 010-999.""" + + class Meta: + name = "dataFieldType" + + subfield: list[SubfieldatafieldType] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://www.loc.gov/MARC21/slim", + "min_occurs": 1, + }, + ) + id: None | str = field( + default=None, + metadata={ + "type": "Attribute", + }, + ) + tag: None | str = field( + default=None, + metadata={ + "type": "Attribute", + "required": True, + "white_space": "preserve", + "pattern": r"(0([1-9A-Z][0-9A-Z])|0([1-9a-z][0-9a-z]))|(([1-9A-Z][0-9A-Z]{2})|([1-9a-z][0-9a-z]{2}))", + }, + ) + ind1: None | str = field( + default=None, + metadata={ + "type": "Attribute", + "required": True, + "white_space": "preserve", + "pattern": r"[\da-z ]{1}", + }, + ) + ind2: None | str = field( + default=None, + metadata={ + "type": "Attribute", + "required": True, + "white_space": "preserve", + "pattern": r"[\da-z ]{1}", + }, + ) + + +@dataclass(slots=True) +class RecordType: + class Meta: + name = "recordType" + + leader: None | LeaderFieldType = field( + default=None, + metadata={ + "type": "Element", + "namespace": 
def _status_is_deleted(status: Any) -> bool:
    """Return True when *status* is an enum member named ``DELETED``.

    The member is matched by name instead of importing the generated ``Status``
    enum, because the generated OAI-PMH models import this module and a
    module-level import in the other direction would be circular.

    Args:
        status: The value of a header's ``status`` attribute; may be None.

    Returns:
        True only for a value whose ``name`` is ``"DELETED"``. None, and any
        object without a ``name`` attribute, yield False.
    """
    # Every plain enum member is truthy, so a truthiness test cannot tell
    # members apart; compare the member name explicitly.
    return getattr(status, "name", None) == "DELETED"


class HeaderMixin:
    """A mixin class that provides functionality for managing headers in records.

    Attributes:
        status: The status attribute of the header (None when the attribute is absent).
    """

    status: Any

    @property
    def deleted(self) -> bool:
        """Indicate if this header has been deleted.

        Returns:
            True if the status attribute is the DELETED member, False otherwise
            (including when no status is set).
        """
        return _status_is_deleted(self.status)


class RecordMixin:
    """A mixin class that provides functionality for managing records.

    Attributes:
        header: The header of the record (may be None on a default-constructed record).
        metadata: The metadata associated with the record (may be None, e.g. for deleted records).
    """

    header: Any
    metadata: Any

    @property
    def deleted(self) -> bool:
        """Indicate if this record has been deleted.

        Returns:
            True if the header's status attribute is the DELETED member, False otherwise.
            A record without a header is reported as not deleted instead of raising.
        """
        header = self.header
        if header is None:
            return False
        return _status_is_deleted(header.status)

    def get_metadata(self) -> Any:
        """Return the metadata associated with this record.

        Returns:
            The element held by the record's metadata container, or None when the
            record has no metadata part (the schema does not require one, and
            deleted records are delivered without it).
        """
        metadata = self.metadata
        if metadata is None:
            return None
        return metadata.other_element
@dataclass(slots=True)
class ElementType:
    """Common shape of the Dublin Core element bindings defined in this module.

    An element consists of text content (``value``) and an optional ``lang``
    attribute that lives in the XML namespace, i.e. ``xml:lang``. The concrete
    element classes in this module subclass this type and only add a ``Meta``
    with their element name and namespace.
    """

    class Meta:
        name = "elementType"
        target_namespace = "http://purl.org/dc/elements/1.1/"

    # Text content of the element; defaults to the empty string.
    value: str = field(
        default="",
        metadata={
            "required": True,
        },
    )
    # ``LangValue`` has the single member ``VALUE = ""``.
    # NOTE(review): presumably this models an explicitly empty xml:lang - confirm against the schema.
    lang: None | str | LangValue = field(
        default=None,
        metadata={
            "type": "Attribute",
            "namespace": "http://www.w3.org/XML/1998/namespace",
        },
    )
"http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class Date(ElementType): + class Meta: + name = "date" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class Description(ElementType): + class Meta: + name = "description" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class Format(ElementType): + class Meta: + name = "format" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class Identifier(ElementType): + class Meta: + name = "identifier" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class Language(ElementType): + class Meta: + name = "language" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class Publisher(ElementType): + class Meta: + name = "publisher" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class Relation(ElementType): + class Meta: + name = "relation" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class Rights(ElementType): + class Meta: + name = "rights" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class Source(ElementType): + class Meta: + name = "source" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class Subject(ElementType): + class Meta: + name = "subject" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class Title(ElementType): + class Meta: + name = "title" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class TypeType(ElementType): + class Meta: + name = "type" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass(slots=True) +class OaiDcType: + class Meta: + name = "oai_dcType" + target_namespace = "http://www.openarchives.org/OAI/2.0/oai_dc/" + + title: list[Title] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + 
}, + ) + creator: list[Creator] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + subject: list[Subject] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + description: list[Description] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + publisher: list[Publisher] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + contributor: list[Contributor] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + date: list[Date] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + type_value: list[TypeType] = field( + default_factory=list, + metadata={ + "name": "type", + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + format: list[Format] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + identifier: list[Identifier] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + source: list[Source] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + language: list[Language] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + relation: list[Relation] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + coverage: list[Coverage] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": 
"http://purl.org/dc/elements/1.1/", + }, + ) + rights: list[Rights] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + + +@dataclass(slots=True) +class Dc(OaiDcType): + class Meta: + name = "dc" + namespace = "http://www.openarchives.org/OAI/2.0/oai_dc/" diff --git a/src/oaipmh_scythe/models/oai_pmh/.xsdata.xml b/src/oaipmh_scythe/models/oai_pmh/.xsdata.xml new file mode 100644 index 0000000..445b3eb --- /dev/null +++ b/src/oaipmh_scythe/models/oai_pmh/.xsdata.xml @@ -0,0 +1,37 @@ + + + + src.oaipmh_scythe.models.oai_pmh.models + dataclasses + single-package + Google + false + false + false + true + false + false + true + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/oaipmh_scythe/models/oai_pmh/README.md b/src/oaipmh_scythe/models/oai_pmh/README.md new file mode 100644 index 0000000..54a3bfe --- /dev/null +++ b/src/oaipmh_scythe/models/oai_pmh/README.md @@ -0,0 +1,6 @@ +Generate the models with: + +```console +python -m pip install xsdata[cli] +xsdata generate --config src/oaipmh_scythe/models/oai_pmh/.xsdata.xml https://www.openarchives.org/OAI/2.0/OAI-PMH.xsd +``` diff --git a/src/oaipmh_scythe/models/oai_pmh/__init__.py b/src/oaipmh_scythe/models/oai_pmh/__init__.py new file mode 100644 index 0000000..4398398 --- /dev/null +++ b/src/oaipmh_scythe/models/oai_pmh/__init__.py @@ -0,0 +1,59 @@ +"""This file was generated by xsdata, v24.4, on 2024-04-22 17:48:31 + +Generator: DataclassGenerator +See: https://xsdata.readthedocs.io/ +""" + +from oaipmh_scythe.models.oai_pmh.models import ( + About, + DeletedRecord, + Description, + GetRecord, + Granularity, + Header, + Identify, + ListIdentifiers, + ListMetadataFormats, + ListRecords, + ListSets, + Metadata, + MetadataFormat, + OaiPmh, + OaiPmherror, + OaiPmherrorcode, + OaiPmhtype, + ProtocolVersion, + Record, + Request, + ResumptionToken, + Set, + Status, + Verb, +) + +__all__ = [ + "GetRecord", + "Identify", + 
"ListIdentifiers", + "ListMetadataFormats", + "ListRecords", + "ListSets", + "OaiPmh", + "OaiPmherror", + "OaiPmherrorcode", + "OaiPmhtype", + "About", + "DeletedRecord", + "Description", + "Granularity", + "Header", + "MetadataFormat", + "Metadata", + "ProtocolVersion", + "Record", + "Request", + "ResumptionToken", + "Set", + "Status", + "Verb", +] diff --git a/src/oaipmh_scythe/models/oai_pmh/models.py b/src/oaipmh_scythe/models/oai_pmh/models.py new file mode 100644 index 0000000..b00f751 --- /dev/null +++ b/src/oaipmh_scythe/models/oai_pmh/models.py @@ -0,0 +1,649 @@ +"""This file was generated by xsdata, v24.4, on 2024-04-22 17:48:31 + +Generator: DataclassGenerator +See: https://xsdata.readthedocs.io/ +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from enum import Enum + +from xsdata.models.datatype import XmlDate, XmlDateTime + +from oaipmh_scythe.models.mixins import HeaderMixin, RecordMixin + +__NAMESPACE__ = "http://www.openarchives.org/OAI/2.0/" + + +class OaiPmherrorcode(Enum): + CANNOT_DISSEMINATE_FORMAT = "cannotDisseminateFormat" + ID_DOES_NOT_EXIST = "idDoesNotExist" + BAD_ARGUMENT = "badArgument" + BAD_VERB = "badVerb" + NO_METADATA_FORMATS = "noMetadataFormats" + NO_RECORDS_MATCH = "noRecordsMatch" + BAD_RESUMPTION_TOKEN = "badResumptionToken" + NO_SET_HIERARCHY = "noSetHierarchy" + + +@dataclass(slots=True) +class About: + """Data "about" the record must be expressed in XML that is compliant with an XML Schema defined by a community.""" + + class Meta: + name = "aboutType" + + other_element: None | object = field( + default=None, + metadata={ + "type": "Wildcard", + "namespace": "##other", + }, + ) + + +class DeletedRecord(Enum): + NO = "no" + PERSISTENT = "persistent" + TRANSIENT = "transient" + + +@dataclass(slots=True) +class Description: + """The descriptionType is used for the description element in Identify and for setDescription element in ListSets. 
@dataclass(slots=True)
class Metadata:
    """Metadata must be expressed in XML that complies with another XML Schema (namespace=#other).

    Metadata must be explicitly qualified in the response.
    """

    class Meta:
        name = "metadataType"

    # Single wildcard slot restricted to elements from a foreign namespace ("##other").
    # It stays None when no such element is bound.
    # NOTE(review): the concrete Python type of the bound value presumably depends on
    # which model classes the parser knows about - confirm against the parser setup.
    other_element: None | object = field(
        default=None,
        metadata={
            "type": "Wildcard",
            "namespace": "##other",
        },
    )
class Status(Enum):
    """Value of the optional ``status`` attribute of a ``Header``.

    ``deleted`` is the only member; a header without the attribute has
    ``status=None``.
    """

    DELETED = "deleted"
"name": "deletedRecord", + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "required": True, + }, + ) + granularity: None | Granularity = field( + default=None, + metadata={ + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "required": True, + }, + ) + compression: list[str] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + }, + ) + description: list[Description] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + }, + ) + + +@dataclass(slots=True) +class ListMetadataFormats: + class Meta: + name = "ListMetadataFormatsType" + + metadata_format: list[MetadataFormat] = field( + default_factory=list, + metadata={ + "name": "metadataFormat", + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "min_occurs": 1, + }, + ) + + +@dataclass(slots=True) +class OaiPmherror: + class Meta: + name = "OAI-PMHerrorType" + + value: str = field( + default="", + metadata={ + "required": True, + }, + ) + code: None | OaiPmherrorcode = field( + default=None, + metadata={ + "type": "Attribute", + "required": True, + }, + ) + + +@dataclass(slots=True) +class Header(HeaderMixin): + """A header has a unique identifier, a datestamp, and setSpec(s) in case the item from which the record is + disseminated belongs to set(s). + + the header can carry a deleted status indicating that the record is deleted. 
+ """ + + class Meta: + name = "headerType" + + identifier: None | str = field( + default=None, + metadata={ + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "required": True, + }, + ) + datestamp: None | XmlDate | str = field( + default=None, + metadata={ + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "required": True, + "pattern": r".*Z", + }, + ) + set_spec: list[str] = field( + default_factory=list, + metadata={ + "name": "setSpec", + "type": "Element", + "namespace": "http://www.openarchives.org/OAI/2.0/", + "pattern": r"([A-Za-z0-9\-_\.!~\*'\(\)])+(:[A-Za-z0-9\-_\.!~\*'\(\)]+)*", + }, + ) + status: None | Status = field( + default=None, + metadata={ + "type": "Attribute", + }, + ) + + +@dataclass(slots=True) +class Request: + """Define requestType, indicating the protocol request that led to the response. + + Element content is BASE-URL, attributes are arguments of protocol request, attribute-values are values of + arguments of protocol request + """ + + class Meta: + name = "requestType" + + value: str = field( + default="", + metadata={ + "required": True, + }, + ) + verb: None | Verb = field( + default=None, + metadata={ + "type": "Attribute", + }, + ) + identifier: None | str = field( + default=None, + metadata={ + "type": "Attribute", + }, + ) + metadata_prefix: None | str = field( + default=None, + metadata={ + "name": "metadataPrefix", + "type": "Attribute", + "pattern": r"[A-Za-z0-9\-_\.!~\*'\(\)]+", + }, + ) + from_value: None | XmlDate | str = field( + default=None, + metadata={ + "name": "from", + "type": "Attribute", + "pattern": r".*Z", + }, + ) + until: None | XmlDate | str = field( + default=None, + metadata={ + "type": "Attribute", + "pattern": r".*Z", + }, + ) + set: None | str = field( + default=None, + metadata={ + "type": "Attribute", + "pattern": r"([A-Za-z0-9\-_\.!~\*'\(\)])+(:[A-Za-z0-9\-_\.!~\*'\(\)]+)*", + }, + ) + resumption_token: None | str = field( + default=None, + 
@dataclass(slots=True)
class Record(RecordMixin):
    """A record has a header, a metadata part, and an optional about container.

    Only ``header`` is marked as required; ``metadata`` and ``about`` may be
    absent, in which case they are None and an empty list respectively.
    """

    class Meta:
        name = "recordType"

    # Marked required in the binding metadata, but the dataclass default is still None.
    header: None | Header = field(
        default=None,
        metadata={
            "type": "Element",
            "namespace": "http://www.openarchives.org/OAI/2.0/",
            "required": True,
        },
    )
    # Not marked required: callers must be prepared for None here.
    metadata: None | Metadata = field(
        default=None,
        metadata={
            "type": "Element",
            "namespace": "http://www.openarchives.org/OAI/2.0/",
        },
    )
    # Zero or more "about" containers.
    about: list[About] = field(
        default_factory=list,
        metadata={
            "type": "Element",
            "namespace": "http://www.openarchives.org/OAI/2.0/",
        },
    )
@dataclass(slots=True)
class OaiPmh(OaiPmhtype):
    """Binding for the ``OAI-PMH`` element in the OAI-PMH 2.0 namespace.

    Adds no fields of its own; all fields are inherited from ``OaiPmhtype``.
    Only the element name and namespace are declared here.
    """

    class Meta:
        name = "OAI-PMH"
        namespace = "http://www.openarchives.org/OAI/2.0/"
""" from __future__ import annotations -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import TYPE_CHECKING -from lxml import etree +import httpx +from xsdata.formats.dataclass.context import XmlContext +from xsdata.formats.dataclass.parsers import XmlParser + +from oaipmh_scythe import exceptions +from oaipmh_scythe.models.oai_pmh import OaiPmh +from oaipmh_scythe.utils import load_models if TYPE_CHECKING: - from httpx import Response + from oaipmh_scythe.models.oai_pmh import OaiPmherror -XMLParser = etree.XMLParser(remove_blank_text=True, recover=True, resolve_entities=False) +CONTEXT = XmlContext() +PARSER = XmlParser(context=CONTEXT) -@dataclass -class OAIResponse: - """Represents a response received from an OAI server, encapsulating the raw HTTP response and parsed XML content. +def _build_response(http_response: httpx.Response, metadata_prefix: str | None) -> Response: + """Build a response object from an HTTP response. - This class provides a structured way to access various aspects of an OAI server's response. - It offers methods to retrieve the raw text of the response, parse it as XML, - and obtain a string representation of the response that includes the OAI verb. + This function is used to construct a response object from an HTTP response. It checks if the server returned + an error status code and raises an exception if so. Otherwise, it parses the response content using + `_parse_response` and returns a Response object with the parsed data. - Attributes: - http_response: The original HTTP response object from the OAI server. - params: A dictionary of the OAI parameters used in the request that led to this response. + Args: + http_response: The HTTP response to build a response from. + metadata_prefix: The metadata format used in the request. + + Returns: + A built response object. + + Raises: + httpx.HTTPError: If the server returned an error status code >= 500. 
+ """ + if http_response.is_server_error: + http_response.raise_for_status() + parsed = _parse_response(http_response.content, metadata_prefix) + return Response( + url=http_response.url, + status_code=httpx.codes(http_response.status_code), + content=http_response.content, + headers=http_response.headers, + parsed=parsed, + ) + - http_response: Response - params: dict[str, str] +def _parse_response(content: bytes, metadata_prefix: str | None) -> OaiPmh: + """Parse an HTTP response content into an OAI-PMH object. + + This function uses the xsdata XmlParser to convert the HTTP response content into an OAI-PMH object. It first loads + any necessary models, then parses the content using the parser. If there are errors in the XML response, + it raises the appropriate exception. + + Args: + content: The HTTP response content to parse. + metadata_prefix: The metadata format used in the request. + + Returns: + The parsed OAI-PMH object. + + Raises: + exceptions.OAIPMHException: If there is an error sent from the server in the response content. + """ + load_models(metadata_prefix) + parsed = PARSER.from_bytes(content, OaiPmh) + raise_for_error(parsed.error) + return parsed - @property - def raw(self) -> str: - """Return the raw text of the server's response as a unicode string.""" - return self.http_response.text - @property - def xml(self) -> etree._Element: - """Parse the server's response content and return it as an `etree._Element` object.""" - return etree.XML(self.http_response.content, parser=XMLParser) +def raise_for_error(errors: list[OaiPmherror] | None) -> None: + """Raise the exception matching the first error in the given list. + + Args: + errors: A list of OAI-PMH errors to raise exceptions for. If None, no exceptions are raised. + + Returns: + None. + + Raises: + exceptions.OAIPMHException: If the error list is not empty, the appropriate + exception is raised. Specific exceptions are raised for each known error code. 
+ """ + if errors is None: + return + for error in errors: + if error.code: + match error.code: + case error.code.BAD_ARGUMENT: + raise exceptions.BadArgument(error.value) + case error.code.BAD_RESUMPTION_TOKEN: + raise exceptions.BadResumptionToken(error.value) + case error.code.BAD_VERB: + raise exceptions.BadVerb(error.value) + case error.code.CANNOT_DISSEMINATE_FORMAT: + raise exceptions.CannotDisseminateFormat(error.value) + case error.code.ID_DOES_NOT_EXIST: + raise exceptions.IdDoesNotExist(error.value) + case error.code.NO_METADATA_FORMATS: + raise exceptions.NoMetadataFormat(error.value) + case error.code.NO_RECORDS_MATCH: + raise exceptions.NoRecordsMatch(error.value) + case error.code.NO_SET_HIERARCHY: + raise exceptions.NoSetHierarchy(error.value) + raise exceptions.UndefinedError(error) + + +@dataclass(slots=True) +class Response: + """A response received from an OAI server, encapsulating the raw HTTP response and parsed content. + + Attributes: + url: The URL of the HTTP response. + status_code: The HTTP status code of the response. + headers: A dictionary-like object containing metadata about the response, such as content type and length. + content: The raw bytes of the response content. + parsed: The parsed OAI-PMH object representing the OAI-PMH metadata in the response. + """ - def __str__(self) -> str: - verb = self.params.get("verb") - return f"" + url: httpx.URL + status_code: httpx.codes + headers: httpx.Headers = field(repr=False) + content: bytes = field(repr=False) + parsed: OaiPmh = field(repr=False) diff --git a/src/oaipmh_scythe/utils.py b/src/oaipmh_scythe/utils.py index e07a157..bc4f345 100644 --- a/src/oaipmh_scythe/utils.py +++ b/src/oaipmh_scythe/utils.py @@ -13,22 +13,18 @@ log_response: Log the details of an HTTP response. remove_none_values: Remove keys from the dictionary where the value is `None`. filter_dict_except_resumption_token: Filter keys from the dictionary, if resumption token is not `None`. 
- get_namespace: Extracts the namespace from an XML element. - xml_to_dict: Converts an XML tree or element into a dictionary representation. """ from __future__ import annotations import logging -import re -from collections import defaultdict from typing import TYPE_CHECKING if TYPE_CHECKING: from typing import Any import httpx - from lxml import etree + logger = logging.getLogger(__name__) @@ -70,10 +66,10 @@ def filter_dict_except_resumption_token(d: dict[str, Any | None]) -> dict[str, A with None values. Args: - d (dict[str, Any | None]): The dictionary to filter. + d: The dictionary to filter. Returns: - dict[str, Any]: A filtered dictionary based on the defined criteria. + A filtered dictionary based on the defined criteria. """ allowed_keys = ("verb", "resumptionToken") resumption_token_present = d["resumptionToken"] is not None @@ -86,54 +82,24 @@ def filter_dict_except_resumption_token(d: dict[str, Any | None]) -> dict[str, A return d -def get_namespace(element: etree._Element) -> str | None: - """Return the namespace URI of an XML element. - - Extracts and returns the namespace URI from the tag of the given XML element. - The namespace URI is enclosed in curly braces at the start of the tag. - If the element does not have a namespace, `None` is returned. - - Args: - element: The XML element from which to extract the namespace. - - Returns: - The namespace URI as a string if the element has a namespace, otherwise `None`. - """ - match = re.search(r"(\{.*\})", element.tag) - return match.group(1) if match else None - +def load_models(metadata_prefix: str | None = None) -> None: + """Load models based on the provided metadata prefix. -def xml_to_dict( - tree: etree._Element, paths: list[str] | None = None, nsmap: dict[str, str] | None = None, strip_ns: bool = False -) -> dict[str, list[str | None]]: - """Convert an XML tree to a dictionary, with options for custom XPath and namespace handling. 
- - This function takes an XML element tree and converts it into a dictionary. The keys of the - dictionary are the tags of the XML elements, and the values are lists of the text contents - of these elements. It offers options to apply specific XPath expressions, handle namespaces, - and optionally strip namespaces from the tags in the resulting dictionary. + After loading these models, they are available to the xsdata XmlParser for parsing XML responses into the + appropriate dataclasses. Args: - tree: The root element of the XML tree to be converted. - paths: An optional list of XPath expressions to apply on the XML tree. If None or not - provided, the function will consider all elements in the tree. - nsmap: An optional dictionary for namespace mapping, used to provide shorter, more - readable paths in XPath expressions. If None or not provided, no namespace - mapping is applied. - strip_ns: A boolean flag indicating whether to remove namespaces from the element tags - in the resulting dictionary. Defaults to False. + metadata_prefix: The metadata format of the response to be parsed. Possible values are 'oai_dc', 'datacite' and 'marcxml'. Returns: - A dictionary where each key is an element tag (with or without namespace, based on - `strip_ns`) and each value is a list of strings representing the text content of - each element with that tag. 
+ None """ - paths = paths or [".//"] - nsmap = nsmap or {} - fields = defaultdict(list) - for path in paths: - elements = tree.findall(path, nsmap) - for element in elements: - tag = re.sub(r"\{.*\}", "", element.tag) if strip_ns else element.tag - fields[tag].append(element.text) - return dict(fields) + match metadata_prefix: + case "oai_dc": + from oaipmh_scythe.models.oai_dc import Dc # noqa: F401 + case "datacite": + from oaipmh_scythe.models.datacite import Resource # noqa: F401 + case "marcxml": + from oaipmh_scythe.models.marcxml import Record # noqa: F401 + case _: + pass diff --git a/tests/cassettes/get_record.yaml b/tests/cassettes/get_record.yaml index 25f1d29..15cfa51 100644 --- a/tests/cassettes/get_record.yaml +++ b/tests/cassettes/get_record.yaml @@ -340,4 +340,115 @@ interactions: - 1; mode=block http_version: HTTP/1.1 status_code: 422 +- request: + body: '' + headers: + accept: + - text/xml; charset=utf-8 + accept-encoding: + - gzip, deflate + connection: + - keep-alive + host: + - zenodo.org + user-agent: + - oaipmh-scythe/0.12.0 + method: GET + uri: https://zenodo.org/oai2d?verb=GetRecord&identifier=oai%3Azenodo.org%3A10357859&metadataPrefix=marcxml + response: + body: + string: !!binary | + H4sIAAAAAAAAA7SWUU/jOBDH3/dTWOGBXQk3iZO0SdR2VQG77B29IuB0q3szybS1LrG7tguF77QP + +3JP98YXu0lKaTmolqBrpUqxO+P5zX886XQ/LsqCXIM2Qsnevt/y9gnITOVCTnr7v19+ovH+x/67 + bmVGjb0twEwBLLG3M+g5FhbWXZjCIVMN457jGsutyFzFBWtV+5XraPCFng1PCJ4gTc+ZWjtLXffm + 5qalZiC5zqbiGkxL6YmLpi5rea6zNE4XRjxxuAlqM+Z5vvt1eHqRTaHkVEgMKzNALyNSU2+eqgxJ + MKVXxSOvMXrIA/PKnf47QroazExJA0fcQp95LKReSFlw6QdpmKQs+rPrPjFZ+nybg7GV4lc95zPY + c8iUzh0icpBWjAXonoPypXcgVa4qgNT3gqgTR4lDSrA855afodpi0XNKBEWlnH7FbzCBtVddg7wi + qAPWwR/DVasaZr3A5RR4Dnq1xI01U38LUtfdsFk7IiPG5OWskiWgPqO+f+l3UtZOwzbKsv597WPA + Xswg6xs1tjdcQ9dd7az43KeA3ZUaG4csM3rhqhUqa03UtTscnB8y3zWFKJdFLJZnetVHluXeHmP1 + 83y+F0ae13WLx6DdTEmrVYHJFjmxfNJz8CI6/bUYmwa1R4W3ac5CrLTM/Z5D6gdWPTxUw8yvlqbY + 
fdhbzOnnSqAKD9svW/EqfCtisb+q/Zpm07PWnG8FS7xkBXa4AhttAVPO9tvQIGTH816tBWY5FNmU + QyHMARkVfPwzXTyn/17pTOQfqmJS/DKKKjHa8YKvu+T8gxcFz+CAHPFrkTfH9GmcRDEN256/S0x8 + der7v/FiH5AhXzTHxKaOGKNhEu5UzU/3/1gLusI8ASHv6KCABZfI/RbkMEg61GOdnSKfcnnHD8hn + wTNVquaYjDIWJDRIomiXmEOu7f13CXdkOL//oe5evLD/b8gzUSgtqkBavEEYrF+YJDSOOsEuKU+4 + 1rcH5Fj+Bc0ZsXhBu03jwG83YmRh1ITxHAxUMwpKaTk2sOQTKPF/mIz0hEthQJP350fD0YdGEH4z + oX4tOLblLwrnLfkWpaI4xpKyhtWMo/YzSLYdcq6LdDUcTYSdzq9a2Jauzku1kqpeuFYD4JiHA3Aj + nChspJmQY5XCnGqYKZxvSo6zU2bcauYcZBkYs8vggxnHWpFTkQGOpIT9N9XnTvPlZPkwQ/Havx4u + i+UZxj39cnj828UxfXbYT9Db0SO6v0LvPKD/CwAA//+sWV1v0zAUfedXWDwgJlAnJvG0p6orYwi2 + qet4rdzGbT0cO8T2SkH8d851mrRd3cwRe1rmJPfz3HvPTSOmV6oPtRw+DCzYIvvVyZiPZ53i+AbE + 9Vy58+VZuFq48/qEb/agJMThtADPpd2id9br9QL4XtciqXxZOKqFn/L6Hnt7IWbsw4f3jOj1SfPA + 1qBX9ZlXB0dKNjrEnM+cKVlRmgVWFcumvGQYr8xgP3mUYpUk5pP8xRyAtSfFGQRMW6wHuOZTG8Ti + JJO0lllcliDrTq0b67cCIzrulmZVK3nAe1tlUu+e28b0oHD3jSV2FGqaXTSulqCdrEB7hQOlYHOv + 1JpRxDA+Jad/uLYrMJSsNuRp6GJaTiPxbLemSZUpQvySEtNXCj7U54Tf+g4t7vCHN8p3bzIOZ7Mq + UVzBr8I7Nj/U3W5xP8uiqik2E9If1w3cbNTQ5VQwb7exldptkks7PxYzZuZH/YvK/z/bl0IViWZb + wEczDTvoIAAj3D9wpc2i7jihWlyUPM9RgtJarP6kkHtncvoyExC7EBqV4BBW8odRrI5a1Yqve0yR + IEzOCB+5V7wKAZUeryL40xt3/m57yaxc6DRfRiJHMcVBJMqpsWKieS4mhfIlV/G8hNb9siFuSpGi + Sk1cCSfYSmYL0WCuvSyPoGsOCrIrM+5R1ZJmHFlV1jALSwR601Q6dLd1Va2JJUpgeSQy7m0NlhUm + Fosa8SKxu8rRjZHTBniJjSwjPM08nM3ZHwxS8ZcB4T8yqjGM8DBt0M6QBx4qj9Ezp6qaD4A0EgZg + ptnYzx6gKJQDffCUekFdJhOOS2VBK6i21mzpckWqW0PdPUDkKcaaQ0lRgfLZzHjdIUr16S5kbkc3 + X4aD8WQwGvbHw8loeDceXQ3Gw4vGpP3ur5PEXI5u7m/v4iKQjtqJZvyG/FADpFyJEtHU6FTGF4lg + PeZdfzC4ub8et9pzzKW7m8FV/2uKBBhNU1Bj9Hhar/Y7KjBmEpzpDoe6Xkhvvl3zAqbRhZrqbMXF + lZbEVeRv8Bep8OamzGceFAzSrGwv8+1R4ESmJE4HAoTeC2EVM6Az4BU2rilW4W/ovR2yu2sPCSGn + hQoeJ0oJTLQEI6sGEZv6RUMQnwvhyyTsthSPJB5Md44lCXx0hfU4YMYyAU5V9RWEDp1XUce1EnPT + cS3QhVPZMEXraRqIZXD7w55uOjlhniZ5dVU9Te2kpJ92Amb32HEKbX2+GCuO00LvjlXi5sUt7ZyE + QRYXAlc3wXtCEQVmFv20kOZEzI7vJLcXpMd1a1QAVyu+tmhxzpfEdhi1OoyJt+SdNi7q4bXRcZEn + 
adZ+oz5wKd1nPw1RxOVXPqUIWEEoQM3QVwUUjyllagioYDa81AoHAeAgdtmJo+4Li3lOX8S9W8bj + WaF1T8E/AAAA//+0WktTgzAQvvsrOng1pLzB6XDWkzcP3gQSmxkLGR6j9Ne7G6DQFhtwxmsbss9s + dr8vvwrArONlkdeE4WwHN0wFsUfPQ9jbr1J87OtlOw11FQkbkTeqAQJjoXPW9QlzJWGcu59wSi02 + jYSdsjHcM4O5PKmSQzMHjXRbNGrE7L5Fo8DUA4R6o6CAnqKFdMeZDjGDLVjN8MsSByGIVzX2ypdA + BAIOJq6DZVmRVqYoKMupOiw17dWlIs/Yt4mdzX3/E6kL0ksmNtmSEZ/oF6hBrGzS88FwRCoSxrE/ + RePA4yljmXlaJa+dUp4SYy6RpJBKHiTUhpBBA7TtTrb1HrzTFXhTtkMIxrjNToETgesQo3VQ6TNX + HoCQ9IV6L6Dw90SnGtCgu0jgvlDp9zAENsWLEK4MmMdV2M1VOtr+ch0TI35TfJ4OXkuNCaW7Sp0o + XK7ODaQUEri8LXeOpfU6tS3bsqzA9l0fgUQdWzuBml2dxhUIsJzIcrQA5dGID5n3yF3QwuVRGCQ2 + 9z3fcTmztl4a8izIHK7lHppZ0r/jvys6MLIUGj5WXYOPRCGL5lHIVTH0XHtxDD+NGGHtVfsHgbN4 + //fRAX8C9q93FHBQ4Vg2UnZNJ5xU3Se5Yhj+08YLdt+LfC+0dGqBJaJ67er2i5Yrz2deGpwZsaMX + L0Xo+dOL6f87OnllshsezcR3PwAAAP//AwAtfif8aSQAAA== + headers: + content-encoding: + - gzip + content-security-policy: + - 'default-src ''self'' fonts.googleapis.com *.gstatic.com data: ''unsafe-inline'' + ''unsafe-eval'' blob: zenodo-broker.web.cern.ch zenodo-broker-qa.web.cern.ch + maxcdn.bootstrapcdn.com cdnjs.cloudflare.com ajax.googleapis.com webanalytics.web.cern.ch' + content-type: + - text/xml; charset=utf-8 + date: + - Tue, 23 Apr 2024 13:49:25 GMT + permissions-policy: + - interest-cohort=() + referrer-policy: + - strict-origin-when-cross-origin + retry-after: + - '60' + server: + - nginx + set-cookie: + - session=c1ce0fc293d24669_6627bc65.ce_O8JYP-DnZGU9efpHKZh5kklE; Expires=Fri, + 03 May 2024 13:49:25 GMT; Secure; HttpOnly; Path=/; SameSite=Lax + - 5569e5a730cade8ff2b54f1e815f3670=a6ea7e22258a47b28d52600c2b8a56b2; path=/; + HttpOnly; Secure; SameSite=None + strict-transport-security: + - max-age=31556926; includeSubDomains + - max-age=15768000 + transfer-encoding: + - chunked + vary: + - Accept-Encoding + x-content-type-options: + - nosniff + x-frame-options: + - sameorigin + x-ratelimit-limit: + - '120' + x-ratelimit-remaining: + - '119' + x-ratelimit-reset: + - '1713880226' + 
x-request-id: + - b83d3f192935bde6569eeb62def4a421 + x-xss-protection: + - 1; mode=block + status: + code: 200 + message: OK version: 1 diff --git a/tests/conftest.py b/tests/conftest.py index b443ef0..a076d4b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,6 +4,7 @@ from __future__ import annotations +import httpx import pytest from oaipmh_scythe import Scythe @@ -17,3 +18,24 @@ def vcr_config() -> dict[str, str]: @pytest.fixture def scythe() -> Scythe: return Scythe("https://zenodo.org/oai2d") + + +@pytest.fixture() +def identify_response() -> httpx.Response: + identify_response_xml = """ + + 2023-11-09T09:53:46Z + https://zenodo.org/oai2d + + Zenodo + https://zenodo.org/oai2d + 2.0 + + + """ + return httpx.Response( + status_code=httpx.codes.OK, + content=identify_response_xml, + request=httpx.Request(method="GET", url="https://zenodo.org/oai2d?verb=Identify"), + ) diff --git a/tests/integration/test_get_record.py b/tests/integration/test_get_record.py index 1cf5b58..e06d75a 100644 --- a/tests/integration/test_get_record.py +++ b/tests/integration/test_get_record.py @@ -7,8 +7,8 @@ from typing import TYPE_CHECKING import pytest -from httpx import HTTPStatusError +from oaipmh_scythe import BadArgument, IdDoesNotExist from oaipmh_scythe.models import Record if TYPE_CHECKING: @@ -19,40 +19,48 @@ @pytest.mark.default_cassette("get_record.yaml") -@pytest.mark.vcr +@pytest.mark.vcr() def test_get_record_with_default_metadata_prefix(scythe: Scythe) -> None: record = scythe.get_record(identifier=IDENTIFIER, metadata_prefix="oai_dc") assert isinstance(record, Record) - assert record.metadata["title"][0] == TITLE + assert record.metadata.other_element.title[0].value == TITLE @pytest.mark.default_cassette("get_record.yaml") -@pytest.mark.vcr +@pytest.mark.vcr() def test_get_record_without_metadata_prefix(scythe: Scythe) -> None: record = scythe.get_record(identifier=IDENTIFIER) assert isinstance(record, Record) - assert record.metadata["title"][0] == TITLE + 
assert record.metadata.other_element.title[0].value == TITLE @pytest.mark.default_cassette("get_record.yaml") @pytest.mark.vcr -def test_get_record_with_valid_metadata_prefix(scythe: Scythe) -> None: +def test_get_record_with_datacite_metadata_prefix(scythe: Scythe) -> None: record = scythe.get_record(identifier=IDENTIFIER, metadata_prefix="datacite") assert isinstance(record, Record) - assert record.metadata["title"][0] == TITLE + assert record.metadata.other_element.titles.title[0].value == TITLE + + +@pytest.mark.default_cassette("get_record.yaml") +@pytest.mark.vcr +def test_get_record_with_marcxml_metadata_prefix(scythe: Scythe) -> None: + record = scythe.get_record(identifier=IDENTIFIER, metadata_prefix="marcxml") + assert isinstance(record, Record) + controlfield = record.metadata.other_element.controlfield[0] + assert controlfield.value == "10357859" + assert controlfield.tag == "001" @pytest.mark.default_cassette("get_record.yaml") @pytest.mark.vcr def test_get_record_with_invalid_metadata_prefix(scythe: Scythe) -> None: - with pytest.raises(HTTPStatusError): - # cannotDisseminateFormat + with pytest.raises(BadArgument, match="metadataPrefix does not exist"): scythe.get_record(identifier=IDENTIFIER, metadata_prefix="XXX") @pytest.mark.default_cassette("id_does_not_exist.yaml") @pytest.mark.vcr def test_get_record_with_invalid_identifier(scythe: Scythe) -> None: - # idDoesNotExist - with pytest.raises(HTTPStatusError): + with pytest.raises(IdDoesNotExist, match="No matching identifier"): scythe.get_record(identifier="oai:zenodo.org:XXX", metadata_prefix="oai_dc") diff --git a/tests/integration/test_identify.py b/tests/integration/test_identify.py index 5472be0..a0edf14 100644 --- a/tests/integration/test_identify.py +++ b/tests/integration/test_identify.py @@ -7,7 +7,8 @@ import httpx import pytest -from oaipmh_scythe import Scythe +from oaipmh_scythe import Response, Scythe +from oaipmh_scythe.iterator import ResponseIterator from oaipmh_scythe.models 
import Identify @@ -30,14 +31,24 @@ def test_context_manager() -> None: def test_identify(scythe: Scythe) -> None: identify = scythe.identify() assert isinstance(identify, Identify) - assert identify.repositoryName == "Zenodo" + assert identify.repository_name == "Zenodo" + + +@pytest.mark.default_cassette("identify.yaml") +@pytest.mark.vcr() +def test_identify_response(scythe: Scythe) -> None: + scythe.iterator = ResponseIterator + response = scythe.identify() + assert isinstance(response, Response) + assert response.status_code == httpx.codes.OK + assert response.url == httpx.URL("https://zenodo.org/oai2d?verb=Identify") @pytest.mark.default_cassette("identify.yaml") @pytest.mark.vcr def test_non_oai_pmh_url() -> None: scythe = Scythe("https://duckduckgo.com/") - with pytest.raises(ValueError, match="Identify element not found in the XML"): + with pytest.raises(ValueError, match="Unknown property {http://www.openarchives.org/OAI/2.0/}OAI-PMH:head"): scythe.identify() scythe.close() diff --git a/tests/integration/test_list_identifiers.py b/tests/integration/test_list_identifiers.py index 0f07c2f..8fb8bad 100644 --- a/tests/integration/test_list_identifiers.py +++ b/tests/integration/test_list_identifiers.py @@ -8,10 +8,9 @@ import httpx import pytest -from lxml import etree -from oaipmh_scythe import OAIResponse, Scythe -from oaipmh_scythe.iterator import OAIResponseIterator +from oaipmh_scythe import BadArgument, BadResumptionToken, NoRecordsMatch, Response, Scythe +from oaipmh_scythe.iterator import ResponseIterator from oaipmh_scythe.models import Header @@ -48,9 +47,8 @@ def test_list_identifiers_with_valid_metadata_prefix(scythe: Scythe) -> None: @pytest.mark.default_cassette("list_identifiers.yaml") @pytest.mark.vcr def test_list_identifiers_with_invalid_metadata_prefix(scythe: Scythe) -> None: - # cannotDisseminateFormat headers = scythe.list_identifiers(metadata_prefix="XXX") - with pytest.raises(httpx.HTTPStatusError): + with pytest.raises(BadArgument, 
match="metadataPrefix does not exist"): next(headers) @@ -91,9 +89,8 @@ def test_list_identifiers_with_valid_set(scythe: Scythe) -> None: @pytest.mark.default_cassette("list_identifiers.yaml") @pytest.mark.vcr def test_list_identifiers_with_invalid_set(scythe: Scythe) -> None: - # noRecordsMatch headers = scythe.list_identifiers(set_="XXX") - with pytest.raises(httpx.HTTPStatusError): + with pytest.raises(NoRecordsMatch): next(headers) @@ -110,18 +107,16 @@ def test_list_identifiers_with_valid_resumption_token(scythe: Scythe) -> None: @pytest.mark.default_cassette("list_identifiers.yaml") @pytest.mark.vcr def test_list_identifiers_with_invalid_resumption_token(scythe: Scythe) -> None: - # badResumptionToken headers = scythe.list_identifiers(resumption_token="XXX") - with pytest.raises(httpx.HTTPStatusError): + with pytest.raises(BadResumptionToken, match="The value of the resumptionToken argument is invalid or expired."): next(headers) @pytest.mark.default_cassette("list_identifiers.yaml") @pytest.mark.vcr def test_list_identifiers_raises_no_records_match(scythe: Scythe) -> None: - # noRecordsMatch headers = scythe.list_identifiers(from_="2025-01-15") - with pytest.raises(httpx.HTTPStatusError): + with pytest.raises(NoRecordsMatch): next(headers) @@ -136,13 +131,12 @@ def test_list_identifiers_ignore_deleted(scythe: Scythe) -> None: @pytest.mark.default_cassette("list_identifiers.yaml") -@pytest.mark.vcr -def test_list_identifiers_oai_response(scythe: Scythe) -> None: - scythe.iterator = OAIResponseIterator +@pytest.mark.vcr() +def test_list_identifiers_response(scythe: Scythe) -> None: + scythe.iterator = ResponseIterator responses = scythe.list_identifiers(metadata_prefix="oai_dc") assert isinstance(responses, Iterator) response = next(responses) - assert isinstance(response, OAIResponse) - assert response.params == {"metadataPrefix": "oai_dc", "verb": "ListIdentifiers"} - assert isinstance(response.xml, etree._Element) - assert response.xml.tag == 
"{http://www.openarchives.org/OAI/2.0/}OAI-PMH" + assert isinstance(response, Response) + assert response.status_code == httpx.codes.OK + assert response.url == httpx.URL("https://zenodo.org/oai2d?verb=ListIdentifiers&metadataPrefix=oai_dc") diff --git a/tests/integration/test_list_metadata_formats.py b/tests/integration/test_list_metadata_formats.py index c6722d2..f2d3846 100644 --- a/tests/integration/test_list_metadata_formats.py +++ b/tests/integration/test_list_metadata_formats.py @@ -8,8 +8,8 @@ from typing import TYPE_CHECKING import pytest -from httpx import HTTPStatusError +from oaipmh_scythe import IdDoesNotExist from oaipmh_scythe.models import MetadataFormat if TYPE_CHECKING: @@ -23,7 +23,7 @@ def test_list_metadata_formats(scythe: Scythe) -> None: assert isinstance(metadata_formats, Iterator) metadata_format = next(metadata_formats) assert isinstance(metadata_format, MetadataFormat) - assert metadata_format.metadataPrefix == "marcxml" + assert metadata_format.metadata_prefix == "marcxml" @pytest.mark.default_cassette("list_metadata_formats.yaml") @@ -33,13 +33,12 @@ def test_list_metadata_formats_with_valid_identifier(scythe: Scythe) -> None: assert isinstance(metadata_formats, Iterator) metadata_format = next(metadata_formats) assert isinstance(metadata_format, MetadataFormat) - assert metadata_format.metadataPrefix == "marcxml" + assert metadata_format.metadata_prefix == "marcxml" @pytest.mark.default_cassette("list_metadata_formats.yaml") @pytest.mark.vcr def test_list_metadata_formats_with_invalid_identifier(scythe: Scythe) -> None: - # idDoesNotExist metadata_formats = scythe.list_metadata_formats(identifier="oai:zenodo.org:XXX") - with pytest.raises(HTTPStatusError): + with pytest.raises(IdDoesNotExist, match="No matching identifier"): next(metadata_formats) diff --git a/tests/integration/test_list_records.py b/tests/integration/test_list_records.py index b99a906..33a31eb 100644 --- a/tests/integration/test_list_records.py +++ 
b/tests/integration/test_list_records.py @@ -9,11 +9,11 @@ import httpx import pytest -from lxml import etree -from oaipmh_scythe.iterator import OAIResponseIterator +from oaipmh_scythe import BadArgument, BadResumptionToken, NoRecordsMatch +from oaipmh_scythe.iterator import ResponseIterator from oaipmh_scythe.models import Record -from oaipmh_scythe.response import OAIResponse +from oaipmh_scythe.response import Response if TYPE_CHECKING: from oaipmh_scythe import Scythe @@ -29,7 +29,7 @@ def test_list_records_with_default_metadata_prefix(scythe: Scythe) -> None: assert isinstance(records, Iterator) record = next(records) assert isinstance(record, Record) - assert record.metadata["title"][0] == TITLE_1 + assert record.metadata.other_element.title[0].value == TITLE_1 @pytest.mark.default_cassette("list_records.yaml") @@ -39,7 +39,7 @@ def test_list_records_without_metadata_prefix(scythe: Scythe) -> None: assert isinstance(records, Iterator) record = next(records) assert isinstance(record, Record) - assert record.metadata["title"][0] == TITLE_1 + assert record.metadata.other_element.title[0].value == TITLE_1 @pytest.mark.default_cassette("list_records.yaml") @@ -49,15 +49,14 @@ def test_list_records_with_valid_metadata_prefix(scythe: Scythe) -> None: assert isinstance(records, Iterator) record = next(records) assert isinstance(record, Record) - assert record.metadata["title"][0] == TITLE_1 + assert record.metadata.other_element.titles.title[0].value == TITLE_1 @pytest.mark.default_cassette("list_records.yaml") @pytest.mark.vcr def test_list_records_with_invalid_metadata_prefix(scythe: Scythe) -> None: - # cannotDisseminateFormat records = scythe.list_records(metadata_prefix="XXX") - with pytest.raises(httpx.HTTPStatusError): + with pytest.raises(BadArgument, match="metadataPrefix does not exist"): next(records) @@ -67,7 +66,7 @@ def test_list_records_with_from(scythe: Scythe) -> None: records = scythe.list_records(from_="2024-01-16") assert isinstance(records, 
Iterator) record = next(records) - assert record.metadata["title"][0] == TITLE_2 + assert record.metadata.other_element.title[0].value == TITLE_2 @pytest.mark.default_cassette("list_records.yaml") @@ -76,7 +75,7 @@ def test_list_records_with_until(scythe: Scythe) -> None: records = scythe.list_records(until="2024-01-17") assert isinstance(records, Iterator) record = next(records) - assert record.metadata["title"][0] == TITLE_1 + assert record.metadata.other_element.title[0].value == TITLE_1 @pytest.mark.default_cassette("list_records.yaml") @@ -84,7 +83,7 @@ def test_list_records_with_until(scythe: Scythe) -> None: def test_list_records_with_from_and_until(scythe: Scythe) -> None: records = scythe.list_records(from_="2024-01-16", until="2024-01-17") record = next(records) - assert record.metadata["title"][0] == TITLE_2 + assert record.metadata.other_element.title[0].value == TITLE_2 @pytest.mark.default_cassette("list_records.yaml") @@ -92,15 +91,14 @@ def test_list_records_with_from_and_until(scythe: Scythe) -> None: def test_list_records_with_valid_set(scythe: Scythe) -> None: records = scythe.list_records(set_="software") record = next(records) - assert record.metadata["title"][0] == "plasmo-dev/PlasmoExamples: Initial Release" + assert record.metadata.other_element.title[0].value == "plasmo-dev/PlasmoExamples: Initial Release" @pytest.mark.default_cassette("list_records.yaml") @pytest.mark.vcr def test_list_records_with_invalid_set(scythe: Scythe) -> None: - # noRecordsMatch records = scythe.list_records(set_="XXX") - with pytest.raises(httpx.HTTPStatusError): + with pytest.raises(NoRecordsMatch): next(records) @@ -111,24 +109,22 @@ def test_list_records_with_valid_resumption_token(scythe: Scythe) -> None: records = scythe.list_records(resumption_token=token) assert isinstance(records, Iterator) record = next(records) - assert record + assert isinstance(record, Record) @pytest.mark.default_cassette("list_records.yaml") @pytest.mark.vcr def 
test_list_records_with_invalid_resumption_token(scythe: Scythe) -> None: - # badResumptionToken records = scythe.list_records(resumption_token="XXX") - with pytest.raises(httpx.HTTPStatusError): + with pytest.raises(BadResumptionToken, match="The value of the resumptionToken argument is invalid or expired."): next(records) @pytest.mark.default_cassette("list_records.yaml") @pytest.mark.vcr def test_list_records_raises_no_records_match(scythe: Scythe) -> None: - # noRecordsMatch records = scythe.list_records(from_="2025-01-15") - with pytest.raises(httpx.HTTPStatusError): + with pytest.raises(NoRecordsMatch): next(records) @@ -143,16 +139,15 @@ def test_list_records_ignore_deleted(scythe: Scythe) -> None: @pytest.mark.default_cassette("list_records.yaml") -@pytest.mark.vcr -def test_list_records_oai_response(scythe: Scythe) -> None: - scythe.iterator = OAIResponseIterator - responses = scythe.list_records() - assert isinstance(responses, Iterator) - responses = list(responses) +@pytest.mark.vcr() +def test_list_records_response(scythe: Scythe) -> None: + scythe.iterator = ResponseIterator + _responses = scythe.list_records() + assert isinstance(_responses, Iterator) + responses = list(_responses) # there are 3 canned responses in list_records.yaml assert len(responses) == 3 response = responses[0] - assert isinstance(response, OAIResponse) - assert response.params == {"metadataPrefix": "oai_dc", "verb": "ListRecords"} - assert isinstance(response.xml, etree._Element) - assert response.xml.tag == "{http://www.openarchives.org/OAI/2.0/}OAI-PMH" + assert isinstance(response, Response) + assert response.status_code == httpx.codes.OK + assert response.url == httpx.URL("https://zenodo.org/oai2d?verb=ListRecords&metadataPrefix=oai_dc") diff --git a/tests/integration/test_list_sets.py b/tests/integration/test_list_sets.py index 7384611..5ab99dd 100644 --- a/tests/integration/test_list_sets.py +++ b/tests/integration/test_list_sets.py @@ -8,8 +8,8 @@ from typing import 
TYPE_CHECKING import pytest -from httpx import HTTPStatusError +from oaipmh_scythe import BadResumptionToken from oaipmh_scythe.models import Set if TYPE_CHECKING: @@ -26,7 +26,7 @@ def test_list_sets(scythe: Scythe) -> None: assert len(sets) == 10 s = sets[0] assert isinstance(s, Set) - assert s.setName == "European Middleware Initiative" + assert s.set_name == "European Middleware Initiative" @pytest.mark.default_cassette("list_sets.yaml") @@ -42,7 +42,6 @@ def test_list_sets_with_valid_resumption_token(scythe: Scythe) -> None: @pytest.mark.default_cassette("list_sets.yaml") @pytest.mark.vcr def test_list_sets_with_invalid_resumption_token(scythe: Scythe) -> None: - # badResumptionToken sets = scythe.list_sets(resumption_token="XXX") - with pytest.raises(HTTPStatusError): + with pytest.raises(BadResumptionToken, match="The value of the resumptionToken argument is invalid or expired."): sets = list(sets) diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index c500ce0..645d850 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -5,22 +5,36 @@ from __future__ import annotations +from collections.abc import Iterator from contextlib import suppress from typing import TYPE_CHECKING import httpx import pytest -from oaipmh_scythe import Scythe +from oaipmh_scythe import CannotDisseminateFormat, NoSetHierarchy, Scythe +from oaipmh_scythe.models import Identify, Record if TYPE_CHECKING: + from pytest_mock.plugin import MockerFixture, MockType + from respx.models import Route from respx.router import MockRouter -query = {"verb": "ListIdentifiers", "metadataPrefix": "oai_dc"} +query = {"verb": "Identify"} auth = ("username", "password") +@pytest.fixture() +def mock_sleep(mocker: MockerFixture) -> MockType: + return mocker.patch("time.sleep") + + +@pytest.fixture() +def mock_identify(respx_mock: MockRouter, identify_response: httpx.Response) -> Route: + return 
respx_mock.get("https://zenodo.org/oai2d?verb=Identify").mock(return_value=identify_response) + + def test_invalid_http_method() -> None: with pytest.raises(ValueError, match="Invalid HTTP method"): Scythe("https://localhost", http_method="DELETE") @@ -44,81 +58,97 @@ def test_context_manager() -> None: assert isinstance(scythe, Scythe) -def test_override_encoding(scythe: Scythe, respx_mock: MockRouter) -> None: - mock_route = respx_mock.get("https://zenodo.org/oai2d?verb=ListIdentifiers&metadataPrefix=oai_dc").mock( - return_value=httpx.Response(200) - ) +def test_override_encoding(scythe: Scythe, mock_identify: Route) -> None: custom_encoding = "latin_1" scythe.encoding = custom_encoding - oai_response = scythe.harvest(query) - assert mock_route.called - assert oai_response.http_response.encoding == custom_encoding + http_response = scythe._request(query) + assert mock_identify.called + assert http_response.encoding == custom_encoding -def test_post_method(scythe: Scythe, respx_mock: MockRouter) -> None: - mock_route = respx_mock.post("https://zenodo.org/oai2d").mock(return_value=httpx.Response(200)) +def test_post_method(scythe: Scythe, respx_mock: MockRouter, identify_response: httpx.Response) -> None: + mock_route = respx_mock.post("https://zenodo.org/oai2d").mock(return_value=identify_response) scythe.http_method = "POST" - oai_response = scythe.harvest(query) - assert mock_route.called - assert oai_response.http_response.status_code == 200 + response = scythe.harvest(query) + assert mock_route.call_count == 1 + assert response.status_code == httpx.codes.OK -def test_no_retry(scythe: Scythe, respx_mock: MockRouter) -> None: - mock_route = respx_mock.get("https://zenodo.org/oai2d?verb=ListIdentifiers&metadataPrefix=oai_dc").mock( - return_value=httpx.Response(503) - ) +def test_no_retry(scythe: Scythe, mock_identify: Route) -> None: + mock_identify.return_value = httpx.Response(httpx.codes.SERVICE_UNAVAILABLE) with suppress(httpx.HTTPStatusError): 
scythe.harvest(query) - assert mock_route.call_count == 1 + assert mock_identify.call_count == 1 -def test_retry_on_503(scythe: Scythe, respx_mock: MockRouter, mocker) -> None: +def test_retry_on_503(scythe: Scythe, mock_identify: Route, mock_sleep: MockType) -> None: scythe.max_retries = 3 scythe.default_retry_after = 0 - mock_sleep = mocker.patch("time.sleep") - mock_route = respx_mock.get("https://zenodo.org/oai2d?verb=ListIdentifiers&metadataPrefix=oai_dc").mock( - return_value=httpx.Response(503, headers={"retry-after": "10"}) - ) + mock_identify.return_value = httpx.Response(httpx.codes.SERVICE_UNAVAILABLE, headers={"retry-after": "10"}) with suppress(httpx.HTTPStatusError): scythe.harvest(query) - assert mock_route.call_count == 4 + assert mock_identify.call_count == 4 assert mock_sleep.call_count == 3 mock_sleep.assert_called_with(10) -def test_retry_on_503_without_retry_after_header(scythe: Scythe, respx_mock: MockRouter, mocker) -> None: +def test_retry_on_503_without_retry_after_header(scythe: Scythe, mock_identify: Route, mock_sleep: MockType) -> None: scythe.max_retries = 3 scythe.default_retry_after = 0 - mock_sleep = mocker.patch("time.sleep") - mock_route = respx_mock.get("https://zenodo.org/oai2d?verb=ListIdentifiers&metadataPrefix=oai_dc").mock( - return_value=httpx.Response(503, headers=None) - ) + mock_identify.return_value = httpx.Response(httpx.codes.SERVICE_UNAVAILABLE, headers=None) with suppress(httpx.HTTPStatusError): scythe.harvest(query) - assert mock_route.call_count == 4 + assert mock_identify.call_count == 4 assert mock_sleep.call_count == 3 -def test_retry_on_custom_code(scythe: Scythe, respx_mock: MockRouter, mocker) -> None: - mock_route = respx_mock.get("https://zenodo.org/oai2d?verb=ListIdentifiers&metadataPrefix=oai_dc").mock( - return_value=httpx.Response(500) - ) +def test_retry_on_custom_code(scythe: Scythe, mock_identify: Route, mock_sleep: MockType) -> None: + mock_identify.return_value = 
httpx.Response(httpx.codes.INTERNAL_SERVER_ERROR) scythe.max_retries = 3 scythe.default_retry_after = 0 - mock_sleep = mocker.patch("time.sleep") - scythe.retry_status_codes = (503, 500) + scythe.retry_status_codes = (httpx.codes.SERVICE_UNAVAILABLE, httpx.codes.INTERNAL_SERVER_ERROR) with suppress(httpx.HTTPStatusError): scythe.harvest(query) - assert mock_route.call_count == 4 + assert mock_identify.call_count == 4 assert mock_sleep.call_count == 3 -def test_no_auth_arguments(): +def test_no_set_hierarchy(scythe: Scythe, respx_mock: MockRouter) -> None: + no_set_hierarchy_xml = """ + + 2002-05-01T09:18:29Z + https://zenodo.org/oai2d + This repository does not support sets + + """ + response = httpx.Response(status_code=httpx.codes.UNPROCESSABLE_ENTITY, content=no_set_hierarchy_xml) + respx_mock.get("https://zenodo.org/oai2d?verb=ListSets").mock(return_value=response) + sets = scythe.list_sets() + with pytest.raises(NoSetHierarchy, match="This repository does not support sets"): + next(sets) + + +def test_cannot_disseminate_format(scythe: Scythe, respx_mock: MockRouter) -> None: + cannot_disseminate_format_xml = """ + + 2024-04-20T11:54:13Z + https://zenodo.org/oai2d + XXX + + """ + response = httpx.Response(status_code=httpx.codes.UNPROCESSABLE_ENTITY, content=cannot_disseminate_format_xml) + respx_mock.get("https://zenodo.org/oai2d?verb=ListIdentifiers&metadataPrefix=XXX").mock(return_value=response) + headers = scythe.list_identifiers(metadata_prefix="XXX") + with pytest.raises(CannotDisseminateFormat, match="XXX"): + next(headers) + + +def test_no_auth_arguments() -> None: with Scythe("https://zenodo.org/oai2d") as scythe: assert scythe.client.auth is None -def test_auth_arguments(): +def test_auth_arguments() -> None: with Scythe("https://zenodo.org/oai2d", auth=auth) as scythe: assert scythe.client.auth @@ -130,6 +160,50 @@ def test_auth_arguments_usage(respx_mock: MockRouter) -> None: assert oai_response.http_response.request.headers["authorization"] +def 
test_identify(scythe: Scythe, mock_identify: Route) -> None: + identify = scythe.identify() + assert isinstance(identify, Identify) + + +@pytest.mark.default_cassette("list_records.yaml") +@pytest.mark.vcr() +def test_list_records(scythe: Scythe) -> None: + records = scythe.list_records() + assert isinstance(records, Iterator) + assert next(records) + + +@pytest.mark.default_cassette("list_identifiers.yaml") +@pytest.mark.vcr() +def test_list_identifiers(scythe: Scythe) -> None: + headers = scythe.list_identifiers() + assert isinstance(headers, Iterator) + assert next(headers) + + +@pytest.mark.default_cassette("list_metadata_formats.yaml") +@pytest.mark.vcr() +def test_list_metadata_formats(scythe: Scythe, mocker) -> None: + metadata_formats = scythe.list_metadata_formats() + assert isinstance(metadata_formats, Iterator) + assert next(metadata_formats) + + +@pytest.mark.default_cassette("list_sets.yaml") +@pytest.mark.vcr() +def test_list_sets(scythe: Scythe, mocker) -> None: + sets = scythe.list_sets() + assert isinstance(sets, Iterator) + assert next(sets) + + +@pytest.mark.default_cassette("get_record.yaml") +@pytest.mark.vcr() +def test_get_record(scythe: Scythe) -> None: + record = scythe.get_record(identifier="oai:zenodo.org:10357859") + assert isinstance(record, Record) + + @pytest.mark.parametrize("timeout", [10, 10.0, 0.1]) def test_valid_custom_timeout(timeout): with Scythe("https://zenodo.org/oai2d", timeout=timeout) as scythe: diff --git a/tests/unit/test_iterator.py b/tests/unit/test_iterator.py index 80c5f88..9d3ba73 100644 --- a/tests/unit/test_iterator.py +++ b/tests/unit/test_iterator.py @@ -6,8 +6,8 @@ import pytest -from oaipmh_scythe import OAIResponse, Scythe -from oaipmh_scythe.iterator import OAIItemIterator, OAIResponseIterator +from oaipmh_scythe import Response, Scythe +from oaipmh_scythe.iterator import ItemIterator, ResponseIterator from oaipmh_scythe.models import Header query = {"verb": "ListIdentifiers", "metadataPrefix": "oai_dc"} 
@@ -16,16 +16,16 @@ @pytest.mark.default_cassette("list_identifiers.yaml") @pytest.mark.vcr def test_iterator_str(scythe: Scythe) -> None: - iterator = OAIResponseIterator(scythe, query) - assert str(iterator) == "" + iterator = ResponseIterator(scythe, query) + assert str(iterator) == "" @pytest.mark.default_cassette("list_identifiers.yaml") @pytest.mark.vcr def test_oai_response_iterator(scythe: Scythe) -> None: - iterator = OAIResponseIterator(scythe, query) + iterator = ResponseIterator(scythe, query) responses = list(iterator) - assert isinstance(responses[0], OAIResponse) + assert isinstance(responses[0], Response) # there are 3 canned responses in list_identifiers.yaml assert len(responses) == 3 @@ -33,7 +33,7 @@ def test_oai_response_iterator(scythe: Scythe) -> None: @pytest.mark.default_cassette("list_identifiers.yaml") @pytest.mark.vcr def test_oai_item_iterator(scythe: Scythe) -> None: - iterator = OAIItemIterator(scythe, query) + iterator = ItemIterator(scythe, query) headers = list(iterator) assert isinstance(headers[0], Header) # there are 9 canned identifiers in list_identifiers.yaml @@ -43,7 +43,7 @@ def test_oai_item_iterator(scythe: Scythe) -> None: @pytest.mark.default_cassette("list_identifiers.yaml") @pytest.mark.vcr def test_oai_item_iterator_ignore_deleted(scythe: Scythe) -> None: - iterator = OAIItemIterator(scythe, query, ignore_deleted=True) + iterator = ItemIterator(scythe, query, ignore_deleted=True) headers = list(iterator) assert isinstance(headers[0], Header) # there are 9 canned responses in list_identifiers.yaml diff --git a/tests/unit/test_models.py b/tests/unit/test_models.py index 0661145..a0dab54 100644 --- a/tests/unit/test_models.py +++ b/tests/unit/test_models.py @@ -2,266 +2,135 @@ # # SPDX-License-Identifier: BSD-3-Clause -import pytest -from lxml import etree +from xsdata.formats.dataclass.context import XmlContext +from xsdata.formats.dataclass.parsers import XmlParser +from xsdata.models.datatype import XmlDateTime -from 
oaipmh_scythe import OAIResponse from oaipmh_scythe.models import Header, Identify, MetadataFormat, Record, ResumptionToken, Set +from oaipmh_scythe.models.oai_dc import Dc, Title +from oaipmh_scythe.models.oai_pmh import Metadata, OaiPmherror, OaiPmherrorcode, ProtocolVersion, Status +PARSER = XmlParser(context=XmlContext()) -def test_resumption_token_repr() -> None: - token = ResumptionToken(token="some-token") - assert repr(token) == "" - -@pytest.fixture -def identify_response(mocker): - xml = """ - - 2023-11-09T09:53:46Z - https://zenodo.org/oai2d - - Zenodo - https://zenodo.org/oai2d - 2.0 - - +def test_identify_parsing() -> None: + identify_xml = """ + + Zenodo + https://zenodo.org/oai2d + 2.0 + """ - mock_response = mocker.MagicMock(spec=OAIResponse) - mock_response.xml = etree.fromstring(xml) - return mock_response - - -@pytest.fixture -def identify(identify_response) -> Identify: - return Identify(identify_response) - - -def test_identify_bytes(identify): - assert isinstance(identify.__bytes__(), bytes) - assert b"https://zenodo.org/oai2d" in identify.__bytes__() - - -def test_identify_str(identify): - assert isinstance(identify.__str__(), str) - assert "https://zenodo.org/oai2d" in str(identify) - - -def test_identify_raw(identify): - assert isinstance(identify.raw, str) - assert "https://zenodo.org/oai2d" in identify.raw - - -def test_identify_repr(identify): - assert repr(identify) == "" - - -def test_identify_attributes(identify): - assert identify.repositoryName == "Zenodo" - assert identify.baseURL == "https://zenodo.org/oai2d" - assert identify.protocolVersion == "2.0" - - -def test_identify_iter(identify): - identify_items = dict(identify) - assert identify_items["repositoryName"] == ["Zenodo"] - assert identify_items["baseURL"] == ["https://zenodo.org/oai2d"] - assert identify_items["protocolVersion"] == ["2.0"] + identify = PARSER.from_string(identify_xml, Identify) + assert isinstance(identify, Identify) + expected = Identify( + 
repository_name="Zenodo", base_url="https://zenodo.org/oai2d", protocol_version=ProtocolVersion.VALUE_2_0 + ) + assert identify == expected -@pytest.fixture(scope="session") -def header_element(): - xml = """ +def test_header_parsing(): + header_xml = """
- oai:zenodo.org:6538892 - 2022-05-11T13:49:36Z + oai:zenodo.org:10357859 + 2023-12-11T17:26:46Z
""" - return etree.fromstring(xml.encode()) - - -@pytest.fixture(scope="session") -def deleted_header_element(): - xml = """ -
- oai:zenodo.org:6538892 - 2022-05-11T13:49:36Z -
- """ - return etree.fromstring(xml.encode()) - - -@pytest.fixture -def header(header_element): - return Header(header_element) - - -@pytest.fixture -def deleted_header(deleted_header_element): - return Header(deleted_header_element) - - -def test_header_init(header): - assert header.identifier == "oai:zenodo.org:6538892" - assert header.datestamp == "2022-05-11T13:49:36Z" + header = PARSER.from_string(header_xml, Header) + assert isinstance(header, Header) + expected = Header(identifier="oai:zenodo.org:10357859", datestamp="2023-12-11T17:26:46Z") + assert header == expected assert not header.deleted -def test_header_init_with_deleted(deleted_header): - assert deleted_header.identifier == "oai:zenodo.org:6538892" - assert deleted_header.datestamp == "2022-05-11T13:49:36Z" - assert deleted_header.deleted - - -def test_header_repr(header, deleted_header): - assert repr(header) == "
" - assert repr(deleted_header) == "
" +def test_header_deleted(): + header_xml = '
' + header = PARSER.from_string(header_xml, Header) + assert header.deleted -def test_header_iter(header): - items = dict(header) - assert items == {"identifier": "oai:zenodo.org:6538892", "datestamp": "2022-05-11T13:49:36Z", "setSpecs": []} - - -@pytest.fixture -def record_element(): - xml = """ - -
- oai:example.org:record1 - 2021-01-01 - set1 -
- - - Example Title - Example Creator - - -
+def test_resumption_token_parsing() -> None: + token_xml = """ + eJyNzt1ugjAYgOF7 """ - return etree.fromstring(xml.encode()) + token = PARSER.from_string(token_xml, ResumptionToken) + assert isinstance(token, ResumptionToken) + expiration_date = XmlDateTime(2024, 1, 21, 16, 55, 57) + expected = ResumptionToken( + value="eJyNzt1ugjAYgOF7", cursor=0, expiration_date=expiration_date, complete_list_size=3677115 + ) + assert token == expected -@pytest.fixture -def deleted_record_lement(): - xml = """ +def test_record_parsing(): + record_xml = """ -
- oai:example.org:record1 - 2021-01-01 - set1 -
+
- - Example Title - Example Creator - + + Research Data Management Organiser (RDMO) +
""" - return etree.fromstring(xml.encode()) - - -@pytest.fixture -def record(record_element): - return Record(record_element) - - -@pytest.fixture -def deleted_record(deleted_record_lement): - return Record(deleted_record_lement) - - -def test_record_init(record): - assert isinstance(record.header, Header) - assert record.header.identifier == "oai:example.org:record1" + record = PARSER.from_string(record_xml, Record) + assert isinstance(record, Record) + expected = Record( + header=Header(), + metadata=Metadata(other_element=Dc(title=[Title(value="Research Data Management Organiser (RDMO)")])), + ) + assert record == expected assert not record.deleted - assert "title" in record.metadata - assert record.metadata["title"] == ["Example Title"] - -def test_record_repr(record): - assert repr(record) == "" +def test_record_deleted(): + record = Record(header=Header(status=Status.DELETED)) + assert record.deleted -def test_deleted_record_repr(deleted_record): - assert repr(deleted_record) == "" +def test_record_get_metadata(): + expected = Dc(title=[Title(value="Research Data Management Organiser (RDMO)")]) + record = Record(header=Header(), metadata=Metadata(other_element=expected)) + metadata = record.get_metadata() + assert isinstance(metadata, Dc) + assert metadata == expected -def test_record_iter(record): - record_metadata = dict(record) - assert record_metadata["title"] == ["Example Title"] - assert record_metadata["creator"] == ["Example Creator"] +def test_error_parsing(): + error_xml = 'No matching identifier' + error = PARSER.from_string(error_xml, OaiPmherror) + assert isinstance(error, OaiPmherror) + expected = OaiPmherror(code=OaiPmherrorcode.ID_DOES_NOT_EXIST, value="No matching identifier") + assert error == expected -def test_deleted_record_no_metadata(deleted_record): - assert deleted_record.deleted - with pytest.raises(AttributeError): - _ = record.metadata - -@pytest.fixture -def set_element(): - xml = """ - - user-emi - European Middleware Initiative 
- +def test_set_parsing(): + set_xml = """ + + software + Software """ - return etree.fromstring(xml.encode()) - - -@pytest.fixture -def oai_set(set_element): - return Set(set_element) - - -def test_set_init(oai_set): - assert oai_set.setName == "European Middleware Initiative" - assert "ser-emi" in oai_set.setSpec # spellchecker:disable-line - - -def test_set_repr(oai_set): - assert repr(oai_set) == "" - - -def test_set_iter(oai_set): - set_items = dict(oai_set) - assert set_items["setName"] == ["European Middleware Initiative"] - assert set_items["setSpec"] == ["user-emi"] - - -@pytest.fixture -def mdf_element(): - xml = """ - - marcxml - https://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd - https://www.loc.gov/standards/marcxml/ + set_ = PARSER.from_string(set_xml, Set) + expected = Set(set_spec="software", set_name="Software") + assert isinstance(set_, Set) + assert set_ == expected + + +def test_metadata_format_parsing(): + metadata_format_xml = """ + + oai_dc + http://www.openarchives.org/OAI/2.0/oai_dc.xsd + http://www.openarchives.org/OAI/2.0/oai_dc/ """ - return etree.fromstring(xml.encode()) - - -@pytest.fixture -def mdf(mdf_element): - return MetadataFormat(mdf_element) - - -def test_metadata_format_init(mdf): - assert mdf.metadataPrefix == "marcxml" - assert mdf.schema == "https://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd" - assert mdf.metadataNamespace == "https://www.loc.gov/standards/marcxml/" - - -def test_metadata_format_repr(mdf): - assert repr(mdf) == "" - - -def test_metadata_format_iter(mdf): - mdf_items = dict(mdf) - assert mdf_items["metadataPrefix"] == ["marcxml"] - assert mdf_items["schema"] == ["https://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd"] - assert mdf_items["metadataNamespace"] == ["https://www.loc.gov/standards/marcxml/"] + metadata_format = PARSER.from_string(metadata_format_xml, MetadataFormat) + assert isinstance(metadata_format, MetadataFormat) + expected = MetadataFormat( + 
metadata_prefix="oai_dc", + schema="http://www.openarchives.org/OAI/2.0/oai_dc.xsd", + metadata_namespace="http://www.openarchives.org/OAI/2.0/oai_dc/", + ) + assert metadata_format == expected diff --git a/tests/unit/test_response.py b/tests/unit/test_response.py index beea9a9..53c1b46 100644 --- a/tests/unit/test_response.py +++ b/tests/unit/test_response.py @@ -7,50 +7,31 @@ from typing import TYPE_CHECKING import pytest -from lxml import etree -from oaipmh_scythe.response import OAIResponse +from oaipmh_scythe import IdDoesNotExist +from oaipmh_scythe.models import Identify, OaiPmh +from oaipmh_scythe.models.oai_pmh import OaiPmherror, OaiPmherrorcode +from oaipmh_scythe.response import Response, _build_response, raise_for_error if TYPE_CHECKING: - from pytest_mock import MockerFixture + import httpx -IDENTIFY_XML: str = """ - - 2023-11-09T09:53:46Z - https://zenodo.org/oai2d - - Zenodo - https://zenodo.org/oai2d - 2.0 - - -""" +def test_build_response(identify_response: httpx.Response) -> None: + response = _build_response(identify_response, metadata_prefix="oai_dc") + assert isinstance(response, Response) + assert isinstance(response.parsed, OaiPmh) + assert response.status_code == identify_response.status_code + assert response.content == identify_response.content + assert isinstance(response.parsed.identify, Identify) + assert response.parsed.identify.repository_name == "Zenodo" -@pytest.fixture -def mock_response(mocker: MockerFixture): - response = mocker.Mock() - response.text = IDENTIFY_XML - response.content = response.text.encode() - return response +def test_raise_for_error_no_errors() -> None: + assert raise_for_error(None) is None -def test_oai_response_raw(mock_response) -> None: - params = {"verb": "Identify"} - oai_response = OAIResponse(http_response=mock_response, params=params) - assert oai_response.raw == mock_response.text - - -def test_oai_response_xml(mock_response): - params = {"verb": "Identify"} - oai_response = 
OAIResponse(http_response=mock_response, params=params) - assert isinstance(oai_response.xml, etree._Element) - assert oai_response.xml.tag == "{http://www.openarchives.org/OAI/2.0/}OAI-PMH" - - -def test_oai_response_str(mock_response): - params = {"verb": "Identify"} - oai_response = OAIResponse(http_response=mock_response, params=params) - assert str(oai_response) == "" +def test_raise_for_error() -> None: + error = OaiPmherror(code=OaiPmherrorcode.ID_DOES_NOT_EXIST, value="No matching identifier") + with pytest.raises(IdDoesNotExist): + raise_for_error([error]) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index d6078a8..24b1c8d 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -3,22 +3,14 @@ # # SPDX-License-Identifier: BSD-3-Clause -import pytest -from lxml import etree - -from oaipmh_scythe.utils import filter_dict_except_resumption_token, get_namespace, remove_none_values, xml_to_dict - +from typing import TYPE_CHECKING -@pytest.fixture -def xml_element_with_namespace() -> etree._Element: - xml = 'https://zenodo.org/oai2d' - return etree.fromstring(xml) +import pytest +if TYPE_CHECKING: + from pytest_mock.plugin import MockerFixture -@pytest.fixture -def xml_element_without_namespace() -> etree._Element: - xml = 'https://zenodo.org/oai2d' - return etree.fromstring(xml) +from oaipmh_scythe.utils import filter_dict_except_resumption_token, load_models, remove_none_values def test_remove_none_values() -> None: @@ -46,38 +38,8 @@ def test_filter_dict_except_resumption_token_noop() -> None: assert result == d -def test_get_namespace(xml_element_with_namespace: etree._Element) -> None: - namespace = get_namespace(xml_element_with_namespace) - assert namespace == "{http://www.openarchives.org/OAI/2.0/}" - - -def test_get_namespace_without_namespace(xml_element_without_namespace: etree._Element) -> None: - namespace = get_namespace(xml_element_without_namespace) - assert namespace is None - - -def 
test_xml_to_dict_default(xml_element_with_namespace: etree._Element) -> None: - result = xml_to_dict(xml_element_with_namespace) - expected = {"{http://www.openarchives.org/OAI/2.0/}request": ["https://zenodo.org/oai2d"]} - assert result == expected - - -def test_xml_to_dict_with_paths(xml_element_with_namespace: etree._Element) -> None: - result = xml_to_dict(xml_element_with_namespace, paths=["./{http://www.openarchives.org/OAI/2.0/}request"]) - expected = { - "{http://www.openarchives.org/OAI/2.0/}request": ["https://zenodo.org/oai2d"], - } - assert result == expected - - -def test_xml_to_dict_with_nsmap(xml_element_with_namespace: etree._Element) -> None: - nsmap = {"oai": "http://www.openarchives.org/OAI/2.0/"} - result = xml_to_dict(xml_element_with_namespace, paths=["oai:request"], nsmap=nsmap) - expected = {"{http://www.openarchives.org/OAI/2.0/}request": ["https://zenodo.org/oai2d"]} - assert result == expected - - -def test_xml_to_dict_strip_namespace(xml_element_with_namespace: etree._Element) -> None: - result = xml_to_dict(xml_element_with_namespace, strip_ns=True) - expected = {"request": ["https://zenodo.org/oai2d"]} - assert result == expected +@pytest.mark.parametrize("metadata_prefix", ["oai_dc", "datacite"]) +def test_load_models(mocker: "MockerFixture", metadata_prefix: str) -> None: + mock_import = mocker.patch("builtins.__import__") + load_models(metadata_prefix) + mock_import.assert_called_once() From 0d9ed10faea9c894849135ed4e049c11c86d4e7f Mon Sep 17 00:00:00 2001 From: Heinz-Alexander Fuetterer Date: Fri, 17 May 2024 14:12:08 +0200 Subject: [PATCH 02/16] s --- tests/unit/test_client.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index 645d850..2adab67 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -153,11 +153,10 @@ def test_auth_arguments() -> None: assert scythe.client.auth -def test_auth_arguments_usage(respx_mock: MockRouter) 
-> None: +def test_auth_arguments_usage(respx_mock: MockRouter, mock_identify: Route) -> None: scythe = Scythe("https://zenodo.org/oai2d", auth=auth) - respx_mock.get("https://zenodo.org/oai2d").mock(return_value=httpx.Response(200)) - oai_response = scythe.harvest(query) - assert oai_response.http_response.request.headers["authorization"] + http_response = scythe._request(query) + assert http_response.request.headers["authorization"] def test_identify(scythe: Scythe, mock_identify: Route) -> None: From 199fecc0b56acea83d1aa21da7438d72c8e8d099 Mon Sep 17 00:00:00 2001 From: Heinz-Alexander Fuetterer Date: Fri, 17 May 2024 14:47:54 +0200 Subject: [PATCH 03/16] s --- docs/src/tutorial.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/tutorial.md b/docs/src/tutorial.md index f3b4271..2d9ce0b 100644 --- a/docs/src/tutorial.md +++ b/docs/src/tutorial.md @@ -273,7 +273,7 @@ You could then save the returned responses to disk: ```python with open("response.xml", "w") as f: - f.write(next(responses).raw.encode("utf-8")) + f.write(next(responses).content.decode("utf-8")) ``` ## Ignoring Deleted Records From 3a37c895bb3eb15432b822f40523327d48de24aa Mon Sep 17 00:00:00 2001 From: Heinz-Alexander Fuetterer Date: Fri, 17 May 2024 16:07:26 +0200 Subject: [PATCH 04/16] s --- src/oaipmh_scythe/models/marcxml/__init__.py | 2 +- src/oaipmh_scythe/models/marcxml/models.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/oaipmh_scythe/models/marcxml/__init__.py b/src/oaipmh_scythe/models/marcxml/__init__.py index fe61c81..5e8cd51 100644 --- a/src/oaipmh_scythe/models/marcxml/__init__.py +++ b/src/oaipmh_scythe/models/marcxml/__init__.py @@ -1,4 +1,4 @@ -"""This file was generated by xsdata, v24.4, on 2024-04-23 15:41:50 +"""This file was generated by xsdata, v24.5, on 2024-05-17 16:05:33 Generator: DataclassGenerator See: https://xsdata.readthedocs.io/ diff --git a/src/oaipmh_scythe/models/marcxml/models.py 
b/src/oaipmh_scythe/models/marcxml/models.py index e9c93ac..293f054 100644 --- a/src/oaipmh_scythe/models/marcxml/models.py +++ b/src/oaipmh_scythe/models/marcxml/models.py @@ -1,4 +1,4 @@ -"""This file was generated by xsdata, v24.4, on 2024-04-23 15:41:50 +"""This file was generated by xsdata, v24.5, on 2024-05-17 16:05:33 Generator: DataclassGenerator See: https://xsdata.readthedocs.io/ @@ -196,7 +196,9 @@ class Meta: @dataclass(slots=True) class Record(RecordType): - """Record is a top level container element for all of the field elements which compose the record.""" + """Record is a top level container element for all of the field elements which + compose the record. + """ class Meta: name = "record" From c3d6615ded424f273ccf2a1fa5d4109fd8e46b80 Mon Sep 17 00:00:00 2001 From: Heinz-Alexander Fuetterer Date: Fri, 17 May 2024 16:08:14 +0200 Subject: [PATCH 05/16] s --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 495c2be..c3c596f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,7 @@ dynamic = [ ] dependencies = [ "httpx>=0.27", - "xsdata[cli,lxml]", # TODO: remove cli extra + "xsdata[cli,lxml]>=24.5", # TODO: remove cli extra ] optional-dependencies.dev = [ "pre-commit~=3.8", From 6a91fff1f6ad1abfcba0720ecc95b61dcbe26e90 Mon Sep 17 00:00:00 2001 From: Heinz-Alexander Fuetterer Date: Fri, 16 Aug 2024 09:58:40 +0200 Subject: [PATCH 06/16] stash --- README.md | 4 +- docs/mkdocs.yml | 5 ++ docs/src/css/code_select.css | 8 +++ docs/src/customizing.md | 50 +++++++++++------ .../models/oai_datacite/.xsdata.xml | 32 +++++++++++ .../models/oai_datacite/README.md | 6 ++ .../models/oai_datacite/__init__.py | 14 +++++ .../models/oai_datacite/models.py | 55 +++++++++++++++++++ src/oaipmh_scythe/utils.py | 3 + 9 files changed, 159 insertions(+), 18 deletions(-) create mode 100644 docs/src/css/code_select.css create mode 100644 src/oaipmh_scythe/models/oai_datacite/.xsdata.xml create mode 
100644 src/oaipmh_scythe/models/oai_datacite/README.md create mode 100644 src/oaipmh_scythe/models/oai_datacite/__init__.py create mode 100644 src/oaipmh_scythe/models/oai_datacite/models.py diff --git a/README.md b/README.md index 6266bd5..51f69dd 100644 --- a/README.md +++ b/README.md @@ -42,8 +42,8 @@ with Scythe("https://zenodo.org/oai2d") as scythe: You can install `oaipmh-scythe` via pip from [PyPI][pypi-url]: -```console -python -m pip install oaipmh-scythe +```shell-session +$ python -m pip install oaipmh-scythe ``` ## Documentation diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index c56df05..9db9420 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -38,6 +38,7 @@ theme: - content.action.view - content.code.annotate - content.code.copy + - content.code.select - navigation.footer palette: - media: '(prefers-color-scheme: light)' @@ -69,6 +70,7 @@ plugins: markdown_extensions: - admonition - pymdownx.highlight: + use_pygments: true anchor_linenums: true line_spans: __span pygments_lang_class: true @@ -80,3 +82,6 @@ extra: version: provider: mike alias: true + +extra_css: +- css/code_select.css diff --git a/docs/src/css/code_select.css b/docs/src/css/code_select.css new file mode 100644 index 0000000..4d83760 --- /dev/null +++ b/docs/src/css/code_select.css @@ -0,0 +1,8 @@ +.language-pycon .gp, .language-pycon .go { + user-select: none; +} + + +/* .highlight .gp { + user-select: none; +} */ diff --git a/docs/src/customizing.md b/docs/src/customizing.md index eeb8bab..aca8354 100644 --- a/docs/src/customizing.md +++ b/docs/src/customizing.md @@ -5,29 +5,47 @@ SPDX-FileCopyrightText: 2023 Heinz-Alexander Fütterer SPDX-License-Identifier: BSD-3-Clause --> -# Harvesting other Metadata Formats than OAI-DC +# Harvesting other Metadata Formats -# TODO +By default, `oaipmh-scythe`'s mapping of the record XML into Python dataclasses is tailored to work best with +Dublin-Core-encoded metadata payloads (i.e. `metadata_prefix="oai_dc"`). 
-By default, `oaipmh-scythe`'s mapping of the record XML into Python dataclasses is tailored to work only with -Dublin-Core-encoded metadata payloads. Other formats most probably won't be mapped correctly, especially if they are -more hierarchically structured than Dublin Core. +```pycon +>>> from oaipmh_scythe import Scythe +>>> scythe = Scythe("https://export.arxiv.org/oai2") +>>> record = scythe.get_record("oai:arXiv.org:2203.05794", metadata_prefix="oai_dc") +>>> record.get_metadata() +Dc(title=[Title(value='BERTopic: Neural topic modeling with a class-based TF-IDF procedure', lang=None)], creator=[Creator(value='Grootendorst, Maarten', lang=None)], subject=[Subject(value='Computer Science - Computation and Language', lang=None)], description=[Description(value=' Topic models can be useful tools to discover latent topics in collections of\ndocuments. Recent studies have shown the feasibility of approach topic modeling\nas a clustering task. We present BERTopic, a topic model that extends this\nprocess by extracting coherent topic representation through the development of\na class-based variation of TF-IDF. More specifically, BERTopic generates\ndocument embedding with pre-trained transformer-based language models, clusters\nthese embeddings, and finally, generates topic representations with the\nclass-based TF-IDF procedure. 
BERTopic generates coherent topics and remains\ncompetitive across a variety of benchmarks involving classical models and those\nthat follow the more recent clustering approach of topic modeling.\n', lang=None), Description(value='Comment: BERTopic has a python implementation, see\n https://github.com/MaartenGr/BERTopic', lang=None)], publisher=[], contributor=[], date=[Date(value='2022-03-11', lang=None)], type_value=[TypeType(value='text', lang=None)], format=[], identifier=[Identifier(value='http://arxiv.org/abs/2203.05794', lang=None)], source=[], language=[], relation=[], coverage=[], rights=[]) +``` -In case you want to harvest these more complex formats, you have to write your own record model class by subclassing the -default implementation that unpacks the metadata XML: +```pycon +>>> record = scythe.get_record("oai:arXiv.org:2203.05794", metadata_prefix="arXiv") +>>> record +# Record(header=Header(identifier='oai:arXiv.org:2203.05794', datestamp=XmlDate(2022, 3, 14), set_spec=['cs'], status=None), metadata=Metadata(other_element=AnyElement(qname='{http://arxiv.org/OAI/arXiv/}arXiv', text='', tail=None, children=[AnyElement(qname='{http://arxiv.org/OAI/arXiv/}id', text='2203.05794', tail=None, children=[], attributes={}), AnyElement(qname='{http://arxiv.org/OAI/arXiv/}created', text='2022-03-11', tail=None, children=[], attributes={}), AnyElement(qname='{http://arxiv.org/OAI/arXiv/}authors', text='', tail=None, children=[AnyElement(qname='{http://arxiv.org/OAI/arXiv/}author', text='', tail=None, children=[AnyElement(qname='{http://arxiv.org/OAI/arXiv/}keyname', text='Grootendorst', tail=None, children=[], attributes={}), AnyElement(qname='{http://arxiv.org/OAI/arXiv/}forenames', text='Maarten', tail=None, children=[], attributes={})], attributes={})], attributes={}), AnyElement(qname='{http://arxiv.org/OAI/arXiv/}title', text='BERTopic: Neural topic modeling with a class-based TF-IDF procedure', tail=None, children=[], attributes={}), 
AnyElement(qname='{http://arxiv.org/OAI/arXiv/}categories', text='cs.CL', tail=None, children=[], attributes={}), AnyElement(qname='{http://arxiv.org/OAI/arXiv/}comments', text='BERTopic has a python implementation, see\n https://github.com/MaartenGr/BERTopic', tail=None, children=[], attributes={}), AnyElement(qname='{http://arxiv.org/OAI/arXiv/}license', text='http://arxiv.org/licenses/nonexclusive-distrib/1.0/', tail=None, children=[], attributes={}), AnyElement(qname='{http://arxiv.org/OAI/arXiv/}abstract', text=' Topic models can be useful tools to discover latent topics in collections of\ndocuments. Recent studies have shown the feasibility of approach topic modeling\nas a clustering task. We present BERTopic, a topic model that extends this\nprocess by extracting coherent topic representation through the development of\na class-based variation of TF-IDF. More specifically, BERTopic generates\ndocument embedding with pre-trained transformer-based language models, clusters\nthese embeddings, and finally, generates topic representations with the\nclass-based TF-IDF procedure. BERTopic generates coherent topics and remains\ncompetitive across a variety of benchmarks involving classical models and those\nthat follow the more recent clustering approach of topic modeling.\n', tail=None, children=[], attributes={})], attributes={'{http://www.w3.org/2001/XMLSchema-instance}schemaLocation': 'http://arxiv.org/OAI/arXiv/ http://arxiv.org/OAI/arXiv.xsd'})), about=[]) +``` + +!!! note + The response still gets parsed into a dataclass, but the metadata has attributes of type `AnyElement`, e.g. `AnyElement(qname='{http://arxiv.org/OAI/arXiv/}arXiv'`. -```python -from oaipmh_scythe.models import Record +https://xsdata.readthedocs.io/en/latest/codegen/intro/#command-line-tool -class MyRecord(Record): - # Your XML unpacking implementation goes here. 
- pass +```bash +$ python -m pip install "xsdata[cli]>=24.5" +$ xsdata generate --package=arxiv http://arxiv.org/OAI/arXiv.xsd +``` + + +```python +from arxiv import ArXiv +record = scythe.get_record("oai:arXiv.org:2203.05794", metadata_prefix="arXiv") +record +# Record(header=Header(identifier='oai:arXiv.org:2203.05794', datestamp=XmlDate(2022, 3, 14), set_spec=['cs'], status=None), metadata=Metadata(other_element=ArXiv(id=['2203.05794'], created=['2022-03-11'], updated=[], authors=[AuthorsType(author=[AuthorType(keyname='Grootendorst', forenames='Maarten', suffix=None, affiliation=[])])], title=['BERTopic: Neural topic modeling with a class-based TF-IDF procedure'], msc_class=[], acm_class=[], report_no=[], journal_ref=[], comments=['BERTopic has a python implementation, see\n https://github.com/MaartenGr/BERTopic'], abstract=[' Topic models can be useful tools to discover latent topics in collections of\ndocuments. Recent studies have shown the feasibility of approach topic modeling\nas a clustering task. We present BERTopic, a topic model that extends this\nprocess by extracting coherent topic representation through the development of\na class-based variation of TF-IDF. More specifically, BERTopic generates\ndocument embedding with pre-trained transformer-based language models, clusters\nthese embeddings, and finally, generates topic representations with the\nclass-based TF-IDF procedure. BERTopic generates coherent topics and remains\ncompetitive across a variety of benchmarks involving classical models and those\nthat follow the more recent clustering approach of topic modeling.\n'], categories=['cs.CL'], doi=[], proxy=[], license=['http://arxiv.org/licenses/nonexclusive-distrib/1.0/'])), about=[]) ``` !!! note - Take a look at the implementation of [oaipmh_scythe.models.Record][] to get an idea of how to do this. + The response gets parsed into a Record dataclass, and the metadata is of type `ArXiv`. 
-Next, associate your implementation with OAI verbs in the [oaipmh_scythe.client.Scythe][] object. In this case, we want -the [oaipmh_scythe.client.Scythe][] object to use our implementation to represent items returned by ListRecords and -GetRecord responses: + +!!! note + Take a look at the models diff --git a/src/oaipmh_scythe/models/oai_datacite/.xsdata.xml b/src/oaipmh_scythe/models/oai_datacite/.xsdata.xml new file mode 100644 index 0000000..2b7128f --- /dev/null +++ b/src/oaipmh_scythe/models/oai_datacite/.xsdata.xml @@ -0,0 +1,32 @@ + + + + src.oaipmh_scythe.models.oai_datacite.models + dataclasses + single-package + Google + false + false + false + true + false + false + true + + + + + + + + + + + + + + + + + + diff --git a/src/oaipmh_scythe/models/oai_datacite/README.md b/src/oaipmh_scythe/models/oai_datacite/README.md new file mode 100644 index 0000000..5174087 --- /dev/null +++ b/src/oaipmh_scythe/models/oai_datacite/README.md @@ -0,0 +1,6 @@ +Generate the models with: + +```console +python -m pip install xsdata[cli] +xsdata generate --config src/oaipmh_scythe/models/oai_datacite/.xsdata.xml https://schema.datacite.org/oai/oai-1.1/oai.xsd +``` diff --git a/src/oaipmh_scythe/models/oai_datacite/__init__.py b/src/oaipmh_scythe/models/oai_datacite/__init__.py new file mode 100644 index 0000000..62380dc --- /dev/null +++ b/src/oaipmh_scythe/models/oai_datacite/__init__.py @@ -0,0 +1,14 @@ +"""This file was generated by xsdata, v24.5, on 2024-05-17 15:55:14 + +Generator: DataclassGenerator +See: https://xsdata.readthedocs.io/ +""" +from src.oaipmh_scythe.models.oai_datacite.models import ( + OaiDatacite, + RootType, +) + +__all__ = [ + "OaiDatacite", + "RootType", +] diff --git a/src/oaipmh_scythe/models/oai_datacite/models.py b/src/oaipmh_scythe/models/oai_datacite/models.py new file mode 100644 index 0000000..7e56afb --- /dev/null +++ b/src/oaipmh_scythe/models/oai_datacite/models.py @@ -0,0 +1,55 @@ +"""This file was generated by xsdata, v24.5, on 
2024-05-17 15:55:14 + +Generator: DataclassGenerator +See: https://xsdata.readthedocs.io/ +""" +from __future__ import annotations + +from dataclasses import dataclass, field + +__NAMESPACE__ = "http://schema.datacite.org/oai/oai-1.1/" + + +@dataclass(slots=True) +class RootType: + """ + Attributes: + datacentre_symbol: The symbol of the datacentre that registered this record. + schema_version: The estimated DataCite Metadata Scheme version that this record adheres to. + payload: The complete, unaltered metadata of this record. + """ + + class Meta: + name = "rootType" + + datacentre_symbol: None | str = field( + default=None, + metadata={ + "name": "datacentreSymbol", + "type": "Element", + "namespace": "http://schema.datacite.org/oai/oai-1.1/", + "required": True, + }, + ) + schema_version: None | object = field( + default=None, + metadata={ + "name": "schemaVersion", + "type": "Element", + "namespace": "http://schema.datacite.org/oai/oai-1.1/", + }, + ) + payload: None | object = field( + default=None, + metadata={ + "type": "Element", + "namespace": "http://schema.datacite.org/oai/oai-1.1/", + }, + ) + + +@dataclass(slots=True) +class OaiDatacite(RootType): + class Meta: + name = "oai_datacite" + namespace = "http://schema.datacite.org/oai/oai-1.1/" diff --git a/src/oaipmh_scythe/utils.py b/src/oaipmh_scythe/utils.py index bc4f345..4a18ca2 100644 --- a/src/oaipmh_scythe/utils.py +++ b/src/oaipmh_scythe/utils.py @@ -97,6 +97,9 @@ def load_models(metadata_prefix: str | None = None) -> None: match metadata_prefix: case "oai_dc": from oaipmh_scythe.models.oai_dc import Dc # noqa: F401 + case "oai_datacite": + # from oaipmh_scythe.models.datacite import Resource # noqa: F401 + from oaipmh_scythe.models.oai_datacite import OaiDatacite # noqa: F401 case "datacite": from oaipmh_scythe.models.datacite import Resource # noqa: F401 case "marcxml": From 12a8377bb1f42aa6c16ddea7f261be26c842d4ff Mon Sep 17 00:00:00 2001 From: Heinz-Alexander Fuetterer Date: Wed, 21 Aug 2024 
18:36:29 +0200 Subject: [PATCH 07/16] s --- .pre-commit-config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2d95a50..1fd8a7d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -72,10 +72,10 @@ repos: - id: mypy args: [--config-file=pyproject.toml] additional_dependencies: - - httpx>=0.27 - - lxml-stubs>=0.5 - - pytest>=8.1 - - xsdata>=24.4 + - httpx==0.27 + - lxml-stubs==0.5.1 + - pytest==8.3.2 + - xsdata==24.7 exclude: tests # TODO: remove this exclusion - repo: https://github.com/scientific-python/cookie From f3cbaa930e71688f53b46e47ce11aae79028dd3d Mon Sep 17 00:00:00 2001 From: Heinz-Alexander Fuetterer Date: Wed, 21 Aug 2024 18:36:50 +0200 Subject: [PATCH 08/16] s --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c3c596f..54ef376 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -180,7 +180,7 @@ lint.per-file-ignores."src/oaipmh_scythe/models/**.py" = [ ] lint.per-file-ignores."tests/**.py" = [ "D", # pydocstyle - "PLR2004", # magic-value-comparison + "PLR2004", # magic-value-comparison ] lint.unfixable = [ "F401", # unused-import From 1b2f158367962b4f955a9567a4abdbe088f85d0b Mon Sep 17 00:00:00 2001 From: Heinz-Alexander Fuetterer Date: Wed, 21 Aug 2024 18:37:23 +0200 Subject: [PATCH 09/16] s --- tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index a076d4b..569d45f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -20,7 +20,7 @@ def scythe() -> Scythe: return Scythe("https://zenodo.org/oai2d") -@pytest.fixture() +@pytest.fixture def identify_response() -> httpx.Response: identify_response_xml = """ Date: Wed, 21 Aug 2024 18:37:34 +0200 Subject: [PATCH 10/16] s --- tests/integration/test_get_record.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/tests/integration/test_get_record.py b/tests/integration/test_get_record.py index e06d75a..ec2e58b 100644 --- a/tests/integration/test_get_record.py +++ b/tests/integration/test_get_record.py @@ -19,7 +19,7 @@ @pytest.mark.default_cassette("get_record.yaml") -@pytest.mark.vcr() +@pytest.mark.vcr def test_get_record_with_default_metadata_prefix(scythe: Scythe) -> None: record = scythe.get_record(identifier=IDENTIFIER, metadata_prefix="oai_dc") assert isinstance(record, Record) @@ -27,7 +27,7 @@ def test_get_record_with_default_metadata_prefix(scythe: Scythe) -> None: @pytest.mark.default_cassette("get_record.yaml") -@pytest.mark.vcr() +@pytest.mark.vcr def test_get_record_without_metadata_prefix(scythe: Scythe) -> None: record = scythe.get_record(identifier=IDENTIFIER) assert isinstance(record, Record) From ef6efd767d2810f7d7111e9d69d91a8bba2bbabb Mon Sep 17 00:00:00 2001 From: Heinz-Alexander Fuetterer Date: Wed, 21 Aug 2024 18:37:48 +0200 Subject: [PATCH 11/16] s --- tests/unit/test_client.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index 2adab67..22d9426 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -25,12 +25,12 @@ auth = ("username", "password") -@pytest.fixture() +@pytest.fixture def mock_sleep(mocker: MockerFixture) -> MockType: return mocker.patch("time.sleep") -@pytest.fixture() +@pytest.fixture def mock_identify(respx_mock: MockRouter, identify_response: httpx.Response) -> Route: return respx_mock.get("https://zenodo.org/oai2d?verb=Identify").mock(return_value=identify_response) @@ -165,7 +165,7 @@ def test_identify(scythe: Scythe, mock_identify: Route) -> None: @pytest.mark.default_cassette("list_records.yaml") -@pytest.mark.vcr() +@pytest.mark.vcr def test_list_records(scythe: Scythe) -> None: records = scythe.list_records() assert isinstance(records, Iterator) @@ -173,7 +173,7 @@ def test_list_records(scythe: Scythe) -> None: 
@pytest.mark.default_cassette("list_identifiers.yaml") -@pytest.mark.vcr() +@pytest.mark.vcr def test_list_identifiers(scythe: Scythe) -> None: headers = scythe.list_identifiers() assert isinstance(headers, Iterator) @@ -181,7 +181,7 @@ def test_list_identifiers(scythe: Scythe) -> None: @pytest.mark.default_cassette("list_metadata_formats.yaml") -@pytest.mark.vcr() +@pytest.mark.vcr def test_list_metadata_formats(scythe: Scythe, mocker) -> None: metadata_formats = scythe.list_metadata_formats() assert isinstance(metadata_formats, Iterator) @@ -189,7 +189,7 @@ def test_list_metadata_formats(scythe: Scythe, mocker) -> None: @pytest.mark.default_cassette("list_sets.yaml") -@pytest.mark.vcr() +@pytest.mark.vcr def test_list_sets(scythe: Scythe, mocker) -> None: sets = scythe.list_sets() assert isinstance(sets, Iterator) @@ -197,7 +197,7 @@ def test_list_sets(scythe: Scythe, mocker) -> None: @pytest.mark.default_cassette("get_record.yaml") -@pytest.mark.vcr() +@pytest.mark.vcr def test_get_record(scythe: Scythe) -> None: record = scythe.get_record(identifier="oai:zenodo.org:10357859") assert isinstance(record, Record) From 5eee42e5f7fa82cbf668645cbb99bc66a1ecde90 Mon Sep 17 00:00:00 2001 From: Heinz-Alexander Fuetterer Date: Wed, 21 Aug 2024 18:37:58 +0200 Subject: [PATCH 12/16] s --- tests/integration/test_list_records.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_list_records.py b/tests/integration/test_list_records.py index 33a31eb..b490481 100644 --- a/tests/integration/test_list_records.py +++ b/tests/integration/test_list_records.py @@ -139,7 +139,7 @@ def test_list_records_ignore_deleted(scythe: Scythe) -> None: @pytest.mark.default_cassette("list_records.yaml") -@pytest.mark.vcr() +@pytest.mark.vcr def test_list_records_response(scythe: Scythe) -> None: scythe.iterator = ResponseIterator _responses = scythe.list_records() From 253812ca859c50d6d23240fb84149f3bb2c7e3e6 Mon Sep 17 00:00:00 2001 From: Heinz-Alexander 
Fuetterer Date: Wed, 21 Aug 2024 18:38:07 +0200 Subject: [PATCH 13/16] s --- tests/integration/test_list_identifiers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_list_identifiers.py b/tests/integration/test_list_identifiers.py index 8fb8bad..ed466f8 100644 --- a/tests/integration/test_list_identifiers.py +++ b/tests/integration/test_list_identifiers.py @@ -131,7 +131,7 @@ def test_list_identifiers_ignore_deleted(scythe: Scythe) -> None: @pytest.mark.default_cassette("list_identifiers.yaml") -@pytest.mark.vcr() +@pytest.mark.vcr def test_list_identifiers_response(scythe: Scythe) -> None: scythe.iterator = ResponseIterator responses = scythe.list_identifiers(metadata_prefix="oai_dc") From 70b706577fc5599294ce5db3552eacc06d0b2e1e Mon Sep 17 00:00:00 2001 From: Heinz-Alexander Fuetterer Date: Wed, 21 Aug 2024 18:38:16 +0200 Subject: [PATCH 14/16] s --- tests/integration/test_identify.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_identify.py b/tests/integration/test_identify.py index a0edf14..1dc470e 100644 --- a/tests/integration/test_identify.py +++ b/tests/integration/test_identify.py @@ -35,7 +35,7 @@ def test_identify(scythe: Scythe) -> None: @pytest.mark.default_cassette("identify.yaml") -@pytest.mark.vcr() +@pytest.mark.vcr def test_identify_response(scythe: Scythe) -> None: scythe.iterator = ResponseIterator response = scythe.identify() From eeed52d85c6ed661c00a45fbf534848ac6a97010 Mon Sep 17 00:00:00 2001 From: Heinz-Alexander Fuetterer Date: Wed, 21 Aug 2024 18:38:45 +0200 Subject: [PATCH 15/16] s --- src/oaipmh_scythe/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/oaipmh_scythe/utils.py b/src/oaipmh_scythe/utils.py index 4a18ca2..ec089f6 100644 --- a/src/oaipmh_scythe/utils.py +++ b/src/oaipmh_scythe/utils.py @@ -98,7 +98,7 @@ def load_models(metadata_prefix: str | None = None) -> None: case "oai_dc": from oaipmh_scythe.models.oai_dc 
import Dc # noqa: F401 case "oai_datacite": - # from oaipmh_scythe.models.datacite import Resource # noqa: F401 + # from oaipmh_scythe.models.datacite import Resource from oaipmh_scythe.models.oai_datacite import OaiDatacite # noqa: F401 case "datacite": from oaipmh_scythe.models.datacite import Resource # noqa: F401 From c20f9eccf32a3399fd9356a0f7f239c6fd79d967 Mon Sep 17 00:00:00 2001 From: Heinz-Alexander Fuetterer Date: Wed, 21 Aug 2024 18:39:18 +0200 Subject: [PATCH 16/16] s --- docs/src/customizing.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/src/customizing.md b/docs/src/customizing.md index aca8354..b2fd4da 100644 --- a/docs/src/customizing.md +++ b/docs/src/customizing.md @@ -25,8 +25,8 @@ Dc(title=[Title(value='BERTopic: Neural topic modeling with a class-based TF-IDF ``` !!! note - The response still gets parsed into a dataclass, but the metadata has attributes of type `AnyElement`, e.g. `AnyElement(qname='{http://arxiv.org/OAI/arXiv/}arXiv'`. - + The response still gets parsed into a dataclass, but the metadata has attributes of type `AnyElement`, e.g. + `AnyElement(qname='{http://arxiv.org/OAI/arXiv/}arXiv'`. 
https://xsdata.readthedocs.io/en/latest/codegen/intro/#command-line-tool @@ -35,9 +35,9 @@ $ python -m pip install "xsdata[cli]>=24.5" $ xsdata generate --package=arxiv http://arxiv.org/OAI/arXiv.xsd ``` - ```python from arxiv import ArXiv + record = scythe.get_record("oai:arXiv.org:2203.05794", metadata_prefix="arXiv") record # Record(header=Header(identifier='oai:arXiv.org:2203.05794', datestamp=XmlDate(2022, 3, 14), set_spec=['cs'], status=None), metadata=Metadata(other_element=ArXiv(id=['2203.05794'], created=['2022-03-11'], updated=[], authors=[AuthorsType(author=[AuthorType(keyname='Grootendorst', forenames='Maarten', suffix=None, affiliation=[])])], title=['BERTopic: Neural topic modeling with a class-based TF-IDF procedure'], msc_class=[], acm_class=[], report_no=[], journal_ref=[], comments=['BERTopic has a python implementation, see\n https://github.com/MaartenGr/BERTopic'], abstract=[' Topic models can be useful tools to discover latent topics in collections of\ndocuments. Recent studies have shown the feasibility of approach topic modeling\nas a clustering task. We present BERTopic, a topic model that extends this\nprocess by extracting coherent topic representation through the development of\na class-based variation of TF-IDF. More specifically, BERTopic generates\ndocument embedding with pre-trained transformer-based language models, clusters\nthese embeddings, and finally, generates topic representations with the\nclass-based TF-IDF procedure. BERTopic generates coherent topics and remains\ncompetitive across a variety of benchmarks involving classical models and those\nthat follow the more recent clustering approach of topic modeling.\n'], categories=['cs.CL'], doi=[], proxy=[], license=['http://arxiv.org/licenses/nonexclusive-distrib/1.0/'])), about=[]) @@ -46,6 +46,5 @@ record !!! note The response gets parsed into a Record dataclass, and the metadata is of type `ArXiv`. - !!! note Take a look at the models