From a8a6e6ed580b5d3a419eab63084acc29c98e447b Mon Sep 17 00:00:00 2001 From: Heinz-Alexander Fuetterer Date: Thu, 25 Jan 2024 13:44:36 +0100 Subject: [PATCH] feat!: add xsdata models --- pyproject.toml | 4 +- src/oaipmh_scythe/models.py | 280 --------------------------- src/oaipmh_scythe/models/__init__.py | 48 +++++ src/oaipmh_scythe/models/oai_dc.py | 260 +++++++++++++++++++++++++ src/oaipmh_scythe/utils.py | 59 +----- 5 files changed, 311 insertions(+), 340 deletions(-) delete mode 100644 src/oaipmh_scythe/models.py create mode 100644 src/oaipmh_scythe/models/__init__.py create mode 100644 src/oaipmh_scythe/models/oai_dc.py diff --git a/pyproject.toml b/pyproject.toml index 0dfc581..b760d4e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,8 +33,8 @@ dynamic = [ "version", ] dependencies = [ - "httpx>=0.25", - "lxml>=5.1", + "httpx>=0.26", + "xsdata[cli,lxml]", # TODO remove cli, when done here ] [project.optional-dependencies] dev = [ diff --git a/src/oaipmh_scythe/models.py b/src/oaipmh_scythe/models.py deleted file mode 100644 index f2464d9..0000000 --- a/src/oaipmh_scythe/models.py +++ /dev/null @@ -1,280 +0,0 @@ -# SPDX-FileCopyrightText: 2015 Mathias Loesch -# SPDX-FileCopyrightText: 2023 Heinz-Alexander Fütterer -# -# SPDX-License-Identifier: BSD-3-Clause - -"""The models module defines data structures for representing various components of the OAI-PMH protocol. - -This module includes classes that encapsulate different entities in OAI-PMH, such as resumption tokens and -various types of OAI items. These classes provide structured representations of OAI-PMH elements, -facilitating their manipulation and processing in client applications. - -Classes: - ResumptionToken: Represents a resumption token used in OAI-PMH for paginated data retrieval. - OAIItem: A base class for generic OAI items. - Identify: Represents an Identify response in OAI-PMH. - Header: Represents an OAI Header element. - Record: Represents an OAI Record element. - Set: Represents an OAI Set element. - MetadataFormat: Represents an OAI MetadataFormat element. -""" - -from __future__ import annotations - -from dataclasses import dataclass -from typing import TYPE_CHECKING - -from lxml import etree - -from oaipmh_scythe.utils import get_namespace, xml_to_dict - -if TYPE_CHECKING: - from collections.abc import Iterator - - from oaipmh_scythe.response import OAIResponse - - -@dataclass -class ResumptionToken: - """A data class representing a resumption token in the OAI-PMH protocol. - - Resumption tokens are used for iterating over multiple sets of results in OAI-PMH - harvest requests. This class encapsulates the typical components of a resumption token, - including the token itself, cursor, complete list size, and an expiration date. - - Attributes: - token: The actual resumption token used for continuing the iteration in subsequent OAI-PMH requests. - Default is None. - cursor: A marker indicating the current position in the list of results. Default is None. - complete_list_size: The total number of records in the complete list of results. Default is None. - expiration_date: The date and time when the resumption token expires. Default is None. - """ - - token: str | None = None - cursor: str | None = None - complete_list_size: str | None = None - expiration_date: str | None = None - - def __repr__(self) -> str: - return f"" - - -class OAIItem: - """A base class representing a generic item in the OAI-PMH protocol. - - This class provides a common structure for handling and manipulating XML data - associated with different types of OAI-PMH items, such as records, headers, or sets. - - Attributes: - xml: The parsed XML element representing the OAI item. - _strip_ns: A flag indicating whether to remove the namespaces from the element names - in the dictionary representation. - _oai_namespace: The namespace URI extracted from the XML element. - """ - - def __init__(self, xml: etree._Element, strip_ns: bool = True) -> None: - super().__init__() - self.xml = xml - self._strip_ns = strip_ns - self._oai_namespace = get_namespace(self.xml) - - def __bytes__(self) -> bytes: - return etree.tostring(self.xml, encoding="utf-8") - - def __str__(self) -> str: - return etree.tostring(self.xml, encoding="unicode") - - @property - def raw(self) -> str: - """Return the original XML as a unicode string.""" - return etree.tostring(self.xml, encoding="unicode") - - -class Identify(OAIItem): - """A class representing an Identify container in the OAI-PMH protocol. - - This class is specifically used for handling the response of an Identify request in OAI-PMH. - It differs from other OAI entities in that it is initialized with an OAIResponse object - rather than a direct XML element. The class parses the Identify information from the - response and provides access to its individual components. - - Args: - identify_response: The response object from an Identify request. - It should contain the XML representation of the Identify response. - - Attributes: - xml: The XML element representing the Identify response. - _identify_dict: A dictionary containing the parsed Identify information. - Dynamic Attributes: Based on the content of the Identify response, additional attributes - are dynamically set on this object. These can include attributes like - repository name, base URL, protocol version, etc. - - Raises: - ValueError: If the Identify element is not found in the provided XML. - """ - - def __init__(self, identify_response: OAIResponse) -> None: - super().__init__(identify_response.xml, strip_ns=True) - identify_element = self.xml.find(f".//{self._oai_namespace}Identify") - if identify_element is None: - raise ValueError("Identify element not found in the XML.") - self.xml = identify_element - self._identify_dict = xml_to_dict(self.xml, strip_ns=True) - for k, v in self._identify_dict.items(): - setattr(self, k.replace("-", "_"), v[0]) - - def __repr__(self) -> str: - return "" - - def __iter__(self) -> Iterator: - """Iterate over the Identify information, yielding key-value pairs.""" - return iter(self._identify_dict.items()) - - -class Header(OAIItem): - """A class representing an OAI Header in the OAI-PMH protocol. - - The header contains essential information about a record, such as its identifier, datestamp, - and set specifications. This class parses these details from the provided XML header element - and makes them easily accessible as attributes. - - Args: - header_element: The XML element representing the OAI header. - - Attributes: - deleted: Indicates whether the record is marked as deleted in the OAI-PMH repository. - identifier: The unique identifier of the record in the OAI-PMH repository. - datestamp: The datestamp of the record, indicating when it was last updated. - setSpecs: A list of set specifications that the record belongs to. - """ - - def __init__(self, header_element: etree._Element) -> None: - super().__init__(header_element, strip_ns=True) - self.deleted = self.xml.attrib.get("status") == "deleted" - _identifier_element = self.xml.find(f"{self._oai_namespace}identifier") - _datestamp_element = self.xml.find(f"{self._oai_namespace}datestamp") - - self.identifier = getattr(_identifier_element, "text", None) - self.datestamp = getattr(_datestamp_element, "text", None) - self.setSpecs = [setSpec.text for setSpec in self.xml.findall(f"{self._oai_namespace}setSpec")] - - def __repr__(self) -> str: - return f"
" - - def __iter__(self) -> Iterator: - """Iterate over the header information, yielding key-value pairs.""" - return iter( - [ - ("identifier", self.identifier), - ("datestamp", self.datestamp), - ("setSpecs", self.setSpecs), - ] - ) - - -class Record(OAIItem): - """A class representing an OAI record in the OAI-PMH protocol. - - This class encapsulates a record element from an OAI-PMH response, handling its parsing, and providing - structured access to its details, such as header information and metadata. It checks for the presence of - the header and metadata elements and raises an error if the header is not found. - - Args: - record_element: The XML element representing the OAI record. - strip_ns: If True, namespaces are removed from the element names in the parsed metadata. Defaults to True. - - Attributes: - header: An instance of the Header class representing the header information of the record. - deleted: Indicates whether the record is marked as deleted. - metadata: A dictionary representation of the record's metadata, if available and not deleted. - - Raises: - ValueError: If the header element is not found in the provided XML. - """ - - def __init__(self, record_element: etree._Element, strip_ns: bool = True) -> None: - super().__init__(record_element, strip_ns=strip_ns) - header_element = self.xml.find(f".//{self._oai_namespace}header") - if header_element is None: - raise ValueError("Header element not found in the XML.") - self.header = Header(header_element) - self.deleted = self.header.deleted - if not self.deleted: - self.metadata = self.get_metadata() - - def __repr__(self) -> str: - return f"" - - def __iter__(self) -> Iterator: - """Iterate over the record's metadata, yielding key-value pairs.""" - return iter(self.metadata.items()) - - def get_metadata(self): - """Extract and return the record's metadata as a dictionary.""" - # We want to get record/metadata//* - # would be the element ``dc`` - # in the ``oai_dc`` case. - return xml_to_dict( - self.xml.find(".//" + self._oai_namespace + "metadata").getchildren()[0], - strip_ns=self._strip_ns, - ) - - -class Set(OAIItem): - """A class representing a set in the OAI-PMH protocol. - - This class encapsulates a set element from an OAI-PMH response and provides structured access to its details. - It parses the set information from the provided XML element and dynamically sets attributes - based on the parsed content. - - Args: - set_element: The XML element representing the OAI set. The element is parsed to extract set details. - - Attributes: - setName: The name of the set, extracted from the set's XML element. - _set_dict: A dictionary containing the parsed set information. - """ - - def __init__(self, set_element: etree._Element) -> None: - super().__init__(set_element, strip_ns=True) - self._set_dict = xml_to_dict(self.xml, strip_ns=True) - self.setName: str | None = None - for k, v in self._set_dict.items(): - setattr(self, k.replace("-", "_"), v[0]) - - def __repr__(self) -> str: - return f"" - - def __iter__(self) -> Iterator: - """Iterate over the set information, yielding key-value pairs.""" - return iter(self._set_dict.items()) - - -class MetadataFormat(OAIItem): - """A class representing a metadata format in the OAI-PMH protocol. - - This class handles the representation of a metadata format, which is an essential part of the OAI-PMH protocol. - It parses the provided XML element to extract and store metadata format details such as the metadata prefix. - - Args: - mdf_element: The XML element representing the metadata format. This element is parsed - to extract metadata format details. - - Attributes: - metadataPrefix: The prefix of the metadata format, extracted from the XML element. - _mdf_dict: A dictionary containing the parsed metadata format details. - """ - - def __init__(self, mdf_element: etree._Element) -> None: - super().__init__(mdf_element, strip_ns=True) - self._mdf_dict = xml_to_dict(self.xml, strip_ns=True) - self.metadataPrefix: str | None = None - for k, v in self._mdf_dict.items(): - setattr(self, k.replace("-", "_"), v[0]) - - def __repr__(self) -> str: - return f"" - - def __iter__(self) -> Iterator: - """Iterate over the metadata format information, yielding key-value pairs.""" - return iter(self._mdf_dict.items()) diff --git a/src/oaipmh_scythe/models/__init__.py b/src/oaipmh_scythe/models/__init__.py new file mode 100644 index 0000000..15851fa --- /dev/null +++ b/src/oaipmh_scythe/models/__init__.py @@ -0,0 +1,48 @@ +"""This file was generated by xsdata, v24.1, on 2024-01-25 13:49:22 + +Generator: DataclassGenerator +See: https://xsdata.readthedocs.io/ +""" +from src.oaipmh_scythe.models.oai_dc import ( + Contributor, + Coverage, + Creator, + Date, + Dc, + Description, + ElementType, + Format, + Identifier, + Language, + LangValue, + OaiDcType, + Publisher, + Relation, + Rights, + Source, + Subject, + Title, + TypeType, +) + +__all__ = [ + "Contributor", + "Coverage", + "Creator", + "Date", + "Dc", + "Description", + "ElementType", + "Format", + "Identifier", + "LangValue", + "Language", + "OaiDcType", + "Publisher", + "Relation", + "Rights", + "Source", + "Subject", + "Title", + "TypeType", +] diff --git a/src/oaipmh_scythe/models/oai_dc.py b/src/oaipmh_scythe/models/oai_dc.py new file mode 100644 index 0000000..b54ed5b --- /dev/null +++ b/src/oaipmh_scythe/models/oai_dc.py @@ -0,0 +1,260 @@ +"""This file was generated by xsdata, v24.1, on 2024-01-25 13:49:22 + +Generator: DataclassGenerator +See: https://xsdata.readthedocs.io/ +""" +from __future__ import annotations + +from dataclasses import dataclass, field +from enum import Enum + + +class LangValue(Enum): + VALUE = "" + + +@dataclass +class ElementType: + class Meta: + name = "elementType" + target_namespace = "http://purl.org/dc/elements/1.1/" + + value: str = field( + default="", + metadata={ + "required": True, + }, + ) + lang: None | str | LangValue = field( + default=None, + metadata={ + "type": "Attribute", + "namespace": "http://www.w3.org/XML/1998/namespace", + }, + ) + + +@dataclass +class Contributor(ElementType): + class Meta: + name = "contributor" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass +class Coverage(ElementType): + class Meta: + name = "coverage" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass +class Creator(ElementType): + class Meta: + name = "creator" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass +class Date(ElementType): + class Meta: + name = "date" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass +class Description(ElementType): + class Meta: + name = "description" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass +class Format(ElementType): + class Meta: + name = "format" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass +class Identifier(ElementType): + class Meta: + name = "identifier" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass +class Language(ElementType): + class Meta: + name = "language" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass +class Publisher(ElementType): + class Meta: + name = "publisher" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass +class Relation(ElementType): + class Meta: + name = "relation" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass +class Rights(ElementType): + class Meta: + name = "rights" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass +class Source(ElementType): + class Meta: + name = "source" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass +class Subject(ElementType): + class Meta: + name = "subject" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass +class Title(ElementType): + class Meta: + name = "title" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass +class TypeType(ElementType): + class Meta: + name = "type" + namespace = "http://purl.org/dc/elements/1.1/" + + +@dataclass +class OaiDcType: + class Meta: + name = "oai_dcType" + target_namespace = "http://www.openarchives.org/OAI/2.0/oai_dc/" + + title: list[Title] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + creator: list[Creator] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + subject: list[Subject] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + description: list[Description] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + publisher: list[Publisher] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + contributor: list[Contributor] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + date: list[Date] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + type_value: list[TypeType] = field( + default_factory=list, + metadata={ + "name": "type", + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + format: list[Format] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + identifier: list[Identifier] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + source: list[Source] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + language: list[Language] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + relation: list[Relation] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + coverage: list[Coverage] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + rights: list[Rights] = field( + default_factory=list, + metadata={ + "type": "Element", + "namespace": "http://purl.org/dc/elements/1.1/", + }, + ) + + +@dataclass +class Dc(OaiDcType): + class Meta: + name = "dc" + namespace = "http://www.openarchives.org/OAI/2.0/oai_dc/" diff --git a/src/oaipmh_scythe/utils.py b/src/oaipmh_scythe/utils.py index e07a157..af1ff9a 100644 --- a/src/oaipmh_scythe/utils.py +++ b/src/oaipmh_scythe/utils.py @@ -13,22 +13,18 @@ log_response: Log the details of an HTTP response. remove_none_values: Remove keys from the dictionary where the value is `None`. filter_dict_except_resumption_token: Filter keys from the dictionary, if resumption token is not `None`. - get_namespace: Extracts the namespace from an XML element. - xml_to_dict: Converts an XML tree or element into a dictionary representation. """ from __future__ import annotations import logging -import re -from collections import defaultdict from typing import TYPE_CHECKING if TYPE_CHECKING: from typing import Any import httpx - from lxml import etree + logger = logging.getLogger(__name__) @@ -84,56 +80,3 @@ def filter_dict_except_resumption_token(d: dict[str, Any | None]) -> dict[str, A ) return {k: v for k, v in d.items() if k in allowed_keys} return d - - -def get_namespace(element: etree._Element) -> str | None: - """Return the namespace URI of an XML element. - - Extracts and returns the namespace URI from the tag of the given XML element. - The namespace URI is enclosed in curly braces at the start of the tag. - If the element does not have a namespace, `None` is returned. - - Args: - element: The XML element from which to extract the namespace. - - Returns: - The namespace URI as a string if the element has a namespace, otherwise `None`. - """ - match = re.search(r"(\{.*\})", element.tag) - return match.group(1) if match else None - - -def xml_to_dict( - tree: etree._Element, paths: list[str] | None = None, nsmap: dict[str, str] | None = None, strip_ns: bool = False -) -> dict[str, list[str | None]]: - """Convert an XML tree to a dictionary, with options for custom XPath and namespace handling. - - This function takes an XML element tree and converts it into a dictionary. The keys of the - dictionary are the tags of the XML elements, and the values are lists of the text contents - of these elements. It offers options to apply specific XPath expressions, handle namespaces, - and optionally strip namespaces from the tags in the resulting dictionary. - - Args: - tree: The root element of the XML tree to be converted. - paths: An optional list of XPath expressions to apply on the XML tree. If None or not - provided, the function will consider all elements in the tree. - nsmap: An optional dictionary for namespace mapping, used to provide shorter, more - readable paths in XPath expressions. If None or not provided, no namespace - mapping is applied. - strip_ns: A boolean flag indicating whether to remove namespaces from the element tags - in the resulting dictionary. Defaults to False. - - Returns: - A dictionary where each key is an element tag (with or without namespace, based on - `strip_ns`) and each value is a list of strings representing the text content of - each element with that tag. - """ - paths = paths or [".//"] - nsmap = nsmap or {} - fields = defaultdict(list) - for path in paths: - elements = tree.findall(path, nsmap) - for element in elements: - tag = re.sub(r"\{.*\}", "", element.tag) if strip_ns else element.tag - fields[tag].append(element.text) - return dict(fields)