From 67f9a59ae5c9a225d524d8302254f6430075a413 Mon Sep 17 00:00:00 2001 From: mferrera Date: Thu, 9 Jan 2025 12:05:57 +0100 Subject: [PATCH 1/2] MAINT: Refactor 'schema'/'internal' classes This restructures how these classes are formed and where they live. It was confusing to locate them with the model when they are really just tweaks to the model for us when exporting. This refactor tries to bring some clarity to that relationship. --- schemas/0.8.0/fmu_results.json | 48 +---- src/fmu/dataio/_metadata.py | 58 ++++- src/fmu/dataio/_model/root.py | 198 +++++++++--------- src/fmu/dataio/aggregation.py | 6 +- src/fmu/dataio/case.py | 5 +- src/fmu/dataio/preprocessed.py | 7 +- src/fmu/dataio/providers/_fmu.py | 9 +- src/fmu/dataio/providers/objectdata/_base.py | 8 +- .../objectdata/_export_models.py} | 70 +------ 9 files changed, 187 insertions(+), 222 deletions(-) rename src/fmu/dataio/{_model/schema.py => providers/objectdata/_export_models.py} (59%) diff --git a/schemas/0.8.0/fmu_results.json b/schemas/0.8.0/fmu_results.json index fd8d5f99a..e942a23b9 100644 --- a/schemas/0.8.0/fmu_results.json +++ b/schemas/0.8.0/fmu_results.json @@ -476,10 +476,7 @@ "$ref": "#/$defs/Masterdata" }, "source": { - "const": "fmu", - "enum": [ - "fmu" - ], + "default": "fmu", "title": "Source", "type": "string" }, @@ -487,10 +484,7 @@ "$ref": "#/$defs/Tracklog" }, "version": { - "const": "0.8.0", - "enum": [ - "0.8.0" - ], + "default": "0.8.0", "title": "Version", "type": "string" } @@ -499,8 +493,6 @@ "class", "masterdata", "tracklog", - "source", - "version", "fmu", "access" ], @@ -3555,10 +3547,7 @@ "$ref": "#/$defs/Masterdata" }, "source": { - "const": "fmu", - "enum": [ - "fmu" - ], + "default": "fmu", "title": "Source", "type": "string" }, @@ -3566,10 +3555,7 @@ "$ref": "#/$defs/Tracklog" }, "version": { - "const": "0.8.0", - "enum": [ - "0.8.0" - ], + "default": "0.8.0", "title": "Version", "type": "string" } @@ -3578,8 +3564,6 @@ "class", "masterdata", "tracklog", - "source", - "version", "fmu", "access" ], @@ -4664,10 +4648,7 @@ "$ref": "#/$defs/Masterdata" }, "source": { - "const": "fmu", - "enum": [ - "fmu" - ], + "default": "fmu", "title": "Source", "type": "string" }, @@ -4675,10 +4656,7 @@ "$ref": "#/$defs/Tracklog" }, "version": { - "const": "0.8.0", - "enum": [ - "0.8.0" - ], + "default": "0.8.0", "title": "Version", "type": "string" } @@ -4687,8 +4665,6 @@ "class", "masterdata", "tracklog", - "source", - "version", "fmu", "access", "data", @@ -6496,10 +6472,7 @@ "$ref": "#/$defs/Masterdata" }, "source": { - "const": "fmu", - "enum": [ - "fmu" - ], + "default": "fmu", "title": "Source", "type": "string" }, @@ -6507,10 +6480,7 @@ "$ref": "#/$defs/Tracklog" }, "version": { - "const": "0.8.0", - "enum": [ - "0.8.0" - ], + "default": "0.8.0", "title": "Version", "type": "string" } @@ -6519,8 +6489,6 @@ "class", "masterdata", "tracklog", - "source", - "version", "fmu", "access" ], diff --git a/src/fmu/dataio/_metadata.py b/src/fmu/dataio/_metadata.py index 30def6d88..75212463b 100644 --- a/src/fmu/dataio/_metadata.py +++ b/src/fmu/dataio/_metadata.py @@ -1,18 +1,32 @@ -"""Module for DataIO metadata. +""" +This module contains models used to output the metadata that sit beside the exported +data. -This contains the _MetaData class which collects and holds all relevant metadata +It contains internal data structures that are designed to depend on external modules, +but not the other way around. This design ensures modularity and flexibility, allowing +external modules to be potentially separated into their own repositories without +dependencies on the internals. """ from __future__ import annotations -from typing import TYPE_CHECKING, Final +from typing import TYPE_CHECKING, Final, List, Literal, Optional, Union + +from pydantic import ( + AnyHttpUrl, + BaseModel, + Field, +) from ._logging import null_logger -from ._model import fields, schema +from ._model import data, fields +from ._model.enums import FMUClass from ._model.global_configuration import GlobalConfiguration from ._model.product import Product +from ._model.root import CaseMetadata, FmuResultsSchema, ObjectMetadata from .exceptions import InvalidMetadataError from .providers._filedata import FileDataProvider +from .providers.objectdata._base import UnsetData from .providers.objectdata._provider import objectdata_provider_factory if TYPE_CHECKING: @@ -24,6 +38,38 @@ logger: Final = null_logger(__name__) +class JsonSchemaMetadata(BaseModel): + """Mixin to inject the $schema field into exported metadata.""" + + schema_: AnyHttpUrl = Field( + default_factory=lambda: AnyHttpUrl(FmuResultsSchema.url()), + alias="$schema", + frozen=True, + ) + + +class ObjectMetadataExport(JsonSchemaMetadata, ObjectMetadata, populate_by_name=True): + """Wraps the schema ObjectMetadata, adjusting some values to optional for pragmatic + purposes when exporting metadata.""" + + # These type ignores are for making the field optional + fmu: Optional[fields.FMU] # type: ignore + access: Optional[fields.SsdlAccess] # type: ignore + masterdata: Optional[fields.Masterdata] # type: ignore + # !! Keep UnsetData first in this union + data: Union[UnsetData, data.AnyData] # type: ignore + preprocessed: Optional[bool] = Field(alias="_preprocessed", default=None) + + +class CaseMetadataExport(JsonSchemaMetadata, CaseMetadata, populate_by_name=True): + """Adds the optional description field for backward compatibility.""" + + class_: Literal[FMUClass.case] = Field( + default=FMUClass.case, alias="class", title="metadata_class" + ) + description: Optional[List[str]] = Field(default=None) + + def _get_meta_filedata( dataio: ExportData, obj: types.Inferrable, @@ -72,7 +118,7 @@ def generate_export_metadata( dataio: ExportData, fmudata: FmuProvider | None = None, product: Product | None = None, -) -> schema.InternalObjectMetadata: +) -> ObjectMetadataExport: """ Main function to generate the full metadata @@ -102,7 +148,7 @@ def generate_export_metadata( objdata = objectdata_provider_factory(obj, dataio, product) - return schema.InternalObjectMetadata( # type: ignore[call-arg] + return ObjectMetadataExport( # type: ignore[call-arg] class_=objdata.classname, fmu=_get_meta_fmu(fmudata) if fmudata else None, masterdata=( diff --git a/src/fmu/dataio/_model/root.py b/src/fmu/dataio/_model/root.py index c446f6f31..198a7fa90 100644 --- a/src/fmu/dataio/_model/root.py +++ b/src/fmu/dataio/_model/root.py @@ -13,7 +13,7 @@ from pydantic.json_schema import GenerateJsonSchema from typing_extensions import Annotated -from fmu.dataio._definitions import FmuSchemas, SchemaBase +from fmu.dataio._definitions import SOURCE, FmuSchemas, SchemaBase from .data import AnyData from .enums import FMUClass @@ -38,6 +38,102 @@ T = TypeVar("T", Dict, List, object) +class FmuResultsSchema(SchemaBase): + """The main metadata export describing the results.""" + + VERSION: str = "0.8.0" + FILENAME: str = "fmu_results.json" + PATH: Path = FmuSchemas.PATH / VERSION / FILENAME + + class FmuResultsGenerateJsonSchema(GenerateJsonSchema): + contractual: Final[list[str]] = [ + "access", + "class", + "data.alias", + "data.bbox", + "data.content", + "data.format", + "data.geometry", + "data.grid_model", + "data.is_observation", + "data.is_prediction", + "data.name", + "data.offset", + "data.product.name", + "data.seismic.attribute", + "data.spec.columns", + "data.stratigraphic", + "data.stratigraphic_alias", + "data.tagname", + "data.time", + "data.vertical_domain", + "file.checksum_md5", + "file.relative_path", + "file.size_bytes", + "fmu.aggregation.operation", + "fmu.aggregation.realization_ids", + "fmu.case", + "fmu.context.stage", + "fmu.iteration.name", + "fmu.iteration.uuid", + "fmu.model", + "fmu.realization.id", + "fmu.realization.is_reference", + "fmu.realization.name", + "fmu.realization.uuid", + "fmu.workflow", + "masterdata", + "source", + "tracklog.datetime", + "tracklog.event", + "tracklog.user.id", + "version", + ] + + def _remove_format_path(self, obj: T) -> T: + """ + Removes entries with key "format" and value "path" from dictionaries. This + adjustment is necessary because JSON Schema does not recognize the "format": + "path", while OpenAPI does. This function is used in contexts where OpenAPI + specifications are not applicable. + """ + + if isinstance(obj, dict): + return { + k: self._remove_format_path(v) + for k, v in obj.items() + if not (k == "format" and v == "path") + } + + if isinstance(obj, list): + return [self._remove_format_path(element) for element in obj] + + return obj + + def generate( + self, + schema: Mapping[str, Any], + mode: Literal["validation", "serialization"] = "validation", + ) -> dict[str, Any]: + json_schema = super().generate(schema, mode=mode) + json_schema["$schema"] = self.schema_dialect + json_schema["$id"] = FmuResultsSchema.url() + json_schema["$contractual"] = self.contractual + + # sumo-core's validator does not recognize these. + del json_schema["discriminator"]["mapping"] + del json_schema["$defs"]["AnyData"]["discriminator"]["mapping"] + del json_schema["$defs"]["AnyProduct"]["discriminator"]["mapping"] + + return self._remove_format_path(json_schema) + + @staticmethod + def dump() -> dict[str, Any]: + return Root.model_json_schema( + schema_generator=FmuResultsSchema.FmuResultsGenerateJsonSchema + ) + + class MetadataBase(BaseModel): """Base model for all root metadata models generated.""" @@ -52,10 +148,10 @@ class MetadataBase(BaseModel): """The ``tracklog`` block contains a record of events recorded on these data. See :class:`Tracklog`.""" - source: Literal["fmu"] + source: str = SOURCE """The source of this data. Defaults to 'fmu'.""" - version: Literal["0.8.0"] + version: str = Field(default=FmuResultsSchema.VERSION) """The version of the schema that generated this data.""" @@ -197,99 +293,3 @@ def __get_pydantic_json_schema__( } ) return json_schema - - -class FmuResultsSchema(SchemaBase): - """The main metadata export describing the results.""" - - VERSION: str = "0.8.0" - FILENAME: str = "fmu_results.json" - PATH: Path = FmuSchemas.PATH / VERSION / FILENAME - - class FmuResultsGenerateJsonSchema(GenerateJsonSchema): - contractual: Final[list[str]] = [ - "access", - "class", - "data.alias", - "data.bbox", - "data.content", - "data.format", - "data.geometry", - "data.grid_model", - "data.is_observation", - "data.is_prediction", - "data.name", - "data.offset", - "data.product.name", - "data.seismic.attribute", - "data.spec.columns", - "data.stratigraphic", - "data.stratigraphic_alias", - "data.tagname", - "data.time", - "data.vertical_domain", - "file.checksum_md5", - "file.relative_path", - "file.size_bytes", - "fmu.aggregation.operation", - "fmu.aggregation.realization_ids", - "fmu.case", - "fmu.context.stage", - "fmu.iteration.name", - "fmu.iteration.uuid", - "fmu.model", - "fmu.realization.id", - "fmu.realization.is_reference", - "fmu.realization.name", - "fmu.realization.uuid", - "fmu.workflow", - "masterdata", - "source", - "tracklog.datetime", - "tracklog.event", - "tracklog.user.id", - "version", - ] - - def _remove_format_path(self, obj: T) -> T: - """ - Removes entries with key "format" and value "path" from dictionaries. This - adjustment is necessary because JSON Schema does not recognize the "format": - "path", while OpenAPI does. This function is used in contexts where OpenAPI - specifications are not applicable. - """ - - if isinstance(obj, dict): - return { - k: self._remove_format_path(v) - for k, v in obj.items() - if not (k == "format" and v == "path") - } - - if isinstance(obj, list): - return [self._remove_format_path(element) for element in obj] - - return obj - - def generate( - self, - schema: Mapping[str, Any], - mode: Literal["validation", "serialization"] = "validation", - ) -> dict[str, Any]: - json_schema = super().generate(schema, mode=mode) - json_schema["$schema"] = self.schema_dialect - json_schema["$id"] = FmuResultsSchema.url() - json_schema["$contractual"] = self.contractual - - # sumo-core's validator does not recognize these. - del json_schema["discriminator"]["mapping"] - del json_schema["$defs"]["AnyData"]["discriminator"]["mapping"] - del json_schema["$defs"]["AnyProduct"]["discriminator"]["mapping"] - - return self._remove_format_path(json_schema) - - @staticmethod - def dump() -> dict[str, Any]: - return Root.model_json_schema( - schema_generator=FmuResultsSchema.FmuResultsGenerateJsonSchema - ) diff --git a/src/fmu/dataio/aggregation.py b/src/fmu/dataio/aggregation.py index 41d0841e3..361936c44 100644 --- a/src/fmu/dataio/aggregation.py +++ b/src/fmu/dataio/aggregation.py @@ -12,7 +12,7 @@ from . import _utils, dataio, types from ._logging import null_logger -from ._model import schema +from ._metadata import ObjectMetadataExport from ._model.enums import FMUContext from .exceptions import InvalidMetadataError from .providers.objectdata._provider import objectdata_provider_factory @@ -65,7 +65,7 @@ class AggregatedData: tagname: str = "" verbosity: str = "DEPRECATED" # keep for while - _metadata: schema.InternalObjectMetadata = field(init=False) + _metadata: ObjectMetadataExport = field(init=False) _metafile: Path = field(default_factory=Path, init=False) def __post_init__(self) -> None: @@ -292,7 +292,7 @@ def _set_metadata( template["data"]["bbox"] = bbox try: - self._metadata = schema.InternalObjectMetadata.model_validate(template) + self._metadata = ObjectMetadataExport.model_validate(template) except ValidationError as err: raise InvalidMetadataError( f"The existing metadata for the aggregated data is invalid. " diff --git a/src/fmu/dataio/case.py b/src/fmu/dataio/case.py index 039343945..52a9155fb 100644 --- a/src/fmu/dataio/case.py +++ b/src/fmu/dataio/case.py @@ -13,7 +13,8 @@ from . import _utils from ._logging import null_logger -from ._model import global_configuration, schema +from ._metadata import CaseMetadataExport +from ._model import global_configuration from ._model.fields import Access, Case, Masterdata, Model, User logger: Final = null_logger(__name__) @@ -115,7 +116,7 @@ def generate_metadata(self) -> dict: warnings.warn(exists_warning, UserWarning) return {} - self._metadata = schema.InternalCaseMetadata( + self._metadata = CaseMetadataExport( masterdata=Masterdata.model_validate(self.config["masterdata"]), access=Access.model_validate(self.config["access"]), fmu=fields.FMUBase( diff --git a/src/fmu/dataio/preprocessed.py b/src/fmu/dataio/preprocessed.py index d54e509d8..c19bdc81b 100644 --- a/src/fmu/dataio/preprocessed.py +++ b/src/fmu/dataio/preprocessed.py @@ -10,7 +10,8 @@ from pydantic import ValidationError from ._logging import null_logger -from ._model import enums, schema +from ._metadata import ObjectMetadataExport +from ._model import enums from ._model.enums import FMUContext from ._model.fields import File from ._utils import export_metadata_file, md5sum @@ -186,9 +187,7 @@ def _get_updated_metadata(self, meta_existing: dict, objfile: Path) -> dict: try: # TODO: Would like to use meta.Root.model_validate() here # but then the '$schema' field is dropped from the meta_existing - validated_metadata = schema.InternalObjectMetadata.model_validate( - meta_existing - ) + validated_metadata = ObjectMetadataExport.model_validate(meta_existing) validated_metadata.tracklog.extend(enums.TrackLogEventType.merged) return validated_metadata.model_dump( mode="json", exclude_none=True, by_alias=True diff --git a/src/fmu/dataio/providers/_fmu.py b/src/fmu/dataio/providers/_fmu.py index fe67a6b0d..313907366 100644 --- a/src/fmu/dataio/providers/_fmu.py +++ b/src/fmu/dataio/providers/_fmu.py @@ -41,7 +41,8 @@ from fmu.config import utilities as ut from fmu.dataio import _utils from fmu.dataio._logging import null_logger -from fmu.dataio._model import fields, schema +from fmu.dataio._metadata import CaseMetadataExport +from fmu.dataio._model import fields from fmu.dataio._model.enums import ErtSimulationMode, FMUContext from fmu.dataio.exceptions import InvalidMetadataError @@ -280,7 +281,7 @@ def _get_restart_data_uuid(self) -> UUID | None: return None try: - restart_metadata = schema.InternalCaseMetadata.model_validate( + restart_metadata = CaseMetadataExport.model_validate( ut.yaml_load(restart_case_metafile) ) return _utils.uuid_from_string( @@ -300,12 +301,12 @@ def _get_iteration_and_real_uuid(self, case_uuid: UUID) -> tuple[UUID, UUID]: real_uuid = _utils.uuid_from_string(f"{case_uuid}{iter_uuid}{self._real_id}") return iter_uuid, real_uuid - def _get_case_meta(self) -> schema.InternalCaseMetadata: + def _get_case_meta(self) -> CaseMetadataExport: """Parse and validate the CASE metadata.""" logger.debug("Loading case metadata file and return pydantic case model") assert self._casepath is not None case_metafile = self._casepath / ERT_RELATIVE_CASE_METADATA_FILE - return schema.InternalCaseMetadata.model_validate( + return CaseMetadataExport.model_validate( ut.yaml_load(case_metafile, loader="standard") ) diff --git a/src/fmu/dataio/providers/objectdata/_base.py b/src/fmu/dataio/providers/objectdata/_base.py index 794165480..f8ae390a8 100644 --- a/src/fmu/dataio/providers/objectdata/_base.py +++ b/src/fmu/dataio/providers/objectdata/_base.py @@ -15,9 +15,9 @@ StratigraphyElement, ) from fmu.dataio._model.product import Product -from fmu.dataio._model.schema import AllowedContent, InternalUnsetData from fmu.dataio._utils import generate_description from fmu.dataio.providers._base import Provider +from fmu.dataio.providers.objectdata._export_models import AllowedContent, UnsetData if TYPE_CHECKING: from fmu.dataio._model.data import ( @@ -53,7 +53,7 @@ class ObjectDataProvider(Provider): # result properties; the most important is metadata which IS the 'data' part in # the resulting metadata. But other variables needed later are also given # as instance properties in addition (for simplicity in other classes/functions) - _metadata: AnyData | InternalUnsetData | None = field(default=None) + _metadata: AnyData | UnsetData | None = field(default=None) name: str = field(default="") time0: datetime | None = field(default=None) time1: datetime | None = field(default=None) @@ -101,7 +101,7 @@ def __post_init__(self) -> None: metadata["description"] = generate_description(self.dataio.description) self._metadata = ( - InternalUnsetData.model_validate(metadata) + UnsetData.model_validate(metadata) if metadata["content"] == "unset" else AnyData.model_validate(metadata) ) @@ -149,7 +149,7 @@ def get_bbox(self) -> BoundingBox2D | BoundingBox3D | None: def get_spec(self) -> AnySpecification | None: raise NotImplementedError - def get_metadata(self) -> AnyData | InternalUnsetData: + def get_metadata(self) -> AnyData | UnsetData: assert self._metadata is not None return self._metadata diff --git a/src/fmu/dataio/_model/schema.py b/src/fmu/dataio/providers/objectdata/_export_models.py similarity index 59% rename from src/fmu/dataio/_model/schema.py rename to src/fmu/dataio/providers/objectdata/_export_models.py index ccbf71172..48bc8f6f5 100644 --- a/src/fmu/dataio/_model/schema.py +++ b/src/fmu/dataio/providers/objectdata/_export_models.py @@ -1,30 +1,26 @@ -""" -This module contains models used to output the metadata that sit beside the exported -data. +"""This module contains classes used when data is being exported from the object data +provider. -It contains internal data structures that are designed to depend on external modules, -but not the other way around. This design ensures modularity and flexibility, allowing -external modules to be potentially separated into their own repositories without -dependencies on the internals. +Mostly these classes are here to maintain backward compatibility while a deprecation +period is ongoing. """ from __future__ import annotations import warnings from textwrap import dedent -from typing import List, Literal, Optional, Union +from typing import Final, Literal, Optional, Union from pydantic import ( - AnyHttpUrl, BaseModel, Field, model_validator, ) -from fmu.dataio._definitions import SOURCE +from fmu.dataio._logging import null_logger +from fmu.dataio._model import data, enums -from . import data, enums, fields -from .root import FmuResultsSchema +logger: Final = null_logger(__name__) def property_warn() -> None: @@ -101,24 +97,11 @@ def _validate_input(cls, values: dict) -> dict: return values -class JsonSchemaMetadata(BaseModel): - """This model contains information about which schema validates its data.""" - - schema_: AnyHttpUrl = Field( - default_factory=lambda: AnyHttpUrl(FmuResultsSchema.url()), - alias="$schema", - frozen=True, - ) - version: str = Field(default=FmuResultsSchema.VERSION, frozen=True) - source: str = Field(default=SOURCE, frozen=True) - - -# Remove the two models below when content is required as input. -class InternalUnsetData(data.Data): +class UnsetData(data.Data): content: Literal["unset"] # type: ignore @model_validator(mode="after") - def _deprecation_warning(self) -> InternalUnsetData: + def _deprecation_warning(self) -> UnsetData: valid_contents = [m.value for m in enums.Content] warnings.warn( "The is not provided which will produce invalid metadata. " @@ -128,36 +111,3 @@ def _deprecation_warning(self) -> InternalUnsetData: FutureWarning, ) return self - - -class InternalObjectMetadata(JsonSchemaMetadata, populate_by_name=True): - # TODO: aim to use root.ObjectMetadata as base - # class and disallow creating invalid metadata. - class_: Literal[ - enums.FMUClass.surface, - enums.FMUClass.table, - enums.FMUClass.cpgrid, - enums.FMUClass.cpgrid_property, - enums.FMUClass.polygons, - enums.FMUClass.cube, - enums.FMUClass.well, - enums.FMUClass.points, - enums.FMUClass.dictionary, - ] = Field(alias="class") - fmu: Optional[fields.FMU] - masterdata: Optional[fields.Masterdata] - access: Optional[fields.SsdlAccess] - data: Union[InternalUnsetData, data.AnyData] # keep InternalUnsetData first here - file: fields.File - display: fields.Display - tracklog: fields.Tracklog - preprocessed: Optional[bool] = Field(alias="_preprocessed", default=None) - - -class InternalCaseMetadata(JsonSchemaMetadata, populate_by_name=True): - class_: Literal["case"] = Field(alias="class", default="case") - masterdata: fields.Masterdata - access: fields.Access - fmu: fields.FMUBase - description: Optional[List[str]] = Field(default=None) - tracklog: fields.Tracklog From 209b8ff8346fb22cb7602963b63d65bdbd816a27 Mon Sep 17 00:00:00 2001 From: mferrera Date: Thu, 9 Jan 2025 13:01:57 +0100 Subject: [PATCH 2/2] DEV: Compare schemas when forcing without release --- tools/update-schema | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/update-schema b/tools/update-schema index 0c4bcba84..948012360 100755 --- a/tools/update-schema +++ b/tools/update-schema @@ -195,7 +195,7 @@ def write_schema( f"{BOLD}{schema.FILENAME}{NC} version {BOLD}{schema.VERSION}{NC}: " f"modifying '$id' url to 'prod':\n {schema.url()}", ) - else: + elif new_schema == existing_schema: print( PASS, f"{BOLD}{schema.FILENAME}{NC} version "