From 538d1a6d1d3faa857630acf17f219a35d2f9722b Mon Sep 17 00:00:00 2001 From: Michael Terry Date: Fri, 1 Nov 2024 13:02:17 -0400 Subject: [PATCH] feat(schema): add support for extensions on primitive types i.e. add support for "sunder" fields like _status (sibling of status). See http://hl7.org/fhir/R4/json.html#primitive for more details. Example: { "birthDate": "1970-03-30", "_birthDate": { "id": "314159", "extension": [ { "url": "http://example.org/fhir/StructureDefinition/text", "valueString": "Easter 1970" }] } } These fields will be in the resulting schema if they are present in the input rows, else they will be left off. --- .github/workflows/ci.yaml | 2 +- cumulus_fhir_support/__init__.py | 2 +- cumulus_fhir_support/schemas.py | 69 +++++++++++++++++++++++-------- tests/test_schemas.py | 71 ++++++++++++++++++++++++++++++++ 4 files changed, 124 insertions(+), 20 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 4bce559..c122535 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -16,7 +16,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.9", "3.10", "3.11", "3.12"] + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v4 diff --git a/cumulus_fhir_support/__init__.py b/cumulus_fhir_support/__init__.py index 30589ac..9c68a6f 100644 --- a/cumulus_fhir_support/__init__.py +++ b/cumulus_fhir_support/__init__.py @@ -1,6 +1,6 @@ """FHIR support code for the Cumulus project""" -__version__ = "1.2.1" +__version__ = "1.3.0" from .json import list_multiline_json_in_dir, read_multiline_json, read_multiline_json_from_dir from .schemas import pyarrow_schema_from_rows diff --git a/cumulus_fhir_support/schemas.py b/cumulus_fhir_support/schemas.py index dc182db..1b25a52 100644 --- a/cumulus_fhir_support/schemas.py +++ b/cumulus_fhir_support/schemas.py @@ -1,13 +1,13 @@ """Detect FHIR resource schemas""" from collections import namedtuple -from functools import 
partial from typing import Any, Iterable, Optional import pyarrow from fhirclient.models import ( codeableconcept, coding, + element, extension, fhirabstractbase, fhirdate, @@ -140,7 +140,8 @@ def _create_pyarrow_schema_for_resource( """ instance = fhirelementfactory.FHIRElementFactory.instantiate(resource_type, None) - # fhirclient doesn't include `resourceType` in the list of properties. So do that manually. + # fhirclient doesn't include `resourceType` in the list of properties, because it's only + # used in ndjson representations. But it's useful to have, so add it manually. type_field = pyarrow.field("resourceType", pyarrow.string()) level = 0 if wide else 2 @@ -153,27 +154,27 @@ def _fhir_obj_to_pyarrow_fields( base_obj: fhirabstractbase.FHIRAbstractBase, batch_shape: dict, *, level: int ) -> list[pyarrow.Field]: """Convert a FHIR instance to a PyArrow Field schema list""" - properties = map(FhirProperty._make, base_obj.elementProperties()) - return list( - filter( - None, - map( - partial( - _fhir_to_pyarrow_property, - base_obj=base_obj, - batch_shape=batch_shape, - level=level, - ), - properties, - ), - ) - ) + fhir_properties = map(FhirProperty._make, base_obj.elementProperties()) + pa_properties = [] + + for fhir_property in fhir_properties: + if pa_property := _fhir_to_pyarrow_property( + fhir_property, + base_obj=base_obj, + batch_shape=batch_shape, + level=level, + ): + pa_properties.append(pa_property) + if pa_sunder := _sunder_to_pyarrow_property(fhir_property, batch_shape=batch_shape): + pa_properties.append(pa_sunder) + + return pa_properties def _fhir_to_pyarrow_property( prop: FhirProperty, *, - base_obj: fhirabstractbase.FHIRAbstractBase, + base_obj: Optional[fhirabstractbase.FHIRAbstractBase] = None, batch_shape: dict = None, level: int, ) -> Optional[pyarrow.Field]: @@ -222,6 +223,38 @@ def _fhir_to_pyarrow_property( return pyarrow.field(prop.json_name, pyarrow_type, nullable=True) +def _sunder_to_pyarrow_property( + prop: FhirProperty, + *, 
+ batch_shape: Optional[dict] = None, +) -> Optional[pyarrow.Field]: + """ + Checks for a FhirProperty's "sunder" sibling and returns a PyArrow field for it. + + A sunder (single underscore) field is an adjacent JSON field for primitive types that don't + otherwise have a place to put extension information. So "status" might have a sibling + "_status" field. + + See http://hl7.org/fhir/R4/json.html#primitive for more information. + + Returns None if the sunder field isn't present. + """ + # First, check if the sunder version is even present. + if not batch_shape or f"_{prop.json_name}" not in batch_shape: + return None + + # Make a fake property definition and see if it's good. + sunder_prop = FhirProperty( + name=f"_{prop.name}", + json_name=f"_{prop.json_name}", + pytype=element.Element, + is_list=prop.is_list, + of_many=prop.of_many, + required=prop.required, + ) + return _fhir_to_pyarrow_property(sunder_prop, level=LEVEL_INCLUSION, batch_shape=batch_shape) + + def _basic_fhir_to_pyarrow_type(pytype: type) -> pyarrow.DataType: """Converts a basic python type to a Pyspark type""" if pytype is int: diff --git a/tests/test_schemas.py b/tests/test_schemas.py index 6908d26..22e9cb7 100644 --- a/tests/test_schemas.py +++ b/tests/test_schemas.py @@ -232,3 +232,74 @@ def test_unexpected_fhir_type(self, mock_instantiate): mock_instantiate.return_value = mock_resource with self.assertRaisesRegex(ValueError, "Unexpected type: "): support.pyarrow_schema_from_rows("AllergyIntolerance") + + def test_primitive_field_extension(self): + """Verify that we support extensions to primitive fields""" + # See http://hl7.org/fhir/R4/json.html#primitive for details + rows = [ + { + # Non-existent sunder field + "_doesNotExist": {"id": "test-fake"}, + # Extension only, no ID + "_status": {"extension": [{"valueCode": "test-status"}]}, + # ID only, no extension (but with bogus modifierExtension that will be ignored) + "_priority": {"id": "test-priority", "modifierExtension":
"not-supported"}, + # Array + "_instantiatesUri": [ + None, + {"id": "test-array"}, + {"extension": [{"url": "test"}]}, + ], + # Deep field + "dispenseRequest": { + "validityPeriod": {"_start": {"id": "test-start"}}, + }, + } + ] + schema = support.pyarrow_schema_from_rows("MedicationRequest", rows) + + self.assertEqual(-1, schema.get_field_index("_doesNotExist")) + self.assertEqual(-1, schema.get_field_index("_intent")) # never specified + self.assertEqual( + pyarrow.struct( + { + "extension": pyarrow.list_( + pyarrow.struct( + { + "valueCode": pyarrow.string(), + } + ) + ), + } + ), + schema.field("_status").type, + ) + self.assertEqual( + pyarrow.struct({"id": pyarrow.string()}), + schema.field("_priority").type, + ) + self.assertEqual( + pyarrow.list_( + pyarrow.struct( + { + "extension": pyarrow.list_( + pyarrow.struct( + { + "url": pyarrow.string(), + } + ) + ), + "id": pyarrow.string(), + } + ) + ), + schema.field("_instantiatesUri").type, + ) + self.assertEqual( + pyarrow.struct( + { + "id": pyarrow.string(), + } + ), + schema.field("dispenseRequest").type.field("validityPeriod").type.field("_start").type, + )