Merge pull request #1108 from NASA-IMPACT/1107-ej-integrate-original-spreadsheet-data-with-cmr-records-in-api

remove destination_server and add datasource
CarsonDavis authored Dec 3, 2024
2 parents 8c19323 + 9a20863 commit 5306d32
Showing 8 changed files with 395 additions and 15 deletions.
86 changes: 86 additions & 0 deletions environmental_justice/README.md
@@ -0,0 +1,86 @@
# Environmental Justice API

## Overview
This API provides access to Environmental Justice data from multiple sources. It supports retrieving data from individual sources or as a combined dataset with defined precedence rules.

## Endpoints

### GET /api/environmental-justice/

Retrieves environmental justice data based on the specified data source.

#### Query Parameters

| Parameter | Description | Default | Options |
|-------------|-------------|------------|----------------------------------------------|
| data_source | Data source filter | "combined" | "spreadsheet", "ml_production", "ml_testing", "combined" |

#### Data Source Behavior

1. **Single Source**
- `?data_source=spreadsheet`: Returns only spreadsheet data
- `?data_source=ml_production`: Returns only ML production data
- `?data_source=ml_testing`: Returns only ML testing data

2. **Combined Data** (Default)
   - Access via `?data_source=combined` or by omitting the parameter
- Merges data from 'spreadsheet' and 'ml_production' sources
- Precedence rules:
- If the same dataset exists in both sources, the spreadsheet version is used
- Unique datasets from ml_production are included
- ML testing data is not included in combined view

#### Example Requests

```bash
# Get combined data (default)
GET /api/environmental-justice/

# Get combined data (explicit)
GET /api/environmental-justice/?data_source=combined

# Get only spreadsheet data
GET /api/environmental-justice/?data_source=spreadsheet

# Get only ML production data
GET /api/environmental-justice/?data_source=ml_production

# Get only ML testing data
GET /api/environmental-justice/?data_source=ml_testing
```

#### Response Fields

Each record includes the following fields:
- dataset
- description
- description_simplified
- indicators
- intended_use
- latency
- limitations
- project
- source_link
- strengths
- format
- geographic_coverage
- data_visualization
- spatial_resolution
- temporal_extent
- temporal_resolution
- sde_link
- data_source
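
A minimal Python sketch of calling the endpoint and reading these fields. The base URL is a placeholder, and the printed values depend on the data in your instance; the `count`/`results` pagination keys match the test suite in this PR.

```python
import requests

# Placeholder base URL; point this at the deployment you are querying.
BASE_URL = "http://localhost:8000"

# Fetch only spreadsheet-sourced records; drop params for the combined view.
response = requests.get(
    f"{BASE_URL}/api/environmental-justice/",
    params={"data_source": "spreadsheet"},
)
response.raise_for_status()

payload = response.json()
print(payload["count"])  # total number of matching records
for record in payload["results"]:
    # Each record carries the fields listed above, including its originating source.
    print(record["dataset"], record["data_source"])
```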

## Data Source Definitions

- **spreadsheet**: Primary source data from environmental justice spreadsheets
- **ml_production**: Production machine learning processed data
- **ml_testing**: Testing/staging machine learning processed data

## Precedence Rules
When retrieving combined data:
1. If a dataset exists in both spreadsheet and ml_production:
- The spreadsheet version takes precedence
- The ml_production version is excluded
2. Datasets unique to ml_production are included in the response
3. ML testing data is never included in combined results
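
The viewset that enforces these rules is not part of the diff excerpt shown here. Purely as a sketch of the intent, the combined queryset could be built along these lines, assuming the `EnvironmentalJusticeRow` model from this PR and ordering by dataset name as the tests expect:

```python
from environmental_justice.models import EnvironmentalJusticeRow


def combined_queryset():
    """Spreadsheet rows win; ml_production contributes only datasets the
    spreadsheet does not already cover; ml_testing is excluded entirely."""
    spreadsheet = EnvironmentalJusticeRow.objects.filter(
        data_source=EnvironmentalJusticeRow.DataSourceChoices.SPREADSHEET
    )
    ml_only = EnvironmentalJusticeRow.objects.filter(
        data_source=EnvironmentalJusticeRow.DataSourceChoices.ML_PRODUCTION
    ).exclude(dataset__in=spreadsheet.values("dataset"))
    # Combining with | keeps a single queryset, so pagination still applies downstream.
    return (spreadsheet | ml_only).order_by("dataset")
```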
@@ -0,0 +1,52 @@
# Generated by Django 4.2.9 on 2024-11-23 03:18

from django.db import migrations, models


def migrate_destination_server_to_data_source(apps, schema_editor):
EnvironmentalJusticeRow = apps.get_model("environmental_justice", "EnvironmentalJusticeRow")

# Migrate prod to spreadsheet
EnvironmentalJusticeRow.objects.filter(destination_server="prod").update(
data_source="spreadsheet", destination_server=""
)

# Migrate dev to ml_production
EnvironmentalJusticeRow.objects.filter(destination_server="dev").update(
data_source="ml_production", destination_server=""
)

# Migrate test to ml_testing
EnvironmentalJusticeRow.objects.filter(destination_server="test").update(
data_source="ml_testing", destination_server=""
)


class Migration(migrations.Migration):

dependencies = [
("environmental_justice", "0005_environmentaljusticerow_destination_server"),
]

operations = [
migrations.AddField(
model_name="environmentaljusticerow",
name="data_source",
field=models.CharField(
blank=True,
choices=[
("spreadsheet", "Spreadsheet"),
("ml_production", "ML Production"),
("ml_testing", "ML Testing"),
],
default="",
max_length=20,
verbose_name="Data Source",
),
),
migrations.RunPython(migrate_destination_server_to_data_source, reverse_code=migrations.RunPython.noop),
migrations.RemoveField(
model_name="environmentaljusticerow",
name="destination_server",
),
]
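
As a quick sanity check after applying this data migration, the remapped values can be inspected from a Django shell. This is only a sketch, assuming the project's `EnvironmentalJusticeRow` model as defined in this PR:

```python
# python manage.py shell  (or the project's docker-compose equivalent)
from environmental_justice.models import EnvironmentalJusticeRow

# Former destination_server values should now appear under the new data_source choices.
for choice in EnvironmentalJusticeRow.DataSourceChoices:
    print(choice.label, EnvironmentalJusticeRow.objects.filter(data_source=choice).count())

# Rows that had no destination_server keep an empty data_source (the field allows blank).
print("unlabelled:", EnvironmentalJusticeRow.objects.filter(data_source="").count())
```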
12 changes: 6 additions & 6 deletions environmental_justice/models.py
@@ -6,13 +6,13 @@ class EnvironmentalJusticeRow(models.Model):
     Environmental Justice data from the spreadsheet
     """
 
-    class DestinationServerChoices(models.TextChoices):
-        DEV = "dev", "Development"
-        TEST = "test", "Testing"
-        PROD = "prod", "Production"
+    class DataSourceChoices(models.TextChoices):
+        SPREADSHEET = "spreadsheet", "Spreadsheet"
+        ML_PRODUCTION = "ml_production", "ML Production"
+        ML_TESTING = "ml_testing", "ML Testing"
 
-    destination_server = models.CharField(
-        "Destination Server", max_length=10, choices=DestinationServerChoices.choices, default="", blank=True
+    data_source = models.CharField(
+        "Data Source", max_length=20, choices=DataSourceChoices.choices, default="", blank=True
     )
 
     dataset = models.CharField("Dataset", blank=True, default="")
3 changes: 0 additions & 3 deletions environmental_justice/tests.py

This file was deleted.

30 changes: 30 additions & 0 deletions environmental_justice/tests/conftest.py
@@ -0,0 +1,30 @@
import pytest
from django.urls import include, path
from rest_framework.routers import DefaultRouter
from rest_framework.test import APIClient

from environmental_justice.views import EnvironmentalJusticeRowViewSet

# Create router and register our viewset
router = DefaultRouter()
router.register(r"environmental-justice", EnvironmentalJusticeRowViewSet)

# Create temporary urlpatterns for testing
urlpatterns = [
path("api/", include(router.urls)),
]


# Override default URL conf for testing
@pytest.fixture
def client():
"""Return a Django REST framework API client"""
return APIClient()


@pytest.fixture(autouse=True)
def setup_urls():
"""Setup URLs for testing"""
from django.conf import settings

settings.ROOT_URLCONF = __name__
28 changes: 28 additions & 0 deletions environmental_justice/tests/factories.py
@@ -0,0 +1,28 @@
import factory
from factory.django import DjangoModelFactory

from environmental_justice.models import EnvironmentalJusticeRow


class EnvironmentalJusticeRowFactory(DjangoModelFactory):
class Meta:
model = EnvironmentalJusticeRow

dataset = factory.Sequence(lambda n: f"dataset_{n}")
description = factory.Faker("sentence")
description_simplified = factory.Faker("sentence")
indicators = factory.Faker("sentence")
intended_use = factory.Faker("sentence")
latency = factory.Faker("word")
limitations = factory.Faker("sentence")
project = factory.Faker("word")
source_link = factory.Faker("url")
strengths = factory.Faker("sentence")
format = factory.Faker("file_extension")
geographic_coverage = factory.Faker("country")
data_visualization = factory.Faker("sentence")
spatial_resolution = factory.Faker("word")
temporal_extent = factory.Faker("date")
temporal_resolution = factory.Faker("word")
sde_link = factory.Faker("url")
data_source = EnvironmentalJusticeRow.DataSourceChoices.SPREADSHEET
153 changes: 153 additions & 0 deletions environmental_justice/tests/test_views.py
@@ -0,0 +1,153 @@
# docker-compose -f local.yml run --rm django pytest environmental_justice/tests/test_views.py
import pytest
from rest_framework import status

from environmental_justice.models import EnvironmentalJusticeRow
from environmental_justice.tests.factories import EnvironmentalJusticeRowFactory


@pytest.mark.django_db
class TestEnvironmentalJusticeRowViewSet:
"""Test suite for the EnvironmentalJusticeRow API endpoints"""

def setup_method(self):
"""Setup URL for API endpoint"""
self.url = "/api/environmental-justice/"

def test_empty_database_returns_empty_list(self, client):
"""Should return empty list when no records exist"""
response = client.get(self.url)
assert response.status_code == status.HTTP_200_OK
assert response.json()["results"] == []
assert response.json()["count"] == 0

def test_single_source_filtering(self, client):
"""Should return records only from requested data source"""
# Create records for each data source
spreadsheet_record = EnvironmentalJusticeRowFactory(
dataset="test_dataset", data_source=EnvironmentalJusticeRow.DataSourceChoices.SPREADSHEET
)
ml_prod_record = EnvironmentalJusticeRowFactory(
dataset="another_dataset", data_source=EnvironmentalJusticeRow.DataSourceChoices.ML_PRODUCTION
)
ml_test_record = EnvironmentalJusticeRowFactory(
dataset="test_dataset_3", data_source=EnvironmentalJusticeRow.DataSourceChoices.ML_TESTING
)

# Test spreadsheet filter
response = client.get(f"{self.url}?data_source=spreadsheet")
assert response.status_code == status.HTTP_200_OK
data = response.json()["results"]
assert len(data) == 1
assert data[0]["dataset"] == spreadsheet_record.dataset

# Test ml_production filter
response = client.get(f"{self.url}?data_source=ml_production")
assert response.status_code == status.HTTP_200_OK
data = response.json()["results"]
assert len(data) == 1
assert data[0]["dataset"] == ml_prod_record.dataset

# Test ml_testing filter
response = client.get(f"{self.url}?data_source=ml_testing")
assert response.status_code == status.HTTP_200_OK
data = response.json()["results"]
assert len(data) == 1
assert data[0]["dataset"] == ml_test_record.dataset

def test_combined_data_precedence(self, client):
"""
Should return combined data with spreadsheet taking precedence over ml_production
for matching datasets
"""
# Create spreadsheet record
EnvironmentalJusticeRowFactory(
dataset="common_dataset",
description="spreadsheet version",
data_source=EnvironmentalJusticeRow.DataSourceChoices.SPREADSHEET,
)

# Create ML production record with same dataset
EnvironmentalJusticeRowFactory(
dataset="common_dataset",
description="ml version",
data_source=EnvironmentalJusticeRow.DataSourceChoices.ML_PRODUCTION,
)

# Create unique ML production record
EnvironmentalJusticeRowFactory(
dataset="unique_ml_dataset", data_source=EnvironmentalJusticeRow.DataSourceChoices.ML_PRODUCTION
)

# Test combined view (default)
response = client.get(self.url)
assert response.status_code == status.HTTP_200_OK
data = response.json()["results"]
assert len(data) == 2 # Should only return 2 records (not 3)

# Verify correct records are returned
datasets = [record["dataset"] for record in data]
assert "common_dataset" in datasets
assert "unique_ml_dataset" in datasets

# Verify precedence - should get spreadsheet version of common dataset
common_record = next(r for r in data if r["dataset"] == "common_dataset")
assert common_record["description"] == "spreadsheet version"

def test_combined_explicit_parameter(self, client):
"""Should handle explicit 'combined' parameter same as default"""
EnvironmentalJusticeRowFactory(data_source=EnvironmentalJusticeRow.DataSourceChoices.SPREADSHEET)
EnvironmentalJusticeRowFactory(
dataset="unique_ml_dataset", # Ensure different dataset
data_source=EnvironmentalJusticeRow.DataSourceChoices.ML_PRODUCTION,
)

# Compare default and explicit combined responses
default_response = client.get(self.url)
combined_response = client.get(f"{self.url}?data_source=combined")

assert default_response.status_code == status.HTTP_200_OK
assert combined_response.status_code == status.HTTP_200_OK
assert default_response.json()["results"] == combined_response.json()["results"]

def test_invalid_data_source(self, client):
"""Should return 400 error for invalid data_source parameter"""
response = client.get(f"{self.url}?data_source=invalid")
assert response.status_code == status.HTTP_400_BAD_REQUEST
assert "Invalid data_source" in str(response.json())

def test_sorting_in_combined_view(self, client):
"""Should return combined results sorted by dataset name"""
# Create records in non-alphabetical order
EnvironmentalJusticeRowFactory(
dataset="zebra_dataset", data_source=EnvironmentalJusticeRow.DataSourceChoices.SPREADSHEET
)
EnvironmentalJusticeRowFactory(
dataset="alpha_dataset", data_source=EnvironmentalJusticeRow.DataSourceChoices.ML_PRODUCTION
)

response = client.get(self.url)
assert response.status_code == status.HTTP_200_OK
data = response.json()["results"]

# Verify sorting
datasets = [record["dataset"] for record in data]
assert datasets == sorted(datasets)

def test_http_methods_allowed(self, client):
"""Should only allow GET requests"""
# Test GET (should work)
get_response = client.get(self.url)
assert get_response.status_code == status.HTTP_200_OK

# Test POST (should fail)
post_response = client.post(self.url, {})
assert post_response.status_code == status.HTTP_405_METHOD_NOT_ALLOWED

# Test PUT (should fail)
put_response = client.put(self.url, {})
assert put_response.status_code == status.HTTP_405_METHOD_NOT_ALLOWED

# Test DELETE (should fail)
delete_response = client.delete(self.url)
assert delete_response.status_code == status.HTTP_405_METHOD_NOT_ALLOWED