From 6518c4745d09ba9883ab0043724fc4f5f372e739 Mon Sep 17 00:00:00 2001
From: Carson Davis
Date: Fri, 22 Nov 2024 21:35:44 -0600
Subject: [PATCH 1/4] remove destination_server and add datasource

---
 ...ljusticerow_destination_server_and_more.py | 52 +++++++++++++++++++
 environmental_justice/models.py               | 12 ++---
 environmental_justice/views.py                | 46 +++++++++++++---
 3 files changed, 98 insertions(+), 12 deletions(-)
 create mode 100644 environmental_justice/migrations/0006_remove_environmentaljusticerow_destination_server_and_more.py

diff --git a/environmental_justice/migrations/0006_remove_environmentaljusticerow_destination_server_and_more.py b/environmental_justice/migrations/0006_remove_environmentaljusticerow_destination_server_and_more.py
new file mode 100644
index 00000000..c51219b4
--- /dev/null
+++ b/environmental_justice/migrations/0006_remove_environmentaljusticerow_destination_server_and_more.py
@@ -0,0 +1,52 @@
+# Generated by Django 4.2.9 on 2024-11-23 03:18
+
+from django.db import migrations, models
+
+
+def migrate_destination_server_to_data_source(apps, schema_editor):
+    EnvironmentalJusticeRow = apps.get_model("environmental_justice", "EnvironmentalJusticeRow")
+
+    # Migrate prod to spreadsheet
+    EnvironmentalJusticeRow.objects.filter(destination_server="prod").update(
+        data_source="spreadsheet", destination_server=""
+    )
+
+    # Migrate dev to ml_production
+    EnvironmentalJusticeRow.objects.filter(destination_server="dev").update(
+        data_source="ml_production", destination_server=""
+    )
+
+    # Migrate test to ml_testing
+    EnvironmentalJusticeRow.objects.filter(destination_server="test").update(
+        data_source="ml_testing", destination_server=""
+    )
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("environmental_justice", "0005_environmentaljusticerow_destination_server"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="environmentaljusticerow",
+            name="data_source",
+            field=models.CharField(
+                blank=True,
+                choices=[
+                    ("spreadsheet", "Spreadsheet"),
+                    ("ml_production", "ML Production"),
+                    ("ml_testing", "ML Testing"),
+                ],
+                default="",
+                max_length=20,
+                verbose_name="Data Source",
+            ),
+        ),
+        migrations.RunPython(migrate_destination_server_to_data_source, reverse_code=migrations.RunPython.noop),
+        migrations.RemoveField(
+            model_name="environmentaljusticerow",
+            name="destination_server",
+        ),
+    ]
diff --git a/environmental_justice/models.py b/environmental_justice/models.py
index 97cb1d61..d7cb705b 100644
--- a/environmental_justice/models.py
+++ b/environmental_justice/models.py
@@ -6,13 +6,13 @@ class EnvironmentalJusticeRow(models.Model):
     Environmental Justice data from the spreadsheet
     """
 
-    class DestinationServerChoices(models.TextChoices):
-        DEV = "dev", "Development"
-        TEST = "test", "Testing"
-        PROD = "prod", "Production"
+    class DataSourceChoices(models.TextChoices):
+        SPREADSHEET = "spreadsheet", "Spreadsheet"
+        ML_PRODUCTION = "ml_production", "ML Production"
+        ML_TESTING = "ml_testing", "ML Testing"
 
-    destination_server = models.CharField(
-        "Destination Server", max_length=10, choices=DestinationServerChoices.choices, default="", blank=True
+    data_source = models.CharField(
+        "Data Source", max_length=20, choices=DataSourceChoices.choices, default="", blank=True
     )
 
     dataset = models.CharField("Dataset", blank=True, default="")
diff --git a/environmental_justice/views.py b/environmental_justice/views.py
index 4e999a4c..f4d2afbe 100644
--- a/environmental_justice/views.py
+++ b/environmental_justice/views.py
@@ -1,3 +1,4 @@
+from django.db.models import Q
 from django_filters.rest_framework import DjangoFilterBackend
 from rest_framework import viewsets
 
@@ -8,19 +9,52 @@
 class EnvironmentalJusticeRowViewSet(viewsets.ModelViewSet):
     """
     API endpoint that allows environmental justice rows to be read.
+    When combining spreadsheet and ml_production data, spreadsheet takes precedence
+    for any matching dataset values.
     """
 
     queryset = EnvironmentalJusticeRow.objects.all()
     serializer_class = EnvironmentalJusticeRowSerializer
     http_method_names = ["get"]
     filter_backends = [DjangoFilterBackend]
-    filterset_fields = ["destination_server"]
+    filterset_fields = ["data_source"]
+
+    def get_combined_queryset(self):
+        """
+        Returns combined data where:
+        1. All spreadsheet data is included
+        2. ML production data is included only if there's no spreadsheet data with matching dataset
+        """
+        # First, get all unique datasets that exist in spreadsheet
+        spreadsheet_datasets = (
+            EnvironmentalJusticeRow.objects.filter(data_source=EnvironmentalJusticeRow.DataSourceChoices.SPREADSHEET)
+            .values_list("dataset", flat=True)
+            .distinct()
+        )
+
+        # Build query to get:
+        # 1. ALL spreadsheet records
+        # 2. ML production records where dataset isn't in spreadsheet
+        combined_query = Q(data_source=EnvironmentalJusticeRow.DataSourceChoices.SPREADSHEET) | Q(
+            data_source=EnvironmentalJusticeRow.DataSourceChoices.ML_PRODUCTION, dataset__not_in=spreadsheet_datasets
+        )
+
+        return EnvironmentalJusticeRow.objects.filter(combined_query).order_by(
+            "dataset"
+        )  # Optional: orders results by dataset name
 
     def get_queryset(self):
         """
-        if no destination_server is provided, default to PROD
+        Handle different data_source filter scenarios:
+        - No filter: Return combined data (spreadsheet takes precedence)
+        - 'combined': Same as no filter
+        - specific source: Return data for that source only
         """
-        queryset = super().get_queryset()
-        if not self.request.query_params.get("destination_server"):
-            queryset = queryset.filter(destination_server=EnvironmentalJusticeRow.DestinationServerChoices.PROD)
-        return queryset
+        data_source = self.request.query_params.get("data_source", "combined")
+
+        # straightfoward case: return data for specific source
+        if data_source in EnvironmentalJusticeRow.DataSourceChoices.values:
+            return super().get_queryset().filter(data_source=data_source)
+
+        # Handle 'combined' or no filter case
+        return self.get_combined_queryset()

From 888c53b516aff217b45cdf59cd3f6e7adbb60583 Mon Sep 17 00:00:00 2001
From: Carson Davis
Date: Fri, 22 Nov 2024 21:40:09 -0600
Subject: [PATCH 2/4] add readme explaining EJ api behavior

---
 environmental_justice/README.md | 86 +++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100644 environmental_justice/README.md

diff --git a/environmental_justice/README.md b/environmental_justice/README.md
new file mode 100644
index 00000000..0dffaf84
--- /dev/null
+++ b/environmental_justice/README.md
@@ -0,0 +1,86 @@
+# Environmental Justice API
+
+## Overview
+This API provides access to Environmental Justice data from multiple sources. It supports retrieving data from individual sources or as a combined dataset with defined precedence rules.
+
+## Endpoints
+
+### GET /api/environmental-justice/
+
+Retrieves environmental justice data based on specified data source.
+
+#### Query Parameters
+
+| Parameter | Description | Default | Options |
+|-------------|-------------|------------|----------------------------------------------|
+| data_source | Data source filter | "combined" | "spreadsheet", "ml_production", "ml_testing", "combined" |
+
+#### Data Source Behavior
+
+1. **Single Source**
+   - `?data_source=spreadsheet`: Returns only spreadsheet data
+   - `?data_source=ml_production`: Returns only ML production data
+   - `?data_source=ml_testing`: Returns only ML testing data
+
+2. **Combined Data** (Default)
+   - Access via `?data_source=combined` or no parameter
+   - Merges data from 'spreadsheet' and 'ml_production' sources
+   - Precedence rules:
+     - If the same dataset exists in both sources, the spreadsheet version is used
+     - Unique datasets from ml_production are included
+     - ML testing data is not included in combined view
+
+#### Example Requests
+
+```bash
+# Get combined data (default)
+GET /api/environmental-justice/
+
+# Get combined data (explicit)
+GET /api/environmental-justice/?data_source=combined
+
+# Get only spreadsheet data
+GET /api/environmental-justice/?data_source=spreadsheet
+
+# Get only ML production data
+GET /api/environmental-justice/?data_source=ml_production
+
+# Get only ML testing data
+GET /api/environmental-justice/?data_source=ml_testing
+```
+
+#### Response Fields
+
+Each record includes the following fields:
+- dataset
+- description
+- description_simplified
+- indicators
+- intended_use
+- latency
+- limitations
+- project
+- source_link
+- strengths
+- format
+- geographic_coverage
+- data_visualization
+- spatial_resolution
+- temporal_extent
+- temporal_resolution
+- sde_link
+- data_source
+
+## Data Source Definitions
+
+- **spreadsheet**: Primary source data from environmental justice spreadsheets
+- **ml_production**: Production machine learning processed data
+- **ml_testing**: Testing/staging machine learning processed data
+
+## Precedence Rules
+When retrieving combined data:
+1. If a dataset exists in both spreadsheet and ml_production:
+   - The spreadsheet version takes precedence
+   - The ml_production version is excluded
+2. Datasets unique to ml_production are included in the response
+3. ML testing data is never included in combined results

From 1a5ae32a5244c923c38b17c63f08e97bc132aa38 Mon Sep 17 00:00:00 2001
From: Carson Davis
Date: Fri, 22 Nov 2024 21:57:51 -0600
Subject: [PATCH 3/4] update query to explicitly handle 'combined' parameter

---
 environmental_justice/views.py | 42 +++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/environmental_justice/views.py b/environmental_justice/views.py
index f4d2afbe..4959c168 100644
--- a/environmental_justice/views.py
+++ b/environmental_justice/views.py
@@ -1,6 +1,6 @@
-from django.db.models import Q
 from django_filters.rest_framework import DjangoFilterBackend
 from rest_framework import viewsets
+from rest_framework.exceptions import ValidationError
 
 from .models import EnvironmentalJusticeRow
 from .serializers import EnvironmentalJusticeRowSerializer
@@ -17,31 +17,27 @@ class EnvironmentalJusticeRowViewSet(viewsets.ModelViewSet):
     serializer_class = EnvironmentalJusticeRowSerializer
     http_method_names = ["get"]
     filter_backends = [DjangoFilterBackend]
-    filterset_fields = ["data_source"]
+    filterset_fields = []
 
     def get_combined_queryset(self):
         """
         Returns combined data where:
         1. All spreadsheet data is included
         2. ML production data is included only if there's no spreadsheet data with matching dataset
+        Records are sorted by dataset name and then data_source (ensuring spreadsheet comes before ml_production)
         """
-        # First, get all unique datasets that exist in spreadsheet
-        spreadsheet_datasets = (
-            EnvironmentalJusticeRow.objects.filter(data_source=EnvironmentalJusticeRow.DataSourceChoices.SPREADSHEET)
-            .values_list("dataset", flat=True)
-            .distinct()
+        # Get spreadsheet data
+        spreadsheet_data = EnvironmentalJusticeRow.objects.filter(
+            data_source=EnvironmentalJusticeRow.DataSourceChoices.SPREADSHEET
         )
 
-        # Build query to get:
-        # 1. ALL spreadsheet records
-        # 2. ML production records where dataset isn't in spreadsheet
-        combined_query = Q(data_source=EnvironmentalJusticeRow.DataSourceChoices.SPREADSHEET) | Q(
-            data_source=EnvironmentalJusticeRow.DataSourceChoices.ML_PRODUCTION, dataset__not_in=spreadsheet_datasets
-        )
+        # Get ML production data excluding datasets that exist in spreadsheet
+        ml_production_data = EnvironmentalJusticeRow.objects.filter(
+            data_source=EnvironmentalJusticeRow.DataSourceChoices.ML_PRODUCTION
+        ).exclude(dataset__in=spreadsheet_data.values_list("dataset", flat=True))
 
-        return EnvironmentalJusticeRow.objects.filter(combined_query).order_by(
-            "dataset"
-        )  # Optional: orders results by dataset name
+        # Combine the querysets and sort
+        return spreadsheet_data.union(ml_production_data).order_by("dataset", "data_source")
 
     def get_queryset(self):
         """
@@ -52,9 +48,13 @@ def get_queryset(self):
         """
         data_source = self.request.query_params.get("data_source", "combined")
 
-        # straightfoward case: return data for specific source
-        if data_source in EnvironmentalJusticeRow.DataSourceChoices.values:
-            return super().get_queryset().filter(data_source=data_source)
+        # Handle the 'combined' case or no parameter case
+        if not data_source or data_source == "combined":
+            return self.get_combined_queryset()
+
+        # Validate specific data source
+        if data_source not in EnvironmentalJusticeRow.DataSourceChoices.values:
+            valid_choices = list(EnvironmentalJusticeRow.DataSourceChoices.values) + ["combined"]
+            raise ValidationError(f"Invalid data_source. Valid choices are: {', '.join(valid_choices)}")
 
-        # Handle 'combined' or no filter case
-        return self.get_combined_queryset()
+        return super().get_queryset().filter(data_source=data_source).order_by("dataset")

From 9a20863e2cf3b19ae3fecd372bbea489b7d8b31b Mon Sep 17 00:00:00 2001
From: Carson Davis
Date: Fri, 22 Nov 2024 22:13:53 -0600
Subject: [PATCH 4/4] add api tests for EJ

---
 environmental_justice/tests.py            |   3 -
 environmental_justice/tests/conftest.py   |  30 +++++
 environmental_justice/tests/factories.py  |  28 ++++
 environmental_justice/tests/test_views.py | 153 ++++++++++++++++++++++
 4 files changed, 211 insertions(+), 3 deletions(-)
 delete mode 100644 environmental_justice/tests.py
 create mode 100644 environmental_justice/tests/conftest.py
 create mode 100644 environmental_justice/tests/factories.py
 create mode 100644 environmental_justice/tests/test_views.py

diff --git a/environmental_justice/tests.py b/environmental_justice/tests.py
deleted file mode 100644
index 9a30df3b..00000000
--- a/environmental_justice/tests.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from django.test import TestCase  # noqa
-
-# Create your tests here.
diff --git a/environmental_justice/tests/conftest.py b/environmental_justice/tests/conftest.py
new file mode 100644
index 00000000..d8b53c9a
--- /dev/null
+++ b/environmental_justice/tests/conftest.py
@@ -0,0 +1,30 @@
+import pytest
+from django.urls import include, path
+from rest_framework.routers import DefaultRouter
+from rest_framework.test import APIClient
+
+from environmental_justice.views import EnvironmentalJusticeRowViewSet
+
+# Create router and register our viewset
+router = DefaultRouter()
+router.register(r"environmental-justice", EnvironmentalJusticeRowViewSet)
+
+# Create temporary urlpatterns for testing
+urlpatterns = [
+    path("api/", include(router.urls)),
+]
+
+
+# Override default URL conf for testing
+@pytest.fixture
+def client():
+    """Return a Django REST framework API client"""
+    return APIClient()
+
+
+@pytest.fixture(autouse=True)
+def setup_urls():
+    """Setup URLs for testing"""
+    from django.conf import settings
+
+    settings.ROOT_URLCONF = __name__
diff --git a/environmental_justice/tests/factories.py b/environmental_justice/tests/factories.py
new file mode 100644
index 00000000..42d05735
--- /dev/null
+++ b/environmental_justice/tests/factories.py
@@ -0,0 +1,28 @@
+import factory
+from factory.django import DjangoModelFactory
+
+from environmental_justice.models import EnvironmentalJusticeRow
+
+
+class EnvironmentalJusticeRowFactory(DjangoModelFactory):
+    class Meta:
+        model = EnvironmentalJusticeRow
+
+    dataset = factory.Sequence(lambda n: f"dataset_{n}")
+    description = factory.Faker("sentence")
+    description_simplified = factory.Faker("sentence")
+    indicators = factory.Faker("sentence")
+    intended_use = factory.Faker("sentence")
+    latency = factory.Faker("word")
+    limitations = factory.Faker("sentence")
+    project = factory.Faker("word")
+    source_link = factory.Faker("url")
+    strengths = factory.Faker("sentence")
+    format = factory.Faker("file_extension")
+    geographic_coverage = factory.Faker("country")
+    data_visualization = factory.Faker("sentence")
+    spatial_resolution = factory.Faker("word")
+    temporal_extent = factory.Faker("date")
+    temporal_resolution = factory.Faker("word")
+    sde_link = factory.Faker("url")
+    data_source = EnvironmentalJusticeRow.DataSourceChoices.SPREADSHEET
diff --git a/environmental_justice/tests/test_views.py b/environmental_justice/tests/test_views.py
new file mode 100644
index 00000000..1632d45b
--- /dev/null
+++ b/environmental_justice/tests/test_views.py
@@ -0,0 +1,153 @@
+# docker-compose -f local.yml run --rm django pytest environmental_justice/tests/test_views.py
+import pytest
+from rest_framework import status
+
+from environmental_justice.models import EnvironmentalJusticeRow
+from environmental_justice.tests.factories import EnvironmentalJusticeRowFactory
+
+
+@pytest.mark.django_db
+class TestEnvironmentalJusticeRowViewSet:
+    """Test suite for the EnvironmentalJusticeRow API endpoints"""
+
+    def setup_method(self):
+        """Setup URL for API endpoint"""
+        self.url = "/api/environmental-justice/"
+
+    def test_empty_database_returns_empty_list(self, client):
+        """Should return empty list when no records exist"""
+        response = client.get(self.url)
+        assert response.status_code == status.HTTP_200_OK
+        assert response.json()["results"] == []
+        assert response.json()["count"] == 0
+
+    def test_single_source_filtering(self, client):
+        """Should return records only from requested data source"""
+        # Create records for each data source
+        spreadsheet_record = EnvironmentalJusticeRowFactory(
+            dataset="test_dataset", data_source=EnvironmentalJusticeRow.DataSourceChoices.SPREADSHEET
+        )
+        ml_prod_record = EnvironmentalJusticeRowFactory(
+            dataset="another_dataset", data_source=EnvironmentalJusticeRow.DataSourceChoices.ML_PRODUCTION
+        )
+        ml_test_record = EnvironmentalJusticeRowFactory(
+            dataset="test_dataset_3", data_source=EnvironmentalJusticeRow.DataSourceChoices.ML_TESTING
+        )
+
+        # Test spreadsheet filter
+        response = client.get(f"{self.url}?data_source=spreadsheet")
+        assert response.status_code == status.HTTP_200_OK
+        data = response.json()["results"]
+        assert len(data) == 1
+        assert data[0]["dataset"] == spreadsheet_record.dataset
+
+        # Test ml_production filter
+        response = client.get(f"{self.url}?data_source=ml_production")
+        assert response.status_code == status.HTTP_200_OK
+        data = response.json()["results"]
+        assert len(data) == 1
+        assert data[0]["dataset"] == ml_prod_record.dataset
+
+        # Test ml_testing filter
+        response = client.get(f"{self.url}?data_source=ml_testing")
+        assert response.status_code == status.HTTP_200_OK
+        data = response.json()["results"]
+        assert len(data) == 1
+        assert data[0]["dataset"] == ml_test_record.dataset
+
+    def test_combined_data_precedence(self, client):
+        """
+        Should return combined data with spreadsheet taking precedence over ml_production
+        for matching datasets
+        """
+        # Create spreadsheet record
+        EnvironmentalJusticeRowFactory(
+            dataset="common_dataset",
+            description="spreadsheet version",
+            data_source=EnvironmentalJusticeRow.DataSourceChoices.SPREADSHEET,
+        )
+
+        # Create ML production record with same dataset
+        EnvironmentalJusticeRowFactory(
+            dataset="common_dataset",
+            description="ml version",
+            data_source=EnvironmentalJusticeRow.DataSourceChoices.ML_PRODUCTION,
+        )
+
+        # Create unique ML production record
+        EnvironmentalJusticeRowFactory(
+            dataset="unique_ml_dataset", data_source=EnvironmentalJusticeRow.DataSourceChoices.ML_PRODUCTION
+        )
+
+        # Test combined view (default)
+        response = client.get(self.url)
+        assert response.status_code == status.HTTP_200_OK
+        data = response.json()["results"]
+        assert len(data) == 2  # Should only return 2 records (not 3)
+
+        # Verify correct records are returned
+        datasets = [record["dataset"] for record in data]
+        assert "common_dataset" in datasets
+        assert "unique_ml_dataset" in datasets
+
+        # Verify precedence - should get spreadsheet version of common dataset
+        common_record = next(r for r in data if r["dataset"] == "common_dataset")
+        assert common_record["description"] == "spreadsheet version"
+
+    def test_combined_explicit_parameter(self, client):
+        """Should handle explicit 'combined' parameter same as default"""
+        EnvironmentalJusticeRowFactory(data_source=EnvironmentalJusticeRow.DataSourceChoices.SPREADSHEET)
+        EnvironmentalJusticeRowFactory(
+            dataset="unique_ml_dataset",  # Ensure different dataset
+            data_source=EnvironmentalJusticeRow.DataSourceChoices.ML_PRODUCTION,
+        )
+
+        # Compare default and explicit combined responses
+        default_response = client.get(self.url)
+        combined_response = client.get(f"{self.url}?data_source=combined")
+
+        assert default_response.status_code == status.HTTP_200_OK
+        assert combined_response.status_code == status.HTTP_200_OK
+        assert default_response.json()["results"] == combined_response.json()["results"]
+
+    def test_invalid_data_source(self, client):
+        """Should return 400 error for invalid data_source parameter"""
+        response = client.get(f"{self.url}?data_source=invalid")
+        assert response.status_code == status.HTTP_400_BAD_REQUEST
+        assert "Invalid data_source" in str(response.json())
+
+    def test_sorting_in_combined_view(self, client):
+        """Should return combined results sorted by dataset name"""
+        # Create records in non-alphabetical order
+        EnvironmentalJusticeRowFactory(
+            dataset="zebra_dataset", data_source=EnvironmentalJusticeRow.DataSourceChoices.SPREADSHEET
+        )
+        EnvironmentalJusticeRowFactory(
+            dataset="alpha_dataset", data_source=EnvironmentalJusticeRow.DataSourceChoices.ML_PRODUCTION
+        )
+
+        response = client.get(self.url)
+        assert response.status_code == status.HTTP_200_OK
+        data = response.json()["results"]
+
+        # Verify sorting
+        datasets = [record["dataset"] for record in data]
+        assert datasets == sorted(datasets)
+
+    def test_http_methods_allowed(self, client):
+        """Should only allow GET requests"""
+        # Test GET (should work)
+        get_response = client.get(self.url)
+        assert get_response.status_code == status.HTTP_200_OK
+
+        # Test POST (should fail)
+        post_response = client.post(self.url, {})
+        assert post_response.status_code == status.HTTP_405_METHOD_NOT_ALLOWED
+
+        # Test PUT (should fail)
+        put_response = client.put(self.url, {})
+        assert put_response.status_code == status.HTTP_405_METHOD_NOT_ALLOWED
+
+        # Test DELETE (should fail)
+        delete_response = client.delete(self.url)
+        assert delete_response.status_code == status.HTTP_405_METHOD_NOT_ALLOWED
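Beyond the patches themselves, a minimal client-side sketch may help reviewers exercise by hand the behavior the README and tests above describe. It is illustrative only and not part of the patch series: the `http://localhost:8000` base URL is an assumption, while the paginated `results`/`count` envelope is the one `test_views.py` asserts against, and an unknown `data_source` is expected to return HTTP 400 via the `ValidationError` added in PATCH 3.

```python
# Illustrative sketch -- not part of the patch series. Assumes a locally
# running instance at http://localhost:8000 (host/port are placeholders) and
# a paginated response envelope with "results"/"count" keys, as exercised in
# environmental_justice/tests/test_views.py.
import requests

BASE_URL = "http://localhost:8000/api/environmental-justice/"


def fetch_rows(data_source=None):
    """GET the EJ endpoint, optionally filtered by ?data_source=..."""
    params = {"data_source": data_source} if data_source else {}
    response = requests.get(BASE_URL, params=params, timeout=10)
    response.raise_for_status()  # an unknown data_source is expected to 400
    return response.json()


if __name__ == "__main__":
    combined = fetch_rows()                  # same as ?data_source=combined
    spreadsheet = fetch_rows("spreadsheet")  # single-source view
    print("combined rows:", combined["count"])
    print("spreadsheet rows:", spreadsheet["count"])

    # Spot-check the precedence rule: each dataset should appear at most once
    # in the combined view, with spreadsheet rows winning over ml_production.
    datasets = [row["dataset"] for row in combined["results"]]
    assert len(datasets) == len(set(datasets)), "combined view returned duplicate datasets"
```

If a deployment configures pagination differently, the `results`/`count` keys in this sketch would need to be adjusted to match.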