Fixing collection ingest DAG #261

Closed · wants to merge 19 commits

1 change: 0 additions & 1 deletion README.md
@@ -17,7 +17,6 @@ data products and STAC metadata for interfaces such as https://github.com/NASA-I

First time setting up the repo:
`git submodule update --init --recursive`

Afterwards:
`git submodule update --recursive --remote`

5 changes: 2 additions & 3 deletions dags/generate_dags.py
@@ -15,9 +15,8 @@ def generate_dags():

from pathlib import Path

airflow_vars = Variable.get("aws_dags_variables")
airflow_vars_json = json.loads(airflow_vars)
bucket = airflow_vars_json.get("EVENT_BUCKET")
airflow_vars = Variable.get("aws_dags_variables", default_var={}, deserialize_json=True)
bucket = airflow_vars.get("EVENT_BUCKET")

try:
client = boto3.client("s3")
1 change: 0 additions & 1 deletion dags/requirements.txt
@@ -11,7 +11,6 @@ apache-airflow-providers-postgres==5.2.2
apache-airflow-providers-common-sql==1.2.0
typing-extensions==4.4.0
psycopg2-binary==2.9.5
pypgstac==0.7.4
pyOpenSSL==22.0.0
stac-pydantic
fsspec
80 changes: 37 additions & 43 deletions dags/veda_data_pipeline/groups/collection_group.py
@@ -1,12 +1,10 @@
import requests
from airflow.models.variable import Variable
from airflow.operators.python import PythonOperator
from airflow.utils.task_group import TaskGroup
from airflow.decorators import task
from veda_data_pipeline.utils.collection_generation import GenerateCollection
from veda_data_pipeline.utils.submit_stac import submission_handler

generator = GenerateCollection()


def check_collection_exists(endpoint: str, collection_id: str):
"""
@@ -24,27 +22,7 @@ def check_collection_exists(endpoint: str, collection_id: str):
)


def ingest_collection_task(ti):
"""
Ingest a collection into the STAC catalog

Args:
dataset (Dict[str, Any]): dataset dictionary (JSON)
role_arn (str): role arn for Zarr collection generation
"""
import json
collection = ti.xcom_pull(task_ids='Collection.generate_collection')
airflow_vars = Variable.get("aws_dags_variables")
airflow_vars_json = json.loads(airflow_vars)
cognito_app_secret = airflow_vars_json.get("COGNITO_APP_SECRET")
stac_ingestor_api_url = airflow_vars_json.get("STAC_INGESTOR_API_URL")

return submission_handler(
event=collection,
endpoint="/collections",
cognito_app_secret=cognito_app_secret,
stac_ingestor_api_url=stac_ingestor_api_url
)


# NOTE unused, but useful for item ingests, since collections are a dependency for items
@@ -60,32 +38,48 @@ def check_collection_exists_task(ti):
)


def generate_collection_task(ti):
import json
config = ti.dag_run.conf
airflow_vars = Variable.get("aws_dags_variables")
airflow_vars_json = json.loads(airflow_vars)
role_arn = airflow_vars_json.get("ASSUME_ROLE_READ_ARN")

# TODO it would be ideal if this also works with complete collections where provided - this would make the collection ingest more re-usable
collection = generator.generate_stac(
dataset_config=config, role_arn=role_arn
)
return collection



group_kwgs = {"group_id": "Collection", "tooltip": "Collection"}


def collection_task_group():
with TaskGroup(**group_kwgs) as collection_task_grp:
generate_collection = PythonOperator(
task_id="generate_collection", python_callable=generate_collection_task
)
ingest_collection = PythonOperator(
task_id="ingest_collection", python_callable=ingest_collection_task
)
generate_collection >> ingest_collection
@task()
def generate_collection_task(ti):

config = ti.dag_run.conf
airflow_vars_json = Variable.get("aws_dags_variables", deserialize_json=True)
role_arn = airflow_vars_json.get("ASSUME_ROLE_READ_ARN")

# TODO it would be ideal if this also works with complete collections where provided - this would make the collection ingest more re-usable
generator = GenerateCollection()
collection = generator.generate_stac(
dataset_config=config, role_arn=role_arn
)
return collection

@task()
def ingest_collection_task(collection):
"""
Ingest a collection into the STAC catalog

Args:
collection: STAC collection dictionary produced by generate_collection_task

"""
airflow_vars_json = Variable.get("aws_dags_variables", deserialize_json=True)
cognito_app_secret = airflow_vars_json.get("COGNITO_APP_SECRET")
stac_ingestor_api_url = airflow_vars_json.get("STAC_INGESTOR_API_URL")

return submission_handler(
event=collection,
endpoint="/collections",
cognito_app_secret=cognito_app_secret,
stac_ingestor_api_url=stac_ingestor_api_url
)
Comment on lines +48 to +80

Contributor: Let's define these task() functions outside of the TaskGroup, so they can be re-used on their own as well.

Contributor: ^^ this is a breaking change for stactools, which come with their own collection generators, but re-use our collection submission task


collection = generate_collection_task()
ingest_collection_task(collection)

return collection_task_grp
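
A minimal sketch of what the reviewer suggests above: keep the `@task` definitions at module level so another DAG (e.g. a stactools pipeline) can import `ingest_collection_task` on its own, while `collection_task_group` only wires them together. The task bodies mirror this PR's code; the module-level layout itself is an assumption, not what this PR ships.

```python
# Sketch only (not part of this PR): module-level @task functions that other
# DAGs can import individually, with the TaskGroup reduced to wiring.
from airflow.decorators import task
from airflow.models.variable import Variable
from airflow.utils.task_group import TaskGroup

from veda_data_pipeline.utils.collection_generation import GenerateCollection
from veda_data_pipeline.utils.submit_stac import submission_handler


@task()
def generate_collection_task(ti=None):
    # Build a STAC collection from the dag_run configuration
    config = ti.dag_run.conf
    airflow_vars = Variable.get("aws_dags_variables", deserialize_json=True)
    return GenerateCollection().generate_stac(
        dataset_config=config, role_arn=airflow_vars.get("ASSUME_ROLE_READ_ARN")
    )


@task()
def ingest_collection_task(collection):
    # Submit the generated collection to the STAC ingestor API
    airflow_vars = Variable.get("aws_dags_variables", deserialize_json=True)
    return submission_handler(
        event=collection,
        endpoint="/collections",
        cognito_app_secret=airflow_vars.get("COGNITO_APP_SECRET"),
        stac_ingestor_api_url=airflow_vars.get("STAC_INGESTOR_API_URL"),
    )


def collection_task_group():
    # Thin wrapper: the tasks stay importable on their own
    with TaskGroup(group_id="Collection", tooltip="Collection") as grp:
        ingest_collection_task(generate_collection_task())
    return grp
```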
32 changes: 32 additions & 0 deletions dags/veda_data_pipeline/groups/discover_group.py
@@ -8,6 +8,7 @@
from veda_data_pipeline.utils.s3_discovery import (
s3_discovery_handler, EmptyFileListError
)
from deprecated import deprecated

group_kwgs = {"group_id": "Discover", "tooltip": "Discover"}

@@ -48,6 +49,36 @@ def discover_from_s3_task(ti=None, event={}, **kwargs):


@task
def get_files_task(payload, ti=None):
"""
Get files from S3 produced by discovery or dataset tasks.
Handles both single payload and multiple payload scenarios.
"""
Comment on lines +52 to +56

Contributor: 🚀, this is much nicer than what we were doing before

dag_run_id = ti.dag_run.run_id
results = []

# Handle multiple payloads (dataset and items case)
payloads = payload if isinstance(payload, list) else [payload]

for item in payloads:
if isinstance(item, LazyXComAccess): # Dynamic task mapping case
payloads_xcom = item[0].pop("payload", [])
base_payload = item[0]
else:
payloads_xcom = item.pop("payload", [])
base_payload = item

for indx, payload_xcom in enumerate(payloads_xcom):
results.append({
"run_id": f"{dag_run_id}_{uuid.uuid4()}_{indx}",
**base_payload,
"payload": payload_xcom,
})

return results

@task
@deprecated(reason="Please use get_files_task function that hundles both files and dataset files use cases")
def get_files_to_process(payload, ti=None):
"""Get files from S3 produced by the discovery task.
Used as part of both the parallel_run_process_rasters and parallel_run_process_vectors tasks.
@@ -66,6 +97,7 @@


@task
@deprecated(reason="Please use get_files_task airflow task instead. This will be removed in the new release")
def get_dataset_files_to_process(payload, ti=None):
"""Get files from S3 produced by the dataset task.
This is different from the get_files_to_process task as it produces a combined structure from repeated mappings.
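
To make the single- vs. multiple-payload handling concrete, here is a small standalone sketch of the flattening that `get_files_task` performs. It omits the `LazyXComAccess` branch used under dynamic task mapping, copies the input dict rather than mutating it, and the sample values are made up.

```python
# Standalone illustration of get_files_task's payload flattening, outside Airflow.
# Each discovery result carries base fields plus a "payload" list of chunk keys;
# the task emits one record per chunk with a unique run_id.
import uuid


def flatten_payloads(payload, dag_run_id):
    # Accept either a single discovery result or a list of them
    payloads = payload if isinstance(payload, list) else [payload]
    results = []
    for item in payloads:
        item = dict(item)  # copy so the caller's dict is not mutated
        chunk_keys = item.pop("payload", [])
        for indx, chunk in enumerate(chunk_keys):
            results.append({
                "run_id": f"{dag_run_id}_{uuid.uuid4()}_{indx}",
                **item,
                "payload": chunk,
            })
    return results


# Hypothetical discovery output: two event chunks for one collection
example = {
    "collection": "example-collection",
    "prefix": "example-prefix/",
    "payload": ["events/chunk_0.json", "events/chunk_1.json"],
}
print(flatten_payloads(example, "manual__2024-01-01"))
# -> two records, each with the base fields plus a single chunk under "payload"
```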
5 changes: 2 additions & 3 deletions dags/veda_data_pipeline/utils/collection_generation.py
@@ -97,14 +97,13 @@ def create_cog_collection(self, dataset: Dict[str, Any]) -> dict:

# Override the extents if they exist
if spatial_extent := dataset.get("spatial_extent"):
collection_stac["extent"]["spatial"] = {"bbox": [list(spatial_extent.values())]},
collection_stac["extent"]["spatial"] = {"bbox": [list(spatial_extent.values())]}

if temporal_extent := dataset.get("temporal_extent"):
collection_stac["extent"]["temporal"] = {
"interval": [
# most of our data uses the Z suffix for UTC - isoformat() doesn't
[
datetime.fromisoformat(x).astimezone(timezone.utc).isoformat().replace("+00:00", "Z")
x
if x else None
for x in list(temporal_extent.values())
]
4 changes: 2 additions & 2 deletions dags/veda_data_pipeline/utils/submit_stac.py
@@ -103,7 +103,7 @@ def submission_handler(
cognito_app_secret=None,
stac_ingestor_api_url=None,
context=None,
) -> None:
) -> [Dict[str, Any], None]:
if context is None:
context = {}

@@ -121,7 +121,7 @@
secret_id=cognito_app_secret,
base_url=stac_ingestor_api_url,
)
ingestor.submit(event=stac_item, endpoint=endpoint)
return ingestor.submit(event=stac_item, endpoint=endpoint)


if __name__ == "__main__":
22 changes: 11 additions & 11 deletions dags/veda_data_pipeline/veda_collection_pipeline.py
@@ -31,19 +31,19 @@
}

template_dag_run_conf = {
"collection": "<collection-id>",
"data_type": "cog",
"description": "<collection-description>",
"is_periodic": "<true|false>",
"license": "<collection-LICENSE>",
"time_density": "<time-density>",
"title": "<collection-title>"
"collection": "<collection-id>",
"data_type": "cog",
"description": "<collection-description>",
"is_periodic": "<true|false>",
"license": "<collection-LICENSE>",
"time_density": "<time-density>",
"title": "<collection-title>",
}

with DAG("veda_collection_pipeline", params=template_dag_run_conf, **dag_args) as dag:
start = EmptyOperator(task_id="start", dag=dag)
end = EmptyOperator(task_id="end", trigger_rule=TriggerRule.NONE_FAILED_MIN_ONE_SUCCESS, dag=dag)

collection_grp = collection_task_group()
end = EmptyOperator(
task_id="end", trigger_rule=TriggerRule.NONE_FAILED_MIN_ONE_SUCCESS, dag=dag
)

start >> collection_grp >> end
start >> collection_task_group() >> end
108 changes: 54 additions & 54 deletions dags/veda_data_pipeline/veda_dataset_pipeline.py
@@ -1,38 +1,37 @@
import pendulum
from airflow import DAG
from airflow.models.param import Param
from airflow.decorators import task
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
from airflow.operators.dummy_operator import DummyOperator as EmptyOperator
from airflow.models.variable import Variable
import json
from veda_data_pipeline.groups.collection_group import collection_task_group
from veda_data_pipeline.groups.discover_group import discover_from_s3_task, get_dataset_files_to_process
from veda_data_pipeline.groups.processing_tasks import submit_to_stac_ingestor_task

dag_doc_md = """
template_dag_run_conf = {
"collection": "<collection-id>",
"data_type": "cog",
"description": "<collection-description>",
"discovery_items": [
{
"bucket": "<bucket-name>",
"datetime_range": "<range>",
"discovery": "s3",
"filename_regex": "<regex>",
"prefix": "<example-prefix/>",
}
],
"is_periodic": "<true|false>",
"license": "<collection-LICENSE>",
"time_density": "<time-density>",
"title": "<collection-title>",
}

dag_doc_md = f"""
### Dataset Pipeline
Generates a collection and triggers the file discovery process
#### Notes
- This DAG can run with the following configuration <br>
```json
{
"collection": "collection-id",
"data_type": "cog",
"description": "collection description",
"discovery_items":
[
{
"bucket": "veda-data-store-staging",
"datetime_range": "year",
"discovery": "s3",
"filename_regex": "^(.*).tif$",
"prefix": "example-prefix/"
}
],
"is_periodic": true,
"license": "collection-LICENSE",
"time_density": "year",
"title": "collection-title"
}
{template_dag_run_conf}
```
"""

@@ -44,24 +43,6 @@
"tags": ["collection", "discovery"],
}


@task
def extract_discovery_items(**kwargs):
ti = kwargs.get("ti")
discovery_items = ti.dag_run.conf.get("discovery_items")
print(discovery_items)
return discovery_items


@task(max_active_tis_per_dag=3)
def build_stac_task(payload):
from veda_data_pipeline.utils.build_stac.handler import stac_handler
airflow_vars = Variable.get("aws_dags_variables")
airflow_vars_json = json.loads(airflow_vars)
event_bucket = airflow_vars_json.get("EVENT_BUCKET")
return stac_handler(payload_src=payload, bucket_output=event_bucket)


template_dag_run_conf = {
"collection": "<collection-id>",
"data_type": "cog",
@@ -83,19 +64,38 @@ def build_stac_task(payload):
}

with DAG("veda_dataset_pipeline", params=template_dag_run_conf, **dag_args) as dag:
# ECS dependency variable
start = EmptyOperator(task_id="start")
end = EmptyOperator(task_id="end")

start = EmptyOperator(task_id="start", dag=dag)
end = EmptyOperator(task_id="end", dag=dag)

collection_grp = collection_task_group()
discover = discover_from_s3_task.expand(event=extract_discovery_items())
discover.set_upstream(collection_grp) # do not discover until collection exists
get_files = get_dataset_files_to_process(payload=discover)
@task()
def mutate_payload(**kwargs):
ti = kwargs.get("ti")
payload = ti.dag_run.conf.copy()
payloads = list()
if assets := payload.get("assets"):
# remove thumbnail asset if provided in collection config
if "thumbnail" in assets.keys():
assets.pop("thumbnail")
# if thumbnail was only asset, delete assets
if not assets:
payload.pop("assets")
# finally put the mutated assets back in the payload
else:
payload["assets"] = assets
for item in payload.get("discovery_items"):
payloads.append({
**payload,
**item
}
)

return payloads

build_stac = build_stac_task.expand(payload=get_files)
# .output is needed coming from a non-taskflow operator
submit_stac = submit_to_stac_ingestor_task.expand(built_stac=build_stac)

collection_grp.set_upstream(start)
submit_stac.set_downstream(end)
mutated_payloads = start >> collection_task_group() >> mutate_payload()
run_discover_build_and_push = TriggerDagRunOperator.partial(
task_id="trigger_discover_items_dag",
trigger_dag_id="veda_discover",
wait_for_completion=True
).expand(conf=mutated_payloads) >> end
Comment on lines +97 to +101

Contributor:

I get that this promotes reusability at the level of the discover DAG, but I worry that this creates too much of a breakdown in reuse and observability at the task level.

- Rather than passing task-specific data structures as parameters, we end up relying on ti.dag_run.conf. This means that, unless we always build and ingest items through the discover DAG, we cannot rely on the data format being the same for different DAGs with different ingestion strategies. This will eventually bring us back to the same solution we have now, where new DAGs skip using the discover DAG and simply reuse the tasks with additional wrapper steps to manipulate the input. We should try to lean into this - the changes you made to get_files() are a great example, where we promote modularity and reuse at the task level, so that it doesn't matter what the incoming dag_run.conf is.
- Similarly, this condenses task status into a single node in the dataset DAG. Meanwhile, in the triggered discover DAG, there are several expanded steps with unique failure conditions. This means that a failure in one step in one triggered DAG will require a retry of the complete DAG, rather than a single task.
- TriggerDagRunOperator adds a link between DAGs, but not between executions. I was hoping this would work better with expand(), but unfortunately we're out of luck. This (and a few similar issues) is being tracked as a bug in Airflow, but until it's fixed, I think we should steer clear so as to maintain execution-level observability, especially in DAG runs with a large number of mapped discovery tasks.

Contributor Author:

Yes, I agree that traceability will be necessary for debugging any issues. Some have reported that this problem is fixed in Airflow 2.10. However, we still need to promote task reusability to avoid defining the same logic twice. I will close this PR and open a new one with some refactoring to make debugging and development easier.

Contributor Author:

Closing this in favor of #268
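
For context on the task-level reuse argued for above, a hedged sketch of what a new DAG with its own ingestion strategy could look like if it imports the shared tasks directly instead of triggering veda_discover. The module paths and `.expand()` parameter names come from this PR; the DAG id, the custom discovery task, and the local `build_stac_task` (adapted from the code this PR removes) are illustrative assumptions.

```python
# Sketch only: reuse the shared tasks in a hypothetical custom-ingest DAG rather
# than going through TriggerDagRunOperator, so each step stays observable and
# retryable on its own. Assumes the task interfaces shown in this PR.
import pendulum
from airflow import DAG
from airflow.decorators import task
from airflow.models.variable import Variable

from veda_data_pipeline.groups.discover_group import get_files_task
from veda_data_pipeline.groups.processing_tasks import submit_to_stac_ingestor_task


@task
def my_custom_discovery():
    # Hypothetical discovery step: emit the same payload shape the S3 discovery produces
    return {"collection": "example-collection", "payload": ["events/chunk_0.json"]}


@task(max_active_tis_per_dag=3)
def build_stac_task(payload):
    # Adapted from the build task this PR removes from veda_dataset_pipeline.py
    from veda_data_pipeline.utils.build_stac.handler import stac_handler
    airflow_vars = Variable.get("aws_dags_variables", deserialize_json=True)
    return stac_handler(payload_src=payload, bucket_output=airflow_vars.get("EVENT_BUCKET"))


with DAG(
    "example_custom_ingest",  # illustrative DAG id
    start_date=pendulum.datetime(2024, 1, 1, tz="UTC"),
    schedule=None,
    catchup=False,
) as dag:
    files = get_files_task(payload=my_custom_discovery())
    built = build_stac_task.expand(payload=files)
    submit_to_stac_ingestor_task.expand(built_stac=built)
```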
