diff --git a/app/models/orm/migrations/versions/604bf4e66c2b_.py b/app/models/orm/migrations/versions/604bf4e66c2b_.py
new file mode 100644
index 000000000..8eebb37ba
--- /dev/null
+++ b/app/models/orm/migrations/versions/604bf4e66c2b_.py
@@ -0,0 +1,29 @@
+"""Add content_date_description to version_metadata
+
+Revision ID: 604bf4e66c2b
+Revises: ef3392e8e054
+Create Date: 2024-10-31 16:52:56.571782
+
+"""
+from alembic import op
+import sqlalchemy as sa
+import sqlalchemy_utils
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision = '604bf4e66c2b'
+down_revision = 'ef3392e8e054'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column('version_metadata', sa.Column('content_date_description', sa.String(), nullable=True))
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_column('version_metadata', 'content_date_description')
+    # ### end Alembic commands ###
diff --git a/app/models/orm/version_metadata.py b/app/models/orm/version_metadata.py
index f1a09042f..df39cc94e 100644
--- a/app/models/orm/version_metadata.py
+++ b/app/models/orm/version_metadata.py
@@ -10,6 +10,7 @@ class VersionMetadata(Base, MetadataMixin):
     version = db.Column(db.String, nullable=False)
     content_date = db.Column(db.Date)
     content_start_date = db.Column(db.Date)
+    content_date_description = db.Column(db.String)
     content_end_date = db.Column(db.Date)
     last_update = db.Column(db.Date)
     description = db.Column(db.String)
diff --git a/app/models/pydantic/creation_options.py b/app/models/pydantic/creation_options.py
index 02a4a1f9d..961ff9010 100644
--- a/app/models/pydantic/creation_options.py
+++ b/app/models/pydantic/creation_options.py
@@ -115,6 +115,13 @@ class FieldType(StrictBaseModel):
 
 
 class RasterTileSetAssetCreationOptions(StrictBaseModel):
+    unify_projection: bool = Field(
+        False,
+        description=(
+            "First re-project to a common projection (EPSG:4326). Necessary "
+            "when input files are in different projections from each other."
+        )
+    )
     pixel_meaning: str
     data_type: DataType
     nbits: Optional[int]
diff --git a/app/models/pydantic/metadata.py b/app/models/pydantic/metadata.py
index 01f5a7308..63e836d81 100644
--- a/app/models/pydantic/metadata.py
+++ b/app/models/pydantic/metadata.py
@@ -112,7 +112,10 @@ class VersionMetadata(CommonMetadata):
         None,
         description="Date range covered by the content",
     )
-
+    content_date_description: Optional[str] = Field(
+        None,
+        description="Date of content to display",
+    )
     last_update: Optional[date] = Field(
         None,
         description="Date the data were last updated",
@@ -130,6 +133,7 @@ class Config:
                         "start_date": "2000-01-01",  # TODO fix date
                         "end_date": "2021-04-06",
                     },
+                    "content_date_description": "2000 - present",
                 }
             ]
         }
@@ -159,6 +163,10 @@ class VersionMetadataUpdate(VersionMetadataIn):
         None,
         description="Date range covered by the content",
     )
+    content_date_description: Optional[str] = Field(
+        None,
+        description="Date of content to display",
+    )
 
     last_update: Optional[date] = Field(
         None,
diff --git a/app/routes/datasets/queries.py b/app/routes/datasets/queries.py
index 51ebdfe81..70d4110cf 100755
--- a/app/routes/datasets/queries.py
+++ b/app/routes/datasets/queries.py
@@ -507,7 +507,7 @@ async def _query_table(
     sql: str,
     geometry: Optional[Geometry],
 ) -> List[Dict[str, Any]]:
-    # parse and validate SQL statement
+    # Parse and validate SQL statement
     try:
         parsed = parse_sql(unquote(sql))
     except ParseError as e:
diff --git a/app/routes/datasets/versions.py b/app/routes/datasets/versions.py
index d25175b5e..50449b7f2 100644
--- a/app/routes/datasets/versions.py
+++ b/app/routes/datasets/versions.py
@@ -177,6 +177,7 @@ async def update_version(
             AssetType.dynamic_vector_tile_cache,
             AssetType.static_vector_tile_cache,
             AssetType.raster_tile_cache,
+            AssetType.cog,
         ],
     )
 
diff --git a/app/tasks/raster_tile_set_assets/raster_tile_set_assets.py b/app/tasks/raster_tile_set_assets/raster_tile_set_assets.py
index 102475a03..795896757 100644
--- a/app/tasks/raster_tile_set_assets/raster_tile_set_assets.py
+++ b/app/tasks/raster_tile_set_assets/raster_tile_set_assets.py
@@ -18,9 +18,10 @@
 from app.models.pydantic.geostore import FeatureCollection
 from app.models.pydantic.jobs import Job
 from app.models.pydantic.statistics import BandStats, Histogram, RasterStats
+from app.settings.globals import DATA_LAKE_BUCKET
 from app.tasks import Callback, callback_constructor
 from app.tasks.batch import execute
-from app.tasks.raster_tile_set_assets.utils import create_pixetl_job
+from app.tasks.raster_tile_set_assets.utils import create_pixetl_job, create_unify_projection_job
 from app.utils.aws import get_s3_client
 from app.utils.path import (
     get_asset_uri,
@@ -67,13 +68,42 @@ async def raster_tile_set_asset(
 
     creation_options = PixETLCreationOptions(**co)
 
+    jobs: List[Job] = list()
     callback: Callback = callback_constructor(asset_id)
 
-    create_raster_tile_set_job: Job = await create_pixetl_job(
-        dataset, version, creation_options, "create_raster_tile_set", callback
+    unify_job: Optional[Job] = None
+    if creation_options.unify_projection:
+        target_crs = "epsg:4326"
+        new_src_uris = list()
+        for i, _ in enumerate(creation_options.source_uri):
+            new_src_uris.append(
+                f"s3://{DATA_LAKE_BUCKET}/{dataset}/{version}/raster/"
+                f"{target_crs.replace(':', '-')}/reprojected/SRC_{i}"
+            )
+        target_prefix = new_src_uris[0].rsplit("/", 1)[0]
+        unify_job = await create_unify_projection_job(
+            dataset,
+            creation_options.source_uri,
+            target_prefix,
+            target_crs,
+            "unify_projection",
+            callback
+        )
+        jobs.append(unify_job)
+        creation_options.source_uri = new_src_uris
+
+    jobs.append(
+        await create_pixetl_job(
+            dataset,
+            version,
+            creation_options,
+            "create_raster_tile_set",
+            callback,
+            [unify_job] if unify_job is not None else None,
+        )
     )
 
-    log: ChangeLog = await execute([create_raster_tile_set_job])
+    log: ChangeLog = await execute(jobs)
 
     return log
 
diff --git a/app/tasks/raster_tile_set_assets/utils.py b/app/tasks/raster_tile_set_assets/utils.py
index 94075b501..b6b3069f0 100644
--- a/app/tasks/raster_tile_set_assets/utils.py
+++ b/app/tasks/raster_tile_set_assets/utils.py
@@ -7,7 +7,7 @@
 from app.models.enum.assets import AssetType
 from app.models.enum.pixetl import ResamplingMethod
 from app.models.pydantic.creation_options import PixETLCreationOptions
-from app.models.pydantic.jobs import GDALDEMJob, Job, PixETLJob
+from app.models.pydantic.jobs import GDAL2TilesJob, GDALDEMJob, Job, PixETLJob
 from app.settings.globals import (
     AWS_GCS_KEY_SECRET_ARN,
     DEFAULT_JOB_DURATION,
@@ -225,3 +225,38 @@ async def create_resample_job(
         parents=[parent.job_name for parent in parents] if parents else None,
         **kwargs,
     )
+
+async def create_unify_projection_job(
+    dataset: str,
+    old_source_uris: List[str],
+    target_prefix: str,
+    target_crs: str,
+    job_name: str,
+    callback: Callback
+) -> GDAL2TilesJob:
+    """Create a Batch job that takes all files indicated in old_source_uris,
+    re-projects each to a common CRS, and places them in a mirror of the
+    original directory structure under the target_prefix, divided by source
+    number. More specifically, the files from the first source URI will be
+    put under <target_prefix>/SRC_0, the files from the second under
+    <target_prefix>/SRC_1, and so on.
+    """
+
+    command = [
+        "unify_projection.sh",
+        "--target_crs",
+        target_crs,
+    ]
+
+    for s in old_source_uris:
+        command.extend(["--source", s])
+
+    command.extend(["--target", target_prefix])
+
+    return GDAL2TilesJob(
+        dataset=dataset,
+        job_name=job_name,
+        command=command,
+        environment=JOB_ENV,
+        callback=callback,
+    )
diff --git a/batch/gdal-python.dockerfile b/batch/gdal-python.dockerfile
index 773b430b8..c2d4a5986 100644
--- a/batch/gdal-python.dockerfile
+++ b/batch/gdal-python.dockerfile
@@ -1,4 +1,4 @@
-FROM globalforestwatch/data-api-gdal:v1.2.1
+FROM globalforestwatch/data-api-gdal:v1.2.2
 
 # Copy scripts
 COPY ./batch/scripts/ /opt/scripts/
diff --git a/batch/pixetl.dockerfile b/batch/pixetl.dockerfile
index 6c7f44fc3..a468ac9aa 100644
--- a/batch/pixetl.dockerfile
+++ b/batch/pixetl.dockerfile
@@ -1,4 +1,4 @@
-FROM globalforestwatch/pixetl:v1.7.7
+FROM globalforestwatch/pixetl:v1.7.7_test_parallel
 
 # Copy scripts
 COPY ./batch/scripts/ /opt/scripts/
@@ -18,4 +18,4 @@ WORKDIR /tmp
 ENV LC_ALL=C.UTF-8
 ENV LANG=C.UTF-8
 
-ENTRYPOINT ["/opt/scripts/report_status.sh"]
\ No newline at end of file
+ENTRYPOINT ["/opt/scripts/report_status.sh"]
diff --git a/batch/python/export_to_gee.py b/batch/python/export_to_gee.py
index 235fa0d64..302839a6f 100644
--- a/batch/python/export_to_gee.py
+++ b/batch/python/export_to_gee.py
@@ -48,10 +48,7 @@ def upload_cog_to_gcs(dataset, implementation):
     return f"gs://{GCS_BUCKET}/{dataset}/{implementation}.tif"
 
 
-def create_cog_backed_asset(dataset, implementation, gcs_path, service_account):
-    credentials = ee.ServiceAccountCredentials(service_account, GCS_CREDENTIALS_FILE)
-    ee.Initialize(credentials)
-
+def create_cog_backed_asset(dataset, implementation, gcs_path, credentials):
     # delete any existing asset with the same dataset/implementation
     try:
ee.data.deleteAsset(f"projects/{EE_PROJECT}/assets/{dataset}/{implementation}") @@ -84,6 +81,38 @@ def create_cog_backed_asset(dataset, implementation, gcs_path, service_account): f"GEE returned unexpected status code {response.status_code} with payload {response.content}" ) + return asset_id + + +def ingest_in_gee(dataset, implementation, gcs_path): + """Ingest directly into GEE as a best effort task.""" + # delete any existing asset with the same dataset/implementation + try: + ee.data.deleteAsset(f"projects/{EE_PROJECT}/assets/{dataset}/{implementation}") + except ee.EEException: + # asset doesn't exist + pass + + # create dataset folder if it doesn't exist + try: + ee.data.createAsset( + {"type": "Folder"}, f"projects/{EE_PROJECT}/assets/{dataset}" + ) + except ee.EEException: + # folder already exists + pass + + asset_id = f"{dataset}/{implementation}" + request_id = ee.data.newTaskId()[0] + params = { + "name": f"projects/{EE_PROJECT}/assets/{asset_id}", + "tilesets": [{"sources": [{"uris": [gcs_path]}]}], + } + ee.data.startIngestion(request_id=request_id, params=params) + return asset_id + + +def set_acl_to_anyone_read(asset_id): # update ACL to be public full_asset_id = f"projects/{EE_PROJECT}/assets/{asset_id}" acl = ee.data.getAssetAcl(full_asset_id) @@ -96,8 +125,13 @@ def export_to_gee( implementation: str = Option(..., help="Implementation name."), ): service_account = set_google_application_credentials() + + # initialize GEE + credentials = ee.ServiceAccountCredentials(service_account, GCS_CREDENTIALS_FILE) + ee.Initialize(credentials) + gcs_path = upload_cog_to_gcs(dataset, implementation) - create_cog_backed_asset(dataset, implementation, gcs_path, service_account) + ingest_in_gee(dataset, implementation, gcs_path) if __name__ == "__main__": diff --git a/batch/scripts/_tiff_crosses_dateline.sh b/batch/scripts/_tiff_crosses_dateline.sh new file mode 100755 index 000000000..f13405199 --- /dev/null +++ b/batch/scripts/_tiff_crosses_dateline.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# +# USAGE: _tiff_crosses_dateline.sh raster_file +# +# Prints the string "true" if the input raster will cross the dateline +# when converting to EPSG:4326, "false" otherwise +# +# Needs GDAL 2.0+ and Python +# +# Credit: Slightly modified from https://gis.stackexchange.com/a/222341 + + +if [ -z "${1}" ]; then + echo -e "Error: No input raster file given.\n> USAGE: _tiff_crosses_dateline.sh raster_file" + exit 1 +fi + +# Get raster info, save it to a variable as we need it several times +gdalinfo=$(gdalinfo "${1}" -json) + +# Exit if -json switch is not available +if [ ! 
-z $(echo $gdalinfo | grep "^Usage:") ]; then + echo -e "Error: GDAL command failed, Version 2.0+ is needed" + exit 1 +fi + +function jsonq { + echo "${1}" | python -c "import json,sys; jdata = sys.stdin.read(); data = json.loads(jdata); print(data${2});" +} + +ulx=$(jsonq "$gdalinfo" "['wgs84Extent']['coordinates'][0][0][0]") +llx=$(jsonq "$gdalinfo" "['wgs84Extent']['coordinates'][0][1][0]") +lrx=$(jsonq "$gdalinfo" "['wgs84Extent']['coordinates'][0][3][0]") +urx=$(jsonq "$gdalinfo" "['wgs84Extent']['coordinates'][0][2][0]") + +crossing_dateline=false +test $(python -c "print(${ulx}>${lrx})") = True && crossing_dateline=true +test $(python -c "print(${llx}>${urx})") = True && crossing_dateline=true + +echo -n "${crossing_dateline}" \ No newline at end of file diff --git a/batch/scripts/_warp_and_upload.sh b/batch/scripts/_warp_and_upload.sh new file mode 100644 index 000000000..fd7295aec --- /dev/null +++ b/batch/scripts/_warp_and_upload.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +set -e + +# arguments: +# $0 - The name of this script +# $1 - local_src_file +# $2 - local_warped_file +# $3 - target_crs +# $4 - remote target file + +if aws s3 ls "$4"; then + echo "Remote target file $4 already exists, skipping..." + exit 0 +fi + +warp_options=("-co" "COMPRESS=DEFLATE" "-co" "TILED=yes") + +echo "Seeing if TIFF crosses the dateline" +crosses="$(_tiff_crosses_dateline.sh $1)" +if [ "${crosses}" = "true" ]; then + echo "$1 crosses the dateline" + warp_options+=("--config" "CENTER_LONG" "180") +else + echo "$1 does not cross the dateline" +fi + +echo "Now warping $1 to $2" +gdalwarp "$1" "$2" -t_srs "$3" "${warp_options[@]}" +echo "Done warping $1 to $2" + +echo "Now uploading $2 to $4" +aws s3 cp --no-progress "$2" "$4" +echo "Done uploading $2 to $4" + +echo "Finally, deleting local files $1 and $2" +rm "$1" "$2" +echo "Done deleting local files $1 and $2" diff --git a/batch/scripts/get_arguments.sh b/batch/scripts/get_arguments.sh index 067bd24d8..216cc68a5 100755 --- a/batch/scripts/get_arguments.sh +++ b/batch/scripts/get_arguments.sh @@ -201,6 +201,11 @@ do shift # past argument shift # past value ;; + --target_crs) + TARGET_CRS="$2" + shift # past argument + shift # past value + ;; --target_bucket) TARGET_BUCKET="$2" shift # past argument diff --git a/batch/scripts/resample.sh b/batch/scripts/resample.sh index 7ecdc0557..e27596de6 100644 --- a/batch/scripts/resample.sh +++ b/batch/scripts/resample.sh @@ -6,7 +6,7 @@ set -e # -d | --dataset # -v | --version # -s | --source -# -r | --resampling_method) +# -r | --resampling_method # --zoom_level # -T | --target diff --git a/batch/scripts/unify_projection.sh b/batch/scripts/unify_projection.sh new file mode 100644 index 000000000..89c29dbbf --- /dev/null +++ b/batch/scripts/unify_projection.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +set -e + +# requires arguments +# -s | --source +# -T | --target +# --target_crs + +ME=$(basename "$0") +. get_arguments.sh "$@" + +echo "Reproject to a common CRS" + +src_count=0 +CMD_ARGS=() + +for s in "${SRC[@]}"; do + source_dir="SRC_${src_count}" + mkdir -p "$source_dir" + + echo "Now recursively downloading $s to $source_dir" + if [[ $s == gs://* ]]; then + gsutil -m cp -r "$s" "$source_dir" + elif [[ $s == s3://* ]]; then + aws s3 cp --recursive --no-progress "$s" "$source_dir" + fi + echo "Done downloading $s to $source_dir" + + reprojected_dir="REPROJECTED_${src_count}" + mkdir -p "$reprojected_dir" + + cd "${source_dir}" + for d in $(find . 
-type d | sed 's/.\///'); do + mkdir -p "../${reprojected_dir}/${d}" + done + + for f in $(find . -iname "*.tif"| sed 's/.\///'); do + local_src_file="${source_dir}/${f}" + local_warped_file="${reprojected_dir}/${f}" + remote_target_file="${TARGET}/SRC_${src_count}/${f}" + + CMD_ARGS+=("${local_src_file}" "${local_warped_file}" "${TARGET_CRS}" "${remote_target_file}") + done + cd .. + + src_count=$(($src_count+1)) +done + +echo "${CMD_ARGS[@]}" | xargs -n 4 -P 32 _warp_and_upload.sh diff --git a/tests/utils.py b/tests/utils.py index a9a4075a0..026fbf2ec 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -31,6 +31,7 @@ version_metadata = { "content_date_range": {"start_date": "2000-01-01", "end_date": "2021-01-01"}, + "content_date_description": "2000 - present", "last_update": "2020-01-03", "spatial_resolution": 10, "resolution_description": "10 meters", diff --git a/tests_v2/fixtures/metadata/version.py b/tests_v2/fixtures/metadata/version.py index 63a81369c..07f442d2c 100644 --- a/tests_v2/fixtures/metadata/version.py +++ b/tests_v2/fixtures/metadata/version.py @@ -1,5 +1,6 @@ VERSION_METADATA = { "content_date_range": {"start_date": "2000-01-01", "end_date": "2021-01-01"}, + "content_date_description": "2000 - present", "last_update": "2020-01-03", "spatial_resolution": 10, "resolution_description": "10 meters",
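
For reference, create_unify_projection_job() assembles the Batch command from --target_crs,
one --source flag per input URI, and --target, so the container should end up invoking the
new script roughly as follows; the bucket, dataset, and prefix values below are illustrative
only:

    unify_projection.sh \
      --target_crs epsg:4326 \
      --source s3://<bucket>/<dataset>/<version>/raw/source_a/ \
      --source s3://<bucket>/<dataset>/<version>/raw/source_b/ \
      --target s3://<bucket>/<dataset>/<version>/raster/epsg-4326/reprojected

The repeated --source values are exposed to unify_projection.sh as the SRC array (and
--target_crs as TARGET_CRS via the new get_arguments.sh case), after which each downloaded
GeoTIFF is re-projected and uploaded by _warp_and_upload.sh (four arguments per call, up to
32 calls in parallel via xargs) under <target>/SRC_<n>, matching the prefixes the pixetl job
is pointed at.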