Skip to content

Commit

Permalink
Merge unify_projection and CF cache changes to master
Browse files Browse the repository at this point in the history
Merge Daniel's unify_projection change (with a type fix) and
Solomon's change to clear the CloudFront cache for COGs.
  • Loading branch information
danscales committed Nov 25, 2024
2 parents 8e8d2a4 + f4f0929 commit 9f57856
Show file tree
Hide file tree
Showing 10 changed files with 213 additions and 7 deletions.
7 changes: 7 additions & 0 deletions app/models/pydantic/creation_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,13 @@ class FieldType(StrictBaseModel):


class RasterTileSetAssetCreationOptions(StrictBaseModel):
unify_projection: bool = Field(
False,
description=(
"First re-project to a common projection (EPSG:4326). Necessary "
"when input files are in different projections from each other."
)
)
pixel_meaning: str
data_type: DataType
nbits: Optional[int]
Expand Down
1 change: 1 addition & 0 deletions app/routes/datasets/versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ async def update_version(
AssetType.dynamic_vector_tile_cache,
AssetType.static_vector_tile_cache,
AssetType.raster_tile_cache,
AssetType.cog,
],
)

Expand Down
38 changes: 34 additions & 4 deletions app/tasks/raster_tile_set_assets/raster_tile_set_assets.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@
from app.models.pydantic.geostore import FeatureCollection
from app.models.pydantic.jobs import Job
from app.models.pydantic.statistics import BandStats, Histogram, RasterStats
from app.settings.globals import DATA_LAKE_BUCKET
from app.tasks import Callback, callback_constructor
from app.tasks.batch import execute
from app.tasks.raster_tile_set_assets.utils import create_pixetl_job
from app.tasks.raster_tile_set_assets.utils import create_pixetl_job, create_unify_projection_job
from app.utils.aws import get_s3_client
from app.utils.path import (
get_asset_uri,
Expand Down Expand Up @@ -67,13 +68,42 @@ async def raster_tile_set_asset(

creation_options = PixETLCreationOptions(**co)

jobs: List[Job] = list()
callback: Callback = callback_constructor(asset_id)

create_raster_tile_set_job: Job = await create_pixetl_job(
dataset, version, creation_options, "create_raster_tile_set", callback
unify_job: Optional[Job] = None
if creation_options.unify_projection:
target_crs = "epsg:4326"
new_src_uris = list()
for i,_ in enumerate(creation_options.source_uri):
new_src_uris.append(
f"s3://{DATA_LAKE_BUCKET}/{dataset}/{version}/raster/"
f"{target_crs.replace(':', '-')}/reprojected/SRC_{i}"
)
target_prefix = new_src_uris[0].rsplit("/", 1)[0]
unify_job = await create_unify_projection_job(
dataset,
creation_options.source_uri,
target_prefix,
target_crs,
"unify_projection",
callback
)
jobs.append(unify_job)
creation_options.source_uri = new_src_uris

jobs.append(
await create_pixetl_job(
dataset,
version,
creation_options,
"create_raster_tile_set",
callback,
[unify_job] if unify_job is not None else None,
)
)

log: ChangeLog = await execute([create_raster_tile_set_job])
log: ChangeLog = await execute(jobs)

return log

Expand Down
37 changes: 36 additions & 1 deletion app/tasks/raster_tile_set_assets/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from app.models.enum.assets import AssetType
from app.models.enum.pixetl import ResamplingMethod
from app.models.pydantic.creation_options import PixETLCreationOptions
from app.models.pydantic.jobs import GDALDEMJob, Job, PixETLJob
from app.models.pydantic.jobs import GDALDEMJob, Job, PixETLJob, GDAL2TilesJob
from app.settings.globals import (
AWS_GCS_KEY_SECRET_ARN,
DEFAULT_JOB_DURATION,
Expand Down Expand Up @@ -225,3 +225,38 @@ async def create_resample_job(
parents=[parent.job_name for parent in parents] if parents else None,
**kwargs,
)

async def create_unify_projection_job(
    dataset: str,
    old_source_uris: List[str],
    target_prefix: str,
    target_crs: str,
    job_name: str,
    callback: Callback
) -> GDAL2TilesJob:
    """Create a Batch job that re-projects all sources to a common CRS.

    Every file under each URI in old_source_uris is re-projected to
    target_crs and written to a mirror of the original directory layout
    under target_prefix, partitioned by source number: files from the
    first source URI land under <target_prefix>/SRC_0, files from the
    second under <target_prefix>/SRC_1, and so on.
    """
    # Assemble the unify_projection.sh invocation: one --source flag per
    # input URI, plus the shared target CRS and destination prefix.
    command: List[str] = ["unify_projection.sh", "--target_crs", target_crs]
    for uri in old_source_uris:
        command += ["--source", uri]
    command += ["--target", target_prefix]

    return GDAL2TilesJob(
        dataset=dataset,
        job_name=job_name,
        command=command,
        environment=JOB_ENV,
        callback=callback,
    )
2 changes: 1 addition & 1 deletion batch/gdal-python.dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM globalforestwatch/data-api-gdal:v1.2.1
FROM globalforestwatch/data-api-gdal:v1.2.2

# Copy scripts
COPY ./batch/scripts/ /opt/scripts/
Expand Down
40 changes: 40 additions & 0 deletions batch/scripts/_tiff_crosses_dateline.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/bin/bash
#
# USAGE: _tiff_crosses_dateline.sh raster_file
#
# Prints the string "true" if the input raster will cross the dateline
# when converting to EPSG:4326, "false" otherwise
#
# Needs GDAL 2.0+ and Python
#
# Credit: Slightly modified from https://gis.stackexchange.com/a/222341


if [ -z "${1}" ]; then
  echo -e "Error: No input raster file given.\n> USAGE: _tiff_crosses_dateline.sh raster_file"
  exit 1
fi

# Get raster info, save it to a variable as we need it several times
gdalinfo=$(gdalinfo "${1}" -json)

# Exit if the -json switch is not available (GDAL < 2.0 prints its usage
# text instead). Use grep -q rather than testing an unquoted command
# substitution: `[ ! -z $(...) ]` word-splits multi-line grep output and
# makes the test fail with a syntax error.
if echo "${gdalinfo}" | grep -q "^Usage:"; then
  echo -e "Error: GDAL command failed, Version 2.0+ is needed"
  exit 1
fi

# Extract one value from the saved JSON with a Python one-liner.
# $1 = JSON string, $2 = subscript expression, e.g. "['wgs84Extent']..."
function jsonq {
  echo "${1}" | python -c "import json,sys; jdata = sys.stdin.read(); data = json.loads(jdata); print(data${2});"
}

# Corner longitudes of the raster's WGS84 footprint ring
ulx=$(jsonq "$gdalinfo" "['wgs84Extent']['coordinates'][0][0][0]")
llx=$(jsonq "$gdalinfo" "['wgs84Extent']['coordinates'][0][1][0]")
lrx=$(jsonq "$gdalinfo" "['wgs84Extent']['coordinates'][0][3][0]")
urx=$(jsonq "$gdalinfo" "['wgs84Extent']['coordinates'][0][2][0]")

# If a west-side longitude compares greater than its east-side counterpart,
# the footprint wrapped around the antimeridian.
crossing_dateline=false
test "$(python -c "print(${ulx}>${lrx})")" = True && crossing_dateline=true
test "$(python -c "print(${llx}>${urx})")" = True && crossing_dateline=true

echo -n "${crossing_dateline}"
38 changes: 38 additions & 0 deletions batch/scripts/_warp_and_upload.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/bin/bash

set -e

# Re-project one raster to a target CRS and upload the result to S3.
# Skips files whose remote target already exists, so re-runs are cheap.
#
# arguments:
# $0 - The name of this script
# $1 - local_src_file
# $2 - local_warped_file
# $3 - target_crs
# $4 - remote target file

if aws s3 ls "$4"; then
  echo "Remote target file $4 already exists, skipping..."
  exit 0
fi

warp_options=("-co" "COMPRESS=DEFLATE" "-co" "TILED=yes")

echo "Seeing if TIFF crosses the dateline"
# Quote "$1" so source paths containing whitespace survive the helper call
crosses="$(_tiff_crosses_dateline.sh "$1")"
if [ "${crosses}" = "true" ]; then
  echo "$1 crosses the dateline"
  # Center the warp on longitude 180 so the output isn't split into two
  # world-wide halves when the footprint wraps the antimeridian
  warp_options+=("--config" "CENTER_LONG" "180")
else
  echo "$1 does not cross the dateline"
fi

echo "Now warping $1 to $2"
gdalwarp "$1" "$2" -t_srs "$3" "${warp_options[@]}"
echo "Done warping $1 to $2"

echo "Now uploading $2 to $4"
aws s3 cp --no-progress "$2" "$4"
echo "Done uploading $2 to $4"

# Free local disk: batch workers process many tiles per run
echo "Finally, deleting local files $1 and $2"
rm "$1" "$2"
echo "Done deleting local files $1 and $2"
5 changes: 5 additions & 0 deletions batch/scripts/get_arguments.sh
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,11 @@ do
shift # past argument
shift # past value
;;
--target_crs)
TARGET_CRS="$2"
shift # past argument
shift # past value
;;
--target_bucket)
TARGET_BUCKET="$2"
shift # past argument
Expand Down
2 changes: 1 addition & 1 deletion batch/scripts/resample.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ set -e
# -d | --dataset
# -v | --version
# -s | --source
# -r | --resampling_method)
# -r | --resampling_method
# --zoom_level
# -T | --target

Expand Down
50 changes: 50 additions & 0 deletions batch/scripts/unify_projection.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/bin/bash

set -e

# requires arguments
# -s | --source
# -T | --target
# --target_crs
#
# Downloads every source recursively, re-projects each GeoTIFF to
# TARGET_CRS, and uploads the results under ${TARGET}/SRC_<n>/, mirroring
# the directory layout of source number <n>.

ME=$(basename "$0")
. get_arguments.sh "$@"

echo "Reproject to a common CRS"

src_count=0
CMD_ARGS=()

for s in "${SRC[@]}"; do
  source_dir="SRC_${src_count}"
  mkdir -p "$source_dir"

  echo "Now recursively downloading $s to $source_dir"
  if [[ $s == gs://* ]]; then
    gsutil -m cp -r "$s" "$source_dir"
  elif [[ $s == s3://* ]]; then
    aws s3 cp --recursive --no-progress "$s" "$source_dir"
  fi
  echo "Done downloading $s to $source_dir"

  reprojected_dir="REPROJECTED_${src_count}"
  mkdir -p "$reprojected_dir"

  # Mirror the source's directory tree under the reprojected dir
  cd "${source_dir}"
  for d in $(find . -type d | sed 's/.\///'); do
    mkdir -p "../${reprojected_dir}/${d}"
  done

  # Queue one (src, dst, crs, remote) quadruple per TIFF. Paths are
  # prefixed with the dir names so they resolve after the `cd ..` below.
  # NOTE(review): assumes no whitespace in file names — the xargs pipeline
  # at the end splits purely on whitespace.
  for f in $(find . -iname "*.tif" | sed 's/.\///'); do
    local_src_file="${source_dir}/${f}"
    local_warped_file="${reprojected_dir}/${f}"
    remote_target_file="${TARGET}/SRC_${src_count}/${f}"

    CMD_ARGS+=("${local_src_file}" "${local_warped_file}" "${TARGET_CRS}" "${remote_target_file}")
  done
  cd ..

  src_count=$((src_count + 1))
done

# printf emits exactly one array element per line (echo would flatten the
# quoting and interpret backslash-prone content); xargs regroups them into
# quadruples and warps up to 32 files in parallel.
printf '%s\n' "${CMD_ARGS[@]}" | xargs -n 4 -P 32 _warp_and_upload.sh

0 comments on commit 9f57856

Please sign in to comment.