Merge pull request #5 from LCOGT/feature/implement_median
Feature/implement median
LTDakin authored Mar 21, 2024
2 parents 5b0fd9d + 2cdf3f6 commit c7f8da5
Showing 4 changed files with 122 additions and 1 deletion.
3 changes: 3 additions & 0 deletions .gitignore
@@ -158,3 +158,6 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# vscode
.vscode
72 changes: 71 additions & 1 deletion datalab/datalab_session/data_operations/median.py
@@ -1,4 +1,16 @@
from io import BytesIO
import logging
import os
import tempfile

import numpy as np
from astropy.io import fits

from datalab.datalab_session.data_operations.data_operation import BaseDataOperation
from datalab.datalab_session.util import store_fits_output, get_archive_from_basename

log = logging.getLogger()
log.setLevel(logging.INFO)


class Median(BaseDataOperation):
@@ -31,4 +43,62 @@ def wizard_description():
        }

    def operate(self):
        pass
        input_files = self.input_data.get('input_files', [])
        file_count = len(input_files)

        if file_count == 0:
            return {'output_files': []}

        log.info(f'Executing median operation on {file_count} files')

        with tempfile.TemporaryDirectory() as temp_dir:
            memmap_paths = []

            for index, file_info in enumerate(input_files):
                basename = file_info.get('basename', 'No basename found')
                archive_record = get_archive_from_basename(basename)

                try:
                    fits_url = archive_record[0].get('url', 'No URL found')
                except IndexError:
                    continue

                with fits.open(fits_url, use_fsspec=True) as hdu_list:
                    data = hdu_list['SCI'].data
                    memmap_path = os.path.join(temp_dir, f'memmap_{index}.dat')
                    # Spool each SCI image to a disk-backed memmap so large frames stay out of RAM
                    memmap_array = np.memmap(memmap_path, dtype=data.dtype, mode='w+', shape=data.shape)
                    memmap_array[:] = data[:]
                    memmap_paths.append((memmap_path, data.dtype, data.shape))

                self.set_percent_completion(index / file_count)

            # Reopen each memmap read-only with the dtype and shape it was written with
            image_data_list = [
                np.memmap(path, dtype=dtype, mode='r', shape=shape)
                for path, dtype, shape in memmap_paths
            ]

            # Crop the FITS image data to the smallest common shape, then stack along a new z axis
            min_shape = tuple(min(dims) for dims in zip(*(arr.shape for arr in image_data_list)))
            cropped_data_list = [arr[:min_shape[0], :min_shape[1]] for arr in image_data_list]
            stacked_data = np.stack(cropped_data_list, axis=2)

            # Calculate the median along the z axis
            median = np.median(stacked_data, axis=2)

            cache_key = self.generate_cache_key()
            header = fits.Header([('KEY', cache_key)])
            primary_hdu = fits.PrimaryHDU(header=header)
            image_hdu = fits.ImageHDU(median)
            hdu_list = fits.HDUList([primary_hdu, image_hdu])

            fits_buffer = BytesIO()
            hdu_list.writeto(fits_buffer)
            fits_buffer.seek(0)

            # Write the HDU list to the output FITS file in the operation bucket
            store_fits_output(cache_key, fits_buffer)

        # TODO: No output files yet, need to build a thumbnail service
        output = {'output_files': []}
        self.set_percent_completion(file_count / file_count)
        self.set_output(output)
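
To make the core of operate() easier to follow in isolation, here is a minimal, self-contained sketch of the crop-and-stack median step, using small in-memory NumPy arrays in place of the memmapped FITS data; the array values and shapes are made up purely for illustration.

import numpy as np

# Toy frames with mismatched shapes, standing in for the SCI extensions
frames = [
    np.arange(12, dtype=np.float32).reshape(3, 4),
    np.arange(20, dtype=np.float32).reshape(4, 5),
    np.full((3, 5), 7.0, dtype=np.float32),
]

# Element-wise minimum along each axis gives the largest shape common to every frame
min_shape = tuple(min(dims) for dims in zip(*(f.shape for f in frames)))  # (3, 4)

# Crop each frame to that shape, stack along a new z axis, then collapse with a median
cropped = [f[:min_shape[0], :min_shape[1]] for f in frames]
stacked = np.stack(cropped, axis=2)        # shape (3, 4, 3)
median_image = np.median(stacked, axis=2)  # shape (3, 4)

print(median_image.shape)  # (3, 4)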
43 changes: 43 additions & 0 deletions datalab/datalab_session/util.py
@@ -0,0 +1,43 @@
import requests
import logging

import boto3

from django.conf import settings

log = logging.getLogger()
log.setLevel(logging.INFO)

def store_fits_output(item_key: str, fits_buffer: object) -> object:
    """
    Stores a FITS file in the operation bucket in S3
    Keyword Arguments:
    item_key -- name under which to store the fits file
    fits_buffer -- the fits file to add to the bucket
    """
    log.info(f'Adding {item_key} to {settings.DATALAB_OPERATION_BUCKET}')

    s3 = boto3.resource('s3')
    response = s3.Bucket(settings.DATALAB_OPERATION_BUCKET).put_object(Key=item_key, Body=fits_buffer.getvalue())
    return response

def get_archive_from_basename(basename: str) -> list:
    """
    Queries the Archive for frame records matching a basename and returns the results
    Keyword Arguments:
    basename -- name to query
    """
    query_params = {'basename_exact': basename}

    response = requests.get(settings.ARCHIVE_API + '/frames/', params=query_params)

    try:
        image_data = response.json()
        results = image_data.get('results', None)
    except ValueError:
        # requests raises a ValueError subclass when the response body is not valid JSON
        log.error(f"No image found with specified basename: {basename}")
        raise FileNotFoundError(f"No image found with specified basename: {basename}")

    return results
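
As a quick orientation for how these helpers fit together (the median operation above calls them the same way), here is a short usage sketch; the basename and cache key are placeholder values, and it assumes a configured Django settings module with ARCHIVE_API and DATALAB_OPERATION_BUCKET set, as in datalab/settings.py below.

from io import BytesIO

from astropy.io import fits

from datalab.datalab_session.util import get_archive_from_basename, store_fits_output

# Look up frame records in the archive; 'results' is a list of matches and may be empty
results = get_archive_from_basename('example-frame-basename')  # placeholder basename
fits_url = results[0].get('url') if results else None

# Package an image into an in-memory FITS file and push it to the operation bucket
buffer = BytesIO()
fits.HDUList([fits.PrimaryHDU()]).writeto(buffer)
buffer.seek(0)
store_fits_output('example-cache-key', buffer)  # placeholder cache key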
5 changes: 5 additions & 0 deletions datalab/settings.py
@@ -130,6 +130,11 @@ def get_list_from_env(variable, default=None):
# AdminMiddleware is enabled. The default value is 'default'.
DRAMATIQ_TASKS_DATABASE = 'default'

# AWS S3 Bucket
DATALAB_OPERATION_BUCKET = os.getenv('DATALAB_OPERATION_BUCKET', 'datalab-operation-output-bucket')

# Datalab Archive
ARCHIVE_API = os.getenv('ARCHIVE_API', 'https://datalab-archive.photonranch.org')

# Database
# https://docs.djangoproject.com/en/4.2/ref/settings/#databases
