Skip to content

Commit

Permalink
refactor codebase for new euclid workflow usage with tests (#180)
Browse files Browse the repository at this point in the history
* refactor codebase for new euclid workflow usage with tests

* add base_extractor to galaxyzoo module, use class label keys and prefixes for extractor tests, remove error on baseclass

* remove shared class implementation
  • Loading branch information
Tooyosi authored Nov 11, 2024
1 parent 6ae2f7e commit 0b691cc
Show file tree
Hide file tree
Showing 24 changed files with 366 additions and 284 deletions.
6 changes: 3 additions & 3 deletions app/modules/label_extractors/finder.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,13 @@ class UnknownTaskKey < StandardError; end

# hard code Galaxy Zoo for now as these will fail due to missing constant lookup
# long term we can add these back in and make the lookup dynamic
EXTRACTOR_SCHEMA_CLASS_REGEX = /\Agalaxy_zoo_(decals|cosmic_dawn)_(.+)\z/.freeze
EXTRACTOR_SCHEMA_CLASS_REGEX = /\A(galaxy_zoo)_(decals|cosmic_dawn|euclid)_(.+)\z/.freeze

def self.extractor_instance(task_schema_lookup_key)
# simulate a regex lookup failure with the || [nil, task_schema_lookup_key] as it'll raise a NameError when trying to constantize
schema_name_and_task = EXTRACTOR_SCHEMA_CLASS_REGEX.match(task_schema_lookup_key) || [nil, task_schema_lookup_key]
schema_klass = "LabelExtractors::GalaxyZoo::#{schema_name_and_task[1].camelize}".constantize
task_key = schema_name_and_task[2].upcase
schema_klass = "LabelExtractors::#{schema_name_and_task[1].camelize}::#{schema_name_and_task[2].camelize}".constantize
task_key = schema_name_and_task[3].upcase
schema_klass.new(task_key)
rescue NameError => _e
raise UnknownExtractor, "no extractor class found for '#{schema_name_and_task[1]}'"
Expand Down
69 changes: 69 additions & 0 deletions app/modules/label_extractors/galaxy_zoo/base_extractor.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# frozen_string_literal: true

module LabelExtractors
  module GalaxyZoo
    # Shared behaviour for the Galaxy Zoo label extractors.
    #
    # This class reads the following from the concrete (sub)class it is invoked on:
    #   * TASK_KEY_LABEL_PREFIXES - task key => question label prefix
    #   * TASK_KEY_DATA_LABELS    - task key => { answer key => answer label }
    #   * .data_release_suffix    - suffix placed between the prefix and the answer label
    class BaseExtractor
      attr_reader :task_lookup_key, :task_prefix_label

      class << self
        # task key => question label prefix table of the receiving class
        def label_prefixes
          self::TASK_KEY_LABEL_PREFIXES
        end

        # task key => answer label table of the receiving class
        def data_labels
          self::TASK_KEY_DATA_LABELS
        end

        # provide a flat list of every "<question>-<suffix>_<answer>" label,
        # walking the questions in label_prefixes order
        def question_answers_schema
          label_prefixes.flat_map do |task_key, question_prefix|
            data_labels[task_key].values.map do |answer_suffix|
              "#{question_prefix}-#{data_release_suffix}_#{answer_suffix}"
            end
          end
        end
      end

      # the label prefix is resolved eagerly, so an unknown task key
      # raises UnknownTaskKey at construction time
      def initialize(task_lookup_key)
        @task_lookup_key = task_lookup_key
        @task_prefix_label = task_prefix
      end

      # convert the keys of the reduction data payload hash
      # into the label names of the workflow question task
      #
      # e.g. workflow type (GZ) are question type 'decision tree' tasks
      # looking at the 'T0' task it correlates to 3 exclusive answers:
      # 0 (smooth)
      # 1 (features or disk)
      # 2 (star or artifact)
      #
      # each key is combined with the label prefix used to identify the correlated task name for Zoobot
      # and raises UnknownLabelKey when a payload key has no label for this task
      def extract(data_hash)
        # the label key is used for column headers in the derived training catalogues
        # note the hyphen and underscore formatting, see Zoobot label schema for more details
        data_hash.transform_keys do |answer_key|
          "#{task_prefix_label}-#{data_release_suffix}_#{data_payload_label(answer_key)}"
        end
      end

      private

      def task_prefix
        self.class::TASK_KEY_LABEL_PREFIXES[task_lookup_key] ||
          raise(UnknownTaskKey, "key not found: #{task_lookup_key}")
      end

      def data_payload_label(key)
        self.class::TASK_KEY_DATA_LABELS.dig(task_lookup_key, key) ||
          raise(UnknownLabelKey, "key not found: #{key}")
      end

      # instance level shortcut to the class level suffix
      def data_release_suffix
        self.class.data_release_suffix
      end
    end
  end
end
59 changes: 3 additions & 56 deletions app/modules/label_extractors/galaxy_zoo/cosmic_dawn.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@

module LabelExtractors
module GalaxyZoo
class CosmicDawn

attr_reader :task_lookup_key, :task_prefix_label
class CosmicDawn < BaseExtractor

# GZ decision tree task schema and label tables
#
Expand Down Expand Up @@ -111,60 +109,9 @@ class CosmicDawn

DATA_RELEASE_SUFFIX = 'cd'

def self.label_prefixes
TASK_KEY_LABEL_PREFIXES
end

def self.data_labels
TASK_KEY_DATA_LABELS
end

# provide a flat task question and answers list for the decals mission catalogues
def self.question_answers_schema
label_prefixes.map do |task_key, question_prefix|
data_labels[task_key].values.map do |answer_suffix|
"#{question_prefix}-#{DATA_RELEASE_SUFFIX}_#{answer_suffix}"
end
end.flatten
end

def initialize(task_lookup_key)
@task_lookup_key = task_lookup_key
@task_prefix_label = task_prefix
end

# extract the keys from the reduction data payload hash
# and convert the keys to the workflow question tasks
#
# e.g. workflow type (GZ) are question type 'decision tree' tasks
# looking at the 'T0' task it correlates to 3 exclusive answers:
# 0 (smooth)
# 1 (features or disk)
# 2 (problem)
#
# then combined with the label prefix used to identify the correlated task name for Zoobot
def extract(data_hash)
data_hash.transform_keys do |key|
# create the lable key used for column headers in the derived training catalogues
# note the hyphen and underscore formatting, see Zoobot label schema for more details
"#{task_prefix_label}-#{DATA_RELEASE_SUFFIX}_#{data_payload_label(key)}"
end
end

private

def task_prefix
prefix = TASK_KEY_LABEL_PREFIXES[task_lookup_key]
raise UnknownTaskKey, "key not found: #{task_lookup_key}" unless prefix

prefix
end

def data_payload_label(key)
label = TASK_KEY_DATA_LABELS.dig(task_lookup_key, key)
raise UnknownLabelKey, "key not found: #{key}" unless label

label
# class level accessor for this extractor's DATA_RELEASE_SUFFIX constant,
# BaseExtractor places this suffix between the question prefix and the answer label
def self.data_release_suffix
DATA_RELEASE_SUFFIX
end
end
end
Expand Down
42 changes: 3 additions & 39 deletions app/modules/label_extractors/galaxy_zoo/decals.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

module LabelExtractors
module GalaxyZoo
class Decals
class Decals < BaseExtractor

attr_reader :task_lookup_key, :task_prefix_label, :data_release_suffix

Expand Down Expand Up @@ -95,13 +95,6 @@ class Decals
# longer term we may include gz2 , e.g. -dr12 (gz 1 & 2)
CATALOG_DATA_RELEASE_SUFFIXES = %w[dr5 dr8]

def self.label_prefixes
TASK_KEY_LABEL_PREFIXES
end

def self.data_labels
TASK_KEY_DATA_LABELS
end

# provide a flat task question and answers list for the decals mission catalogues
def self.question_answers_schema
Expand All @@ -120,38 +113,9 @@ def initialize(task_lookup_key, data_release_suffix = DEFAULT_DATA_RELEASE_SUFFI
@data_release_suffix = data_release_suffix
end

# extract the keys from the reduction data payload hash
# and convert the keys to the workflow question tasks
#
# e.g. workflow type (GZ) are question type 'decision tree' tasks
# looking at the 'T0' task it correlates to 3 exclusive answers:
# 0 (smooth)
# 1 (features or disk)
# 2 (star or artifact)
#
# then combined with the label prefix used to identify the correlated task name for Zoobot
def extract(data_hash)
data_hash.transform_keys do |key|
# create the lable key used for column headers in the derived training catalogues
# note the hyphen and underscore formatting, see Zoobot lable schema for more details
"#{task_prefix_label}-#{data_release_suffix}_#{data_payload_label(key)}"
end
end

private

def task_prefix
prefix = TASK_KEY_LABEL_PREFIXES[task_lookup_key]
raise UnknownTaskKey, "key not found: #{task_lookup_key}" unless prefix

prefix
end

def data_payload_label(key)
label = TASK_KEY_DATA_LABELS.dig(task_lookup_key, key)
raise UnknownLabelKey, "key not found: #{key}" unless label

label
# Class level data release suffix, mirroring the other GalaxyZoo extractors.
#
# The previous body called `data_release_suffix` on the class itself, i.e. this
# very method, so any call recursed until SystemStackError. The instance level
# `data_release_suffix` reader is unaffected by this method.
#
# NOTE(review): assumes DEFAULT_DATA_RELEASE_SUFFIX (the initializer's default
# argument) is the intended class level value - confirm against the catalogue schema
def self.data_release_suffix
  DEFAULT_DATA_RELEASE_SUFFIX
end
end
end
Expand Down
114 changes: 114 additions & 0 deletions app/modules/label_extractors/galaxy_zoo/euclid.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# frozen_string_literal: true

module LabelExtractors
  module GalaxyZoo
    # Label extractor for the Galaxy Zoo Euclid workflow.
    #
    # Only the task / answer label tables and the data release suffix live here,
    # the readers, extraction and schema methods are inherited from BaseExtractor.
    class Euclid < BaseExtractor
      # Derived to conform to the existing catalogue schema for Zoobot euclid
      # https://github.com/mwalmsley/galaxy-datasets/blob/eed30d3e37b5559d0427c339e8dc1d2a9dc2d004/galaxy_datasets/shared/label_metadata.py#L462
      #
      # workflow task key => question label prefix
      TASK_KEY_LABEL_PREFIXES = {
        'T0' => 'smooth-or-featured',
        'T1' => 'how-rounded',
        'T2' => 'disk-edge-on',
        'T3' => 'edge-on-bulge',
        'T4' => 'bar',
        'T5' => 'has-spiral-arms',
        'T6' => 'spiral-winding',
        'T7' => 'spiral-arm-count',
        'T8' => 'bulge-size',
        'T11' => 'merging', # T10 is not used for training and no T9 in prod :shrug:
        'T12' => 'lensing',
        'T13' => 'clumps',
        'T14' => 'problem',
        'T15' => 'artifact'
      }.freeze

      # workflow task key => { answer key => answer label }
      TASK_KEY_DATA_LABELS = {
        'T0' => {
          '0' => 'smooth',
          '1' => 'featured-or-disk',
          '2' => 'problem'
        },
        'T1' => {
          '0' => 'round',
          '1' => 'in-between',
          '2' => 'cigar-shaped'
        },
        'T2' => {
          '0' => 'yes',
          '1' => 'no'
        },
        'T3' => {
          '0' => 'rounded',
          '1' => 'boxy',
          '2' => 'none'
        },
        'T4' => {
          '0' => 'no',
          '1' => 'weak',
          '2' => 'strong'
        },
        'T5' => {
          '0' => 'yes',
          '1' => 'no'
        },
        'T6' => {
          '0' => 'tight',
          '1' => 'medium',
          '2' => 'loose'
        },
        'T7' => {
          '0' => '1',
          '1' => '2',
          '2' => '3',
          '3' => '4',
          '4' => 'more-than-4',
          '5' => 'cant-tell'
        },
        'T8' => {
          '0' => 'none',
          '1' => 'small',
          '2' => 'moderate',
          '3' => 'large',
          '4' => 'dominant'
        },
        'T11' => {
          '0' => 'merger',
          '1' => 'major-disturbance',
          '2' => 'minor-disturbance',
          '3' => 'none'
        },
        'T12' => {
          '0' => 'yes',
          '1' => 'no'
        },
        'T13' => {
          '0' => 'yes',
          '1' => 'no'
        },
        'T14' => {
          '0' => 'star',
          '1' => 'artifact',
          '2' => 'zoom'
        },
        'T15' => {
          '0' => 'saturation',
          '1' => 'diffraction',
          '2' => 'satellite',
          '3' => 'ray',
          '4' => 'scattered',
          '5' => 'other',
          '6' => 'ghost'
        }
      }.freeze

      DATA_RELEASE_SUFFIX = 'euclid'

      # Suffix placed between the question prefix and the answer label.
      #
      # This stays a public class method: a bare `private` does not change the
      # visibility of `def self.` methods, and BaseExtractor calls it through an
      # explicit receiver (`self.class`).
      def self.data_release_suffix
        DATA_RELEASE_SUFFIX
      end
    end
  end
end
5 changes: 3 additions & 2 deletions app/modules/zoobot.rb
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# frozen_string_literal: true

module Zoobot
def self.gz_label_column_headers
def self.label_column_headers(module_name='GalaxyZoo', extractor_name='CosmicDawn')
schema_klass = "LabelExtractors::#{module_name}::#{extractor_name}".constantize
# as data sets change, switch to different mission label extractors, e.g. Decals is older
%w[id_str file_loc] | LabelExtractors::GalaxyZoo::CosmicDawn.question_answers_schema
%w[id_str file_loc] | schema_klass.question_answers_schema
end

module Storage
Expand Down
3 changes: 2 additions & 1 deletion app/services/batch/training/create_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ def initialize(training_job, bajor_client = Bajor::Client.new)

def run
begin
bajor_job_url = bajor_client.create_training_job(training_job.manifest_path)
context = Context.find_by(workflow_id: training_job.workflow_id)
bajor_job_url = bajor_client.create_training_job(training_job.manifest_path, context.extractor_name)
training_job.update(state: :submitted, service_job_url: bajor_job_url, message: '')
rescue Bajor::Client::Error => e
# mark the jobs as failed and record the client error message
Expand Down
5 changes: 4 additions & 1 deletion app/services/export/training_data.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,13 @@ def initialize(training_data_export)
end

def run
context = Context.find_by(workflow_id: training_data_export.workflow_id)
module_name = context.module_name.camelize
extractor_name = context.extractor_name.camelize
# create the formatted csv file export IO object
csv_export_file = Format::TrainingDataCsv.new(
training_data_export.workflow_id,
Zoobot.gz_label_column_headers
Zoobot.label_column_headers(module_name, extractor_name)
).run

# upload the export file via active storage
Expand Down
1 change: 1 addition & 0 deletions app/sidekiq/retrain_zoobot_job.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# frozen_string_literal: true
require 'bajor/client'

class RetrainZoobotJob
class Failure < StandardError; end
Expand Down
9 changes: 9 additions & 0 deletions config/schedule.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,12 @@ zoobot_training_job:
args: '1' # the context_id to train on (holds the workflow, active and pool subject set ids)
description: 'Scheduled Worker to re-train GZ Zoobot on latest data'
status: <%= Rails.env.production? ? 'enabled' : 'disabled' %> # only enable this in production, manually schedule via the sidekiq UI in other envs

# this file can be used to parameterize RetrainZoobotJob for different contexts (workflow and subject sets)
zoobot_euclid_training_job:
cron: "0 10 * * 4" # 10:00 every Thursday https://crontab.guru/#0_10_*_*_4
class: 'RetrainZoobotJob'
queue: 'high'
args: '2' # the context_id to train on (holds the workflow, active and pool subject set ids)
description: 'Scheduled Worker to re-train GZ Zoobot on latest data for euclid workflow'
status: <%= Rails.env.production? ? 'enabled' : 'disabled' %> # only enable this in production, manually schedule via the sidekiq UI in other envs
Loading

0 comments on commit 0b691cc

Please sign in to comment.