From 06cc71cd8ecdd7cd38d7d6318ab8180187b93758 Mon Sep 17 00:00:00 2001 From: tooyosi Date: Mon, 28 Oct 2024 18:22:49 +0000 Subject: [PATCH] refactor codebase for new euclid workflow usage with tests --- .../label_extractors/base_extractor.rb | 66 ++++++++ app/modules/label_extractors/finder.rb | 6 +- .../galaxy_zoo/cosmic_dawn.rb | 59 +------ .../label_extractors/galaxy_zoo/decals.rb | 43 +---- .../label_extractors/galaxy_zoo/euclid.rb | 114 +++++++++++++ .../shared/cosmic_dawn_and_euclid.rb | 23 +++ app/modules/zoobot.rb | 5 +- app/services/batch/training/create_job.rb | 3 +- app/services/export/training_data.rb | 5 +- app/sidekiq/retrain_zoobot_job.rb | 1 + config/schedule.yml | 9 + ...241017114649_add_module_name_to_context.rb | 5 + ...017120552_add_extractor_name_to_context.rb | 5 + db/schema.rb | 4 +- lib/bajor/client.rb | 4 +- spec/fixtures/contexts.yml | 2 + spec/lib/bajor/client_spec.rb | 10 +- .../label_extractors/base_extractor_spec.rb | 51 ++++++ spec/modules/label_extractors/finder_spec.rb | 2 +- .../galaxy_zoo/cosmic_dawn_spec.rb | 4 +- .../galaxy_zoo/decals_spec.rb | 4 +- .../galaxy_zoo/euclid_spec.rb | 160 ++++++++++++++++++ .../shared/cosmic_dawn_and_euclid_spec.rb | 44 +++++ spec/modules/zoobot_spec.rb | 4 +- .../batch/training/create_job_spec.rb | 5 +- spec/services/export/training_data_spec.rb | 2 +- .../services/format/training_data_csv_spec.rb | 8 +- 27 files changed, 528 insertions(+), 120 deletions(-) create mode 100644 app/modules/label_extractors/base_extractor.rb create mode 100644 app/modules/label_extractors/galaxy_zoo/euclid.rb create mode 100644 app/modules/label_extractors/shared/cosmic_dawn_and_euclid.rb create mode 100644 db/migrate/20241017114649_add_module_name_to_context.rb create mode 100644 db/migrate/20241017120552_add_extractor_name_to_context.rb create mode 100644 spec/modules/label_extractors/base_extractor_spec.rb create mode 100644 spec/modules/label_extractors/galaxy_zoo/euclid_spec.rb create mode 100644 spec/modules/label_extractors/shared/cosmic_dawn_and_euclid_spec.rb diff --git a/app/modules/label_extractors/base_extractor.rb b/app/modules/label_extractors/base_extractor.rb new file mode 100644 index 0000000..5a84ff3 --- /dev/null +++ b/app/modules/label_extractors/base_extractor.rb @@ -0,0 +1,66 @@ +# frozen_string_literal: true + +module LabelExtractors + class BaseExtractor + + class UnknownTaskKey < StandardError; end + class UnknownLabelKey < StandardError; end + attr_reader :task_lookup_key, :task_prefix_label + + def initialize(task_lookup_key) + @task_lookup_key = task_lookup_key + @task_prefix_label = task_prefix + end + + # extract the keys from the reduction data payload hash + # and convert the keys to the workflow question tasks + # + # e.g. workflow type (GZ) are question type 'decision tree' tasks + # looking at the 'T0' task it correlates to 3 exclusive answers: + # 0 (smooth) + # 1 (features or disk) + # 2 (star or artifact) + # + # then combined with the label prefix used to identify the correlated task name for Zoobot + def extract(data_hash) + data_hash.transform_keys do |key| + # create the lable key used for column headers in the derived training catalogues + # note the hyphen and underscore formatting, see Zoobot label schema for more details + "#{task_prefix_label}-#{data_release_suffix}_#{data_payload_label(key)}" + end + end + + def self.label_prefixes + self::TASK_KEY_LABEL_PREFIXES + end + + def self.data_labels + self::TASK_KEY_DATA_LABELS + end + + # Base version of question_answers_schema method to be customized by subclasses + def self.question_answers_schema + raise NotImplementedError, "Subclass must define `question_answers_schema`" + end + + private + + def task_prefix + prefix = self.class::TASK_KEY_LABEL_PREFIXES[task_lookup_key] + raise UnknownTaskKey, "key not found: #{task_lookup_key}" unless prefix + + prefix + end + + def data_payload_label(key) + label = self.class::TASK_KEY_DATA_LABELS.dig(task_lookup_key, key) + raise UnknownLabelKey, "key not found: #{key}" unless label + + label + end + + def data_release_suffix + self.class::data_release_suffix + end + end +end diff --git a/app/modules/label_extractors/finder.rb b/app/modules/label_extractors/finder.rb index d789205..93fca85 100644 --- a/app/modules/label_extractors/finder.rb +++ b/app/modules/label_extractors/finder.rb @@ -7,13 +7,13 @@ class UnknownTaskKey < StandardError; end # hard code Galaxy Zoo for now as these will fail due to missing constant lookup # long term we can add these back in and make the lookup dynamic - EXTRACTOR_SCHEMA_CLASS_REGEX = /\Agalaxy_zoo_(decals|cosmic_dawn)_(.+)\z/.freeze + EXTRACTOR_SCHEMA_CLASS_REGEX = /\A(galaxy_zoo)_(decals|cosmic_dawn|euclid)_(.+)\z/.freeze def self.extractor_instance(task_schema_lookup_key) # simulate a regex lookup failure with the || [nil, task_schema_lookup_key] as it'll raise a NameError when trying to constantize schema_name_and_task = EXTRACTOR_SCHEMA_CLASS_REGEX.match(task_schema_lookup_key) || [nil, task_schema_lookup_key] - schema_klass = "LabelExtractors::GalaxyZoo::#{schema_name_and_task[1].camelize}".constantize - task_key = schema_name_and_task[2].upcase + schema_klass = "LabelExtractors::#{schema_name_and_task[1].camelize}::#{schema_name_and_task[2].camelize}".constantize + task_key = schema_name_and_task[3].upcase schema_klass.new(task_key) rescue NameError => _e raise UnknownExtractor, "no extractor class found for '#{schema_name_and_task[1]}'" diff --git a/app/modules/label_extractors/galaxy_zoo/cosmic_dawn.rb b/app/modules/label_extractors/galaxy_zoo/cosmic_dawn.rb index 65fbf59..9cbb96a 100644 --- a/app/modules/label_extractors/galaxy_zoo/cosmic_dawn.rb +++ b/app/modules/label_extractors/galaxy_zoo/cosmic_dawn.rb @@ -2,9 +2,7 @@ module LabelExtractors module GalaxyZoo - class CosmicDawn - - attr_reader :task_lookup_key, :task_prefix_label + class CosmicDawn < LabelExtractors::Shared::CosmicDawnAndEuclid # GZ decision tree task schema and lable tables # @@ -111,60 +109,9 @@ class CosmicDawn DATA_RELEASE_SUFFIX = 'cd' - def self.label_prefixes - TASK_KEY_LABEL_PREFIXES - end - - def self.data_labels - TASK_KEY_DATA_LABELS - end - - # provide a flat task question and answers list for the decals mission catalogues - def self.question_answers_schema - label_prefixes.map do |task_key, question_prefix| - data_labels[task_key].values.map do |answer_suffix| - "#{question_prefix}-#{DATA_RELEASE_SUFFIX}_#{answer_suffix}" - end - end.flatten - end - - def initialize(task_lookup_key) - @task_lookup_key = task_lookup_key - @task_prefix_label = task_prefix - end - - # extract the keys from the reduction data payload hash - # and convert the keys to the workflow question tasks - # - # e.g. workflow type (GZ) are question type 'decision tree' tasks - # looking at the 'T0' task it correlates to 3 exclusive answers: - # 0 (smooth) - # 1 (features or disk) - # 2 (problem) - # - # then combined with the label prefix used to identify the correlated task name for Zoobot - def extract(data_hash) - data_hash.transform_keys do |key| - # create the lable key used for column headers in the derived training catalogues - # note the hyphen and underscore formatting, see Zoobot label schema for more details - "#{task_prefix_label}-#{DATA_RELEASE_SUFFIX}_#{data_payload_label(key)}" - end - end - private - - def task_prefix - prefix = TASK_KEY_LABEL_PREFIXES[task_lookup_key] - raise UnknownTaskKey, "key not found: #{task_lookup_key}" unless prefix - - prefix - end - - def data_payload_label(key) - label = TASK_KEY_DATA_LABELS.dig(task_lookup_key, key) - raise UnknownLabelKey, "key not found: #{key}" unless label - - label + def self.data_release_suffix + DATA_RELEASE_SUFFIX end end end diff --git a/app/modules/label_extractors/galaxy_zoo/decals.rb b/app/modules/label_extractors/galaxy_zoo/decals.rb index fe2d3df..e2cede7 100644 --- a/app/modules/label_extractors/galaxy_zoo/decals.rb +++ b/app/modules/label_extractors/galaxy_zoo/decals.rb @@ -1,8 +1,9 @@ # frozen_string_literal: true +require_relative '../base_extractor' module LabelExtractors module GalaxyZoo - class Decals + class Decals < BaseExtractor attr_reader :task_lookup_key, :task_prefix_label, :data_release_suffix @@ -95,13 +96,6 @@ class Decals # longer term we may include gz2 , e.g. -dr12 (gz 1 & 2) CATALOG_DATA_RELEASE_SUFFIXES = %w[dr5 dr8] - def self.label_prefixes - TASK_KEY_LABEL_PREFIXES - end - - def self.data_labels - TASK_KEY_DATA_LABELS - end # provide a flat task question and answers list for the decals mission catalogues def self.question_answers_schema @@ -120,38 +114,9 @@ def initialize(task_lookup_key, data_release_suffix = DEFAULT_DATA_RELEASE_SUFFI @data_release_suffix = data_release_suffix end - # extract the keys from the reduction data payload hash - # and convert the keys to the workflow question tasks - # - # e.g. workflow type (GZ) are question type 'decision tree' tasks - # looking at the 'T0' task it correlates to 3 exclusive answers: - # 0 (smooth) - # 1 (features or disk) - # 2 (star or artifact) - # - # then combined with the label prefix used to identify the correlated task name for Zoobot - def extract(data_hash) - data_hash.transform_keys do |key| - # create the lable key used for column headers in the derived training catalogues - # note the hyphen and underscore formatting, see Zoobot lable schema for more details - "#{task_prefix_label}-#{data_release_suffix}_#{data_payload_label(key)}" - end - end - private - - def task_prefix - prefix = TASK_KEY_LABEL_PREFIXES[task_lookup_key] - raise UnknownTaskKey, "key not found: #{task_lookup_key}" unless prefix - - prefix - end - - def data_payload_label(key) - label = TASK_KEY_DATA_LABELS.dig(task_lookup_key, key) - raise UnknownLabelKey, "key not found: #{key}" unless label - - label + def self.data_release_suffix + data_release_suffix end end end diff --git a/app/modules/label_extractors/galaxy_zoo/euclid.rb b/app/modules/label_extractors/galaxy_zoo/euclid.rb new file mode 100644 index 0000000..81e0f0d --- /dev/null +++ b/app/modules/label_extractors/galaxy_zoo/euclid.rb @@ -0,0 +1,114 @@ +# frozen_string_literal: true + +module LabelExtractors + module GalaxyZoo + class Euclid < LabelExtractors::Shared::CosmicDawnAndEuclid + + attr_reader :task_lookup_key, :task_prefix_label + + # Derived to conform to the existing catalogue schema for Zoobot euclid + # https://github.com/mwalmsley/galaxy-datasets/blob/eed30d3e37b5559d0427c339e8dc1d2a9dc2d004/galaxy_datasets/shared/label_metadata.py#L462 + TASK_KEY_LABEL_PREFIXES = { + 'T0' => 'smooth-or-featured', + 'T1' => 'how-rounded', + 'T2' => 'disk-edge-on', + 'T3' => 'edge-on-bulge', + 'T4' => 'bar', + 'T5' => 'has-spiral-arms', + 'T6' => 'spiral-winding', + 'T7' => 'spiral-arm-count', + 'T8' => 'bulge-size', + 'T11' => 'merging', # T10 is not used for training and no T9 in prod :shrug: + 'T12' => 'lensing', + 'T13' => 'clumps', + 'T14' => 'problem', + 'T15' => 'artifact' + }.freeze + TASK_KEY_DATA_LABELS = { + 'T0' => { + '0' => 'smooth', + '1' => 'featured-or-disk', + '2' => 'problem' + }, + 'T1' => { + '0' => 'round', + '1' => 'in-between', + '2' => 'cigar-shaped' + }, + 'T2' => { + '0' => 'yes', + '1' => 'no' + }, + 'T3' => { + '0' => 'rounded', + '1' => 'boxy', + '2' => 'none' + }, + 'T4' => { + '0' => 'no', + '1' => 'weak', + '2' => 'strong' + }, + 'T5' => { + '0' => 'yes', + '1' => 'no' + }, + 'T6' => { + '0' => 'tight', + '1' => 'medium', + '2' => 'loose' + }, + 'T7' => { + '0' => '1', + '1' => '2', + '2' => '3', + '3' => '4', + '4' => 'more-than-4', + '5' => 'cant-tell' + }, + 'T8' => { + '0' => 'none', + '1' => 'small', + '2' => 'moderate', + '3' => 'large', + '4' => 'dominant' + }, + 'T11' => { + '0' => 'merger', + '1' => 'major-disturbance', + '2' => 'minor-disturbance', + '3' => 'none' + }, + 'T12' => { + '0' => 'yes', + '1' => 'no' + }, + 'T13' => { + '0' => 'yes', + '1' => 'no' + }, + 'T14' => { + '0' => 'star', + '1' => 'artifact', + '2' => 'zoom' + }, + 'T15' => { + '0' => 'saturation', + '1' => 'diffraction', + '2' => 'satellite', + '3' => 'ray', + '4' => 'scattered', + '5' => 'other', + '6' => 'ghost' + } + }.freeze + + DATA_RELEASE_SUFFIX = 'euclid' + + private + def self.data_release_suffix + DATA_RELEASE_SUFFIX + end + end + end +end diff --git a/app/modules/label_extractors/shared/cosmic_dawn_and_euclid.rb b/app/modules/label_extractors/shared/cosmic_dawn_and_euclid.rb new file mode 100644 index 0000000..322dde7 --- /dev/null +++ b/app/modules/label_extractors/shared/cosmic_dawn_and_euclid.rb @@ -0,0 +1,23 @@ +# frozen_string_literal: true +require_relative '../base_extractor' + +module LabelExtractors + module Shared + class CosmicDawnAndEuclid < BaseExtractor + + def self.data_release_suffix + raise NotImplementedError, "Subclass must define `data_release_suffix`" + end + + # provide a flat task question and answers list for the decals mission catalogues + def self.question_answers_schema + label_prefixes.map do |task_key, question_prefix| + data_labels[task_key].values.map do |answer_suffix| + "#{question_prefix}-#{data_release_suffix}_#{answer_suffix}" + end + end.flatten + end + + end + end +end diff --git a/app/modules/zoobot.rb b/app/modules/zoobot.rb index b3d859a..45d7a95 100644 --- a/app/modules/zoobot.rb +++ b/app/modules/zoobot.rb @@ -1,9 +1,10 @@ # frozen_string_literal: true module Zoobot - def self.gz_label_column_headers + def self.label_column_headers(module_name='GalaxyZoo', extractor_name='CosmicDawn') + schema_klass = "LabelExtractors::#{module_name}::#{extractor_name}".constantize # as data sets change, switch to different mission label extractors, e.g. Decals is older - %w[id_str file_loc] | LabelExtractors::GalaxyZoo::CosmicDawn.question_answers_schema + %w[id_str file_loc] | schema_klass.question_answers_schema end module Storage diff --git a/app/services/batch/training/create_job.rb b/app/services/batch/training/create_job.rb index 94e731e..47752ef 100644 --- a/app/services/batch/training/create_job.rb +++ b/app/services/batch/training/create_job.rb @@ -14,7 +14,8 @@ def initialize(training_job, bajor_client = Bajor::Client.new) def run begin - bajor_job_url = bajor_client.create_training_job(training_job.manifest_path) + context = Context.find_by(workflow_id: training_job.workflow_id) + bajor_job_url = bajor_client.create_training_job(training_job.manifest_path, context.extractor_name) training_job.update(state: :submitted, service_job_url: bajor_job_url, message: '') rescue Bajor::Client::Error => e # mark the jobs as failed and record the client error message diff --git a/app/services/export/training_data.rb b/app/services/export/training_data.rb index 606eb37..cfb782f 100644 --- a/app/services/export/training_data.rb +++ b/app/services/export/training_data.rb @@ -9,10 +9,13 @@ def initialize(training_data_export) end def run + context = Context.find_by(workflow_id: training_data_export.workflow_id) + module_name = context.module_name.camelize + extractor_name = context.extractor_name.camelize # create the formatted csv file export IO object csv_export_file = Format::TrainingDataCsv.new( training_data_export.workflow_id, - Zoobot.gz_label_column_headers + Zoobot.label_column_headers(module_name, extractor_name) ).run # upload the export file via active storage diff --git a/app/sidekiq/retrain_zoobot_job.rb b/app/sidekiq/retrain_zoobot_job.rb index 3774b49..994c06a 100644 --- a/app/sidekiq/retrain_zoobot_job.rb +++ b/app/sidekiq/retrain_zoobot_job.rb @@ -1,4 +1,5 @@ # frozen_string_literal: true +require 'bajor/client' class RetrainZoobotJob class Failure < StandardError; end diff --git a/config/schedule.yml b/config/schedule.yml index 2637ed2..43863a5 100644 --- a/config/schedule.yml +++ b/config/schedule.yml @@ -8,3 +8,12 @@ zoobot_training_job: args: '1' # the context_id to train on (holds the workflow, active and pool subject set ids) description: 'Scheduled Worker to re-train GZ Zoobot on latest data' status: <%= Rails.env.production? ? 'enabled' : 'disabled' %> # only enable this in production, manually schedule via the sidekiq UI in other envs + +# this file can be used to paramterize RetrainZoobotJob for different contexts (workflow and subject sets) +zoobot_euclid_training_job: + cron: "0 10 * * 4" # 10:00 every Thursday https://crontab.guru/#0_10_*_*_4 + class: 'RetrainZoobotJob' + queue: 'high' + args: '2' # the context_id to train on (holds the workflow, active and pool subject set ids) + description: 'Scheduled Worker to re-train GZ Zoobot on latest data for euclid workflow' + status: <%= Rails.env.production? ? 'enabled' : 'disabled' %> # only enable this in production, manually schedule via the sidekiq UI in other envs diff --git a/db/migrate/20241017114649_add_module_name_to_context.rb b/db/migrate/20241017114649_add_module_name_to_context.rb new file mode 100644 index 0000000..3612713 --- /dev/null +++ b/db/migrate/20241017114649_add_module_name_to_context.rb @@ -0,0 +1,5 @@ +class AddModuleNameToContext < ActiveRecord::Migration[7.0] + def change + add_column :contexts, :module_name, :string + end +end diff --git a/db/migrate/20241017120552_add_extractor_name_to_context.rb b/db/migrate/20241017120552_add_extractor_name_to_context.rb new file mode 100644 index 0000000..ed70f8a --- /dev/null +++ b/db/migrate/20241017120552_add_extractor_name_to_context.rb @@ -0,0 +1,5 @@ +class AddExtractorNameToContext < ActiveRecord::Migration[7.0] + def change + add_column :contexts, :extractor_name, :string + end +end diff --git a/db/schema.rb b/db/schema.rb index d2adde8..f4ca16e 100644 --- a/db/schema.rb +++ b/db/schema.rb @@ -10,7 +10,7 @@ # # It's strongly recommended that you check this file into your version control system. -ActiveRecord::Schema[7.0].define(version: 2023_02_16_054738) do +ActiveRecord::Schema[7.0].define(version: 2024_10_17_120552) do # These are extensions that must be enabled in order to support this database enable_extension "plpgsql" @@ -49,6 +49,8 @@ t.datetime "updated_at", null: false t.bigint "active_subject_set_id", null: false t.bigint "pool_subject_set_id", null: false + t.string "module_name" + t.string "extractor_name" t.index ["workflow_id", "project_id"], name: "index_contexts_on_workflow_id_and_project_id", unique: true end diff --git a/lib/bajor/client.rb b/lib/bajor/client.rb index 23d2cca..899592e 100644 --- a/lib/bajor/client.rb +++ b/lib/bajor/client.rb @@ -20,12 +20,12 @@ class TrainingJobTaskError < StandardError; end BLOB_STORE_HOST_CONTAINER_URL = 'https://kadeactivelearning.blob.core.windows.net' - def create_training_job(manifest_path) + def create_training_job(manifest_path, workflow_name='cosmic_dawn') bajor_response = self.class.post( '/training/jobs/', # NOTE: Here we can augment the batch job run options via bajor # via the `run_opts: '--wandb --debug'` etc, these could be set via ENV - body: { manifest_path: manifest_path }.to_json, + body: { manifest_path: manifest_path, opts: { 'run_opts': "--schema #{workflow_name}", 'workflow_name': workflow_name } }.to_json, headers: JSON_HEADERS ) diff --git a/spec/fixtures/contexts.yml b/spec/fixtures/contexts.yml index f88cbad..9a02e83 100644 --- a/spec/fixtures/contexts.yml +++ b/spec/fixtures/contexts.yml @@ -4,4 +4,6 @@ galaxy_zoo_active_learning_project: project_id: 39 active_subject_set_id: 55 pool_subject_set_id: 66 + module_name: 'galaxy_zoo' + extractor_name: 'cosmic_dawn' diff --git a/spec/lib/bajor/client_spec.rb b/spec/lib/bajor/client_spec.rb index feed66e..4056906 100644 --- a/spec/lib/bajor/client_spec.rb +++ b/spec/lib/bajor/client_spec.rb @@ -19,8 +19,16 @@ describe 'create_training_job' do let(:request_url) { "#{bajor_host}/training/jobs/" } let(:catalogue_manifest_path) { 'training_catalogues/manifest_path.csv' } + let(:run_opts) { '--schema cosmic_dawn' } + let(:workflow_name) { 'cosmic_dawn' } let(:request_body) do - { manifest_path: catalogue_manifest_path } + { + manifest_path: catalogue_manifest_path, + opts: { + run_opts: run_opts, + workflow_name: workflow_name + } + } end let(:job_id) { '3ed68115-dc36-4f66-838c-a52869031676' } let(:bajor_response_body) do diff --git a/spec/modules/label_extractors/base_extractor_spec.rb b/spec/modules/label_extractors/base_extractor_spec.rb new file mode 100644 index 0000000..a9896e6 --- /dev/null +++ b/spec/modules/label_extractors/base_extractor_spec.rb @@ -0,0 +1,51 @@ +# spec/models/label_extractors/base_extractor_spec.rb + +class TestExtractor < LabelExtractors::BaseExtractor + TASK_KEY_LABEL_PREFIXES = { task_key_example: "example_prefix" } + TASK_KEY_DATA_LABELS = { task_key_example: { "T0" => "example_label" } } + def self.data_release_suffix + "v1" + end +end + +RSpec.describe LabelExtractors::BaseExtractor do + let(:task_lookup_key) { :task_key_example } + let(:data_hash) { { "T0" => "example_key" } } + let(:instance) { TestExtractor.new(task_lookup_key) } + + describe '#initialize' do + it 'initializes with task_lookup_key and sets task_prefix_label' do + expect(instance.task_lookup_key).to eq(task_lookup_key) + expect(instance.task_prefix_label).to eq("example_prefix") + end + end + + describe '#extract' do + it 'transforms the data hash keys correctly' do + result = instance.extract(data_hash) + expect(result).to eq({ "example_prefix-v1_example_label" => "example_key" }) + end + end + + describe '#task_prefix' do + it 'returns the correct prefix for a known task_lookup_key' do + expect(instance.send(:task_prefix)).to eq("example_prefix") + end + + it 'raises an error for an unknown task_lookup_key' do + invalid_instance = TestExtractor.allocate # Skips calling initialize + allow(invalid_instance).to receive(:task_lookup_key).and_return(:invalid_key) + expect { invalid_instance.send(:task_prefix) }.to raise_error(LabelExtractors::BaseExtractor::UnknownTaskKey) + end + end + + describe '#data_payload_label' do + it 'returns the correct label for a known key' do + expect(instance.send(:data_payload_label, "T0")).to eq("example_label") + end + + it 'raises an error for an unknown key' do + expect { instance.send(:data_payload_label, "unknown_key") }.to raise_error(LabelExtractors::BaseExtractor::UnknownLabelKey) + end + end +end diff --git a/spec/modules/label_extractors/finder_spec.rb b/spec/modules/label_extractors/finder_spec.rb index 3c5fc6c..8cf1ab2 100644 --- a/spec/modules/label_extractors/finder_spec.rb +++ b/spec/modules/label_extractors/finder_spec.rb @@ -25,7 +25,7 @@ it 'raises an error if the task key is not known for the label schema' do expect { described_class.extractor_instance('galaxy_zoo_cosmic_dawn_t50') - }.to raise_error(LabelExtractors::GalaxyZoo::UnknownTaskKey, 'key not found: T50') + }.to raise_error(LabelExtractors::BaseExtractor::UnknownTaskKey, 'key not found: T50') end it 'finds the decals mission data' do diff --git a/spec/modules/label_extractors/galaxy_zoo/cosmic_dawn_spec.rb b/spec/modules/label_extractors/galaxy_zoo/cosmic_dawn_spec.rb index 5372f4a..931ca34 100644 --- a/spec/modules/label_extractors/galaxy_zoo/cosmic_dawn_spec.rb +++ b/spec/modules/label_extractors/galaxy_zoo/cosmic_dawn_spec.rb @@ -146,14 +146,14 @@ def expected_labels(label_prefix, task_lookup_key, payload) expect { # T0 has 3 choices (0, 1, 2) described_class.new('T0').extract(unknown_key_payload) - }.to raise_error(LabelExtractors::GalaxyZoo::UnknownLabelKey, 'key not found: 3') + }.to raise_error(LabelExtractors::BaseExtractor::UnknownLabelKey, 'key not found: 3') end it 'raises an error if the task key is not found in the known schema' do expect { # T16 is unknown in this schema described_class.new('T16').extract(data_payload) - }.to raise_error(LabelExtractors::GalaxyZoo::UnknownTaskKey, 'key not found: T16') + }.to raise_error(LabelExtractors::BaseExtractor::UnknownTaskKey, 'key not found: T16') end end end diff --git a/spec/modules/label_extractors/galaxy_zoo/decals_spec.rb b/spec/modules/label_extractors/galaxy_zoo/decals_spec.rb index 22f90b3..5a49ac7 100644 --- a/spec/modules/label_extractors/galaxy_zoo/decals_spec.rb +++ b/spec/modules/label_extractors/galaxy_zoo/decals_spec.rb @@ -134,14 +134,14 @@ def expected_labels(label_prefix, task_lookup_key, payload) expect { # T0 has 3 choices (0, 1, 2) described_class.new('T0').extract(unknown_key_payload) - }.to raise_error(LabelExtractors::GalaxyZoo::UnknownLabelKey, 'key not found: 3') + }.to raise_error(LabelExtractors::BaseExtractor::UnknownLabelKey, 'key not found: 3') end it 'raises an error if the task key is not found in the known schema' do expect { # T12 is unknonw in this schema described_class.new('T12').extract(data_payload) - }.to raise_error(LabelExtractors::GalaxyZoo::UnknownTaskKey, 'key not found: T12') + }.to raise_error(LabelExtractors::BaseExtractor::UnknownTaskKey, 'key not found: T12') end end end diff --git a/spec/modules/label_extractors/galaxy_zoo/euclid_spec.rb b/spec/modules/label_extractors/galaxy_zoo/euclid_spec.rb new file mode 100644 index 0000000..abb222a --- /dev/null +++ b/spec/modules/label_extractors/galaxy_zoo/euclid_spec.rb @@ -0,0 +1,160 @@ +# frozen_string_literal: true + +require 'rails_helper' + +RSpec.describe LabelExtractors::GalaxyZoo::Euclid do + let(:data_label_schema) do + { + 'T0' => { + '0' => 'smooth', + '1' => 'featured-or-disk', + '2' => 'problem' + }, + 'T1' => { + '0' => 'round', + '1' => 'in-between', + '2' => 'cigar-shaped' + }, + 'T2' => { + '0' => 'yes', + '1' => 'no' + }, + 'T3' => { + '0' => 'rounded', + '1' => 'boxy', + '2' => 'none' + }, + 'T4' => { + '0' => 'no', + '1' => 'weak', + '2' => 'strong' + }, + 'T5' => { + '0' => 'yes', + '1' => 'no' + }, + 'T6' => { + '0' => 'tight', + '1' => 'medium', + '2' => 'loose' + }, + 'T7' => { + '0' => '1', + '1' => '2', + '2' => '3', + '3' => '4', + '4' => 'more-than-4', + '5' => 'cant-tell' + }, + 'T8' => { + '0' => 'none', + '1' => 'small', + '2' => 'moderate', + '3' => 'large', + '4' => 'dominant' + }, + 'T11' => { + '0' => 'merger', + '1' => 'major-disturbance', + '2' => 'minor-disturbance', + '3' => 'none' + }, + 'T12' => { + '0' => 'yes', + '1' => 'no' + }, + 'T13' => { + '0' => 'yes', + '1' => 'no' + }, + 'T14' => { + '0' => 'star', + '1' => 'artifact', + '2' => 'zoom' + }, + 'T15' => { + '0' => 'saturation', + '1' => 'diffraction', + '2' => 'satellite', + '3' => 'ray', + '4' => 'scattered', + '5' => 'other', + '6' => 'ghost' + } + } + end + let(:label_prefix_schema) do + { + 'T0' => 'smooth-or-featured', + 'T1' => 'how-rounded', + 'T2' => 'disk-edge-on', + 'T3' => 'edge-on-bulge', + 'T4' => 'bar', + 'T5' => 'has-spiral-arms', + 'T6' => 'spiral-winding', + 'T7' => 'spiral-arm-count', + 'T8' => 'bulge-size', + 'T11' => 'merging', # T10 is not used for training and no T9 :shrug: + 'T12' => 'lensing', + 'T13' => 'clumps', + 'T14' => 'problem', + 'T15' => 'artifact' + } + end + + describe '#label_prefixes' do + it 'has the correct schema label prefixes' do + expect(described_class.label_prefixes).to match(label_prefix_schema) + end + end + + describe '#data_labels' do + it 'has the correct schema data labels' do + expect(described_class.data_labels).to match(data_label_schema) + end + end + + describe '#question_answers_schema' do + it 'returns the correct set of header' do + expected_column_headers = described_class.label_prefixes.map do |task_key, question_prefix| + described_class.data_labels[task_key].values.map { |answer_suffix| "#{question_prefix}-euclid_#{answer_suffix}"} + end + expect(described_class.question_answers_schema).to match(expected_column_headers.flatten) + end + end + + describe '#extract' do + # sample payload mimicing the various choices across all GZ decision tree tasks + # NOTE: length must not exceed the smallest schema choice list (or use custom data payloads matching each task) + let(:data_payload) { { '0' => 3, '1' => 9 } } + + described_class.label_prefixes.each do |task_lookup_key, label_prefix| + # manually construct the expected lables for tests + def expected_labels(label_prefix, task_lookup_key, payload) + payload.transform_keys do |key| + "#{label_prefix}-euclid_#{data_label_schema.dig(task_lookup_key, key)}" + end + end + + it 'correctly converts the payload label keys' do + extracted_labels = described_class.new(task_lookup_key).extract(data_payload) + expect(extracted_labels).to match(expected_labels(label_prefix, task_lookup_key, data_payload)) + end + end + + it 'raises an error if the payload data key is not known' do + unknown_key_payload = data_payload.merge('3' => 0) + expect { + # T0 has 3 choices (0, 1, 2) + described_class.new('T0').extract(unknown_key_payload) + }.to raise_error(LabelExtractors::BaseExtractor::UnknownLabelKey, 'key not found: 3') + end + + it 'raises an error if the task key is not found in the known schema' do + expect { + # T16 is unknown in this schema + described_class.new('T16').extract(data_payload) + }.to raise_error(LabelExtractors::BaseExtractor::UnknownTaskKey, 'key not found: T16') + end + end +end diff --git a/spec/modules/label_extractors/shared/cosmic_dawn_and_euclid_spec.rb b/spec/modules/label_extractors/shared/cosmic_dawn_and_euclid_spec.rb new file mode 100644 index 0000000..89a5388 --- /dev/null +++ b/spec/modules/label_extractors/shared/cosmic_dawn_and_euclid_spec.rb @@ -0,0 +1,44 @@ + +# spec/label_extractors/galaxy_zoo/cosmic_dawn_and_euclid_spec.rb +class TestClass < LabelExtractors::Shared::CosmicDawnAndEuclid + TASK_KEY_LABEL_PREFIXES = { task_key_example: "example_prefix" } + TASK_KEY_DATA_LABELS = { task_key_example: { "T0" => "example_label" } } + + def self.data_release_suffix + "v1" + end +end + +RSpec.describe LabelExtractors::Shared::CosmicDawnAndEuclid do + let(:task_lookup_key) { :task_key_example } + + # Define a dynamic subclass of CosmicDawnAndEuclid for testing + let(:test_class) do + Class.new(described_class) do + TASK_KEY_LABEL_PREFIXES = { task_key_example: "example_prefix" } + TASK_KEY_DATA_LABELS = { task_key_example: { "T0" => "example_label" } } + + def self.data_release_suffix + "v1" + end + end + end + + let(:instance) { test_class.new(task_lookup_key) } + + describe 'question_answers_schema' do + it 'constructs the correct question and answers schema' do + result = TestClass.question_answers_schema + expected_result = ["example_prefix-v1_example_label"] + expect(result).to eq(expected_result) + end + end + + describe 'data_release_suffix' do + it 'raises NotImplementedError when not overridden' do + # We directly call the described class here without using the test_class + expect { described_class.data_release_suffix }.to raise_error(NotImplementedError) + end + end +end + diff --git a/spec/modules/zoobot_spec.rb b/spec/modules/zoobot_spec.rb index f83e627..23ee3db 100644 --- a/spec/modules/zoobot_spec.rb +++ b/spec/modules/zoobot_spec.rb @@ -3,10 +3,10 @@ require 'rails_helper' RSpec.describe Zoobot do - describe '.gz_label_column_headers' do + describe '.label_column_headers' do it 'returns the correct list' do expected_column_headers = %w[id_str file_loc] | LabelExtractors::GalaxyZoo::CosmicDawn.question_answers_schema - expect(described_class.gz_label_column_headers).to eq(expected_column_headers) + expect(described_class.label_column_headers('GalaxyZoo','CosmicDawn')).to eq(expected_column_headers) end end end diff --git a/spec/services/batch/training/create_job_spec.rb b/spec/services/batch/training/create_job_spec.rb index 57eef71..4bde967 100644 --- a/spec/services/batch/training/create_job_spec.rb +++ b/spec/services/batch/training/create_job_spec.rb @@ -7,7 +7,7 @@ describe '#run' do let(:manifest_path) { '/a/shared/blob/storage/path.csv' } let(:manifest_url) { "https://a.shared.blob.storage#{manifest_path}"} - let(:training_job) { TrainingJob.new(manifest_url: manifest_url, workflow_id: '1', state: :pending) } + let(:training_job) { TrainingJob.new(manifest_url: manifest_url, workflow_id: '123', state: :pending) } let(:bajor_client_double) { instance_double(Bajor::Client) } let(:training_create_job) { described_class.new(training_job, bajor_client_double) } let(:job_service_url) { 'https://bajor-host/training/job/123' } @@ -21,8 +21,9 @@ end it 'calls the bajor client service with the correct manifest_path' do + parent_context = Context.find_by(workflow_id: training_job.workflow_id) training_create_job.run - expect(bajor_client_double).to have_received(:create_training_job).with(manifest_path).once + expect(bajor_client_double).to have_received(:create_training_job).with(manifest_path, parent_context.extractor_name).once end it 'updates the state tracking info on the training job resource' do diff --git a/spec/services/export/training_data_spec.rb b/spec/services/export/training_data_spec.rb index fa6ba93..8afd961 100644 --- a/spec/services/export/training_data_spec.rb +++ b/spec/services/export/training_data_spec.rb @@ -4,7 +4,7 @@ RSpec.describe Export::TrainingData do describe '#run' do - let(:workflow_id) { 10 } + let(:workflow_id) { 123 } let(:file_time) { Time.now.iso8601 } let(:storage_path) { "#{Zoobot::Storage.path_key(workflow_id)}-#{file_time}.csv" } let(:storage_path_file_name) { "workflow-#{workflow_id}-#{file_time}.csv" } diff --git a/spec/services/format/training_data_csv_spec.rb b/spec/services/format/training_data_csv_spec.rb index 7d5ed11..992b2ce 100644 --- a/spec/services/format/training_data_csv_spec.rb +++ b/spec/services/format/training_data_csv_spec.rb @@ -7,7 +7,7 @@ fixtures :contexts let(:workflow_id) { 4 } - let(:formatter) { described_class.new(workflow_id, Zoobot.gz_label_column_headers) } + let(:formatter) { described_class.new(workflow_id, Zoobot.label_column_headers) } let(:subject_locations) do [ { 'image/jpeg': 'https://panoptes-uploads.zooniverse.org/subject_location/2f2490b4-65c1-4dca-ba25-c44128aa7a39.jpeg' } @@ -42,7 +42,7 @@ end it 'returns the csv data in the temp file' do - expected_output = "#{Zoobot.gz_label_column_headers.join(',')}\n8000_231121_468,/test/2f2490b4-65c1-4dca-ba25-c44128aa7a39.jpeg,3,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\n" + expected_output = "#{Zoobot.label_column_headers.join(',')}\n8000_231121_468,/test/2f2490b4-65c1-4dca-ba25-c44128aa7a39.jpeg,3,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\n" results = File.read(export_file.path) expect(results).to eq(expected_output) end @@ -71,7 +71,7 @@ expected_lines = [ '8000_231121_468,/test/2f2490b4-65c1-4dca-ba25-c44128aa7a39.jpeg,3,9,0,1,5,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0' ].join("\n") - expected_output = "#{Zoobot.gz_label_column_headers.join(',')}\n#{expected_lines}\n" + expected_output = "#{Zoobot.label_column_headers.join(',')}\n#{expected_lines}\n" results = File.read(export_file.path) expect(results).to eq(expected_output) end @@ -92,7 +92,7 @@ end it 'returns the multi image csv data in the temp file' do - expected_output = "#{Zoobot.gz_label_column_headers.join(',')}\n#{expected_lines}\n" + expected_output = "#{Zoobot.label_column_headers.join(',')}\n#{expected_lines}\n" results = File.read(export_file.path) expect(results).to eq(expected_output) end