From 32ee500b21b428a031169fdbeb97d09eaf01193b Mon Sep 17 00:00:00 2001
From: Chris van Run
Date: Fri, 24 May 2024 14:28:59 +0200
Subject: [PATCH 1/6] Add exclude_from_ranking for additional score extraction from metrics

---
 app/grandchallenge/evaluation/models.py  |  9 ++-
 app/grandchallenge/evaluation/tasks.py   |  1 +
 app/tests/evaluation_tests/test_utils.py | 90 ++++++++++++++++++++++++
 3 files changed, 99 insertions(+), 1 deletion(-)

diff --git a/app/grandchallenge/evaluation/models.py b/app/grandchallenge/evaluation/models.py
index b95376c049..60e035a281 100644
--- a/app/grandchallenge/evaluation/models.py
+++ b/app/grandchallenge/evaluation/models.py
@@ -109,6 +109,12 @@
                 "examples": ["asc"],
                 "pattern": "^(asc|desc)$",
             },
+            "exclude_from_ranking": {
+                "$id": "#/items/properties/exclude_from_ranking",
+                "type": "boolean",
+                "title": "The Exclude From Ranking Schema",
+                "default": False,
+            },
         },
     },
 }
@@ -253,7 +259,8 @@ class Phase(FieldChangeMixin, HangingProtocolMixin, UUIDModel):
             '"path": "accuracy.mean",'
             '"order": "asc",'
             '"title": "ASSD +/- std",'
-            '"error_path": "accuracy.std"'
+            '"error_path": "accuracy.std",'
+            '"exclude_from_ranking": true'
             "}]"
         ),
         validators=[JSONValidator(schema=EXTRA_RESULT_COLUMNS_SCHEMA)],
diff --git a/app/grandchallenge/evaluation/tasks.py b/app/grandchallenge/evaluation/tasks.py
index b66626e65e..cc51d50562 100644
--- a/app/grandchallenge/evaluation/tasks.py
+++ b/app/grandchallenge/evaluation/tasks.py
@@ -454,6 +454,7 @@ def calculate_ranks(*, phase_pk: uuid.UUID):
         *[
             Metric(path=col["path"], reverse=col["order"] == phase.DESCENDING)
             for col in phase.extra_results_columns
+            if not col.get("exclude_from_ranking", False)
         ],
     )

diff --git a/app/tests/evaluation_tests/test_utils.py b/app/tests/evaluation_tests/test_utils.py
index 464078b14a..a2aa3ff5d0 100644
--- a/app/tests/evaluation_tests/test_utils.py
+++ b/app/tests/evaluation_tests/test_utils.py
@@ -136,6 +136,96 @@ def test_calculate_ranks(django_assert_max_num_queries):
                 )


+@pytest.mark.django_db
+def test_calculate_ranks_with_exclusion(django_assert_max_num_queries):
+    phase = PhaseFactory()
+
+    results = [
+        # Warning: Do not change this values without updating the
+        # expected_ranks below.
+        {"a": 0.0, "b": 0.0},
+        {"a": 0.5, "b": 0.2},
+        {"a": 1.0, "b": 0.3},
+        {"a": 0.7, "b": 0.4},
+        {"a": 0.5, "b": 0.5},
+        {"b": 0.3},  # Incomplete and should not be processed
+    ]
+
+    queryset = [
+        EvaluationFactory(submission__phase=phase, status=Evaluation.SUCCESS)
+        for _ in range(len(results))
+    ]
+
+    for e, r in zip(queryset, results, strict=True):
+        e.outputs.add(
+            ComponentInterfaceValue.objects.create(
+                interface=ComponentInterface.objects.get(
+                    slug="metrics-json-file"
+                ),
+                value=r,
+            )
+        )
+
+    b_included = False
+    b_excluded = True
+
+    only_a = {
+        "ranks": [5, 3, 1, 2, 3, 0],
+        "rank_scores": [5, 3, 1, 2, 3, 0],
+    }
+    expected = {
+        Phase.ABSOLUTE: {
+            b_included: only_a,
+            b_excluded: only_a,
+        },
+        Phase.MEDIAN: {
+            b_included: {
+                "ranks": [5, 4, 1, 1, 1, 0],
+                "rank_scores": [5, 3.5, 2, 2, 2, 0],
+            },
+            b_excluded: only_a,
+        },
+        Phase.MEAN: {
+            b_included: {
+                "ranks": [5, 4, 1, 1, 1, 0],
+                "rank_scores": [5, 3.5, 2, 2, 2, 0],
+            },
+            b_excluded: only_a,
+        },
+    }
+
+    for score_method in (Phase.ABSOLUTE, Phase.MEDIAN, Phase.MEAN):
+        for b_exclusion in (True, False, None):
+            phase.score_jsonpath = "a"
+            phase.scoring_method_choice = score_method
+            phase.score_default_sort = Phase.DESCENDING
+            phase.extra_results_columns = [
+                {
+                    "path": "b",
+                    "title": "b",
+                    "order": Phase.DESCENDING,
+                }
+            ]
+
+            if b_exclusion is not None:
+                phase.extra_results_columns[0][
+                    "exclude_from_ranking"
+                ] = b_exclusion
+            else:
+                b_exclusion = False  # for the expected lookup
+
+            phase.save()
+
+            with django_assert_max_num_queries(10):
+                assert calculate_ranks(phase_pk=phase.pk) is None
+
+            assert_ranks(
+                queryset,
+                expected[score_method][b_exclusion]["ranks"],
+                expected[score_method][b_exclusion]["rank_scores"],
+            )
+
+
 @pytest.mark.django_db
 def test_results_display():
     phase = PhaseFactory()

From 8b6784ab342fc372d2b7526be0fd9333f487c9bf Mon Sep 17 00:00:00 2001
From: Chris van Run
Date: Fri, 24 May 2024 14:37:51 +0200
Subject: [PATCH 2/6] Add migration

---
 .../0053_alter_phase_extra_results_columns.py | 82 +++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 app/grandchallenge/evaluation/migrations/0053_alter_phase_extra_results_columns.py

diff --git a/app/grandchallenge/evaluation/migrations/0053_alter_phase_extra_results_columns.py b/app/grandchallenge/evaluation/migrations/0053_alter_phase_extra_results_columns.py
new file mode 100644
index 0000000000..b7473fba48
--- /dev/null
+++ b/app/grandchallenge/evaluation/migrations/0053_alter_phase_extra_results_columns.py
@@ -0,0 +1,82 @@
+# Generated by Django 4.2.11 on 2024-05-24 12:37
+
+from django.db import migrations, models
+
+import grandchallenge.core.validators
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("evaluation", "0052_phase_parent"),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name="phase",
+            name="extra_results_columns",
+            field=models.JSONField(
+                blank=True,
+                default=list,
+                help_text='A JSON object that contains the extra columns from metrics.json that will be displayed on the results page. An example that will display accuracy score with error would look like this: [{"path": "accuracy.mean","order": "asc","title": "ASSD +/- std","error_path": "accuracy.std","exclude_from_ranking": true}]',
+                validators=[
+                    grandchallenge.core.validators.JSONValidator(
+                        schema={
+                            "$schema": "http://json-schema.org/draft-06/schema#",
+                            "definitions": {},
+                            "items": {
+                                "$id": "#/items",
+                                "additionalProperties": False,
+                                "properties": {
+                                    "error_path": {
+                                        "$id": "#/items/properties/error_path",
+                                        "default": "",
+                                        "examples": ["aggregates.dice.std"],
+                                        "pattern": "^(.*)$",
+                                        "title": "The Error Path Schema",
+                                        "type": "string",
+                                    },
+                                    "exclude_from_ranking": {
+                                        "$id": "#/items/properties/exclude_from_ranking",
+                                        "default": False,
+                                        "title": "The Exclude From Ranking Schema",
+                                        "type": "boolean",
+                                    },
+                                    "order": {
+                                        "$id": "#/items/properties/order",
+                                        "default": "",
+                                        "enum": ["asc", "desc"],
+                                        "examples": ["asc"],
+                                        "pattern": "^(asc|desc)$",
+                                        "title": "The Order Schema",
+                                        "type": "string",
+                                    },
+                                    "path": {
+                                        "$id": "#/items/properties/path",
+                                        "default": "",
+                                        "examples": ["aggregates.dice.mean"],
+                                        "pattern": "^(.*)$",
+                                        "title": "The Path Schema",
+                                        "type": "string",
+                                    },
+                                    "title": {
+                                        "$id": "#/items/properties/title",
+                                        "default": "",
+                                        "examples": ["Mean Dice"],
+                                        "pattern": "^(.*)$",
+                                        "title": "The Title Schema",
+                                        "type": "string",
+                                    },
+                                },
+                                "required": ["title", "path", "order"],
+                                "title": "The Items Schema",
+                                "type": "object",
+                            },
+                            "title": "The Extra Results Columns Schema",
+                            "type": "array",
+                        }
+                    )
+                ],
+            ),
+        ),
+    ]

From 737a671322197e691b698dfc85728a20f6b3d679 Mon Sep 17 00:00:00 2001
From: Chris van Run
Date: Sun, 26 May 2024 11:05:30 +0200
Subject: [PATCH 3/6] Refactor test_calculate_ranks

---
 app/tests/evaluation_tests/test_utils.py | 203 +++++++++++++----------
 1 file changed, 115 insertions(+), 88 deletions(-)

diff --git a/app/tests/evaluation_tests/test_utils.py b/app/tests/evaluation_tests/test_utils.py
index a2aa3ff5d0..eb28215610 100644
--- a/app/tests/evaluation_tests/test_utils.py
+++ b/app/tests/evaluation_tests/test_utils.py
@@ -11,12 +11,108 @@
 @pytest.mark.django_db
-def test_calculate_ranks(django_assert_max_num_queries):
+@pytest.mark.parametrize(
+    "score_method, a_order, b_order, expected_ranks, expected_rank_scores",
+    (
+        (
+            Phase.ABSOLUTE,
+            Phase.DESCENDING,
+            Phase.DESCENDING,
+            [5, 3, 1, 2, 3, 0, 0, 0],
+            [5, 3, 1, 2, 3, 0, 0, 0],
+        ),
+        (
+            Phase.ABSOLUTE,
+            Phase.DESCENDING,
+            Phase.ASCENDING,
+            [5, 3, 1, 2, 3, 0, 0, 0],
+            [5, 3, 1, 2, 3, 0, 0, 0],
+        ),
+        (
+            Phase.MEDIAN,
+            Phase.DESCENDING,
+            Phase.DESCENDING,
+            [5, 4, 1, 1, 1, 0, 0, 0],
+            [5, 3.5, 2, 2, 2, 0, 0, 0],
+        ),
+        (
+            Phase.MEDIAN,
+            Phase.DESCENDING,
+            Phase.ASCENDING,
+            [3, 2, 1, 3, 5, 0, 0, 0],
+            [3, 2.5, 2, 3, 4, 0, 0, 0],
+        ),
+        (
+            Phase.MEAN,
+            Phase.DESCENDING,
+            Phase.DESCENDING,
+            [5, 4, 1, 1, 1, 0, 0, 0],
+            [5, 3.5, 2, 2, 2, 0, 0, 0],
+        ),
+        (
+            Phase.MEAN,
+            Phase.DESCENDING,
+            Phase.ASCENDING,
+            [3, 2, 1, 3, 5, 0, 0, 0],
+            [3, 2.5, 2, 3, 4, 0, 0, 0],
+        ),
+        (
+            Phase.ABSOLUTE,
+            Phase.ASCENDING,
+            Phase.DESCENDING,
+            [1, 2, 5, 4, 2, 0, 0, 0],
+            [1, 2, 5, 4, 2, 0, 0, 0],
+        ),
+        (
+            Phase.ABSOLUTE,
+            Phase.ASCENDING,
+            Phase.ASCENDING,
+            [1, 2, 5, 4, 2, 0, 0, 0],
+            [1, 2, 5, 4, 2, 0, 0, 0],
+        ),
+        (
+            Phase.MEDIAN,
+            Phase.ASCENDING,
+            Phase.DESCENDING,
+            [2, 2, 5, 2, 1, 0, 0, 0],
+            [3, 3, 4, 3, 1.5, 0, 0, 0],
+        ),
+        (
+            Phase.MEDIAN,
+            Phase.ASCENDING,
+            Phase.ASCENDING,
+            [1, 2, 4, 4, 3, 0, 0, 0],
+            [1, 2, 4, 4, 3.5, 0, 0, 0],
+        ),
+        (
+            Phase.MEAN,
+            Phase.ASCENDING,
+            Phase.DESCENDING,
+            [2, 2, 5, 2, 1, 0, 0, 0],
+            [3, 3, 4, 3, 1.5, 0, 0, 0],
+        ),
+        (
+            Phase.MEAN,
+            Phase.ASCENDING,
+            Phase.ASCENDING,
+            [1, 2, 4, 4, 3, 0, 0, 0],
+            [1, 2, 4, 4, 3.5, 0, 0, 0],
+        ),
+    ),
+)
+def test_calculate_ranks(
+    django_assert_max_num_queries,
+    score_method,
+    a_order,
+    b_order,
+    expected_ranks,
+    expected_rank_scores,
+):
     phase = PhaseFactory()

     results = [
-        # Warning: Do not change this values without updating the
-        # expected_ranks below.
+        # Warning: Do not change these values without updating the
+        # expected above.
         {"a": 0.0, "b": 0.0},
         {"a": 0.5, "b": 0.2},
         {"a": 1.0, "b": 0.3},
         {"a": 0.7, "b": 0.4},
         {"a": 0.5, "b": 0.5},
@@ -48,92 +144,23 @@ def test_calculate_ranks(django_assert_max_num_queries):
     queryset[-1].published = False
     queryset[-1].save()

-    expected = {
-        Phase.DESCENDING: {
-            Phase.ABSOLUTE: {
-                Phase.DESCENDING: {
-                    "ranks": [5, 3, 1, 2, 3, 0, 0, 0],
-                    "rank_scores": [5, 3, 1, 2, 3, 0, 0, 0],
-                },
-                Phase.ASCENDING: {
-                    "ranks": [5, 3, 1, 2, 3, 0, 0, 0],
-                    "rank_scores": [5, 3, 1, 2, 3, 0, 0, 0],
-                },
-            },
-            Phase.MEDIAN: {
-                Phase.DESCENDING: {
-                    "ranks": [5, 4, 1, 1, 1, 0, 0, 0],
-                    "rank_scores": [5, 3.5, 2, 2, 2, 0, 0, 0],
-                },
-                Phase.ASCENDING: {
-                    "ranks": [3, 2, 1, 3, 5, 0, 0, 0],
-                    "rank_scores": [3, 2.5, 2, 3, 4, 0, 0, 0],
-                },
-            },
-            Phase.MEAN: {
-                Phase.DESCENDING: {
-                    "ranks": [5, 4, 1, 1, 1, 0, 0, 0],
-                    "rank_scores": [5, 3.5, 2, 2, 2, 0, 0, 0],
-                },
-                Phase.ASCENDING: {
-                    "ranks": [3, 2, 1, 3, 5, 0, 0, 0],
-                    "rank_scores": [3, 2.5, 2, 3, 4, 0, 0, 0],
-                },
-            },
-        },
-        Phase.ASCENDING: {
-            Phase.ABSOLUTE: {
-                Phase.DESCENDING: {
-                    "ranks": [1, 2, 5, 4, 2, 0, 0, 0],
-                    "rank_scores": [1, 2, 5, 4, 2, 0, 0, 0],
-                },
-                Phase.ASCENDING: {
-                    "ranks": [1, 2, 5, 4, 2, 0, 0, 0],
-                    "rank_scores": [1, 2, 5, 4, 2, 0, 0, 0],
-                },
-            },
-            Phase.MEDIAN: {
-                Phase.DESCENDING: {
-                    "ranks": [2, 2, 5, 2, 1, 0, 0, 0],
-                    "rank_scores": [3, 3, 4, 3, 1.5, 0, 0, 0],
-                },
-                Phase.ASCENDING: {
-                    "ranks": [1, 2, 4, 4, 3, 0, 0, 0],
-                    "rank_scores": [1, 2, 4, 4, 3.5, 0, 0, 0],
-                },
-            },
-            Phase.MEAN: {
-                Phase.DESCENDING: {
-                    "ranks": [2, 2, 5, 2, 1, 0, 0, 0],
-                    "rank_scores": [3, 3, 4, 3, 1.5, 0, 0, 0],
-                },
-                Phase.ASCENDING: {
-                    "ranks": [1, 2, 4, 4, 3, 0, 0, 0],
-                    "rank_scores": [1, 2, 4, 4, 3.5, 0, 0, 0],
-                },
-            },
-        },
-    }
-
-    for score_method in (Phase.ABSOLUTE, Phase.MEDIAN, Phase.MEAN):
-        for a_order in (Phase.DESCENDING, Phase.ASCENDING):
-            for b_order in (Phase.DESCENDING, Phase.ASCENDING):
-                phase.score_jsonpath = "a"
-                phase.scoring_method_choice = score_method
-                phase.score_default_sort = a_order
-                phase.extra_results_columns = [
-                    {"path": "b", "title": "b", "order": b_order}
-                ]
-                phase.save()
-
-                with django_assert_max_num_queries(10):
-                    calculate_ranks(phase_pk=phase.pk)
-
-                assert_ranks(
-                    queryset,
-                    expected[a_order][score_method][b_order]["ranks"],
-                    expected[a_order][score_method][b_order]["rank_scores"],
-                )
+    # Setup Phase
+    phase.score_jsonpath = "a"
+    phase.scoring_method_choice = score_method
+    phase.score_default_sort = a_order
+    phase.extra_results_columns = [
+        {"path": "b", "title": "b", "order": b_order}
+    ]
+    phase.save()
+
+    with django_assert_max_num_queries(10):
+        calculate_ranks(phase_pk=phase.pk)
+
+    assert_ranks(
+        queryset,
+        expected_ranks,
+        expected_rank_scores,
+    )


 @pytest.mark.django_db

From 288473dc00800bb9738a309d0f36cb8900f43cb4 Mon Sep 17 00:00:00 2001
From: Chris van Run
Date: Sun, 26 May 2024 11:18:01 +0200
Subject: [PATCH 4/6] Refactor test_calculate_ranks_with_exclusion

---
 app/tests/evaluation_tests/test_utils.py | 162 +++++++++++++++--------
 1 file changed, 105 insertions(+), 57 deletions(-)

diff --git a/app/tests/evaluation_tests/test_utils.py b/app/tests/evaluation_tests/test_utils.py
index eb28215610..e7be69facb 100644
--- a/app/tests/evaluation_tests/test_utils.py
+++ b/app/tests/evaluation_tests/test_utils.py
@@ -164,12 +164,103 @@
 @pytest.mark.django_db
-def test_calculate_ranks_with_exclusion(django_assert_max_num_queries):
+@pytest.mark.parametrize(
+    "score_method, extra_results_columns, expected_ranks, expected_rank_scores",
+    (
+        (
+            Phase.ABSOLUTE,
+            [
+                b_default := {
+                    "path": "b",
+                    "title": "b",
+                    "order": Phase.DESCENDING,
+                    "exclude_from_ranking": False,
+                }
+            ],
+            only_a_expected_ranks := [5, 3, 1, 2, 3, 0],
+            only_a_expected_rank_score := [5, 3, 1, 2, 3, 0],
+        ),
+        (
+            Phase.ABSOLUTE,
+            [
+                {
+                    **b_default,
+                    "exclude_from_ranking": True,
+                }
+            ],
+            only_a_expected_ranks,
+            only_a_expected_rank_score,
+        ),
+        (
+            Phase.MEDIAN,
+            [
+                {
+                    **b_default,
+                    "exclude_from_ranking": False,
+                }
+            ],
+            [5, 4, 1, 1, 1, 0],
+            [5, 3.5, 2, 2, 2, 0],
+        ),
+        (
+            Phase.MEDIAN,
+            [
+                {
+                    **b_default,
+                    "exclude_from_ranking": True,
+                }
+            ],
+            only_a_expected_ranks,
+            only_a_expected_rank_score,
+        ),
+        (
+            Phase.MEAN,
+            [
+                {
+                    **b_default,
+                    "exclude_from_ranking": False,
+                }
+            ],
+            [5, 4, 1, 1, 1, 0],
+            [5, 3.5, 2, 2, 2, 0],
+        ),
+        (
+            Phase.MEAN,
+            [
+                {
+                    **b_default,
+                    "exclude_from_ranking": True,
+                }
+            ],
+            only_a_expected_ranks,
+            only_a_expected_rank_score,
+        ),
+        (
+            Phase.MEAN,
+            [
+                {  # Check if by default it is not excluded from ranking
+                    "path": "b",
+                    "title": "b",
+                    "order": Phase.DESCENDING,
+                }
+            ],
+            [5, 4, 1, 1, 1, 0],
+            [5, 3.5, 2, 2, 2, 0],
+        ),
+    ),
+)
+def test_calculate_ranks_with_exclusion(
+    django_assert_max_num_queries,
+    score_method,
+    extra_results_columns,
+    expected_ranks,
+    expected_rank_scores,
+):
     phase = PhaseFactory()

     results = [
         # Warning: Do not change this values without updating the
-        # expected_ranks below.
+        # expected_ranks/expected_rank_scores above.
         {"a": 0.0, "b": 0.0},
         {"a": 0.5, "b": 0.2},
         {"a": 1.0, "b": 0.3},
         {"a": 0.7, "b": 0.4},
@@ -193,64 +284,21 @@ def test_calculate_ranks_with_exclusion(django_assert_max_num_queries):
             )
         )

-    b_included = False
-    b_excluded = True
-
-    only_a = {
-        "ranks": [5, 3, 1, 2, 3, 0],
-        "rank_scores": [5, 3, 1, 2, 3, 0],
-    }
-    expected = {
-        Phase.ABSOLUTE: {
-            b_included: only_a,
-            b_excluded: only_a,
-        },
-        Phase.MEDIAN: {
-            b_included: {
-                "ranks": [5, 4, 1, 1, 1, 0],
-                "rank_scores": [5, 3.5, 2, 2, 2, 0],
-            },
-            b_excluded: only_a,
-        },
-        Phase.MEAN: {
-            b_included: {
-                "ranks": [5, 4, 1, 1, 1, 0],
-                "rank_scores": [5, 3.5, 2, 2, 2, 0],
-            },
-            b_excluded: only_a,
-        },
-    }
-
-    for score_method in (Phase.ABSOLUTE, Phase.MEDIAN, Phase.MEAN):
-        for b_exclusion in (True, False, None):
-            phase.score_jsonpath = "a"
-            phase.scoring_method_choice = score_method
-            phase.score_default_sort = Phase.DESCENDING
-            phase.extra_results_columns = [
-                {
-                    "path": "b",
-                    "title": "b",
-                    "order": Phase.DESCENDING,
-                }
-            ]
-
-            if b_exclusion is not None:
-                phase.extra_results_columns[0][
-                    "exclude_from_ranking"
-                ] = b_exclusion
-            else:
-                b_exclusion = False  # for the expected lookup
+    phase.score_jsonpath = "a"
+    phase.scoring_method_choice = score_method
+    phase.score_default_sort = Phase.DESCENDING
+    phase.extra_results_columns = extra_results_columns

-            phase.save()
+    phase.save()

-            with django_assert_max_num_queries(10):
-                assert calculate_ranks(phase_pk=phase.pk) is None
+    with django_assert_max_num_queries(10):
+        assert calculate_ranks(phase_pk=phase.pk) is None

-            assert_ranks(
-                queryset,
-                expected[score_method][b_exclusion]["ranks"],
-                expected[score_method][b_exclusion]["rank_scores"],
-            )
+    assert_ranks(
+        queryset,
+        expected_ranks,
+        expected_rank_scores,
+    )


 @pytest.mark.django_db

From 20f13c3f6680e8c13ee2744d1715081ec42cb547 Mon Sep 17 00:00:00 2001
From: Chris van Run
Date: Sun, 26 May 2024 11:49:15 +0200
Subject: [PATCH 5/6] Ensure rank excluded metrics are not UI decorated on leaderboards

---
 .../evaluation/templates/evaluation/leaderboard_row.html | 2 +-
 app/grandchallenge/evaluation/views/__init__.py          | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/app/grandchallenge/evaluation/templates/evaluation/leaderboard_row.html b/app/grandchallenge/evaluation/templates/evaluation/leaderboard_row.html
index 3de30d8c76..1d25ec8532 100644
--- a/app/grandchallenge/evaluation/templates/evaluation/leaderboard_row.html
+++ b/app/grandchallenge/evaluation/templates/evaluation/leaderboard_row.html
@@ -67,7 +67,7 @@
                     ±
                    {{ object.metrics_json_file|get_jsonpath:col.error_path|floatformat:object.submission.phase.score_decimal_places }}
                {% endif %}
-               {% if object.submission.phase.scoring_method_choice != object.submission.phase.ABSOLUTE %}
+               {% if object.submission.phase.scoring_method_choice != object.submission.phase.ABSOLUTE and not col.exclude_from_ranking %}
                    (
                    {{ object.rank_per_metric|get_key:col.path }}
                    )
diff --git a/app/grandchallenge/evaluation/views/__init__.py b/app/grandchallenge/evaluation/views/__init__.py
index 4c69602515..b1423bf36c 100644
--- a/app/grandchallenge/evaluation/views/__init__.py
+++ b/app/grandchallenge/evaluation/views/__init__.py
@@ -719,6 +719,7 @@ def columns(self):
                         c["title"]
                         if self.phase.scoring_method_choice
                         == self.phase.ABSOLUTE
+                        or c.get("exclude_from_ranking", False)
                         else f"{c['title']} (Position)"
                     ),
                     sort_field="rank",

From 547cb577f8dfd63fa393007389918cf0f935beac Mon Sep 17 00:00:00 2001
From: James Meakin <12661555+jmsmkn@users.noreply.github.com>
Date: Mon, 27 May 2024 11:19:34 +0200
Subject: [PATCH 6/6] More explicit

---
 app/tests/evaluation_tests/test_utils.py | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/app/tests/evaluation_tests/test_utils.py b/app/tests/evaluation_tests/test_utils.py
index e7be69facb..3ecc093536 100644
--- a/app/tests/evaluation_tests/test_utils.py
+++ b/app/tests/evaluation_tests/test_utils.py
@@ -170,7 +170,7 @@
         (
             Phase.ABSOLUTE,
             [
-                b_default := {
+                {
                     "path": "b",
                     "title": "b",
                     "order": Phase.DESCENDING,
@@ -184,7 +184,9 @@
         (
             Phase.ABSOLUTE,
             [
                 {
-                    **b_default,
+                    "path": "b",
+                    "title": "b",
+                    "order": Phase.DESCENDING,
                     "exclude_from_ranking": True,
                 }
             ],
@@ -195,7 +197,9 @@
         (
             Phase.MEDIAN,
             [
                 {
-                    **b_default,
+                    "path": "b",
+                    "title": "b",
+                    "order": Phase.DESCENDING,
                     "exclude_from_ranking": False,
                 }
             ],
@@ -206,7 +210,9 @@
         (
             Phase.MEDIAN,
             [
                 {
-                    **b_default,
+                    "path": "b",
+                    "title": "b",
+                    "order": Phase.DESCENDING,
                     "exclude_from_ranking": True,
                 }
             ],
@@ -217,7 +223,9 @@
         (
             Phase.MEAN,
             [
                 {
-                    **b_default,
+                    "path": "b",
+                    "title": "b",
+                    "order": Phase.DESCENDING,
                     "exclude_from_ranking": False,
                 }
             ],
@@ -228,7 +236,9 @@
         (
             Phase.MEAN,
             [
                 {
-                    **b_default,
+                    "path": "b",
+                    "title": "b",
+                    "order": Phase.DESCENDING,
                     "exclude_from_ranking": True,
                 }
             ],