From 003fb112bd1eaf8826a6fbe586c235f473e158c4 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 12 Oct 2023 11:27:05 +0200 Subject: [PATCH 1/4] Sanitize indexes in tuple keys when flattening dicts When we "flatten" data dicts in the dictization layer we transform keys like `extras__0__key` and `extras__0__value` to `('extras', 0, 'key')` and `('extras', 0, 'key')`. As these keys are external inputs (generally sent by the UI dataset form, but can also be sent directly with a crafted POST request) we need to perform some validation. More specifically: * They are all ints * They are all sequential, starting from 0 We keep the original order whenever possible, even if the input keys are not sequential. Added missing tests. --- ckan/logic/__init__.py | 26 ++++++++++- ckan/tests/logic/test_logic.py | 81 ++++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+), 2 deletions(-) diff --git a/ckan/logic/__init__.py b/ckan/logic/__init__.py index 90b9f96cf19..79c6ab58d4b 100644 --- a/ckan/logic/__init__.py +++ b/ckan/logic/__init__.py @@ -24,7 +24,7 @@ from ckan.common import _, g from ckan.types import ( Action, ChainedAction, - ChainedAuthFunction, DataDict, ErrorDict, Context, FlattenDataDict, + ChainedAuthFunction, DataDict, ErrorDict, Context, FlattenDataDict, FlattenKey, Schema, Validator, ValidatorFactory) Decorated = TypeVar("Decorated") @@ -255,7 +255,29 @@ def tuplize_dict(data_dict: dict[str, Any]) -> FlattenDataDict: except ValueError: raise df.DataError('Bad key') tuplized_dict[tuple(key_list)] = value - return tuplized_dict + + # Sanitize key indexes to make sure they are sequential + seq_tuplized_dict: FlattenDataDict = {} + new_index = -1 + last_group = None + last_index = None + tuple_key: FlattenKey + for tuple_key in sorted(tuplized_dict.keys()): + if len(tuple_key) != 3: + seq_tuplized_dict[tuple_key] = tuplized_dict[tuple_key] + else: + if last_group != tuple_key[0]: + new_index = 0 + elif last_index != tuple_key[1]: + new_index += 1 + + new_key = (tuple_key[0], new_index, tuple_key[2]) + seq_tuplized_dict[new_key] = tuplized_dict[tuple_key] + + last_group = tuple_key[0] + last_index = tuple_key[1] + + return seq_tuplized_dict def untuplize_dict(tuplized_dict: FlattenDataDict) -> dict[str, Any]: diff --git a/ckan/tests/logic/test_logic.py b/ckan/tests/logic/test_logic.py index 9cd713f68d9..9b486c2f904 100644 --- a/ckan/tests/logic/test_logic.py +++ b/ckan/tests/logic/test_logic.py @@ -3,6 +3,8 @@ from unittest import mock import pytest from ckan import logic, model +import ckan.lib.navl.dictization_functions as df + from ckan.types import Context import ckan.tests.factories as factories @@ -112,3 +114,82 @@ def test_check_access_auth_user_for_different_objects(): with pytest.raises(logic.NotAuthorized): for dataset in dataset3: logic.check_access("package_show", context, {'id': dataset["id"]}) + + +def test_tuplize_dict(): + + data_dict = { + "author": "Test Author", + "extras__0__key": "extra1", + "extras__0__value": "value1", + "extras__1__key": "extra2", + "extras__1__value": "value2", + "extras__2__key": "extra3", + "extras__2__value": "value3", + "extras__3__key": "", + "extras__3__value": "", + "groups__0__id": "5a65eae8-ef2b-4a85-8022-d9e5a71ad074", + "name": "test-title", + "notes": "Test desc", + "owner_org": "5a65eae8-ef2b-4a85-8022-d9e5a71ad074", + "private": "True", + "tag_string": "economy,climate", + "title": "Test title", + } + + expected = { + ("author",): "Test Author", + ("extras", 0, "key"): "extra1", + ("extras", 0, "value"): "value1", + ("extras", 1, "key"): "extra2", + ("extras", 1, "value"): "value2", + ("extras", 2, "key"): "extra3", + ("extras", 2, "value"): "value3", + ("extras", 3, "key"): "", + ("extras", 3, "value"): "", + ("groups", 0, "id"): "5a65eae8-ef2b-4a85-8022-d9e5a71ad074", + ("name",): "test-title", + ("notes",): "Test desc", + ("owner_org",): "5a65eae8-ef2b-4a85-8022-d9e5a71ad074", + ("private",): "True", + ("tag_string",): "economy,climate", + ("title",): "Test title", + } + + assert logic.tuplize_dict(data_dict) == expected + + +def test_tuplize_dict_random_indexes(): + + data_dict = { + "extras__22__key": "extra2", + "extras__22__value": "value2", + "extras__1__key": "extra1", + "extras__1__value": "value1", + "extras__245566546__key": "extra3", + "extras__245566546__value": "value3", + "groups__13__id": "group2", + "groups__1__id": "group1", + } + + expected = { + ("extras", 0, "key"): "extra1", + ("extras", 0, "value"): "value1", + ("extras", 1, "key"): "extra2", + ("extras", 1, "value"): "value2", + ("extras", 2, "key"): "extra3", + ("extras", 2, "value"): "value3", + ("groups", 0, "id"): "group1", + ("groups", 1, "id"): "group2", + } + + assert logic.tuplize_dict(data_dict) == expected + + +def test_tuplize_dict_wrong_index(): + + with pytest.raises(df.DataError): + data_dict = { + "extras__2a__key": "extra", + } + logic.tuplize_dict(data_dict) From 861f55f58d4cc1a3ba8f30f3492b328c387b7b29 Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 17 Oct 2023 14:46:48 +0200 Subject: [PATCH 2/4] Trigger Build From 2789e8edb553c8cebf33604152f7d2594aa30c32 Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 11 Dec 2023 11:34:19 +0100 Subject: [PATCH 3/4] Use alternative implementation to support nested keys Co-authored-by: Sergey --- ckan/logic/__init__.py | 42 +++++++++++++++++++--------------- ckan/tests/logic/test_logic.py | 5 ++++ 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/ckan/logic/__init__.py b/ckan/logic/__init__.py index 79c6ab58d4b..4e327950421 100644 --- a/ckan/logic/__init__.py +++ b/ckan/logic/__init__.py @@ -258,24 +258,30 @@ def tuplize_dict(data_dict: dict[str, Any]) -> FlattenDataDict: # Sanitize key indexes to make sure they are sequential seq_tuplized_dict: FlattenDataDict = {} - new_index = -1 - last_group = None - last_index = None - tuple_key: FlattenKey - for tuple_key in sorted(tuplized_dict.keys()): - if len(tuple_key) != 3: - seq_tuplized_dict[tuple_key] = tuplized_dict[tuple_key] - else: - if last_group != tuple_key[0]: - new_index = 0 - elif last_index != tuple_key[1]: - new_index += 1 - - new_key = (tuple_key[0], new_index, tuple_key[2]) - seq_tuplized_dict[new_key] = tuplized_dict[tuple_key] - - last_group = tuple_key[0] - last_index = tuple_key[1] + # sequential field indexes grouped by common prefix + groups: dict[FlattenKey, dict[FlattenKey, int]] = defaultdict(dict) + for key in sorted(tuplized_dict.keys()): + new_key = key + + # iterate over even(numeric) parts of the key + for idx in range(1, len(key), 2): + # narrow down scope by common prefix + group = groups[key[:idx]] + + # if the identifier(i.e `(extra, 123)`, `(resource, 9)`) is met for + # the first time, generate for it next number in the index + # sequence. Index of the latest added item is always equals to the + # number of unique identifiers minus one(because list indexation + # starts from 0 in Python). If identifier already present(i.e, we + # process `(extra, 10, VALUE)` after processing `(extra, 10, + # KEY)`), reuse sequential index of this identifier + seq_index = group.setdefault(key[idx-1:idx+1], len(group)) + + # replace the currently processed key segment with computed + # sequential index + new_key = new_key[:idx] + (seq_index,) + new_key[idx+1:] + + seq_tuplized_dict[new_key] = tuplized_dict[key] return seq_tuplized_dict diff --git a/ckan/tests/logic/test_logic.py b/ckan/tests/logic/test_logic.py index 9b486c2f904..cc424bafbe5 100644 --- a/ckan/tests/logic/test_logic.py +++ b/ckan/tests/logic/test_logic.py @@ -170,6 +170,9 @@ def test_tuplize_dict_random_indexes(): "extras__245566546__value": "value3", "groups__13__id": "group2", "groups__1__id": "group1", + "groups__13__nested__7__name": "latter", + "groups__13__nested__2__name": "former", + } expected = { @@ -181,6 +184,8 @@ def test_tuplize_dict_random_indexes(): ("extras", 2, "value"): "value3", ("groups", 0, "id"): "group1", ("groups", 1, "id"): "group2", + ("groups", 1, "nested", 0, "name"): "former", + ("groups", 1, "nested", 1, "name"): "latter", } assert logic.tuplize_dict(data_dict) == expected From 9393c53e5e321e0af52da6f97e479df9b91cfb74 Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 11 Dec 2023 11:54:56 +0100 Subject: [PATCH 4/4] lint --- ckan/logic/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ckan/logic/__init__.py b/ckan/logic/__init__.py index 4e327950421..7617a3889b7 100644 --- a/ckan/logic/__init__.py +++ b/ckan/logic/__init__.py @@ -24,8 +24,9 @@ from ckan.common import _, g from ckan.types import ( Action, ChainedAction, - ChainedAuthFunction, DataDict, ErrorDict, Context, FlattenDataDict, FlattenKey, - Schema, Validator, ValidatorFactory) + ChainedAuthFunction, DataDict, ErrorDict, Context, FlattenDataDict, + FlattenKey, Schema, Validator, ValidatorFactory +) Decorated = TypeVar("Decorated")