From 9e5037a53591c4cfe4b2ae7f073d68f11a4ec93a Mon Sep 17 00:00:00 2001 From: mjanez <96422458+mjanez@users.noreply.github.com> Date: Mon, 19 Aug 2024 12:23:38 +0200 Subject: [PATCH] Add allow_private_datasets to ckan harvester Allow retrieval of private datasets if config value is True. --- ckanext/schemingdcat/harvesters/base.py | 6 +++++- ckanext/schemingdcat/harvesters/ckan.py | 28 ++++++++++++++++++++----- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/ckanext/schemingdcat/harvesters/base.py b/ckanext/schemingdcat/harvesters/base.py index 2861beac..f0feb20a 100644 --- a/ckanext/schemingdcat/harvesters/base.py +++ b/ckanext/schemingdcat/harvesters/base.py @@ -536,6 +536,7 @@ def get_extra_value(extras, key): def apply_field_mapping(d, mapping): new_dict = {} + #TODO: If not field_mapping, items() fails. for local_field, remote_info in mapping.items(): if 'field_name' in remote_info: remote_field = remote_info['field_name'] @@ -1107,8 +1108,11 @@ def _set_translated_fields(self, package_dict): translated_fields["dataset_fields"].append(local_field_name) if isinstance(remote_field_name, dict): + local_field_value = package_dict.get(local_field_name, {}) + if not isinstance(local_field_value, dict): + local_field_value = {} package_dict[local_field_name] = { - lang: package_dict.get(name, package_dict.get(local_field_name, {}).get(lang)) + lang: package_dict.get(name, local_field_value.get(lang)) for lang, name in remote_field_name.items() } if local_field_name.endswith('_translated'): diff --git a/ckanext/schemingdcat/harvesters/ckan.py b/ckanext/schemingdcat/harvesters/ckan.py index de1b1922..6a3d6e99 100644 --- a/ckanext/schemingdcat/harvesters/ckan.py +++ b/ckanext/schemingdcat/harvesters/ckan.py @@ -106,11 +106,12 @@ def validate_config(self, config): f'schema should be one of: {", ".join(self._supported_schemas)}. Current dataset schema: {self._local_schema_name}' ) else: - raise ValueError( - f"Config schema should match the local schema: '{self._local_schema_name}'. " - f"Check the remote schema with CKAN API: {{ckan_site_url}}/api/3/action/scheming_dataset_schema_show?type=dataset, " - f"or specify the local schema, and the harvester will try to map the fields." - ) + if self._local_schema_name.lower().strip() != schema.lower().strip(): + raise ValueError( + f"Config 'schema': {schema} should match the local schema: '{self._local_schema_name}'. " + f"Check the remote schema with CKAN API: {{ckan_site_url}}/api/3/action/scheming_dataset_schema_show?type=dataset, " + f"or specify the local schema '{self._local_schema_name}', and the harvester will try to map the fields." + ) config = json.dumps({**config_obj, "schema": schema.lower().strip()}) @@ -124,6 +125,18 @@ def validate_config(self, config): ): config = json.dumps({**config_obj, "allow_harvest_datasets": False}) + # Check if 'allow_private_datasets' is not in the config_obj or is not a boolean + if "allow_private_datasets" in config_obj: + if "api_key" in config_obj: + if not isinstance(config_obj["allow_private_datasets"], bool): + config = json.dumps({**config_obj, "allow_private_datasets": False}) + else: + raise ValueError( + "'api_key' is needed to using 'allow_private_datasets'" + ) + else: + config = json.dumps({**config_obj, "allow_private_datasets": False}) + # Check remote_orgs and remote_groups == only_local, if not, put # remote_orgs and remote_groups to only_local if ( @@ -376,7 +389,12 @@ def _search_for_datasets(self, remote_ckan_base_url, fq_terms=None): # they will harvested the next time anyway. When datasets are added, # we are at risk of seeing datasets twice in the paging, so we detect # and remove any duplicates. + params["include_private"] = False params["sort"] = "id asc" + + if self.config.get("allow_private_datasets", False): + params["include_private"] = True + if fq_terms: params["fq"] = " ".join(fq_terms)