Merge pull request #98 from mjanez/feature/harvester-private-datasets
Add allow_private_datasets to ckan harvester
mjanez authored Aug 19, 2024
2 parents 28c3d3d + 9e5037a commit dd75b3f
Showing 2 changed files with 28 additions and 6 deletions.
6 changes: 5 additions & 1 deletion ckanext/schemingdcat/harvesters/base.py
@@ -536,6 +536,7 @@ def get_extra_value(extras, key):

def apply_field_mapping(d, mapping):
new_dict = {}
# TODO: If field_mapping is not provided, items() fails.
for local_field, remote_info in mapping.items():
if 'field_name' in remote_info:
remote_field = remote_info['field_name']
@@ -1107,8 +1108,11 @@ def _set_translated_fields(self, package_dict):
translated_fields["dataset_fields"].append(local_field_name)

if isinstance(remote_field_name, dict):
local_field_value = package_dict.get(local_field_name, {})
if not isinstance(local_field_value, dict):
local_field_value = {}
package_dict[local_field_name] = {
lang: package_dict.get(name, package_dict.get(local_field_name, {}).get(lang))
lang: package_dict.get(name, local_field_value.get(lang))
for lang, name in remote_field_name.items()
}
if local_field_name.endswith('_translated'):
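The base.py change above guards _set_translated_fields against a non-dict value already stored under the local field name. A minimal sketch of the resolution logic with hypothetical package data (field names and values are illustrative, not taken from the harvester):

    # Illustrative only: how a dict-valued remote_field_name resolves per-language values.
    package_dict = {"title": "Dataset", "title_es": "Conjunto de datos"}   # hypothetical remote metadata
    remote_field_name = {"en": "title", "es": "title_es"}                  # per-language field mapping
    local_field_name = "title_translated"

    # Guard added by this commit: fall back to {} if the existing value is not a dict.
    local_field_value = package_dict.get(local_field_name, {})
    if not isinstance(local_field_value, dict):
        local_field_value = {}

    package_dict[local_field_name] = {
        lang: package_dict.get(name, local_field_value.get(lang))
        for lang, name in remote_field_name.items()
    }
    # package_dict["title_translated"] == {"en": "Dataset", "es": "Conjunto de datos"}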
28 changes: 23 additions & 5 deletions ckanext/schemingdcat/harvesters/ckan.py
@@ -106,11 +106,12 @@ def validate_config(self, config):
f'schema should be one of: {", ".join(self._supported_schemas)}. Current dataset schema: {self._local_schema_name}'
)
else:
raise ValueError(
f"Config schema should match the local schema: '{self._local_schema_name}'. "
f"Check the remote schema with CKAN API: {{ckan_site_url}}/api/3/action/scheming_dataset_schema_show?type=dataset, "
f"or specify the local schema, and the harvester will try to map the fields."
)
if self._local_schema_name.lower().strip() != schema.lower().strip():
raise ValueError(
f"Config 'schema': {schema} should match the local schema: '{self._local_schema_name}'. "
f"Check the remote schema with CKAN API: {{ckan_site_url}}/api/3/action/scheming_dataset_schema_show?type=dataset, "
f"or specify the local schema '{self._local_schema_name}', and the harvester will try to map the fields."
)

config = json.dumps({**config_obj, "schema": schema.lower().strip()})

@@ -124,6 +125,18 @@ def validate_config(self, config):
):
config = json.dumps({**config_obj, "allow_harvest_datasets": False})

# 'allow_private_datasets' requires 'api_key'; default to False if it is missing or not a boolean
if "allow_private_datasets" in config_obj:
if "api_key" in config_obj:
if not isinstance(config_obj["allow_private_datasets"], bool):
config = json.dumps({**config_obj, "allow_private_datasets": False})
else:
raise ValueError(
"'api_key' is needed to using 'allow_private_datasets'"
)
else:
config = json.dumps({**config_obj, "allow_private_datasets": False})

# Check remote_orgs and remote_groups == only_local, if not, put
# remote_orgs and remote_groups to only_local
if (
@@ -376,7 +389,12 @@ def _search_for_datasets(self, remote_ckan_base_url, fq_terms=None):
# they will be harvested the next time anyway. When datasets are added,
# we are at risk of seeing datasets twice in the paging, so we detect
# and remove any duplicates.
params["include_private"] = False
params["sort"] = "id asc"

if self.config.get("allow_private_datasets", False):
params["include_private"] = True

if fq_terms:
params["fq"] = " ".join(fq_terms)

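For reference, a harvester source configuration exercising the new option might look like the sketch below. Only 'schema', 'api_key', 'allow_private_datasets', 'allow_harvest_datasets', 'remote_orgs' and 'remote_groups' appear in this diff; the concrete values are placeholders, not defaults documented by the extension:

    # Sketch of a harvester source config using the new option (values are placeholders).
    import json

    config = json.dumps({
        "schema": "dataset",             # must match the local scheming dataset schema
        "api_key": "<ckan-api-key>",     # required: validate_config rejects 'allow_private_datasets' without it
        "allow_private_datasets": True,  # adds include_private=True to the package_search params
        "allow_harvest_datasets": False,
        "remote_orgs": "only_local",
        "remote_groups": "only_local",
    })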
