From 1dccef720feab2cfa3d0b630aed6a60402c4b466 Mon Sep 17 00:00:00 2001
From: Helen Lin <46795546+helen-m-lin@users.noreply.github.com>
Date: Tue, 2 Jul 2024 16:40:24 -0700
Subject: [PATCH 1/3] feat: add docdb utils (#57)

* feat: add docdb utils
---
 src/aind_data_access_api/utils.py             | 214 ++++++++++++++++
 .../resources/utils/example_metadata.nd.json  | 116 +++++++++
 .../resources/utils/example_metadata1.nd.json | 131 ++++++++++
 .../resources/utils/example_metadata2.nd.json | 209 ++++++++++++++++
 tests/test_utils.py                           | 234 ++++++++++++++++++
 5 files changed, 904 insertions(+)
 create mode 100644 src/aind_data_access_api/utils.py
 create mode 100644 tests/resources/utils/example_metadata.nd.json
 create mode 100644 tests/resources/utils/example_metadata1.nd.json
 create mode 100644 tests/resources/utils/example_metadata2.nd.json
 create mode 100644 tests/test_utils.py

diff --git a/src/aind_data_access_api/utils.py b/src/aind_data_access_api/utils.py
new file mode 100644
index 0000000..f884b7e
--- /dev/null
+++ b/src/aind_data_access_api/utils.py
@@ -0,0 +1,214 @@
+"""Package for common methods used for interfacing with DocDB."""
+
+from typing import Dict, Iterator, List, Optional
+from urllib.parse import urlparse
+
+from pymongo import MongoClient
+
+
+def get_s3_bucket_and_prefix(s3_location: str) -> Dict[str, str]:
+    """
+    For a location url like s3://bucket/prefix, return the bucket and
+    prefix. The scheme is not validated to be s3. Leading and trailing
+    forward slashes are stripped from the prefix.
+    Parameters
+    ----------
+    s3_location : str
+        For example, 's3://some_bucket/some_prefix'
+
+    Returns
+    -------
+    Dict[str, str]
+        For example, {'bucket': 'some_bucket', 'prefix': 'some_prefix'}
+
+    """
+    parts = urlparse(s3_location, allow_fragments=False)
+    stripped_prefix = parts.path.strip("/")
+    return {"bucket": parts.netloc, "prefix": stripped_prefix}
+
+
+def get_s3_location(bucket: str, prefix: str) -> str:
+    """
+    For a given bucket and prefix, return a location url in the format
+    s3://{bucket}/{prefix}
+    Parameters
+    ----------
+    bucket : str
+    prefix : str
+
+    Returns
+    -------
+    str
+        For example, 's3://some_bucket/some_prefix'
+
+    """
+    stripped_prefix = prefix.strip("/")
+    return f"s3://{bucket}/{stripped_prefix}"
+
+
+def is_dict_corrupt(input_dict: dict) -> bool:
+    """
+    Checks whether any keys, including nested keys, contain '$' or '.'
+    Parameters
+    ----------
+    input_dict : dict
+
+    Returns
+    -------
+    bool
+        True if input_dict is not a dict, or if nested dictionary keys
+        contain forbidden characters. False otherwise.
+
+    """
+    if not isinstance(input_dict, dict):
+        return True
+    for key, value in input_dict.items():
+        if "$" in key or "." in key:
+            return True
+        elif isinstance(value, dict):
+            if is_dict_corrupt(value):
+                return True
+    return False
+
+
+def does_metadata_record_exist_in_docdb(
+    docdb_client: MongoClient,
+    db_name: str,
+    collection_name: str,
+    bucket: str,
+    prefix: str,
+) -> bool:
+    """
+    For a given bucket and prefix, check if there is already a record in
+    DocDb.
+    Parameters
+    ----------
+    docdb_client : MongoClient
+    db_name : str
+    collection_name : str
+    bucket : str
+    prefix : str
+
+    Returns
+    -------
+    bool
+        True if there is a record in DocDb. Otherwise, False.
+ + """ + location = get_s3_location(bucket=bucket, prefix=prefix) + db = docdb_client[db_name] + collection = db[collection_name] + records = list( + collection.find( + filter={"location": location}, projection={"_id": 1}, limit=1 + ) + ) + if len(records) == 0: + return False + else: + return True + + +def get_record_from_docdb( + docdb_client: MongoClient, + db_name: str, + collection_name: str, + record_id: str, +) -> Optional[dict]: + """ + Download a record from docdb using the record _id. + Parameters + ---------- + docdb_client : MongoClient + db_name : str + collection_name : str + record_id : str + + Returns + ------- + Optional[dict] + None if record does not exist. Otherwise, it will return the record as + a dict. + + """ + db = docdb_client[db_name] + collection = db[collection_name] + records = list(collection.find(filter={"_id": record_id}, limit=1)) + if len(records) > 0: + return records[0] + else: + return None + + +def paginate_docdb( + db_name: str, + collection_name: str, + docdb_client: MongoClient, + page_size: int = 1000, + filter_query: Optional[dict] = None, + projection: Optional[dict] = None, +) -> Iterator[List[dict]]: + """ + Paginate through records in DocDb. + Parameters + ---------- + db_name : str + collection_name : str + docdb_client : MongoClient + page_size : int + Default is 1000 + filter_query : Optional[dict] + projection : Optional[dict] + + Returns + ------- + Iterator[List[dict]] + + """ + if filter_query is None: + filter_query = {} + if projection is None: + projection = {} + db = docdb_client[db_name] + collection = db[collection_name] + cursor = collection.find(filter=filter_query, projection=projection) + obj = next(cursor, None) + while obj: + page = [] + while len(page) < page_size and obj: + page.append(obj) + obj = next(cursor, None) + yield page + + +def build_docdb_location_to_id_map( + db_name: str, + collection_name: str, + docdb_client: MongoClient, + bucket: str, + prefixes: List[str], +) -> Dict[str, str]: + """ + For a given s3 bucket and list of prefixes, return a dictionary that looks + like {'s3://bucket/prefix': 'abc-1234'} where the value is the id of the + record in DocDb. If the record does not exist, then there will be no key + in the dictionary. 
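+    For example, given prefixes ['prefix1', 'prefix2'] where only
+    'prefix1' has a record in DocDb, a possible result is
+    {'s3://bucket/prefix1': 'abc-1234'}, with no key for 'prefix2'.
+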
Parameters
+    ----------
+    db_name : str
+    collection_name : str
+    docdb_client : MongoClient
+    bucket : str
+    prefixes : List[str]
+
+    Returns
+    -------
+    Dict[str, str]
+
+    """
+    locations = [get_s3_location(bucket=bucket, prefix=p) for p in prefixes]
+    filter_query = {"location": {"$in": locations}}
+    projection = {"_id": 1, "location": 1}
+    db = docdb_client[db_name]
+    collection = db[collection_name]
+    results = collection.find(filter=filter_query, projection=projection)
+    location_to_id_map = {r["location"]: r["_id"] for r in results}
+    return location_to_id_map

diff --git a/tests/resources/utils/example_metadata.nd.json b/tests/resources/utils/example_metadata.nd.json
new file mode 100644
index 0000000..254c99d
--- /dev/null
+++ b/tests/resources/utils/example_metadata.nd.json
@@ -0,0 +1,116 @@
+{
+   "_id": "488bbe42-832b-4c37-8572-25eb87cc50e2",
+   "acquisition": null,
+   "created": "2024-05-13T22:01:56.035469",
+   "data_description": null,
+   "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/core/metadata.py",
+   "external_links": [],
+   "instrument": null,
+   "last_modified": "2024-05-13T22:01:56.035475",
+   "location": "s3://aind-ephys-data-dev-u5u0i5/ecephys_642478_2023-01-17_13-56-29",
+   "metadata_status": "Unknown",
+   "name": "ecephys_642478_2023-01-17_13-56-29",
+   "procedures": null,
+   "processing": {
+      "data_processes": [
+         {
+            "code_url": "https://github.com/AllenNeuralDynamics/aind-data-transfer",
+            "end_date_time": "2023-01-20T19:13:36.434644+00:00",
+            "input_location": "\\\\allen\\programs\\aind\\workgroups\\ephys\\data\\ephys\\642478_2023-01-17_13-56-29",
+            "name": "Ephys preprocessing",
+            "notes": null,
+            "output_location": "s3://aind-ephys-data/ecephys_642478_2023-01-17_13-56-29",
+            "parameters": {
+               "aws_secret_names": {
+                  "code_ocean_api_token_name": "codeocean-api-token",
+                  "region": "us-west-2",
+                  "video_encryption_password": "video_encryption_password"
+               },
+               "clip_data_job": {
+                  "clip_kwargs": {}
+               },
+               "compress_data_job": {
+                  "compressor": {
+                     "compressor_name": "wavpack",
+                     "kwargs": {
+                        "level": 3
+                     }
+                  },
+                  "format_kwargs": {},
+                  "scale_params": {},
+                  "write_kwargs": {
+                     "chunk_duration": "1s",
+                     "n_jobs": 24,
+                     "progress_bar": true
+                  }
+               },
+               "data": {
+                  "name": "openephys"
+               },
+               "endpoints": {
+                  "code_repo_location": "https://github.com/AllenNeuralDynamics/aind-data-transfer",
+                  "codeocean_domain": "https://codeocean.allenneuraldynamics.org",
+                  "dest_data_dir": "ecephys_642478_2023-01-17_13-56-29",
+                  "gcp_prefix": "ecephys_642478_2023-01-17_13-56-29",
+                  "metadata_service_url": "http://aind-metadata-service",
+                  "raw_data_dir": "\\\\allen\\programs\\aind\\workgroups\\ephys\\data\\ephys\\642478_2023-01-17_13-56-29",
+                  "s3_bucket": "aind-ephys-data",
+                  "s3_prefix": "ecephys_642478_2023-01-17_13-56-29"
+               },
+               "jobs": {
+                  "attach_metadata": true,
+                  "clip": true,
+                  "compress": true,
+                  "trigger_codeocean_job": true,
+                  "upload_to_gcp": false,
+                  "upload_to_s3": true
+               },
+               "logging": {
+                  "level": "INFO"
+               },
+               "trigger_codeocean_job": {
+                  "bucket": "aind-ephys-data",
+                  "capsule_id": "648473aa-791e-4372-bd25-205cc587ec56",
+                  "job_type": "openephys",
+                  "prefix": "ecephys_642478_2023-01-17_13-56-29"
+               },
+               "upload_data_job": {
+                  "dryrun": false
+               }
+            },
+            "start_date_time": "2023-01-20T19:06:02.945386+00:00",
+            "version": "0.2.9"
+         }
+      ],
+      "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/processing.py",
+      "pipeline_url": null,
+      "pipeline_version":
null, + "schema_version": "0.1.0" + }, + "rig": null, + "schema_version": "0.2.7", + "session": null, + "subject": { + "background_strain": null, + "breeding_group": "Chat-IRES-Cre-neo", + "date_of_birth": "2022-07-16", + "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/site-packages/aind_data_schema/subject.py", + "genotype": "Chat-IRES-Cre-neo/Chat-IRES-Cre-neo", + "home_cage_enrichment": null, + "light_cycle": null, + "maternal_genotype": "Chat-IRES-Cre-neo/Chat-IRES-Cre-neo", + "maternal_id": "624133", + "mgi_allele_ids": null, + "notes": null, + "paternal_genotype": "Chat-IRES-Cre-neo/Chat-IRES-Cre-neo", + "paternal_id": "624115", + "restrictions": null, + "rrid": null, + "schema_version": "0.2.2", + "sex": "Male", + "source": null, + "species": "Mus musculus", + "subject_id": "642478", + "wellness_reports": null + } +} \ No newline at end of file diff --git a/tests/resources/utils/example_metadata1.nd.json b/tests/resources/utils/example_metadata1.nd.json new file mode 100644 index 0000000..4c75e61 --- /dev/null +++ b/tests/resources/utils/example_metadata1.nd.json @@ -0,0 +1,131 @@ +{ + "_id": "5ca4a951-d374-4f4b-8279-d570a35b2286", + "acquisition": null, + "created": "2024-05-15T17:41:26.697535", + "data_description": { + "creation_time": "2000-01-01T04:00:00", + "data_level": "raw", + "data_summary": null, + "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/data_description.py", + "funding_source": [ + { + "fundee": null, + "funder": { + "abbreviation": "AIND", + "name": "Allen Institute for Neural Dynamics", + "registry": { + "abbreviation": "ROR", + "name": "Research Organization Registry" + }, + "registry_identifier": "04szwah67" + }, + "grant_number": null + } + ], + "group": null, + "institution": { + "abbreviation": "AIND", + "name": "Allen Institute for Neural Dynamics", + "registry": { + "abbreviation": "ROR", + "name": "Research Organization Registry" + }, + "registry_identifier": "04szwah67" + }, + "investigators": [], + "license": "CC-BY-4.0", + "modality": [ + { + "abbreviation": "ecephys", + "name": "Extracellular electrophysiology" + } + ], + "name": "ecephys_567890_2000-01-01_04-00-00", + "platform": { + "abbreviation": "ecephys", + "name": "Electrophysiology platform" + }, + "project_name": null, + "related_data": [], + "restrictions": null, + "schema_version": "0.10.0", + "subject_id": "567890" + }, + "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/core/metadata.py", + "external_links": [], + "instrument": null, + "last_modified": "2024-05-15T17:41:26.697535", + "location": "s3://aind-ephys-data-dev-u5u0i5/ecephys_567890_2000-01-01_04-00-00", + "metadata_status": "Unknown", + "name": "ecephys_567890_2000-01-01_04-00-00", + "procedures": { + "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/procedures.py", + "notes": null, + "schema_version": "0.9.4", + "specimen_procedures": [], + "subject_procedures": [] + }, + "processing": { + "data_processes": [ + { + "code_url": "https://github.com/AllenNeuralDynamics/aind-data-transfer", + "code_version": null, + "end_date_time": "2023-10-10T18:16:40.084257+00:00", + "input_location": "/allen/aind/scratch/svc_aind_upload/test_data_sets/ecephys/655019_2023-04-03_18-17-07", + "name": "Ephys preprocessing", + "notes": null, + "output_location": 
"s3://aind-ephys-data-dev-u5u0i5/ecephys_567890_2000-01-01_04-00-00", + "outputs": null, + "parameters": { + "compress_raw_data": true, + "extra_configs": null, + "modality": { + "abbreviation": "ecephys", + "name": "Extracellular electrophysiology" + }, + "skip_staging": false, + "source": "/allen/aind/scratch/svc_aind_upload/test_data_sets/ecephys/655019_2023-04-03_18-17-07" + }, + "start_date_time": "2023-10-10T18:09:09.707473+00:00", + "version": "0.29.3" + } + ], + "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/processing.py", + "pipeline_url": null, + "pipeline_version": null, + "schema_version": "0.2.5" + }, + "rig": null, + "schema_version": "0.2.7", + "session": null, + "subject": { + "background_strain": null, + "breeding_group": "Gad2-IRES-Cre;Slc32a1-T2A-FlpO;Ai210-hyg", + "date_of_birth": "2021-01-06", + "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/subject.py", + "genotype": "Gad2-IRES-Cre/wt;Ai210(TITL-GC7f-ICF-IRES-tTA2)-hyg/wt", + "housing": null, + "maternal_genotype": "Gad2-IRES-Cre/Gad2-IRES-Cre", + "maternal_id": "561759", + "mgi_allele_ids": null, + "notes": null, + "paternal_genotype": null, + "paternal_id": "557009", + "restrictions": null, + "rrid": null, + "schema_version": "0.4.2", + "sex": "Male", + "source": null, + "species": { + "abbreviation": null, + "name": "Mus musculus", + "registry": { + "abbreviation": "NCBI", + "name": "National Center for Biotechnology Information" + }, + "registry_identifier": "10090" + }, + "subject_id": "567890", + "wellness_reports": null + } +} \ No newline at end of file diff --git a/tests/resources/utils/example_metadata2.nd.json b/tests/resources/utils/example_metadata2.nd.json new file mode 100644 index 0000000..6606e70 --- /dev/null +++ b/tests/resources/utils/example_metadata2.nd.json @@ -0,0 +1,209 @@ +{ + "_id": "7d0a4814-5a56-4497-a63c-b732f9e64af9", + "acquisition": null, + "created": "2024-05-15T17:41:30.131278", + "data_description": { + "creation_time": "2000-01-01T01:01:02", + "data_level": "raw", + "data_summary": null, + "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/data_description.py", + "funding_source": [ + { + "fundee": null, + "funder": { + "abbreviation": "AIND", + "name": "Allen Institute for Neural Dynamics", + "registry": { + "abbreviation": "ROR", + "name": "Research Organization Registry" + }, + "registry_identifier": "04szwah67" + }, + "grant_number": null + } + ], + "group": null, + "institution": { + "abbreviation": "AIND", + "name": "Allen Institute for Neural Dynamics", + "registry": { + "abbreviation": "ROR", + "name": "Research Organization Registry" + }, + "registry_identifier": "04szwah67" + }, + "investigators": [], + "license": "CC-BY-4.0", + "modality": [ + { + "abbreviation": "ecephys", + "name": "Extracellular electrophysiology" + } + ], + "name": "ecephys_655019_2000-01-01_01-01-02", + "platform": { + "abbreviation": "ecephys", + "name": "Electrophysiology platform" + }, + "project_name": null, + "related_data": [], + "restrictions": null, + "schema_version": "0.10.2", + "subject_id": "655019" + }, + "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/core/metadata.py", + "external_links": [], + "instrument": null, + "last_modified": "2024-05-15T17:41:30.131278", + "location": 
"s3://aind-ephys-data-dev-u5u0i5/ecephys_655019_2000-01-01_01-01-02", + "metadata_status": "Unknown", + "name": "ecephys_655019_2000-01-01_01-01-02", + "procedures": { + "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/procedures.py", + "notes": null, + "schema_version": "0.9.3", + "specimen_procedures": [], + "subject_id": "655019", + "subject_procedures": [ + { + "anaesthesia": null, + "animal_weight_post": null, + "animal_weight_prior": null, + "end_date": "2023-04-10", + "experimenter_full_name": 30241, + "iacuc_protocol": "2109", + "notes": null, + "output_specimen_ids": [ + 655019 + ], + "procedure_type": "Perfusion", + "start_date": "2023-04-10", + "weight_unit": "gram" + }, + { + "anaesthesia": { + "duration": 135.0, + "duration_unit": "minute", + "level": 1.5, + "type": "isoflurane" + }, + "animal_weight_post": 26.8, + "animal_weight_prior": 24.2, + "bregma_to_lambda_distance": 4.426, + "bregma_to_lambda_unit": "millimeter", + "craniotomy_coordinates_ap": null, + "craniotomy_coordinates_ml": null, + "craniotomy_coordinates_reference": null, + "craniotomy_coordinates_unit": "millimeter", + "craniotomy_hemisphere": null, + "craniotomy_size": null, + "craniotomy_size_unit": "millimeter", + "craniotomy_type": null, + "dura_removed": null, + "end_date": "2023-01-26", + "experimenter_full_name": "NSB-5011", + "iacuc_protocol": "2109", + "implant_part_number": null, + "notes": null, + "procedure_type": "Craniotomy", + "protective_material": null, + "recovery_time": 20.0, + "recovery_time_unit": "minute", + "start_date": "2023-01-26", + "weight_unit": "gram", + "workstation_id": "SWS 5" + }, + { + "anaesthesia": { + "duration": 135.0, + "duration_unit": "minute", + "level": 1.5, + "type": "isoflurane" + }, + "animal_weight_post": 26.8, + "animal_weight_prior": 24.2, + "end_date": "2023-01-26", + "experimenter_full_name": "NSB-5011", + "headframe_material": null, + "headframe_part_number": "0160-100-42", + "headframe_type": "WHC NP", + "iacuc_protocol": "2109", + "notes": null, + "procedure_type": "Headframe", + "start_date": "2023-01-26", + "weight_unit": "gram", + "well_part_number": "0160-055-08", + "well_type": "WHC NP" + } + ] + }, + "processing": { + "analyses": null, + "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/processing.py", + "notes": null, + "processing_pipeline": { + "data_processes": [ + { + "code_url": "https://github.com/AllenNeuralDynamics/aind-data-transfer", + "code_version": null, + "end_date_time": "2023-12-12T21:58:32.440692+00:00", + "input_location": "/allen/aind/scratch/svc_aind_upload/test_data_sets/ecephys/655019_2023-04-03_18-17-07", + "name": "Compression", + "notes": null, + "output_location": "s3://aind-ephys-data-dev-u5u0i5/ecephys_655019_2000-01-01_01-01-02", + "outputs": null, + "parameters": { + "compress_raw_data": true, + "extra_configs": null, + "modality": { + "abbreviation": "ecephys", + "name": "Extracellular electrophysiology" + }, + "skip_staging": false, + "source": "/allen/aind/scratch/svc_aind_upload/test_data_sets/ecephys/655019_2023-04-03_18-17-07" + }, + "software_version": "0.32.0", + "start_date_time": "2023-12-12T21:55:39.374709+00:00" + } + ], + "note": null, + "pipeline_url": null, + "pipeline_version": null, + "processor_full_name": "service" + }, + "schema_version": "0.3.1" + }, + "rig": null, + "schema_version": "0.2.7", + "session": null, + "subject": { + "background_strain": null, + "breeding_group": 
"Ai229(TIT2L-GC6m-IRES2-ChRmine-HA-WPRE-ISL-tTA2-WPRE)-hyg", + "date_of_birth": "2022-10-18", + "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/subject.py", + "genotype": "Ai229(TIT2L-GC6m-IRES2-ChRmine-HA-WPRE-ISL-tTA2-WPRE)-hyg/wt", + "housing": null, + "maternal_genotype": "wt/wt", + "maternal_id": "645737", + "mgi_allele_ids": null, + "notes": null, + "paternal_genotype": "Ai229(TIT2L-GC6m-IRES2-ChRmine-HA-WPRE-ISL-tTA2-WPRE)-hyg/wt", + "paternal_id": "643902", + "restrictions": null, + "rrid": null, + "schema_version": "0.4.2", + "sex": "Male", + "source": null, + "species": { + "abbreviation": null, + "name": "Mus musculus", + "registry": { + "abbreviation": "NCBI", + "name": "National Center for Biotechnology Information" + }, + "registry_identifier": "10090" + }, + "subject_id": "655019", + "wellness_reports": null + } +} \ No newline at end of file diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..4e1c663 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,234 @@ +"""Tests methods in utils module""" + +import json +import os +import unittest +from pathlib import Path +from unittest.mock import MagicMock, patch + +from aind_data_access_api.utils import ( + build_docdb_location_to_id_map, + does_metadata_record_exist_in_docdb, + get_record_from_docdb, + get_s3_bucket_and_prefix, + get_s3_location, + is_dict_corrupt, + paginate_docdb, +) + +TEST_DIR = Path(os.path.dirname(os.path.realpath(__file__))) +TEST_UTILS_DIR = TEST_DIR / "resources" / "utils" + + +class TestUtils(unittest.TestCase): + """Class to test methods in utils module.""" + + @classmethod + def setUpClass(cls) -> None: + """Set up the class by extracting contents from example files.""" + + def load_json_file(filename: str) -> dict: + """Load json file from resources directory.""" + with open(TEST_UTILS_DIR / filename, "r") as f: + return json.load(f) + + example_metadata_nd = load_json_file("example_metadata.nd.json") + example_metadata_nd1 = load_json_file("example_metadata1.nd.json") + example_metadata_nd2 = load_json_file("example_metadata2.nd.json") + + cls.example_metadata_nd = example_metadata_nd + cls.example_metadata_nd1 = example_metadata_nd1 + cls.example_metadata_nd2 = example_metadata_nd2 + + def test_is_dict_corrupt(self): + """Tests is_dict_corrupt method""" + good_contents = {"a": 1, "b": {"c": 2, "d": 3}} + bad_contents1 = {"a.1": 1, "b": {"c": 2, "d": 3}} + bad_contents2 = {"a": 1, "b": {"c": 2, "$d": 3}} + bad_contents3 = {"a": 1, "b": {"c": 2, "d": 3}, "$e": 4} + bad_contents4 = {"a": 1, "b": {"c": {"d": 3}, "$e": 4}} + bad_contents5 = [{"a": 1}, {"b": {"c": 2, "d": 3}}] + self.assertFalse(is_dict_corrupt(good_contents)) + self.assertTrue(is_dict_corrupt(bad_contents1)) + self.assertTrue(is_dict_corrupt(bad_contents2)) + self.assertTrue(is_dict_corrupt(bad_contents3)) + self.assertTrue(is_dict_corrupt(bad_contents4)) + self.assertTrue(is_dict_corrupt(bad_contents5)) + + def test_get_s3_bucket_and_prefix(self): + """Tests get_s3_bucket_and_prefix""" + results1 = get_s3_bucket_and_prefix( + s3_location="s3://some_bucket/prefix1/" + ) + results2 = get_s3_bucket_and_prefix( + s3_location="s3://some_bucket/prefix2" + ) + + self.assertEqual( + {"bucket": "some_bucket", "prefix": "prefix1"}, results1 + ) + self.assertEqual( + {"bucket": "some_bucket", "prefix": "prefix2"}, results2 + ) + + def test_get_s3_location(self): + """Tests get_s3_location""" + result1 = get_s3_location(bucket="some_bucket", 
prefix="prefix1") + result2 = get_s3_location(bucket="some_bucket", prefix="prefix2/") + self.assertEqual("s3://some_bucket/prefix1", result1) + self.assertEqual("s3://some_bucket/prefix2", result2) + + @patch("pymongo.MongoClient") + def test_does_metadata_record_exist_in_docdb_true( + self, mock_docdb_client: MagicMock + ): + """Tests does_metadata_record_exist_in_docdb when true""" + + mock_db = MagicMock() + mock_docdb_client.__getitem__.return_value = mock_db + mock_collection = MagicMock() + mock_db.__getitem__.return_value = mock_collection + mock_collection.find.return_value = iter( + [{"_id": "70bcf356-985f-4a2a-8105-de900e35e788"}] + ) + self.assertTrue( + does_metadata_record_exist_in_docdb( + docdb_client=mock_docdb_client, + db_name="metadata_index", + collection_name="data_assets", + bucket="aind-ephys-data-dev-u5u0i5", + prefix="ecephys_642478_2023-01-17_13-56-29", + ) + ) + + @patch("pymongo.MongoClient") + def test_does_metadata_record_exist_in_docdb_false( + self, mock_docdb_client: MagicMock + ): + """Tests does_metadata_record_exist_in_docdb when false""" + + mock_db = MagicMock() + mock_docdb_client.__getitem__.return_value = mock_db + mock_collection = MagicMock() + mock_db.__getitem__.return_value = mock_collection + mock_collection.find.return_value = iter([]) + self.assertFalse( + does_metadata_record_exist_in_docdb( + docdb_client=mock_docdb_client, + db_name="metadata_index", + collection_name="data_assets", + bucket="aind-ephys-data-dev-u5u0i5", + prefix="ecephys_642478_2023-01-17_13-56-29", + ) + ) + + @patch("pymongo.MongoClient") + def test_get_record_from_docdb(self, mock_docdb_client: MagicMock): + """Tests get_record_from_docdb when record exists""" + mock_db = MagicMock() + mock_docdb_client.__getitem__.return_value = mock_db + mock_collection = MagicMock() + mock_db.__getitem__.return_value = mock_collection + mock_collection.find.return_value = iter([self.example_metadata_nd]) + record = get_record_from_docdb( + docdb_client=mock_docdb_client, + db_name="metadata_index", + collection_name="data_assets", + record_id="488bbe42-832b-4c37-8572-25eb87cc50e2", + ) + self.assertEqual(self.example_metadata_nd, record) + + @patch("pymongo.MongoClient") + def test_get_record_from_docdb_none(self, mock_docdb_client: MagicMock): + """Tests get_record_from_docdb when record doesn't exist""" + mock_db = MagicMock() + mock_docdb_client.__getitem__.return_value = mock_db + mock_collection = MagicMock() + mock_db.__getitem__.return_value = mock_collection + mock_collection.find.return_value = iter([]) + record = get_record_from_docdb( + docdb_client=mock_docdb_client, + db_name="metadata_index", + collection_name="data_assets", + record_id="488bbe42-832b-4c37-8572-25eb87cc50ee", + ) + self.assertIsNone(record) + + @patch("pymongo.MongoClient") + def test_paginate_docdb(self, mock_docdb_client: MagicMock): + """Tests paginate_docdb""" + mock_db = MagicMock() + mock_docdb_client.__getitem__.return_value = mock_db + mock_collection = MagicMock() + mock_db.__getitem__.return_value = mock_collection + mock_collection.find.return_value = iter( + [ + self.example_metadata_nd, + self.example_metadata_nd1, + self.example_metadata_nd2, + ] + ) + pages = paginate_docdb( + docdb_client=mock_docdb_client, + db_name="metadata_index", + collection_name="data_assets", + page_size=2, + ) + expected_results = [ + [self.example_metadata_nd, self.example_metadata_nd1], + [self.example_metadata_nd2], + ] + actual_results = list(pages) + self.assertEqual(expected_results, actual_results) + + 
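+    # An additional edge case worth pinning down (illustrative sketch,
+    # reusing the same mocking pattern as the tests above): an exhausted
+    # cursor should make paginate_docdb yield no pages at all.
+    @patch("pymongo.MongoClient")
+    def test_paginate_docdb_empty(self, mock_docdb_client: MagicMock):
+        """Tests paginate_docdb when there are no records"""
+        mock_db = MagicMock()
+        mock_docdb_client.__getitem__.return_value = mock_db
+        mock_collection = MagicMock()
+        mock_db.__getitem__.return_value = mock_collection
+        mock_collection.find.return_value = iter([])
+        pages = paginate_docdb(
+            docdb_client=mock_docdb_client,
+            db_name="metadata_index",
+            collection_name="data_assets",
+            page_size=2,
+        )
+        self.assertEqual([], list(pages))
+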
@patch("pymongo.MongoClient") + def test_build_docdb_location_to_id_map( + self, mock_docdb_client: MagicMock + ): + """Tests build_docdb_location_to_id_map""" + bucket = "aind-ephys-data-dev-u5u0i5" + mock_db = MagicMock() + mock_docdb_client.__getitem__.return_value = mock_db + mock_collection = MagicMock() + mock_db.__getitem__.return_value = mock_collection + mock_collection.find.return_value = iter( + [ + { + "_id": "70bcf356-985f-4a2a-8105-de900e35e788", + "location": ( + f"s3://{bucket}/ecephys_655019_2000-04-04_04-00-00" + ), + }, + { + "_id": "5ca4a951-d374-4f4b-8279-d570a35b2286", + "location": ( + f"s3://{bucket}/ecephys_567890_2000-01-01_04-00-00" + ), + }, + ] + ) + + actual_map = build_docdb_location_to_id_map( + docdb_client=mock_docdb_client, + db_name="metadata_index", + collection_name="data_assets", + bucket=bucket, + prefixes=[ + "ecephys_655019_2000-04-04_04-00-00", + "ecephys_567890_2000-01-01_04-00-00/", + "missing_655019_2000-01-01_01-01-02", + ], + ) + expected_map = { + f"s3://{bucket}/ecephys_655019_2000-04-04_04-00-00": ( + "70bcf356-985f-4a2a-8105-de900e35e788" + ), + f"s3://{bucket}/ecephys_567890_2000-01-01_04-00-00": ( + "5ca4a951-d374-4f4b-8279-d570a35b2286" + ), + } + self.assertEqual(expected_map, actual_map) + + +if __name__ == "__main__": + unittest.main() From 64d2bd3ce2bafa95b68e747723732308f2021de5 Mon Sep 17 00:00:00 2001 From: Helen Lin <46795546+helen-m-lin@users.noreply.github.com> Date: Fri, 12 Jul 2024 15:38:58 -0700 Subject: [PATCH 2/3] docs: add docdb interface and utils usage to readthedocs (#62) * docs: add User Guide from README content * docs: add public REST API usage docs * docs: add Compass setup docs * docs: add examples for DocumentDbSSHClient * docs: add docdb intro to user guide * docs: remove usage from readme * docs: add utils automodule to readthedocs --- README.md | 109 +------ docs/source/ExamplesDocDBDirectConnection.rst | 293 ++++++++++++++++++ docs/source/UserGuide.rst | 226 ++++++++++++++ docs/source/aind_data_access_api.rst | 8 + docs/source/index.rst | 2 + 5 files changed, 534 insertions(+), 104 deletions(-) create mode 100644 docs/source/ExamplesDocDBDirectConnection.rst create mode 100644 docs/source/UserGuide.rst diff --git a/README.md b/README.md index 5258bfe..744aa4b 100644 --- a/README.md +++ b/README.md @@ -4,112 +4,13 @@ ![Code Style](https://img.shields.io/badge/code%20style-black-black) [![semantic-release: angular](https://img.shields.io/badge/semantic--release-angular-e10079?logo=semantic-release)](https://github.com/semantic-release/semantic-release) -API to interact with a few AIND databases. +API to interact with a few AIND databases. We have two primary databases: -## Usage -We have two primary databases. A Document store to keep unstructured json documents, and a relational database to store structured tables. +1. A document database (DocDB) to store + unstructured json documents. The DocDB contains AIND metadata. +2. A relational database to store structured tables. -### Document Store -We have some convenience methods to interact with our Document Store. You can create a client by explicitly setting credentials, or downloading from AWS Secrets Manager. - -__To connect from outside of our VPC:__ - -1. 
If using credentials from environment, please configure: -```sh -DOC_DB_HOST=docdb-us-west-2-****.cluster-************.us-west-2.docdb.amazonaws.com -DOC_DB_USERNAME=doc_db_username -DOC_DB_PASSWORD=doc_db_password -DOC_DB_SSH_HOST=ssh_host -DOC_DB_SSH_USERNAME=ssh_username -DOC_DB_SSH_PASSWORD=ssh_password -``` -2. Usage: -```python -from aind_data_access_api.document_db_ssh import DocumentDbSSHClient, DocumentDbSSHCredentials - -# Method 1) if credentials are set in environment -credentials = DocumentDbSSHCredentials() - -# Method 2) if you have permissions to AWS Secrets Manager -# Each secret must contain corresponding "host", "username", and "password" -credentials = DocumentDbSSHCredentials.from_secrets_manager( - doc_db_secret_name="/doc/store/secret/name", ssh_secret_name="/ssh/tunnel/secret/name" -) - -with DocumentDbSSHClient(credentials=credentials) as doc_db_client: - # To get a list of filtered records: - filter = {"subject.subject_id": "123456"} - projection = { - "name": 1, "created": 1, "location": 1, "subject.subject_id": 1, "subject.date_of_birth": 1, - } - count = doc_db_client.collection.count_documents(filter) - response = list(doc_db_client.collection.find(filter=filter, projection=projection)) -``` - -__To connect from within our VPC:__ -```python -from aind_data_access_api.credentials import DocumentStoreCredentials -from aind_data_access_api.document_store import Client - -# Method one assuming user, password, and host are known -ds_client = Client( - credentials=DocumentStoreCredentials( - username="user", - password="password", - host="host", - database="metadata", - ), - collection_name="data_assets", - ) - -# Method two if you have permissions to AWS Secrets Manager -ds_client = Client( - credentials=DocumentStoreCredentials( - aws_secrets_name="aind/data/access/api/document_store/metadata" - ), - collection_name="data_assets", - ) - -# To get all records -response = list(ds_client.retrieve_data_asset_records()) - -# To get a list of filtered records: -response = list(ds_client.retrieve_data_asset_records({"subject.subject_id": "123456"})) -``` - -### RDS Tables -We have some convenience methods to interact with our Relational Database. You can create a client by explicitly setting credentials, or downloading from AWS Secrets Manager. -``` -from aind_data_access_api.credentials import RDSCredentials -from aind_data_access_api.rds_tables import Client - -# Method one assuming user, password, and host are known -ds_client = Client( - credentials=RDSCredentials( - username="user", - password="password", - host="host", - database="metadata", - ), - collection_name="data_assets", - ) - -# Method two if you have permissions to AWS Secrets Manager -ds_client = Client( - credentials=RDSCredentials( - aws_secrets_name="aind/data/access/api/rds_tables" - ), - ) - -# To retrieve a table as a pandas dataframe -df = ds_client.read_table(table_name="spike_sorting_urls") - -# Can also pass in a custom sql query -df = ds_client.read_table(query="SELECT * FROM spike_sorting_urls") - -# It's also possible to save a pandas dataframe as a table. Please check internal documentation for more details. -ds_client.overwrite_table_with_df(df, table_name) -``` +More information can be found at [readthedocs](https://aind-data-access-api.readthedocs.io). ## Installation To use the software, it can be installed from PyPI. 
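For example, a standard installation with pip (the distribution name matches this repository's name):
```bash
pip install aind-data-access-api
```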
diff --git a/docs/source/ExamplesDocDBDirectConnection.rst b/docs/source/ExamplesDocDBDirectConnection.rst
new file mode 100644
index 0000000..e040d26
--- /dev/null
+++ b/docs/source/ExamplesDocDBDirectConnection.rst
@@ -0,0 +1,293 @@
+Examples - DocDB Direct Connection
+==================================
+
+This page provides examples for interacting with the Document Database (DocDB)
+using the provided Python client.
+
+It is assumed that the required credentials are set in the environment.
+Please refer to the User Guide for more details.
+
+
+Querying Metadata
+~~~~~~~~~~~~~~~~~~~~~~
+
+Count Example 1: Get # of records with a certain subject_id
+------------------------------------------------------------
+
+.. code:: python
+
+    import json
+
+    from aind_data_access_api.document_db_ssh import (
+        DocumentDbSSHClient,
+        DocumentDbSSHCredentials,
+    )
+
+    credentials = DocumentDbSSHCredentials()
+    with DocumentDbSSHClient(credentials=credentials) as doc_db_client:
+        filter = {"subject.subject_id": "689418"}
+        count = doc_db_client.collection.count_documents(filter)
+        print(count)
+
+Filter Example 1: Get records with a certain subject_id
+--------------------------------------------------------
+
+.. code:: python
+
+    with DocumentDbSSHClient(credentials=credentials) as doc_db_client:
+        filter = {"subject.subject_id": "689418"}
+        records = list(
+            doc_db_client.collection.find(filter=filter)
+        )
+        print(json.dumps(records, indent=3))
+
+
+With projection (recommended):
+
+.. code:: python
+
+    with DocumentDbSSHClient(credentials=credentials) as doc_db_client:
+        filter = {"subject.subject_id": "689418"}
+        projection = {
+            "name": 1,
+            "created": 1,
+            "location": 1,
+            "subject.subject_id": 1,
+            "subject.date_of_birth": 1,
+        }
+        records = list(
+            doc_db_client.collection.find(filter=filter, projection=projection)
+        )
+        print(json.dumps(records, indent=3))
+
+
+Filter Example 2: Get records with a certain breeding group
+------------------------------------------------------------
+
+.. code:: python
+
+    with DocumentDbSSHClient(credentials=credentials) as doc_db_client:
+        filter = {
+            "subject.breeding_info.breeding_group": "Chat-IRES-Cre_Jax006410"
+        }
+        records = list(
+            doc_db_client.collection.find(filter=filter)
+        )
+        print(json.dumps(records, indent=3))
+
+
+With projection (recommended):
+
+.. code:: python
+
+    with DocumentDbSSHClient(credentials=credentials) as doc_db_client:
+        filter = {
+            "subject.breeding_info.breeding_group": "Chat-IRES-Cre_Jax006410"
+        }
+        projection = {
+            "name": 1,
+            "created": 1,
+            "location": 1,
+            "subject.subject_id": 1,
+            "subject.breeding_info.breeding_group": 1,
+        }
+        records = list(
+            doc_db_client.collection.find(filter=filter, projection=projection)
+        )
+        print(json.dumps(records, indent=3))
+
+Aggregation Example 1: Get all subjects per breeding group
+-----------------------------------------------------------
+
+.. code:: python
+
+    with DocumentDbSSHClient(credentials=credentials) as doc_db_client:
+        agg_pipeline = [
+            {
+                "$group": {
+                    "_id": "$subject.breeding_info.breeding_group",
+                    "subject_ids": {"$addToSet": "$subject.subject_id"},
+                }
+            }
+        ]
+        result = list(
+            doc_db_client.collection.aggregate(pipeline=agg_pipeline)
+        )
+        print(f"Total breeding groups: {len(result)}")
+        print("First 3 breeding groups and corresponding subjects:")
+        print(json.dumps(result[:3], indent=3))
+
+
+Updating Metadata
+~~~~~~~~~~~~~~~~~~~~~~
+
+We provide several utility functions for interacting with DocDB within the
+``aind_data_access_api.utils`` module. Below is an example of how to use these
+functions to update records in DocDB.
+
+.. code:: python
+
+    import json
+    import logging
+    from typing import List, Optional
+
+    from aind_data_access_api.document_db_ssh import (
+        DocumentDbSSHClient,
+        DocumentDbSSHCredentials,
+    )
+    from aind_data_schema.core.metadata import Metadata
+
+    from aind_data_access_api.utils import paginate_docdb, is_dict_corrupt
+
+    logging.basicConfig(level="INFO")
+
+    def _process_docdb_records(records: List[dict], doc_db_client: DocumentDbSSHClient, dryrun: bool) -> None:
+        """
+        Process a page of records.
+        Parameters
+        ----------
+        records : List[dict]
+
+        """
+        for record in records:
+            _process_docdb_record(record=record, doc_db_client=doc_db_client, dryrun=dryrun)
+
+    def _process_docdb_record(record: dict, doc_db_client: DocumentDbSSHClient, dryrun: bool) -> None:
+        """
+        Process a record. This example updates the data_description.name field
+        if it does not match the record.name field.
+
+        Parameters
+        ----------
+        record : dict
+
+        """
+        _id = record.get("_id")
+        name = record.get("name")
+        location = record.get("location")
+        if _id:
+            if record.get("data_description") and record["data_description"].get("name") != name:
+                # Option 1: update specific field(s) only
+                new_fields = {
+                    "data_description.name": name
+                }
+                update_docdb_record_partial(record_id=_id, new_fields=new_fields, doc_db_client=doc_db_client, dryrun=dryrun)
+                # Option 2: build a new record with the Metadata model and
+                # replace the entire document with it
+                # new_record = build_new_docdb_record(record=record)
+                # if new_record is not None:
+                #     update_docdb_record_entire(record_id=_id, new_record=new_record, doc_db_client=doc_db_client, dryrun=dryrun)
+            # else:
+            #     logging.info(f"Record for {location} does not need to be updated.")
+        else:
+            logging.warning(f"Record for {location} does not have an _id field! Skipping.")
+
+
+    def build_new_docdb_record(record: Optional[dict]) -> Optional[dict]:
+        """Build a new record from an existing record. This example updates the
+        data_description.name field if it does not match the record.name field.
+
+        Parameters
+        ----------
+        record : Optional[dict]
+
+        Returns
+        -------
+        Optional[dict]
+            The new record, or None if the record cannot be constructed.
+        """
+        # Example: Update record.data_description.name if not matching record.name
+        if record is None:
+            return None
+        new_record = None
+        _id = record.get("_id")
+        name = record.get("name")
+        location = record.get("location")
+        created = record.get("created")
+        if record.get("data_description") and record["data_description"].get("name") != name:
+            if _id is None or name is None or location is None or created is None:
+                logging.warning("Record does not have _id, name, location, or created! Skipping.")
+                return None
+            try:
+                new_record = record.copy()
+                new_record["data_description"]["name"] = name
+                new_record_str = Metadata.model_construct(
+                    **new_record
+                ).model_dump_json(warnings=False, by_alias=True)
+                new_record = json.loads(new_record_str)
+                if is_dict_corrupt(new_record):
+                    logging.warning(f"New record for {location} is corrupt! Skipping.")
+                    new_record = None
+            except Exception:
+                new_record = None
+        return new_record

+    def update_docdb_record_partial(record_id: str, new_fields: dict, doc_db_client: DocumentDbSSHClient, dryrun: bool) -> None:
+        """
+        Update a record in docdb by updating specific fields only.
+        Parameters
+        ----------
+        record_id : str
+            The _id of the record to update.
+        new_fields : dict
+            New fields to update. E.g. {"data_description.name": "new_name"}
+
+        """
+        if dryrun:
+            logging.info(f"(dryrun) doc_db_client.collection.update_one (partial): {record_id}")
+        else:
+            logging.info(f"doc_db_client.collection.update_one (partial): {record_id}")
+            response = doc_db_client.collection.update_one(
+                {"_id": record_id},
+                {"$set": new_fields},
+                upsert=False,
+            )
+            logging.info(response.raw_result)
+
+
+    def update_docdb_record_entire(record_id: str, new_record: dict, doc_db_client: DocumentDbSSHClient, dryrun: bool) -> None:
+        """
+        Update a record in docdb by replacing the entire document with the new
+        record.
+        Parameters
+        ----------
+        record_id : str
+            The _id of the record to update.
+        new_record : dict
+            The new record to replace the existing record with.
+
+        """
+        if is_dict_corrupt(new_record) or record_id != new_record.get("_id"):
+            logging.warning(f"Attempting to update corrupt record {record_id}! Skipping.")
+            return
+        if dryrun:
+            logging.info(f"(dryrun) doc_db_client.collection.update_one: {record_id}")
+        else:
+            logging.info(f"doc_db_client.collection.update_one: {record_id}")
+            response = doc_db_client.collection.update_one(
+                {"_id": record_id},
+                {"$set": new_record},
+                upsert=False,
+            )
+            logging.info(response.raw_result)
+
+
+    if __name__ == "__main__":
+        credentials = DocumentDbSSHCredentials()  # credentials in environment
+        dryrun = True
+        filter = {"location": {"$regex": ".*s3://codeocean-s3datasetsbucket.*"}}
+        projection = None
+
+        with DocumentDbSSHClient(credentials=credentials) as doc_db_client:
+            db_name = doc_db_client.database_name
+            col_name = doc_db_client.collection_name
+            # count = doc_db_client.collection.count_documents(filter)
+            # logging.info(f"{db_name}.{col_name}: Found {count} records with {filter}")
+
+            logging.info(f"{db_name}.{col_name}: Starting to scan for {filter}.")
+            docdb_pages = paginate_docdb(
+                db_name=doc_db_client.database_name,
+                collection_name=doc_db_client.collection_name,
+                docdb_client=doc_db_client._client,
+                page_size=500,
+                filter_query=filter,
+            )
+            for page in docdb_pages:
+                _process_docdb_records(records=page, doc_db_client=doc_db_client, dryrun=dryrun)
+            logging.info(f"{db_name}.{col_name}: Finished scanning through DocDb.")
\ No newline at end of file
diff --git a/docs/source/UserGuide.rst b/docs/source/UserGuide.rst
new file mode 100644
index 0000000..c214801
--- /dev/null
+++ b/docs/source/UserGuide.rst
@@ -0,0 +1,226 @@
+User Guide
+==========
+
+Thank you for using ``aind-data-access-api``! This guide is
+intended for scientists and engineers in AIND who wish to interface
+with AIND databases.
+
+We have two primary databases:
+
+1. A `document database (DocDB) <#document-database-docdb>`__ to store
+   unstructured json documents. The DocDB contains AIND metadata.
+2. A `relational database <#rds-tables>`__ to store structured tables.
+
+Document Database (DocDB)
+-------------------------
+
+AIND metadata records stored in the DocDB describe the ``metadata.nd.json``
+for a data asset:
+
+- ``_id``: the unique ID of the data asset.
+- ``name``: the name of the data asset.
+- ``location``: the S3 location of the metadata, in the format
+  ``s3://{bucket_name}/{name}``. This is unique across records and can
+  be used to query or identify specific records.
+- Please see the `readthedocs for aind-data-schema
+  <https://aind-data-schema.readthedocs.io>`__
+  for more details.
+
+The DocDB can be accessed through a public read-only REST API or
+through a direct connection using SSH. For a direct connection,
+it is assumed you have the appropriate credentials.
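+Because ``location`` is unique, it can serve as a lookup key. As a minimal
+sketch (the bucket and asset name below are hypothetical), the helpers in
+``aind_data_access_api.utils`` can build the location string for a query
+filter:
+
+.. code:: python
+
+    from aind_data_access_api.utils import get_s3_location
+
+    # Build the S3 location string for a known bucket and asset name
+    location = get_s3_location(
+        bucket="some-bucket", prefix="ecephys_123456_2020-01-01_01-01-01"
+    )
+    filter = {"location": location}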
+
+REST API (Read-Only)
+~~~~~~~~~~~~~~~~~~~~~~
+
+1. A GET request to ``https://api.allenneuraldynamics.org/v1/metadata_index/data_assets``
+   with appropriate query parameters will return a list of records found.
+
+.. code:: python
+
+    import json
+    import requests
+
+    URL = "https://api.allenneuraldynamics.org/v1/metadata_index/data_assets"
+    filter = {"subject.subject_id": "123456"}
+    limit = 100
+    response = requests.get(URL, params={"filter": json.dumps(filter), "limit": limit})
+    print(response.json())
+
+2. Alternatively, we provide a Python client:
+
+.. code:: python
+
+    from aind_data_access_api.document_db import MetadataDbClient
+
+    API_GATEWAY_HOST = "api.allenneuraldynamics.org"
+    DATABASE = "metadata_index"
+    COLLECTION = "data_assets"
+
+    docdb_api_client = MetadataDbClient(
+        host=API_GATEWAY_HOST,
+        database=DATABASE,
+        collection=COLLECTION,
+    )
+
+    filter = {"subject.subject_id": "123456"}
+    limit = 1000
+    paginate_batch_size = 100
+    response = docdb_api_client.retrieve_data_asset_records(
+        filter_query=filter,
+        limit=limit,
+        paginate_batch_size=paginate_batch_size
+    )
+    print(response)
+
+
+Direct Connection (SSH) - Database UI (MongoDB Compass)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+MongoDB Compass is a database GUI that can be used to query and interact
+with our document database.
+
+To connect:
+
+1. If provided with a temporary SSH password, please first run ``ssh {ssh-username}@54.184.81.236``
+   and set a new password.
+2. Download the full version of `MongoDB Compass <https://www.mongodb.com/try/download/compass>`__.
+3. When connecting, click “Advanced Connection Options” and use the configurations below.
+   Leave any unspecified fields on their default setting.
+
+.. list-table::
+   :widths: 25 25 50
+   :header-rows: 1
+
+   * - Tab
+     - Config
+     - Value
+   * - General
+     - Host
+     - ``************.us-west-2.docdb.amazonaws.com``
+   * - Authentication
+     - Username
+     - ``doc_db_username``
+   * -
+     - Password
+     - ``doc_db_password``
+   * -
+     - Authentication Mechanism
+     - SCRAM-SHA-1
+   * - TLS/SSL
+     - SSL/TLS Connection
+     - OFF
+   * - Proxy/SSH
+     - SSH Tunnel/ Proxy Method
+     - SSH with Password
+   * -
+     - SSH Hostname
+     - ``ssh_host``
+   * -
+     - SSH Port
+     - 22
+   * -
+     - SSH Username
+     - ``ssh_username``
+   * -
+     - SSH Password
+     - ``ssh_password``
+
+4. You should be able to see the home page with the ``metadata-index`` database.
+   It should have a single collection called ``data_assets``.
+5. If provided with a temporary DocDB password, please change it using the embedded
+   mongo shell in Compass, and then reconnect.
+
+.. code:: javascript
+
+   db.updateUser(
+       "doc_db_username",
+       {
+           pwd: passwordPrompt()
+       }
+   )
+
+Direct Connection (SSH) - Python Client
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We have some convenience methods to interact with our Document Store.
+You can create a client by explicitly setting credentials, or downloading from AWS Secrets Manager.
+
+If using credentials from environment, please configure:
+
+.. code:: bash
+
+   DOC_DB_HOST=************.us-west-2.docdb.amazonaws.com
+   DOC_DB_USERNAME=doc_db_username
+   DOC_DB_PASSWORD=doc_db_password
+   DOC_DB_SSH_HOST=ssh_host
+   DOC_DB_SSH_USERNAME=ssh_username
+   DOC_DB_SSH_PASSWORD=ssh_password
+
+To use the client:
+
+..
code:: python + + from aind_data_access_api.document_db_ssh import DocumentDbSSHClient, DocumentDbSSHCredentials + + # Method 1) if credentials are set in environment + credentials = DocumentDbSSHCredentials() + + # Method 2) if you have permissions to AWS Secrets Manager + # Each secret must contain corresponding "host", "username", and "password" + credentials = DocumentDbSSHCredentials.from_secrets_manager( + doc_db_secret_name="/doc/db/secret/name", ssh_secret_name="/ssh/tunnel/secret/name" + ) + + with DocumentDbSSHClient(credentials=credentials) as doc_db_client: + # To get a list of filtered records: + filter = {"subject.subject_id": "123456"} + projection = { + "name": 1, "created": 1, "location": 1, "subject.subject_id": 1, "subject.date_of_birth": 1, + } + count = doc_db_client.collection.count_documents(filter) + response = list(doc_db_client.collection.find(filter=filter, projection=projection)) + + +RDS Tables +------------------ +We have some convenience methods to interact with our Relational Database. You can create a client by +explicitly setting credentials, or downloading from AWS Secrets Manager. + +.. code:: python + + from aind_data_access_api.credentials import RDSCredentials + from aind_data_access_api.rds_tables import Client + + # Method one assuming user, password, and host are known + ds_client = Client( + credentials=RDSCredentials( + username="user", + password="password", + host="host", + database="metadata", + ), + collection_name="data_assets", + ) + + # Method two if you have permissions to AWS Secrets Manager + ds_client = Client( + credentials=RDSCredentials( + aws_secrets_name="aind/data/access/api/rds_tables" + ), + ) + + # To retrieve a table as a pandas dataframe + df = ds_client.read_table(table_name="spike_sorting_urls") + + # Can also pass in a custom sql query + df = ds_client.read_table(query="SELECT * FROM spike_sorting_urls") + + # It's also possible to save a pandas dataframe as a table. Please check internal documentation for more details. + ds_client.overwrite_table_with_df(df, table_name) + +Reporting bugs or making feature requests +----------------------------------------- + +Please report any bugs or feature requests here: +`issues `__ diff --git a/docs/source/aind_data_access_api.rst b/docs/source/aind_data_access_api.rst index 78cde06..bb006e0 100644 --- a/docs/source/aind_data_access_api.rst +++ b/docs/source/aind_data_access_api.rst @@ -60,6 +60,14 @@ aind\_data\_access\_api.secrets module :undoc-members: :show-inheritance: +aind\_data\_access\_api.utils module +------------------------------------ + +.. automodule:: aind_data_access_api.utils + :members: + :undoc-members: + :show-inheritance: + Module contents --------------- diff --git a/docs/source/index.rst b/docs/source/index.rst index 07adcad..91ce562 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -11,6 +11,8 @@ Welcome to this repository's documentation! 
:maxdepth: 2 :caption: Contents: + UserGuide + ExamplesDocDBDirectConnection modules From 800de35107357f10d0c779617873cd12bc461d16 Mon Sep 17 00:00:00 2001 From: Helen Lin Date: Fri, 12 Jul 2024 15:46:15 -0700 Subject: [PATCH 3/3] build: version bump to v0.11.0 --- src/aind_data_access_api/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aind_data_access_api/__init__.py b/src/aind_data_access_api/__init__.py index 0deb776..dce99e7 100644 --- a/src/aind_data_access_api/__init__.py +++ b/src/aind_data_access_api/__init__.py @@ -1,3 +1,3 @@ """Init package""" -__version__ = "0.10.1" +__version__ = "0.11.0"