From 1dccef720feab2cfa3d0b630aed6a60402c4b466 Mon Sep 17 00:00:00 2001
From: Helen Lin <46795546+helen-m-lin@users.noreply.github.com>
Date: Tue, 2 Jul 2024 16:40:24 -0700
Subject: [PATCH 1/3] feat: add docdb utils (#57)

* feat: add docdb utils
---
 src/aind_data_access_api/utils.py             | 214 ++++++++++++++++
 .../resources/utils/example_metadata.nd.json  | 116 +++++++++
 .../resources/utils/example_metadata1.nd.json | 131 ++++++++++
 .../resources/utils/example_metadata2.nd.json | 209 ++++++++++++++++
 tests/test_utils.py                           | 234 ++++++++++++++++++
 5 files changed, 904 insertions(+)
 create mode 100644 src/aind_data_access_api/utils.py
 create mode 100644 tests/resources/utils/example_metadata.nd.json
 create mode 100644 tests/resources/utils/example_metadata1.nd.json
 create mode 100644 tests/resources/utils/example_metadata2.nd.json
 create mode 100644 tests/test_utils.py

diff --git a/src/aind_data_access_api/utils.py b/src/aind_data_access_api/utils.py
new file mode 100644
index 0000000..f884b7e
--- /dev/null
+++ b/src/aind_data_access_api/utils.py
@@ -0,0 +1,214 @@
+"""Package for common methods used for interfacing with DocDB."""
+
+from typing import Dict, Iterator, List, Optional
+from urllib.parse import urlparse
+
+from pymongo import MongoClient
+
+
+def get_s3_bucket_and_prefix(s3_location: str) -> Dict[str, str]:
+    """
+    For a location url like s3://bucket/prefix, return the bucket and
+    prefix. The scheme is not validated to be s3. Leading and trailing
+    forward slashes are stripped from the prefix.
+    Parameters
+    ----------
+    s3_location : str
+        For example, 's3://some_bucket/some_prefix'
+
+    Returns
+    -------
+    Dict[str, str]
+        For example, {'bucket': 'some_bucket', 'prefix': 'some_prefix'}
+
+    """
+    parts = urlparse(s3_location, allow_fragments=False)
+    stripped_prefix = parts.path.strip("/")
+    return {"bucket": parts.netloc, "prefix": stripped_prefix}
+
+
+def get_s3_location(bucket: str, prefix: str) -> str:
+    """
+    For a given bucket and prefix, return a location url in the format
+    s3://{bucket}/{prefix}
+    Parameters
+    ----------
+    bucket : str
+    prefix : str
+
+    Returns
+    -------
+    str
+        For example, 's3://some_bucket/some_prefix'
+
+    """
+    stripped_prefix = prefix.strip("/")
+    return f"s3://{bucket}/{stripped_prefix}"
+
+
+def is_dict_corrupt(input_dict: dict) -> bool:
+    """
+    Checks whether any keys, including nested keys, contain '$' or '.'
+    Parameters
+    ----------
+    input_dict : dict
+
+    Returns
+    -------
+    bool
+        True if input_dict is not a dict, or if nested dictionary keys
+        contain forbidden characters. False otherwise.
+
+    """
+    if not isinstance(input_dict, dict):
+        return True
+    for key, value in input_dict.items():
+        if "$" in key or "." in key:
+            return True
+        elif isinstance(value, dict):
+            if is_dict_corrupt(value):
+                return True
+    return False
+
+
+def does_metadata_record_exist_in_docdb(
+    docdb_client: MongoClient,
+    db_name: str,
+    collection_name: str,
+    bucket: str,
+    prefix: str,
+) -> bool:
+    """
+    For a given bucket and prefix, check if there is already a record in
+    DocDb.
+    Parameters
+    ----------
+    docdb_client : MongoClient
+    db_name : str
+    collection_name : str
+    bucket : str
+    prefix : str
+
+    Returns
+    -------
+    bool
+        True if there is a record in DocDb. Otherwise, False.
+ + """ + location = get_s3_location(bucket=bucket, prefix=prefix) + db = docdb_client[db_name] + collection = db[collection_name] + records = list( + collection.find( + filter={"location": location}, projection={"_id": 1}, limit=1 + ) + ) + if len(records) == 0: + return False + else: + return True + + +def get_record_from_docdb( + docdb_client: MongoClient, + db_name: str, + collection_name: str, + record_id: str, +) -> Optional[dict]: + """ + Download a record from docdb using the record _id. + Parameters + ---------- + docdb_client : MongoClient + db_name : str + collection_name : str + record_id : str + + Returns + ------- + Optional[dict] + None if record does not exist. Otherwise, it will return the record as + a dict. + + """ + db = docdb_client[db_name] + collection = db[collection_name] + records = list(collection.find(filter={"_id": record_id}, limit=1)) + if len(records) > 0: + return records[0] + else: + return None + + +def paginate_docdb( + db_name: str, + collection_name: str, + docdb_client: MongoClient, + page_size: int = 1000, + filter_query: Optional[dict] = None, + projection: Optional[dict] = None, +) -> Iterator[List[dict]]: + """ + Paginate through records in DocDb. + Parameters + ---------- + db_name : str + collection_name : str + docdb_client : MongoClient + page_size : int + Default is 1000 + filter_query : Optional[dict] + projection : Optional[dict] + + Returns + ------- + Iterator[List[dict]] + + """ + if filter_query is None: + filter_query = {} + if projection is None: + projection = {} + db = docdb_client[db_name] + collection = db[collection_name] + cursor = collection.find(filter=filter_query, projection=projection) + obj = next(cursor, None) + while obj: + page = [] + while len(page) < page_size and obj: + page.append(obj) + obj = next(cursor, None) + yield page + + +def build_docdb_location_to_id_map( + db_name: str, + collection_name: str, + docdb_client: MongoClient, + bucket: str, + prefixes: List[str], +) -> Dict[str, str]: + """ + For a given s3 bucket and list of prefixes, return a dictionary that looks + like {'s3://bucket/prefix': 'abc-1234'} where the value is the id of the + record in DocDb. If the record does not exist, then there will be no key + in the dictionary. 
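+    For example, given prefixes ['prefix1', 'prefix2'] where only
+    'prefix1' has a record in DocDb, a possible result is
+    {'s3://bucket/prefix1': 'abc-1234'}, with no key for 'prefix2'.
+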
Parameters
+    ----------
+    db_name : str
+    collection_name : str
+    docdb_client : MongoClient
+    bucket : str
+    prefixes : List[str]
+
+    Returns
+    -------
+    Dict[str, str]
+
+    """
+    locations = [get_s3_location(bucket=bucket, prefix=p) for p in prefixes]
+    filter_query = {"location": {"$in": locations}}
+    projection = {"_id": 1, "location": 1}
+    db = docdb_client[db_name]
+    collection = db[collection_name]
+    results = collection.find(filter=filter_query, projection=projection)
+    location_to_id_map = {r["location"]: r["_id"] for r in results}
+    return location_to_id_map

diff --git a/tests/resources/utils/example_metadata.nd.json b/tests/resources/utils/example_metadata.nd.json
new file mode 100644
index 0000000..254c99d
--- /dev/null
+++ b/tests/resources/utils/example_metadata.nd.json
@@ -0,0 +1,116 @@
+{
+   "_id": "488bbe42-832b-4c37-8572-25eb87cc50e2",
+   "acquisition": null,
+   "created": "2024-05-13T22:01:56.035469",
+   "data_description": null,
+   "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/core/metadata.py",
+   "external_links": [],
+   "instrument": null,
+   "last_modified": "2024-05-13T22:01:56.035475",
+   "location": "s3://aind-ephys-data-dev-u5u0i5/ecephys_642478_2023-01-17_13-56-29",
+   "metadata_status": "Unknown",
+   "name": "ecephys_642478_2023-01-17_13-56-29",
+   "procedures": null,
+   "processing": {
+      "data_processes": [
+         {
+            "code_url": "https://github.com/AllenNeuralDynamics/aind-data-transfer",
+            "end_date_time": "2023-01-20T19:13:36.434644+00:00",
+            "input_location": "\\\\allen\\programs\\aind\\workgroups\\ephys\\data\\ephys\\642478_2023-01-17_13-56-29",
+            "name": "Ephys preprocessing",
+            "notes": null,
+            "output_location": "s3://aind-ephys-data/ecephys_642478_2023-01-17_13-56-29",
+            "parameters": {
+               "aws_secret_names": {
+                  "code_ocean_api_token_name": "codeocean-api-token",
+                  "region": "us-west-2",
+                  "video_encryption_password": "video_encryption_password"
+               },
+               "clip_data_job": {
+                  "clip_kwargs": {}
+               },
+               "compress_data_job": {
+                  "compressor": {
+                     "compressor_name": "wavpack",
+                     "kwargs": {
+                        "level": 3
+                     }
+                  },
+                  "format_kwargs": {},
+                  "scale_params": {},
+                  "write_kwargs": {
+                     "chunk_duration": "1s",
+                     "n_jobs": 24,
+                     "progress_bar": true
+                  }
+               },
+               "data": {
+                  "name": "openephys"
+               },
+               "endpoints": {
+                  "code_repo_location": "https://github.com/AllenNeuralDynamics/aind-data-transfer",
+                  "codeocean_domain": "https://codeocean.allenneuraldynamics.org",
+                  "dest_data_dir": "ecephys_642478_2023-01-17_13-56-29",
+                  "gcp_prefix": "ecephys_642478_2023-01-17_13-56-29",
+                  "metadata_service_url": "http://aind-metadata-service",
+                  "raw_data_dir": "\\\\allen\\programs\\aind\\workgroups\\ephys\\data\\ephys\\642478_2023-01-17_13-56-29",
+                  "s3_bucket": "aind-ephys-data",
+                  "s3_prefix": "ecephys_642478_2023-01-17_13-56-29"
+               },
+               "jobs": {
+                  "attach_metadata": true,
+                  "clip": true,
+                  "compress": true,
+                  "trigger_codeocean_job": true,
+                  "upload_to_gcp": false,
+                  "upload_to_s3": true
+               },
+               "logging": {
+                  "level": "INFO"
+               },
+               "trigger_codeocean_job": {
+                  "bucket": "aind-ephys-data",
+                  "capsule_id": "648473aa-791e-4372-bd25-205cc587ec56",
+                  "job_type": "openephys",
+                  "prefix": "ecephys_642478_2023-01-17_13-56-29"
+               },
+               "upload_data_job": {
+                  "dryrun": false
+               }
+            },
+            "start_date_time": "2023-01-20T19:06:02.945386+00:00",
+            "version": "0.2.9"
+         }
+      ],
+      "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/processing.py",
+      "pipeline_url": null,
+      "pipeline_version":
null, + "schema_version": "0.1.0" + }, + "rig": null, + "schema_version": "0.2.7", + "session": null, + "subject": { + "background_strain": null, + "breeding_group": "Chat-IRES-Cre-neo", + "date_of_birth": "2022-07-16", + "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/site-packages/aind_data_schema/subject.py", + "genotype": "Chat-IRES-Cre-neo/Chat-IRES-Cre-neo", + "home_cage_enrichment": null, + "light_cycle": null, + "maternal_genotype": "Chat-IRES-Cre-neo/Chat-IRES-Cre-neo", + "maternal_id": "624133", + "mgi_allele_ids": null, + "notes": null, + "paternal_genotype": "Chat-IRES-Cre-neo/Chat-IRES-Cre-neo", + "paternal_id": "624115", + "restrictions": null, + "rrid": null, + "schema_version": "0.2.2", + "sex": "Male", + "source": null, + "species": "Mus musculus", + "subject_id": "642478", + "wellness_reports": null + } +} \ No newline at end of file diff --git a/tests/resources/utils/example_metadata1.nd.json b/tests/resources/utils/example_metadata1.nd.json new file mode 100644 index 0000000..4c75e61 --- /dev/null +++ b/tests/resources/utils/example_metadata1.nd.json @@ -0,0 +1,131 @@ +{ + "_id": "5ca4a951-d374-4f4b-8279-d570a35b2286", + "acquisition": null, + "created": "2024-05-15T17:41:26.697535", + "data_description": { + "creation_time": "2000-01-01T04:00:00", + "data_level": "raw", + "data_summary": null, + "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/data_description.py", + "funding_source": [ + { + "fundee": null, + "funder": { + "abbreviation": "AIND", + "name": "Allen Institute for Neural Dynamics", + "registry": { + "abbreviation": "ROR", + "name": "Research Organization Registry" + }, + "registry_identifier": "04szwah67" + }, + "grant_number": null + } + ], + "group": null, + "institution": { + "abbreviation": "AIND", + "name": "Allen Institute for Neural Dynamics", + "registry": { + "abbreviation": "ROR", + "name": "Research Organization Registry" + }, + "registry_identifier": "04szwah67" + }, + "investigators": [], + "license": "CC-BY-4.0", + "modality": [ + { + "abbreviation": "ecephys", + "name": "Extracellular electrophysiology" + } + ], + "name": "ecephys_567890_2000-01-01_04-00-00", + "platform": { + "abbreviation": "ecephys", + "name": "Electrophysiology platform" + }, + "project_name": null, + "related_data": [], + "restrictions": null, + "schema_version": "0.10.0", + "subject_id": "567890" + }, + "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/core/metadata.py", + "external_links": [], + "instrument": null, + "last_modified": "2024-05-15T17:41:26.697535", + "location": "s3://aind-ephys-data-dev-u5u0i5/ecephys_567890_2000-01-01_04-00-00", + "metadata_status": "Unknown", + "name": "ecephys_567890_2000-01-01_04-00-00", + "procedures": { + "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/procedures.py", + "notes": null, + "schema_version": "0.9.4", + "specimen_procedures": [], + "subject_procedures": [] + }, + "processing": { + "data_processes": [ + { + "code_url": "https://github.com/AllenNeuralDynamics/aind-data-transfer", + "code_version": null, + "end_date_time": "2023-10-10T18:16:40.084257+00:00", + "input_location": "/allen/aind/scratch/svc_aind_upload/test_data_sets/ecephys/655019_2023-04-03_18-17-07", + "name": "Ephys preprocessing", + "notes": null, + "output_location": 
"s3://aind-ephys-data-dev-u5u0i5/ecephys_567890_2000-01-01_04-00-00", + "outputs": null, + "parameters": { + "compress_raw_data": true, + "extra_configs": null, + "modality": { + "abbreviation": "ecephys", + "name": "Extracellular electrophysiology" + }, + "skip_staging": false, + "source": "/allen/aind/scratch/svc_aind_upload/test_data_sets/ecephys/655019_2023-04-03_18-17-07" + }, + "start_date_time": "2023-10-10T18:09:09.707473+00:00", + "version": "0.29.3" + } + ], + "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/processing.py", + "pipeline_url": null, + "pipeline_version": null, + "schema_version": "0.2.5" + }, + "rig": null, + "schema_version": "0.2.7", + "session": null, + "subject": { + "background_strain": null, + "breeding_group": "Gad2-IRES-Cre;Slc32a1-T2A-FlpO;Ai210-hyg", + "date_of_birth": "2021-01-06", + "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/subject.py", + "genotype": "Gad2-IRES-Cre/wt;Ai210(TITL-GC7f-ICF-IRES-tTA2)-hyg/wt", + "housing": null, + "maternal_genotype": "Gad2-IRES-Cre/Gad2-IRES-Cre", + "maternal_id": "561759", + "mgi_allele_ids": null, + "notes": null, + "paternal_genotype": null, + "paternal_id": "557009", + "restrictions": null, + "rrid": null, + "schema_version": "0.4.2", + "sex": "Male", + "source": null, + "species": { + "abbreviation": null, + "name": "Mus musculus", + "registry": { + "abbreviation": "NCBI", + "name": "National Center for Biotechnology Information" + }, + "registry_identifier": "10090" + }, + "subject_id": "567890", + "wellness_reports": null + } +} \ No newline at end of file diff --git a/tests/resources/utils/example_metadata2.nd.json b/tests/resources/utils/example_metadata2.nd.json new file mode 100644 index 0000000..6606e70 --- /dev/null +++ b/tests/resources/utils/example_metadata2.nd.json @@ -0,0 +1,209 @@ +{ + "_id": "7d0a4814-5a56-4497-a63c-b732f9e64af9", + "acquisition": null, + "created": "2024-05-15T17:41:30.131278", + "data_description": { + "creation_time": "2000-01-01T01:01:02", + "data_level": "raw", + "data_summary": null, + "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/data_description.py", + "funding_source": [ + { + "fundee": null, + "funder": { + "abbreviation": "AIND", + "name": "Allen Institute for Neural Dynamics", + "registry": { + "abbreviation": "ROR", + "name": "Research Organization Registry" + }, + "registry_identifier": "04szwah67" + }, + "grant_number": null + } + ], + "group": null, + "institution": { + "abbreviation": "AIND", + "name": "Allen Institute for Neural Dynamics", + "registry": { + "abbreviation": "ROR", + "name": "Research Organization Registry" + }, + "registry_identifier": "04szwah67" + }, + "investigators": [], + "license": "CC-BY-4.0", + "modality": [ + { + "abbreviation": "ecephys", + "name": "Extracellular electrophysiology" + } + ], + "name": "ecephys_655019_2000-01-01_01-01-02", + "platform": { + "abbreviation": "ecephys", + "name": "Electrophysiology platform" + }, + "project_name": null, + "related_data": [], + "restrictions": null, + "schema_version": "0.10.2", + "subject_id": "655019" + }, + "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/core/metadata.py", + "external_links": [], + "instrument": null, + "last_modified": "2024-05-15T17:41:30.131278", + "location": 
"s3://aind-ephys-data-dev-u5u0i5/ecephys_655019_2000-01-01_01-01-02", + "metadata_status": "Unknown", + "name": "ecephys_655019_2000-01-01_01-01-02", + "procedures": { + "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/procedures.py", + "notes": null, + "schema_version": "0.9.3", + "specimen_procedures": [], + "subject_id": "655019", + "subject_procedures": [ + { + "anaesthesia": null, + "animal_weight_post": null, + "animal_weight_prior": null, + "end_date": "2023-04-10", + "experimenter_full_name": 30241, + "iacuc_protocol": "2109", + "notes": null, + "output_specimen_ids": [ + 655019 + ], + "procedure_type": "Perfusion", + "start_date": "2023-04-10", + "weight_unit": "gram" + }, + { + "anaesthesia": { + "duration": 135.0, + "duration_unit": "minute", + "level": 1.5, + "type": "isoflurane" + }, + "animal_weight_post": 26.8, + "animal_weight_prior": 24.2, + "bregma_to_lambda_distance": 4.426, + "bregma_to_lambda_unit": "millimeter", + "craniotomy_coordinates_ap": null, + "craniotomy_coordinates_ml": null, + "craniotomy_coordinates_reference": null, + "craniotomy_coordinates_unit": "millimeter", + "craniotomy_hemisphere": null, + "craniotomy_size": null, + "craniotomy_size_unit": "millimeter", + "craniotomy_type": null, + "dura_removed": null, + "end_date": "2023-01-26", + "experimenter_full_name": "NSB-5011", + "iacuc_protocol": "2109", + "implant_part_number": null, + "notes": null, + "procedure_type": "Craniotomy", + "protective_material": null, + "recovery_time": 20.0, + "recovery_time_unit": "minute", + "start_date": "2023-01-26", + "weight_unit": "gram", + "workstation_id": "SWS 5" + }, + { + "anaesthesia": { + "duration": 135.0, + "duration_unit": "minute", + "level": 1.5, + "type": "isoflurane" + }, + "animal_weight_post": 26.8, + "animal_weight_prior": 24.2, + "end_date": "2023-01-26", + "experimenter_full_name": "NSB-5011", + "headframe_material": null, + "headframe_part_number": "0160-100-42", + "headframe_type": "WHC NP", + "iacuc_protocol": "2109", + "notes": null, + "procedure_type": "Headframe", + "start_date": "2023-01-26", + "weight_unit": "gram", + "well_part_number": "0160-055-08", + "well_type": "WHC NP" + } + ] + }, + "processing": { + "analyses": null, + "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/processing.py", + "notes": null, + "processing_pipeline": { + "data_processes": [ + { + "code_url": "https://github.com/AllenNeuralDynamics/aind-data-transfer", + "code_version": null, + "end_date_time": "2023-12-12T21:58:32.440692+00:00", + "input_location": "/allen/aind/scratch/svc_aind_upload/test_data_sets/ecephys/655019_2023-04-03_18-17-07", + "name": "Compression", + "notes": null, + "output_location": "s3://aind-ephys-data-dev-u5u0i5/ecephys_655019_2000-01-01_01-01-02", + "outputs": null, + "parameters": { + "compress_raw_data": true, + "extra_configs": null, + "modality": { + "abbreviation": "ecephys", + "name": "Extracellular electrophysiology" + }, + "skip_staging": false, + "source": "/allen/aind/scratch/svc_aind_upload/test_data_sets/ecephys/655019_2023-04-03_18-17-07" + }, + "software_version": "0.32.0", + "start_date_time": "2023-12-12T21:55:39.374709+00:00" + } + ], + "note": null, + "pipeline_url": null, + "pipeline_version": null, + "processor_full_name": "service" + }, + "schema_version": "0.3.1" + }, + "rig": null, + "schema_version": "0.2.7", + "session": null, + "subject": { + "background_strain": null, + "breeding_group": 
"Ai229(TIT2L-GC6m-IRES2-ChRmine-HA-WPRE-ISL-tTA2-WPRE)-hyg", + "date_of_birth": "2022-10-18", + "describedBy": "https://raw.githubusercontent.com/AllenNeuralDynamics/aind-data-schema/main/src/aind_data_schema/subject.py", + "genotype": "Ai229(TIT2L-GC6m-IRES2-ChRmine-HA-WPRE-ISL-tTA2-WPRE)-hyg/wt", + "housing": null, + "maternal_genotype": "wt/wt", + "maternal_id": "645737", + "mgi_allele_ids": null, + "notes": null, + "paternal_genotype": "Ai229(TIT2L-GC6m-IRES2-ChRmine-HA-WPRE-ISL-tTA2-WPRE)-hyg/wt", + "paternal_id": "643902", + "restrictions": null, + "rrid": null, + "schema_version": "0.4.2", + "sex": "Male", + "source": null, + "species": { + "abbreviation": null, + "name": "Mus musculus", + "registry": { + "abbreviation": "NCBI", + "name": "National Center for Biotechnology Information" + }, + "registry_identifier": "10090" + }, + "subject_id": "655019", + "wellness_reports": null + } +} \ No newline at end of file diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..4e1c663 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,234 @@ +"""Tests methods in utils module""" + +import json +import os +import unittest +from pathlib import Path +from unittest.mock import MagicMock, patch + +from aind_data_access_api.utils import ( + build_docdb_location_to_id_map, + does_metadata_record_exist_in_docdb, + get_record_from_docdb, + get_s3_bucket_and_prefix, + get_s3_location, + is_dict_corrupt, + paginate_docdb, +) + +TEST_DIR = Path(os.path.dirname(os.path.realpath(__file__))) +TEST_UTILS_DIR = TEST_DIR / "resources" / "utils" + + +class TestUtils(unittest.TestCase): + """Class to test methods in utils module.""" + + @classmethod + def setUpClass(cls) -> None: + """Set up the class by extracting contents from example files.""" + + def load_json_file(filename: str) -> dict: + """Load json file from resources directory.""" + with open(TEST_UTILS_DIR / filename, "r") as f: + return json.load(f) + + example_metadata_nd = load_json_file("example_metadata.nd.json") + example_metadata_nd1 = load_json_file("example_metadata1.nd.json") + example_metadata_nd2 = load_json_file("example_metadata2.nd.json") + + cls.example_metadata_nd = example_metadata_nd + cls.example_metadata_nd1 = example_metadata_nd1 + cls.example_metadata_nd2 = example_metadata_nd2 + + def test_is_dict_corrupt(self): + """Tests is_dict_corrupt method""" + good_contents = {"a": 1, "b": {"c": 2, "d": 3}} + bad_contents1 = {"a.1": 1, "b": {"c": 2, "d": 3}} + bad_contents2 = {"a": 1, "b": {"c": 2, "$d": 3}} + bad_contents3 = {"a": 1, "b": {"c": 2, "d": 3}, "$e": 4} + bad_contents4 = {"a": 1, "b": {"c": {"d": 3}, "$e": 4}} + bad_contents5 = [{"a": 1}, {"b": {"c": 2, "d": 3}}] + self.assertFalse(is_dict_corrupt(good_contents)) + self.assertTrue(is_dict_corrupt(bad_contents1)) + self.assertTrue(is_dict_corrupt(bad_contents2)) + self.assertTrue(is_dict_corrupt(bad_contents3)) + self.assertTrue(is_dict_corrupt(bad_contents4)) + self.assertTrue(is_dict_corrupt(bad_contents5)) + + def test_get_s3_bucket_and_prefix(self): + """Tests get_s3_bucket_and_prefix""" + results1 = get_s3_bucket_and_prefix( + s3_location="s3://some_bucket/prefix1/" + ) + results2 = get_s3_bucket_and_prefix( + s3_location="s3://some_bucket/prefix2" + ) + + self.assertEqual( + {"bucket": "some_bucket", "prefix": "prefix1"}, results1 + ) + self.assertEqual( + {"bucket": "some_bucket", "prefix": "prefix2"}, results2 + ) + + def test_get_s3_location(self): + """Tests get_s3_location""" + result1 = get_s3_location(bucket="some_bucket", 
prefix="prefix1") + result2 = get_s3_location(bucket="some_bucket", prefix="prefix2/") + self.assertEqual("s3://some_bucket/prefix1", result1) + self.assertEqual("s3://some_bucket/prefix2", result2) + + @patch("pymongo.MongoClient") + def test_does_metadata_record_exist_in_docdb_true( + self, mock_docdb_client: MagicMock + ): + """Tests does_metadata_record_exist_in_docdb when true""" + + mock_db = MagicMock() + mock_docdb_client.__getitem__.return_value = mock_db + mock_collection = MagicMock() + mock_db.__getitem__.return_value = mock_collection + mock_collection.find.return_value = iter( + [{"_id": "70bcf356-985f-4a2a-8105-de900e35e788"}] + ) + self.assertTrue( + does_metadata_record_exist_in_docdb( + docdb_client=mock_docdb_client, + db_name="metadata_index", + collection_name="data_assets", + bucket="aind-ephys-data-dev-u5u0i5", + prefix="ecephys_642478_2023-01-17_13-56-29", + ) + ) + + @patch("pymongo.MongoClient") + def test_does_metadata_record_exist_in_docdb_false( + self, mock_docdb_client: MagicMock + ): + """Tests does_metadata_record_exist_in_docdb when false""" + + mock_db = MagicMock() + mock_docdb_client.__getitem__.return_value = mock_db + mock_collection = MagicMock() + mock_db.__getitem__.return_value = mock_collection + mock_collection.find.return_value = iter([]) + self.assertFalse( + does_metadata_record_exist_in_docdb( + docdb_client=mock_docdb_client, + db_name="metadata_index", + collection_name="data_assets", + bucket="aind-ephys-data-dev-u5u0i5", + prefix="ecephys_642478_2023-01-17_13-56-29", + ) + ) + + @patch("pymongo.MongoClient") + def test_get_record_from_docdb(self, mock_docdb_client: MagicMock): + """Tests get_record_from_docdb when record exists""" + mock_db = MagicMock() + mock_docdb_client.__getitem__.return_value = mock_db + mock_collection = MagicMock() + mock_db.__getitem__.return_value = mock_collection + mock_collection.find.return_value = iter([self.example_metadata_nd]) + record = get_record_from_docdb( + docdb_client=mock_docdb_client, + db_name="metadata_index", + collection_name="data_assets", + record_id="488bbe42-832b-4c37-8572-25eb87cc50e2", + ) + self.assertEqual(self.example_metadata_nd, record) + + @patch("pymongo.MongoClient") + def test_get_record_from_docdb_none(self, mock_docdb_client: MagicMock): + """Tests get_record_from_docdb when record doesn't exist""" + mock_db = MagicMock() + mock_docdb_client.__getitem__.return_value = mock_db + mock_collection = MagicMock() + mock_db.__getitem__.return_value = mock_collection + mock_collection.find.return_value = iter([]) + record = get_record_from_docdb( + docdb_client=mock_docdb_client, + db_name="metadata_index", + collection_name="data_assets", + record_id="488bbe42-832b-4c37-8572-25eb87cc50ee", + ) + self.assertIsNone(record) + + @patch("pymongo.MongoClient") + def test_paginate_docdb(self, mock_docdb_client: MagicMock): + """Tests paginate_docdb""" + mock_db = MagicMock() + mock_docdb_client.__getitem__.return_value = mock_db + mock_collection = MagicMock() + mock_db.__getitem__.return_value = mock_collection + mock_collection.find.return_value = iter( + [ + self.example_metadata_nd, + self.example_metadata_nd1, + self.example_metadata_nd2, + ] + ) + pages = paginate_docdb( + docdb_client=mock_docdb_client, + db_name="metadata_index", + collection_name="data_assets", + page_size=2, + ) + expected_results = [ + [self.example_metadata_nd, self.example_metadata_nd1], + [self.example_metadata_nd2], + ] + actual_results = list(pages) + self.assertEqual(expected_results, actual_results) + + 
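+    # An additional edge case worth pinning down (illustrative sketch,
+    # reusing the same mocking pattern as the tests above): an exhausted
+    # cursor should make paginate_docdb yield no pages at all.
+    @patch("pymongo.MongoClient")
+    def test_paginate_docdb_empty(self, mock_docdb_client: MagicMock):
+        """Tests paginate_docdb when there are no records"""
+        mock_db = MagicMock()
+        mock_docdb_client.__getitem__.return_value = mock_db
+        mock_collection = MagicMock()
+        mock_db.__getitem__.return_value = mock_collection
+        mock_collection.find.return_value = iter([])
+        pages = paginate_docdb(
+            docdb_client=mock_docdb_client,
+            db_name="metadata_index",
+            collection_name="data_assets",
+            page_size=2,
+        )
+        self.assertEqual([], list(pages))
+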
@patch("pymongo.MongoClient") + def test_build_docdb_location_to_id_map( + self, mock_docdb_client: MagicMock + ): + """Tests build_docdb_location_to_id_map""" + bucket = "aind-ephys-data-dev-u5u0i5" + mock_db = MagicMock() + mock_docdb_client.__getitem__.return_value = mock_db + mock_collection = MagicMock() + mock_db.__getitem__.return_value = mock_collection + mock_collection.find.return_value = iter( + [ + { + "_id": "70bcf356-985f-4a2a-8105-de900e35e788", + "location": ( + f"s3://{bucket}/ecephys_655019_2000-04-04_04-00-00" + ), + }, + { + "_id": "5ca4a951-d374-4f4b-8279-d570a35b2286", + "location": ( + f"s3://{bucket}/ecephys_567890_2000-01-01_04-00-00" + ), + }, + ] + ) + + actual_map = build_docdb_location_to_id_map( + docdb_client=mock_docdb_client, + db_name="metadata_index", + collection_name="data_assets", + bucket=bucket, + prefixes=[ + "ecephys_655019_2000-04-04_04-00-00", + "ecephys_567890_2000-01-01_04-00-00/", + "missing_655019_2000-01-01_01-01-02", + ], + ) + expected_map = { + f"s3://{bucket}/ecephys_655019_2000-04-04_04-00-00": ( + "70bcf356-985f-4a2a-8105-de900e35e788" + ), + f"s3://{bucket}/ecephys_567890_2000-01-01_04-00-00": ( + "5ca4a951-d374-4f4b-8279-d570a35b2286" + ), + } + self.assertEqual(expected_map, actual_map) + + +if __name__ == "__main__": + unittest.main() From 64d2bd3ce2bafa95b68e747723732308f2021de5 Mon Sep 17 00:00:00 2001 From: Helen Lin <46795546+helen-m-lin@users.noreply.github.com> Date: Fri, 12 Jul 2024 15:38:58 -0700 Subject: [PATCH 2/3] docs: add docdb interface and utils usage to readthedocs (#62) * docs: add User Guide from README content * docs: add public REST API usage docs * docs: add Compass setup docs * docs: add examples for DocumentDbSSHClient * docs: add docdb intro to user guide * docs: remove usage from readme * docs: add utils automodule to readthedocs --- README.md | 109 +------ docs/source/ExamplesDocDBDirectConnection.rst | 293 ++++++++++++++++++ docs/source/UserGuide.rst | 226 ++++++++++++++ docs/source/aind_data_access_api.rst | 8 + docs/source/index.rst | 2 + 5 files changed, 534 insertions(+), 104 deletions(-) create mode 100644 docs/source/ExamplesDocDBDirectConnection.rst create mode 100644 docs/source/UserGuide.rst diff --git a/README.md b/README.md index 5258bfe..744aa4b 100644 --- a/README.md +++ b/README.md @@ -4,112 +4,13 @@ ![Code Style](https://img.shields.io/badge/code%20style-black-black) [![semantic-release: angular](https://img.shields.io/badge/semantic--release-angular-e10079?logo=semantic-release)](https://github.com/semantic-release/semantic-release) -API to interact with a few AIND databases. +API to interact with a few AIND databases. We have two primary databases: -## Usage -We have two primary databases. A Document store to keep unstructured json documents, and a relational database to store structured tables. +1. A document database (DocDB) to store + unstructured json documents. The DocDB contains AIND metadata. +2. A relational database to store structured tables. -### Document Store -We have some convenience methods to interact with our Document Store. You can create a client by explicitly setting credentials, or downloading from AWS Secrets Manager. - -__To connect from outside of our VPC:__ - -1. 
If using credentials from environment, please configure: -```sh -DOC_DB_HOST=docdb-us-west-2-****.cluster-************.us-west-2.docdb.amazonaws.com -DOC_DB_USERNAME=doc_db_username -DOC_DB_PASSWORD=doc_db_password -DOC_DB_SSH_HOST=ssh_host -DOC_DB_SSH_USERNAME=ssh_username -DOC_DB_SSH_PASSWORD=ssh_password -``` -2. Usage: -```python -from aind_data_access_api.document_db_ssh import DocumentDbSSHClient, DocumentDbSSHCredentials - -# Method 1) if credentials are set in environment -credentials = DocumentDbSSHCredentials() - -# Method 2) if you have permissions to AWS Secrets Manager -# Each secret must contain corresponding "host", "username", and "password" -credentials = DocumentDbSSHCredentials.from_secrets_manager( - doc_db_secret_name="/doc/store/secret/name", ssh_secret_name="/ssh/tunnel/secret/name" -) - -with DocumentDbSSHClient(credentials=credentials) as doc_db_client: - # To get a list of filtered records: - filter = {"subject.subject_id": "123456"} - projection = { - "name": 1, "created": 1, "location": 1, "subject.subject_id": 1, "subject.date_of_birth": 1, - } - count = doc_db_client.collection.count_documents(filter) - response = list(doc_db_client.collection.find(filter=filter, projection=projection)) -``` - -__To connect from within our VPC:__ -```python -from aind_data_access_api.credentials import DocumentStoreCredentials -from aind_data_access_api.document_store import Client - -# Method one assuming user, password, and host are known -ds_client = Client( - credentials=DocumentStoreCredentials( - username="user", - password="password", - host="host", - database="metadata", - ), - collection_name="data_assets", - ) - -# Method two if you have permissions to AWS Secrets Manager -ds_client = Client( - credentials=DocumentStoreCredentials( - aws_secrets_name="aind/data/access/api/document_store/metadata" - ), - collection_name="data_assets", - ) - -# To get all records -response = list(ds_client.retrieve_data_asset_records()) - -# To get a list of filtered records: -response = list(ds_client.retrieve_data_asset_records({"subject.subject_id": "123456"})) -``` - -### RDS Tables -We have some convenience methods to interact with our Relational Database. You can create a client by explicitly setting credentials, or downloading from AWS Secrets Manager. -``` -from aind_data_access_api.credentials import RDSCredentials -from aind_data_access_api.rds_tables import Client - -# Method one assuming user, password, and host are known -ds_client = Client( - credentials=RDSCredentials( - username="user", - password="password", - host="host", - database="metadata", - ), - collection_name="data_assets", - ) - -# Method two if you have permissions to AWS Secrets Manager -ds_client = Client( - credentials=RDSCredentials( - aws_secrets_name="aind/data/access/api/rds_tables" - ), - ) - -# To retrieve a table as a pandas dataframe -df = ds_client.read_table(table_name="spike_sorting_urls") - -# Can also pass in a custom sql query -df = ds_client.read_table(query="SELECT * FROM spike_sorting_urls") - -# It's also possible to save a pandas dataframe as a table. Please check internal documentation for more details. -ds_client.overwrite_table_with_df(df, table_name) -``` +More information can be found at [readthedocs](https://aind-data-access-api.readthedocs.io). ## Installation To use the software, it can be installed from PyPI. 
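For example, a standard installation with pip (the distribution name matches this repository's name):
```bash
pip install aind-data-access-api
```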
diff --git a/docs/source/ExamplesDocDBDirectConnection.rst b/docs/source/ExamplesDocDBDirectConnection.rst
new file mode 100644
index 0000000..e040d26
--- /dev/null
+++ b/docs/source/ExamplesDocDBDirectConnection.rst
@@ -0,0 +1,293 @@
+Examples - DocDB Direct Connection
+==================================
+
+This page provides examples for interacting with the Document Database (DocDB)
+using the provided Python client.
+
+It is assumed that the required credentials are set in the environment.
+Please refer to the User Guide for more details.
+
+
+Querying Metadata
+~~~~~~~~~~~~~~~~~~~~~~
+
+Count Example 1: Get # of records with a certain subject_id
+------------------------------------------------------------
+
+.. code:: python
+
+    import json
+
+    from aind_data_access_api.document_db_ssh import (
+        DocumentDbSSHClient,
+        DocumentDbSSHCredentials,
+    )
+
+    credentials = DocumentDbSSHCredentials()
+    with DocumentDbSSHClient(credentials=credentials) as doc_db_client:
+        filter = {"subject.subject_id": "689418"}
+        count = doc_db_client.collection.count_documents(filter)
+        print(count)
+
+Filter Example 1: Get records with a certain subject_id
+--------------------------------------------------------
+
+.. code:: python
+
+    with DocumentDbSSHClient(credentials=credentials) as doc_db_client:
+        filter = {"subject.subject_id": "689418"}
+        records = list(
+            doc_db_client.collection.find(filter=filter)
+        )
+        print(json.dumps(records, indent=3))
+
+
+With projection (recommended):
+
+.. code:: python
+
+    with DocumentDbSSHClient(credentials=credentials) as doc_db_client:
+        filter = {"subject.subject_id": "689418"}
+        projection = {
+            "name": 1,
+            "created": 1,
+            "location": 1,
+            "subject.subject_id": 1,
+            "subject.date_of_birth": 1,
+        }
+        records = list(
+            doc_db_client.collection.find(filter=filter, projection=projection)
+        )
+        print(json.dumps(records, indent=3))
+
+
+Filter Example 2: Get records with a certain breeding group
+------------------------------------------------------------
+
+.. code:: python
+
+    with DocumentDbSSHClient(credentials=credentials) as doc_db_client:
+        filter = {
+            "subject.breeding_info.breeding_group": "Chat-IRES-Cre_Jax006410"
+        }
+        records = list(
+            doc_db_client.collection.find(filter=filter)
+        )
+        print(json.dumps(records, indent=3))
+
+
+With projection (recommended):
+
+.. code:: python
+
+    with DocumentDbSSHClient(credentials=credentials) as doc_db_client:
+        filter = {
+            "subject.breeding_info.breeding_group": "Chat-IRES-Cre_Jax006410"
+        }
+        projection = {
+            "name": 1,
+            "created": 1,
+            "location": 1,
+            "subject.subject_id": 1,
+            "subject.breeding_info.breeding_group": 1,
+        }
+        records = list(
+            doc_db_client.collection.find(filter=filter, projection=projection)
+        )
+        print(json.dumps(records, indent=3))
+
+Aggregation Example 1: Get all subjects per breeding group
+-----------------------------------------------------------
+
+.. code:: python
+
+    with DocumentDbSSHClient(credentials=credentials) as doc_db_client:
+        agg_pipeline = [
+            {
+                "$group": {
+                    "_id": "$subject.breeding_info.breeding_group",
+                    "subject_ids": {"$addToSet": "$subject.subject_id"},
+                }
+            }
+        ]
+        result = list(
+            doc_db_client.collection.aggregate(pipeline=agg_pipeline)
+        )
+        print(f"Total breeding groups: {len(result)}")
+        print("First 3 breeding groups and corresponding subjects:")
+        print(json.dumps(result[:3], indent=3))
+
+
+Updating Metadata
+~~~~~~~~~~~~~~~~~~~~~~
+
+We provide several utility functions for interacting with DocDB within the
+``aind_data_access_api.utils`` module. Below is an example of how to use these
+functions to update records in DocDB.
+
+.. code:: python
+
+    import json
+    import logging
+    from typing import List, Optional
+
+    from aind_data_access_api.document_db_ssh import (
+        DocumentDbSSHClient,
+        DocumentDbSSHCredentials,
+    )
+    from aind_data_schema.core.metadata import Metadata
+
+    from aind_data_access_api.utils import paginate_docdb, is_dict_corrupt
+
+    logging.basicConfig(level="INFO")
+
+    def _process_docdb_records(records: List[dict], doc_db_client: DocumentDbSSHClient, dryrun: bool) -> None:
+        """
+        Process a page of records.
+        Parameters
+        ----------
+        records : List[dict]
+
+        """
+        for record in records:
+            _process_docdb_record(record=record, doc_db_client=doc_db_client, dryrun=dryrun)
+
+    def _process_docdb_record(record: dict, doc_db_client: DocumentDbSSHClient, dryrun: bool) -> None:
+        """
+        Process a record. This example updates the data_description.name field
+        if it does not match the record.name field.
+
+        Parameters
+        ----------
+        record : dict
+
+        """
+        _id = record.get("_id")
+        name = record.get("name")
+        location = record.get("location")
+        if _id:
+            if record.get("data_description") and record["data_description"].get("name") != name:
+                # Option 1: update specific field(s) only
+                new_fields = {
+                    "data_description.name": name
+                }
+                update_docdb_record_partial(record_id=_id, new_fields=new_fields, doc_db_client=doc_db_client, dryrun=dryrun)
+                # Option 2: build a new record with the Metadata model and
+                # replace the entire document with it
+                # new_record = build_new_docdb_record(record=record)
+                # if new_record is not None:
+                #     update_docdb_record_entire(record_id=_id, new_record=new_record, doc_db_client=doc_db_client, dryrun=dryrun)
+            # else:
+            #     logging.info(f"Record for {location} does not need to be updated.")
+        else:
+            logging.warning(f"Record for {location} does not have an _id field! Skipping.")
+
+
+    def build_new_docdb_record(record: Optional[dict]) -> Optional[dict]:
+        """Build a new record from an existing record. This example updates the
+        data_description.name field if it does not match the record.name field.
+
+        Parameters
+        ----------
+        record : Optional[dict]
+
+        Returns
+        -------
+        Optional[dict]
+            The new record, or None if the record cannot be constructed.
+        """
+        # Example: Update record.data_description.name if not matching record.name
+        if record is None:
+            return None
+        new_record = None
+        _id = record.get("_id")
+        name = record.get("name")
+        location = record.get("location")
+        created = record.get("created")
+        if record.get("data_description") and record["data_description"].get("name") != name:
+            if _id is None or name is None or location is None or created is None:
+                logging.warning("Record does not have _id, name, location, or created! Skipping.")
+                return None
+            try:
+                new_record = record.copy()
+                new_record["data_description"]["name"] = name
+                new_record_str = Metadata.model_construct(
+                    **new_record
+                ).model_dump_json(warnings=False, by_alias=True)
+                new_record = json.loads(new_record_str)
+                if is_dict_corrupt(new_record):
+                    logging.warning(f"New record for {location} is corrupt! Skipping.")
+                    new_record = None
+            except Exception:
+                new_record = None
+        return new_record

+    def update_docdb_record_partial(record_id: str, new_fields: dict, doc_db_client: DocumentDbSSHClient, dryrun: bool) -> None:
+        """
+        Update a record in docdb by updating specific fields only.
+        Parameters
+        ----------
+        record_id : str
+            The _id of the record to update.
+        new_fields : dict
+            New fields to update. E.g. {"data_description.name": "new_name"}
+
+        """
+        if dryrun:
+            logging.info(f"(dryrun) doc_db_client.collection.update_one (partial): {record_id}")
+        else:
+            logging.info(f"doc_db_client.collection.update_one (partial): {record_id}")
+            response = doc_db_client.collection.update_one(
+                {"_id": record_id},
+                {"$set": new_fields},
+                upsert=False,
+            )
+            logging.info(response.raw_result)
+
+
+    def update_docdb_record_entire(record_id: str, new_record: dict, doc_db_client: DocumentDbSSHClient, dryrun: bool) -> None:
+        """
+        Update a record in docdb by replacing the entire document with the new
+        record.
+        Parameters
+        ----------
+        record_id : str
+            The _id of the record to update.
+        new_record : dict
+            The new record to replace the existing record with.
+
+        """
+        if is_dict_corrupt(new_record) or record_id != new_record.get("_id"):
+            logging.warning(f"Attempting to update corrupt record {record_id}! Skipping.")
+            return
+        if dryrun:
+            logging.info(f"(dryrun) doc_db_client.collection.update_one: {record_id}")
+        else:
+            logging.info(f"doc_db_client.collection.update_one: {record_id}")
+            response = doc_db_client.collection.update_one(
+                {"_id": record_id},
+                {"$set": new_record},
+                upsert=False,
+            )
+            logging.info(response.raw_result)
+
+
+    if __name__ == "__main__":
+        credentials = DocumentDbSSHCredentials()  # credentials in environment
+        dryrun = True
+        filter = {"location": {"$regex": ".*s3://codeocean-s3datasetsbucket.*"}}
+        projection = None
+
+        with DocumentDbSSHClient(credentials=credentials) as doc_db_client:
+            db_name = doc_db_client.database_name
+            col_name = doc_db_client.collection_name
+            # count = doc_db_client.collection.count_documents(filter)
+            # logging.info(f"{db_name}.{col_name}: Found {count} records with {filter}")
+
+            logging.info(f"{db_name}.{col_name}: Starting to scan for {filter}.")
+            docdb_pages = paginate_docdb(
+                db_name=doc_db_client.database_name,
+                collection_name=doc_db_client.collection_name,
+                docdb_client=doc_db_client._client,
+                page_size=500,
+                filter_query=filter,
+            )
+            for page in docdb_pages:
+                _process_docdb_records(records=page, doc_db_client=doc_db_client, dryrun=dryrun)
+            logging.info(f"{db_name}.{col_name}: Finished scanning through DocDb.")
\ No newline at end of file
diff --git a/docs/source/UserGuide.rst b/docs/source/UserGuide.rst
new file mode 100644
index 0000000..c214801
--- /dev/null
+++ b/docs/source/UserGuide.rst
@@ -0,0 +1,226 @@
+User Guide
+==========
+
+Thank you for using ``aind-data-access-api``! This guide is
+intended for scientists and engineers in AIND who wish to interface
+with AIND databases.
+
+We have two primary databases:
+
+1. A `document database (DocDB) <#document-database-docdb>`__ to store
+   unstructured json documents. The DocDB contains AIND metadata.
+2. A `relational database <#rds-tables>`__ to store structured tables.
+
+Document Database (DocDB)
+-------------------------
+
+AIND metadata records stored in the DocDB describe the ``metadata.nd.json``
+for a data asset:
+
+- ``_id``: the unique ID of the data asset.
+- ``name``: the name of the data asset.
+- ``location``: the S3 location of the metadata, in the format
+  ``s3://{bucket_name}/{name}``. This is unique across records and can
+  be used to query or identify specific records.
+- Please see the `readthedocs for aind-data-schema
+  <https://aind-data-schema.readthedocs.io>`__
+  for more details.
+
+The DocDB can be accessed through a public read-only REST API or
+through a direct connection using SSH. For a direct connection,
+it is assumed you have the appropriate credentials.
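+Because ``location`` is unique, it can serve as a lookup key. As a minimal
+sketch (the bucket and asset name below are hypothetical), the helpers in
+``aind_data_access_api.utils`` can build the location string for a query
+filter:
+
+.. code:: python
+
+    from aind_data_access_api.utils import get_s3_location
+
+    # Build the S3 location string for a known bucket and asset name
+    location = get_s3_location(
+        bucket="some-bucket", prefix="ecephys_123456_2020-01-01_01-01-01"
+    )
+    filter = {"location": location}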
+
+REST API (Read-Only)
+~~~~~~~~~~~~~~~~~~~~~~
+
+1. A GET request to ``https://api.allenneuraldynamics.org/v1/metadata_index/data_assets``
+   with appropriate query parameters will return a list of records found.
+
+.. code:: python
+
+    import json
+    import requests
+
+    URL = "https://api.allenneuraldynamics.org/v1/metadata_index/data_assets"
+    filter = {"subject.subject_id": "123456"}
+    limit = 100
+    response = requests.get(URL, params={"filter": json.dumps(filter), "limit": limit})
+    print(response.json())
+
+2. Alternatively, we provide a Python client:
+
+.. code:: python
+
+    from aind_data_access_api.document_db import MetadataDbClient
+
+    API_GATEWAY_HOST = "api.allenneuraldynamics.org"
+    DATABASE = "metadata_index"
+    COLLECTION = "data_assets"
+
+    docdb_api_client = MetadataDbClient(
+        host=API_GATEWAY_HOST,
+        database=DATABASE,
+        collection=COLLECTION,
+    )
+
+    filter = {"subject.subject_id": "123456"}
+    limit = 1000
+    paginate_batch_size = 100
+    response = docdb_api_client.retrieve_data_asset_records(
+        filter_query=filter,
+        limit=limit,
+        paginate_batch_size=paginate_batch_size
+    )
+    print(response)
+
+
+Direct Connection (SSH) - Database UI (MongoDB Compass)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+MongoDB Compass is a database GUI that can be used to query and interact
+with our document database.
+
+To connect:
+
+1. If provided with a temporary SSH password, please first run ``ssh {ssh-username}@54.184.81.236``
+   and set a new password.
+2. Download the full version of `MongoDB Compass <https://www.mongodb.com/try/download/compass>`__.
+3. When connecting, click “Advanced Connection Options” and use the configurations below.
+   Leave any unspecified fields on their default setting.
+
+.. list-table::
+   :widths: 25 25 50
+   :header-rows: 1
+
+   * - Tab
+     - Config
+     - Value
+   * - General
+     - Host
+     - ``************.us-west-2.docdb.amazonaws.com``
+   * - Authentication
+     - Username
+     - ``doc_db_username``
+   * -
+     - Password
+     - ``doc_db_password``
+   * -
+     - Authentication Mechanism
+     - SCRAM-SHA-1
+   * - TLS/SSL
+     - SSL/TLS Connection
+     - OFF
+   * - Proxy/SSH
+     - SSH Tunnel/ Proxy Method
+     - SSH with Password
+   * -
+     - SSH Hostname
+     - ``ssh_host``
+   * -
+     - SSH Port
+     - 22
+   * -
+     - SSH Username
+     - ``ssh_username``
+   * -
+     - SSH Password
+     - ``ssh_password``
+
+4. You should be able to see the home page with the ``metadata-index`` database.
+   It should have a single collection called ``data_assets``.
+5. If provided with a temporary DocDB password, please change it using the embedded
+   mongo shell in Compass, and then reconnect.
+
+.. code:: javascript
+
+   db.updateUser(
+       "doc_db_username",
+       {
+           pwd: passwordPrompt()
+       }
+   )
+
+Direct Connection (SSH) - Python Client
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We have some convenience methods to interact with our Document Store.
+You can create a client by explicitly setting credentials, or downloading from AWS Secrets Manager.
+
+If using credentials from environment, please configure:
+
+.. code:: bash
+
+   DOC_DB_HOST=************.us-west-2.docdb.amazonaws.com
+   DOC_DB_USERNAME=doc_db_username
+   DOC_DB_PASSWORD=doc_db_password
+   DOC_DB_SSH_HOST=ssh_host
+   DOC_DB_SSH_USERNAME=ssh_username
+   DOC_DB_SSH_PASSWORD=ssh_password
+
+To use the client:
+
+..
code:: python + + from aind_data_access_api.document_db_ssh import DocumentDbSSHClient, DocumentDbSSHCredentials + + # Method 1) if credentials are set in environment + credentials = DocumentDbSSHCredentials() + + # Method 2) if you have permissions to AWS Secrets Manager + # Each secret must contain corresponding "host", "username", and "password" + credentials = DocumentDbSSHCredentials.from_secrets_manager( + doc_db_secret_name="/doc/db/secret/name", ssh_secret_name="/ssh/tunnel/secret/name" + ) + + with DocumentDbSSHClient(credentials=credentials) as doc_db_client: + # To get a list of filtered records: + filter = {"subject.subject_id": "123456"} + projection = { + "name": 1, "created": 1, "location": 1, "subject.subject_id": 1, "subject.date_of_birth": 1, + } + count = doc_db_client.collection.count_documents(filter) + response = list(doc_db_client.collection.find(filter=filter, projection=projection)) + + +RDS Tables +------------------ +We have some convenience methods to interact with our Relational Database. You can create a client by +explicitly setting credentials, or downloading from AWS Secrets Manager. + +.. code:: python + + from aind_data_access_api.credentials import RDSCredentials + from aind_data_access_api.rds_tables import Client + + # Method one assuming user, password, and host are known + ds_client = Client( + credentials=RDSCredentials( + username="user", + password="password", + host="host", + database="metadata", + ), + collection_name="data_assets", + ) + + # Method two if you have permissions to AWS Secrets Manager + ds_client = Client( + credentials=RDSCredentials( + aws_secrets_name="aind/data/access/api/rds_tables" + ), + ) + + # To retrieve a table as a pandas dataframe + df = ds_client.read_table(table_name="spike_sorting_urls") + + # Can also pass in a custom sql query + df = ds_client.read_table(query="SELECT * FROM spike_sorting_urls") + + # It's also possible to save a pandas dataframe as a table. Please check internal documentation for more details. + ds_client.overwrite_table_with_df(df, table_name) + +Reporting bugs or making feature requests +----------------------------------------- + +Please report any bugs or feature requests here: +`issues `__ diff --git a/docs/source/aind_data_access_api.rst b/docs/source/aind_data_access_api.rst index 78cde06..bb006e0 100644 --- a/docs/source/aind_data_access_api.rst +++ b/docs/source/aind_data_access_api.rst @@ -60,6 +60,14 @@ aind\_data\_access\_api.secrets module :undoc-members: :show-inheritance: +aind\_data\_access\_api.utils module +------------------------------------ + +.. automodule:: aind_data_access_api.utils + :members: + :undoc-members: + :show-inheritance: + Module contents --------------- diff --git a/docs/source/index.rst b/docs/source/index.rst index 07adcad..91ce562 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -11,6 +11,8 @@ Welcome to this repository's documentation! 
:maxdepth: 2 :caption: Contents: + UserGuide + ExamplesDocDBDirectConnection modules From 800de35107357f10d0c779617873cd12bc461d16 Mon Sep 17 00:00:00 2001 From: Helen Lin Date: Fri, 12 Jul 2024 15:46:15 -0700 Subject: [PATCH 3/3] build: version bump to v0.11.0 --- src/aind_data_access_api/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aind_data_access_api/__init__.py b/src/aind_data_access_api/__init__.py index 0deb776..dce99e7 100644 --- a/src/aind_data_access_api/__init__.py +++ b/src/aind_data_access_api/__init__.py @@ -1,3 +1,3 @@ """Init package""" -__version__ = "0.10.1" +__version__ = "0.11.0"