Skip to content

Commit

Permalink
Add docdb utils and property on Session object
Browse files Browse the repository at this point in the history
  • Loading branch information
bjhardcastle committed Aug 28, 2024
1 parent fb9b70b commit 77812b5
Show file tree
Hide file tree
Showing 7 changed files with 596 additions and 11 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ True
's3://aind-ephys-data/ecephys_676909_2023-12-13_13-43-40'
>>> session.modalities
('behavior', 'behavior_videos', 'ecephys')
>>> session.docdb.keys()
dict_keys(['_id', 'acquisition', 'created', 'data_description', 'describedBy', 'external_links', 'instrument', 'last_modified', 'location', 'metadata_status', 'name', 'procedures', 'processing', 'rig', 'schema_version', 'session', 'subject'])

# Additional functionality in namespace extensions:
>>> session.metadata.subject['genotype']
Expand Down
413 changes: 412 additions & 1 deletion pdm.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ dependencies = [
"codeocean>=0.1.5",
"npc-io>=0.1.27",
"npc-session>=0.1.39",
"aind-data-access-api[docdb]>=0.14.0",
]
version = "0.1.18"
classifiers = [
Expand Down
20 changes: 19 additions & 1 deletion src/aind_session/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import datetime
import logging
from typing import Any

import codeocean.data_asset
import npc_session
Expand Down Expand Up @@ -54,7 +55,7 @@ class Session:
>>> session.raw_data_dir.as_posix()
's3://aind-private-data-prod-o5171v/behavior_717121_2024-06-16_11-39-34'
>>> session = Session('SmartSPIM_698260_2024-07-20_21-47-21')
>>> session = Session('SmartSPIM_123456_2024-07-20_21-47-21')
>>> session.raw_data_dir.as_posix()
Traceback (most recent call last):
...
Expand Down Expand Up @@ -232,6 +233,8 @@ def raw_data_dir(self) -> upath.UPath:
>>> session.raw_data_dir.as_posix()
's3://aind-ephys-data/ecephys_676909_2023-12-13_13-43-40'
"""
if (p := self.docdb.get("location")):
return upath.UPath(p)
if getattr(self, "raw_data_asset", None):
logger.debug(
f"Using asset {self.raw_data_asset.id} to find raw data path for {self.id}"
Expand Down Expand Up @@ -292,6 +295,21 @@ def modalities(self) -> tuple[str, ...]:
dir_names.remove(name)
logger.debug(f"Excluding {name!r} from modality names")
return tuple(sorted(dir_names))

@property
def docdb(self) -> dict[str, Any]:
    """Contents of the session's DocumentDB record.

    The record is looked up with `aind_session.utils.get_docdb_record`, using
    the session's ID as the data asset name.

    Returns
    -------
    dict[str, Any]
        Whatever `get_docdb_record` returns for `self.id`.

    Examples
    --------
    >>> session = aind_session.Session('ecephys_676909_2023-12-13_13-43-40')
    >>> docdb = session.docdb
    >>> type(docdb)
    <class 'dict'>
    >>> docdb.keys()       # doctest: +SKIP
    dict_keys(['_id', 'acquisition', 'created', 'data_description', 'describedBy', 'external_links', 'instrument', 'last_modified', 'location', 'metadata_status', 'name', 'procedures', 'processing', 'rig', 'schema_version', 'session', 'subject'])
    """
    return aind_session.utils.get_docdb_record(self.id)


def get_sessions(
Expand Down
1 change: 1 addition & 0 deletions src/aind_session/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from aind_session.utils.codeocean_utils import *
from aind_session.utils.docdb_utils import *
from aind_session.utils.misc_utils import *
from aind_session.utils.s3_utils import *
19 changes: 10 additions & 9 deletions src/aind_session/utils/codeocean_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,16 +185,17 @@ def is_raw_data_asset(
f"{asset.id=} name does not contain a valid session ID: {asset.name=}"
)
return False
if session_id == asset.name:
logger.debug(
f"{asset.id=} name is a session ID alone, with no additional suffixes: it is considered raw data {asset.name=}"
)
return True
else:
logger.debug(
f"{asset.id=} name is not a session ID alone: it is not considered raw data {asset.name=}"
)
return False
if session_id == asset.name:
logger.debug(
f"{asset.id=} name is a session ID alone, with no additional suffixes: it is considered raw data {asset.name=}"
)
return True
else:
logger.debug(
f"{asset.id=} name is not a session ID alone: it is not considered raw data {asset.name=}"
)
return False


@functools.cache
Expand Down
151 changes: 151 additions & 0 deletions src/aind_session/utils/docdb_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
from __future__ import annotations

import contextlib
import functools
import logging
from typing import Any
import uuid

import aind_data_access_api.document_db
import npc_session

import aind_session.utils.codeocean_utils

logger = logging.getLogger(__name__)


@functools.cache
def get_docdb_api_client(**kwargs) -> aind_data_access_api.document_db.MetadataDbClient:
    """
    Build a MetadataDbClient from the supplied keyword arguments.

    Any of the following that the caller leaves out falls back to its default:
        host: "api.allenneuraldynamics.org"
        database: "metadata_index"
        collection: "data_assets"

    The function is wrapped in `functools.cache`, so calls made with equal
    keyword arguments return the same client instance.
    """
    defaults = {
        "host": "api.allenneuraldynamics.org",
        "database": "metadata_index",
        "collection": "data_assets",
    }
    # caller-supplied values take precedence over the defaults
    client_kwargs = {**defaults, **kwargs}
    return aind_data_access_api.document_db.MetadataDbClient(**client_kwargs)


@functools.cache
def get_subject_docdb_records(
    subject_id: str | int,
    ttl_hash: int | None = None,
) -> tuple[dict[str, Any], ...]:
    """
    Fetch every record in the DocumentDB "data_assets" collection whose
    `subject.subject_id` matches the given subject, ordered by ascending
    creation time.

    `ttl_hash` is discarded immediately: because the function is cached, it only
    serves to make otherwise-identical calls distinct cache entries.

    Examples
    --------
    >>> records = get_subject_docdb_records(676909)
    >>> records[0].keys()       # doctest: +SKIP
    dict_keys(['_id', 'acquisition', 'created', 'data_description', 'describedBy', 'external_links', 'instrument', 'last_modified', 'location', 'metadata_status', 'name', 'procedures', 'processing', 'rig', 'schema_version', 'session', 'subject'])
    """
    del ttl_hash
    client = get_docdb_api_client()
    # subject IDs are queried as strings, whatever type the caller passed
    subject_query = {"subject.subject_id": str(subject_id)}
    ascending_by_creation = {"created": 1}
    records = client.retrieve_docdb_records(
        filter_query=subject_query,
        sort=ascending_by_creation,
    )
    logger.debug(
        f"Retrieved {len(records)} records for subject {subject_id} from DocumentDB"
    )
    return tuple(records)


def _most_recent_record(
    records: list[dict[str, Any]],
    description: str,
) -> dict[str, Any]:
    """Return the most-recently created record from a non-empty list of records.

    A warning is logged when more than one record is supplied. `description`
    is the text used to identify the asset in that warning.
    """
    if len(records) == 1:
        return records[0]
    logger.warning(
        f"Multiple records found for {description} in DocumentDB: returning most-recently created"
    )
    # Select by creation time explicitly instead of asserting on the order the
    # server returned: the sort is stable, so for an already-sorted list this
    # is still the last element, and records sharing a creation time no longer
    # trigger an AssertionError.
    return sorted(records, key=lambda record: record["created"])[-1]


@functools.cache
def get_docdb_record(
    data_asset_name_or_id: str | uuid.UUID,
    ttl_hash: int | None = None,
) -> dict[str, Any]:
    """
    Retrieve a single record from the DocumentDB "data_assets" collection that has the
    given data asset name or, if a UUID is supplied, corresponds to the given data asset ID.

    **note: assets are currently (2024/08) incomplete in DocumentDB:** if none
    are found by ID, a workaround using the CodeOcean API is used to get the
    asset's name, and the record is looked up by name instead.

    - if multiple records are found, the most-recently created record is returned
    - if no record is found, an empty dict is returned

    Parameters
    ----------
    data_asset_name_or_id
        A data asset name (typically a session ID), or a data asset ID. The value
        is treated as an ID if `get_normalized_uuid` accepts it, and as a name
        if that call raises `ValueError`.
    ttl_hash
        Discarded immediately: because the function is cached, it only serves
        to make otherwise-identical calls distinct cache entries.

    Returns
    -------
    dict[str, Any]
        The matching record, or an empty dict if none is found.

    Examples
    --------
    Get a record by data asset name (typically a session ID):
    >>> record = get_docdb_record("ecephys_676909_2023-12-13_13-43-40")
    >>> assert record
    >>> record.keys()       # doctest: +SKIP
    dict_keys(['_id', 'acquisition', 'created', 'data_description', 'describedBy', 'external_links', 'instrument', 'last_modified', 'location', 'metadata_status', 'name', 'procedures', 'processing', 'rig', 'schema_version', 'session', 'subject'])

    Get a record by data asset ID:
    >>> assert get_docdb_record('16d46411-540a-4122-b47f-8cb2a15d593a')
    """
    del ttl_hash
    asset_id = asset_name = None
    try:
        asset_id = aind_session.utils.codeocean_utils.get_normalized_uuid(
            data_asset_name_or_id
        )
    except ValueError:
        # not a UUID: treat the input as a data asset name
        asset_name = str(data_asset_name_or_id)

    if asset_id:
        # retrieve records by ID
        records = get_docdb_api_client().retrieve_docdb_records(
            filter_query={
                "external_links": {
                    "$elemMatch": {
                        "Code Ocean": aind_session.utils.codeocean_utils.get_normalized_uuid(
                            asset_id
                        )
                    }
                },
            },
            sort={"created": 1},
        )
        if records:
            return _most_recent_record(records, str(asset_id))

        logger.debug(
            f"No records found matching {asset_id} in DocumentDB, however records are currently incomplete (2024-08)."
            " Getting asset name from CodeOcean API, then looking up DocumentDB record by name instead."
        )
        try:
            asset_name = aind_session.get_data_asset_model(asset_id).name
        except Exception:
            # NOTE(review): any failure of the CodeOcean lookup is reported as
            # "does not exist" - confirm whether other errors should be
            # distinguished.
            logger.warning(f"{asset_id} does not exist in CodeOcean")
            return {}

    # retrieve records by name
    assert asset_name is not None
    records = get_docdb_api_client().retrieve_docdb_records(
        filter_query={
            "name": asset_name,
        },
        sort={"created": 1},
    )
    if not records:
        logger.warning(f"No records found for {asset_name!r} in DocumentDB")
        return {}
    return _most_recent_record(records, repr(asset_name))


if __name__ == "__main__":
    # Script entry point: delegate to the package-level `testmod` helper
    # (presumably runs this module's doctests - confirm against aind_session).
    import aind_session

    aind_session.testmod()

0 comments on commit 77812b5

Please sign in to comment.