Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

lazy dataframes in .obs and .var with backed="r" mode #983

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion anndata/_core/anndata.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
_resolve_idxs,
)
from .sparse_dataset import SparseDataset
from .dataframe import DataFrame
from .. import utils
from ..utils import convert_to_dict, ensure_df_homogeneous, dim_len
from ..logging import anndata_logger as logger
Expand Down Expand Up @@ -347,7 +348,7 @@ def _init_as_view(self, adata_ref: "AnnData", oidx: Index, vidx: Index):

# set data
if self.isbacked:
self._X = None
self._X = self._obs = None

# set raw, easy, as it’s immutable anyways...
if adata_ref._raw is not None:
Expand Down
41 changes: 41 additions & 0 deletions anndata/_core/dataframe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""
This module implements on-disk dataframes.
"""

import h5py
import pandas as pd


class DataFrame:
    """Read-only, lazily accessed view of a dataframe stored in an HDF5 group.

    Construction does not load column data: every column is kept as whatever
    object indexing the group by the column name returns. A column is
    retrieved with ``df["name"]`` or, when the name does not clash with a
    real attribute of this class, with ``df.name``.
    """

    def __init__(self, group: "h5py.Group"):
        """Wrap ``group``, which must be encoded as a dataframe.

        Parameters
        ----------
        group
            Group whose ``attrs`` provide ``encoding-type``, ``_index`` (the
            key of the index dataset) and ``column-order``.

        Raises
        ------
        AssertionError
            If the ``encoding-type`` attribute is not ``"dataframe"``.
        """
        # Raised explicitly instead of via ``assert`` so the check survives
        # ``python -O``; the exception type is unchanged for existing callers.
        if group.attrs["encoding-type"] != "dataframe":
            raise AssertionError(
                f"HDF5 group at path '{group.name}' is not encoded as a dataframe"
            )

        self._group = group
        self._attrs = self._group.attrs

        self._index = self._group[self._attrs["_index"]].asstr()
        self._columns = self._attrs["column-order"]

        # Columns are kept in a mapping rather than set as instance
        # attributes: a column called "index" or "columns" would collide with
        # the read-only properties below, and one called e.g. "_group" would
        # overwrite internal state.
        # TODO: consider partial reads (read_elem_partial) for column access.
        self._data = {column: self._group[column] for column in self.columns}

    def __getattr__(self, name):
        """Fall back to column lookup so that ``df.<column>`` keeps working."""
        # Only reached when normal attribute lookup fails. ``_data`` is read
        # through ``__dict__`` so a partially initialised instance cannot
        # recurse back into this method.
        data = self.__dict__.get("_data", {})
        try:
            return data[name]
        except KeyError:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            ) from None

    @property
    def index(self):
        """Row labels, read from the index dataset on every access."""
        return pd.Index(self._index[:])

    @property
    def columns(self):
        """Column names in their stored ``column-order``."""
        return pd.Index(self._columns)

    def __getitem__(self, index):
        """Return the stored column named ``index``.

        Raises
        ------
        NotImplementedError
            If ``index`` is a slice.
        TypeError
            For any other index, including a string that is not a column.
        """
        if isinstance(index, str) and index in self._data:
            return self._data[index]

        if isinstance(index, slice):
            raise NotImplementedError("Slicing is not yet supported.")

        raise TypeError(f"Invalid index '{index}' of type {type(index)}")
1 change: 1 addition & 0 deletions anndata/_io/specs/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from anndata._core.index import _normalize_indices
from anndata._core.merge import intersect_keys
from anndata._core.sparse_dataset import SparseDataset
from anndata._core.dataframe import DataFrame
from anndata._core import views
from anndata.compat import (
ZarrArray,
Expand Down
9 changes: 9 additions & 0 deletions anndata/tests/test_hdf5_backing.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,15 @@ def test_backed_raw(tmp_path):
assert_equal(final_adata, mem_adata)


def test_backed_dataframes(tmp_path):
    """Write a generated AnnData to disk and compare its ``backed="r"`` read with the original."""
    h5ad_file = tmp_path / "backed.h5ad"

    # Persist the in-memory reference object first.
    expected = gen_adata((10, 10), **GEN_ADATA_DASK_ARGS)
    expected.write(h5ad_file)

    # Re-open the same file in read-only backed mode and compare.
    actual = ad.read_h5ad(h5ad_file, backed="r")
    assert_equal(actual, expected)


@pytest.mark.parametrize(
"array_type",
[
Expand Down