"""
This module implements on disk dataframes.
"""

from typing import TYPE_CHECKING

import pandas as pd

if TYPE_CHECKING:
    # h5py is referenced only by the annotation on ``DataFrame.__init__``.
    import h5py


class DataFrame:
    """Thin wrapper around a group that is encoded as a dataframe.

    The wrapper keeps a reference to the group and, for every name listed in
    the group's ``"column-order"`` attribute, the object returned by
    ``group[name]``.  Columns can be reached either by item access
    (``df["col"]``) or by attribute access (``df.col``).

    Parameters
    ----------
    group
        Group whose ``attrs`` mapping contains ``"encoding-type"`` (which must
        equal ``"dataframe"``), ``"_index"`` (name of the member holding the
        index) and ``"column-order"`` (the column names).

    Raises
    ------
    ValueError
        If the group's ``"encoding-type"`` attribute is missing or is not
        ``"dataframe"``.
    """

    def __init__(self, group: "h5py.Group"):
        # Explicit check instead of ``assert`` so it still runs under ``-O``.
        if group.attrs.get("encoding-type") != "dataframe":
            raise ValueError(
                f"HDF5 group at path '{group.name}' is not encoded as a dataframe"
            )

        self._group = group
        self._attrs = self._group.attrs

        # ``asstr()`` is called on the index member; the values themselves are
        # only sliced out when the ``index`` property is accessed.
        self._index = self._group[self._attrs["_index"]].asstr()
        self._columns = self._attrs["column-order"]

        # Columns are kept in a private mapping rather than being set as
        # instance attributes: a column named e.g. "index", "columns" or
        # "_group" would otherwise collide with the properties below or
        # overwrite internal state.
        self._data = {name: self._group[name] for name in self.columns}

    @property
    def index(self):
        """Index values, sliced from the index member, as a ``pandas.Index``."""
        return pd.Index(self._index[:])

    @property
    def columns(self):
        """Column names from the ``"column-order"`` attribute as a ``pandas.Index``."""
        return pd.Index(self._columns)

    def __getattr__(self, name):
        """Expose columns as attributes (``df.col``).

        Only invoked when regular attribute lookup fails, so properties and
        instance attributes always take precedence over a column of the same
        name; such columns remain reachable through ``df["name"]``.
        """
        # Go through ``__dict__`` so that a lookup on a not-yet-initialised
        # instance cannot recurse back into ``__getattr__``.
        data = self.__dict__.get("_data", {})
        try:
            return data[name]
        except KeyError:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            ) from None

    def __getitem__(self, index):
        """Return the stored object for the column named ``index``.

        Raises
        ------
        KeyError
            If ``index`` is a string that is not a column name.
        NotImplementedError
            If ``index`` is a slice.
        TypeError
            For any other kind of ``index``.
        """
        if isinstance(index, str):
            try:
                return self._data[index]
            except KeyError:
                # A missing key is reported as KeyError, following the
                # mapping protocol, rather than as a type problem.
                raise KeyError(index) from None

        if isinstance(index, slice):
            raise NotImplementedError("Slicing is not yet supported.")

        raise TypeError(f"Invalid index '{index}' of type {type(index)}")
def test_backed_dataframes(tmp_path):
    """A written AnnData re-opened with ``backed="r"`` compares equal to the original."""
    in_memory = gen_adata((10, 10), **GEN_ADATA_DASK_ARGS)
    h5ad_file = tmp_path / "backed.h5ad"
    in_memory.write(h5ad_file)

    # Re-open the file just written in read-only backed mode and compare.
    on_disk = ad.read_h5ad(h5ad_file, backed="r")
    assert_equal(on_disk, in_memory)