Patch AnnData.__sizeof__() for backed datasets #1230

Merged · 22 commits · Nov 17, 2023
37 changes: 23 additions & 14 deletions anndata/_core/anndata.py
@@ -25,7 +25,7 @@
 from numpy import ma
 from pandas.api.types import infer_dtype, is_string_dtype
 from scipy import sparse
-from scipy.sparse import csr_matrix, issparse
+from scipy.sparse import issparse

 from anndata._warnings import ImplicitModificationWarning
@@ -592,28 +592,37 @@ def _init_as_actual(
         # layers
         self._layers = Layers(self, layers)

-    def __sizeof__(self, show_stratified=None) -> int:
-        def get_size(X):
-            if issparse(X):
-                X_csr = csr_matrix(X)
-                return X_csr.data.nbytes + X_csr.indptr.nbytes + X_csr.indices.nbytes
+    def __sizeof__(self, show_stratified=None, with_disk: bool = False) -> int:
+        def get_size(X) -> int:
+            def cs_to_bytes(X) -> int:
+                return int(X.data.nbytes + X.indptr.nbytes + X.indices.nbytes)
+
+            if isinstance(X, h5py.Dataset) and with_disk:
+                return int(np.array(X.shape).prod() * X.dtype.itemsize)
+            elif isinstance(X, BaseCompressedSparseDataset) and with_disk:
+                return cs_to_bytes(X._to_backed())
+            elif isinstance(X, (sparse.csr_matrix, sparse.csc_matrix)):
+                return cs_to_bytes(X)
             else:
                 return X.__sizeof__()

-        size = 0
-        attrs = list(["_X", "_obs", "_var"])
-        attrs_multi = list(["_uns", "_obsm", "_varm", "varp", "_obsp", "_layers"])
+        sizes = {}
+        attrs = ["X", "_obs", "_var"]
+        attrs_multi = ["_uns", "_obsm", "_varm", "varp", "_obsp", "_layers"]
         for attr in attrs + attrs_multi:
             if attr in attrs_multi:
                 keys = getattr(self, attr).keys()
-                s = sum([get_size(getattr(self, attr)[k]) for k in keys])
+                s = sum(get_size(getattr(self, attr)[k]) for k in keys)
             else:
                 s = get_size(getattr(self, attr))
             if s > 0 and show_stratified:
-                str_attr = attr.replace("_", ".") + " " * (7 - len(attr))
-                print(f"Size of {str_attr}: {'%3.2f' % (s / (1024 ** 2))} MB")
-            size += s
-        return size
+                from tqdm import tqdm
+
+                print(
+                    f"Size of {attr.replace('_', '.'):<7}: {tqdm.format_sizeof(s, 'B')}"
+                )
+            sizes[attr] = s
+        return sum(sizes.values())

     def _gen_repr(self, n_obs, n_vars) -> str:
         if self.isbacked:
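For orientation, here is a minimal usage sketch of the new `with_disk` flag (not part of the PR; the file name is hypothetical, and any backed `.h5ad` behaves the same way):

```python
import anndata as ad

# Hypothetical file, opened in backed mode so X stays on disk.
adata = ad.read_h5ad("example.h5ad", backed="r")

# Default behaviour: only in-memory buffers are counted, so a backed
# AnnData reports a small footprint.
in_memory = adata.__sizeof__()

# with_disk=True also counts the on-disk X: dense HDF5 datasets as
# prod(shape) * itemsize, backed sparse as data + indptr + indices.
total = adata.__sizeof__(with_disk=True)

print(f"in-memory: {in_memory} B, including on-disk X: {total} B")
```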
69 changes: 62 additions & 7 deletions anndata/tests/test_backed_sparse.py
@@ -1,5 +1,8 @@
 from __future__ import annotations

+from contextlib import contextmanager
+from typing import TYPE_CHECKING, Callable, Literal
+
 import h5py
 import numpy as np
 import pytest
@@ -12,6 +15,11 @@
 from anndata.experimental import read_dispatched
 from anndata.tests.helpers import assert_equal, subset_func

+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from numpy.typing import ArrayLike
+
 subset_func2 = subset_func

@@ -21,7 +29,9 @@ def diskfmt(request):


 @pytest.fixture(scope="function")
-def ondisk_equivalent_adata(tmp_path, diskfmt):
+def ondisk_equivalent_adata(
+    tmp_path: Path, diskfmt: Literal["h5ad", "zarr"]
+) -> tuple[AnnData, AnnData, AnnData, AnnData]:
     csr_path = tmp_path / f"csr.{diskfmt}"
     csc_path = tmp_path / f"csc.{diskfmt}"
     dense_path = tmp_path / f"dense.{diskfmt}"
@@ -68,7 +78,11 @@ def callback(func, elem_name, elem, iospec):
     return csr_mem, csr_disk, csc_disk, dense_disk


-def test_backed_indexing(ondisk_equivalent_adata, subset_func, subset_func2):
+def test_backed_indexing(
+    ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData],
+    subset_func,
+    subset_func2,
+):
     csr_mem, csr_disk, csc_disk, dense_disk = ondisk_equivalent_adata

     obs_idx = subset_func(csr_mem.obs_names)
@@ -87,7 +101,12 @@ def test_backed_indexing(ondisk_equivalent_adata, subset_func, subset_func2):
         pytest.param(sparse.csc_matrix, sparse.hstack),
     ],
 )
-def test_dataset_append_memory(tmp_path, sparse_format, append_method, diskfmt):
+def test_dataset_append_memory(
+    tmp_path: Path,
+    sparse_format: Callable[[ArrayLike], sparse.spmatrix],
+    append_method: Callable[[list[sparse.spmatrix]], sparse.spmatrix],
+    diskfmt: Literal["h5ad", "zarr"],
+):
     path = (
         tmp_path / f"test.{diskfmt.replace('ad', '')}"
     )  # diskfmt is either h5ad or zarr
@@ -115,7 +134,12 @@ def test_dataset_append_memory(tmp_path, sparse_format, append_method, diskfmt):
         pytest.param(sparse.csc_matrix, sparse.hstack),
     ],
 )
-def test_dataset_append_disk(tmp_path, sparse_format, append_method, diskfmt):
+def test_dataset_append_disk(
+    tmp_path: Path,
+    sparse_format: Callable[[ArrayLike], sparse.spmatrix],
+    append_method: Callable[[list[sparse.spmatrix]], sparse.spmatrix],
+    diskfmt: Literal["h5ad", "zarr"],
+):
     path = (
         tmp_path / f"test.{diskfmt.replace('ad', '')}"
     )  # diskfmt is either h5ad or zarr
@@ -146,7 +170,13 @@ def test_dataset_append_disk(tmp_path, sparse_format, append_method, diskfmt):
         pytest.param("csc", (100, 100), (200, 100)),
     ],
 )
-def test_wrong_shape(tmp_path, sparse_format, a_shape, b_shape, diskfmt):
+def test_wrong_shape(
+    tmp_path: Path,
+    sparse_format: Literal["csr", "csc"],
+    a_shape: tuple[int, int],
+    b_shape: tuple[int, int],
+    diskfmt: Literal["h5ad", "zarr"],
+):
     path = (
         tmp_path / f"test.{diskfmt.replace('ad', '')}"
     )  # diskfmt is either h5ad or zarr
@@ -167,7 +197,7 @@ def test_wrong_shape(tmp_path, sparse_format, a_shape, b_shape, diskfmt):
         a_disk.append(b_disk)


-def test_wrong_formats(tmp_path, diskfmt):
+def test_wrong_formats(tmp_path: Path, diskfmt: Literal["h5ad", "zarr"]):
     path = (
         tmp_path / f"test.{diskfmt.replace('ad', '')}"
     )  # diskfmt is either h5ad or zarr
@@ -198,7 +228,7 @@ def test_wrong_formats(tmp_path, diskfmt):
     assert not np.any((pre_checks != post_checks).toarray())


-def test_anndata_sparse_compat(tmp_path, diskfmt):
+def test_anndata_sparse_compat(tmp_path: Path, diskfmt: Literal["h5ad", "zarr"]):
     path = (
         tmp_path / f"test.{diskfmt.replace('ad', '')}"
     )  # diskfmt is either h5ad or zarr
@@ -212,3 +242,28 @@ def test_anndata_sparse_compat(tmp_path, diskfmt):
     ad._io.specs.write_elem(f, "/", base)
     adata = ad.AnnData(sparse_dataset(f["/"]))
     assert_equal(adata.X, base)
+
+
+@contextmanager
+def xfail_if_zarr(diskfmt: Literal["h5ad", "zarr"]):
+    if diskfmt == "zarr":
+        with pytest.raises(AssertionError):
+            yield
+        # TODO: Zarr backed mode https://github.com/scverse/anndata/issues/219
+        pytest.xfail("Backed zarr not really supported yet")
+    else:
+        yield
+
+
+def test_backed_sizeof(
+    ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData],
+    diskfmt: Literal["h5ad", "zarr"],
+):
+    csr_mem, csr_disk, csc_disk, _ = ondisk_equivalent_adata
+
+    assert csr_mem.__sizeof__() == csr_disk.__sizeof__(with_disk=True)
+    assert csr_mem.__sizeof__() == csc_disk.__sizeof__(with_disk=True)
+    assert csr_disk.__sizeof__(with_disk=True) == csc_disk.__sizeof__(with_disk=True)
+    with xfail_if_zarr(diskfmt):
+        assert csr_mem.__sizeof__() > csr_disk.__sizeof__()
+        assert csr_mem.__sizeof__() > csc_disk.__sizeof__()
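As a sanity check on the byte accounting the patch relies on, here is a standalone sketch using only numpy and scipy (`cs_bytes` is a hypothetical name that simply mirrors the PR's `cs_to_bytes` helper):

```python
import numpy as np
from scipy import sparse


def cs_bytes(x) -> int:
    # A CSR/CSC matrix is three arrays: nonzero values (data),
    # compressed pointers (indptr), and column/row indices (indices).
    return int(x.data.nbytes + x.indptr.nbytes + x.indices.nbytes)


x = sparse.random(100, 100, density=0.1, format="csr")
dense = x.toarray()

# The dense formula used for h5py datasets: prod(shape) * itemsize.
dense_bytes = int(np.prod(dense.shape)) * dense.dtype.itemsize
assert dense_bytes == dense.nbytes

# At 10% density the CSR representation should be far smaller.
print(f"sparse: {cs_bytes(x)} B, dense: {dense_bytes} B")
```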
1 change: 1 addition & 0 deletions docs/release-notes/0.10.4.md
@@ -3,6 +3,7 @@
 ```{rubric} Bugfix
 ```
 * Only try to use `Categorical.map(na_action=…)` in actually supported Pandas ≥2.1 {pr}`1226` {user}`flying-sheep`
+* `AnnData.__sizeof__()` support for backed datasets {pr}`1230` {user}`Neah-Ko`

 ```{rubric} Documentation
 ```