Commit
Backport PR #1230 on branch 0.10.x (Patch AnnData.__sizeof__() for backed datasets) (#1234)

Co-authored-by: Etienne JODRY <Etienne.JODRY@hotmail.fr>
meeseeksmachine and Neah-Ko authored Nov 17, 2023
1 parent 7dc4439 commit 0611f3b
Showing 3 changed files with 86 additions and 21 deletions.
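
In short, `AnnData.__sizeof__()` previously counted only in-memory buffers, so matrices kept on disk in backed mode contributed nothing; the patch adds a `with_disk` flag that also counts on-disk data. A minimal usage sketch — the `data.h5ad` path is illustrative, assuming the 0.10.4 behavior introduced below:

    # Illustrative usage; "data.h5ad" is a placeholder path.
    import anndata as ad

    adata_mem = ad.read_h5ad("data.h5ad")               # X loaded into memory
    adata_disk = ad.read_h5ad("data.h5ad", backed="r")  # X stays on disk

    adata_disk.__sizeof__()                # counts only what is resident in memory
    adata_disk.__sizeof__(with_disk=True)  # also counts the on-disk X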
37 changes: 23 additions & 14 deletions anndata/_core/anndata.py
@@ -25,7 +25,7 @@
 from numpy import ma
 from pandas.api.types import infer_dtype, is_string_dtype
 from scipy import sparse
-from scipy.sparse import csr_matrix, issparse
+from scipy.sparse import issparse

 from anndata._warnings import ImplicitModificationWarning

@@ -592,28 +592,37 @@ def _init_as_actual(
         # layers
         self._layers = Layers(self, layers)

-    def __sizeof__(self, show_stratified=None) -> int:
-        def get_size(X):
-            if issparse(X):
-                X_csr = csr_matrix(X)
-                return X_csr.data.nbytes + X_csr.indptr.nbytes + X_csr.indices.nbytes
+    def __sizeof__(self, show_stratified=None, with_disk: bool = False) -> int:
+        def get_size(X) -> int:
+            def cs_to_bytes(X) -> int:
+                return int(X.data.nbytes + X.indptr.nbytes + X.indices.nbytes)
+
+            if isinstance(X, h5py.Dataset) and with_disk:
+                return int(np.array(X.shape).prod() * X.dtype.itemsize)
+            elif isinstance(X, BaseCompressedSparseDataset) and with_disk:
+                return cs_to_bytes(X._to_backed())
+            elif isinstance(X, (sparse.csr_matrix, sparse.csc_matrix)):
+                return cs_to_bytes(X)
             else:
                 return X.__sizeof__()

-        size = 0
-        attrs = list(["_X", "_obs", "_var"])
-        attrs_multi = list(["_uns", "_obsm", "_varm", "varp", "_obsp", "_layers"])
+        sizes = {}
+        attrs = ["X", "_obs", "_var"]
+        attrs_multi = ["_uns", "_obsm", "_varm", "varp", "_obsp", "_layers"]
         for attr in attrs + attrs_multi:
             if attr in attrs_multi:
                 keys = getattr(self, attr).keys()
-                s = sum([get_size(getattr(self, attr)[k]) for k in keys])
+                s = sum(get_size(getattr(self, attr)[k]) for k in keys)
             else:
                 s = get_size(getattr(self, attr))
             if s > 0 and show_stratified:
-                str_attr = attr.replace("_", ".") + " " * (7 - len(attr))
-                print(f"Size of {str_attr}: {'%3.2f' % (s / (1024 ** 2))} MB")
-            size += s
-        return size
+                from tqdm import tqdm
+
+                print(
+                    f"Size of {attr.replace('_', '.'):<7}: {tqdm.format_sizeof(s, 'B')}"
+                )
+            sizes[attr] = s
+        return sum(sizes.values())

     def _gen_repr(self, n_obs, n_vars) -> str:
         if self.isbacked:
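For reference, the per-array accounting in the new `get_size` can be exercised outside of AnnData: dense HDF5 datasets are sized as element count times item size, and CSR/CSC matrices as the sum of their three underlying buffers. A self-contained sketch, assuming only h5py, numpy, and scipy (the `example.h5` file name is a placeholder):

    # Sketch of the size accounting used above; "example.h5" is a placeholder.
    import h5py
    import numpy as np
    from scipy import sparse

    def nbytes(X, with_disk: bool = False) -> int:
        if isinstance(X, h5py.Dataset) and with_disk:
            # Dense dataset on disk: number of elements times bytes per element.
            return int(np.prod(X.shape)) * X.dtype.itemsize
        if isinstance(X, (sparse.csr_matrix, sparse.csc_matrix)):
            # Compressed sparse: data, indptr, and indices hold everything.
            return int(X.data.nbytes + X.indptr.nbytes + X.indices.nbytes)
        return X.__sizeof__()

    mat = sparse.random(100, 50, density=0.1, format="csr")
    print(nbytes(mat))  # bytes across data + indptr + indices

    with h5py.File("example.h5", "w") as f:
        dset = f.create_dataset("X", data=mat.toarray())
        print(nbytes(dset, with_disk=True))  # 100 * 50 * 8 bytes for float64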
69 changes: 62 additions & 7 deletions anndata/tests/test_backed_sparse.py
@@ -1,5 +1,8 @@
 from __future__ import annotations

+from contextlib import contextmanager
+from typing import TYPE_CHECKING, Callable, Literal
+
 import h5py
 import numpy as np
 import pytest
@@ -12,6 +15,11 @@
 from anndata.experimental import read_dispatched
 from anndata.tests.helpers import assert_equal, subset_func

+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from numpy.typing import ArrayLike
+
 subset_func2 = subset_func


@@ -21,7 +29,9 @@ def diskfmt(request):


 @pytest.fixture(scope="function")
-def ondisk_equivalent_adata(tmp_path, diskfmt):
+def ondisk_equivalent_adata(
+    tmp_path: Path, diskfmt: Literal["h5ad", "zarr"]
+) -> tuple[AnnData, AnnData, AnnData, AnnData]:
     csr_path = tmp_path / f"csr.{diskfmt}"
     csc_path = tmp_path / f"csc.{diskfmt}"
     dense_path = tmp_path / f"dense.{diskfmt}"
@@ -68,7 +78,11 @@ def callback(func, elem_name, elem, iospec):
     return csr_mem, csr_disk, csc_disk, dense_disk


-def test_backed_indexing(ondisk_equivalent_adata, subset_func, subset_func2):
+def test_backed_indexing(
+    ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData],
+    subset_func,
+    subset_func2,
+):
     csr_mem, csr_disk, csc_disk, dense_disk = ondisk_equivalent_adata

     obs_idx = subset_func(csr_mem.obs_names)
@@ -87,7 +101,12 @@ def test_backed_indexing(ondisk_equivalent_adata, subset_func, subset_func2):
         pytest.param(sparse.csc_matrix, sparse.hstack),
     ],
 )
-def test_dataset_append_memory(tmp_path, sparse_format, append_method, diskfmt):
+def test_dataset_append_memory(
+    tmp_path: Path,
+    sparse_format: Callable[[ArrayLike], sparse.spmatrix],
+    append_method: Callable[[list[sparse.spmatrix]], sparse.spmatrix],
+    diskfmt: Literal["h5ad", "zarr"],
+):
     path = (
         tmp_path / f"test.{diskfmt.replace('ad', '')}"
     )  # diskfmt is either h5ad or zarr
@@ -115,7 +134,12 @@ def test_dataset_append_memory(tmp_path, sparse_format, append_method, diskfmt):
         pytest.param(sparse.csc_matrix, sparse.hstack),
     ],
 )
-def test_dataset_append_disk(tmp_path, sparse_format, append_method, diskfmt):
+def test_dataset_append_disk(
+    tmp_path: Path,
+    sparse_format: Callable[[ArrayLike], sparse.spmatrix],
+    append_method: Callable[[list[sparse.spmatrix]], sparse.spmatrix],
+    diskfmt: Literal["h5ad", "zarr"],
+):
     path = (
         tmp_path / f"test.{diskfmt.replace('ad', '')}"
     )  # diskfmt is either h5ad or zarr
@@ -146,7 +170,13 @@ def test_dataset_append_disk(tmp_path, sparse_format, append_method, diskfmt):
         pytest.param("csc", (100, 100), (200, 100)),
     ],
 )
-def test_wrong_shape(tmp_path, sparse_format, a_shape, b_shape, diskfmt):
+def test_wrong_shape(
+    tmp_path: Path,
+    sparse_format: Literal["csr", "csc"],
+    a_shape: tuple[int, int],
+    b_shape: tuple[int, int],
+    diskfmt: Literal["h5ad", "zarr"],
+):
     path = (
         tmp_path / f"test.{diskfmt.replace('ad', '')}"
     )  # diskfmt is either h5ad or zarr
@@ -167,7 +197,7 @@ def test_wrong_shape(tmp_path, sparse_format, a_shape, b_shape, diskfmt):
         a_disk.append(b_disk)


-def test_wrong_formats(tmp_path, diskfmt):
+def test_wrong_formats(tmp_path: Path, diskfmt: Literal["h5ad", "zarr"]):
     path = (
         tmp_path / f"test.{diskfmt.replace('ad', '')}"
     )  # diskfmt is either h5ad or zarr
@@ -198,7 +228,7 @@ def test_wrong_formats(tmp_path, diskfmt):
     assert not np.any((pre_checks != post_checks).toarray())


-def test_anndata_sparse_compat(tmp_path, diskfmt):
+def test_anndata_sparse_compat(tmp_path: Path, diskfmt: Literal["h5ad", "zarr"]):
     path = (
         tmp_path / f"test.{diskfmt.replace('ad', '')}"
     )  # diskfmt is either h5ad or zarr
@@ -212,3 +242,28 @@ def test_anndata_sparse_compat(tmp_path, diskfmt):
         ad._io.specs.write_elem(f, "/", base)
         adata = ad.AnnData(sparse_dataset(f["/"]))
         assert_equal(adata.X, base)
+
+
+@contextmanager
+def xfail_if_zarr(diskfmt: Literal["h5ad", "zarr"]):
+    if diskfmt == "zarr":
+        with pytest.raises(AssertionError):
+            yield
+        # TODO: Zarr backed mode https://github.com/scverse/anndata/issues/219
+        pytest.xfail("Backed zarr not really supported yet")
+    else:
+        yield
+
+
+def test_backed_sizeof(
+    ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData],
+    diskfmt: Literal["h5ad", "zarr"],
+):
+    csr_mem, csr_disk, csc_disk, _ = ondisk_equivalent_adata
+
+    assert csr_mem.__sizeof__() == csr_disk.__sizeof__(with_disk=True)
+    assert csr_mem.__sizeof__() == csc_disk.__sizeof__(with_disk=True)
+    assert csr_disk.__sizeof__(with_disk=True) == csc_disk.__sizeof__(with_disk=True)
+    with xfail_if_zarr(diskfmt):
+        assert csr_mem.__sizeof__() > csr_disk.__sizeof__()
+        assert csr_mem.__sizeof__() > csc_disk.__sizeof__()
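
Two details worth noting in `test_backed_sizeof`: the assertions call `__sizeof__()` directly rather than going through `sys.getsizeof()`, because in CPython `sys.getsizeof()` adds the garbage-collector header on top of what `__sizeof__()` reports; and the strict inequalities at the end are expected to fail on zarr, where backed mode is not yet supported, which is what `xfail_if_zarr` encodes. A quick illustration of the first point:

    import sys

    d = {"a": 1}
    print(d.__sizeof__())    # the object's own accounting
    print(sys.getsizeof(d))  # __sizeof__() plus GC overhead in CPython
    assert sys.getsizeof(d) >= d.__sizeof__()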
1 change: 1 addition & 0 deletions docs/release-notes/0.10.4.md
@@ -3,6 +3,7 @@
 ```{rubric} Bugfix
 ```
 * Only try to use `Categorical.map(na_action=…)` in actually supported Pandas ≥2.1 {pr}`1226` {user}`flying-sheep`
+* `AnnData.__sizeof__()` support for backed datasets {pr}`1230` {user}`Neah-Ko`

 ```{rubric} Documentation
 ```