From 36480ba2af5559bd6a21cb0964f02658667d4abd Mon Sep 17 00:00:00 2001
From: Grisha Szep
Date: Sun, 16 Apr 2023 12:42:36 +0100
Subject: [PATCH 1/4] HDF5DataFrame class

---
 anndata/_core/anndata.py           | 10 ++++-
 anndata/_core/file_backing.py      | 70 ++++++++++++++++++++++++++++++
 anndata/tests/test_hdf5_backing.py |  9 ++++
 3 files changed, 87 insertions(+), 2 deletions(-)

diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py
index 38680b9dd..1a8bf6ba8 100644
--- a/anndata/_core/anndata.py
+++ b/anndata/_core/anndata.py
@@ -26,7 +26,7 @@
 from anndata._warnings import ImplicitModificationWarning
 from .raw import Raw
 from .index import _normalize_indices, _subset, Index, Index1D, get_vector
-from .file_backing import AnnDataFileManager, to_memory
+from .file_backing import AnnDataFileManager, to_memory, HDF5DataFrame
 from .access import ElementRef
 from .aligned_mapping import (
     AxisArrays,
@@ -836,7 +836,13 @@ def _set_dim_index(self, value: pd.Index, attr: str):
     @property
     def obs(self) -> pd.DataFrame:
         """One-dimensional annotation of observations (`pd.DataFrame`)."""
-        return self._obs
+        if self.isbacked:
+            if not self.file.is_open:
+                self.file.open()
+            obs = HDF5DataFrame(self.file["obs"])
+        else:
+            obs = self._obs
+        return obs
 
     @obs.setter
     def obs(self, value: pd.DataFrame):
diff --git a/anndata/_core/file_backing.py b/anndata/_core/file_backing.py
index 02401873c..987ff038c 100644
--- a/anndata/_core/file_backing.py
+++ b/anndata/_core/file_backing.py
@@ -5,6 +5,7 @@
 from collections.abc import Mapping
 
 import h5py
+import pandas as pd
 
 from . import anndata
 from .sparse_dataset import SparseDataset
@@ -133,3 +134,72 @@ def _(x, copy=False):
         return _copy(x)
     else:
         return x
+
+
+class HDF5DataFrame:
+    def __init__(self, group: h5py.Group):
+        assert (
+            group.attrs["encoding-type"] == "dataframe"
+        ), f"HDF5 group at path '{group.name}' is not encoded as a dataframe"
+
+        self._group = group
+        self._attrs = self._group.attrs
+
+        self._index = self._group[self._attrs["_index"]].asstr()
+        self.columns = pd.Index(self._attrs["column-order"])
+
+        for column in self.columns:
+            setattr(self, column, self._group[column])
+
+    @property
+    def index(self):
+        return pd.Index(self._index[:])
+
+    # def __getitem__(self, key):
+    #     if isinstance(key, str) and key in self.columns:
+    #         return self._group[key]
+
+    #     elif isinstance(key, slice):
+    #         return self._group[key]
+
+    #     if isinstance(index, tuple) and self.attr in ("obs", "obsm"):
+    #         oidx = index[0]
+    #         if len(index) > 1:
+    #             vidx = index[1]
+
+    #         if oidx is None:
+    #             view = self.adset[index]
+    #         else:
+    #             view = self.adset[oidx]
+    #         attr_arr = getattr(view, self.attr)
+    #         if self.key is not None:
+    #             attr_arr = attr_arr[self.key]
+    #         return attr_arr if vidx is None else attr_arr[:, vidx]
+
+    # @property
+    # def shape(self):
+    #     shape = self.adset.shape
+    #     if self.attr in ["X", "layers"]:
+    #         return shape
+    #     elif self.attr == "obs":
+    #         return (shape[0],)
+    #     elif self.attr == "obsm" and self.key is not None:
+    #         return shape[0], self[:1].shape[1]
+    #     else:
+    #         return None
+
+    # @property
+    # def ndim(self):
+    #     return len(self.shape) if self.shape is not None else 0
+
+    # @property
+    # def dtype(self):
+    #     _dtypes = self.adset._dtypes
+    #     if _dtypes is not None and self.attr in _dtypes:
+    #         return _dtypes[self.attr][self.key]
+
+    #     attr = self[:1]
+    #     if hasattr(attr, "dtype"):
+    #         return attr.dtype
+    #     else:
+    #         return None
diff --git a/anndata/tests/test_hdf5_backing.py b/anndata/tests/test_hdf5_backing.py
index 987d01382..66419818e 100644
--- 
a/anndata/tests/test_hdf5_backing.py +++ b/anndata/tests/test_hdf5_backing.py @@ -175,6 +175,15 @@ def test_backed_raw(tmp_path): assert_equal(final_adata, mem_adata) +def test_backed_dataframes(tmp_path): + backed_pth = tmp_path / "backed.h5ad" + mem_adata = gen_adata((10, 10), **GEN_ADATA_DASK_ARGS) + mem_adata.write(backed_pth) + + backed_adata = ad.read_h5ad(backed_pth, backed="r") + assert_equal(backed_adata, mem_adata) + + @pytest.mark.parametrize( "array_type", [ From d9bd744305908c4b754a755b07278c9c40e83fdd Mon Sep 17 00:00:00 2001 From: Grisha Szep Date: Sun, 16 Apr 2023 13:56:07 +0100 Subject: [PATCH 2/4] factor out dataframe.py --- anndata/_core/anndata.py | 5 +- anndata/_core/dataframe.py | 413 ++++++++++++++++++++++++++++++++++ anndata/_core/file_backing.py | 70 ------ 3 files changed, 416 insertions(+), 72 deletions(-) create mode 100644 anndata/_core/dataframe.py diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py index 1a8bf6ba8..235827c39 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -26,7 +26,7 @@ from anndata._warnings import ImplicitModificationWarning from .raw import Raw from .index import _normalize_indices, _subset, Index, Index1D, get_vector -from .file_backing import AnnDataFileManager, to_memory, HDF5DataFrame +from .file_backing import AnnDataFileManager, to_memory from .access import ElementRef from .aligned_mapping import ( AxisArrays, @@ -44,6 +44,7 @@ _resolve_idxs, ) from .sparse_dataset import SparseDataset +from .dataframe import DataFrame from .. import utils from ..utils import convert_to_dict, ensure_df_homogeneous, dim_len from ..logging import anndata_logger as logger @@ -839,7 +840,7 @@ def obs(self) -> pd.DataFrame: if self.isbacked: if not self.file.is_open: self.file.open() - obs = HDF5DataFrame(self.file["obs"]) + obs = DataFrame(self.file["obs"]) else: obs = self._obs return obs diff --git a/anndata/_core/dataframe.py b/anndata/_core/dataframe.py new file mode 100644 index 000000000..388ae551a --- /dev/null +++ b/anndata/_core/dataframe.py @@ -0,0 +1,413 @@ +""" +This module implements on disk dataframes. +""" + +import collections.abc as cabc +from itertools import accumulate, chain +from typing import Union, NamedTuple, Tuple, Sequence, Iterable, Type +from warnings import warn + +import h5py +import pandas as pd +import numpy as np +import scipy.sparse as ss +from scipy.sparse import _sparsetools + +from ..compat import _read_attr + +# from .._io.specs.methods import read_elem_partial + +try: + # Not really important, just for IDEs to be more helpful + from scipy.sparse.compressed import _cs_matrix +except ImportError: + _cs_matrix = ss.spmatrix + +from .index import unpack_index, Index, _subset + + +class BackedFormat(NamedTuple): + format_str: str + backed_type: Type["BackedSparseMatrix"] + memory_type: Type[ss.spmatrix] + + +class BackedSparseMatrix(_cs_matrix): + """\ + Mixin class for backed sparse matrices. + + Largely needed for the case `backed_sparse_csr(...)[:]`, + since that calls copy on `.data`, `.indices`, and `.indptr`. + """ + + def copy(self) -> ss.spmatrix: + if isinstance(self.data, h5py.Dataset): + return SparseDataset(self.data.parent).to_memory() + else: + return super().copy() + + def _set_many(self, i: Iterable[int], j: Iterable[int], x): + """\ + Sets value at each (i, j) to x + + Here (i,j) index major and minor respectively, + and must not contain duplicate entries. 
+ """ + # Scipy 1.3+ compat + n_samples = 1 if np.isscalar(x) else len(x) + offsets = self._offsets(i, j, n_samples) + + if -1 not in offsets: + # make a list for interaction with h5py + offsets = list(offsets) + # only affects existing non-zero cells + self.data[offsets] = x + return + + else: + raise ValueError( + "You cannot change the sparsity structure of a SparseDataset." + ) + # replace where possible + # mask = offsets > -1 + # # offsets[mask] + # bool_data_mask = np.zeros(len(self.data), dtype=bool) + # bool_data_mask[offsets[mask]] = True + # self.data[bool_data_mask] = x[mask] + # # self.data[offsets[mask]] = x[mask] + # # only insertions remain + # mask = ~mask + # i = i[mask] + # i[i < 0] += M + # j = j[mask] + # j[j < 0] += N + # self._insert_many(i, j, x[mask]) + + def _zero_many(self, i: Sequence[int], j: Sequence[int]): + """\ + Sets value at each (i, j) to zero, preserving sparsity structure. + + Here (i,j) index major and minor respectively. + """ + offsets = self._offsets(i, j, len(i)) + + # only assign zeros to the existing sparsity structure + self.data[list(offsets[offsets > -1])] = 0 + + def _offsets( + self, i: Iterable[int], j: Iterable[int], n_samples: int + ) -> np.ndarray: + i, j, M, N = self._prepare_indices(i, j) + offsets = np.empty(n_samples, dtype=self.indices.dtype) + ret = _sparsetools.csr_sample_offsets( + M, N, self.indptr, self.indices, n_samples, i, j, offsets + ) + if ret == 1: + # rinse and repeat + self.sum_duplicates() + _sparsetools.csr_sample_offsets( + M, N, self.indptr, self.indices, n_samples, i, j, offsets + ) + return offsets + + +class backed_csr_matrix(BackedSparseMatrix, ss.csr_matrix): + def _get_intXslice(self, row: int, col: slice) -> ss.csr_matrix: + return ss.csr_matrix( + get_compressed_vector(self, row), shape=(1, self.shape[1]) + )[:, col] + + def _get_sliceXslice(self, row: slice, col: slice) -> ss.csr_matrix: + out_shape = ( + slice_len(row, self.shape[0]), + slice_len(col, self.shape[1]), + ) + if out_shape[0] == 1: + return self._get_intXslice(slice_as_int(row, self.shape[0]), col) + elif out_shape[1] == self.shape[1] and out_shape[0] < self.shape[0]: + return self._get_arrayXslice(np.arange(*row.indices(self.shape[0])), col) + return super()._get_sliceXslice(row, col) + + def _get_arrayXslice(self, row: Sequence[int], col: slice) -> ss.csr_matrix: + idxs = np.asarray(row) + if idxs.dtype == bool: + idxs = np.where(idxs) + return ss.csr_matrix( + get_compressed_vectors(self, idxs), shape=(len(idxs), self.shape[1]) + )[:, col] + + +class backed_csc_matrix(BackedSparseMatrix, ss.csc_matrix): + def _get_sliceXint(self, row: slice, col: int) -> ss.csc_matrix: + return ss.csc_matrix( + get_compressed_vector(self, col), shape=(self.shape[0], 1) + )[row, :] + + def _get_sliceXslice(self, row: slice, col: slice) -> ss.csc_matrix: + out_shape = ( + slice_len(row, self.shape[0]), + slice_len(col, self.shape[1]), + ) + if out_shape[1] == 1: + return self._get_sliceXint(row, slice_as_int(col, self.shape[1])) + elif out_shape[0] == self.shape[0] and out_shape[1] < self.shape[1]: + return self._get_sliceXarray(row, np.arange(*col.indices(self.shape[1]))) + return super()._get_sliceXslice(row, col) + + def _get_sliceXarray(self, row: slice, col: Sequence[int]) -> ss.csc_matrix: + idxs = np.asarray(col) + if idxs.dtype == bool: + idxs = np.where(idxs) + return ss.csc_matrix( + get_compressed_vectors(self, idxs), shape=(self.shape[0], len(idxs)) + )[row, :] + + +FORMATS = [ + BackedFormat("csr", backed_csr_matrix, ss.csr_matrix), + 
BackedFormat("csc", backed_csc_matrix, ss.csc_matrix),
+]
+
+
+def slice_len(s: slice, l: int) -> int:
+    """Returns length of `a[s]` where `len(a) == l`."""
+    return len(range(*s.indices(l)))
+
+
+def slice_as_int(s: slice, l: int) -> int:
+    """Converts slices of length 1 to the integer index they’ll access."""
+    out = list(range(*s.indices(l)))
+    assert len(out) == 1
+    return out[0]
+
+
+def get_compressed_vectors(
+    x: BackedSparseMatrix, row_idxs: Iterable[int]
+) -> Tuple[Sequence, Sequence, Sequence]:
+    slices = [slice(*(x.indptr[i : i + 2])) for i in row_idxs]
+    data = np.concatenate([x.data[s] for s in slices])
+    indices = np.concatenate([x.indices[s] for s in slices])
+    indptr = list(accumulate(chain((0,), (s.stop - s.start for s in slices))))
+    return data, indices, indptr
+
+
+def get_compressed_vector(
+    x: BackedSparseMatrix, idx: int
+) -> Tuple[Sequence, Sequence, Sequence]:
+    s = slice(*(x.indptr[idx : idx + 2]))
+    data = x.data[s]
+    indices = x.indices[s]
+    indptr = [0, len(data)]
+    return data, indices, indptr
+
+
+def get_format_str(data: ss.spmatrix) -> str:
+    for fmt, _, memory_class in FORMATS:
+        if isinstance(data, memory_class):
+            return fmt
+    raise ValueError(f"Data type {type(data)} is not supported.")
+
+
+def get_memory_class(format_str: str) -> Type[ss.spmatrix]:
+    for fmt, _, memory_class in FORMATS:
+        if format_str == fmt:
+            return memory_class
+    raise ValueError(f"Format string {format_str} is not supported.")
+
+
+def get_backed_class(format_str: str) -> Type[BackedSparseMatrix]:
+    for fmt, backed_class, _ in FORMATS:
+        if format_str == fmt:
+            return backed_class
+    raise ValueError(f"Format string {format_str} is not supported.")
+
+
+class DataFrame:
+    def __init__(self, group: h5py.Group):
+        assert (
+            group.attrs["encoding-type"] == "dataframe"
+        ), f"HDF5 group at path '{group.name}' is not encoded as a dataframe"
+
+        self._group = group
+        self._attrs = self._group.attrs
+
+        self._index = self._group[self._attrs["_index"]].asstr()
+        self.columns = pd.Index(self._attrs["column-order"])
+
+        for column in self.columns:
+            # read_elem_partial(group)  # , items=obs, indices=(obs_idx, slice(None)))
+            setattr(self, column, self._group[column])
+
+    @property
+    def index(self):
+        return pd.Index(self._index[:])
+
+    def __getitem__(self, index):
+        if isinstance(index, str) and index in self.columns:
+            return getattr(self, index)
+
+        elif isinstance(index, slice):
+            return self._group[index]
+
+        else:
+            raise TypeError(f"Invalid index '{index}' of type {type(index)}")
+
+
+class SparseDataset:
+    """Analogous to :class:`h5py.Dataset <h5py.Dataset>`, but for sparse matrices."""
+
+    def __init__(self, group: h5py.Group):
+        self.group = group
+
+    @property
+    def dtype(self) -> np.dtype:
+        return self.group["data"].dtype
+
+    @property
+    def format_str(self) -> str:
+        if "h5sparse_format" in self.group.attrs:
+            return _read_attr(self.group.attrs, "h5sparse_format")
+        else:
+            # Should this be an extra field?
+            return _read_attr(self.group.attrs, "encoding-type").replace("_matrix", "")
+
+    @property
+    def h5py_group(self) -> h5py.Group:
+        warn(
+            "Attribute `h5py_group` of SparseDatasets is deprecated. "
+            "Use `group` instead.",
+            DeprecationWarning,
+        )
+        return self.group
+
+    @property
+    def name(self) -> str:
+        return self.group.name
+
+    @property
+    def file(self) -> h5py.File:
+        return self.group.file
+
+    @property
+    def shape(self) -> Tuple[int, int]:
+        shape = self.group.attrs.get("h5sparse_shape")
+        return tuple(self.group.attrs["shape"] if shape is None else shape)
+
+    @property
+    def value(self) -> ss.spmatrix:
+        return self.to_memory()
+
+    def __repr__(self) -> str:
+        return (
+            f"<HDF5 sparse dataset: format {self.format_str!r}, "
+            f"shape {self.shape}, "
+            f'type "{self.dtype}">'
+        )
+
+    def __getitem__(self, index: Union[Index, Tuple[()]]) -> Union[float, ss.spmatrix]:
+        row, col = self._normalize_index(index)
+        mtx = self.to_backed()
+        sub = mtx[row, col]
+        # If indexing is array x array it returns a backed_sparse_matrix
+        # Not sure what the performance is on that operation
+        if isinstance(sub, BackedSparseMatrix):
+            return get_memory_class(self.format_str)(sub)
+        else:
+            return sub
+
+    def __setitem__(self, index: Union[Index, Tuple[()]], value):
+        row, col = self._normalize_index(index)
+        mock_matrix = self.to_backed()
+        mock_matrix[row, col] = value
+
+    def _normalize_index(
+        self, index: Union[Index, Tuple[()]]
+    ) -> Tuple[np.ndarray, np.ndarray]:
+        if index == ():
+            index = slice(None)
+        row, col = unpack_index(index)
+        if all(isinstance(x, cabc.Iterable) for x in (row, col)):
+            row, col = np.ix_(row, col)
+        return row, col
+
+    def append(self, sparse_matrix: ss.spmatrix):
+        # Prep variables
+        shape = self.shape
+        if isinstance(sparse_matrix, SparseDataset):
+            sparse_matrix = sparse_matrix.to_backed()
+
+        # Check input
+        if not ss.isspmatrix(sparse_matrix):
+            raise NotImplementedError(
+                "Currently, only sparse matrices of equivalent format can be "
+                "appended to a SparseDataset."
+            )
+        if self.format_str not in {"csr", "csc"}:
+            raise NotImplementedError(
+                f"The append method for format {self.format_str} "
+                f"is not implemented."
+            )
+        if self.format_str != get_format_str(sparse_matrix):
+            raise ValueError(
+                f"Matrices must have same format. Currently are "
+                f"{self.format_str!r} and {get_format_str(sparse_matrix)!r}"
+            )
+
+        # shape
+        if self.format_str == "csr":
+            assert (
+                shape[1] == sparse_matrix.shape[1]
+            ), "CSR matrices must have same size of dimension 1 to be appended."
+            new_shape = (shape[0] + sparse_matrix.shape[0], shape[1])
+        elif self.format_str == "csc":
+            assert (
+                shape[0] == sparse_matrix.shape[0]
+            ), "CSC matrices must have same size of dimension 0 to be appended."
+ new_shape = (shape[0], shape[1] + sparse_matrix.shape[1]) + else: + assert False, "We forgot to update this branching to a new format" + if "h5sparse_shape" in self.group.attrs: + del self.group.attrs["h5sparse_shape"] + self.group.attrs["shape"] = new_shape + + # data + data = self.group["data"] + orig_data_size = data.shape[0] + data.resize((orig_data_size + sparse_matrix.data.shape[0],)) + data[orig_data_size:] = sparse_matrix.data + + # indptr + indptr = self.group["indptr"] + orig_data_size = indptr.shape[0] + append_offset = indptr[-1] + indptr.resize((orig_data_size + sparse_matrix.indptr.shape[0] - 1,)) + indptr[orig_data_size:] = ( + sparse_matrix.indptr[1:].astype(np.int64) + append_offset + ) + + # indices + indices = self.group["indices"] + orig_data_size = indices.shape[0] + indices.resize((orig_data_size + sparse_matrix.indices.shape[0],)) + indices[orig_data_size:] = sparse_matrix.indices + + def to_backed(self) -> BackedSparseMatrix: + format_class = get_backed_class(self.format_str) + mtx = format_class(self.shape, dtype=self.dtype) + mtx.data = self.group["data"] + mtx.indices = self.group["indices"] + mtx.indptr = self.group["indptr"][:] + return mtx + + def to_memory(self) -> ss.spmatrix: + format_class = get_memory_class(self.format_str) + mtx = format_class(self.shape, dtype=self.dtype) + mtx.data = self.group["data"][...] + mtx.indices = self.group["indices"][...] + mtx.indptr = self.group["indptr"][...] + return mtx + + +@_subset.register(SparseDataset) +def subset_sparsedataset(d, subset_idx): + return d[subset_idx] diff --git a/anndata/_core/file_backing.py b/anndata/_core/file_backing.py index 987ff038c..02401873c 100644 --- a/anndata/_core/file_backing.py +++ b/anndata/_core/file_backing.py @@ -5,7 +5,6 @@ from collections.abc import Mapping import h5py -import pandas as pd from . 
import anndata
 from .sparse_dataset import SparseDataset
@@ -134,72 +133,3 @@ def _(x, copy=False):
         return _copy(x)
     else:
         return x
-
-
-class HDF5DataFrame:
-    def __init__(self, group: h5py.Group):
-        assert (
-            group.attrs["encoding-type"] == "dataframe"
-        ), f"HDF5 group at path '{group.name}' is not encoded as a dataframe"
-
-        self._group = group
-        self._attrs = self._group.attrs
-
-        self._index = self._group[self._attrs["_index"]].asstr()
-        self.columns = pd.Index(self._attrs["column-order"])
-
-        for column in self.columns:
-            setattr(self, column, self._group[column])
-
-    @property
-    def index(self):
-        return pd.Index(self._index[:])
-
-    # def __getitem__(self, key):
-    #     if isinstance(key, str) and key in self.columns:
-    #         return self._group[key]
-
-    #     elif isinstance(key, slice):
-    #         return self._group[key]
-
-    #     if isinstance(index, tuple) and self.attr in ("obs", "obsm"):
-    #         oidx = index[0]
-    #         if len(index) > 1:
-    #             vidx = index[1]
-
-    #         if oidx is None:
-    #             view = self.adset[index]
-    #         else:
-    #             view = self.adset[oidx]
-    #         attr_arr = getattr(view, self.attr)
-    #         if self.key is not None:
-    #             attr_arr = attr_arr[self.key]
-    #         return attr_arr if vidx is None else attr_arr[:, vidx]
-
-    # @property
-    # def shape(self):
-    #     shape = self.adset.shape
-    #     if self.attr in ["X", "layers"]:
-    #         return shape
-    #     elif self.attr == "obs":
-    #         return (shape[0],)
-    #     elif self.attr == "obsm" and self.key is not None:
-    #         return shape[0], self[:1].shape[1]
-    #     else:
-    #         return None
-
-    # @property
-    # def ndim(self):
-    #     return len(self.shape) if self.shape is not None else 0
-
-    # @property
-    # def dtype(self):
-    #     _dtypes = self.adset._dtypes
-    #     if _dtypes is not None and self.attr in _dtypes:
-    #         return _dtypes[self.attr][self.key]
-
-    #     attr = self[:1]
-    #     if hasattr(attr, "dtype"):
-    #         return attr.dtype
-    #     else:
-    #         return None
From 5b1ea95639b3303e35e62834e86dcbbb1d45ad6b Mon Sep 17 00:00:00 2001
From: Grisha Szep
Date: Sun, 16 Apr 2023 14:39:04 +0100
Subject: [PATCH 3/4] try laziness at the registry level

---
 anndata/_core/dataframe.py   | 384 +----------------------------------
 anndata/_io/specs/methods.py |   1 +
 2 files changed, 7 insertions(+), 378 deletions(-)

diff --git a/anndata/_core/dataframe.py b/anndata/_core/dataframe.py
index 388ae551a..6d3136e99 100644
--- a/anndata/_core/dataframe.py
+++ b/anndata/_core/dataframe.py
@@ -2,224 +2,8 @@
 This module implements on disk dataframes.
 """
 
-import collections.abc as cabc
-from itertools import accumulate, chain
-from typing import Union, NamedTuple, Tuple, Sequence, Iterable, Type
-from warnings import warn
-
 import h5py
 import pandas as pd
-import numpy as np
-import scipy.sparse as ss
-from scipy.sparse import _sparsetools
-
-from ..compat import _read_attr
-
-# from .._io.specs.methods import read_elem_partial
-
-try:
-    # Not really important, just for IDEs to be more helpful
-    from scipy.sparse.compressed import _cs_matrix
-except ImportError:
-    _cs_matrix = ss.spmatrix
-
-from .index import unpack_index, Index, _subset
-
-
-class BackedFormat(NamedTuple):
-    format_str: str
-    backed_type: Type["BackedSparseMatrix"]
-    memory_type: Type[ss.spmatrix]
-
-
-class BackedSparseMatrix(_cs_matrix):
-    """\
-    Mixin class for backed sparse matrices.
-
-    Largely needed for the case `backed_sparse_csr(...)[:]`,
-    since that calls copy on `.data`, `.indices`, and `.indptr`.
- """ - - def copy(self) -> ss.spmatrix: - if isinstance(self.data, h5py.Dataset): - return SparseDataset(self.data.parent).to_memory() - else: - return super().copy() - - def _set_many(self, i: Iterable[int], j: Iterable[int], x): - """\ - Sets value at each (i, j) to x - - Here (i,j) index major and minor respectively, - and must not contain duplicate entries. - """ - # Scipy 1.3+ compat - n_samples = 1 if np.isscalar(x) else len(x) - offsets = self._offsets(i, j, n_samples) - - if -1 not in offsets: - # make a list for interaction with h5py - offsets = list(offsets) - # only affects existing non-zero cells - self.data[offsets] = x - return - - else: - raise ValueError( - "You cannot change the sparsity structure of a SparseDataset." - ) - # replace where possible - # mask = offsets > -1 - # # offsets[mask] - # bool_data_mask = np.zeros(len(self.data), dtype=bool) - # bool_data_mask[offsets[mask]] = True - # self.data[bool_data_mask] = x[mask] - # # self.data[offsets[mask]] = x[mask] - # # only insertions remain - # mask = ~mask - # i = i[mask] - # i[i < 0] += M - # j = j[mask] - # j[j < 0] += N - # self._insert_many(i, j, x[mask]) - - def _zero_many(self, i: Sequence[int], j: Sequence[int]): - """\ - Sets value at each (i, j) to zero, preserving sparsity structure. - - Here (i,j) index major and minor respectively. - """ - offsets = self._offsets(i, j, len(i)) - - # only assign zeros to the existing sparsity structure - self.data[list(offsets[offsets > -1])] = 0 - - def _offsets( - self, i: Iterable[int], j: Iterable[int], n_samples: int - ) -> np.ndarray: - i, j, M, N = self._prepare_indices(i, j) - offsets = np.empty(n_samples, dtype=self.indices.dtype) - ret = _sparsetools.csr_sample_offsets( - M, N, self.indptr, self.indices, n_samples, i, j, offsets - ) - if ret == 1: - # rinse and repeat - self.sum_duplicates() - _sparsetools.csr_sample_offsets( - M, N, self.indptr, self.indices, n_samples, i, j, offsets - ) - return offsets - - -class backed_csr_matrix(BackedSparseMatrix, ss.csr_matrix): - def _get_intXslice(self, row: int, col: slice) -> ss.csr_matrix: - return ss.csr_matrix( - get_compressed_vector(self, row), shape=(1, self.shape[1]) - )[:, col] - - def _get_sliceXslice(self, row: slice, col: slice) -> ss.csr_matrix: - out_shape = ( - slice_len(row, self.shape[0]), - slice_len(col, self.shape[1]), - ) - if out_shape[0] == 1: - return self._get_intXslice(slice_as_int(row, self.shape[0]), col) - elif out_shape[1] == self.shape[1] and out_shape[0] < self.shape[0]: - return self._get_arrayXslice(np.arange(*row.indices(self.shape[0])), col) - return super()._get_sliceXslice(row, col) - - def _get_arrayXslice(self, row: Sequence[int], col: slice) -> ss.csr_matrix: - idxs = np.asarray(row) - if idxs.dtype == bool: - idxs = np.where(idxs) - return ss.csr_matrix( - get_compressed_vectors(self, idxs), shape=(len(idxs), self.shape[1]) - )[:, col] - - -class backed_csc_matrix(BackedSparseMatrix, ss.csc_matrix): - def _get_sliceXint(self, row: slice, col: int) -> ss.csc_matrix: - return ss.csc_matrix( - get_compressed_vector(self, col), shape=(self.shape[0], 1) - )[row, :] - - def _get_sliceXslice(self, row: slice, col: slice) -> ss.csc_matrix: - out_shape = ( - slice_len(row, self.shape[0]), - slice_len(col, self.shape[1]), - ) - if out_shape[1] == 1: - return self._get_sliceXint(row, slice_as_int(col, self.shape[1])) - elif out_shape[0] == self.shape[0] and out_shape[1] < self.shape[1]: - return self._get_sliceXarray(row, np.arange(*col.indices(self.shape[1]))) - return 
super()._get_sliceXslice(row, col)
-
-    def _get_sliceXarray(self, row: slice, col: Sequence[int]) -> ss.csc_matrix:
-        idxs = np.asarray(col)
-        if idxs.dtype == bool:
-            idxs = np.where(idxs)
-        return ss.csc_matrix(
-            get_compressed_vectors(self, idxs), shape=(self.shape[0], len(idxs))
-        )[row, :]
-
-
-FORMATS = [
-    BackedFormat("csr", backed_csr_matrix, ss.csr_matrix),
-    BackedFormat("csc", backed_csc_matrix, ss.csc_matrix),
-]
-
-
-def slice_len(s: slice, l: int) -> int:
-    """Returns length of `a[s]` where `len(a) == l`."""
-    return len(range(*s.indices(l)))
-
-
-def slice_as_int(s: slice, l: int) -> int:
-    """Converts slices of length 1 to the integer index they’ll access."""
-    out = list(range(*s.indices(l)))
-    assert len(out) == 1
-    return out[0]
-
-
-def get_compressed_vectors(
-    x: BackedSparseMatrix, row_idxs: Iterable[int]
-) -> Tuple[Sequence, Sequence, Sequence]:
-    slices = [slice(*(x.indptr[i : i + 2])) for i in row_idxs]
-    data = np.concatenate([x.data[s] for s in slices])
-    indices = np.concatenate([x.indices[s] for s in slices])
-    indptr = list(accumulate(chain((0,), (s.stop - s.start for s in slices))))
-    return data, indices, indptr
-
-
-def get_compressed_vector(
-    x: BackedSparseMatrix, idx: int
-) -> Tuple[Sequence, Sequence, Sequence]:
-    s = slice(*(x.indptr[idx : idx + 2]))
-    data = x.data[s]
-    indices = x.indices[s]
-    indptr = [0, len(data)]
-    return data, indices, indptr
-
-
-def get_format_str(data: ss.spmatrix) -> str:
-    for fmt, _, memory_class in FORMATS:
-        if isinstance(data, memory_class):
-            return fmt
-    raise ValueError(f"Data type {type(data)} is not supported.")
-
-
-def get_memory_class(format_str: str) -> Type[ss.spmatrix]:
-    for fmt, _, memory_class in FORMATS:
-        if format_str == fmt:
-            return memory_class
-    raise ValueError(f"Format string {format_str} is not supported.")
-
-
-def get_backed_class(format_str: str) -> Type[BackedSparseMatrix]:
-    for fmt, backed_class, _ in FORMATS:
-        if format_str == fmt:
-            return backed_class
-    raise ValueError(f"Format string {format_str} is not supported.")
 
 
 class DataFrame:
@@ -232,7 +16,7 @@ def __init__(self, group: h5py.Group):
         self._attrs = self._group.attrs
 
         self._index = self._group[self._attrs["_index"]].asstr()
-        self.columns = pd.Index(self._attrs["column-order"])
+        self._columns = self._attrs["column-order"]
 
         for column in self.columns:
             # read_elem_partial(group)  # , items=obs, indices=(obs_idx, slice(None)))
             setattr(self, column, self._group[column])
@@ -242,172 +26,16 @@ def index(self):
         return pd.Index(self._index[:])
 
+    @property
+    def columns(self):
+        return pd.Index(self._columns)
+
     def __getitem__(self, index):
         if isinstance(index, str) and index in self.columns:
             return getattr(self, index)
 
         elif isinstance(index, slice):
-            return self._group[index]
+            raise NotImplementedError("Slicing is not yet supported.")
 
         else:
             raise TypeError(f"Invalid index '{index}' of type {type(index)}")
-
-
-class SparseDataset:
-    """Analogous to :class:`h5py.Dataset <h5py.Dataset>`, but for sparse matrices."""
-
-    def __init__(self, group: h5py.Group):
-        self.group = group
-
-    @property
-    def dtype(self) -> np.dtype:
-        return self.group["data"].dtype
-
-    @property
-    def format_str(self) -> str:
-        if "h5sparse_format" in self.group.attrs:
-            return _read_attr(self.group.attrs, "h5sparse_format")
-        else:
-            # Should this be an extra field?
-            return _read_attr(self.group.attrs, "encoding-type").replace("_matrix", "")
-
-    @property
-    def h5py_group(self) -> h5py.Group:
-        warn(
-            "Attribute `h5py_group` of SparseDatasets is deprecated. "
-            "Use `group` instead.",
-            DeprecationWarning,
-        )
-        return self.group
-
-    @property
-    def name(self) -> str:
-        return self.group.name
-
-    @property
-    def file(self) -> h5py.File:
-        return self.group.file
-
-    @property
-    def shape(self) -> Tuple[int, int]:
-        shape = self.group.attrs.get("h5sparse_shape")
-        return tuple(self.group.attrs["shape"] if shape is None else shape)
-
-    @property
-    def value(self) -> ss.spmatrix:
-        return self.to_memory()
-
-    def __repr__(self) -> str:
-        return (
-            f"<HDF5 sparse dataset: format {self.format_str!r}, "
-            f"shape {self.shape}, "
-            f'type "{self.dtype}">'
-        )
-
-    def __getitem__(self, index: Union[Index, Tuple[()]]) -> Union[float, ss.spmatrix]:
-        row, col = self._normalize_index(index)
-        mtx = self.to_backed()
-        sub = mtx[row, col]
-        # If indexing is array x array it returns a backed_sparse_matrix
-        # Not sure what the performance is on that operation
-        if isinstance(sub, BackedSparseMatrix):
-            return get_memory_class(self.format_str)(sub)
-        else:
-            return sub
-
-    def __setitem__(self, index: Union[Index, Tuple[()]], value):
-        row, col = self._normalize_index(index)
-        mock_matrix = self.to_backed()
-        mock_matrix[row, col] = value
-
-    def _normalize_index(
-        self, index: Union[Index, Tuple[()]]
-    ) -> Tuple[np.ndarray, np.ndarray]:
-        if index == ():
-            index = slice(None)
-        row, col = unpack_index(index)
-        if all(isinstance(x, cabc.Iterable) for x in (row, col)):
-            row, col = np.ix_(row, col)
-        return row, col
-
-    def append(self, sparse_matrix: ss.spmatrix):
-        # Prep variables
-        shape = self.shape
-        if isinstance(sparse_matrix, SparseDataset):
-            sparse_matrix = sparse_matrix.to_backed()
-
-        # Check input
-        if not ss.isspmatrix(sparse_matrix):
-            raise NotImplementedError(
-                "Currently, only sparse matrices of equivalent format can be "
-                "appended to a SparseDataset."
-            )
-        if self.format_str not in {"csr", "csc"}:
-            raise NotImplementedError(
-                f"The append method for format {self.format_str} "
-                f"is not implemented."
-            )
-        if self.format_str != get_format_str(sparse_matrix):
-            raise ValueError(
-                f"Matrices must have same format. Currently are "
-                f"{self.format_str!r} and {get_format_str(sparse_matrix)!r}"
-            )
-
-        # shape
-        if self.format_str == "csr":
-            assert (
-                shape[1] == sparse_matrix.shape[1]
-            ), "CSR matrices must have same size of dimension 1 to be appended."
-            new_shape = (shape[0] + sparse_matrix.shape[0], shape[1])
-        elif self.format_str == "csc":
-            assert (
-                shape[0] == sparse_matrix.shape[0]
-            ), "CSC matrices must have same size of dimension 0 to be appended."
- new_shape = (shape[0], shape[1] + sparse_matrix.shape[1]) - else: - assert False, "We forgot to update this branching to a new format" - if "h5sparse_shape" in self.group.attrs: - del self.group.attrs["h5sparse_shape"] - self.group.attrs["shape"] = new_shape - - # data - data = self.group["data"] - orig_data_size = data.shape[0] - data.resize((orig_data_size + sparse_matrix.data.shape[0],)) - data[orig_data_size:] = sparse_matrix.data - - # indptr - indptr = self.group["indptr"] - orig_data_size = indptr.shape[0] - append_offset = indptr[-1] - indptr.resize((orig_data_size + sparse_matrix.indptr.shape[0] - 1,)) - indptr[orig_data_size:] = ( - sparse_matrix.indptr[1:].astype(np.int64) + append_offset - ) - - # indices - indices = self.group["indices"] - orig_data_size = indices.shape[0] - indices.resize((orig_data_size + sparse_matrix.indices.shape[0],)) - indices[orig_data_size:] = sparse_matrix.indices - - def to_backed(self) -> BackedSparseMatrix: - format_class = get_backed_class(self.format_str) - mtx = format_class(self.shape, dtype=self.dtype) - mtx.data = self.group["data"] - mtx.indices = self.group["indices"] - mtx.indptr = self.group["indptr"][:] - return mtx - - def to_memory(self) -> ss.spmatrix: - format_class = get_memory_class(self.format_str) - mtx = format_class(self.shape, dtype=self.dtype) - mtx.data = self.group["data"][...] - mtx.indices = self.group["indices"][...] - mtx.indptr = self.group["indptr"][...] - return mtx - - -@_subset.register(SparseDataset) -def subset_sparsedataset(d, subset_idx): - return d[subset_idx] diff --git a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py index 9584f82d1..78f913eb8 100644 --- a/anndata/_io/specs/methods.py +++ b/anndata/_io/specs/methods.py @@ -17,6 +17,7 @@ from anndata._core.index import _normalize_indices from anndata._core.merge import intersect_keys from anndata._core.sparse_dataset import SparseDataset +from anndata._core.dataframe import DataFrame from anndata._core import views from anndata.compat import ( ZarrArray, From 846d07e66407c9a7b810a534d27196f3c6419401 Mon Sep 17 00:00:00 2001 From: Grisha Szep Date: Sun, 16 Apr 2023 14:55:14 +0100 Subject: [PATCH 4/4] revert changes --- anndata/_core/anndata.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py index 235827c39..541bc0ab0 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -348,7 +348,7 @@ def _init_as_view(self, adata_ref: "AnnData", oidx: Index, vidx: Index): # set data if self.isbacked: - self._X = None + self._X = self._obs = None # set raw, easy, as it’s immutable anyways... if adata_ref._raw is not None: @@ -837,13 +837,7 @@ def _set_dim_index(self, value: pd.Index, attr: str): @property def obs(self) -> pd.DataFrame: """One-dimensional annotation of observations (`pd.DataFrame`).""" - if self.isbacked: - if not self.file.is_open: - self.file.open() - obs = DataFrame(self.file["obs"]) - else: - obs = self._obs - return obs + return self._obs @obs.setter def obs(self, value: pd.DataFrame):
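
After patch 4 the backed code path in `.obs` is reverted, so the `DataFrame`
wrapper in anndata/_core/dataframe.py stands alone. For reference, a toy file
with the minimal on-disk layout its constructor relies on can be written with
h5py alone. This is a sketch, not part of the patches: the group and attribute
names follow the dataframe encoding visible in the asserts and attribute reads
above, while the file name, column name, and values are made up, and the
`encoding-version` attribute that anndata also writes is omitted since the
class never checks it.

    import h5py
    import numpy as np

    with h5py.File("toy.h5ad", "w") as f:
        g = f.create_group("obs")
        # checked by the assert in DataFrame.__init__
        g.attrs["encoding-type"] = "dataframe"
        # names the dataset that holds the index
        g.attrs["_index"] = "index"
        # becomes DataFrame.columns
        g.attrs["column-order"] = ["total_counts"]
        g.create_dataset("index", data=np.array([b"cell_0", b"cell_1"]))
        g.create_dataset("total_counts", data=np.array([10.0, 20.0]))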
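Reading it back is then just a matter of wrapping the group. Another sketch,
under the same assumptions, and restricted to plain array columns: categorical
columns are stored as sub-groups rather than datasets, which the class does
not special-case yet.

    import h5py

    from anndata._core.dataframe import DataFrame

    with h5py.File("toy.h5ad", "r") as f:
        obs = DataFrame(f["obs"])
        print(obs.index)                # pd.Index(['cell_0', 'cell_1'])
        print(obs.columns)              # pd.Index(['total_counts'])
        counts = obs["total_counts"]    # an h5py.Dataset; nothing read yet
        print(counts[:])                # values are only pulled from disk on slicing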