From 36480ba2af5559bd6a21cb0964f02658667d4abd Mon Sep 17 00:00:00 2001
From: Grisha Szep
Date: Sun, 16 Apr 2023 12:42:36 +0100
Subject: [PATCH 1/4] HDF5DataFrame class

---
 anndata/_core/anndata.py           | 10 ++++-
 anndata/_core/file_backing.py      | 70 ++++++++++++++++++++++++++++++
 anndata/tests/test_hdf5_backing.py |  9 ++++
 3 files changed, 87 insertions(+), 2 deletions(-)

diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py
index 38680b9dd..1a8bf6ba8 100644
--- a/anndata/_core/anndata.py
+++ b/anndata/_core/anndata.py
@@ -26,7 +26,7 @@
 from anndata._warnings import ImplicitModificationWarning
 from .raw import Raw
 from .index import _normalize_indices, _subset, Index, Index1D, get_vector
-from .file_backing import AnnDataFileManager, to_memory
+from .file_backing import AnnDataFileManager, to_memory, HDF5DataFrame
 from .access import ElementRef
 from .aligned_mapping import (
     AxisArrays,
@@ -836,7 +836,13 @@ def _set_dim_index(self, value: pd.Index, attr: str):
     @property
     def obs(self) -> pd.DataFrame:
         """One-dimensional annotation of observations (`pd.DataFrame`)."""
-        return self._obs
+        if self.isbacked:
+            if not self.file.is_open:
+                self.file.open()
+            obs = HDF5DataFrame(self.file["obs"])
+        else:
+            obs = self._obs
+        return obs
 
     @obs.setter
     def obs(self, value: pd.DataFrame):
diff --git a/anndata/_core/file_backing.py b/anndata/_core/file_backing.py
index 02401873c..987ff038c 100644
--- a/anndata/_core/file_backing.py
+++ b/anndata/_core/file_backing.py
@@ -5,6 +5,7 @@
 from collections.abc import Mapping
 
 import h5py
+import pandas as pd
 
 from . import anndata
 from .sparse_dataset import SparseDataset
@@ -133,3 +134,72 @@ def _(x, copy=False):
         return _copy(x)
     else:
         return x
+
+
+class HDF5DataFrame:
+    def __init__(self, group: h5py.Group):
+        assert (
+            group.attrs["encoding-type"] == "dataframe"
+        ), f"HDF5 group at path '{group.name}' is not encoded as a dataframe"
+
+        self._group = group
+        self._attrs = self._group.attrs
+
+        self._index = self._group[self._attrs["_index"]].asstr()
+        self.columns = pd.Index(self._attrs["column-order"])
+
+        for column in self.columns:
+            setattr(self, column, self._group[column])
+
+    @property
+    def index(self):
+        return pd.Index(self._index[:])
+
+    # def __getitem__(self, key):
+    #     if isinstance(key, str) and key in self.columns:
+    #         return self._group[key]
+
+    #     elif isinstance(key, slice):
+    #         return self._group[key]
+
+    #     if isinstance(index, tuple) and self.attr in ("obs", "obsm"):
+    #         oidx = index[0]
+    #         if len(index) > 1:
+    #             vidx = index[1]
+
+    #         if oidx is None:
+    #             view = self.adset[index]
+    #         else:
+    #             view = self.adset[oidx]
+    #         attr_arr = getattr(view, self.attr)
+    #         if self.key is not None:
+    #             attr_arr = attr_arr[self.key]
+    #         return attr_arr if vidx is None else attr_arr[:, vidx]
+
+    # @property
+    # def shape(self):
+    #     shape = self.adset.shape
+    #     if self.attr in ["X", "layers"]:
+    #         return shape
+    #     elif self.attr == "obs":
+    #         return (shape[0],)
+    #     elif self.attr == "obsm" and self.key is not None:
+    #         return shape[0], self[:1].shape[1]
+    #     else:
+    #         return None
+
+    # @property
+    # def ndim(self):
+    #     return len(self.shape) if self.shape is not None else 0
+
+    # @property
+    # def dtype(self):
+    #     _dtypes = self.adset._dtypes
+    #     if _dtypes is not None and self.attr in _dtypes:
+    #         return _dtypes[self.attr][self.key]
+
+    #     attr = self[:1]
+    #     if hasattr(attr, "dtype"):
+    #         return attr.dtype
+    #     else:
+    #         return None
diff --git a/anndata/tests/test_hdf5_backing.py b/anndata/tests/test_hdf5_backing.py
index 987d01382..66419818e 100644
--- 
a/anndata/tests/test_hdf5_backing.py +++ b/anndata/tests/test_hdf5_backing.py @@ -175,6 +175,15 @@ def test_backed_raw(tmp_path): assert_equal(final_adata, mem_adata) +def test_backed_dataframes(tmp_path): + backed_pth = tmp_path / "backed.h5ad" + mem_adata = gen_adata((10, 10), **GEN_ADATA_DASK_ARGS) + mem_adata.write(backed_pth) + + backed_adata = ad.read_h5ad(backed_pth, backed="r") + assert_equal(backed_adata, mem_adata) + + @pytest.mark.parametrize( "array_type", [ From d9bd744305908c4b754a755b07278c9c40e83fdd Mon Sep 17 00:00:00 2001 From: Grisha Szep Date: Sun, 16 Apr 2023 13:56:07 +0100 Subject: [PATCH 2/4] factor out dataframe.py --- anndata/_core/anndata.py | 5 +- anndata/_core/dataframe.py | 413 ++++++++++++++++++++++++++++++++++ anndata/_core/file_backing.py | 70 ------ 3 files changed, 416 insertions(+), 72 deletions(-) create mode 100644 anndata/_core/dataframe.py diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py index 1a8bf6ba8..235827c39 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -26,7 +26,7 @@ from anndata._warnings import ImplicitModificationWarning from .raw import Raw from .index import _normalize_indices, _subset, Index, Index1D, get_vector -from .file_backing import AnnDataFileManager, to_memory, HDF5DataFrame +from .file_backing import AnnDataFileManager, to_memory from .access import ElementRef from .aligned_mapping import ( AxisArrays, @@ -44,6 +44,7 @@ _resolve_idxs, ) from .sparse_dataset import SparseDataset +from .dataframe import DataFrame from .. import utils from ..utils import convert_to_dict, ensure_df_homogeneous, dim_len from ..logging import anndata_logger as logger @@ -839,7 +840,7 @@ def obs(self) -> pd.DataFrame: if self.isbacked: if not self.file.is_open: self.file.open() - obs = HDF5DataFrame(self.file["obs"]) + obs = DataFrame(self.file["obs"]) else: obs = self._obs return obs diff --git a/anndata/_core/dataframe.py b/anndata/_core/dataframe.py new file mode 100644 index 000000000..388ae551a --- /dev/null +++ b/anndata/_core/dataframe.py @@ -0,0 +1,413 @@ +""" +This module implements on disk dataframes. +""" + +import collections.abc as cabc +from itertools import accumulate, chain +from typing import Union, NamedTuple, Tuple, Sequence, Iterable, Type +from warnings import warn + +import h5py +import pandas as pd +import numpy as np +import scipy.sparse as ss +from scipy.sparse import _sparsetools + +from ..compat import _read_attr + +# from .._io.specs.methods import read_elem_partial + +try: + # Not really important, just for IDEs to be more helpful + from scipy.sparse.compressed import _cs_matrix +except ImportError: + _cs_matrix = ss.spmatrix + +from .index import unpack_index, Index, _subset + + +class BackedFormat(NamedTuple): + format_str: str + backed_type: Type["BackedSparseMatrix"] + memory_type: Type[ss.spmatrix] + + +class BackedSparseMatrix(_cs_matrix): + """\ + Mixin class for backed sparse matrices. + + Largely needed for the case `backed_sparse_csr(...)[:]`, + since that calls copy on `.data`, `.indices`, and `.indptr`. + """ + + def copy(self) -> ss.spmatrix: + if isinstance(self.data, h5py.Dataset): + return SparseDataset(self.data.parent).to_memory() + else: + return super().copy() + + def _set_many(self, i: Iterable[int], j: Iterable[int], x): + """\ + Sets value at each (i, j) to x + + Here (i,j) index major and minor respectively, + and must not contain duplicate entries. 
+ """ + # Scipy 1.3+ compat + n_samples = 1 if np.isscalar(x) else len(x) + offsets = self._offsets(i, j, n_samples) + + if -1 not in offsets: + # make a list for interaction with h5py + offsets = list(offsets) + # only affects existing non-zero cells + self.data[offsets] = x + return + + else: + raise ValueError( + "You cannot change the sparsity structure of a SparseDataset." + ) + # replace where possible + # mask = offsets > -1 + # # offsets[mask] + # bool_data_mask = np.zeros(len(self.data), dtype=bool) + # bool_data_mask[offsets[mask]] = True + # self.data[bool_data_mask] = x[mask] + # # self.data[offsets[mask]] = x[mask] + # # only insertions remain + # mask = ~mask + # i = i[mask] + # i[i < 0] += M + # j = j[mask] + # j[j < 0] += N + # self._insert_many(i, j, x[mask]) + + def _zero_many(self, i: Sequence[int], j: Sequence[int]): + """\ + Sets value at each (i, j) to zero, preserving sparsity structure. + + Here (i,j) index major and minor respectively. + """ + offsets = self._offsets(i, j, len(i)) + + # only assign zeros to the existing sparsity structure + self.data[list(offsets[offsets > -1])] = 0 + + def _offsets( + self, i: Iterable[int], j: Iterable[int], n_samples: int + ) -> np.ndarray: + i, j, M, N = self._prepare_indices(i, j) + offsets = np.empty(n_samples, dtype=self.indices.dtype) + ret = _sparsetools.csr_sample_offsets( + M, N, self.indptr, self.indices, n_samples, i, j, offsets + ) + if ret == 1: + # rinse and repeat + self.sum_duplicates() + _sparsetools.csr_sample_offsets( + M, N, self.indptr, self.indices, n_samples, i, j, offsets + ) + return offsets + + +class backed_csr_matrix(BackedSparseMatrix, ss.csr_matrix): + def _get_intXslice(self, row: int, col: slice) -> ss.csr_matrix: + return ss.csr_matrix( + get_compressed_vector(self, row), shape=(1, self.shape[1]) + )[:, col] + + def _get_sliceXslice(self, row: slice, col: slice) -> ss.csr_matrix: + out_shape = ( + slice_len(row, self.shape[0]), + slice_len(col, self.shape[1]), + ) + if out_shape[0] == 1: + return self._get_intXslice(slice_as_int(row, self.shape[0]), col) + elif out_shape[1] == self.shape[1] and out_shape[0] < self.shape[0]: + return self._get_arrayXslice(np.arange(*row.indices(self.shape[0])), col) + return super()._get_sliceXslice(row, col) + + def _get_arrayXslice(self, row: Sequence[int], col: slice) -> ss.csr_matrix: + idxs = np.asarray(row) + if idxs.dtype == bool: + idxs = np.where(idxs) + return ss.csr_matrix( + get_compressed_vectors(self, idxs), shape=(len(idxs), self.shape[1]) + )[:, col] + + +class backed_csc_matrix(BackedSparseMatrix, ss.csc_matrix): + def _get_sliceXint(self, row: slice, col: int) -> ss.csc_matrix: + return ss.csc_matrix( + get_compressed_vector(self, col), shape=(self.shape[0], 1) + )[row, :] + + def _get_sliceXslice(self, row: slice, col: slice) -> ss.csc_matrix: + out_shape = ( + slice_len(row, self.shape[0]), + slice_len(col, self.shape[1]), + ) + if out_shape[1] == 1: + return self._get_sliceXint(row, slice_as_int(col, self.shape[1])) + elif out_shape[0] == self.shape[0] and out_shape[1] < self.shape[1]: + return self._get_sliceXarray(row, np.arange(*col.indices(self.shape[1]))) + return super()._get_sliceXslice(row, col) + + def _get_sliceXarray(self, row: slice, col: Sequence[int]) -> ss.csc_matrix: + idxs = np.asarray(col) + if idxs.dtype == bool: + idxs = np.where(idxs) + return ss.csc_matrix( + get_compressed_vectors(self, idxs), shape=(self.shape[0], len(idxs)) + )[row, :] + + +FORMATS = [ + BackedFormat("csr", backed_csr_matrix, ss.csr_matrix), + 
BackedFormat("csc", backed_csc_matrix, ss.csc_matrix),
+]
+
+
+def slice_len(s: slice, l: int) -> int:
+    """Returns length of `a[s]` where `len(a) == l`."""
+    return len(range(*s.indices(l)))
+
+
+def slice_as_int(s: slice, l: int) -> int:
+    """Converts slices of length 1 to the integer index they’ll access."""
+    out = list(range(*s.indices(l)))
+    assert len(out) == 1
+    return out[0]
+
+
+def get_compressed_vectors(
+    x: BackedSparseMatrix, row_idxs: Iterable[int]
+) -> Tuple[Sequence, Sequence, Sequence]:
+    slices = [slice(*(x.indptr[i : i + 2])) for i in row_idxs]
+    data = np.concatenate([x.data[s] for s in slices])
+    indices = np.concatenate([x.indices[s] for s in slices])
+    indptr = list(accumulate(chain((0,), (s.stop - s.start for s in slices))))
+    return data, indices, indptr
+
+
+def get_compressed_vector(
+    x: BackedSparseMatrix, idx: int
+) -> Tuple[Sequence, Sequence, Sequence]:
+    s = slice(*(x.indptr[idx : idx + 2]))
+    data = x.data[s]
+    indices = x.indices[s]
+    indptr = [0, len(data)]
+    return data, indices, indptr
+
+
+def get_format_str(data: ss.spmatrix) -> str:
+    for fmt, _, memory_class in FORMATS:
+        if isinstance(data, memory_class):
+            return fmt
+    raise ValueError(f"Data type {type(data)} is not supported.")
+
+
+def get_memory_class(format_str: str) -> Type[ss.spmatrix]:
+    for fmt, _, memory_class in FORMATS:
+        if format_str == fmt:
+            return memory_class
+    raise ValueError(f"Format string {format_str} is not supported.")
+
+
+def get_backed_class(format_str: str) -> Type[BackedSparseMatrix]:
+    for fmt, backed_class, _ in FORMATS:
+        if format_str == fmt:
+            return backed_class
+    raise ValueError(f"Format string {format_str} is not supported.")
+
+
+class DataFrame:
+    def __init__(self, group: h5py.Group):
+        assert (
+            group.attrs["encoding-type"] == "dataframe"
+        ), f"HDF5 group at path '{group.name}' is not encoded as a dataframe"
+
+        self._group = group
+        self._attrs = self._group.attrs
+
+        self._index = self._group[self._attrs["_index"]].asstr()
+        self.columns = pd.Index(self._attrs["column-order"])
+
+        for column in self.columns:
+            # read_elem_partial(group)  # , items=obs, indices=(obs_idx, slice(None)))
+            setattr(self, column, self._group[column])
+
+    @property
+    def index(self):
+        return pd.Index(self._index[:])
+
+    def __getitem__(self, index):
+        if isinstance(index, str) and index in self.columns:
+            return getattr(self, index)
+
+        elif isinstance(index, slice):
+            return self._group[index]
+
+        else:
+            raise TypeError(f"Invalid index '{index}' of type {type(index)}")
+
+
+class SparseDataset:
+    """Analogous to :class:`h5py.Dataset <h5py.Dataset>`, but for sparse matrices."""
+
+    def __init__(self, group: h5py.Group):
+        self.group = group
+
+    @property
+    def dtype(self) -> np.dtype:
+        return self.group["data"].dtype
+
+    @property
+    def format_str(self) -> str:
+        if "h5sparse_format" in self.group.attrs:
+            return _read_attr(self.group.attrs, "h5sparse_format")
+        else:
+            # Should this be an extra field?
+            return _read_attr(self.group.attrs, "encoding-type").replace("_matrix", "")
+
+    @property
+    def h5py_group(self) -> h5py.Group:
+        warn(
+            "Attribute `h5py_group` of SparseDatasets is deprecated. "
+            "Use `group` instead.",
+            DeprecationWarning,
+        )
+        return self.group
+
+    @property
+    def name(self) -> str:
+        return self.group.name
+
+    @property
+    def file(self) -> h5py.File:
+        return self.group.file
+
+    @property
+    def shape(self) -> Tuple[int, int]:
+        shape = self.group.attrs.get("h5sparse_shape")
+        return tuple(self.group.attrs["shape"] if shape is None else shape)
+
+    @property
+    def value(self) -> ss.spmatrix:
+        return self.to_memory()
+
+    def __repr__(self) -> str:
+        return (
+            f"<HDF5 sparse dataset: format {self.format_str!r}, "
+            f"shape {self.shape}, "
+            f'type "{self.dtype}">'
+        )
+
+    def __getitem__(self, index: Union[Index, Tuple[()]]) -> Union[float, ss.spmatrix]:
+        row, col = self._normalize_index(index)
+        mtx = self.to_backed()
+        sub = mtx[row, col]
+        # If indexing is array x array it returns a backed_sparse_matrix
+        # Not sure what the performance is on that operation
+        if isinstance(sub, BackedSparseMatrix):
+            return get_memory_class(self.format_str)(sub)
+        else:
+            return sub
+
+    def __setitem__(self, index: Union[Index, Tuple[()]], value):
+        row, col = self._normalize_index(index)
+        mock_matrix = self.to_backed()
+        mock_matrix[row, col] = value
+
+    def _normalize_index(
+        self, index: Union[Index, Tuple[()]]
+    ) -> Tuple[np.ndarray, np.ndarray]:
+        if index == ():
+            index = slice(None)
+        row, col = unpack_index(index)
+        if all(isinstance(x, cabc.Iterable) for x in (row, col)):
+            row, col = np.ix_(row, col)
+        return row, col
+
+    def append(self, sparse_matrix: ss.spmatrix):
+        # Prep variables
+        shape = self.shape
+        if isinstance(sparse_matrix, SparseDataset):
+            sparse_matrix = sparse_matrix.to_backed()
+
+        # Check input
+        if not ss.isspmatrix(sparse_matrix):
+            raise NotImplementedError(
+                "Currently, only sparse matrices of equivalent format can be "
+                "appended to a SparseDataset."
+            )
+        if self.format_str not in {"csr", "csc"}:
+            raise NotImplementedError(
+                f"The append method for format {self.format_str} "
+                f"is not implemented."
+            )
+        if self.format_str != get_format_str(sparse_matrix):
+            raise ValueError(
+                f"Matrices must have same format. Currently are "
+                f"{self.format_str!r} and {get_format_str(sparse_matrix)!r}"
+            )
+
+        # shape
+        if self.format_str == "csr":
+            assert (
+                shape[1] == sparse_matrix.shape[1]
+            ), "CSR matrices must have same size of dimension 1 to be appended."
+            new_shape = (shape[0] + sparse_matrix.shape[0], shape[1])
+        elif self.format_str == "csc":
+            assert (
+                shape[0] == sparse_matrix.shape[0]
+            ), "CSC matrices must have same size of dimension 0 to be appended."
+ new_shape = (shape[0], shape[1] + sparse_matrix.shape[1]) + else: + assert False, "We forgot to update this branching to a new format" + if "h5sparse_shape" in self.group.attrs: + del self.group.attrs["h5sparse_shape"] + self.group.attrs["shape"] = new_shape + + # data + data = self.group["data"] + orig_data_size = data.shape[0] + data.resize((orig_data_size + sparse_matrix.data.shape[0],)) + data[orig_data_size:] = sparse_matrix.data + + # indptr + indptr = self.group["indptr"] + orig_data_size = indptr.shape[0] + append_offset = indptr[-1] + indptr.resize((orig_data_size + sparse_matrix.indptr.shape[0] - 1,)) + indptr[orig_data_size:] = ( + sparse_matrix.indptr[1:].astype(np.int64) + append_offset + ) + + # indices + indices = self.group["indices"] + orig_data_size = indices.shape[0] + indices.resize((orig_data_size + sparse_matrix.indices.shape[0],)) + indices[orig_data_size:] = sparse_matrix.indices + + def to_backed(self) -> BackedSparseMatrix: + format_class = get_backed_class(self.format_str) + mtx = format_class(self.shape, dtype=self.dtype) + mtx.data = self.group["data"] + mtx.indices = self.group["indices"] + mtx.indptr = self.group["indptr"][:] + return mtx + + def to_memory(self) -> ss.spmatrix: + format_class = get_memory_class(self.format_str) + mtx = format_class(self.shape, dtype=self.dtype) + mtx.data = self.group["data"][...] + mtx.indices = self.group["indices"][...] + mtx.indptr = self.group["indptr"][...] + return mtx + + +@_subset.register(SparseDataset) +def subset_sparsedataset(d, subset_idx): + return d[subset_idx] diff --git a/anndata/_core/file_backing.py b/anndata/_core/file_backing.py index 987ff038c..02401873c 100644 --- a/anndata/_core/file_backing.py +++ b/anndata/_core/file_backing.py @@ -5,7 +5,6 @@ from collections.abc import Mapping import h5py -import pandas as pd from . 
import anndata
 from .sparse_dataset import SparseDataset
@@ -134,72 +133,3 @@ def _(x, copy=False):
         return _copy(x)
     else:
         return x
-
-
-class HDF5DataFrame:
-    def __init__(self, group: h5py.Group):
-        assert (
-            group.attrs["encoding-type"] == "dataframe"
-        ), f"HDF5 group at path '{group.name}' is not encoded as a dataframe"
-
-        self._group = group
-        self._attrs = self._group.attrs
-
-        self._index = self._group[self._attrs["_index"]].asstr()
-        self.columns = pd.Index(self._attrs["column-order"])
-
-        for column in self.columns:
-            setattr(self, column, self._group[column])
-
-    @property
-    def index(self):
-        return pd.Index(self._index[:])
-
-    # def __getitem__(self, key):
-    #     if isinstance(key, str) and key in self.columns:
-    #         return self._group[key]
-
-    #     elif isinstance(key, slice):
-    #         return self._group[key]
-
-    #     if isinstance(index, tuple) and self.attr in ("obs", "obsm"):
-    #         oidx = index[0]
-    #         if len(index) > 1:
-    #             vidx = index[1]
-
-    #         if oidx is None:
-    #             view = self.adset[index]
-    #         else:
-    #             view = self.adset[oidx]
-    #         attr_arr = getattr(view, self.attr)
-    #         if self.key is not None:
-    #             attr_arr = attr_arr[self.key]
-    #         return attr_arr if vidx is None else attr_arr[:, vidx]
-
-    # @property
-    # def shape(self):
-    #     shape = self.adset.shape
-    #     if self.attr in ["X", "layers"]:
-    #         return shape
-    #     elif self.attr == "obs":
-    #         return (shape[0],)
-    #     elif self.attr == "obsm" and self.key is not None:
-    #         return shape[0], self[:1].shape[1]
-    #     else:
-    #         return None
-
-    # @property
-    # def ndim(self):
-    #     return len(self.shape) if self.shape is not None else 0
-
-    # @property
-    # def dtype(self):
-    #     _dtypes = self.adset._dtypes
-    #     if _dtypes is not None and self.attr in _dtypes:
-    #         return _dtypes[self.attr][self.key]
-
-    #     attr = self[:1]
-    #     if hasattr(attr, "dtype"):
-    #         return attr.dtype
-    #     else:
-    #         return None
From 5b1ea95639b3303e35e62834e86dcbbb1d45ad6b Mon Sep 17 00:00:00 2001
From: Grisha Szep
Date: Sun, 16 Apr 2023 14:39:04 +0100
Subject: [PATCH 3/4] try laziness at the registry level

---
 anndata/_core/dataframe.py   | 384 +----------------------------------
 anndata/_io/specs/methods.py |   1 +
 2 files changed, 7 insertions(+), 378 deletions(-)

diff --git a/anndata/_core/dataframe.py b/anndata/_core/dataframe.py
index 388ae551a..6d3136e99 100644
--- a/anndata/_core/dataframe.py
+++ b/anndata/_core/dataframe.py
@@ -2,224 +2,8 @@
 This module implements on disk dataframes.
 """
 
-import collections.abc as cabc
-from itertools import accumulate, chain
-from typing import Union, NamedTuple, Tuple, Sequence, Iterable, Type
-from warnings import warn
-
 import h5py
 import pandas as pd
-import numpy as np
-import scipy.sparse as ss
-from scipy.sparse import _sparsetools
-
-from ..compat import _read_attr
-
-# from .._io.specs.methods import read_elem_partial
-
-try:
-    # Not really important, just for IDEs to be more helpful
-    from scipy.sparse.compressed import _cs_matrix
-except ImportError:
-    _cs_matrix = ss.spmatrix
-
-from .index import unpack_index, Index, _subset
-
-
-class BackedFormat(NamedTuple):
-    format_str: str
-    backed_type: Type["BackedSparseMatrix"]
-    memory_type: Type[ss.spmatrix]
-
-
-class BackedSparseMatrix(_cs_matrix):
-    """\
-    Mixin class for backed sparse matrices.
-
-    Largely needed for the case `backed_sparse_csr(...)[:]`,
-    since that calls copy on `.data`, `.indices`, and `.indptr`.
- """ - - def copy(self) -> ss.spmatrix: - if isinstance(self.data, h5py.Dataset): - return SparseDataset(self.data.parent).to_memory() - else: - return super().copy() - - def _set_many(self, i: Iterable[int], j: Iterable[int], x): - """\ - Sets value at each (i, j) to x - - Here (i,j) index major and minor respectively, - and must not contain duplicate entries. - """ - # Scipy 1.3+ compat - n_samples = 1 if np.isscalar(x) else len(x) - offsets = self._offsets(i, j, n_samples) - - if -1 not in offsets: - # make a list for interaction with h5py - offsets = list(offsets) - # only affects existing non-zero cells - self.data[offsets] = x - return - - else: - raise ValueError( - "You cannot change the sparsity structure of a SparseDataset." - ) - # replace where possible - # mask = offsets > -1 - # # offsets[mask] - # bool_data_mask = np.zeros(len(self.data), dtype=bool) - # bool_data_mask[offsets[mask]] = True - # self.data[bool_data_mask] = x[mask] - # # self.data[offsets[mask]] = x[mask] - # # only insertions remain - # mask = ~mask - # i = i[mask] - # i[i < 0] += M - # j = j[mask] - # j[j < 0] += N - # self._insert_many(i, j, x[mask]) - - def _zero_many(self, i: Sequence[int], j: Sequence[int]): - """\ - Sets value at each (i, j) to zero, preserving sparsity structure. - - Here (i,j) index major and minor respectively. - """ - offsets = self._offsets(i, j, len(i)) - - # only assign zeros to the existing sparsity structure - self.data[list(offsets[offsets > -1])] = 0 - - def _offsets( - self, i: Iterable[int], j: Iterable[int], n_samples: int - ) -> np.ndarray: - i, j, M, N = self._prepare_indices(i, j) - offsets = np.empty(n_samples, dtype=self.indices.dtype) - ret = _sparsetools.csr_sample_offsets( - M, N, self.indptr, self.indices, n_samples, i, j, offsets - ) - if ret == 1: - # rinse and repeat - self.sum_duplicates() - _sparsetools.csr_sample_offsets( - M, N, self.indptr, self.indices, n_samples, i, j, offsets - ) - return offsets - - -class backed_csr_matrix(BackedSparseMatrix, ss.csr_matrix): - def _get_intXslice(self, row: int, col: slice) -> ss.csr_matrix: - return ss.csr_matrix( - get_compressed_vector(self, row), shape=(1, self.shape[1]) - )[:, col] - - def _get_sliceXslice(self, row: slice, col: slice) -> ss.csr_matrix: - out_shape = ( - slice_len(row, self.shape[0]), - slice_len(col, self.shape[1]), - ) - if out_shape[0] == 1: - return self._get_intXslice(slice_as_int(row, self.shape[0]), col) - elif out_shape[1] == self.shape[1] and out_shape[0] < self.shape[0]: - return self._get_arrayXslice(np.arange(*row.indices(self.shape[0])), col) - return super()._get_sliceXslice(row, col) - - def _get_arrayXslice(self, row: Sequence[int], col: slice) -> ss.csr_matrix: - idxs = np.asarray(row) - if idxs.dtype == bool: - idxs = np.where(idxs) - return ss.csr_matrix( - get_compressed_vectors(self, idxs), shape=(len(idxs), self.shape[1]) - )[:, col] - - -class backed_csc_matrix(BackedSparseMatrix, ss.csc_matrix): - def _get_sliceXint(self, row: slice, col: int) -> ss.csc_matrix: - return ss.csc_matrix( - get_compressed_vector(self, col), shape=(self.shape[0], 1) - )[row, :] - - def _get_sliceXslice(self, row: slice, col: slice) -> ss.csc_matrix: - out_shape = ( - slice_len(row, self.shape[0]), - slice_len(col, self.shape[1]), - ) - if out_shape[1] == 1: - return self._get_sliceXint(row, slice_as_int(col, self.shape[1])) - elif out_shape[0] == self.shape[0] and out_shape[1] < self.shape[1]: - return self._get_sliceXarray(row, np.arange(*col.indices(self.shape[1]))) - return 
super()._get_sliceXslice(row, col)
-
-    def _get_sliceXarray(self, row: slice, col: Sequence[int]) -> ss.csc_matrix:
-        idxs = np.asarray(col)
-        if idxs.dtype == bool:
-            idxs = np.where(idxs)
-        return ss.csc_matrix(
-            get_compressed_vectors(self, idxs), shape=(self.shape[0], len(idxs))
-        )[row, :]
-
-
-FORMATS = [
-    BackedFormat("csr", backed_csr_matrix, ss.csr_matrix),
-    BackedFormat("csc", backed_csc_matrix, ss.csc_matrix),
-]
-
-
-def slice_len(s: slice, l: int) -> int:
-    """Returns length of `a[s]` where `len(a) == l`."""
-    return len(range(*s.indices(l)))
-
-
-def slice_as_int(s: slice, l: int) -> int:
-    """Converts slices of length 1 to the integer index they’ll access."""
-    out = list(range(*s.indices(l)))
-    assert len(out) == 1
-    return out[0]
-
-
-def get_compressed_vectors(
-    x: BackedSparseMatrix, row_idxs: Iterable[int]
-) -> Tuple[Sequence, Sequence, Sequence]:
-    slices = [slice(*(x.indptr[i : i + 2])) for i in row_idxs]
-    data = np.concatenate([x.data[s] for s in slices])
-    indices = np.concatenate([x.indices[s] for s in slices])
-    indptr = list(accumulate(chain((0,), (s.stop - s.start for s in slices))))
-    return data, indices, indptr
-
-
-def get_compressed_vector(
-    x: BackedSparseMatrix, idx: int
-) -> Tuple[Sequence, Sequence, Sequence]:
-    s = slice(*(x.indptr[idx : idx + 2]))
-    data = x.data[s]
-    indices = x.indices[s]
-    indptr = [0, len(data)]
-    return data, indices, indptr
-
-
-def get_format_str(data: ss.spmatrix) -> str:
-    for fmt, _, memory_class in FORMATS:
-        if isinstance(data, memory_class):
-            return fmt
-    raise ValueError(f"Data type {type(data)} is not supported.")
-
-
-def get_memory_class(format_str: str) -> Type[ss.spmatrix]:
-    for fmt, _, memory_class in FORMATS:
-        if format_str == fmt:
-            return memory_class
-    raise ValueError(f"Format string {format_str} is not supported.")
-
-
-def get_backed_class(format_str: str) -> Type[BackedSparseMatrix]:
-    for fmt, backed_class, _ in FORMATS:
-        if format_str == fmt:
-            return backed_class
-    raise ValueError(f"Format string {format_str} is not supported.")
 
 
 class DataFrame:
@@ -232,7 +16,7 @@ def __init__(self, group: h5py.Group):
         self._attrs = self._group.attrs
 
         self._index = self._group[self._attrs["_index"]].asstr()
-        self.columns = pd.Index(self._attrs["column-order"])
+        self._columns = self._attrs["column-order"]
 
         for column in self.columns:
             # read_elem_partial(group)  # , items=obs, indices=(obs_idx, slice(None)))
             setattr(self, column, self._group[column])
@@ -242,172 +26,16 @@ def index(self):
         return pd.Index(self._index[:])
 
+    @property
+    def columns(self):
+        return pd.Index(self._columns)
+
     def __getitem__(self, index):
         if isinstance(index, str) and index in self.columns:
             return getattr(self, index)
 
         elif isinstance(index, slice):
-            return self._group[index]
+            raise NotImplementedError("Slicing is not yet supported.")
 
         else:
             raise TypeError(f"Invalid index '{index}' of type {type(index)}")
-
-
-class SparseDataset:
-    """Analogous to :class:`h5py.Dataset <h5py.Dataset>`, but for sparse matrices."""
-
-    def __init__(self, group: h5py.Group):
-        self.group = group
-
-    @property
-    def dtype(self) -> np.dtype:
-        return self.group["data"].dtype
-
-    @property
-    def format_str(self) -> str:
-        if "h5sparse_format" in self.group.attrs:
-            return _read_attr(self.group.attrs, "h5sparse_format")
-        else:
-            # Should this be an extra field?
-            return _read_attr(self.group.attrs, "encoding-type").replace("_matrix", "")
-
-    @property
-    def h5py_group(self) -> h5py.Group:
-        warn(
-            "Attribute `h5py_group` of SparseDatasets is deprecated. "
-            "Use `group` instead.",
-            DeprecationWarning,
-        )
-        return self.group
-
-    @property
-    def name(self) -> str:
-        return self.group.name
-
-    @property
-    def file(self) -> h5py.File:
-        return self.group.file
-
-    @property
-    def shape(self) -> Tuple[int, int]:
-        shape = self.group.attrs.get("h5sparse_shape")
-        return tuple(self.group.attrs["shape"] if shape is None else shape)
-
-    @property
-    def value(self) -> ss.spmatrix:
-        return self.to_memory()
-
-    def __repr__(self) -> str:
-        return (
-            f"<HDF5 sparse dataset: format {self.format_str!r}, "
-            f"shape {self.shape}, "
-            f'type "{self.dtype}">'
-        )
-
-    def __getitem__(self, index: Union[Index, Tuple[()]]) -> Union[float, ss.spmatrix]:
-        row, col = self._normalize_index(index)
-        mtx = self.to_backed()
-        sub = mtx[row, col]
-        # If indexing is array x array it returns a backed_sparse_matrix
-        # Not sure what the performance is on that operation
-        if isinstance(sub, BackedSparseMatrix):
-            return get_memory_class(self.format_str)(sub)
-        else:
-            return sub
-
-    def __setitem__(self, index: Union[Index, Tuple[()]], value):
-        row, col = self._normalize_index(index)
-        mock_matrix = self.to_backed()
-        mock_matrix[row, col] = value
-
-    def _normalize_index(
-        self, index: Union[Index, Tuple[()]]
-    ) -> Tuple[np.ndarray, np.ndarray]:
-        if index == ():
-            index = slice(None)
-        row, col = unpack_index(index)
-        if all(isinstance(x, cabc.Iterable) for x in (row, col)):
-            row, col = np.ix_(row, col)
-        return row, col
-
-    def append(self, sparse_matrix: ss.spmatrix):
-        # Prep variables
-        shape = self.shape
-        if isinstance(sparse_matrix, SparseDataset):
-            sparse_matrix = sparse_matrix.to_backed()
-
-        # Check input
-        if not ss.isspmatrix(sparse_matrix):
-            raise NotImplementedError(
-                "Currently, only sparse matrices of equivalent format can be "
-                "appended to a SparseDataset."
-            )
-        if self.format_str not in {"csr", "csc"}:
-            raise NotImplementedError(
-                f"The append method for format {self.format_str} "
-                f"is not implemented."
-            )
-        if self.format_str != get_format_str(sparse_matrix):
-            raise ValueError(
-                f"Matrices must have same format. Currently are "
-                f"{self.format_str!r} and {get_format_str(sparse_matrix)!r}"
-            )
-
-        # shape
-        if self.format_str == "csr":
-            assert (
-                shape[1] == sparse_matrix.shape[1]
-            ), "CSR matrices must have same size of dimension 1 to be appended."
-            new_shape = (shape[0] + sparse_matrix.shape[0], shape[1])
-        elif self.format_str == "csc":
-            assert (
-                shape[0] == sparse_matrix.shape[0]
-            ), "CSC matrices must have same size of dimension 0 to be appended."
- new_shape = (shape[0], shape[1] + sparse_matrix.shape[1]) - else: - assert False, "We forgot to update this branching to a new format" - if "h5sparse_shape" in self.group.attrs: - del self.group.attrs["h5sparse_shape"] - self.group.attrs["shape"] = new_shape - - # data - data = self.group["data"] - orig_data_size = data.shape[0] - data.resize((orig_data_size + sparse_matrix.data.shape[0],)) - data[orig_data_size:] = sparse_matrix.data - - # indptr - indptr = self.group["indptr"] - orig_data_size = indptr.shape[0] - append_offset = indptr[-1] - indptr.resize((orig_data_size + sparse_matrix.indptr.shape[0] - 1,)) - indptr[orig_data_size:] = ( - sparse_matrix.indptr[1:].astype(np.int64) + append_offset - ) - - # indices - indices = self.group["indices"] - orig_data_size = indices.shape[0] - indices.resize((orig_data_size + sparse_matrix.indices.shape[0],)) - indices[orig_data_size:] = sparse_matrix.indices - - def to_backed(self) -> BackedSparseMatrix: - format_class = get_backed_class(self.format_str) - mtx = format_class(self.shape, dtype=self.dtype) - mtx.data = self.group["data"] - mtx.indices = self.group["indices"] - mtx.indptr = self.group["indptr"][:] - return mtx - - def to_memory(self) -> ss.spmatrix: - format_class = get_memory_class(self.format_str) - mtx = format_class(self.shape, dtype=self.dtype) - mtx.data = self.group["data"][...] - mtx.indices = self.group["indices"][...] - mtx.indptr = self.group["indptr"][...] - return mtx - - -@_subset.register(SparseDataset) -def subset_sparsedataset(d, subset_idx): - return d[subset_idx] diff --git a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py index 9584f82d1..78f913eb8 100644 --- a/anndata/_io/specs/methods.py +++ b/anndata/_io/specs/methods.py @@ -17,6 +17,7 @@ from anndata._core.index import _normalize_indices from anndata._core.merge import intersect_keys from anndata._core.sparse_dataset import SparseDataset +from anndata._core.dataframe import DataFrame from anndata._core import views from anndata.compat import ( ZarrArray, From 846d07e66407c9a7b810a534d27196f3c6419401 Mon Sep 17 00:00:00 2001 From: Grisha Szep Date: Sun, 16 Apr 2023 14:55:14 +0100 Subject: [PATCH 4/4] revert changes --- anndata/_core/anndata.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py index 235827c39..541bc0ab0 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -348,7 +348,7 @@ def _init_as_view(self, adata_ref: "AnnData", oidx: Index, vidx: Index): # set data if self.isbacked: - self._X = None + self._X = self._obs = None # set raw, easy, as it’s immutable anyways... if adata_ref._raw is not None: @@ -837,13 +837,7 @@ def _set_dim_index(self, value: pd.Index, attr: str): @property def obs(self) -> pd.DataFrame: """One-dimensional annotation of observations (`pd.DataFrame`).""" - if self.isbacked: - if not self.file.is_open: - self.file.open() - obs = DataFrame(self.file["obs"]) - else: - obs = self._obs - return obs + return self._obs @obs.setter def obs(self, value: pd.DataFrame):
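
After patch 4 the backed code path in `.obs` is reverted, so the `DataFrame`
wrapper in anndata/_core/dataframe.py stands alone. For reference, a toy file
with the minimal on-disk layout its constructor relies on can be written with
h5py alone. This is a sketch, not part of the patches: the group and attribute
names follow the dataframe encoding visible in the asserts and attribute reads
above, while the file name, column name, and values are made up, and the
`encoding-version` attribute that anndata also writes is omitted since the
class never checks it.

    import h5py
    import numpy as np

    with h5py.File("toy.h5ad", "w") as f:
        g = f.create_group("obs")
        # checked by the assert in DataFrame.__init__
        g.attrs["encoding-type"] = "dataframe"
        # names the dataset that holds the index
        g.attrs["_index"] = "index"
        # becomes DataFrame.columns
        g.attrs["column-order"] = ["total_counts"]
        g.create_dataset("index", data=np.array([b"cell_0", b"cell_1"]))
        g.create_dataset("total_counts", data=np.array([10.0, 20.0]))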
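Reading it back is then just a matter of wrapping the group. Another sketch,
under the same assumptions, and restricted to plain array columns: categorical
columns are stored as sub-groups rather than datasets, which the class does
not special-case yet.

    import h5py

    from anndata._core.dataframe import DataFrame

    with h5py.File("toy.h5ad", "r") as f:
        obs = DataFrame(f["obs"])
        print(obs.index)                # pd.Index(['cell_0', 'cell_1'])
        print(obs.columns)              # pd.Index(['total_counts'])
        counts = obs["total_counts"]    # an h5py.Dataset; nothing read yet
        print(counts[:])                # values are only pulled from disk on slicing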