diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 1bd777d0bc7..c038805f28f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -51,6 +51,11 @@ Documentation Internal Changes ~~~~~~~~~~~~~~~~ +- Don't assume that arrays read from disk will be Numpy arrays. This is a step toward + enabling reads from a Zarr store using the `Kvikio <https://docs.rapids.ai/api/kvikio/stable/>`_ + or `TensorStore <https://google.github.io/tensorstore/>`_ libraries. + (:pull:`6874`). By `Deepak Cherian <https://github.com/dcherian>`_. + - Remove internal support for reading GRIB files through the ``cfgrib`` backend. ``cfgrib`` now uses the external backend interface, so no existing code should break. By `Deepak Cherian <https://github.com/dcherian>`_. diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 050493e3034..1468299fc84 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -84,9 +84,9 @@ def robust_getitem(array, key, catch=Exception, max_retries=6, initial_delay=500 class BackendArray(NdimSizeLenMixin, indexing.ExplicitlyIndexed): __slots__ = () - def __array__(self, dtype=None): + def get_duck_array(self, dtype: np.typing.DTypeLike = None): key = indexing.BasicIndexer((slice(None),) * self.ndim) - return np.asarray(self[key], dtype=dtype) + return self[key] # type: ignore [index] class AbstractDataStore: diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py index df26a03d790..e09603c722d 100644 --- a/xarray/backends/pydap_.py +++ b/xarray/backends/pydap_.py @@ -46,6 +46,7 @@ def _getitem(self, key): # downloading coordinate data twice array = getattr(self.array, "array", self.array) result = robust_getitem(array, key, catch=ValueError) + result = np.asarray(result) # in some cases, pydap doesn't squeeze axes automatically like numpy axis = tuple(n for n, k in enumerate(key) if isinstance(k, integer_types)) if result.ndim + len(axis) != array.ndim and axis: diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index c290307b4b6..4107b3aa883 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -69,8 +69,8 @@ def 
dtype(self) -> np.dtype: def __getitem__(self, key): return type(self)(self.array[key], self.func, self.dtype) - def __array__(self, dtype=None): - return self.func(self.array) + def get_duck_array(self): + return self.func(self.array.get_duck_array()) def __repr__(self) -> str: return "{}({!r}, func={!r}, dtype={!r})".format( @@ -224,7 +224,7 @@ def decode(self, variable: Variable, name: T_Name = None): def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTypeLike): - data = np.array(data, dtype=dtype, copy=True) + data = data.astype(dtype=dtype, copy=True) if scale_factor is not None: data *= scale_factor if add_offset is not None: diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index ed548771809..4eae6302993 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -16,7 +16,7 @@ from pandas.errors import OutOfBoundsDatetime from xarray.core.duck_array_ops import array_equiv -from xarray.core.indexing import MemoryCachedArray +from xarray.core.indexing import ExplicitlyIndexed, MemoryCachedArray from xarray.core.options import OPTIONS, _get_boolean_with_default from xarray.core.pycompat import array_type from xarray.core.utils import is_duck_array @@ -557,8 +557,15 @@ def limit_lines(string: str, *, limit: int): return string -def short_numpy_repr(array): - array = np.asarray(array) +def short_array_repr(array): + from xarray.core.common import AbstractArray + + if isinstance(array, ExplicitlyIndexed): + array = array.get_duck_array() + elif isinstance(array, AbstractArray): + array = array.data + if not is_duck_array(array): + array = np.asarray(array) # default to lower precision so a full (abbreviated) line can fit on # one line with the default display_width @@ -582,11 +589,11 @@ def short_data_repr(array): """Format "data" for DataArray and Variable.""" internal_data = getattr(array, "variable", array)._data if isinstance(array, np.ndarray): - return short_numpy_repr(array) + return 
short_array_repr(array) elif is_duck_array(internal_data): return limit_lines(repr(array.data), limit=40) elif array._in_memory: - return short_numpy_repr(array) + return short_array_repr(array) else: # internal xarray array type return f"[{array.size} values with dtype={array.dtype}]" @@ -831,7 +838,7 @@ def diff_array_repr(a, b, compat): equiv = array_equiv if not equiv(a.data, b.data): - temp = [wrap_indent(short_numpy_repr(obj), start=" ") for obj in (a, b)] + temp = [wrap_indent(short_array_repr(obj), start=" ") for obj in (a, b)] diff_data_repr = [ ab_side + "\n" + ab_data_repr for ab_side, ab_data_repr in zip(("L", "R"), temp) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 7109d4fdd2c..35a5261f248 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -449,13 +449,25 @@ class ExplicitlyIndexed: __slots__ = () + def __array__(self, dtype: np.typing.DTypeLike = None) -> np.ndarray: + # Leave casting to an array up to the underlying array type. + return np.asarray(self.get_duck_array(), dtype=dtype) + + def get_duck_array(self): + return self.array + class ExplicitlyIndexedNDArrayMixin(NDArrayMixin, ExplicitlyIndexed): __slots__ = () - def __array__(self, dtype=None): + def get_duck_array(self): key = BasicIndexer((slice(None),) * self.ndim) - return np.asarray(self[key], dtype=dtype) + return self[key] + + def __array__(self, dtype: np.typing.DTypeLike = None) -> np.ndarray: + # This is necessary because we apply the indexing key in self.get_duck_array() + # Note this is the base class for all lazy indexing classes + return np.asarray(self.get_duck_array(), dtype=dtype) class ImplicitToExplicitIndexingAdapter(NDArrayMixin): @@ -467,8 +479,11 @@ def __init__(self, array, indexer_cls=BasicIndexer): self.array = as_indexable(array) self.indexer_cls = indexer_cls - def __array__(self, dtype=None): - return np.asarray(self.array, dtype=dtype) + def __array__(self, dtype: np.typing.DTypeLike = None) -> np.ndarray: + return 
np.asarray(self.get_duck_array(), dtype=dtype) + + def get_duck_array(self): + return self.array.get_duck_array() def __getitem__(self, key): key = expanded_indexer(key, self.ndim) @@ -531,9 +546,15 @@ def shape(self) -> tuple[int, ...]: shape.append(k.size) return tuple(shape) - def __array__(self, dtype=None): - array = as_indexable(self.array) - return np.asarray(array[self.key], dtype=None) + def get_duck_array(self): + array = self.array[self.key] + # self.array[self.key] is now a numpy array when + # self.array is a BackendArray subclass + # and self.key is BasicIndexer((slice(None, None, None),)) + # so we need the explicit check for ExplicitlyIndexed + if isinstance(array, ExplicitlyIndexed): + array = array.get_duck_array() + return _wrap_numpy_scalars(array) def transpose(self, order): return LazilyVectorizedIndexedArray(self.array, self.key).transpose(order) @@ -584,8 +605,15 @@ def __init__(self, array, key): def shape(self) -> tuple[int, ...]: return np.broadcast(*self.key.tuple).shape - def __array__(self, dtype=None): - return np.asarray(self.array[self.key], dtype=None) + def get_duck_array(self): + array = self.array[self.key] + # self.array[self.key] is now a numpy array when + # self.array is a BackendArray subclass + # and self.key is BasicIndexer((slice(None, None, None),)) + # so we need the explicit check for ExplicitlyIndexed + if isinstance(array, ExplicitlyIndexed): + array = array.get_duck_array() + return _wrap_numpy_scalars(array) def _updated_key(self, new_key): return _combine_indexers(self.key, self.shape, new_key) @@ -631,8 +659,8 @@ def _ensure_copied(self): self.array = as_indexable(np.array(self.array)) self._copied = True - def __array__(self, dtype=None): - return np.asarray(self.array, dtype=dtype) + def get_duck_array(self): + return self.array.get_duck_array() def __getitem__(self, key): return type(self)(_wrap_numpy_scalars(self.array[key])) @@ -658,12 +686,14 @@ def __init__(self, array): self.array = 
_wrap_numpy_scalars(as_indexable(array)) def _ensure_cached(self): - if not isinstance(self.array, NumpyIndexingAdapter): - self.array = NumpyIndexingAdapter(np.asarray(self.array)) + self.array = as_indexable(self.array.get_duck_array()) + + def __array__(self, dtype: np.typing.DTypeLike = None) -> np.ndarray: + return np.asarray(self.get_duck_array(), dtype=dtype) - def __array__(self, dtype=None): + def get_duck_array(self): self._ensure_cached() - return np.asarray(self.array, dtype=dtype) + return self.array.get_duck_array() def __getitem__(self, key): return type(self)(_wrap_numpy_scalars(self.array[key])) @@ -827,7 +857,7 @@ def explicit_indexing_adapter( result = raw_indexing_method(raw_key.tuple) if numpy_indices.tuple: # index the loaded np.ndarray - result = NumpyIndexingAdapter(np.asarray(result))[numpy_indices] + result = NumpyIndexingAdapter(result)[numpy_indices] return result @@ -1463,6 +1493,9 @@ def __array__(self, dtype: DTypeLike = None) -> np.ndarray: array = array.astype("object") return np.asarray(array.values, dtype=dtype) + def get_duck_array(self) -> np.ndarray: + return np.asarray(self) + @property def shape(self) -> tuple[int, ...]: return (len(self.array),) @@ -1603,9 +1636,9 @@ def _repr_inline_(self, max_width: int) -> str: return format_array_flat(self._get_array_subset(), max_width) def _repr_html_(self) -> str: - from xarray.core.formatting import short_numpy_repr + from xarray.core.formatting import short_array_repr - array_repr = short_numpy_repr(self._get_array_subset()) + array_repr = short_array_repr(self._get_array_subset()) return f"
<pre>{escape(array_repr)}</pre>
" def copy(self, deep: bool = True) -> PandasMultiIndexingAdapter: diff --git a/xarray/core/variable.py b/xarray/core/variable.py index bddeb85f5e9..0b765b21cab 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -426,6 +426,8 @@ def data(self) -> Any: """ if is_duck_array(self._data): return self._data + elif isinstance(self._data, indexing.ExplicitlyIndexed): + return self._data.get_duck_array() else: return self.values @@ -533,6 +535,8 @@ def load(self, **kwargs): """ if is_duck_dask_array(self._data): self._data = as_compatible_data(self._data.compute(**kwargs)) + elif isinstance(self._data, indexing.ExplicitlyIndexed): + self._data = self._data.get_duck_array() elif not is_duck_array(self._data): self._data = np.asarray(self._data) return self diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 864b3df8405..7e1b964ecba 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -139,13 +139,18 @@ class UnexpectedDataAccess(Exception): class InaccessibleArray(utils.NDArrayMixin, ExplicitlyIndexed): + """Disallows any loading.""" + def __init__(self, array): self.array = array - def __getitem__(self, key): - raise UnexpectedDataAccess("Tried accessing data.") + def get_duck_array(self): + raise UnexpectedDataAccess("Tried accessing data") + + def __array__(self, dtype: np.typing.DTypeLike = None): + raise UnexpectedDataAccess("Tried accessing data") - def __array__(self): + def __getitem__(self, key): raise UnexpectedDataAccess("Tried accessing data.") @@ -157,6 +162,23 @@ def __getitem__(self, key): return self.array[tuple_idxr] +class DuckArrayWrapper(utils.NDArrayMixin): + """Array-like that prevents casting to array. 
+ Modeled after cupy.""" + + def __init__(self, array: np.ndarray): + self.array = array + + def __getitem__(self, key): + return type(self)(self.array[key]) + + def __array__(self, dtype: np.typing.DTypeLike = None): + raise UnexpectedDataAccess("Tried accessing data") + + def __array_namespace__(self): + """Present to satisfy is_duck_array test.""" + + class ReturnItem: def __getitem__(self, key): return key diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 2e23d02a261..ef2954676ac 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -36,6 +36,7 @@ from xarray.core.pycompat import array_type, integer_types from xarray.core.utils import is_scalar from xarray.tests import ( + DuckArrayWrapper, InaccessibleArray, UnexpectedDataAccess, assert_allclose, @@ -203,6 +204,10 @@ def create_test_stacked_array() -> tuple[DataArray, DataArray]: class InaccessibleVariableDataStore(backends.InMemoryDataStore): + """ + Store that does not allow any data access. + """ + def __init__(self): super().__init__() self._indexvars = set() @@ -223,6 +228,47 @@ def lazy_inaccessible(k, v): return {k: lazy_inaccessible(k, v) for k, v in self._variables.items()} +class DuckBackendArrayWrapper(backends.common.BackendArray): + """Mimic a BackendArray wrapper around DuckArrayWrapper""" + + def __init__(self, array): + self.array = DuckArrayWrapper(array) + self.shape = array.shape + self.dtype = array.dtype + + def get_array(self): + return self.array + + def __getitem__(self, key): + return self.array[key.tuple] + + +class AccessibleAsDuckArrayDataStore(backends.InMemoryDataStore): + """ + Store that returns a duck array, not convertible to numpy array, + on read. Modeled after nVIDIA's kvikio. 
+ """ + + def __init__(self): + super().__init__() + self._indexvars = set() + + def store(self, variables, *args, **kwargs) -> None: + super().store(variables, *args, **kwargs) + for k, v in variables.items(): + if isinstance(v, IndexVariable): + self._indexvars.add(k) + + def get_variables(self) -> dict[Any, xr.Variable]: + def lazy_accessible(k, v) -> xr.Variable: + if k in self._indexvars: + return v + data = indexing.LazilyIndexedArray(DuckBackendArrayWrapper(v.values)) + return Variable(v.dims, data, v.attrs) + + return {k: lazy_accessible(k, v) for k, v in self._variables.items()} + + class TestDataset: def test_repr(self) -> None: data = create_test_data(seed=123) @@ -4684,6 +4730,29 @@ def test_lazy_load(self) -> None: ds.isel(time=10) ds.isel(time=slice(10), dim1=[0]).isel(dim1=0, dim2=-1) + def test_lazy_load_duck_array(self) -> None: + store = AccessibleAsDuckArrayDataStore() + create_test_data().dump_to_store(store) + + for decode_cf in [True, False]: + ds = open_dataset(store, decode_cf=decode_cf) + with pytest.raises(UnexpectedDataAccess): + ds["var1"].values + + # these should not raise UnexpectedDataAccess: + ds.var1.data + ds.isel(time=10) + ds.isel(time=slice(10), dim1=[0]).isel(dim1=0, dim2=-1) + repr(ds) + + # preserve the duck array type and don't cast to array + assert isinstance(ds["var1"].load().data, DuckArrayWrapper) + assert isinstance( + ds["var1"].isel(dim2=0, dim1=0).load().data, DuckArrayWrapper + ) + + ds.close() + def test_dropna(self) -> None: x = np.random.randn(4, 4) x[::2, 0] = np.nan diff --git a/xarray/tests/test_formatting.py b/xarray/tests/test_formatting.py index 3cba5b965f9..bf5f7d0bdc5 100644 --- a/xarray/tests/test_formatting.py +++ b/xarray/tests/test_formatting.py @@ -542,7 +542,7 @@ def test_set_numpy_options() -> None: assert np.get_printoptions() == original_options -def test_short_numpy_repr() -> None: +def test_short_array_repr() -> None: cases = [ np.random.randn(500), np.random.randn(20, 20), @@ -552,16 +552,16 
@@ def test_short_numpy_repr() -> None: ] # number of lines: # for default numpy repr: 167, 140, 254, 248, 599 - # for short_numpy_repr: 1, 7, 24, 19, 25 + # for short_array_repr: 1, 7, 24, 19, 25 for array in cases: - num_lines = formatting.short_numpy_repr(array).count("\n") + 1 + num_lines = formatting.short_array_repr(array).count("\n") + 1 assert num_lines < 30 # threshold option (default: 200) array2 = np.arange(100) - assert "..." not in formatting.short_numpy_repr(array2) + assert "..." not in formatting.short_array_repr(array2) with xr.set_options(display_values_threshold=10): - assert "..." in formatting.short_numpy_repr(array2) + assert "..." in formatting.short_array_repr(array2) def test_large_array_repr_length() -> None: