Source code for xcube.core.zarrstore.generic

# Copyright (c) 2018-2024 by xcube team and contributors
# Permissions are hereby granted under the terms of the MIT License:
# https://opensource.org/licenses/MIT.

import collections.abc
import inspect
import itertools
import json
import math
import threading
import warnings
from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, Union

import numcodecs.abc
import numpy as np
import xarray as xr
import zarr.storage

from xcube.util.assertions import assert_instance, assert_true

GetData = Callable[[Tuple[int, ...]], Union[bytes, np.ndarray]]

OnClose = Callable[[Dict[str, Any]], None]
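
# Example (hypothetical sketch): a callable usable as a ``GetData``
# function. ``GenericZarrStore`` calls it with keyword arguments only;
# ``chunk_info`` and ``array_info`` are passed if they occur in the
# signature (see ``GenericArray`` below).
#
# def example_get_data(chunk_info=None, array_info=None) -> np.ndarray:
#     # Fill the requested chunk with a constant of the array's dtype.
#     return np.full(chunk_info["shape"], 42, dtype=array_info["dtype"])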


class GenericArray(Dict[str, Any]):
    """Represent a generic array in the ``GenericZarrStore`` as a
    dictionary of properties.

    Although all properties of this class are optional,
    some of them are mandatory when added to the ``GenericZarrStore``.

    When added to the store using ``GenericZarrStore.add_array()``,
    the array *name* and *dims* must always be present.
    Other mandatory properties depend on the *data* and *get_data*
    properties, which are mutually exclusive:

    * *get_data* is called for a requested data chunk of an array.
      It must return a bytes object or a numpy nd-array and is passed
      the chunk index, the chunk shape, and this array info dictionary.
      *get_data* requires the following properties to be present too:
      *name*, *dims*, *dtype*, *shape*.
      *chunks* is optional and defaults to *shape*.
    * *data* must be a bytes object or a numpy nd-array.
      *data* requires the following properties to be present too:
      *name*, *dims*. *chunks* must be same as *shape*.

    The function *get_data* receives only keyword arguments, which
    comprise the ones passed by *get_data_params*, if any, and two
    special ones that may occur in the signature of *get_data*:

    * The keyword argument *chunk_info*, if given, provides a
      dictionary that holds information about the current chunk:

      - ``index: tuple[int, ...]`` - the chunk's index
      - ``shape: tuple[int, ...]`` - the chunk's shape
      - ``slices: tuple[slice, ...]`` - the chunk's array slices

    * The keyword argument *array_info*, if given, provides a
      dictionary that holds information about the overall array.
      It contains all array properties passed to the constructor of
      ``GenericArray`` plus

      - ``ndim: int`` - number of dimensions
      - ``num_chunks: tuple[int, ...]`` - number of chunks in every
        dimension

    ``GenericZarrStore`` will convert a Numpy array returned by
    *get_data* or given by *data* into a bytes object. It will also
    be compressed, if a *compressor* is given. It is important that
    the array chunks always have the same shape, given by *chunks*;
    boundary chunks are padded accordingly.
    See also https://zarr.readthedocs.io/en/stable/spec/v2.html#chunks

    Note that if the value of a named keyword argument is None,
    it will not be stored.

    Args:
        array: Optional array info dictionary.
        name: Optional array name.
        data: Optional array data.
            Mutually exclusive with *get_data*.
            Must be a bytes object or a numpy array.
        get_data: Optional array data chunk getter.
            Mutually exclusive with *data*.
            Called for a requested data chunk of an array.
            Must return a bytes object or a numpy array.
        get_data_params: Optional keyword arguments passed to
            *get_data*.
        dtype: Optional array data type.
            Either a string using syntax of the Zarr spec or a
            ``numpy.dtype``. For string encoded data types, see
            https://zarr.readthedocs.io/en/stable/spec/v2.html#data-type-encoding
        dims: Optional sequence of dimension names.
        shape: Optional sequence of shape sizes for each dimension.
        chunks: Optional sequence of chunk sizes for each dimension.
        fill_value: Optional fill value, see
            https://zarr.readthedocs.io/en/stable/spec/v2.html#fill-value-encoding
        compressor: Optional compressor.
            If given, it must be an instance of
            ``numcodecs.abc.Codec``.
        filters: Optional sequence of filters, see
            https://zarr.readthedocs.io/en/stable/spec/v2.html#filters.
        order: Optional array memory layout order.
            If given, must be "C" or "F". Defaults to "C".
        attrs: Optional array attributes.
            If given, must be JSON-serializable.
        on_close: Optional array close handler.
            Called if the store is closed.
        chunk_encoding: Optional encoding type of the chunk data
            returned for the array. Can be "bytes" (the default) or
            "ndarray" for array chunks that are numpy.ndarray
            instances.
        kwargs: Other keyword arguments passed directly to the
            dictionary constructor.
    """

    def __init__(
        self,
        array: Optional[Dict[str, Any]] = None,
        name: Optional[str] = None,
        get_data: Optional[GetData] = None,
        get_data_params: Optional[Dict[str, Any]] = None,
        data: Optional[np.ndarray] = None,
        dtype: Optional[Union[str, np.dtype]] = None,
        dims: Optional[Union[str, Sequence[str]]] = None,
        shape: Optional[Sequence[int]] = None,
        chunks: Optional[Sequence[int]] = None,
        fill_value: Optional[Union[bool, int, float, str]] = None,
        compressor: Optional[numcodecs.abc.Codec] = None,
        filters: Optional[Sequence[numcodecs.abc.Codec]] = None,
        order: Optional[str] = None,
        attrs: Optional[Dict[str, Any]] = None,
        on_close: Optional[OnClose] = None,
        chunk_encoding: Optional[str] = None,
        **kwargs,
    ):
        array = dict(array) if array is not None else dict()
        array.update(
            {
                k: v
                for k, v in dict(
                    name=name,
                    dtype=dtype,
                    dims=dims,
                    shape=shape,
                    chunks=chunks,
                    fill_value=fill_value,
                    compressor=compressor,
                    filters=filters,
                    order=order,
                    attrs=attrs,
                    data=data,
                    get_data=get_data,
                    get_data_params=get_data_params,
                    on_close=on_close,
                    chunk_encoding=chunk_encoding,
                ).items()
                if v is not None
            }
        )
        super().__init__(array, **kwargs)

    def finalize(self) -> "GenericArray":
        """Normalize and validate array properties and return a valid
        array info dictionary to be stored in the ``GenericZarrStore``.
        """
        name = self.get("name")
        if not name:
            raise ValueError("missing array name")

        data = self.get("data")
        get_data = self.get("get_data")
        if data is None and get_data is None:
            raise ValueError(
                f"array {name!r}: either data or get_data must be defined"
            )
        if get_data is not None:
            if data is not None:
                raise ValueError(
                    f"array {name!r}: data and get_data"
                    f" cannot be defined together"
                )
            if not callable(get_data):
                raise TypeError(f"array {name!r}: get_data must be a callable")
            sig = inspect.signature(get_data)
            get_data_info = {
                "has_array_info": "array_info" in sig.parameters,
                "has_chunk_info": "chunk_info" in sig.parameters,
            }
            get_data_params = dict(self.get("get_data_params") or {})
        else:
            get_data_info = None
            get_data_params = None

        dims = self.get("dims")
        dims = [dims] if isinstance(dims, str) else dims
        if dims is None:
            raise ValueError(f"array {name!r}: missing dims")
        ndim = len(dims)

        if isinstance(data, np.ndarray):
            # forman: maybe warn if dtype or shape is given,
            # but does not match data.dtype and data.shape
            dtype = str(data.dtype.str)
            shape = data.shape
            chunks = data.shape
        else:
            dtype = self.get("dtype")
            shape = self.get("shape")
            chunks = self.get("chunks", shape)

        if not dtype:
            raise ValueError(f"array {name!r}: missing dtype")
        elif isinstance(dtype, np.dtype):
            dtype = dtype.str

        if shape is None:
            raise ValueError(f"array {name!r}: missing shape")
        if len(shape) != ndim:
            raise ValueError(
                f"array {name!r}: dims and shape must have same length"
            )
        if len(chunks) != ndim:
            raise ValueError(
                f"array {name!r}: dims and chunks must have same length"
            )

        # Number of chunks in every dimension (ceil-divide shape by chunks)
        num_chunks = tuple(
            map(lambda x: math.ceil(x[0] / x[1]), zip(shape, chunks))
        )

        filters = self.get("filters")
        if filters:
            filters = list(filters)
            for f in filters:
                if not isinstance(f, numcodecs.abc.Codec):
                    raise TypeError(
                        f"array {name!r}: filter items must be an"
                        f" instance of numcodecs.abc.Codec"
                    )
        else:
            filters = None

        compressor = self.get("compressor")
        if compressor is not None:
            if not isinstance(compressor, numcodecs.abc.Codec):
                raise TypeError(
                    f"array {name!r}: compressor must be an"
                    f" instance of numcodecs.abc.Codec"
                )

        fill_value = self.get("fill_value")
        if isinstance(fill_value, np.ndarray):
            fill_value = fill_value.item()
        allowed_fill_value_types = (type(None), bool, int, float, str)
        if not isinstance(fill_value, allowed_fill_value_types):
            raise TypeError(
                f"array {name!r}: fill_value type must be one of"
                f" {tuple(t.__name__ for t in allowed_fill_value_types)},"
                f" was {type(fill_value).__name__}"
            )

        order = self.get("order") or "C"
        allowed_orders = ("C", "F")
        if order not in allowed_orders:
            raise ValueError(
                f"array {name!r}: order must be one of {allowed_orders},"
                f" was {order!r}"
            )

        chunk_encoding = self.get("chunk_encoding") or "bytes"
        allowed_chunk_encodings = ("bytes", "ndarray")
        if chunk_encoding not in allowed_chunk_encodings:
            raise ValueError(
                f"array {name!r}: chunk_encoding must be one of"
                f" {allowed_chunk_encodings}, was {chunk_encoding!r}"
            )

        attrs = self.get("attrs")
        if attrs is not None:
            if not isinstance(attrs, dict):
                raise TypeError(
                    f"array {name!r}: attrs must be dict,"
                    f" was {type(attrs).__name__}"
                )

        # Note: passing the properties as dictionary
        # will prevent removing them if their value is None,
        # see GenericArray constructor.
        return GenericArray(
            {
                "name": name,
                "dtype": dtype,
                "dims": tuple(dims),
                "shape": tuple(shape),
                "chunks": tuple(chunks),
                "fill_value": fill_value,
                "filters": filters,
                "compressor": compressor,
                "order": order,
                "attrs": attrs,
                "data": data,
                "get_data": get_data,
                "get_data_params": get_data_params,
                "on_close": self.get("on_close"),
                "chunk_encoding": chunk_encoding,
                # Computed properties
                "ndim": len(dims),
                "num_chunks": num_chunks,
                "get_data_info": get_data_info,
            }
        )
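
    # Example (hypothetical, continuing the ``temperature`` sketch
    # above): ``finalize()`` validates the properties and adds the
    # computed ones. For shape (4, 10) and chunks (1, 10) it yields
    # num_chunks == (ceil(4/1), ceil(10/10)) == (4, 1) and ndim == 2.
    #
    # finalized = temperature.finalize()
    # assert finalized["num_chunks"] == (4, 1)
    # assert finalized["ndim"] == 2
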
GenericArrayLike = Union[GenericArray, Dict[str, Any]]

class GenericZarrStore(zarr.storage.Store):
    """A Zarr store that maintains generic arrays in a flat, top-level
    hierarchy. The root of the store is a Zarr group conforming to the
    Zarr spec v2.

    It is designed to serve as a Zarr store for xarray datasets that
    compute their data arrays dynamically.

    See class ``GenericArray`` for specifying the arrays' properties.

    The data of this store's arrays is either retrieved from static
    (numpy) arrays or from a callable that provides the array's data
    chunks as bytes or numpy arrays.

    Args:
        arrays: Arrays to be added.
            Typically, these will be instances of ``GenericArray``.
        attrs: Optional attributes of the top-level group.
            If given, it must be JSON serializable.
        array_defaults: Optional array defaults for array properties
            not passed to ``add_array``. Typically, this will be an
            instance of ``GenericArray``.
    """

    # Shortcut for GenericArray
    Array = GenericArray

    def __init__(
        self,
        *arrays: GenericArrayLike,
        attrs: Optional[Dict[str, Any]] = None,
        array_defaults: Optional[GenericArrayLike] = None,
    ):
        self._attrs = dict(attrs) if attrs is not None else {}
        self._array_defaults = array_defaults
        self._dim_sizes: Dict[str, int] = {}
        self._arrays: Dict[str, GenericArray] = {}
        for array in arrays:
            self.add_array(array)
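
    # Example (hypothetical sketch): create a store with one generated
    # array and open it as an xarray dataset.
    #
    # store = GenericZarrStore(
    #     GenericZarrStore.Array(
    #         name="temperature",
    #         dims=("time", "x"),
    #         dtype="<f8",
    #         shape=(4, 10),
    #         chunks=(1, 10),
    #         get_data=lambda chunk_info=None: np.ones(chunk_info["shape"]),
    #     )
    # )
    # ds = xr.open_zarr(store, consolidated=False)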

    def add_array(
        self, array: Optional[GenericArrayLike] = None, **array_kwargs
    ) -> None:
        """Add a new array to this store.

        Args:
            array: Optional array properties.
                Typically, this will be an instance of
                ``GenericArray``.
            array_kwargs: Keyword arguments for the properties of
                ``GenericArray``.
        """
        effective_array = GenericArray(self._array_defaults or {})
        if array:
            effective_array.update(array)
        if array_kwargs:
            effective_array.update(array_kwargs)
        effective_array = effective_array.finalize()

        name = effective_array["name"]
        if name in self._arrays:
            raise ValueError(f"array {name!r} is already defined")

        dims = effective_array["dims"]
        shape = effective_array["shape"]
        for dim_name, dim_size in zip(dims, shape):
            old_dim_size = self._dim_sizes.get(dim_name)
            if old_dim_size is None:
                self._dim_sizes[dim_name] = dim_size
            elif old_dim_size != dim_size:
                # Dimensions must have same lengths for all arrays
                # in this store
                raise ValueError(
                    f"array {name!r}"
                    f" defines dimension {dim_name!r}"
                    f" with size {dim_size},"
                    f" but existing size is {old_dim_size}"
                )

        self._arrays[name] = effective_array
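
    # Example (hypothetical sketch): ``array_defaults`` supplies
    # properties shared by several arrays; per-array keyword arguments
    # complete or override them.
    #
    # store = GenericZarrStore(
    #     array_defaults=GenericZarrStore.Array(dims="x", shape=(10,))
    # )
    # store.add_array(name="a", data=np.arange(10.0))
    # store.add_array(name="b", dtype="<i8",
    #                 get_data=lambda: np.zeros(10, dtype="<i8"))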

    ##########################################################################
    # Zarr Store implementation
    ##########################################################################

    def is_writeable(self) -> bool:
        """Return False, because arrays in this store are generative."""
        return False

    def listdir(self, path: str = "") -> List[str]:
        """List a store path.

        Args:
            path: The path.

        Returns:
            List of sorted directory entries.
        """
        if path == "":
            return sorted(
                [".zmetadata", ".zgroup", ".zattrs", *self._arrays.keys()]
            )
        elif "/" not in path:
            return sorted(self._get_array_keys(path))
        raise ValueError(f"{path} is not a directory")
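
    # Illustrative key layout (hypothetical values): for a store
    # holding a single array "a" with num_chunks == (4, 1),
    # ``listdir("")`` returns
    # [".zattrs", ".zgroup", ".zmetadata", "a"], and ``listdir("a")``
    # returns
    # ["a/.zarray", "a/.zattrs", "a/0.0", "a/1.0", "a/2.0", "a/3.0"].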

    def rmdir(self, path: str = "") -> None:
        """The general form removes store paths.
        This implementation can remove entire arrays only.

        Args:
            path: The array's name.
        """
        if path not in self._arrays:
            raise ValueError(f"{path}: can only remove existing arrays")
        array = self._arrays.pop(path)
        dims = array["dims"]
        for dim_name in dims:
            dim_used = False
            for other_array in self._arrays.values():
                if dim_name in other_array["dims"]:
                    dim_used = True
                    break
            if not dim_used:
                del self._dim_sizes[dim_name]

    def rename(self, src_path: str, dst_path: str) -> None:
        """The general form renames store paths.
        This implementation can rename arrays only.

        Args:
            src_path: Source array name.
            dst_path: Target array name.
        """
        array = self._arrays.get(src_path)
        if array is None:
            raise ValueError(
                f"can only rename arrays, but {src_path!r} is not an array"
            )
        if dst_path in self._arrays:
            raise ValueError(
                f"cannot rename array {src_path!r} into"
                f" {dst_path!r} because it already exists"
            )
        if "/" in dst_path:
            raise ValueError(
                f"cannot rename array {src_path!r} into {dst_path!r}"
            )
        array["name"] = dst_path
        self._arrays[dst_path] = array
        del self._arrays[src_path]

    def close(self) -> None:
        """Calls the "on_close" handlers, if any, of arrays."""
        for array in self._arrays.values():
            on_close = array.get("on_close")
            if on_close is not None:
                on_close(array)

    # Note, getsize is intentionally not implemented as it requires
    # actual computation of arrays.
    #
    # def getsize(self, key: str) -> int:
    #     pass

    ##########################################################################
    # MutableMapping implementation
    ##########################################################################

    def __iter__(self) -> Iterator[str]:
        """Get an iterator of all keys in this store."""
        yield ".zmetadata"
        yield ".zgroup"
        yield ".zattrs"
        for array_name in self._arrays.keys():
            yield from self._get_array_keys(array_name)

    def __len__(self) -> int:
        return sum(1 for _ in iter(self))

    def __contains__(self, key: str) -> bool:
        if key in (".zmetadata", ".zgroup", ".zattrs"):
            return True
        try:
            array_name, value_id = self._parse_array_key(key)
        except KeyError:
            return False
        if value_id in (".zarray", ".zattrs"):
            return True
        try:
            self._get_array_chunk_index(array_name, value_id)
            return True
        except KeyError:
            return False

    def __getitem__(self, key: str) -> Union[bytes, np.ndarray]:
        item = self._get_item(key)
        if isinstance(item, dict):
            return dict_to_bytes(item)
        elif isinstance(item, str):
            return str_to_bytes(item)
        return item

    def __setitem__(self, key: str, value: bytes) -> None:
        class_name = self.__module__ + "." + self.__class__.__name__
        raise TypeError(f"{class_name} is read-only")

    def __delitem__(self, key: str) -> None:
        self.rmdir(key)

    ##########################################################################
    # Utilities
    ##########################################################################

    @classmethod
    def from_dataset(
        cls, dataset: xr.Dataset, array_defaults: Optional[GenericArrayLike] = None
    ) -> "GenericZarrStore":
        """Create a Zarr store for given *dataset*.
        The store's top-level group attributes are set to the
        *dataset*'s attributes.

        The following *array_defaults* properties can be provided
        (other properties are prescribed by the *dataset*):

        * ``fill_value`` - defaults to None
        * ``compressor`` - defaults to None
        * ``filters`` - defaults to None
        * ``order`` - defaults to "C"
        * ``chunk_encoding`` - defaults to "bytes"

        Args:
            dataset: The dataset.
            array_defaults: Array default values.

        Returns:
            A new Zarr store instance.
        """

        def _get_dataset_data(
            ds=None, chunk_info=None, array_info=None
        ) -> np.ndarray:
            array_name = array_info["name"]
            chunk_slices = chunk_info["slices"]
            return ds[array_name][chunk_slices].values

        arrays = []
        for var_name, var in dataset.variables.items():
            arrays.append(
                GenericArray(
                    name=str(var_name),
                    dtype=np.dtype(var.dtype).str,
                    dims=[str(dim) for dim in var.dims],
                    shape=var.shape,
                    chunks=(
                        [(max(*c) if len(c) > 1 else c[0]) for c in var.chunks]
                        if var.chunks
                        else var.shape
                    ),
                    attrs={str(k): v for k, v in var.attrs.items()},
                    get_data=_get_dataset_data,
                    get_data_params=dict(ds=dataset),
                )
            )
        attrs = {str(k): v for k, v in dataset.attrs.items()}
        return GenericZarrStore(*arrays, attrs=attrs, array_defaults=array_defaults)
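
    # Example (hypothetical sketch): serve an existing in-memory
    # dataset through a generic store and read it back with xarray;
    # each chunk is sliced from the dataset on access.
    #
    # ds = xr.Dataset({"v": (("y", "x"), np.random.rand(6, 8))})
    # store = GenericZarrStore.from_dataset(ds)
    # reopened = xr.open_zarr(store, consolidated=False)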

    ##########################################################################
    # Helpers
    ##########################################################################

    def _get_item(self, key: str) -> Union[dict, str, bytes]:
        if key == ".zmetadata":
            return self._get_metadata_item()
        if key == ".zgroup":
            return self._get_group_item()
        if key == ".zattrs":
            return self._get_attrs_item()

        array_name, value_id = self._parse_array_key(key)
        array = self._arrays[array_name]
        if value_id == ".zarray":
            return self._get_array_spec_item(array)
        if value_id == ".zattrs":
            return self._get_array_attrs_item(array)

        chunk_index = self._get_array_chunk_index(array_name, value_id)
        return self._get_array_data_item(array, chunk_index)

    def _get_metadata_item(self):
        metadata = {
            ".zgroup": self._get_item(".zgroup"),
            ".zattrs": self._get_item(".zattrs"),
        }
        for array_name in self._arrays.keys():
            key = array_name + "/.zarray"
            metadata[key] = self._get_item(key)
            key = array_name + "/.zattrs"
            metadata[key] = self._get_item(key)
        return {"zarr_consolidated_format": 1, "metadata": metadata}

    # noinspection PyMethodMayBeStatic
    def _get_group_item(self):
        return {"zarr_format": 2}

    def _get_attrs_item(self):
        return self._attrs or {}

    # noinspection PyMethodMayBeStatic
    def _get_array_spec_item(self, array: GenericArray):
        # JSON-encode fill_value
        fill_value = array["fill_value"]
        if isinstance(fill_value, float):
            if math.isnan(fill_value):
                fill_value = "NaN"
            elif math.isinf(fill_value):
                if fill_value < 0:
                    fill_value = "-Infinity"
                else:
                    fill_value = "Infinity"

        # JSON-encode compressor
        compressor = array["compressor"]
        if compressor is not None:
            compressor = compressor.get_config()

        # JSON-encode filters
        filters = array["filters"]
        if filters is not None:
            filters = list(f.get_config() for f in filters)

        return {
            "zarr_format": 2,
            "dtype": array["dtype"],
            "shape": list(array["shape"]),
            "chunks": list(array["chunks"]),
            "fill_value": fill_value,
            "compressor": compressor,
            "filters": filters,
            "order": array["order"],
        }

    # noinspection PyMethodMayBeStatic
    def _get_array_attrs_item(self, array: GenericArray):
        dims = array["dims"]
        attrs = array["attrs"]
        return {"_ARRAY_DIMENSIONS": dims, **(attrs or {})}

    # noinspection PyMethodMayBeStatic
    def _get_array_data_item(
        self, array: Dict[str, Any], chunk_index: Tuple[int, ...]
    ) -> Union[bytes, np.ndarray]:
        # Note, here array is expected to be "finalized",
        # that is, validated and normalized
        shape = array["shape"]
        chunks = array["chunks"]
        chunk_shape = None

        data = array["data"]
        if data is None:
            get_data = array["get_data"]
            assert callable(get_data)  # Has been ensured before
            get_data_params = array["get_data_params"]
            get_data_kwargs = dict(get_data_params)
            get_data_info = array["get_data_info"]
            if get_data_info["has_chunk_info"]:
                chunk_shape = get_chunk_shape(shape, chunks, chunk_index)
                array_slices = get_array_slices(shape, chunks, chunk_index)
                get_data_kwargs["chunk_info"] = {
                    "index": chunk_index,
                    "shape": chunk_shape,
                    "slices": array_slices,
                }
            if get_data_info["has_array_info"]:
                get_data_kwargs["array_info"] = dict(array)
            data = get_data(**get_data_kwargs)

        chunk_encoding = array["chunk_encoding"]
        if isinstance(data, np.ndarray):
            # As of Zarr 2.0, all chunks of an array
            # must have the same shape (= chunks)
            if data.shape != chunks:
                # This commonly happens if array shape sizes
                # are not integer multiples of chunk shape sizes.
                if chunk_shape is None:
                    # Compute expected chunk shape.
                    chunk_shape = get_chunk_shape(shape, chunks, chunk_index)
                # We will only pad the data if the data shape
                # corresponds to the expected chunk's shape.
                if data.shape == chunk_shape:
                    padding = get_chunk_padding(shape, chunks, chunk_index)
                    fill_value = array["fill_value"]
                    data = np.pad(
                        data,
                        padding,
                        mode="constant",
                        constant_values=fill_value or 0,
                    )
                else:
                    key = format_chunk_key(array["name"], chunk_index)
                    raise ValueError(
                        f"{key}:"
                        f" data chunk at {chunk_index}"
                        f" must have shape {chunk_shape},"
                        f" but was {data.shape}"
                    )
            if chunk_encoding == "bytes":
                # Convert to bytes, filter and compress
                data = ndarray_to_bytes(
                    data,
                    order=array["order"],
                    filters=array["filters"],
                    compressor=array["compressor"],
                )

        # Sanity check
        if (chunk_encoding == "bytes" and not isinstance(data, bytes)) or (
            chunk_encoding == "ndarray" and not isinstance(data, np.ndarray)
        ):
            key = format_chunk_key(array["name"], chunk_index)
            expected_type = (
                "numpy.ndarray" if chunk_encoding == "ndarray" else "bytes"
            )
            raise TypeError(
                f"{key}:"
                f" data must be encoded as {expected_type},"
                f" but was {type(data).__name__}"
            )

        return data

    def _parse_array_key(self, key: str) -> Tuple[str, str]:
        array_name_and_value_id = key.rsplit("/", maxsplit=1)
        if len(array_name_and_value_id) != 2:
            raise KeyError(key)
        array_name, value_id = array_name_and_value_id
        if array_name not in self._arrays:
            raise KeyError(key)
        return array_name, value_id

    def _get_array_chunk_index(
        self, array_name: str, index_id: str
    ) -> Tuple[int, ...]:
        try:
            chunk_index = tuple(map(int, index_id.split(".")))
        except (ValueError, TypeError):
            raise KeyError(f"{array_name}/{index_id}")
        array = self._arrays[array_name]
        shape = array["shape"]
        if len(chunk_index) != len(shape):
            raise KeyError(f"{array_name}/{index_id}")
        num_chunks = array["num_chunks"]
        for i, n in zip(chunk_index, num_chunks):
            if not (0 <= i < n):
                raise KeyError(f"{array_name}/{index_id}")
        return chunk_index

    def _get_array_keys(self, array_name: str) -> Iterator[str]:
        yield array_name + "/.zarray"
        yield array_name + "/.zattrs"
        array = self._arrays[array_name]
        num_chunks = array["num_chunks"]
        yield from get_chunk_keys(array_name, num_chunks)

def get_array_slices(
    shape: Tuple[int, ...], chunks: Tuple[int, ...], chunk_index: Tuple[int, ...]
) -> Tuple[slice, ...]:
    return tuple(
        slice(i * c, i * c + (c if (i + 1) * c <= s else s % c))
        for s, c, i in zip(shape, chunks, chunk_index)
    )


def get_chunk_shape(
    shape: Tuple[int, ...], chunks: Tuple[int, ...], chunk_index: Tuple[int, ...]
) -> Tuple[int, ...]:
    return tuple(
        c if (i + 1) * c <= s else s % c
        for s, c, i in zip(shape, chunks, chunk_index)
    )


def get_chunk_padding(
    shape: Tuple[int, ...], chunks: Tuple[int, ...], chunk_index: Tuple[int, ...]
):
    return tuple(
        (0, 0 if (i + 1) * c <= s else c - s % c)
        for s, c, i in zip(shape, chunks, chunk_index)
    )


def get_chunk_indexes(num_chunks: Tuple[int, ...]) -> Iterator[Tuple[int, ...]]:
    if not num_chunks:
        yield (0,)
    else:
        yield from itertools.product(*tuple(map(range, map(int, num_chunks))))


def get_chunk_keys(array_name: str, num_chunks: Tuple[int, ...]) -> Iterator[str]:
    for chunk_index in get_chunk_indexes(num_chunks):
        yield format_chunk_key(array_name, chunk_index)


def format_chunk_key(array_name: str, chunk_index: Tuple[int, ...]) -> str:
    chunk_id = ".".join(map(str, chunk_index))
    return f"{array_name}/{chunk_id}"


def dict_to_bytes(d: Dict) -> bytes:
    return str_to_bytes(json.dumps(d, indent=2))


def str_to_bytes(s: str) -> bytes:
    return bytes(s, encoding="utf-8")


def ndarray_to_bytes(
    data: np.ndarray,
    order: Optional[str] = None,
    filters: Optional[Sequence[Any]] = None,
    compressor: Optional[numcodecs.abc.Codec] = None,
) -> bytes:
    data = data.tobytes(order=order or "C")
    if filters:
        for f in filters:
            data = f.encode(data)
    if compressor is not None:
        data = compressor.encode(data)
    return data
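
# Worked example (hypothetical values) for the chunk geometry helpers:
# with shape (5,) and chunks (2,) there are ceil(5/2) == 3 chunks; the
# last chunk index (2,) addresses a boundary chunk of shape (1,) that
# needs padding (0, 1) so every encoded chunk has shape (2,).
#
# assert get_chunk_shape((5,), (2,), (2,)) == (1,)
# assert get_array_slices((5,), (2,), (2,)) == (slice(4, 5),)
# assert get_chunk_padding((5,), (2,), (2,)) == ((0, 1),)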