Source code for xcube.core.dsio

# The MIT License (MIT)
# Copyright (c) 2019 by the xcube development team and contributors
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# of the Software, and to permit persons to whom the Software is furnished to do
# so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import os
import shutil
import warnings
from abc import ABCMeta, abstractmethod
from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union, Mapping

import pandas as pd
import s3fs
import urllib3.util
import xarray as xr
import zarr

from xcube.constants import EXTENSION_POINT_DATASET_IOS
from xcube.constants import FORMAT_NAME_CSV, FORMAT_NAME_MEM, FORMAT_NAME_NETCDF4, FORMAT_NAME_ZARR
from xcube.core.timeslice import append_time_slice, insert_time_slice, replace_time_slice
from xcube.core.verify import assert_cube
from xcube.util.plugin import ExtensionComponent, get_extension_registry


[docs]def open_cube(input_path: str,
              format_name: str = None,
              **kwargs) -> xr.Dataset:
    """
    Open a xcube dataset from *input_path*.
    If *format* is not provided it will be guessed from *input_path*.
    :param input_path: input path
    :param format_name: format, e.g. "zarr" or "netcdf4"
    :param kwargs: format-specific keyword arguments
    :return: xcube dataset
    """
    return open_dataset(input_path, format_name=format_name, is_cube=True, **kwargs)


[docs]def write_cube(cube: xr.Dataset,
               output_path: str,
               format_name: str = None,
               cube_asserted: bool = False,
               **kwargs) -> xr.Dataset:
    """
    Write a xcube dataset to *output_path*.
    If *format* is not provided it will be guessed from *output_path*.
    :param cube: xcube dataset to be written.
    :param output_path: output path
    :param format_name: format, e.g. "zarr" or "netcdf4"
    :param kwargs: format-specific keyword arguments
    :param cube_asserted: If False, *cube* will be verified, otherwise it is expected to be a valid cube.
    :return: xcube dataset *cube*
    """
    if not cube_asserted:
        assert_cube(cube)
    return write_dataset(cube, output_path, format_name=format_name, **kwargs)


def open_dataset(input_path: str,
                 format_name: str = None,
                 is_cube: bool = False,
                 **kwargs) -> xr.Dataset:
    """
    Open a dataset from *input_path*.
    If *format* is not provided it will be guessed from *output_path*.
    :param input_path: input path
    :param format_name: format, e.g. "zarr" or "netcdf4"
    :param is_cube: Whether a ValueError will be raised, if the dataset read from *input_path* is not a xcube dataset.
    :param kwargs: format-specific keyword arguments
    :return: dataset object
    """
    format_name = format_name if format_name else guess_dataset_format(input_path)
    if format_name is None:
        raise ValueError("Unknown input format")
    dataset_io = find_dataset_io(format_name, modes=["r"])
    if dataset_io is None:
        raise ValueError(f"Unknown input format {format_name!r} for {input_path}")
    dataset = dataset_io.read(input_path, **kwargs)
    if is_cube:
        assert_cube(dataset)
    return dataset


def write_dataset(dataset: xr.Dataset,
                  output_path: str,
                  format_name: str = None,
                  **kwargs) -> xr.Dataset:
    """
    Write dataset to *output_path*.
    If *format* is not provided it will be guessed from *output_path*.
    :param dataset: Dataset to be written.
    :param output_path: output path
    :param format_name: format, e.g. "zarr" or "netcdf4"
    :param kwargs: format-specific keyword arguments
    :return: the input dataset
    """
    format_name = format_name if format_name else guess_dataset_format(output_path)
    if format_name is None:
        raise ValueError("Unknown output format")
    dataset_io = find_dataset_io(format_name, modes=["w"])
    if dataset_io is None:
        raise ValueError(f"Unknown output format {format_name!r} for {output_path}")
    dataset_io.write(dataset, output_path, **kwargs)
    return dataset


class DatasetIO(ExtensionComponent, metaclass=ABCMeta):
    """
    An abstract base class that represents dataset input/output.
    :param name: A unique dataset I/O identifier.
    """

    def __init__(self, name: str):
        super().__init__(EXTENSION_POINT_DATASET_IOS, name)

    @property
    def description(self) -> str:
        """
        :return: A description for this input processor
        """
        return self.get_metadata_attr('description', '')

    @property
    def ext(self) -> str:
        """The primary filename extension used by this dataset I/O."""
        return self.get_metadata_attr('ext', '')

    @property
    def modes(self) -> Set[str]:
        """
        A set describing the modes of this dataset I/O.
        Must be one or more of "r" (read), "w" (write), and "a" (append).
        """
        return self.get_metadata_attr('modes', set())

    @abstractmethod
    def fitness(self, path: str, path_type: str = None) -> float:
        """
        Compute a fitness of this dataset I/O in the interval [0 to 1]
        for reading/writing from/to the given *path*.
        :param path: The path or URL.
        :param path_type: Either "file", "dir", "url", or None.
        :return: the chance in range [0 to 1]
        """
        return 0.0

    def read(self, input_path: str, **kwargs) -> xr.Dataset:
        """Read a dataset from *input_path* using format-specific read parameters *kwargs*."""
        raise NotImplementedError()

    def write(self, dataset: xr.Dataset, output_path: str, **kwargs):
        """"Write *dataset* to *output_path* using format-specific write parameters *kwargs*."""
        raise NotImplementedError()

    def append(self, dataset: xr.Dataset, output_path: str, **kwargs):
        """"Append *dataset* to existing *output_path* using format-specific write parameters *kwargs*."""
        raise NotImplementedError()

    def insert(self, dataset: xr.Dataset, index: int, output_path: str, **kwargs):
        """"Insert *dataset* at *index* into existing *output_path* using format-specific write parameters *kwargs*."""
        raise NotImplementedError()

    def replace(self, dataset: xr.Dataset, index: int, output_path: str, **kwargs):
        """"Replace *dataset* at *index* in existing *output_path* using format-specific write parameters *kwargs*."""
        raise NotImplementedError()

    def update(self, output_path: str, global_attrs: Dict[str, Any] = None, **kwargs):
        """"Update *dataset* at *output_path* using format-specific open parameters *kwargs*."""
        raise NotImplementedError()


def get_extension(name: str):
    return get_extension_registry().get_extension(EXTENSION_POINT_DATASET_IOS, name)


def find_dataset_io_by_name(name: str):
    extension = get_extension(name)
    if not extension:
        return None
    return extension.component


def find_dataset_io(format_name: str, modes: Iterable[str] = None, default: DatasetIO = None) -> Optional[DatasetIO]:
    modes = set(modes) if modes else None
    format_name = format_name.lower()
    dataset_ios = get_extension_registry().find_components(EXTENSION_POINT_DATASET_IOS)
    for dataset_io in dataset_ios:
        # noinspection PyUnresolvedReferences
        if format_name == dataset_io.name.lower():
            # noinspection PyTypeChecker
            if not modes or modes.issubset(dataset_io.modes):
                return dataset_io
    for dataset_io in dataset_ios:
        # noinspection PyUnresolvedReferences
        if format_name == dataset_io.ext.lower():
            # noinspection PyTypeChecker
            if not modes or modes.issubset(dataset_io.modes):
                return dataset_io
    return default


def guess_dataset_format(path: str) -> Optional[str]:
    """
    Guess a dataset format for a file system path or URL given by *path*.
    :param path: A file system path or URL.
    :return: The name of a dataset format guessed from *path*.
    """
    dataset_io_fitness_list = guess_dataset_ios(path)
    if dataset_io_fitness_list:
        return dataset_io_fitness_list[0][0].name
    return None


def guess_dataset_ios(path: str) -> List[Tuple[DatasetIO, float]]:
    """
    Guess suitable DatasetIO objects for a file system path or URL given by *path*.
    Returns a list of (DatasetIO, fitness) tuples, sorted by descending fitness values.
    Fitness values are in the interval (0, 1].
    The first entry is the most appropriate DatasetIO object.
    :param path: A file system path or URL.
    :return: A list of (DatasetIO, fitness) tuples.
    """
    if os.path.isfile(path):
        input_type = "file"
    elif os.path.isdir(path):
        input_type = "dir"
    elif path.find("://") > 0:
        input_type = "url"
    else:
        input_type = None

    dataset_ios = get_extension_registry().find_components(EXTENSION_POINT_DATASET_IOS)

    dataset_io_fitness_list = []
    for dataset_io in dataset_ios:
        fitness = dataset_io.fitness(path, path_type=input_type)
        if fitness > 0.0:
            dataset_io_fitness_list.append((dataset_io, fitness))

    dataset_io_fitness_list.sort(key=lambda item: -item[1])
    return dataset_io_fitness_list


def _get_ext(path: str) -> Optional[str]:
    _, ext = os.path.splitext(path)
    return ext.lower()


def query_dataset_io(filter_fn: Callable[[DatasetIO], bool] = None) -> List[DatasetIO]:
    dataset_ios = get_extension_registry().find_components(EXTENSION_POINT_DATASET_IOS)
    if filter_fn is None:
        return dataset_ios
    return list(filter(filter_fn, dataset_ios))


# noinspection PyAbstractClass
class MemDatasetIO(DatasetIO):
    """
    An in-memory  dataset I/O. Keeps all datasets in a dictionary.
    :param datasets: The initial datasets as a path to dataset mapping.
    """

    def __init__(self, datasets: Dict[str, xr.Dataset] = None):
        super().__init__(FORMAT_NAME_MEM)
        self._datasets = datasets or {}

    @property
    def datasets(self) -> Dict[str, xr.Dataset]:
        return self._datasets

    def fitness(self, path: str, path_type: str = None) -> float:
        if path in self._datasets:
            return 1.0
        ext_value = _get_ext(path) == ".mem"
        type_value = 0.0
        return (3 * ext_value + type_value) / 4

    def read(self, path: str, **kwargs) -> xr.Dataset:
        if path in self._datasets:
            return self._datasets[path]
        raise FileNotFoundError(path)

    def write(self, dataset: xr.Dataset, path: str, **kwargs):
        self._datasets[path] = dataset

    def append(self, dataset: xr.Dataset, path: str, **kwargs):
        if path in self._datasets:
            old_ds = self._datasets[path]
            # noinspection PyTypeChecker
            self._datasets[path] = xr.concat([old_ds, dataset],
                                             dim='time',
                                             data_vars='minimal',
                                             coords='minimal',
                                             compat='equals')
        else:
            self._datasets[path] = dataset.copy()

    def update(self, output_path: str, global_attrs: Dict[str, Any] = None, **kwargs):
        if global_attrs:
            ds = self._datasets[output_path]
            ds.attrs.update(global_attrs)


class Netcdf4DatasetIO(DatasetIO):
    """
    A dataset I/O that reads from / writes to NetCDF files.
    """

    def __init__(self):
        super().__init__(FORMAT_NAME_NETCDF4)

    def fitness(self, path: str, path_type: str = None) -> float:
        ext = _get_ext(path)
        ext_value = ext in {'.nc', '.hdf', '.h5'}
        type_value = 0.0
        if path_type == "file":
            type_value = 1.0
        elif path_type is None:
            type_value = 0.5
        else:
            ext_value = 0.0
        return (3 * ext_value + type_value) / 4

    def read(self, input_path: str, **kwargs) -> xr.Dataset:
        return xr.open_dataset(input_path, **kwargs)

    def write(self, dataset: xr.Dataset, output_path: str, **kwargs):
        dataset.to_netcdf(output_path)

    def append(self, dataset: xr.Dataset, output_path: str, **kwargs):
        import os
        temp_path = output_path + '.temp.nc'
        os.rename(output_path, temp_path)
        old_ds = xr.open_dataset(temp_path)
        new_ds = xr.concat([old_ds, dataset],
                           dim='time',
                           data_vars='minimal',
                           coords='minimal',
                           compat='equals')
        # noinspection PyUnresolvedReferences
        new_ds.to_netcdf(output_path)
        old_ds.close()
        rimraf(temp_path)

    def update(self, output_path: str, global_attrs: Dict[str, Any] = None, **kwargs):
        if global_attrs:
            import netCDF4
            ds = netCDF4.Dataset(output_path, 'r+')
            ds.setncatts(global_attrs)
            ds.close()


class ZarrDatasetIO(DatasetIO):
    """
    A dataset I/O that reads from / writes to Zarr directories or archives.
    """

    def __init__(self):
        super().__init__(FORMAT_NAME_ZARR)

    def fitness(self, path: str, path_type: str = None) -> float:
        ext = _get_ext(path)
        ext_value = 0.0
        type_value = 0.0
        if ext == ".zarr":
            ext_value = 1.0
            if path_type == "dir":
                type_value = 1.0
            elif path_type == "url" or path_type is None:
                type_value = 0.5
            else:
                ext_value = 0.0
        else:
            lower_path = path.lower()
            if lower_path.endswith(".zarr.zip"):
                ext_value = 1.0
                if path_type == "file":
                    type_value = 1.0
                elif path_type is None:
                    type_value = 0.5
                else:
                    ext_value = 0.0
            else:
                if path_type == "dir":
                    type_value = 1.0
                elif path_type == "url":
                    type_value = 0.5
        return (3 * ext_value + type_value) / 4

    def read(self,
             path: str,
             s3_kwargs: Dict[str, Any] = None,
             s3_client_kwargs: Dict[str, Any] = None,
             max_cache_size: int = None,
             **kwargs) -> xr.Dataset:
        """
        Read dataset from some Zarr storage.
        :param path: File path or object storage URL.
        :param s3_kwargs: if *path* is an object storage URL, keyword-arguments passed to S3 file system,
            that is ``s3fs.S3FileSystem(**s3_kwargs, ...)``.
        :param s3_client_kwargs: if *path* is an object storage URL, keyword-arguments passed to S3 (boto3) client,
            that is ``s3fs.S3FileSystem(..., client_kwargs=s3_client_kwargs)``.
        :param max_cache_size: if this is a positive integer, the store will be wrapped in an in-memory cache,
            that is ``store = zarr.LRUStoreCache(store, max_size=max_cache_size)``.
        :param kwargs: Keyword-arguments passed to xarray Zarr adapter,
            that is ``xarray.open_zarr(..., **kwargs)``. In addition, the parameter **
        :return:
        """
        path_or_store = path
        consolidated = False
        if isinstance(path, str):
            path_or_store, consolidated = get_path_or_s3_store(path_or_store,
                                                               s3_kwargs=s3_kwargs,
                                                               s3_client_kwargs=s3_client_kwargs,
                                                               mode='r')
            if max_cache_size is not None and max_cache_size > 0:
                path_or_store = zarr.LRUStoreCache(path_or_store, max_size=max_cache_size)
        return xr.open_zarr(path_or_store, consolidated=consolidated, **kwargs)

    def write(self,
              dataset: xr.Dataset,
              output_path: str,
              compressor: Dict[str, Any] = None,
              chunksizes: Dict[str, int] = None,
              packing: Dict[str, Dict[str, Any]] = None,
              s3_kwargs: Dict[str, Any] = None,
              s3_client_kwargs: Dict[str, Any] = None,
              **kwargs):
        path_or_store, consolidated = get_path_or_s3_store(output_path,
                                                           s3_kwargs=s3_kwargs,
                                                           s3_client_kwargs=s3_client_kwargs,
                                                           mode='w')
        encoding = self._get_write_encodings(dataset, compressor, chunksizes, packing)
        dataset.to_zarr(path_or_store, mode='w', encoding=encoding, **kwargs)

    @classmethod
    def _get_write_encodings(cls, dataset, compressor, chunksizes, packing):
        encoding = None
        if chunksizes:
            encoding = {}
            for var_name in dataset.data_vars:
                var = dataset[var_name]
                chunks: List[int] = []
                for i in range(len(var.dims)):
                    dim_name = var.dims[i]
                    if dim_name in chunksizes:
                        chunks.append(chunksizes[dim_name])
                    else:
                        chunks.append(var.shape[i])
                encoding[var_name] = dict(chunks=chunks)
        if packing:
            if encoding:
                for var_name in packing.keys():
                    if var_name in encoding.keys():
                        encoding[var_name].update(dict(packing[var_name]))
                    else:
                        encoding[var_name] = dict(packing[var_name])
            else:
                encoding = {}
                for var_name in packing.keys():
                    encoding[var_name] = dict(packing[var_name])

        if compressor:
            compressor = zarr.Blosc(**compressor)

            if encoding:
                for var_name in encoding.keys():
                    encoding[var_name].update(compressor=compressor)
            else:
                encoding = {var_name: dict(compressor=compressor) for var_name in dataset.data_vars}
        return encoding

    def append(self, dataset: xr.Dataset, output_path: str, **kwargs):
        append_time_slice(output_path, dataset)

    def insert(self, dataset: xr.Dataset, index: int, output_path: str, **kwargs):
        insert_time_slice(output_path, index, dataset)

    def replace(self, dataset: xr.Dataset, index: int, output_path: str, **kwargs):
        replace_time_slice(output_path, index, dataset)

    def update(self, output_path: str, global_attrs: Dict[str, Any] = None, **kwargs):
        if global_attrs:
            import zarr
            ds = zarr.open_group(output_path, mode='r+', **kwargs)
            ds.attrs.update(global_attrs)


# noinspection PyAbstractClass
class CsvDatasetIO(DatasetIO):
    """
    A dataset I/O that reads from / writes to CSV files.
    """

    def __init__(self):
        super().__init__(FORMAT_NAME_CSV)

    def fitness(self, path: str, path_type: str = None) -> float:
        if path_type == "dir":
            return 0.0
        ext = _get_ext(path)
        ext_value = {".csv": 1.0, ".txt": 0.5, ".dat": 0.2}.get(ext, 0.1)
        type_value = {"file": 1.0, "url": 0.5, None: 0.5}.get(path_type, 0.0)
        return (3 * ext_value + type_value) / 4

    def read(self, path: str, **kwargs) -> xr.Dataset:
        return xr.Dataset.from_dataframe(pd.read_csv(path, **kwargs))

    def write(self, dataset: xr.Dataset, output_path: str, **kwargs):
        dataset.to_dataframe().to_csv(output_path, **kwargs)


def rimraf(path):
    """
    The UNIX command `rm -rf` for xcube.
    Recursively remove directory or single file.
    :param path:  directory or single file
    """
    if os.path.isdir(path):
        try:
            shutil.rmtree(path, ignore_errors=False)
        except OSError:
            warnings.warn(f"failed to remove file {path}")
    elif os.path.isfile(path):
        try:
            os.remove(path)
        except OSError:
            warnings.warn(f'failed to remove file {path}')
            pass


def get_path_or_s3_store(path_or_url: str,
                         s3_kwargs: Mapping[str, Any] = None,
                         s3_client_kwargs: Mapping[str, Any] = None,
                         mode: str = 'r') -> Tuple[Union[str, Dict], bool]:
    """
    If *path_or_url* is an object storage URL, return a object storage Zarr store (mapping object)
    using *s3_client_kwargs* and *mode* and a flag indicating whether the Zarr datasets is consolidated.

    Otherwise *path_or_url* is interpreted as a local file system path, retured as-is plus
    a flag indicating whether the Zarr datasets is consolidated.

    :param path_or_url: A path or a URL.
    :param s3_kwargs: keyword arguments for S3 file system.
    :param s3_client_kwargs: keyword arguments for S3 boto3 client.
    :param mode: "r" or "w"
    :return: A tuple (path_or_obs_store, consolidated).
    """
    if is_s3_url(path_or_url) or s3_kwargs is not None or s3_client_kwargs is not None:
        root, s3_kwargs, s3_client_kwargs = parse_s3_url_and_kwargs(path_or_url,
                                                                    s3_kwargs=s3_kwargs,
                                                                    s3_client_kwargs=s3_client_kwargs)
        s3 = s3fs.S3FileSystem(**s3_kwargs, client_kwargs=s3_client_kwargs)
        consolidated = mode == "r" and s3.exists(f'{root}/.zmetadata')
        return s3fs.S3Map(root=root, s3=s3, check=False, create=mode == "w"), consolidated
    else:
        consolidated = os.path.exists(os.path.join(path_or_url, '.zmetadata'))
        return path_or_url, consolidated


def parse_s3_url_and_kwargs(s3_url: str,
                            s3_kwargs: Mapping[str, Any] = None,
                            s3_client_kwargs: Mapping[str, Any] = None) \
        -> Tuple[str, Dict[str, Any], Dict[str, Any]]:
    """
    Parses *obs_url*, *s3_kwargs*, *s3_client_kwargs* and returns a
    new tuple (*root*, *s3_kwargs*, *s3_client_kwargs*) with updated kwargs whose elements
    can be passed to the s3fs.S3FileSystem and s3fs.S3Map constructors as follows:::

        obs_fs = s3fs.S3FileSystem(**s3_kwargs, client_kwargs=s3_client_kwargs)
        obs_map = s3fs.S3Map(root=root, s3=obs_fs)

    :param s3_url: Object storage URL, e.g. "s3://bucket/root", or "https://bucket.s3.amazonaws.com/root".
    :param s3_kwargs: keyword arguments for S3 file system.
    :param s3_client_kwargs: keyword arguments for S3 boto3 client.
    :return: A tuple (root, s3_kwargs, s3_client_kwargs).
    """
    endpoint_url, root = split_s3_url(s3_url)

    new_s3_client_kwargs = dict(s3_client_kwargs) if s3_client_kwargs else dict()
    if endpoint_url:
        new_s3_client_kwargs['endpoint_url'] = endpoint_url

    # The following key + secret kwargs are no longer supported in client_kwargs and are now moved into s3_kwargs
    key = secret = None
    if 'provider_access_key_id' in new_s3_client_kwargs:
        key = new_s3_client_kwargs.pop('provider_access_key_id')
    if 'aws_access_key_id' in new_s3_client_kwargs:
        key = new_s3_client_kwargs.pop('aws_access_key_id')
    if 'provider_secret_access_key' in new_s3_client_kwargs:
        secret = new_s3_client_kwargs.pop('provider_secret_access_key')
    if 'aws_secret_access_key' in new_s3_client_kwargs:
        secret = new_s3_client_kwargs.pop('aws_secret_access_key')

    new_s3_kwargs = dict(key=key, secret=secret)
    if s3_kwargs:
        new_s3_kwargs.update(**s3_kwargs)

    return root, new_s3_kwargs, new_s3_client_kwargs


def split_s3_url(path: str) -> Tuple[Optional[str], str]:
    """
    If *path* is a URL, return tuple (endpoint_url, root), otherwise (None, *path*)
    """
    url = urllib3.util.parse_url(path)
    if all((url.scheme, url.host, url.path)) and url.scheme != 's3':
        if url.port is not None:
            endpoint_url = f'{url.scheme}://{url.host}:{url.port}'
        else:
            endpoint_url = f'{url.scheme}://{url.host}'
        root = url.path
        if root.startswith('/'):
            root = root[1:]
        return endpoint_url, root
    return None, path


def is_s3_url(path_or_url: str) -> bool:
    """
    Test if *path_or_url* is a potential object storage URL.

    :param path_or_url: Path or URL to test.
    :return: True, if so.
    """
    return path_or_url.startswith("https://") \
           or path_or_url.startswith("http://") \
           or path_or_url.startswith("s3://")