pandas_openscm.db.netcdf#

netCDF backend

Powered by xarray

Classes:

Name	Description
`netCDFDataBackend`	netCDF data backend
`netCDFIndexBackend`	netCDF index backend

Functions:

Name	Description
`metadata_df_to_xr`	Convert metadata to a format that can be used with xarray
`metadata_xr_to_df`	Convert metadata in xarray form to pandas

netCDFDataBackend #

netCDF data backend

Methods:

Name	Description
`load_data`	Load a data file
`save_data`	Save data to disk

Attributes:

Name	Type	Description
`ext`	`str`	Extension to use with files saved by this backend.
`preserves_index`	`Literal[True]`	Whether this backend preserves the index of data upon (de-)serialisation
`timeseries_dim`	`str`	Name of the timeseries dimension in serialised output

Source code in src/pandas_openscm/db/netcdf.py

@define
class netCDFDataBackend:
    """
    netCDF data backend
    """

    ext: str = ".nc"
    """
    Extension to use with files saved by this backend.
    """

    timeseries_dim: str = "ts_id"
    """
    Name of the timeseries dimension in serialised output
    """

    @property
    def preserves_index(self) -> Literal[True]:
        """
        Whether this backend preserves the index of data upon (de-)serialisation
        """
        return True

    def load_data(self, data_file: Path) -> pd.DataFrame:
        """
        Load a data file

        Parameters
        ----------
        data_file
            File from which to load the data

        Returns
        -------
        :
            Loaded data
        """
        try:
            import xarray as xr  # noqa: PLC0415
        except ImportError as exc:
            raise MissingOptionalDependencyError(
                "netCDFBackend.load_data", requirement="xarray"
            ) from exc

        raw = xr.load_dataset(data_file)

        data: pd.DataFrame = raw["values"].to_pandas()  # type: ignore
        index = metadata_xr_to_df(raw)
        index_concat = index.loc[raw[self.timeseries_dim].values]

        res = pd.concat([index_concat, data], axis="columns").set_index(
            index.columns.to_list()
        )

        return res

    def save_data(self, data: pd.DataFrame, data_file: Path) -> None:
        """
        Save data to disk

        Parameters
        ----------
        data
            Data to save

        data_file
            File in which to save the data
        """
        try:
            import xarray as xr  # noqa: PLC0415
        except ImportError as exc:
            raise MissingOptionalDependencyError(
                "netCDFBackend.save_data", requirement="xarray"
            ) from exc

        # Resetting the index will also give each timeseries a unique ID
        data_rs = data.reset_index()
        timeseries_coord_info = {self.timeseries_dim: data_rs.index.values}
        if data.columns.name is None:
            time_dim = "time"
        else:
            time_dim = str(data.columns.name)

        time_coord_info = {time_dim: data.columns.values}

        data_index_xr = metadata_df_to_xr(
            data_rs[data.index.names],
            timeseries_id_coord=xr.Coordinates(timeseries_coord_info),
            timeseries_dim=self.timeseries_dim,
        )
        data_values_xr = xr.DataArray(
            data,
            dims=[self.timeseries_dim, time_dim],
            coords=xr.Coordinates(timeseries_coord_info | time_coord_info),
        )
        data_xr = xr.merge([data_index_xr, data_values_xr.to_dataset(name="values")])
        data_xr.to_netcdf(data_file)

ext `class-attribute` `instance-attribute` #

ext: str = '.nc'

Extension to use with files saved by this backend.

preserves_index `property` #

preserves_index: Literal[True]

Whether this backend preserves the index of data upon (de-)serialisation

timeseries_dim `class-attribute` `instance-attribute` #

timeseries_dim: str = 'ts_id'

Name of the timeseries dimension in serialised output

load_data #

load_data(data_file: Path) -> DataFrame

Load a data file

Parameters:

Name	Type	Description	Default
`data_file`	`Path`	File from which to load the data	required

Returns:

Type	Description
`DataFrame`	Loaded data

Source code in src/pandas_openscm/db/netcdf.py

def load_data(self, data_file: Path) -> pd.DataFrame:
    """
    Load a data file

    Parameters
    ----------
    data_file
        File from which to load the data

    Returns
    -------
    :
        Loaded data
    """
    try:
        import xarray as xr  # noqa: PLC0415
    except ImportError as exc:
        raise MissingOptionalDependencyError(
            "netCDFBackend.load_data", requirement="xarray"
        ) from exc

    raw = xr.load_dataset(data_file)

    data: pd.DataFrame = raw["values"].to_pandas()  # type: ignore
    index = metadata_xr_to_df(raw)
    index_concat = index.loc[raw[self.timeseries_dim].values]

    res = pd.concat([index_concat, data], axis="columns").set_index(
        index.columns.to_list()
    )

    return res

save_data #

save_data(data: DataFrame, data_file: Path) -> None

Save data to disk

Parameters:

Name	Type	Description	Default
`data`	`DataFrame`	Data to save	required
`data_file`	`Path`	File in which to save the data	required

Source code in src/pandas_openscm/db/netcdf.py

def save_data(self, data: pd.DataFrame, data_file: Path) -> None:
    """
    Save data to disk

    Parameters
    ----------
    data
        Data to save

    data_file
        File in which to save the data
    """
    try:
        import xarray as xr  # noqa: PLC0415
    except ImportError as exc:
        raise MissingOptionalDependencyError(
            "netCDFBackend.save_data", requirement="xarray"
        ) from exc

    # Resetting the index will also give each timeseries a unique ID
    data_rs = data.reset_index()
    timeseries_coord_info = {self.timeseries_dim: data_rs.index.values}
    if data.columns.name is None:
        time_dim = "time"
    else:
        time_dim = str(data.columns.name)

    time_coord_info = {time_dim: data.columns.values}

    data_index_xr = metadata_df_to_xr(
        data_rs[data.index.names],
        timeseries_id_coord=xr.Coordinates(timeseries_coord_info),
        timeseries_dim=self.timeseries_dim,
    )
    data_values_xr = xr.DataArray(
        data,
        dims=[self.timeseries_dim, time_dim],
        coords=xr.Coordinates(timeseries_coord_info | time_coord_info),
    )
    data_xr = xr.merge([data_index_xr, data_values_xr.to_dataset(name="values")])
    data_xr.to_netcdf(data_file)

netCDFIndexBackend #

netCDF index backend

Methods:

Name	Description
`load_file_map`	Load the database's file map
`load_index`	Load the index
`save_file_map`	Save the file map to disk
`save_index`	Save the index to disk

Attributes:

Name	Type	Description
`ext`	`str`	Extension to use with files saved by this backend.
`preserves_index`	`Literal[True]`	Whether this backend preserves the `pd.MultiIndex` upon (de-)serialisation
`timeseries_dim`	`str`	Name of the timeseries dimension in serialised output

Source code in src/pandas_openscm/db/netcdf.py

@define
class netCDFIndexBackend:
    """
    netCDF index backend
    """

    ext: str = ".nc"
    """
    Extension to use with files saved by this backend.
    """

    timeseries_dim: str = "ts_id"
    """
    Name of the timeseries dimension in serialised output
    """

    @property
    def preserves_index(self) -> Literal[True]:
        """
        Whether this backend preserves the `pd.MultiIndex` upon (de-)serialisation
        """
        return True

    @staticmethod
    def load_file_map(file_map_file: Path) -> pd.DataFrame:
        """
        Load the database's file map

        Parameters
        ----------
        file_map_file
            File from which to load the file map

        Returns
        -------
        :
            Loaded file map
        """
        try:
            import xarray as xr  # noqa: PLC0415
        except ImportError as exc:
            raise MissingOptionalDependencyError(
                "netCDFBackend.load_file_map", requirement="xarray"
            ) from exc

        res = xr.load_dataset(file_map_file).to_pandas()
        if isinstance(res, pd.Series):  # pragma: no cover
            raise TypeError(res)

        return res

    @staticmethod
    def load_index(index_file: Path) -> pd.DataFrame:
        """
        Load the index

        Parameters
        ----------
        index_file
            File from which to load the index

        Returns
        -------
        :
            Loaded index
        """
        try:
            import xarray as xr  # noqa: PLC0415
        except ImportError as exc:
            raise MissingOptionalDependencyError(
                "netCDFBackend.load_index", requirement="xarray"
            ) from exc

        raw = xr.load_dataset(index_file)

        intermediate = metadata_xr_to_df(raw)
        res = intermediate.set_index(
            intermediate.columns.difference(["file_id"]).to_list()
        )

        return res

    @staticmethod
    def save_file_map(
        file_map: pd.Series[Path],  # type: ignore # pandas confused about what it supports
        file_map_file: Path,
    ) -> None:
        """
        Save the file map to disk

        Parameters
        ----------
        file_map
            File map to save

        file_map_file
            File in which to save the file map
        """
        try:
            import xarray as xr  # noqa: PLC0415
        except ImportError as exc:
            raise MissingOptionalDependencyError(
                "netCDFBackend.save_file_map", requirement="xarray"
            ) from exc

        file_map_xr = xr.DataArray.from_series(file_map.astype(str))
        file_map_xr.to_netcdf(file_map_file)

    def save_index(
        self,
        index: pd.DataFrame,
        index_file: Path,
    ) -> None:
        """
        Save the index to disk

        Parameters
        ----------
        index
            Index to save

        index_file
            File in which to save the index
        """
        # Use a different name because the timeseries IDs in the index
        # won't necessarily line up with those in the file(s).
        # This should not matter for users, who never see them side-by-side,
        # but just in case.
        index_xr = metadata_df_to_xr(
            # Have to reset the index so we can serialise to disk
            index.reset_index(),
            timeseries_dim=f"{self.timeseries_dim}_index",
        )
        index_xr.to_netcdf(index_file)

ext `class-attribute` `instance-attribute` #

ext: str = '.nc'

Extension to use with files saved by this backend.

preserves_index `property` #

preserves_index: Literal[True]

Whether this backend preserves the pd.MultiIndex upon (de-)serialisation

timeseries_dim `class-attribute` `instance-attribute` #

timeseries_dim: str = 'ts_id'

Name of the timeseries dimension in serialised output

load_file_map `staticmethod` #

load_file_map(file_map_file: Path) -> DataFrame

Load the database's file map

Parameters:

Name	Type	Description	Default
`file_map_file`	`Path`	File from which to load the file map	required

Returns:

Type	Description
`DataFrame`	Loaded file map

Source code in src/pandas_openscm/db/netcdf.py

@staticmethod
def load_file_map(file_map_file: Path) -> pd.DataFrame:
    """
    Load the database's file map

    Parameters
    ----------
    file_map_file
        File from which to load the file map

    Returns
    -------
    :
        Loaded file map
    """
    try:
        import xarray as xr  # noqa: PLC0415
    except ImportError as exc:
        raise MissingOptionalDependencyError(
            "netCDFBackend.load_file_map", requirement="xarray"
        ) from exc

    res = xr.load_dataset(file_map_file).to_pandas()
    if isinstance(res, pd.Series):  # pragma: no cover
        raise TypeError(res)

    return res

load_index `staticmethod` #

load_index(index_file: Path) -> DataFrame

Load the index

Parameters:

Name	Type	Description	Default
`index_file`	`Path`	File from which to load the index	required

Returns:

Type	Description
`DataFrame`	Loaded index

Source code in src/pandas_openscm/db/netcdf.py

@staticmethod
def load_index(index_file: Path) -> pd.DataFrame:
    """
    Load the index

    Parameters
    ----------
    index_file
        File from which to load the index

    Returns
    -------
    :
        Loaded index
    """
    try:
        import xarray as xr  # noqa: PLC0415
    except ImportError as exc:
        raise MissingOptionalDependencyError(
            "netCDFBackend.load_index", requirement="xarray"
        ) from exc

    raw = xr.load_dataset(index_file)

    intermediate = metadata_xr_to_df(raw)
    res = intermediate.set_index(
        intermediate.columns.difference(["file_id"]).to_list()
    )

    return res

save_file_map `staticmethod` #

save_file_map(
    file_map: Series[Path], file_map_file: Path
) -> None

Save the file map to disk

Parameters:

Name	Type	Description	Default
`file_map`	`Series[Path]`	File map to save	required
`file_map_file`	`Path`	File in which to save the file map	required

Source code in src/pandas_openscm/db/netcdf.py

@staticmethod
def save_file_map(
    file_map: pd.Series[Path],  # type: ignore # pandas confused about what it supports
    file_map_file: Path,
) -> None:
    """
    Save the file map to disk

    Parameters
    ----------
    file_map
        File map to save

    file_map_file
        File in which to save the file map
    """
    try:
        import xarray as xr  # noqa: PLC0415
    except ImportError as exc:
        raise MissingOptionalDependencyError(
            "netCDFBackend.save_file_map", requirement="xarray"
        ) from exc

    file_map_xr = xr.DataArray.from_series(file_map.astype(str))
    file_map_xr.to_netcdf(file_map_file)

save_index #

save_index(index: DataFrame, index_file: Path) -> None

Save the index to disk

Parameters:

Name	Type	Description	Default
`index`	`DataFrame`	Index to save	required
`index_file`	`Path`	File in which to save the index	required

Source code in src/pandas_openscm/db/netcdf.py

def save_index(
    self,
    index: pd.DataFrame,
    index_file: Path,
) -> None:
    """
    Save the index to disk

    Parameters
    ----------
    index
        Index to save

    index_file
        File in which to save the index
    """
    # Use a different name because the timeseries IDs in the index
    # won't necessarily line up with those in the file(s).
    # This should not matter for users, who never see them side-by-side,
    # but just in case.
    index_xr = metadata_df_to_xr(
        # Have to reset the index so we can serialise to disk
        index.reset_index(),
        timeseries_dim=f"{self.timeseries_dim}_index",
    )
    index_xr.to_netcdf(index_file)

metadata_df_to_xr #

metadata_df_to_xr(
    metadata: DataFrame,
    timeseries_id_coord: Coordinates | None = None,
    timeseries_dim: str = "ts_id",
    map_suffix: str = "_map",
    int_suffix: str = "_int",
) -> Dataset

Convert metadata to a format that can be used with xarray

This assumes that each row is the metadata for an individual timeseries and that the index of metadata should be used for the unique ID of each timeseries.

This avoids the metadata creating super sparse arrays, at the expense of being harder to work with if directly used with xarray. If you need to work with super sparse data and want to work with it in a lower-dimensional data structure to avoid sparsity issues, our advice would be to use pandas. For docs on pandas support for higher dimensional data, see https://pandas.pydata.org/docs/user_guide/advanced.html#hierarchical-indexing-multiindex.

Parameters:

Name	Type	Description	Default
`metadata`	`DataFrame`	Metadata to convert	required
`timeseries_id_coord`	`Coordinates \| None`	Co-ordinate to use for the timeseries ID. If not supplied, this will be created from `metadata.index`.	`None`
`timeseries_dim`	`str`	Name of the dimension to use to label timeseries in the output	`'ts_id'`
`map_suffix`	`str`	Suffix to add to metadata names to get their mapping variable	`'_map'`
`int_suffix`	`str`	Suffix to add to metadata names to get their integer equivalent.	`'_int'`

Returns:

Type	Description
`Dataset`	Metadata, converted to a 'flat' xarray object.

Raises:

Type	Description
`MissingOptionalDependencyError`	xarray is not installed.
`AssertionError`	`timeseries_id_coord` is `None` and the index of `metadata` is not already unique.

See Also

metadata_xr_to_df

Examples:

>>> import pandas as pd
>>>
>>> metadata = pd.DataFrame(
...     [
...         ["scen_a", "mod_a", "var_a"],
...         ["scen_b", "mod_b", "var_b"],
...         ["scen_a", "mod_b", "var_a"],
...     ],
...     columns=["scenario", "model", "variable"],
... )
>>> metadata_df_to_xr(metadata)
<xarray.Dataset> Size: ...B
Dimensions:       (scenario_map: 2, ts_id: 3, model_map: 2, variable_map: 2)
Coordinates:
  * scenario_map  (scenario_map) int64 16B 0 1
  * ts_id         (ts_id) int64 24B 0 1 2
  * model_map     (model_map) int64 16B 0 1
  * variable_map  (variable_map) int64 16B 0 1
Data variables:
    scenario      (scenario_map) <U6 48B 'scen_a' 'scen_b'
    scenario_int  (ts_id) int64 24B 0 1 0
    model         (model_map) <U5 40B 'mod_a' 'mod_b'
    model_int     (ts_id) int64 24B 0 1 1
    variable      (variable_map) <U5 40B 'var_a' 'var_b'
    variable_int  (ts_id) int64 24B 0 1 0

Source code in src/pandas_openscm/db/netcdf.py

def metadata_df_to_xr(
    metadata: pd.DataFrame,
    timeseries_id_coord: xr.Coordinates | None = None,
    timeseries_dim: str = "ts_id",
    map_suffix: str = "_map",
    int_suffix: str = "_int",
) -> xr.Dataset:
    """
    Convert metadata to a format that can be used with xarray

    This assumes that each row is the metadata for an individual timeseries
    and that the index of `metadata` should be used for the unique ID
    of each timeseries.

    This avoids the metadata creating super sparse arrays,
    at the expense of being harder to work with if directly used with xarray.
    If you need to work with super sparse data
    and want to work with it in a lower-dimensional data structure
    to avoid sparsity issues, our advice would be to use pandas.
    For docs on pandas support for higher dimensional data,
    see https://pandas.pydata.org/docs/user_guide/advanced.html#hierarchical-indexing-multiindex.

    Parameters
    ----------
    metadata
        Metadata to convert

    timeseries_id_coord
        Co-ordinate to use for the timeseries ID.

        If not supplied, this will be created from `metadata.index`.

    timeseries_dim
        Name of the dimension to use to label timeseries in the output

    map_suffix
        Suffix to add to metadata names to get their mapping variable

    int_suffix
        Suffix to add to metadata names to get their integer equivalent.

    Returns
    -------
    :
        Metadata, converted to a 'flat' xarray object.

    Raises
    ------
    MissingOptionalDependencyError
        [xarray][] is not installed.

    AssertionError
        `timeseries_id_coord` is `None`
        and the index of `metadata` is not already unique.

    See Also
    --------
    metadata_xr_to_df

    Examples
    --------
    >>> import pandas as pd
    >>>
    >>> metadata = pd.DataFrame(
    ...     [
    ...         ["scen_a", "mod_a", "var_a"],
    ...         ["scen_b", "mod_b", "var_b"],
    ...         ["scen_a", "mod_b", "var_a"],
    ...     ],
    ...     columns=["scenario", "model", "variable"],
    ... )
    >>> metadata_df_to_xr(metadata)  # doctest: +ELLIPSIS
    <xarray.Dataset> Size: ...B
    Dimensions:       (scenario_map: 2, ts_id: 3, model_map: 2, variable_map: 2)
    Coordinates:
      * scenario_map  (scenario_map) int64 16B 0 1
      * ts_id         (ts_id) int64 24B 0 1 2
      * model_map     (model_map) int64 16B 0 1
      * variable_map  (variable_map) int64 16B 0 1
    Data variables:
        scenario      (scenario_map) <U6 48B 'scen_a' 'scen_b'
        scenario_int  (ts_id) int64 24B 0 1 0
        model         (model_map) <U5 40B 'mod_a' 'mod_b'
        model_int     (ts_id) int64 24B 0 1 1
        variable      (variable_map) <U5 40B 'var_a' 'var_b'
        variable_int  (ts_id) int64 24B 0 1 0
    """
    try:
        import xarray as xr  # noqa: PLC0415
    except ImportError as exc:
        raise MissingOptionalDependencyError(
            "metadata_df_to_xr", requirement="xarray"
        ) from exc

    if timeseries_id_coord is None:
        if metadata.index.duplicated().any():
            dups = metadata.index[metadata.index.duplicated(keep=False)]
            msg = f"Index values should be unique. Received these duplicates: {dups}"
            raise AssertionError(msg)

    metadata_columns = metadata.columns.tolist()
    index_darrays = {}
    for metadata_col in metadata_columns:
        metadata_col_unique_vals = metadata[metadata_col].unique().tolist()
        metadata_col_map = {}
        metadata_col_mapped_values = [0] * len(metadata_col_unique_vals)
        for i, v in enumerate(metadata_col_unique_vals):
            metadata_col_map[v] = i
            metadata_col_mapped_values[i] = i

        metadata_map_xr = xr.DataArray(
            metadata_col_unique_vals,
            dims=[f"{metadata_col}{map_suffix}"],
            coords={f"{metadata_col}{map_suffix}": metadata_col_mapped_values},
        )
        metadata_int_xr = xr.DataArray(
            # Not sure if converting to category first then mapping
            # is faster here or not, would have to profile.
            metadata[metadata_col].map(metadata_col_map).astype(int),
            dims=[timeseries_dim],
            coords=timeseries_id_coord,
        )

        index_darrays[metadata_col] = metadata_map_xr
        index_darrays[f"{metadata_col}{int_suffix}"] = metadata_int_xr

    index_xr = xr.Dataset(index_darrays)

    return index_xr

metadata_xr_to_df #

metadata_xr_to_df(
    metadata: Dataset,
    category_index: bool = False,
    map_suffix: str = "_map",
    int_suffix: str = "_int",
) -> DataFrame

Convert metadata in xarray form to pandas

Parameters:

Name	Type	Description	Default
`metadata`	`Dataset`	Metadata to convert	required
`category_index`	`bool`	Should the index be returned as category type?	`False`
`map_suffix`	`str`	Suffix to add to metadata names to get their mapping variable	`'_map'`
`int_suffix`	`str`	Suffix to add to metadata names to get their integer equivalent.	`'_int'`

Returns:

Type	Description
`DataFrame`	pandas form of the metadata

pandas_openscm.db.netcdf#

netCDFDataBackend #

ext class-attribute instance-attribute #

preserves_index property #

timeseries_dim class-attribute instance-attribute #

load_data #

save_data #

netCDFIndexBackend #

ext class-attribute instance-attribute #

preserves_index property #

timeseries_dim class-attribute instance-attribute #

load_file_map staticmethod #

load_index staticmethod #

save_file_map staticmethod #

save_index #

metadata_df_to_xr #

metadata_xr_to_df #

ext `class-attribute` `instance-attribute` #

preserves_index `property` #

timeseries_dim `class-attribute` `instance-attribute` #

ext `class-attribute` `instance-attribute` #

preserves_index `property` #

timeseries_dim `class-attribute` `instance-attribute` #

load_file_map `staticmethod` #

load_index `staticmethod` #

save_file_map `staticmethod` #