Skip to content

pandas_openscm.db#

Database

Modules:

Name Description
backends

Available back-ends

csv

CSV backend

deleting

Functionality for deleting data

feather

Feather backend

in_memory

In-memory backend

interfaces

Interfaces used throughout the db (database) module

loading

Loading of data from disk

netcdf

netCDF backend

openscm_db

Definition of our key OpenSCMDB class

path_handling

Functionality for handling paths

reader

Database reader

rewriting

Functionality for re-writing a database

saving

Functionality for saving data

Classes:

Name Description
AlreadyInDBError

Raised when saving data would overwrite data which is already in the database

CSVDataBackend

CSV data backend

CSVIndexBackend

CSV index backend

EmptyDBError

Raised when trying to access data from a database that is empty

FeatherDataBackend

Feather data backend

FeatherIndexBackend

Feather index backend

InMemoryDataBackend

In-memory data backend

InMemoryIndexBackend

In-memory index backend

OpenSCMDB

Database for storing OpenSCM-style data

OpenSCMDBDataBackend

Backend for (de-)serialising data

OpenSCMDBIndexBackend

Backend for (de-)serialising the index (and file map)

netCDFDataBackend

netCDF data backend

netCDFIndexBackend

netCDF index backend

Attributes:

Name Type Description
DATA_BACKENDS

Inbuilt data back-ends

INDEX_BACKENDS

Inbuilt index back-ends

DATA_BACKENDS module-attribute #

# Inbuilt data back-ends.
# Each entry pairs a string name with the data back-end class registered under it.
# NOTE(review): `cast` is presumably `typing.cast`, i.e. it only informs the
# type checker and returns the tuple unchanged at runtime - confirm.
DATA_BACKENDS = DataBackendOptions(
    cast(
        tuple[tuple[str, type[OpenSCMDBDataBackend]], ...],
        (
            ("csv", CSVDataBackend),
            ("feather", FeatherDataBackend),
            ("in_memory", InMemoryDataBackend),
            ("netCDF", netCDFDataBackend),
        ),
    )
)

Inbuilt data back-ends

INDEX_BACKENDS module-attribute #

# Inbuilt index back-ends.
# Each entry pairs a string name with the index back-end class registered under it.
# NOTE(review): `cast` is presumably `typing.cast`, i.e. it only informs the
# type checker and returns the tuple unchanged at runtime - confirm.
INDEX_BACKENDS = IndexBackendOptions(
    cast(
        tuple[tuple[str, type[OpenSCMDBIndexBackend]], ...],
        (
            ("csv", CSVIndexBackend),
            ("feather", FeatherIndexBackend),
            ("in_memory", InMemoryIndexBackend),
            ("netCDF", netCDFIndexBackend),
        ),
    )
)

Inbuilt index back-ends

AlreadyInDBError #

Bases: ValueError

Raised when saving data would overwrite data which is already in the database

Methods:

Name Description
__init__

Initialise the error

Source code in src/pandas_openscm/db/openscm_db.py
class AlreadyInDBError(ValueError):
    """
    Error for a save that would overwrite data which the database already holds
    """

    def __init__(self, already_in_db: pd.DataFrame) -> None:
        """
        Build the error message and initialise the underlying `ValueError`

        Parameters
        ----------
        already_in_db
            Data which is already in the database.

            Only its index is shown in the error message.
        """
        # Show the index levels as plain columns so the clashing rows are readable
        clashing_rows = already_in_db.index.to_frame(index=False)
        header = "The following rows are already in the database:"
        super().__init__(f"{header}\n{clashing_rows}")

__init__ #

__init__(already_in_db: DataFrame) -> None

Initialise the error

Parameters:

Name Type Description Default
already_in_db DataFrame

data that is already in the database

required
Source code in src/pandas_openscm/db/openscm_db.py
def __init__(self, already_in_db: pd.DataFrame) -> None:
    """
    Build the error message and initialise the parent exception

    Parameters
    ----------
    already_in_db
        Data which is already in the database.

        Only its index is shown in the error message.
    """
    # Show the index levels as plain columns so the clashing rows are readable
    clashing_rows = already_in_db.index.to_frame(index=False)
    header = "The following rows are already in the database:"
    super().__init__(f"{header}\n{clashing_rows}")

CSVDataBackend #

CSV data backend

Methods:

Name Description
load_data

Load a data file

save_data

Save data to disk

Attributes:

Name Type Description
ext str

Extension to use with files saved by this backend.

preserves_index Literal[False]

Whether this backend preserves the index of data upon (de-)serialisation

Source code in src/pandas_openscm/db/csv.py
@define
class CSVDataBackend:
    """
    Backend which (de-)serialises data as CSV files
    """

    ext: str = ".csv"
    """
    File extension for files written by this backend.
    """

    @property
    def preserves_index(self) -> Literal[False]:
        """
        Whether the index of the data survives (de-)serialisation

        Always `False` for this backend.
        """
        return False

    @staticmethod
    def load_data(data_file: Path) -> pd.DataFrame:
        """
        Read data from a CSV file

        Parameters
        ----------
        data_file
            Path of the file to read

        Returns
        -------
        :
            Data read from `data_file`
        """
        loaded = pd.read_csv(data_file)
        return loaded

    @staticmethod
    def save_data(data: pd.DataFrame, data_file: Path) -> None:
        """
        Write data to a CSV file

        Parameters
        ----------
        data
            Data to write

        data_file
            Path of the file to write
        """
        data.to_csv(path_or_buf=data_file)

ext class-attribute instance-attribute #

ext: str = '.csv'

Extension to use with files saved by this backend.

preserves_index property #

preserves_index: Literal[False]

Whether this backend preserves the index of data upon (de-)serialisation

load_data staticmethod #

load_data(data_file: Path) -> DataFrame

Load a data file

Parameters:

Name Type Description Default
data_file Path

File from which to load the data

required

Returns:

Type Description
DataFrame

Loaded data

Source code in src/pandas_openscm/db/csv.py
@staticmethod
def load_data(data_file: Path) -> pd.DataFrame:
    """
    Load a data file

    Parameters
    ----------
    data_file
        File from which to load the data

    Returns
    -------
    :
        Loaded data
    """
    return pd.read_csv(data_file)

save_data staticmethod #

save_data(data: DataFrame, data_file: Path) -> None

Save data to disk

Parameters:

Name Type Description Default
data DataFrame

Data to save

required
data_file Path

File in which to save the data

required
Source code in src/pandas_openscm/db/csv.py
@staticmethod
def save_data(data: pd.DataFrame, data_file: Path) -> None:
    """
    Save data to disk

    Parameters
    ----------
    data
        Data to save

    data_file
        File in which to save the data
    """
    data.to_csv(data_file)

CSVIndexBackend #

CSV index backend

Methods:

Name Description
load_file_map

Load the file map

load_index

Load the index

save_file_map

Save the file map to disk

save_index

Save the index to disk

Attributes:

Name Type Description
ext str

Extension to use with files saved by this backend.

preserves_index Literal[False]

Whether this backend preserves the pd.MultiIndex upon (de-)serialisation

Source code in src/pandas_openscm/db/csv.py
@define
class CSVIndexBackend:
    """
    Backend which (de-)serialises the index and file map as CSV files
    """

    ext: str = ".csv"
    """
    File extension for files written by this backend.
    """

    @property
    def preserves_index(self) -> Literal[False]:
        """
        Whether the `pd.MultiIndex` survives (de-)serialisation

        Always `False` for this backend.
        """
        return False

    @staticmethod
    def load_file_map(file_map_file: Path) -> pd.DataFrame:
        """
        Read the file map from a CSV file

        Parameters
        ----------
        file_map_file
            Path of the file which holds the file map

        Returns
        -------
        :
            File map read from `file_map_file`
        """
        loaded = pd.read_csv(file_map_file)
        return loaded

    @staticmethod
    def load_index(index_file: Path) -> pd.DataFrame:
        """
        Read the index from a CSV file

        Parameters
        ----------
        index_file
            Path of the file which holds the index

        Returns
        -------
        :
            Index read from `index_file`
        """
        loaded = pd.read_csv(index_file)
        return loaded

    @staticmethod
    def save_file_map(
        file_map: pd.Series[Path],  # type: ignore # pandas confused about what it supports
        file_map_file: Path,
    ) -> None:
        """
        Write the file map to a CSV file

        Parameters
        ----------
        file_map
            File map to write

        file_map_file
            Path of the file to write
        """
        file_map.to_csv(path_or_buf=file_map_file)

    @staticmethod
    def save_index(
        index: pd.DataFrame,
        index_file: Path,
    ) -> None:
        """
        Write the index to a CSV file

        Parameters
        ----------
        index
            Index to write

        index_file
            Path of the file to write
        """
        index.to_csv(path_or_buf=index_file)

ext class-attribute instance-attribute #

ext: str = '.csv'

Extension to use with files saved by this backend.

preserves_index property #

preserves_index: Literal[False]

Whether this backend preserves the pd.MultiIndex upon (de-)serialisation

load_file_map staticmethod #

load_file_map(file_map_file: Path) -> DataFrame

Load the file map

Parameters:

Name Type Description Default
file_map_file Path

File from which to load the file map

required

Returns:

Type Description
DataFrame

Loaded file map

Source code in src/pandas_openscm/db/csv.py
@staticmethod
def load_file_map(file_map_file: Path) -> pd.DataFrame:
    """
    Load the file map

    Parameters
    ----------
    file_map_file
        File from which to load the file map

    Returns
    -------
    :
        Loaded file map
    """
    return pd.read_csv(file_map_file)

load_index staticmethod #

load_index(index_file: Path) -> DataFrame

Load the index

Parameters:

Name Type Description Default
index_file Path

File from which to load the index

required

Returns:

Type Description
DataFrame

Loaded index

Source code in src/pandas_openscm/db/csv.py
@staticmethod
def load_index(index_file: Path) -> pd.DataFrame:
    """
    Load the index

    Parameters
    ----------
    index_file
        File from which to load the index

    Returns
    -------
    :
        Loaded index
    """
    return pd.read_csv(index_file)

save_file_map staticmethod #

save_file_map(
    file_map: Series[Path], file_map_file: Path
) -> None

Save the file map to disk

Parameters:

Name Type Description Default
file_map Series[Path]

File map to save

required
file_map_file Path

File in which to save the file map

required
Source code in src/pandas_openscm/db/csv.py
@staticmethod
def save_file_map(
    file_map: pd.Series[Path],  # type: ignore # pandas confused about what it supports
    file_map_file: Path,
) -> None:
    """
    Save the file map to disk

    Parameters
    ----------
    file_map
        File map to save

    file_map_file
        File in which to save the file map
    """
    file_map.to_csv(file_map_file)

save_index staticmethod #

save_index(index: DataFrame, index_file: Path) -> None

Save the index to disk

Parameters:

Name Type Description Default
index DataFrame

Index to save

required
index_file Path

File in which to save the index

required
Source code in src/pandas_openscm/db/csv.py
@staticmethod
def save_index(
    index: pd.DataFrame,
    index_file: Path,
) -> None:
    """
    Save the index to disk

    Parameters
    ----------
    index
        Index to save

    index_file
        File in which to save the index
    """
    index.to_csv(index_file)

EmptyDBError #

Bases: ValueError

Raised when trying to access data from a database that is empty

Methods:

Name Description
__init__

Initialise the error

Source code in src/pandas_openscm/db/openscm_db.py
class EmptyDBError(ValueError):
    """
    Error for attempts to access data from a database that is empty
    """

    def __init__(self, db: OpenSCMDB) -> None:
        """
        Build the error message and initialise the underlying `ValueError`

        Parameters
        ----------
        db
            The database, its representation is included in the error message
        """
        super().__init__(f"The database is empty: {db=}")

__init__ #

__init__(db: OpenSCMDB) -> None

Initialise the error

Parameters:

Name Type Description Default
db OpenSCMDB

The database

required
Source code in src/pandas_openscm/db/openscm_db.py
def __init__(self, db: OpenSCMDB) -> None:
    """
    Build the error message and initialise the parent exception

    Parameters
    ----------
    db
        The database, its representation is included in the error message
    """
    super().__init__(f"The database is empty: {db=}")

FeatherDataBackend #

Feather data backend

For details on feather, see https://arrow.apache.org/docs/python/feather.html

Methods:

Name Description
load_data

Load a data file

save_data

Save data to disk

Attributes:

Name Type Description
ext str

Extension to use with files saved by this backend.

preserves_index Literal[True]

Whether this backend preserves the index of data upon (de-)serialisation

Source code in src/pandas_openscm/db/feather.py
@define
class FeatherDataBackend:
    """
    Backend which (de-)serialises data as feather files

    For details on feather, see https://arrow.apache.org/docs/python/feather.html
    """

    ext: str = ".feather"
    """
    File extension for files written by this backend.
    """

    @property
    def preserves_index(self) -> Literal[True]:
        """
        Whether the index of the data survives (de-)serialisation

        Always `True` for this backend.
        """
        return True

    @staticmethod
    def load_data(data_file: Path) -> pd.DataFrame:
        """
        Read data from a feather file

        Parameters
        ----------
        data_file
            Path of the file to read

        Returns
        -------
        :
            Data read from `data_file`
        """
        loaded = pd.read_feather(data_file)
        return loaded

    @staticmethod
    def save_data(data: pd.DataFrame, data_file: Path) -> None:
        """
        Write data to a feather file

        Parameters
        ----------
        data
            Data to write

        data_file
            Path of the file to write
        """
        # According to the pandas docs, feather can't write indexes
        # (see https://pandas.pydata.org/docs/user_guide/io.html#feather).
        # In practice, our multi-indexes seem to be written without trouble,
        # so we write the data as it is.
        data.to_feather(path=data_file)

ext class-attribute instance-attribute #

ext: str = '.feather'

Extension to use with files saved by this backend.

preserves_index property #

preserves_index: Literal[True]

Whether this backend preserves the index of data upon (de-)serialisation

load_data staticmethod #

load_data(data_file: Path) -> DataFrame

Load a data file

Parameters:

Name Type Description Default
data_file Path

File from which to load the data

required

Returns:

Type Description
DataFrame

Loaded data

Source code in src/pandas_openscm/db/feather.py
@staticmethod
def load_data(data_file: Path) -> pd.DataFrame:
    """
    Load a data file

    Parameters
    ----------
    data_file
        File from which to load the data

    Returns
    -------
    :
        Loaded data
    """
    return pd.read_feather(data_file)

save_data staticmethod #

save_data(data: DataFrame, data_file: Path) -> None

Save data to disk

Parameters:

Name Type Description Default
data DataFrame

Data to save

required
data_file Path

File in which to save the data

required
Source code in src/pandas_openscm/db/feather.py
@staticmethod
def save_data(data: pd.DataFrame, data_file: Path) -> None:
    """
    Save data to disk

    Parameters
    ----------
    data
        Data to save

    data_file
        File in which to save the data
    """
    # The docs say that feather doesn't support writing indexes
    # # (see https://pandas.pydata.org/docs/user_guide/io.html#feather).
    # However, it seems to have no issue writing our multi-indexes.
    # Hence the implementation below
    data.to_feather(data_file)

FeatherIndexBackend #

Feather index backend

For details on feather, see https://arrow.apache.org/docs/python/feather.html

Methods:

Name Description
load_file_map

Load the file map

load_index

Load the index

save_file_map

Save the file map to disk

save_index

Save the index to disk

Attributes:

Name Type Description
ext str

Extension to use with files saved by this backend.

preserves_index Literal[True]

Whether this backend preserves the pd.MultiIndex upon (de-)serialisation

Source code in src/pandas_openscm/db/feather.py
@define
class FeatherIndexBackend:
    """
    Backend which (de-)serialises the index and file map as feather files

    For details on feather, see https://arrow.apache.org/docs/python/feather.html
    """

    ext: str = ".feather"
    """
    File extension for files written by this backend.
    """

    @property
    def preserves_index(self) -> Literal[True]:
        """
        Whether the `pd.MultiIndex` survives (de-)serialisation

        Always `True` for this backend.
        """
        return True

    @staticmethod
    def load_file_map(file_map_file: Path) -> pd.DataFrame:
        """
        Read the file map from a feather file

        Parameters
        ----------
        file_map_file
            Path of the file which holds the file map

        Returns
        -------
        :
            File map read from `file_map_file`
        """
        loaded = pd.read_feather(file_map_file)
        return loaded

    @staticmethod
    def load_index(index_file: Path) -> pd.DataFrame:
        """
        Read the index from a feather file

        Parameters
        ----------
        index_file
            Path of the file which holds the index

        Returns
        -------
        :
            Index read from `index_file`
        """
        loaded = pd.read_feather(index_file)
        return loaded

    @staticmethod
    def save_file_map(
        file_map: pd.Series[Path],  # type: ignore # pandas confused about what it supports
        file_map_file: Path,
    ) -> None:
        """
        Write the file map to a feather file

        The values are converted to strings before being written.

        Parameters
        ----------
        file_map
            File map to write

        file_map_file
            Path of the file to write
        """
        # Non-native types can't be written to feather
        # (see https://pandas.pydata.org/docs/user_guide/io.html#feather),
        # hence the cast to string.
        as_strings = file_map.astype(str)
        # According to the same docs, feather can't write indexes either.
        # In practice, this index seems to be written without trouble,
        # so we write the frame as it is.
        as_frame = as_strings.to_frame()
        as_frame.to_feather(file_map_file)

    @staticmethod
    def save_index(
        index: pd.DataFrame,
        index_file: Path,
    ) -> None:
        """
        Write the index to a feather file

        Parameters
        ----------
        index
            Index to write

        index_file
            Path of the file to write
        """
        index.to_feather(path=index_file)

ext class-attribute instance-attribute #

ext: str = '.feather'

Extension to use with files saved by this backend.

preserves_index property #

preserves_index: Literal[True]

Whether this backend preserves the pd.MultiIndex upon (de-)serialisation

load_file_map staticmethod #

load_file_map(file_map_file: Path) -> DataFrame

Load the file map

Parameters:

Name Type Description Default
file_map_file Path

File from which to load the file map

required

Returns:

Type Description
DataFrame

Loaded file map

Source code in src/pandas_openscm/db/feather.py
@staticmethod
def load_file_map(file_map_file: Path) -> pd.DataFrame:
    """
    Load the file map

    Parameters
    ----------
    file_map_file
        File from which to load the file map

    Returns
    -------
    :
        Loaded file map
    """
    return pd.read_feather(file_map_file)

load_index staticmethod #

load_index(index_file: Path) -> DataFrame

Load the index

Parameters:

Name Type Description Default
index_file Path

File from which to load the index

required

Returns:

Type Description
DataFrame

Loaded index

Source code in src/pandas_openscm/db/feather.py
@staticmethod
def load_index(index_file: Path) -> pd.DataFrame:
    """
    Load the index

    Parameters
    ----------
    index_file
        File from which to load the index

    Returns
    -------
    :
        Loaded index
    """
    return pd.read_feather(index_file)

save_file_map staticmethod #

save_file_map(
    file_map: Series[Path], file_map_file: Path
) -> None

Save the file map to disk

Parameters:

Name Type Description Default
file_map Series[Path]

File map to save

required
file_map_file Path

File in which to save the file map

required
Source code in src/pandas_openscm/db/feather.py
@staticmethod
def save_file_map(
    file_map: pd.Series[Path],  # type: ignore # pandas confused about what it supports
    file_map_file: Path,
) -> None:
    """
    Save the file map to disk

    Parameters
    ----------
    file_map
        File map to save

    file_map_file
        File in which to save the file map
    """
    # Feather doesn't support writing non-native types
    # (see https://pandas.pydata.org/docs/user_guide/io.html#feather).
    # The docs say that feather doesn't support writing indexes
    # # (see https://pandas.pydata.org/docs/user_guide/io.html#feather).
    # However, it seems to have no issue writing this index.
    # Hence the implementation below
    file_map_write = file_map.astype(str)
    file_map_write.to_frame().to_feather(file_map_file)

save_index staticmethod #

save_index(index: DataFrame, index_file: Path) -> None

Save the index to disk

Parameters:

Name Type Description Default
index DataFrame

Index to save

required
index_file Path

File in which to save the index

required
Source code in src/pandas_openscm/db/feather.py
@staticmethod
def save_index(
    index: pd.DataFrame,
    index_file: Path,
) -> None:
    """
    Save the index to disk

    Parameters
    ----------
    index
        Index to save

    index_file
        File in which to save the index
    """
    index.to_feather(index_file)

InMemoryDataBackend #

In-memory data backend

Methods:

Name Description
load_data

Load a data file

save_data

Save data to disk

Attributes:

Name Type Description
data dict[str, DataFrame] | None

Data store

ext str

Extension to use with files saved by this backend.

preserves_index Literal[True]

Whether this backend preserves the index of data upon (de-)serialisation

Source code in src/pandas_openscm/db/in_memory.py
@define
class InMemoryDataBackend:
    """
    In-memory data backend

    Data is kept in a dictionary on the instance rather than being written to disk.
    Empty placeholder files are still created on save (see `save_data`).
    """

    ext: str = ".in-mem"
    """
    Extension to use with files saved by this backend.
    """

    data: dict[str, pd.DataFrame] | None = None
    """
    Data store, keyed by the string form of the data file's path

    `None` until the first call to `save_data`.
    """

    @property
    def preserves_index(self) -> Literal[True]:
        """
        Whether this backend preserves the index of data upon (de-)serialisation
        """
        return True

    def load_data(self, data_file: Path) -> pd.DataFrame:
        """
        Load the data stored under a data file's path

        Parameters
        ----------
        data_file
            File whose path the data was saved under

        Returns
        -------
        :
            Loaded data

            This is the stored object itself, not a copy.

        Raises
        ------
        TypeError
            Nothing has been saved with this backend yet (`self.data` is `None`)

        KeyError
            No data was saved under `data_file`
        """
        if self.data is None:
            msg = (
                "No data has been saved with this backend yet "
                f"(`data` is None), so {data_file} cannot be loaded"
            )
            raise TypeError(msg)

        return self.data[str(data_file)]

    def save_data(self, data: pd.DataFrame, data_file: Path) -> None:
        """
        Store data in memory under a data file's path

        An empty file is also created at `data_file`.

        Parameters
        ----------
        data
            Data to save

        data_file
            File under whose path the data is stored
        """
        if self.data is None:
            self.data = {}

        self.data[str(data_file)] = data
        # Have to do this as, even though it's in-memory,
        # the layer above expects to have files to check, remove etc.
        data_file.touch()

data class-attribute instance-attribute #

data: dict[str, DataFrame] | None = None

Data store

ext class-attribute instance-attribute #

ext: str = '.in-mem'

Extension to use with files saved by this backend.

preserves_index property #

preserves_index: Literal[True]

Whether this backend preserves the index of data upon (de-)serialisation

load_data #

load_data(data_file: Path) -> DataFrame

Load a data file

Parameters:

Name Type Description Default
data_file Path

File from which to load the data

required

Returns:

Type Description
DataFrame

Loaded data

Source code in src/pandas_openscm/db/in_memory.py
def load_data(self, data_file: Path) -> pd.DataFrame:
    """
    Load the data stored under a data file's path

    Parameters
    ----------
    data_file
        File whose path the data was saved under

    Returns
    -------
    :
        Loaded data

        This is the stored object itself, not a copy.

    Raises
    ------
    TypeError
        Nothing has been saved with this backend yet (`self.data` is `None`)

    KeyError
        No data was saved under `data_file`
    """
    if self.data is None:
        # Same exception type as before, but now with an explanation
        msg = (
            "No data has been saved with this backend yet "
            f"(`data` is None), so {data_file} cannot be loaded"
        )
        raise TypeError(msg)

    return self.data[str(data_file)]

save_data #

save_data(data: DataFrame, data_file: Path) -> None

Save data to disk

Parameters:

Name Type Description Default
data DataFrame

Data to save

required
data_file Path

File in which to save the data

required
Source code in src/pandas_openscm/db/in_memory.py
def save_data(self, data: pd.DataFrame, data_file: Path) -> None:
    """
    Store data in memory under a data file's path

    An empty file is also created at `data_file`.

    Parameters
    ----------
    data
        Data to save

    data_file
        File under whose path the data is stored
    """
    # Create the store lazily, on the first save
    if self.data is None:
        self.data = {}

    store_key = str(data_file)
    self.data[store_key] = data

    # The data lives in memory, but the layer above still expects
    # files which it can check, remove etc., so create a placeholder.
    data_file.touch()

InMemoryIndexBackend #

In-memory index backend

Methods:

Name Description
load_file_map

Load the file map

load_index

Load the index

save_file_map

Save the file map to disk

save_index

Save the index to disk

Attributes:

Name Type Description
ext str

Extension to use with files saved by this backend.

file_map Series[Path] | None

File map store

index DataFrame | None

Index store

preserves_index Literal[True]

Whether this backend preserves the pd.MultiIndex upon (de-)serialisation

Source code in src/pandas_openscm/db/in_memory.py
@define
class InMemoryIndexBackend:
    """
    In-memory index backend

    The index and file map are kept on the instance
    rather than being written to disk.
    Empty placeholder files are still created on save.
    """

    ext: str = ".in-mem"
    """
    Extension to use with files saved by this backend.
    """

    index: pd.DataFrame | None = None
    """Index store, `None` until `save_index` has been called"""

    file_map: pd.Series[Path] | None = None  # type: ignore # pandas confused about what it supports
    """File map store, `None` until `save_file_map` has been called"""

    @property
    def preserves_index(self) -> Literal[True]:
        """
        Whether this backend preserves the `pd.MultiIndex` upon (de-)serialisation
        """
        return True

    def load_file_map(self, file_map_file: Path) -> pd.DataFrame:
        """
        Load the file map

        Parameters
        ----------
        file_map_file
            File from which to load the file map

            Only used in the error message,
            the file map is read from memory.

        Returns
        -------
        :
            Loaded file map, converted to a `pd.DataFrame`

        Raises
        ------
        TypeError
            No file map has been saved with this backend yet
        """
        if self.file_map is None:
            msg = (
                "No file map has been saved with this backend yet "
                f"(`file_map` is None), so {file_map_file} cannot be loaded"
            )
            raise TypeError(msg)

        return self.file_map.to_frame()

    def load_index(self, index_file: Path) -> pd.DataFrame:
        """
        Load the index

        Parameters
        ----------
        index_file
            File from which to load the index

            Only used in the error message,
            the index is read from memory.

        Returns
        -------
        :
            Loaded index

            This is the stored object itself, not a copy.

        Raises
        ------
        TypeError
            No index has been saved with this backend yet
        """
        if self.index is None:
            msg = (
                "No index has been saved with this backend yet "
                f"(`index` is None), so {index_file} cannot be loaded"
            )
            raise TypeError(msg)

        return self.index

    def save_file_map(
        self,
        file_map: pd.Series[Path],  # type: ignore # pandas confused about what it supports
        file_map_file: Path,
    ) -> None:
        """
        Store the file map in memory

        An empty file is also created at `file_map_file`.

        Parameters
        ----------
        file_map
            File map to save

        file_map_file
            File which represents the file map on disk
        """
        self.file_map = file_map
        # Have to do this as, even though it's in-memory,
        # the layer above expects to have files to check
        file_map_file.touch()

    def save_index(
        self,
        index: pd.DataFrame,
        index_file: Path,
    ) -> None:
        """
        Store the index in memory

        An empty file is also created at `index_file`.

        Parameters
        ----------
        index
            Index to save

        index_file
            File which represents the index on disk
        """
        self.index = index
        # Have to do this as, even though it's in-memory,
        # the layer above expects to have files to check
        index_file.touch()

ext class-attribute instance-attribute #

ext: str = '.in-mem'

Extension to use with files saved by this backend.

file_map class-attribute instance-attribute #

file_map: Series[Path] | None = None

File map store

index class-attribute instance-attribute #

index: DataFrame | None = None

Index store

preserves_index property #

preserves_index: Literal[True]

Whether this backend preserves the pd.MultiIndex upon (de-)serialisation

load_file_map #

load_file_map(file_map_file: Path) -> DataFrame

Load the file map

Parameters:

Name Type Description Default
file_map_file Path

File from which to load the file map

required

Returns:

Type Description
DataFrame

Loaded file map

Source code in src/pandas_openscm/db/in_memory.py
def load_file_map(self, file_map_file: Path) -> pd.DataFrame:
    """
    Load the file map

    Parameters
    ----------
    file_map_file
        File from which to load the file map

        Only used in the error message,
        the file map is read from memory.

    Returns
    -------
    :
        Loaded file map, converted to a `pd.DataFrame`

    Raises
    ------
    TypeError
        No file map has been saved with this backend yet
    """
    if self.file_map is None:
        # Same exception type as before, but now with an explanation
        msg = (
            "No file map has been saved with this backend yet "
            f"(`file_map` is None), so {file_map_file} cannot be loaded"
        )
        raise TypeError(msg)

    return self.file_map.to_frame()

load_index #

load_index(index_file: Path) -> DataFrame

Load the index

Parameters:

Name Type Description Default
index_file Path

File from which to load the index

required

Returns:

Type Description
DataFrame

Loaded index

Source code in src/pandas_openscm/db/in_memory.py
def load_index(self, index_file: Path) -> pd.DataFrame:
    """
    Load the index

    Parameters
    ----------
    index_file
        File from which to load the index

        Only used in the error message,
        the index is read from memory.

    Returns
    -------
    :
        Loaded index

        This is the stored object itself, not a copy.

    Raises
    ------
    TypeError
        No index has been saved with this backend yet
    """
    if self.index is None:
        # Same exception type as before, but now with an explanation
        msg = (
            "No index has been saved with this backend yet "
            f"(`index` is None), so {index_file} cannot be loaded"
        )
        raise TypeError(msg)

    return self.index

save_file_map #

save_file_map(
    file_map: Series[Path], file_map_file: Path
) -> None

Save the file map to disk

Parameters:

Name Type Description Default
file_map Series[Path]

File map to save

required
file_map_file Path

File in which to save the file map

required
Source code in src/pandas_openscm/db/in_memory.py
def save_file_map(
    self,
    file_map: pd.Series[Path],  # type: ignore # pandas confused about what it supports
    file_map_file: Path,
) -> None:
    """
    Store the file map in memory

    An empty file is also created at `file_map_file`.

    Parameters
    ----------
    file_map
        File map to save

    file_map_file
        File which represents the file map on disk
    """
    self.file_map = file_map

    # The file map lives in memory, but the layer above still expects
    # a file which it can check, so create a placeholder.
    file_map_file.touch()

save_index #

save_index(index: DataFrame, index_file: Path) -> None

Save the index to disk

Parameters:

Name Type Description Default
index DataFrame

Index to save

required
index_file Path

File in which to save the index

required
Source code in src/pandas_openscm/db/in_memory.py
def save_index(
    self,
    index: pd.DataFrame,
    index_file: Path,
) -> None:
    """
    Save the index

    The index itself is kept on the instance (`self.index`).
    The only thing that happens on disk is that `index_file` is touched.

    Parameters
    ----------
    index
        Index to keep in memory

    index_file
        Path to touch so that an index file exists on disk
    """
    # Keep the data in memory first ...
    self.index = index

    # ... then make sure a file exists: the layer above
    # expects to have files to check, even for an in-memory backend.
    index_file.touch()

OpenSCMDB #

Database for storing OpenSCM-style data

This class is focussed on backends that use files as their storage. If you had a different database backend, you might make different choices. We haven't thought through those use cases hence aren't sure how much effort would be required to make something truly backend agnostic.

Methods:

Name Description
create_reader

Create a database reader

default_index_file_lock

Get default lock for the back-end's index file

delete

Delete all data in the database

from_gzipped_tar_archive

Initialise from a gzipped tar archive

get_new_data_file_path

Get the path in which to write a new data file

load

Load data

load_file_map

Load the file map

load_index

Load the index

load_metadata

Load the database's metadata

save

Save data into the database

to_gzipped_tar_archive

Convert to a gzipped tar archive

Attributes:

Name Type Description
backend_data OpenSCMDBDataBackend

The backend for (de-)serialising data (from) to disk

backend_index OpenSCMDBIndexBackend

The backend for (de-)serialising the database index (from) to disk

db_dir Path

Path in which the database is stored

file_map_file Path

The file in which the file map is stored

index_file Path

The file in which the database's index is stored

index_file_lock BaseFileLock

Lock for the index file

index_file_lock_path Path

Path to the lock file for the back-end's index file

is_empty bool

Whether the database is empty or not

Source code in src/pandas_openscm/db/openscm_db.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
@define
class OpenSCMDB:
    """
    Database for storing OpenSCM-style data

    This class is focussed on backends that use files as their storage.
    If you had a different database backend,
    you might make different choices.
    We haven't thought through those use cases
    hence aren't sure how much effort
    would be required to make something truly backend agnostic.

    All attributes are keyword-only when initialising an instance.
    """

    backend_data: OpenSCMDBDataBackend = field(kw_only=True)
    """
    The backend for (de-)serialising data (from) to disk
    """

    backend_index: OpenSCMDBIndexBackend = field(kw_only=True)
    """
    The backend for (de-)serialising the database index (from) to disk
    """

    db_dir: Path = field(kw_only=True)
    """
    Path in which the database is stored

    Both the index and the data files will be written in this directory.
    """

    index_file_lock: filelock.BaseFileLock = field(kw_only=True)
    """
    Lock for the index file
    """
    # If no value is supplied, the default comes from `default_index_file_lock`.
    #
    # Note to devs: filelock releases the lock when __del__ is called
    # (i.e. when the lock instance is garbage collected).
    # Hence, you have to keep a reference to this around
    # if you want it to do anything.
    # For a while, we made this a property that created the lock when requested.
    # That was super confusing as, if the reference to the created lock wasn't kept,
    # the lock would immediately be released.

    @index_file_lock.default  # ty: ignore[call-non-callable]
    def default_index_file_lock(self) -> filelock.BaseFileLock:
        """
        Get default lock for the back-end's index file

        Returns
        -------
        :
            File lock which uses [self.index_file_lock_path][(c)] as its lock file

        Raises
        ------
        MissingOptionalDependencyError
            `filelock` cannot be imported
        """
        # filelock is optional, so it is only imported when a lock is needed
        try:
            import filelock  # noqa: PLC0415
        except ImportError as exc:
            raise MissingOptionalDependencyError(
                "default_index_file_lock", requirement="filelock"
            ) from exc

        lock_path = self.index_file_lock_path

        return filelock.FileLock(lock_path)

    @property
    def file_map_file(self) -> Path:
        """
        The file in which the file map is stored

        The file map stores the mapping from file_id
        to file path.
        The file sits directly in `self.db_dir`
        and uses the index back-end's extension.

        Returns
        -------
        :
            Path to the file map file
        """
        db_dir = self.db_dir
        ext = self.backend_index.ext

        return db_dir / f"filemap{ext}"

    @property
    def index_file(self) -> Path:
        """
        The file in which the database's index is stored

        The file sits directly in `self.db_dir`
        and uses the index back-end's extension.

        Returns
        -------
        :
            Path to the index file
        """
        db_dir = self.db_dir
        ext = self.backend_index.ext

        return db_dir / f"index{ext}"

    @property
    def index_file_lock_path(self) -> Path:
        """
        Path to the lock file for the back-end's index file

        Returns
        -------
        :
            Path next to the index file,
            named like the index file with `.lock` appended
        """
        index_file = self.index_file
        lock_name = f"{index_file.name}.lock"

        return index_file.parent / lock_name

    @property
    def is_empty(self) -> bool:
        """
        Whether the database is empty or not

        The database counts as empty if its index file does not exist.

        Returns
        -------
        :
            `True` if the database is empty, `False` otherwise
        """
        index_file_exists = self.index_file.exists()

        return not index_file_exists

    def create_reader(
        self,
        *,
        lock: bool | filelock.BaseFileLock | None = True,
        index_file_lock: filelock.BaseFileLock | None = None,
    ) -> OpenSCMDBReader:
        """
        Create a database reader

        Parameters
        ----------
        lock
            Lock to give to the reader.

            If `True`, we create a new lock for the database, such that,
            if the reader is holding the lock,
            no operations can be performed on the database.

            If `False` or `None`, the reader is not given any lock.

            Any other value is passed to the reader as is.

        index_file_lock
            Lock for the database's index file

            Used while loading the index and the file map from disk.
            The lock is held across both loads
            so the reader's index and file map come from the same database state.

            If not supplied, we use [self.index_file_lock][(c)].

        Returns
        -------
        :
            Database reader

        Raises
        ------
        EmptyDBError
            The database is empty

        MissingOptionalDependencyError
            `lock` is `True` and `filelock` cannot be imported
        """
        if isinstance(lock, bool):
            if lock:
                try:
                    import filelock  # noqa: PLC0415
                except ImportError as exc:
                    raise MissingOptionalDependencyError(  # noqa: TRY003
                        "create_reader(..., lock=True, ...)", requirement="filelock"
                    ) from exc

                # Create a new lock for the reader
                lock = filelock.FileLock(self.index_file_lock_path)

            else:
                # Convert to None
                lock = None

        # Fail before touching the lock, like the other loading methods
        if self.is_empty:
            raise EmptyDBError(self)

        if index_file_lock is None:
            index_file_lock = self.index_file_lock

        # Hold the lock across both loads, as `load` and `save` do.
        # Otherwise a save could happen between the two loads
        # and the reader would get an index that doesn't match its file map.
        # The loaders acquire the same lock instance again (nested acquisition).
        with index_file_lock:
            db_index = self.load_index(index_file_lock=index_file_lock)
            db_file_map = self.load_file_map(index_file_lock=index_file_lock)

        res = OpenSCMDBReader(
            backend_data=self.backend_data,
            db_dir=self.db_dir,
            db_index=db_index,
            db_file_map=db_file_map,
            lock=lock,
        )

        return res

    def delete(
        self,
        *,
        index_file_lock: filelock.BaseFileLock | None = None,
        parallel_op_config: ParallelOpConfig | None = None,
        progress: bool = False,
        max_workers: int | None = None,
    ) -> None:
        """
        Delete all data in the database

        Every file directly in `self.db_dir` whose name ends with
        the data back-end's or the index back-end's extension is deleted.
        The index file lock is held while this happens.

        Parameters
        ----------
        index_file_lock
            Lock for the database's index file

            If not supplied, we use [self.index_file_lock][(c)].

        parallel_op_config
            Configuration for executing the operation in parallel with progress bars

            If not supplied, we use the values of `progress` and `max_workers`.

        progress
            Should progress bar(s) be used to display the progress of the deletion?

            Only used if `parallel_op_config` is `None`.

        max_workers
            Maximum number of workers to use for parallel processing.

            If supplied, we create an instance of
            [concurrent.futures.ThreadPoolExecutor](https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ThreadPoolExecutor)
            with the provided number of workers
            (a thread pool makes sense as deletion is I/O-bound).

            If not supplied, the deletions are executed serially.

            Only used if `parallel_op_config` is `None`.
        """
        lock = self.index_file_lock if index_file_lock is None else index_file_lock

        with lock:
            data_files = self.db_dir.glob(f"*{self.backend_data.ext}")
            index_files = self.db_dir.glob(f"*{self.backend_index.ext}")

            # A set, so a file matched by both patterns is only deleted once
            delete_files(
                files_to_delete={*data_files, *index_files},
                parallel_op_config=parallel_op_config,
                progress=progress,
                max_workers=max_workers,
            )

    @classmethod
    def from_gzipped_tar_archive(
        cls,
        tar_archive: Path,
        db_dir: Path,
        backend_data: OpenSCMDBDataBackend | None = None,
        backend_index: OpenSCMDBIndexBackend | None = None,
    ) -> OpenSCMDB:
        """
        Initialise from a gzipped tar archive

        This also unpacks the files to disk.
        Only regular files are unpacked and they are all written
        directly into `db_dir`, whatever their location in the archive.

        Parameters
        ----------
        tar_archive
            Tar archive from which to initialise

        db_dir
            Directory in which to unpack the database

        backend_data
            Backend to use for handling the data

            If not supplied, it is guessed from the name of the first
            unpacked file that starts with neither "index" nor "filemap".

        backend_index
            Backend to use for handling the index

            If not supplied, it is guessed from the name of the first
            unpacked file that starts with "index".

        Returns
        -------
        :
            Initialised database

        Raises
        ------
        TypeError
            A backend was not supplied and could not be guessed
        """
        non_data_prefixes = ("index", "filemap")

        with tarfile.open(tar_archive, "r") as archive:
            for entry in archive.getmembers():
                if not entry.isreg():
                    # Only extract files
                    continue

                # Drop any leading directories so the file lands directly in db_dir
                entry.name = Path(entry.name).name
                archive.extract(entry, db_dir)

                extracted_name = entry.name
                if backend_index is None and extracted_name.startswith("index"):
                    backend_index = INDEX_BACKENDS.guess_backend(extracted_name)

                is_data_file = not extracted_name.startswith(non_data_prefixes)
                if backend_data is None and is_data_file:
                    backend_data = DATA_BACKENDS.guess_backend(extracted_name)

        if backend_data is None:  # pragma: no cover
            # Should be impossible to get here
            raise TypeError(backend_data)

        if backend_index is None:  # pragma: no cover
            # Should be impossible to get here
            raise TypeError(backend_index)

        return cls(  # ty: ignore[missing-argument]
            backend_data=backend_data, backend_index=backend_index, db_dir=db_dir
        )

    def get_new_data_file_path(self, file_id: int) -> DBPath:
        """
        Get the path in which to write a new data file

        The file is placed directly in `self.db_dir`.
        Its name is `file_id` followed by the data back-end's extension.

        Parameters
        ----------
        file_id
            ID to associate with the file

        Returns
        -------
        :
            Information about the path in which to write the new data

        Raises
        ------
        FileExistsError
            A file already exists for the given `file_id`
        """
        db_dir = self.db_dir
        new_path = db_dir / f"{file_id}{self.backend_data.ext}"

        # Refuse to hand out a path that would clobber an existing file
        if new_path.exists():
            raise FileExistsError(new_path)

        return DBPath.from_abs_path_and_db_dir(abs=new_path, db_dir=db_dir)

    def load(  # noqa: PLR0913
        self,
        selector: pd.Index[Any] | pd.MultiIndex | pix.selectors.Selector | None = None,
        *,
        index_file_lock: filelock.BaseFileLock | None = None,
        out_columns_type: type | None = None,
        out_columns_name: str | None = None,
        parallel_op_config: ParallelOpConfig | None = None,
        progress: bool = False,
        max_workers: int | None = None,
    ) -> pd.DataFrame:
        """
        Load data

        The index file lock is held while the file map and index are read
        and while the data itself is loaded.

        Parameters
        ----------
        selector
            Selector to use to choose the data to load

        index_file_lock
            Lock for the database's index file

            If not supplied, we use [self.index_file_lock][(c)].

        out_columns_type
            Type to set the output columns to.

            If not supplied, we don't set the output columns' type.

        out_columns_name
            The name for the columns in the output.

            If not supplied, we don't set the output columns' name.

            This can also be set with
            [pd.DataFrame.rename_axis][pandas.DataFrame.rename_axis]
            but we provide it here for convenience
            (and in case you couldn't find this trick for ages, like us).

        parallel_op_config
            Configuration for executing the operation in parallel with progress bars

            If not supplied, we use the values of `progress` and `max_workers`.

        progress
            Should progress bar(s) be used to display the progress of the loading?

            Only used if `parallel_op_config` is `None`.

        max_workers
            Maximum number of workers to use for parallel processing.

            If supplied, we create an instance of
            [concurrent.futures.ProcessPoolExecutor](https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ProcessPoolExecutor)
            with the provided number of workers.
            A process pool seems to be the sensible default from our experimentation,
            but it is not a universally better choice.
            If you need something else because of how your database is set up,
            simply pass `parallel_op_config`
            rather than using the shortcut of passing `max_workers`.

            If not supplied, the loading is executed serially.

            Only used if `parallel_op_config` is `None`.

        Returns
        -------
        :
            Loaded data

        Raises
        ------
        EmptyDBError
            The database is empty
        """
        if self.is_empty:
            raise EmptyDBError(self)

        lock = self.index_file_lock if index_file_lock is None else index_file_lock

        with lock:
            # The same lock is handed down, so the nested loads re-use it
            db_file_map = self.load_file_map(index_file_lock=lock)
            db_index = self.load_index(index_file_lock=lock)

            return load_data(
                backend_data=self.backend_data,
                db_index=db_index,
                db_file_map=db_file_map,
                db_dir=self.db_dir,
                selector=selector,
                out_columns_type=out_columns_type,
                out_columns_name=out_columns_name,
                parallel_op_config=parallel_op_config,
                progress=progress,
                max_workers=max_workers,
            )

    def load_file_map(
        self,
        *,
        index_file_lock: filelock.BaseFileLock | None = None,
    ) -> pd.Series[Path]:  # type: ignore # pandas type hints confused about what they support
        """
        Load the file map

        The file map is read from [self.file_map_file][(c)]
        using the index back-end, while holding the index file lock.

        Parameters
        ----------
        index_file_lock
            Lock for the database's index file

            If not supplied, we use [self.index_file_lock][(c)].

        Returns
        -------
        :
            Map from file ID to file path

        Raises
        ------
        EmptyDBError
            The database is empty
        """
        if self.is_empty:
            raise EmptyDBError(self)

        lock = self.index_file_lock if index_file_lock is None else index_file_lock

        with lock:
            return load_db_file_map(
                backend_index=self.backend_index, file_map_file=self.file_map_file
            )

    def load_index(
        self,
        *,
        index_file_lock: filelock.BaseFileLock | None = None,
    ) -> pd.DataFrame:
        """
        Load the index

        The index is read from [self.index_file][(c)]
        using the index back-end, while holding the index file lock.

        Parameters
        ----------
        index_file_lock
            Lock for the database's index file

            If not supplied, we use [self.index_file_lock][(c)].

        Returns
        -------
        :
            Database index

        Raises
        ------
        EmptyDBError
            The database is empty
        """
        if self.is_empty:
            raise EmptyDBError(self)

        lock = self.index_file_lock if index_file_lock is None else index_file_lock

        with lock:
            return load_db_index(
                backend_index=self.backend_index,
                index_file=self.index_file,
            )

    def load_metadata(
        self,
        *,
        index_file_lock: filelock.BaseFileLock | None = None,
    ) -> pd.MultiIndex:
        """
        Load the database's metadata

        The metadata is read from [self.index_file][(c)]
        using the index back-end, while holding the index file lock.

        Parameters
        ----------
        index_file_lock
            Lock for the database's index file

            If not supplied, we use [self.index_file_lock][(c)].

        Returns
        -------
        :
            Loaded metadata

        Raises
        ------
        EmptyDBError
            The database is empty
        """
        # Use the shared emptiness check, like `load_index` and `load_file_map`,
        # rather than repeating its definition here.
        if self.is_empty:
            raise EmptyDBError(self)

        if index_file_lock is None:
            index_file_lock = self.index_file_lock

        with index_file_lock:
            metadata = load_db_metadata(
                backend_index=self.backend_index, index_file=self.index_file
            )

        return metadata

    def save(  # noqa: PLR0913
        self,
        data: pd.DataFrame,
        *,
        index_file_lock: filelock.BaseFileLock | None = None,
        groupby: list[str] | None = None,
        allow_overwrite: bool = False,
        warn_on_partial_overwrite: bool = True,
        progress_grouping: ProgressLike | None = None,
        parallel_op_config_save: ParallelOpConfig | None = None,
        parallel_op_config_delete: ParallelOpConfig | None = None,
        parallel_op_config_rewrite: ParallelOpConfig | None = None,
        progress: bool = False,
        max_workers: int | None = None,
    ) -> None:
        """
        Save data into the database

        The index file lock is held for the whole operation.
        The steps are, in order: any re-writing of existing files,
        writing the new data (along with the updated index and file map)
        and, last of all, deleting files that are no longer needed.

        Parameters
        ----------
        data
            Data to add to the database

        index_file_lock
            Lock for the database's index file

            If not supplied, we use [self.index_file_lock][(c)].

        groupby
            Metadata columns to use to group the data.

            If not supplied, we save all the data in a single file.

        allow_overwrite
            Should overwrites of data that is already in the database be allowed?

            If this is `True`, there is a risk that, if interrupted halfway through,
            you can end up with duplicate data in your database
            or some other odd broken state.

        warn_on_partial_overwrite
            Should a warning be raised if a partial overwrite will occur?

            This is on by default so that users
            are warned about the slow operation of re-writing.

        progress_grouping
            Progress bar to use when grouping the data

            If not supplied, we use the values of `progress` and `max_workers`.

        parallel_op_config_save
            Parallel op configuration for executing save operations

            If not supplied, we use the values of `progress` and `max_workers`.

        parallel_op_config_delete
            Parallel op configuration for executing any needed delete operations

            If not supplied, we use the values of `progress` and `max_workers`.

        parallel_op_config_rewrite
            Parallel op configuration for executing any needed re-write operations

            If not supplied, we use the values of `progress` and `max_workers`.

        progress
            Should progress bar(s) be used to display the progress of the various steps?

            Only used if the corresponding `parallel_op_config_*` variable
            for the operation is `None`.

        max_workers
            Maximum number of workers to use for parallel processing.

            If supplied, we create instances of
            [concurrent.futures.Executor][]
            with the provided number of workers
            (the exact kind of executor depends on the operation).

            If not supplied, the operations are executed serially.

            Only used if the corresponding `parallel_op_config_*` variable
            for the operation is `None`.

        Raises
        ------
        TypeError
            `data.index` is not a [pd.MultiIndex][pandas.MultiIndex]

        ValueError
            `data` contains rows with the same metadata

        AlreadyInDBError
            `allow_overwrite` is `False`
            and some of `data` is already in the database

        Warns
        -----
        UserWarning
            Existing files have to be re-written
            and `warn_on_partial_overwrite` is `True`
        """
        if not isinstance(data.index, pd.MultiIndex):
            msg = (
                "`data.index` must be an instance of `pd.MultiIndex`. "
                f"Received {type(data.index)=}"
            )
            raise TypeError(msg)

        if data.index.duplicated().any():
            # keep=False so that every copy of a duplicated row is shown
            duplicate_rows = data.index.duplicated(keep=False)
            duplicates = data.loc[duplicate_rows, :]
            msg = (
                "`data` contains rows with the same metadata. "
                f"duplicates=\n{duplicates}"
            )

            raise ValueError(msg)

        if index_file_lock is None:
            index_file_lock = self.index_file_lock

        with index_file_lock:
            if self.is_empty:
                # Nothing in the database yet, so nothing to move or keep
                move_plan = None
                index_non_data = None
                file_map_non_data = None
                min_file_id = 0

            else:
                file_map_db = self.load_file_map(index_file_lock=index_file_lock)
                index_db = self.load_index(index_file_lock=index_file_lock)
                if not allow_overwrite:
                    data_index_unified, index_db_index_unified = (
                        unify_index_levels_check_index_types(data.index, index_db.index)
                    )
                    overwrite_required = multi_index_match(
                        data_index_unified, index_db_index_unified
                    )

                    if overwrite_required.any():
                        data_to_write_already_in_db = data.loc[overwrite_required, :]
                        raise AlreadyInDBError(
                            already_in_db=data_to_write_already_in_db
                        )

                move_plan = make_move_plan(
                    index_start=index_db,
                    file_map_start=file_map_db,
                    data_to_write=data,
                    get_new_data_file_path=self.get_new_data_file_path,
                    db_dir=self.db_dir,
                )

                # As needed, re-write files without deleting the old files
                if move_plan.rewrite_actions is not None:
                    if warn_on_partial_overwrite:
                        msg = (
                            "Overwriting the data will require re-writing. "
                            "This may be slow. "
                            "If that is an issue, the way to solve it "
                            "is to update your workflow to ensure "
                            "that you are not overwriting data "
                            "or are only overwriting entire files."
                        )
                        # stacklevel=2 so the warning points at the caller of `save`
                        # rather than at this line
                        warnings.warn(msg, stacklevel=2)

                    rewrite_files(
                        move_plan.rewrite_actions,
                        backend=self.backend_data,
                        parallel_op_config=parallel_op_config_rewrite,
                        progress=progress,
                        max_workers=max_workers,
                    )

                # Write the new data
                # New file IDs start after the largest ID seen so far,
                # whether that is in the existing file map or the moved one.
                current_largest_file_id = file_map_db.index.max()
                if not move_plan.moved_file_map.empty:
                    current_largest_file_id = max(
                        move_plan.moved_file_map.index.max(), current_largest_file_id
                    )

                index_non_data = move_plan.moved_index
                file_map_non_data = move_plan.moved_file_map
                min_file_id = current_largest_file_id + 1

            save_data(
                data,
                backend_data=self.backend_data,
                get_new_data_file_path=self.get_new_data_file_path,
                backend_index=self.backend_index,
                index_file=self.index_file,
                file_map_file=self.file_map_file,
                index_non_data=index_non_data,
                file_map_non_data=file_map_non_data,
                min_file_id=min_file_id,
                groupby=groupby,
                progress_grouping=progress_grouping,
                parallel_op_config=parallel_op_config_save,
                progress=progress,
                max_workers=max_workers,
            )

            # As needed, delete files.
            # We delete files last to minimise the risk of losing data
            # (might end up with double if we get interrupted here,
            # but that is better than zero).
            if move_plan is not None and move_plan.delete_paths is not None:
                delete_files(
                    files_to_delete=move_plan.delete_paths,
                    parallel_op_config=parallel_op_config_delete,
                    progress=progress,
                    max_workers=max_workers,
                )

    def to_gzipped_tar_archive(
        self,
        out_file: Path,
        mode: Literal["w:gz", "x:gz"] = "w:gz",
    ) -> Path:
        """
        Convert to a gzipped tar archive

        The database directory (`self.db_dir`) is added to the archive
        under the name `db`.

        Parameters
        ----------
        out_file
            File in which to write the output

        mode
            Mode to use to open `out_file`

        Returns
        -------
        :
            Path to the gzipped tar archive

            This is the same as `out_file`, but is returned for convenience.
        """
        with tarfile.open(out_file, mode=mode) as archive:
            archive.add(self.db_dir, arcname="db")

        return out_file

backend_data class-attribute instance-attribute #

backend_data: OpenSCMDBDataBackend = field(kw_only=True)

The backend for (de-)serialising data (from) to disk

backend_index class-attribute instance-attribute #

backend_index: OpenSCMDBIndexBackend = field(kw_only=True)

The backend for (de-)serialising the database index (from) to disk

db_dir class-attribute instance-attribute #

db_dir: Path = field(kw_only=True)

Path in which the database is stored

Both the index and the data files will be written in this directory.

file_map_file property #

file_map_file: Path

The file in which the file map is stored

The file map stores the mapping from file_id to file path.

Returns:

Type Description
Path

Path to the file map file

index_file property #

index_file: Path

The file in which the database's index is stored

Returns:

Type Description
Path

Path to the index file

index_file_lock class-attribute instance-attribute #

index_file_lock: BaseFileLock = field(kw_only=True)

Lock for the index file

index_file_lock_path property #

index_file_lock_path: Path

Path to the lock file for the back-end's index file

is_empty property #

is_empty: bool

Whether the database is empty or not

Returns:

Type Description
bool

True if the database is empty, False otherwise

create_reader #

create_reader(
    *,
    lock: bool | BaseFileLock | None = True,
    index_file_lock: BaseFileLock | None = None,
) -> OpenSCMDBReader

Create a database reader

Parameters:

Name Type Description Default
lock bool | BaseFileLock | None

Lock to give to the reader.

If True, we create a new lock for the database, such that, if the reader is holding the lock, no operations can be performed on the database.

If False, the reader is not given any lock.

True
index_file_lock BaseFileLock | None

Lock for the database's index file

Used while loading the index from disk.

If not supplied, we use self.index_file_lock.

None

Returns:

Type Description
OpenSCMDBReader

Database reader

Source code in src/pandas_openscm/db/openscm_db.py
def create_reader(
    self,
    *,
    lock: bool | filelock.BaseFileLock | None = True,
    index_file_lock: filelock.BaseFileLock | None = None,
) -> OpenSCMDBReader:
    """
    Create a database reader

    Parameters
    ----------
    lock
        Lock to give to the reader.

        If `True`, we create a new lock for the database, such that,
        if the reader is holding the lock,
        no operations can be performed on the database.

        If `False` or `None`, the reader is not given any lock.

        Any other value is passed to the reader as is.

    index_file_lock
        Lock for the database's index file

        Used while loading the index and the file map from disk.
        The lock is held across both loads
        so the reader's index and file map come from the same database state.

        If not supplied, we use [self.index_file_lock][(c)].

    Returns
    -------
    :
        Database reader

    Raises
    ------
    EmptyDBError
        The database is empty

    MissingOptionalDependencyError
        `lock` is `True` and `filelock` cannot be imported
    """
    if isinstance(lock, bool):
        if lock:
            try:
                import filelock  # noqa: PLC0415
            except ImportError as exc:
                raise MissingOptionalDependencyError(  # noqa: TRY003
                    "create_reader(..., lock=True, ...)", requirement="filelock"
                ) from exc

            # Create a new lock for the reader
            lock = filelock.FileLock(self.index_file_lock_path)

        else:
            # Convert to None
            lock = None

    # Fail before touching the lock if there is nothing to read
    if self.is_empty:
        raise EmptyDBError(self)

    if index_file_lock is None:
        index_file_lock = self.index_file_lock

    # Hold the lock across both loads.
    # Otherwise a save could happen between the two loads
    # and the reader would get an index that doesn't match its file map.
    # The loaders acquire the same lock instance again (nested acquisition).
    with index_file_lock:
        db_index = self.load_index(index_file_lock=index_file_lock)
        db_file_map = self.load_file_map(index_file_lock=index_file_lock)

    res = OpenSCMDBReader(
        backend_data=self.backend_data,
        db_dir=self.db_dir,
        db_index=db_index,
        db_file_map=db_file_map,
        lock=lock,
    )

    return res

default_index_file_lock #

default_index_file_lock() -> BaseFileLock

Get default lock for the back-end's index file

Source code in src/pandas_openscm/db/openscm_db.py
@index_file_lock.default  # ty: ignore[call-non-callable]
def default_index_file_lock(self) -> filelock.BaseFileLock:
    """
    Get default lock for the back-end's index file

    The lock is a `filelock.FileLock` on `self.index_file_lock_path`.
    `filelock` is imported inside this function rather than at module level.

    Raises
    ------
    MissingOptionalDependencyError
        `filelock` could not be imported
    """
    try:
        import filelock  # noqa: PLC0415
    except ImportError as exc:
        missing_dependency = MissingOptionalDependencyError(
            "default_index_file_lock", requirement="filelock"
        )
        raise missing_dependency from exc
    else:
        lock_path = self.index_file_lock_path
        return filelock.FileLock(lock_path)

delete #

delete(
    *,
    index_file_lock: BaseFileLock | None = None,
    parallel_op_config: ParallelOpConfig | None = None,
    progress: bool = False,
    max_workers: int | None = None,
) -> None

Delete all data in the database

Parameters:

Name Type Description Default
index_file_lock BaseFileLock | None

Lock for the database's index file

If not supplied, we use self.index_file_lock.

None
parallel_op_config ParallelOpConfig | None

Configuration for executing the operation in parallel with progress bars

If not supplied, we use the values of progress and max_workers.

None
progress bool

Should progress bar(s) be used to display the progress of the deletion?

Only used if parallel_op_config is None.

False
max_workers int | None

Maximum number of workers to use for parallel processing.

If supplied, we create an instance of concurrent.futures.ThreadPoolExecutor with the provided number of workers (a thread pool makes sense as deletion is I/O-bound).

If not supplied, the deletions are executed serially.

Only used if parallel_op_config is None.

None
Source code in src/pandas_openscm/db/openscm_db.py
def delete(
    self,
    *,
    index_file_lock: filelock.BaseFileLock | None = None,
    parallel_op_config: ParallelOpConfig | None = None,
    progress: bool = False,
    max_workers: int | None = None,
) -> None:
    """
    Delete all data in the database

    Every file in `self.db_dir` whose name ends with the data backend's
    or the index backend's extension is deleted,
    while the index file lock is held.

    Parameters
    ----------
    index_file_lock
        Lock for the database's index file

        If not supplied, we use [self.index_file_lock][(c)].

    parallel_op_config
        Configuration for executing the operation in parallel with progress bars

        If not supplied, we use the values of `progress` and `max_workers`.

    progress
        Should progress bar(s) be used to display the progress of the deletion?

        Only used if `parallel_op_config` is `None`.

    max_workers
        Maximum number of workers to use for parallel processing.

        If supplied, we create an instance of
        [concurrent.futures.ThreadPoolExecutor](https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ThreadPoolExecutor)
        with the provided number of workers
        (a thread pool makes sense as deletion is I/O-bound).

        If not supplied, the deletions are executed serially.

        Only used if `parallel_op_config` is `None`.
    """
    lock = self.index_file_lock if index_file_lock is None else index_file_lock

    with lock:
        # A set, so that a file matched by both patterns is only listed once
        doomed = set()
        for ext in (self.backend_data.ext, self.backend_index.ext):
            doomed.update(self.db_dir.glob(f"*{ext}"))

        delete_files(
            files_to_delete=doomed,
            parallel_op_config=parallel_op_config,
            progress=progress,
            max_workers=max_workers,
        )

from_gzipped_tar_archive classmethod #

from_gzipped_tar_archive(
    tar_archive: Path,
    db_dir: Path,
    backend_data: OpenSCMDBDataBackend | None = None,
    backend_index: OpenSCMDBIndexBackend | None = None,
) -> OpenSCMDB

Initialise from a gzipped tar archive

This also unpacks the files to disk

Parameters:

Name Type Description Default
tar_archive Path

Tar archive from which to initialise

required
db_dir Path

Directory in which to unpack the database

required
backend_data OpenSCMDBDataBackend | None

Backend to use for handling the data

None
backend_index OpenSCMDBIndexBackend | None

Backend to use for handling the index

None

Returns:

Type Description
OpenSCMDB

Initialised database

Source code in src/pandas_openscm/db/openscm_db.py
@classmethod
def from_gzipped_tar_archive(
    cls,
    tar_archive: Path,
    db_dir: Path,
    backend_data: OpenSCMDBDataBackend | None = None,
    backend_index: OpenSCMDBIndexBackend | None = None,
) -> OpenSCMDB:
    """
    Initialise from a gzipped tar archive

    This also unpacks the files to disk.
    Only regular files are extracted and each one is written
    directly into `db_dir` (any directory structure in the archive is dropped).

    Parameters
    ----------
    tar_archive
        Tar archive from which to initialise

    db_dir
        Directory in which to unpack the database

    backend_data
        Backend to use for handling the data

        If not supplied, it is guessed from the name of the first extracted file
        whose name does not start with "index" or "filemap".

    backend_index
        Backend to use for handling the index

        If not supplied, it is guessed from the name of the first extracted file
        whose name starts with "index".

    Returns
    -------
    :
        Initialised database

    Raises
    ------
    TypeError
        A backend was not supplied
        and could not be determined from the archive's contents
    """
    with tarfile.open(tar_archive, "r") as tar:
        for member in tar.getmembers():
            if not member.isreg():
                # Only extract files
                continue
            # Extract to the db_dir.
            # Keeping only the final path component means that
            # the file lands directly in `db_dir`,
            # whatever directory it had inside the archive.
            member.name = Path(member.name).name
            tar.extract(member, db_dir)
            if backend_index is None and member.name.startswith("index"):
                backend_index = INDEX_BACKENDS.guess_backend(member.name)

            if backend_data is None and not member.name.startswith(
                ("index", "filemap")
            ):
                backend_data = DATA_BACKENDS.guess_backend(member.name)

    if backend_data is None:  # pragma: no cover
        # Only reachable if no backend could be guessed from the archive
        msg = (
            "`backend_data` was not supplied and could not be determined "
            f"from the contents of {tar_archive}"
        )
        raise TypeError(msg)

    if backend_index is None:  # pragma: no cover
        # Only reachable if no backend could be guessed from the archive
        msg = (
            "`backend_index` was not supplied and could not be determined "
            f"from the contents of {tar_archive}"
        )
        raise TypeError(msg)

    res = cls(  # ty: ignore[missing-argument]
        backend_data=backend_data, backend_index=backend_index, db_dir=db_dir
    )

    return res

get_new_data_file_path #

get_new_data_file_path(file_id: int) -> DBPath

Get the path in which to write a new data file

Parameters:

Name Type Description Default
file_id int

ID to associate with the file

required

Returns:

Type Description
DBPath

Information about the path in which to write the new data

Raises:

Type Description
FileExistsError

A file already exists for the given file_id

Source code in src/pandas_openscm/db/openscm_db.py
def get_new_data_file_path(self, file_id: int) -> DBPath:
    """
    Get the path in which to write a new data file

    The file lives directly in `self.db_dir` and is named
    `file_id` followed by the data backend's extension.

    Parameters
    ----------
    file_id
        ID to associate with the file

    Returns
    -------
    :
        Information about the path in which to write the new data

    Raises
    ------
    FileExistsError
        A file already exists for the given `file_id`
    """
    candidate = self.db_dir / f"{file_id}{self.backend_data.ext}"

    if not candidate.exists():
        return DBPath.from_abs_path_and_db_dir(abs=candidate, db_dir=self.db_dir)

    # Never hand out a path that would clobber an existing file
    raise FileExistsError(candidate)

load #

load(
    selector: Index[Any]
    | MultiIndex
    | Selector
    | None = None,
    *,
    index_file_lock: BaseFileLock | None = None,
    out_columns_type: type | None = None,
    out_columns_name: str | None = None,
    parallel_op_config: ParallelOpConfig | None = None,
    progress: bool = False,
    max_workers: int | None = None,
) -> DataFrame

Load data

Parameters:

Name Type Description Default
selector Index[Any] | MultiIndex | Selector | None

Selector to use to choose the data to load

None
index_file_lock BaseFileLock | None

Lock for the database's index file

If not supplied, we use self.index_file_lock.

None
out_columns_type type | None

Type to set the output columns to.

If not supplied, we don't set the output columns' type.

None
out_columns_name str | None

The name for the columns in the output.

If not supplied, we don't set the output columns' name.

This can also be set with pd.DataFrame.rename_axis but we provide it here for convenience (and in case you couldn't find this trick for ages, like us).

None
parallel_op_config ParallelOpConfig | None

Configuration for executing the operation in parallel with progress bars

If not supplied, we use the values of progress and max_workers.

None
progress bool

Should progress bar(s) be used to display the progress of the loading?

Only used if parallel_op_config is None.

False
max_workers int | None

Maximum number of workers to use for parallel processing.

If supplied, we create an instance of concurrent.futures.ProcessPoolExecutor with the provided number of workers. A process pool seems to be the sensible default from our experimentation, but it is not a universally better choice. If you need something else because of how your database is set up, simply pass parallel_op_config rather than using the shortcut of passing max_workers.

If not supplied, the loading is executed serially.

Only used if parallel_op_config is None.

None

Returns:

Type Description
DataFrame

Loaded data

Raises:

Type Description
EmptyDBError

The database is empty

Source code in src/pandas_openscm/db/openscm_db.py
def load(  # noqa: PLR0913
    self,
    selector: pd.Index[Any] | pd.MultiIndex | pix.selectors.Selector | None = None,
    *,
    index_file_lock: filelock.BaseFileLock | None = None,
    out_columns_type: type | None = None,
    out_columns_name: str | None = None,
    parallel_op_config: ParallelOpConfig | None = None,
    progress: bool = False,
    max_workers: int | None = None,
) -> pd.DataFrame:
    """
    Load data

    Parameters
    ----------
    selector
        Selector to use to choose the data to load

    index_file_lock
        Lock for the database's index file

        The lock is held while the file map, the index and the data are loaded.

        If not supplied, we use [self.index_file_lock][(c)].

    out_columns_type
        Type to set the output columns to.

        If not supplied, we don't set the output columns' type.

    out_columns_name
        The name for the columns in the output.

        If not supplied, we don't set the output columns' name.

        This can also be set with
        [pd.DataFrame.rename_axis][pandas.DataFrame.rename_axis]
        but we provide it here for convenience
        (and in case you couldn't find this trick for ages, like us).

    parallel_op_config
        Configuration for executing the operation in parallel with progress bars

        If not supplied, we use the values of `progress` and `max_workers`.

    progress
        Should progress bar(s) be used to display the progress of the loading?

        Only used if `parallel_op_config` is `None`.

    max_workers
        Maximum number of workers to use for parallel processing.

        If supplied, we create an instance of
        [concurrent.futures.ProcessPoolExecutor](https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ProcessPoolExecutor)
        with the provided number of workers.
        A process pool seems to be the sensible default from our experimentation,
        but it is not a universally better choice.
        If you need something else because of how your database is set up,
        simply pass `parallel_op_config`
        rather than using the shortcut of passing `max_workers`.

        If not supplied, the loading is executed serially.

        Only used if `parallel_op_config` is `None`.

    Returns
    -------
    :
        Loaded data

    Raises
    ------
    EmptyDBError
        The database is empty
    """
    if self.is_empty:
        raise EmptyDBError(self)

    if index_file_lock is None:
        index_file_lock = self.index_file_lock

    # Keep hold of the lock for the whole load,
    # so the file map, index and data are all read under the same lock.
    with index_file_lock:
        file_map = self.load_file_map(index_file_lock=index_file_lock)
        index = self.load_index(index_file_lock=index_file_lock)

        res = load_data(
            backend_data=self.backend_data,
            db_index=index,
            db_file_map=file_map,
            db_dir=self.db_dir,
            selector=selector,
            out_columns_type=out_columns_type,
            out_columns_name=out_columns_name,
            parallel_op_config=parallel_op_config,
            progress=progress,
            max_workers=max_workers,
        )

    return res

load_file_map #

load_file_map(
    *, index_file_lock: BaseFileLock | None = None
) -> Series[Path]

Load the file map

Parameters:

Name Type Description Default
index_file_lock BaseFileLock | None

Lock for the database's index file

If not supplied, we use self.index_file_lock.

None

Returns:

Type Description
Series[Path]

Map from file ID to file path

Raises:

Type Description
EmptyDBError

The database is empty

Source code in src/pandas_openscm/db/openscm_db.py
def load_file_map(
    self,
    *,
    index_file_lock: filelock.BaseFileLock | None = None,
) -> pd.Series[Path]:  # type: ignore # pandas type hints confused about what they support
    """
    Load the file map

    The file map is read from `self.file_map_file`
    using the index backend, while the index file lock is held.

    Parameters
    ----------
    index_file_lock
        Lock for the database's index file

        If not supplied, we use [self.index_file_lock][(c)].

    Returns
    -------
    :
        Map from file ID to file path

    Raises
    ------
    EmptyDBError
        The database is empty
    """
    if self.is_empty:
        raise EmptyDBError(self)

    lock = self.index_file_lock if index_file_lock is None else index_file_lock

    with lock:
        return load_db_file_map(
            backend_index=self.backend_index, file_map_file=self.file_map_file
        )

load_index #

load_index(
    *, index_file_lock: BaseFileLock | None = None
) -> DataFrame

Load the index

Parameters:

Name Type Description Default
index_file_lock BaseFileLock | None

Lock for the database's index file

If not supplied, we use self.index_file_lock.

None

Returns:

Type Description
DataFrame

Database index

Raises:

Type Description
EmptyDBError

The database is empty

Source code in src/pandas_openscm/db/openscm_db.py
def load_index(
    self,
    *,
    index_file_lock: filelock.BaseFileLock | None = None,
) -> pd.DataFrame:
    """
    Load the index

    The index is read from `self.index_file`
    using the index backend, while the index file lock is held.

    Parameters
    ----------
    index_file_lock
        Lock for the database's index file

        If not supplied, we use [self.index_file_lock][(c)].

    Returns
    -------
    :
        Database index

    Raises
    ------
    EmptyDBError
        The database is empty
    """
    if self.is_empty:
        raise EmptyDBError(self)

    lock = self.index_file_lock if index_file_lock is None else index_file_lock

    with lock:
        return load_db_index(
            backend_index=self.backend_index,
            index_file=self.index_file,
        )

load_metadata #

load_metadata(
    *, index_file_lock: BaseFileLock | None = None
) -> MultiIndex

Load the database's metadata

Parameters:

Name Type Description Default
index_file_lock BaseFileLock | None

Lock for the database's index file

If not supplied, we use self.index_file_lock.

None

Returns:

Type Description
MultiIndex

Loaded metadata

Source code in src/pandas_openscm/db/openscm_db.py
def load_metadata(
    self,
    *,
    index_file_lock: filelock.BaseFileLock | None = None,
) -> pd.MultiIndex:
    """
    Load the database's metadata

    The metadata is read from `self.index_file`
    using the index backend, while the index file lock is held.

    Parameters
    ----------
    index_file_lock
        Lock for the database's index file

        If not supplied, we use [self.index_file_lock][(c)].

    Returns
    -------
    :
        Loaded metadata

    Raises
    ------
    EmptyDBError
        The database's index file does not exist
    """
    # There is nothing to load if the index file has not been written yet
    if not self.index_file.exists():
        raise EmptyDBError(self)

    if index_file_lock is None:
        index_file_lock = self.index_file_lock

    with index_file_lock:
        metadata = load_db_metadata(
            backend_index=self.backend_index, index_file=self.index_file
        )

    return metadata

save #

save(
    data: DataFrame,
    *,
    index_file_lock: BaseFileLock | None = None,
    groupby: list[str] | None = None,
    allow_overwrite: bool = False,
    warn_on_partial_overwrite: bool = True,
    progress_grouping: ProgressLike | None = None,
    parallel_op_config_save: ParallelOpConfig | None = None,
    parallel_op_config_delete: ParallelOpConfig
    | None = None,
    parallel_op_config_rewrite: ParallelOpConfig
    | None = None,
    progress: bool = False,
    max_workers: int | None = None,
) -> None

Save data into the database

Parameters:

Name Type Description Default
data DataFrame

Data to add to the database

required
index_file_lock BaseFileLock | None

Lock for the database's index file

If not supplied, we use self.index_file_lock.

None
groupby list[str] | None

Metadata columns to use to group the data.

If not supplied, we save all the data in a single file.

None
allow_overwrite bool

Should overwrites of data that is already in the database be allowed?

If this is True, there is a risk that, if interrupted halfway through, you can end up with duplicate data in your database or some other odd broken state.

False
warn_on_partial_overwrite bool

Should a warning be raised if a partial overwrite will occur?

This is on by default so that users are warned about the slow operation of re-writing.

True
progress_grouping ProgressLike | None

Progress bar to use when grouping the data

If not supplied, we use the values of progress and max_workers.

None
parallel_op_config_save ParallelOpConfig | None

Parallel op configuration for executing save operations

If not supplied, we use the values of progress and max_workers.

None
parallel_op_config_delete ParallelOpConfig | None

Parallel op configuration for executing any needed delete operations

If not supplied, we use the values of progress and max_workers.

None
parallel_op_config_rewrite ParallelOpConfig | None

Parallel op configuration for executing any needed re-write operations

If not supplied, we use the values of progress and max_workers.

None
progress bool

Should progress bar(s) be used to display the progress of the various steps?

Only used if the corresponding parallel_op_config_* variable for the operation is None.

False
max_workers int | None

Maximum number of workers to use for parallel processing.

If supplied, we create instances of concurrent.futures.Executor with the provided number of workers (the exact kind of executor depends on the operation).

If not supplied, the operations are executed serially.

Only used if the corresponding parallel_op_config_* variable for the operation is None.

None
Source code in src/pandas_openscm/db/openscm_db.py
def save(  # noqa: PLR0913
    self,
    data: pd.DataFrame,
    *,
    index_file_lock: filelock.BaseFileLock | None = None,
    groupby: list[str] | None = None,
    allow_overwrite: bool = False,
    warn_on_partial_overwrite: bool = True,
    progress_grouping: ProgressLike | None = None,
    parallel_op_config_save: ParallelOpConfig | None = None,
    parallel_op_config_delete: ParallelOpConfig | None = None,
    parallel_op_config_rewrite: ParallelOpConfig | None = None,
    progress: bool = False,
    max_workers: int | None = None,
) -> None:
    """
    Save data into the database

    Parameters
    ----------
    data
        Data to add to the database

        Its index must be a `pd.MultiIndex` without duplicate entries.

    index_file_lock
        Lock for the database's index file

        If not supplied, we use [self.index_file_lock][(c)].

    groupby
        Metadata columns to use to group the data.

        If not supplied, we save all the data in a single file.

    allow_overwrite
        Should overwrites of data that is already in the database be allowed?

        If this is `True`, there is a risk that, if interrupted halfway through,
        you can end up with duplicate data in your database
        or some other odd broken state.

    warn_on_partial_overwrite
        Should a warning be raised if a partial overwrite will occur?

        This is on by default so that users
        are warned about the slow operation of re-writing.

    progress_grouping
        Progress bar to use when grouping the data

        If not supplied, we use the values of `progress` and `max_workers`.

    parallel_op_config_save
        Parallel op configuration for executing save operations

        If not supplied, we use the values of `progress` and `max_workers`.

    parallel_op_config_delete
        Parallel op configuration for executing any needed delete operations

        If not supplied, we use the values of `progress` and `max_workers`.

    parallel_op_config_rewrite
        Parallel op configuration for executing any needed re-write operations

        If not supplied, we use the values of `progress` and `max_workers`.

    progress
        Should progress bar(s) be used to display the progress of the various steps?

        Only used if the corresponding `parallel_op_config_*` variable
        for the operation is `None`.

    max_workers
        Maximum number of workers to use for parallel processing.

        If supplied, we create instances of
        [concurrent.futures.Executor][]
        with the provided number of workers
        (the exact kind of executor depends on the operation).

        If not supplied, the operations are executed serially.

        Only used if the corresponding `parallel_op_config_*` variable
        for the operation is `None`.

    Raises
    ------
    TypeError
        `data.index` is not a `pd.MultiIndex`

    ValueError
        `data` contains rows with the same metadata

    AlreadyInDBError
        `allow_overwrite` is `False`
        and some of `data` is already in the database

    Warns
    -----
    UserWarning
        Saving requires re-writing existing files
        and `warn_on_partial_overwrite` is `True`
    """
    if not isinstance(data.index, pd.MultiIndex):
        msg = (
            "`data.index` must be an instance of `pd.MultiIndex`. "
            f"Received {type(data.index)=}"
        )
        raise TypeError(msg)

    if data.index.duplicated().any():
        # Show every row involved in a duplicate, not just the repeats
        duplicate_rows = data.index.duplicated(keep=False)
        duplicates = data.loc[duplicate_rows, :]
        msg = (
            "`data` contains rows with the same metadata. "
            f"duplicates=\n{duplicates}"
        )

        raise ValueError(msg)

    if index_file_lock is None:
        index_file_lock = self.index_file_lock

    with index_file_lock:
        if self.is_empty:
            # Nothing in the database yet, so there is nothing to move
            # and file IDs start from zero
            move_plan = None
            index_non_data = None
            file_map_non_data = None
            min_file_id = 0

        else:
            file_map_db = self.load_file_map(index_file_lock=index_file_lock)
            index_db = self.load_index(index_file_lock=index_file_lock)
            if not allow_overwrite:
                data_index_unified, index_db_index_unified = (
                    unify_index_levels_check_index_types(data.index, index_db.index)
                )
                overwrite_required = multi_index_match(
                    data_index_unified, index_db_index_unified
                )

                if overwrite_required.any():
                    data_to_write_already_in_db = data.loc[overwrite_required, :]
                    raise AlreadyInDBError(
                        already_in_db=data_to_write_already_in_db
                    )

            move_plan = make_move_plan(
                index_start=index_db,
                file_map_start=file_map_db,
                data_to_write=data,
                get_new_data_file_path=self.get_new_data_file_path,
                db_dir=self.db_dir,
            )

            # As needed, re-write files without deleting the old files
            if move_plan.rewrite_actions is not None:
                if warn_on_partial_overwrite:
                    msg = (
                        "Overwriting the data will require re-writing. "
                        "This may be slow. "
                        "If that is an issue, the way to solve it "
                        "is to update your workflow to ensure "
                        "that you are not overwriting data "
                        "or are only overwriting entire files."
                    )
                    # stacklevel=2 so the warning is attributed
                    # to the code that called `save`, not to this line
                    warnings.warn(msg, stacklevel=2)

                rewrite_files(
                    move_plan.rewrite_actions,
                    backend=self.backend_data,
                    parallel_op_config=parallel_op_config_rewrite,
                    progress=progress,
                    max_workers=max_workers,
                )

            # Write the new data
            # New file IDs must be larger than any ID already in use,
            # including IDs introduced by the move plan.
            current_largest_file_id = file_map_db.index.max()
            if not move_plan.moved_file_map.empty:
                current_largest_file_id = max(
                    move_plan.moved_file_map.index.max(), current_largest_file_id
                )

            index_non_data = move_plan.moved_index
            file_map_non_data = move_plan.moved_file_map
            min_file_id = current_largest_file_id + 1

        save_data(
            data,
            backend_data=self.backend_data,
            get_new_data_file_path=self.get_new_data_file_path,
            backend_index=self.backend_index,
            index_file=self.index_file,
            file_map_file=self.file_map_file,
            index_non_data=index_non_data,
            file_map_non_data=file_map_non_data,
            min_file_id=min_file_id,
            groupby=groupby,
            progress_grouping=progress_grouping,
            parallel_op_config=parallel_op_config_save,
            progress=progress,
            max_workers=max_workers,
        )

        # As needed, delete files.
        # We delete files last to minimise the risk of losing data
        # (might end up with double if we get interrupted here,
        # but that is better than zero).
        if move_plan is not None and move_plan.delete_paths is not None:
            delete_files(
                files_to_delete=move_plan.delete_paths,
                parallel_op_config=parallel_op_config_delete,
                progress=progress,
                max_workers=max_workers,
            )

to_gzipped_tar_archive #

to_gzipped_tar_archive(
    out_file: Path, mode: Literal["w:gz", "x:gz"] = "w:gz"
) -> Path

Convert to a gzipped tar archive

Parameters:

Name Type Description Default
out_file Path

File in which to write the output

required
mode Literal['w:gz', 'x:gz']

Mode to use to open out_file

'w:gz'

Returns:

Type Description
Path

Path to the gzipped tar archive

This is the same as out_file, but is returned for convenience.

Source code in src/pandas_openscm/db/openscm_db.py
def to_gzipped_tar_archive(
    self,
    out_file: Path,
    mode: Literal["w:gz", "x:gz"] = "w:gz",
) -> Path:
    """
    Convert to a gzipped tar archive

    The whole of `self.db_dir` is added to the archive
    under the name "db".

    Parameters
    ----------
    out_file
        File in which to write the output

    mode
        Mode to use to open `out_file`

    Returns
    -------
    :
        Path to the gzipped tar archive

        This is the same as `out_file`, but is returned for convenience.
    """
    archive = tarfile.open(out_file, mode=mode)
    with archive:
        archive.add(self.db_dir, arcname="db")

    return out_file

OpenSCMDBDataBackend #

Bases: Protocol

Backend for (de-)serialising data

Designed to be used with OpenSCMDB

Methods:

Name Description
load_data

Load a data file

save_data

Save data to disk

Attributes:

Name Type Description
ext str

Extension to use with data files saved by this backend.

preserves_index bool

Whether this backend preserves the index of data upon (de-)serialisation

Source code in src/pandas_openscm/db/interfaces.py
@runtime_checkable
class OpenSCMDBDataBackend(Protocol):
    """
    Backend for (de-)serialising data

    Designed to be used with [OpenSCMDB][(m)]
    """

    ext: str
    """
    Extension to use with data files saved by this backend.
    """

    preserves_index: bool
    """
    Whether this backend preserves the index of data upon (de-)serialisation
    """

    @staticmethod
    def load_data(data_file: Path) -> pd.DataFrame:
        """
        Load a data file

        This is a low-level method
        that just handles the specifics of loading the data from disk.
        Working out the path from which to load the data
        should happen in higher-level functions.

        Parameters
        ----------
        data_file
            File from which to load the data

        Returns
        -------
        :
            Loaded data

        Notes
        -----
        This just loads the data directly from disk.
        If the data had a `pd.MultiIndex` when it was saved,
        this may or may not be restored.
        It is up to the user
        to decide whether to do any `pd.MultiIndex` restoration or not,
        based on their use case and the value of `self.preserves_index`.
        We do not make this choice as converting back to a
        `pd.MultiIndex` can be a very expensive operation,
        and we want to give the user control over any such optimisations.
        """

    @staticmethod
    def save_data(
        data: pd.DataFrame,
        data_file: Path,
    ) -> None:
        """
        Save data to disk

        This is a low-level method
        that just handles the specifics of serialising the data to disk.
        Working out what to save and in what path
        should happen in higher-level functions.

        Parameters
        ----------
        data
            Data to save

        data_file
            File in which to save the data
        """

ext instance-attribute #

ext: str

Extension to use with data files saved by this backend.

preserves_index instance-attribute #

preserves_index: bool

Whether this backend preserves the index of data upon (de-)serialisation

load_data staticmethod #

load_data(data_file: Path) -> DataFrame

Load a data file

This is a low-level method that just handles the specifics of loading the data from disk. Working out the path from which to load the data should happen in higher-level functions.

Parameters:

Name Type Description Default
data_file Path

File from which to load the data

required

Returns:

Type Description
DataFrame

Loaded data

Notes

This just loads the data directly from disk. If the data had a pd.MultiIndex when it was saved, this may or may not be restored. It is up to the user to decide whether to do any pd.MultiIndex restoration or not, based on their use case and the value of self.preserves_index. We do not make this choice as converting back to a pd.MultiIndex can be a very expensive operation, and we want to give the user control over any such optimisations.

Source code in src/pandas_openscm/db/interfaces.py
@staticmethod
def load_data(data_file: Path) -> pd.DataFrame:
    """
    Load a data file

    This is a low-level method
    that just handles the specifics of loading the data from disk.
    Working out the path from which to load the data
    should happen in higher-level functions.

    Parameters
    ----------
    data_file
        File from which to load the data

    Returns
    -------
    :
        Loaded data

    Notes
    -----
    This just loads the data directly from disk.
    If the data had a `pd.MultiIndex` when it was saved,
    this may or not be restored.
    It is up to the user
    to decide whether to do any `pd.MultiIndex` restoration or not,
    based on their use case and the value of `self.preserves_index`.
    We do not make this choice as converting back to a
    `pd.MultiIndex` can be a very expensive operation,
    and we want to give the user control over any such optimisations.
    """

save_data staticmethod #

save_data(data: DataFrame, data_file: Path) -> None

Save data to disk

This is a low-level method that just handles the specifics of serialising the data to disk. Working out what to save and in what path should happen in higher-level functions.

Parameters:

Name Type Description Default
data DataFrame

Data to save

required
data_file Path

File in which to save the data

required
Source code in src/pandas_openscm/db/interfaces.py
@staticmethod
def save_data(
    data: pd.DataFrame,
    data_file: Path,
) -> None:
    """
    Save data to disk

    This is a low-level method
    that just handles the specifics of serialising the data to disk.
    Working out what to save and the path in which to save it
    should happen in higher-level functions.

    Parameters
    ----------
    data
        Data to save

    data_file
        File in which to save the data

    Returns
    -------
    :
        Nothing, the data is written to `data_file`
    """

OpenSCMDBIndexBackend #

Bases: Protocol

Backend for (de-)serialising the index (and file map)

Designed to be used with OpenSCMDB

Methods:

Name Description
load_file_map

Load the file map

load_index

Load the index

save_file_map

Save the file map to disk

save_index

Save the index to disk

Attributes:

Name Type Description
ext str

Extension to use with index files saved by this backend.

preserves_index bool

Whether this backend preserves the pd.MultiIndex upon (de-)serialisation

Source code in src/pandas_openscm/db/interfaces.py
@runtime_checkable
class OpenSCMDBIndexBackend(Protocol):
    """
    Backend for (de-)serialising the index (and file map)

    Designed to be used with [OpenSCMDB][(m)]

    The loading methods are declared as static methods,
    whereas the saving methods are declared as instance methods.
    """

    ext: str
    """
    Extension to use with index files saved by this backend.
    """

    preserves_index: bool
    """
    Whether this backend preserves the `pd.MultiIndex` upon (de-)serialisation
    """

    @staticmethod
    def load_file_map(file_map_file: Path) -> pd.DataFrame:
        """
        Load the file map

        This is a low-level method
        that just handles the specifics of loading the file map from disk.
        Working out the path from which to load the file map
        should happen in higher-level functions.

        Parameters
        ----------
        file_map_file
            File from which to load the file map

        Returns
        -------
        :
            Loaded file map

        Notes
        -----
        This returns a [pd.DataFrame][pandas.DataFrame].
        It is up to the user to cast this to a [pd.Series][pandas.Series]
        if they wish.
        """

    @staticmethod
    def load_index(index_file: Path) -> pd.DataFrame:
        """
        Load the index

        This is a low-level method
        that just handles the specifics of loading the index from disk.
        Working out the path from which to load the index
        should happen in higher-level functions.

        Parameters
        ----------
        index_file
            File from which to load the index

        Returns
        -------
        :
            Loaded index

        Notes
        -----
        This just loads the index directly from disk.
        If the index had a `pd.MultiIndex` when it was saved,
        this may or may not be restored.
        It is up to the user
        to decide whether to do any `pd.MultiIndex` restoration or not,
        based on their use case and the value of `self.preserves_index`.
        We do not make this choice as converting back to a
        `pd.MultiIndex` can be a very expensive operation,
        and we want to give the user control over any such optimisations.
        """

    def save_file_map(
        self,
        file_map: pd.Series[Path],  # type: ignore # pandas confused about what it supports
        file_map_file: Path,
    ) -> None:
        """
        Save the file map to disk

        This is a low-level method
        that just handles the specifics of serialising the file map to disk.
        Working out what to save and the path in which to save it
        should happen in higher-level functions.

        Parameters
        ----------
        file_map
            File map to save

        file_map_file
            File in which to save the file map
        """

    def save_index(
        self,
        index: pd.DataFrame,
        index_file: Path,
    ) -> None:
        """
        Save the index to disk

        This is a low-level method
        that just handles the specifics of serialising the index to disk.
        Working out what to save and the path in which to save it
        should happen in higher-level functions.

        Parameters
        ----------
        index
            Index to save

        index_file
            File in which to save the index
        """

ext instance-attribute #

ext: str

Extension to use with index files saved by this backend.

preserves_index instance-attribute #

preserves_index: bool

Whether this backend preserves the pd.MultiIndex upon (de-)serialisation

load_file_map staticmethod #

load_file_map(file_map_file: Path) -> DataFrame

Load the file map

This is a low-level method that just handles the specifics of loading the file map from disk. Working out the path from which to load the file map should happen in higher-level functions.

Parameters:

Name Type Description Default
file_map_file Path

File from which to load the file map

required

Returns:

Type Description
DataFrame

Loaded file map

Notes

This returns a pd.DataFrame. It is up to the user to cast this to a pd.Series if they wish.

Source code in src/pandas_openscm/db/interfaces.py
@staticmethod
def load_file_map(file_map_file: Path) -> pd.DataFrame:
    """
    Load the file map

    This is a low-level method
    that just handles the specifics of loading the file map from disk.
    Working out the path from which to load the file map
    should happen in higher-level functions.

    Parameters
    ----------
    file_map_file
        File from which to load the file map

    Returns
    -------
    :
        Loaded file map

    Notes
    -----
    This returns a [pd.DataFrame][pandas.DataFrame].
    It is up to the user to cast this to a [pd.Series][pandas.Series]
    if they wish.
    """

load_index staticmethod #

load_index(index_file: Path) -> DataFrame

Load the index

This is a low-level method that just handles the specifics of loading the index from disk. Working out the path from which to load the index should happen in higher-level functions.

Parameters:

Name Type Description Default
index_file Path

File from which to load the index

required

Returns:

Type Description
DataFrame

Loaded index

Notes

This just loads the index directly from disk. If the index had a pd.MultiIndex when it was saved, this may or may not be restored. It is up to the user to decide whether to do any pd.MultiIndex restoration or not, based on their use case and the value of self.preserves_index. We do not make this choice as converting back to a pd.MultiIndex can be a very expensive operation, and we want to give the user control over any such optimisations.

Source code in src/pandas_openscm/db/interfaces.py
@staticmethod
def load_index(index_file: Path) -> pd.DataFrame:
    """
    Load the index

    This is a low-level method
    that just handles the specifics of loading the index from disk.
    Working out the path from which to load the index
    should happen in higher-level functions.

    Parameters
    ----------
    index_file
        File from which to load the index

    Returns
    -------
    :
        Loaded index

    Notes
    -----
    This just loads the index directly from disk.
    If the index had a `pd.MultiIndex` when it was saved,
    this may or may not be restored.
    It is up to the user
    to decide whether to do any `pd.MultiIndex` restoration or not,
    based on their use case and the value of `self.preserves_index`.
    We do not make this choice as converting back to a
    `pd.MultiIndex` can be a very expensive operation,
    and we want to give the user control over any such optimisations.
    """

save_file_map #

save_file_map(
    file_map: Series[Path], file_map_file: Path
) -> None

Save the file map to disk

This is a low-level method that just handles the specifics of serialising the file map to disk. Working out what to save and in what path should happen in higher-level functions.

Parameters:

Name Type Description Default
file_map Series[Path]

File map to save

required
file_map_file Path

File in which to save the file map

required
Source code in src/pandas_openscm/db/interfaces.py
def save_file_map(
    self,
    file_map: pd.Series[Path],  # type: ignore # pandas confused about what it supports
    file_map_file: Path,
) -> None:
    """
    Save the file map to disk

    This is a low-level method
    that just handles the specifics of serialising the file map to disk.
    Working out what to save and the path in which to save it
    should happen in higher-level functions.

    Parameters
    ----------
    file_map
        File map to save

    file_map_file
        File in which to save the file map
    """

save_index #

save_index(index: DataFrame, index_file: Path) -> None

Save the index to disk

This is a low-level method that just handles the specifics of serialising the index to disk. Working out what to save and in what path should happen in higher-level functions.

Parameters:

Name Type Description Default
index DataFrame

Index to save

required
index_file Path

File in which to save the index

required
Source code in src/pandas_openscm/db/interfaces.py
def save_index(
    self,
    index: pd.DataFrame,
    index_file: Path,
) -> None:
    """
    Save the index to disk

    This is a low-level method
    that just handles the specifics of serialising the index to disk.
    Working out what to save and the path in which to save it
    should happen in higher-level functions.

    Parameters
    ----------
    index
        Index to save

    index_file
        File in which to save the index
    """

netCDFDataBackend #

netCDF data backend

Methods:

Name Description
load_data

Load a data file

save_data

Save data to disk

Attributes:

Name Type Description
ext str

Extension to use with files saved by this backend.

preserves_index Literal[True]

Whether this backend preserves the index of data upon (de-)serialisation

timeseries_dim str

Name of the timeseries dimension in serialised output

Source code in src/pandas_openscm/db/netcdf.py
@define
class netCDFDataBackend:
    """
    Data backend which (de-)serialises timeseries data as netCDF using xarray
    """

    ext: str = ".nc"
    """
    Extension to use with files saved by this backend.
    """

    timeseries_dim: str = "ts_id"
    """
    Name of the timeseries dimension in serialised output
    """

    @property
    def preserves_index(self) -> Literal[True]:
        """
        Whether the index of the data survives a save/load round trip

        Always `True` for this backend.
        """
        return True

    def load_data(self, data_file: Path) -> pd.DataFrame:
        """
        Read a data file written by this backend

        Parameters
        ----------
        data_file
            Path of the netCDF file to read

        Returns
        -------
        :
            Data held in the file, indexed by its metadata columns

        Raises
        ------
        MissingOptionalDependencyError
            `xarray` cannot be imported
        """
        try:
            import xarray as xr  # noqa: PLC0415
        except ImportError as exc:
            raise MissingOptionalDependencyError(
                "netCDFBackend.load_data", requirement="xarray"
            ) from exc

        dataset = xr.load_dataset(data_file)

        values: pd.DataFrame = dataset["values"].to_pandas()  # type: ignore
        metadata = metadata_xr_to_df(dataset)

        # Pick out the metadata rows that belong to the timeseries IDs in the file
        timeseries_ids = dataset[self.timeseries_dim].values
        metadata_for_values = metadata.loc[timeseries_ids]

        # Put metadata and values side by side,
        # then turn the metadata columns back into the index
        combined = pd.concat([metadata_for_values, values], axis="columns")

        return combined.set_index(metadata.columns.to_list())

    def save_data(self, data: pd.DataFrame, data_file: Path) -> None:
        """
        Write data to a netCDF file

        Parameters
        ----------
        data
            Data to write

        data_file
            Path of the file to write

        Raises
        ------
        MissingOptionalDependencyError
            `xarray` cannot be imported
        """
        try:
            import xarray as xr  # noqa: PLC0415
        except ImportError as exc:
            raise MissingOptionalDependencyError(
                "netCDFBackend.save_data", requirement="xarray"
            ) from exc

        # After `reset_index`, the index levels are ordinary columns
        # and the new default index gives each timeseries a unique ID
        flattened = data.reset_index()
        ts_coords = {self.timeseries_dim: flattened.index.values}

        # Unnamed columns fall back to "time" as the name of the time dimension
        columns_name = data.columns.name
        time_dim = "time" if columns_name is None else str(columns_name)
        time_coords = {time_dim: data.columns.values}

        metadata_xr = metadata_df_to_xr(
            flattened[data.index.names],
            timeseries_id_coord=xr.Coordinates(ts_coords),
            timeseries_dim=self.timeseries_dim,
        )
        values_xr = xr.DataArray(
            data,
            dims=[self.timeseries_dim, time_dim],
            coords=xr.Coordinates(ts_coords | time_coords),
        )

        # The values are stored under the name "values" next to the metadata
        values_ds = values_xr.to_dataset(name="values")
        xr.merge([metadata_xr, values_ds]).to_netcdf(data_file)

ext class-attribute instance-attribute #

ext: str = '.nc'

Extension to use with files saved by this backend.

preserves_index property #

preserves_index: Literal[True]

Whether this backend preserves the index of data upon (de-)serialisation

timeseries_dim class-attribute instance-attribute #

timeseries_dim: str = 'ts_id'

Name of the timeseries dimension in serialised output

load_data #

load_data(data_file: Path) -> DataFrame

Load a data file

Parameters:

Name Type Description Default
data_file Path

File from which to load the data

required

Returns:

Type Description
DataFrame

Loaded data

Source code in src/pandas_openscm/db/netcdf.py
def load_data(self, data_file: Path) -> pd.DataFrame:
    """
    Read a data file written by this backend

    Parameters
    ----------
    data_file
        Path of the netCDF file to read

    Returns
    -------
    :
        Data held in the file, indexed by its metadata columns

    Raises
    ------
    MissingOptionalDependencyError
        `xarray` cannot be imported
    """
    try:
        import xarray as xr  # noqa: PLC0415
    except ImportError as exc:
        raise MissingOptionalDependencyError(
            "netCDFBackend.load_data", requirement="xarray"
        ) from exc

    dataset = xr.load_dataset(data_file)

    values: pd.DataFrame = dataset["values"].to_pandas()  # type: ignore
    metadata = metadata_xr_to_df(dataset)

    # Pick out the metadata rows that belong to the timeseries IDs in the file
    timeseries_ids = dataset[self.timeseries_dim].values
    metadata_for_values = metadata.loc[timeseries_ids]

    # Put metadata and values side by side,
    # then turn the metadata columns back into the index
    combined = pd.concat([metadata_for_values, values], axis="columns")

    return combined.set_index(metadata.columns.to_list())

save_data #

save_data(data: DataFrame, data_file: Path) -> None

Save data to disk

Parameters:

Name Type Description Default
data DataFrame

Data to save

required
data_file Path

File in which to save the data

required
Source code in src/pandas_openscm/db/netcdf.py
def save_data(self, data: pd.DataFrame, data_file: Path) -> None:
    """
    Write data to a netCDF file

    Parameters
    ----------
    data
        Data to write

    data_file
        Path of the file to write

    Raises
    ------
    MissingOptionalDependencyError
        `xarray` cannot be imported
    """
    try:
        import xarray as xr  # noqa: PLC0415
    except ImportError as exc:
        raise MissingOptionalDependencyError(
            "netCDFBackend.save_data", requirement="xarray"
        ) from exc

    # After `reset_index`, the index levels are ordinary columns
    # and the new default index gives each timeseries a unique ID
    flattened = data.reset_index()
    ts_coords = {self.timeseries_dim: flattened.index.values}

    # Unnamed columns fall back to "time" as the name of the time dimension
    columns_name = data.columns.name
    time_dim = "time" if columns_name is None else str(columns_name)
    time_coords = {time_dim: data.columns.values}

    metadata_xr = metadata_df_to_xr(
        flattened[data.index.names],
        timeseries_id_coord=xr.Coordinates(ts_coords),
        timeseries_dim=self.timeseries_dim,
    )
    values_xr = xr.DataArray(
        data,
        dims=[self.timeseries_dim, time_dim],
        coords=xr.Coordinates(ts_coords | time_coords),
    )

    # The values are stored under the name "values" next to the metadata
    values_ds = values_xr.to_dataset(name="values")
    xr.merge([metadata_xr, values_ds]).to_netcdf(data_file)

netCDFIndexBackend #

netCDF index backend

Methods:

Name Description
load_file_map

Load the database's file map

load_index

Load the index

save_file_map

Save the file map to disk

save_index

Save the index to disk

Attributes:

Name Type Description
ext str

Extension to use with files saved by this backend.

preserves_index Literal[True]

Whether this backend preserves the pd.MultiIndex upon (de-)serialisation

timeseries_dim str

Name of the timeseries dimension in serialised output

Source code in src/pandas_openscm/db/netcdf.py
@define
class netCDFIndexBackend:
    """
    Index backend which (de-)serialises the index and file map as netCDF
    """

    ext: str = ".nc"
    """
    Extension to use with files saved by this backend.
    """

    timeseries_dim: str = "ts_id"
    """
    Name of the timeseries dimension in serialised output
    """

    @property
    def preserves_index(self) -> Literal[True]:
        """
        Whether the `pd.MultiIndex` survives a save/load round trip

        Always `True` for this backend.
        """
        return True

    @staticmethod
    def load_file_map(file_map_file: Path) -> pd.DataFrame:
        """
        Read the database's file map from disk

        Parameters
        ----------
        file_map_file
            Path of the netCDF file which holds the file map

        Returns
        -------
        :
            File map held in the file

        Raises
        ------
        MissingOptionalDependencyError
            `xarray` cannot be imported

        TypeError
            The loaded dataset converts to a [pd.Series][pandas.Series]
            rather than a [pd.DataFrame][pandas.DataFrame]
        """
        try:
            import xarray as xr  # noqa: PLC0415
        except ImportError as exc:
            raise MissingOptionalDependencyError(
                "netCDFBackend.load_file_map", requirement="xarray"
            ) from exc

        dataset = xr.load_dataset(file_map_file)
        file_map = dataset.to_pandas()
        if isinstance(file_map, pd.Series):  # pragma: no cover
            raise TypeError(file_map)

        return file_map

    @staticmethod
    def load_index(index_file: Path) -> pd.DataFrame:
        """
        Read the index from disk

        Parameters
        ----------
        index_file
            Path of the netCDF file which holds the index

        Returns
        -------
        :
            Index held in the file,
            with every column except `file_id` used as an index level

        Raises
        ------
        MissingOptionalDependencyError
            `xarray` cannot be imported
        """
        try:
            import xarray as xr  # noqa: PLC0415
        except ImportError as exc:
            raise MissingOptionalDependencyError(
                "netCDFBackend.load_index", requirement="xarray"
            ) from exc

        dataset = xr.load_dataset(index_file)
        flat = metadata_xr_to_df(dataset)

        # Everything apart from "file_id" becomes an index level.
        # NOTE(review): `Index.difference` sorts its result by default,
        # so the levels come back in sorted name order
        # - confirm callers do not rely on the original level order.
        level_names = flat.columns.difference(["file_id"]).to_list()

        return flat.set_index(level_names)

    @staticmethod
    def save_file_map(
        file_map: pd.Series[Path],  # type: ignore # pandas confused about what it supports
        file_map_file: Path,
    ) -> None:
        """
        Write the file map to disk

        Parameters
        ----------
        file_map
            File map to write

        file_map_file
            Path of the file to write

        Raises
        ------
        MissingOptionalDependencyError
            `xarray` cannot be imported
        """
        try:
            import xarray as xr  # noqa: PLC0415
        except ImportError as exc:
            raise MissingOptionalDependencyError(
                "netCDFBackend.save_file_map", requirement="xarray"
            ) from exc

        # The paths are written out as plain strings
        file_map_as_str = file_map.astype(str)
        xr.DataArray.from_series(file_map_as_str).to_netcdf(file_map_file)

    def save_index(
        self,
        index: pd.DataFrame,
        index_file: Path,
    ) -> None:
        """
        Write the index to disk

        Parameters
        ----------
        index
            Index to write

        index_file
            Path of the file to write
        """
        # The index has to be reset before it can be serialised to disk
        flat_index = index.reset_index()

        # The dimension gets its own name because the timeseries IDs in the index
        # won't necessarily line up with those in the data file(s).
        # Users never see the two side-by-side so this should not matter,
        # but we keep them apart just in case.
        index_dim = f"{self.timeseries_dim}_index"

        metadata_df_to_xr(flat_index, timeseries_dim=index_dim).to_netcdf(index_file)

ext class-attribute instance-attribute #

ext: str = '.nc'

Extension to use with files saved by this backend.

preserves_index property #

preserves_index: Literal[True]

Whether this backend preserves the pd.MultiIndex upon (de-)serialisation

timeseries_dim class-attribute instance-attribute #

timeseries_dim: str = 'ts_id'

Name of the timeseries dimension in serialised output

load_file_map staticmethod #

load_file_map(file_map_file: Path) -> DataFrame

Load the database's file map

Parameters:

Name Type Description Default
file_map_file Path

File from which to load the file map

required

Returns:

Type Description
DataFrame

Loaded file map

Source code in src/pandas_openscm/db/netcdf.py
@staticmethod
def load_file_map(file_map_file: Path) -> pd.DataFrame:
    """
    Read the database's file map from disk

    Parameters
    ----------
    file_map_file
        Path of the netCDF file which holds the file map

    Returns
    -------
    :
        File map held in the file

    Raises
    ------
    MissingOptionalDependencyError
        `xarray` cannot be imported

    TypeError
        The loaded dataset converts to a [pd.Series][pandas.Series]
        rather than a [pd.DataFrame][pandas.DataFrame]
    """
    try:
        import xarray as xr  # noqa: PLC0415
    except ImportError as exc:
        raise MissingOptionalDependencyError(
            "netCDFBackend.load_file_map", requirement="xarray"
        ) from exc

    dataset = xr.load_dataset(file_map_file)
    file_map = dataset.to_pandas()
    if isinstance(file_map, pd.Series):  # pragma: no cover
        raise TypeError(file_map)

    return file_map

load_index staticmethod #

load_index(index_file: Path) -> DataFrame

Load the index

Parameters:

Name Type Description Default
index_file Path

File from which to load the index

required

Returns:

Type Description
DataFrame

Loaded index

Source code in src/pandas_openscm/db/netcdf.py
@staticmethod
def load_index(index_file: Path) -> pd.DataFrame:
    """
    Read the index from disk

    Parameters
    ----------
    index_file
        Path of the netCDF file which holds the index

    Returns
    -------
    :
        Index held in the file,
        with every column except `file_id` used as an index level

    Raises
    ------
    MissingOptionalDependencyError
        `xarray` cannot be imported
    """
    try:
        import xarray as xr  # noqa: PLC0415
    except ImportError as exc:
        raise MissingOptionalDependencyError(
            "netCDFBackend.load_index", requirement="xarray"
        ) from exc

    dataset = xr.load_dataset(index_file)
    flat = metadata_xr_to_df(dataset)

    # Everything apart from "file_id" becomes an index level.
    # NOTE(review): `Index.difference` sorts its result by default,
    # so the levels come back in sorted name order
    # - confirm callers do not rely on the original level order.
    level_names = flat.columns.difference(["file_id"]).to_list()

    return flat.set_index(level_names)

save_file_map staticmethod #

save_file_map(
    file_map: Series[Path], file_map_file: Path
) -> None

Save the file map to disk

Parameters:

Name Type Description Default
file_map Series[Path]

File map to save

required
file_map_file Path

File in which to save the file map

required
Source code in src/pandas_openscm/db/netcdf.py
@staticmethod
def save_file_map(
    file_map: pd.Series[Path],  # type: ignore # pandas confused about what it supports
    file_map_file: Path,
) -> None:
    """
    Write the file map to disk

    Parameters
    ----------
    file_map
        File map to write

    file_map_file
        Path of the file to write

    Raises
    ------
    MissingOptionalDependencyError
        `xarray` cannot be imported
    """
    try:
        import xarray as xr  # noqa: PLC0415
    except ImportError as exc:
        raise MissingOptionalDependencyError(
            "netCDFBackend.save_file_map", requirement="xarray"
        ) from exc

    # The paths are written out as plain strings
    file_map_as_str = file_map.astype(str)
    xr.DataArray.from_series(file_map_as_str).to_netcdf(file_map_file)

save_index #

save_index(index: DataFrame, index_file: Path) -> None

Save the index to disk

Parameters:

Name Type Description Default
index DataFrame

Index to save

required
index_file Path

File in which to save the index

required
Source code in src/pandas_openscm/db/netcdf.py
def save_index(
    self,
    index: pd.DataFrame,
    index_file: Path,
) -> None:
    """
    Write the index to disk

    Parameters
    ----------
    index
        Index to write

    index_file
        Path of the file to write
    """
    # The index has to be reset before it can be serialised to disk
    flat_index = index.reset_index()

    # The dimension gets its own name because the timeseries IDs in the index
    # won't necessarily line up with those in the data file(s).
    # Users never see the two side-by-side so this should not matter,
    # but we keep them apart just in case.
    index_dim = f"{self.timeseries_dim}_index"

    metadata_df_to_xr(flat_index, timeseries_dim=index_dim).to_netcdf(index_file)