pandas_openscm.db#

Database

Modules:

Name	Description
`backends`	Available back-ends
`csv`	CSV backend
`deleting`	Functionality for deleting data
`feather`	Feather backend
`in_memory`	In-memory backend
`interfaces`	Interfaces used throughout the db (database) module
`loading`	Loading of data from disk
`netcdf`	netCDF backend
`openscm_db`	Definition of our key OpenSCMDB class
`path_handling`	Functionality for handling paths
`reader`	Database reader
`rewriting`	Functionality for re-writing a database
`saving`	Functionality for saving data

Classes:

Name	Description
`AlreadyInDBError`	Raised when saving data would overwrite data which is already in the database
`CSVDataBackend`	CSV data backend
`CSVIndexBackend`	CSV index backend
`EmptyDBError`	Raised when trying to access data from a database that is empty
`FeatherDataBackend`	Feather data backend
`FeatherIndexBackend`	Feather index backend
`InMemoryDataBackend`	In-memory data backend
`InMemoryIndexBackend`	In-memory index backend
`OpenSCMDB`	Database for storing OpenSCM-style data
`OpenSCMDBDataBackend`	Backend for (de-)serialising data
`OpenSCMDBIndexBackend`	Backend for (de-)serialising the index (and file map)
`netCDFDataBackend`	netCDF data backend
`netCDFIndexBackend`	netCDF index backend

Attributes:

Name	Type	Description
`DATA_BACKENDS`	`DataBackendOptions`	Inbuilt data back-ends
`INDEX_BACKENDS`	`IndexBackendOptions`	Inbuilt index back-ends

DATA_BACKENDS `module-attribute` #

# Registry of the inbuilt data back-ends,
# given as (short name, backend class) pairs.
# NOTE(review): `cast` is presumably `typing.cast`, i.e. it only tells the
# type checker the intended type of the literal tuple — confirm.
DATA_BACKENDS = DataBackendOptions(
    cast(
        tuple[tuple[str, type[OpenSCMDBDataBackend]], ...],
        (
            ("csv", CSVDataBackend),
            ("feather", FeatherDataBackend),
            ("in_memory", InMemoryDataBackend),
            ("netCDF", netCDFDataBackend),
        ),
    )
)

Inbuilt data back-ends

INDEX_BACKENDS `module-attribute` #

# Registry of the inbuilt index back-ends,
# given as (short name, backend class) pairs.
# NOTE(review): `cast` is presumably `typing.cast`, i.e. it only tells the
# type checker the intended type of the literal tuple — confirm.
INDEX_BACKENDS = IndexBackendOptions(
    cast(
        tuple[tuple[str, type[OpenSCMDBIndexBackend]], ...],
        (
            ("csv", CSVIndexBackend),
            ("feather", FeatherIndexBackend),
            ("in_memory", InMemoryIndexBackend),
            ("netCDF", netCDFIndexBackend),
        ),
    )
)

Inbuilt index back-ends

AlreadyInDBError #

Bases: ValueError

Raised when saving data would overwrite data which is already in the database

Methods:

Name	Description
`__init__`	Initialise the error

Source code in src/pandas_openscm/db/openscm_db.py

class AlreadyInDBError(ValueError):
    """
    Error for saving data that would overwrite data already in the database
    """

    def __init__(self, already_in_db: pd.DataFrame) -> None:
        """
        Build the error message from the clashing rows

        Parameters
        ----------
        already_in_db
            Data that is already in the database.
            Only its index appears in the error message.
        """
        # Show the index levels as plain columns so the clashing rows
        # are easy to read in the message.
        clashing_rows = already_in_db.index.to_frame(index=False)
        header = "The following rows are already in the database:\n"
        super().__init__(f"{header}{clashing_rows}")

__init__ #

__init__(already_in_db: DataFrame) -> None

Initialise the error

Parameters:

Name	Type	Description	Default
`already_in_db`	`DataFrame`	data that is already in the database	required

Source code in src/pandas_openscm/db/openscm_db.py

def __init__(self, already_in_db: pd.DataFrame) -> None:
    """
    Build the error message from the clashing rows

    Parameters
    ----------
    already_in_db
        Data that is already in the database.
        Only its index appears in the error message.
    """
    # Show the index levels as plain columns so the clashing rows
    # are easy to read in the message.
    clashing_rows = already_in_db.index.to_frame(index=False)
    header = "The following rows are already in the database:\n"
    super().__init__(f"{header}{clashing_rows}")

CSVDataBackend #

CSV data backend

Methods:

Name	Description
`load_data`	Load a data file
`save_data`	Save data to disk

Attributes:

Name	Type	Description
`ext`	`str`	Extension to use with files saved by this backend.
`preserves_index`	`Literal[False]`	Whether this backend preserves the index of data upon (de-)serialisation

Source code in src/pandas_openscm/db/csv.py

@define
class CSVDataBackend:
    """
    Data backend that stores data as CSV files
    """

    ext: str = ".csv"
    """
    Extension to use with files saved by this backend.
    """

    @property
    def preserves_index(self) -> Literal[False]:
        """
        Whether the index of data survives (de-)serialisation

        Always `False` for this backend.
        """
        return False

    @staticmethod
    def load_data(data_file: Path) -> pd.DataFrame:
        """
        Read data from a CSV file

        Parameters
        ----------
        data_file
            CSV file that holds the data

        Returns
        -------
        :
            Data, as read by `pd.read_csv` with its default settings
        """
        loaded = pd.read_csv(data_file)
        return loaded

    @staticmethod
    def save_data(data: pd.DataFrame, data_file: Path) -> None:
        """
        Write data to a CSV file

        Parameters
        ----------
        data
            Data to write

        data_file
            CSV file to write the data to
        """
        data.to_csv(path_or_buf=data_file)

ext `class-attribute` `instance-attribute` #

ext: str = '.csv'

Extension to use with files saved by this backend.

preserves_index `property` #

preserves_index: Literal[False]

Whether this backend preserves the index of data upon (de-)serialisation

load_data `staticmethod` #

load_data(data_file: Path) -> DataFrame

Load a data file

Parameters:

Name	Type	Description	Default
`data_file`	`Path`	File from which to load the data	required

Returns:

Type	Description
`DataFrame`	Loaded data

Source code in src/pandas_openscm/db/csv.py

@staticmethod
def load_data(data_file: Path) -> pd.DataFrame:
    """
    Load a data file

    Parameters
    ----------
    data_file
        File from which to load the data

    Returns
    -------
    :
        Loaded data
    """
    return pd.read_csv(data_file)

save_data `staticmethod` #

save_data(data: DataFrame, data_file: Path) -> None

Save data to disk

Parameters:

Name	Type	Description	Default
`data`	`DataFrame`	Data to save	required
`data_file`	`Path`	File in which to save the data	required

Source code in src/pandas_openscm/db/csv.py

@staticmethod
def save_data(data: pd.DataFrame, data_file: Path) -> None:
    """
    Save data to disk

    Parameters
    ----------
    data
        Data to save

    data_file
        File in which to save the data
    """
    data.to_csv(data_file)

CSVIndexBackend #

CSV index backend

Methods:

Name	Description
`load_file_map`	Load the file map
`load_index`	Load the index
`save_file_map`	Save the file map to disk
`save_index`	Save the index to disk

Attributes:

Name	Type	Description
`ext`	`str`	Extension to use with files saved by this backend.
`preserves_index`	`Literal[False]`	Whether this backend preserves the `pd.MultiIndex` upon (de-)serialisation

Source code in src/pandas_openscm/db/csv.py

@define
class CSVIndexBackend:
    """
    Index backend that stores the index and file map as CSV files
    """

    ext: str = ".csv"
    """
    Extension to use with files saved by this backend.
    """

    @property
    def preserves_index(self) -> Literal[False]:
        """
        Whether the `pd.MultiIndex` survives (de-)serialisation

        Always `False` for this backend.
        """
        return False

    @staticmethod
    def load_file_map(file_map_file: Path) -> pd.DataFrame:
        """
        Read the file map from a CSV file

        Parameters
        ----------
        file_map_file
            CSV file that holds the file map

        Returns
        -------
        :
            File map, as read by `pd.read_csv` with its default settings
        """
        file_map_raw = pd.read_csv(file_map_file)
        return file_map_raw

    @staticmethod
    def load_index(index_file: Path) -> pd.DataFrame:
        """
        Read the index from a CSV file

        Parameters
        ----------
        index_file
            CSV file that holds the index

        Returns
        -------
        :
            Index, as read by `pd.read_csv` with its default settings
        """
        index_raw = pd.read_csv(index_file)
        return index_raw

    @staticmethod
    def save_file_map(
        file_map: pd.Series[Path],  # type: ignore # pandas confused about what it supports
        file_map_file: Path,
    ) -> None:
        """
        Write the file map to a CSV file

        Parameters
        ----------
        file_map
            File map to write

        file_map_file
            CSV file to write the file map to
        """
        file_map.to_csv(path_or_buf=file_map_file)

    @staticmethod
    def save_index(
        index: pd.DataFrame,
        index_file: Path,
    ) -> None:
        """
        Write the index to a CSV file

        Parameters
        ----------
        index
            Index to write

        index_file
            CSV file to write the index to
        """
        index.to_csv(path_or_buf=index_file)

ext `class-attribute` `instance-attribute` #

ext: str = '.csv'

Extension to use with files saved by this backend.

preserves_index `property` #

preserves_index: Literal[False]

Whether this backend preserves the pd.MultiIndex upon (de-)serialisation

load_file_map `staticmethod` #

load_file_map(file_map_file: Path) -> DataFrame

Load the file map

Parameters:

Name	Type	Description	Default
`file_map_file`	`Path`	File from which to load the file map	required

Returns:

Type	Description
`DataFrame`	Loaded file map

Source code in src/pandas_openscm/db/csv.py

@staticmethod
def load_file_map(file_map_file: Path) -> pd.DataFrame:
    """
    Load the file map

    Parameters
    ----------
    file_map_file
        File from which to load the file map

    Returns
    -------
    :
        Loaded file map
    """
    return pd.read_csv(file_map_file)

load_index `staticmethod` #

load_index(index_file: Path) -> DataFrame

Load the index

Parameters:

Name	Type	Description	Default
`index_file`	`Path`	File from which to load the index	required

Returns:

Type	Description
`DataFrame`	Loaded index

Source code in src/pandas_openscm/db/csv.py

@staticmethod
def load_index(index_file: Path) -> pd.DataFrame:
    """
    Load the index

    Parameters
    ----------
    index_file
        File from which to load the index

    Returns
    -------
    :
        Loaded index
    """
    return pd.read_csv(index_file)

save_file_map `staticmethod` #

save_file_map(
    file_map: Series[Path], file_map_file: Path
) -> None

Save the file map to disk

Parameters:

Name	Type	Description	Default
`file_map`	`Series[Path]`	File map to save	required
`file_map_file`	`Path`	File in which to save the file map	required

Source code in src/pandas_openscm/db/csv.py

@staticmethod
def save_file_map(
    file_map: pd.Series[Path],  # type: ignore # pandas confused about what it supports
    file_map_file: Path,
) -> None:
    """
    Save the file map to disk

    Parameters
    ----------
    file_map
        File map to save

    file_map_file
        File in which to save the file map
    """
    file_map.to_csv(file_map_file)

save_index `staticmethod` #

save_index(index: DataFrame, index_file: Path) -> None

Save the index to disk

Parameters:

Name	Type	Description	Default
`index`	`DataFrame`	Index to save	required
`index_file`	`Path`	File in which to save the index	required

Source code in src/pandas_openscm/db/csv.py

@staticmethod
def save_index(
    index: pd.DataFrame,
    index_file: Path,
) -> None:
    """
    Save the index to disk

    Parameters
    ----------
    index
        Index to save

    index_file
        File in which to save the index
    """
    index.to_csv(index_file)

EmptyDBError #

Bases: ValueError

Raised when trying to access data from a database that is empty

Methods:

Name	Description
`__init__`	Initialise the error

Source code in src/pandas_openscm/db/openscm_db.py

class EmptyDBError(ValueError):
    """
    Error for trying to access data from a database that is empty
    """

    def __init__(self, db: OpenSCMDB) -> None:
        """
        Build the error message from the empty database

        Parameters
        ----------
        db
            The database; its `repr` is included in the error message
        """
        super().__init__(f"The database is empty: {db=}")

__init__ #

__init__(db: OpenSCMDB) -> None

Initialise the error

Parameters:

Name	Type	Description	Default
`db`	`OpenSCMDB`	The database	required

Source code in src/pandas_openscm/db/openscm_db.py

def __init__(self, db: OpenSCMDB) -> None:
    """
    Build the error message from the empty database

    Parameters
    ----------
    db
        The database; its `repr` is included in the error message
    """
    super().__init__(f"The database is empty: {db=}")

FeatherDataBackend #

Feather data backend

For details on feather, see https://arrow.apache.org/docs/python/feather.html

Methods:

Name	Description
`load_data`	Load a data file
`save_data`	Save data to disk

Attributes:

Name	Type	Description
`ext`	`str`	Extension to use with files saved by this backend.
`preserves_index`	`Literal[True]`	Whether this backend preserves the index of data upon (de-)serialisation

Source code in src/pandas_openscm/db/feather.py

@define
class FeatherDataBackend:
    """
    Data backend that stores data as feather files

    For details on feather, see https://arrow.apache.org/docs/python/feather.html
    """

    ext: str = ".feather"
    """
    Extension to use with files saved by this backend.
    """

    @property
    def preserves_index(self) -> Literal[True]:
        """
        Whether the index of data survives (de-)serialisation

        Always `True` for this backend.
        """
        return True

    @staticmethod
    def load_data(data_file: Path) -> pd.DataFrame:
        """
        Read data from a feather file

        Parameters
        ----------
        data_file
            Feather file that holds the data

        Returns
        -------
        :
            Data, as read by `pd.read_feather`
        """
        loaded = pd.read_feather(data_file)
        return loaded

    @staticmethod
    def save_data(data: pd.DataFrame, data_file: Path) -> None:
        """
        Write data to a feather file

        Parameters
        ----------
        data
            Data to write

        data_file
            Feather file to write the data to
        """
        # The pandas docs state that feather cannot write indexes
        # (see https://pandas.pydata.org/docs/user_guide/io.html#feather).
        # In practice, our multi-indexes seem to be written without issue,
        # so the data is written as is.
        data.to_feather(path=data_file)

ext `class-attribute` `instance-attribute` #

ext: str = '.feather'

Extension to use with files saved by this backend.

preserves_index `property` #

preserves_index: Literal[True]

Whether this backend preserves the index of data upon (de-)serialisation

load_data `staticmethod` #

load_data(data_file: Path) -> DataFrame

Load a data file

Parameters:

Name	Type	Description	Default
`data_file`	`Path`	File from which to load the data	required

Returns:

Type	Description
`DataFrame`	Loaded data

Source code in src/pandas_openscm/db/feather.py

@staticmethod
def load_data(data_file: Path) -> pd.DataFrame:
    """
    Read data from a feather file

    Parameters
    ----------
    data_file
        Feather file that holds the data

    Returns
    -------
    :
        Data, as read by `pd.read_feather`
    """
    loaded = pd.read_feather(data_file)
    return loaded

save_data `staticmethod` #

save_data(data: DataFrame, data_file: Path) -> None

Save data to disk

Parameters:

Name	Type	Description	Default
`data`	`DataFrame`	Data to save	required
`data_file`	`Path`	File in which to save the data	required

Source code in src/pandas_openscm/db/feather.py

@staticmethod
def save_data(data: pd.DataFrame, data_file: Path) -> None:
    """
    Write data to a feather file

    Parameters
    ----------
    data
        Data to write

    data_file
        Feather file to write the data to
    """
    # The pandas docs state that feather cannot write indexes
    # (see https://pandas.pydata.org/docs/user_guide/io.html#feather).
    # In practice, our multi-indexes seem to be written without issue,
    # so the data is written as is.
    data.to_feather(path=data_file)

FeatherIndexBackend #

Feather index backend

For details on feather, see https://arrow.apache.org/docs/python/feather.html

Methods:

Name	Description
`load_file_map`	Load the file map
`load_index`	Load the index
`save_file_map`	Save the file map to disk
`save_index`	Save the index to disk

Attributes:

Name	Type	Description
`ext`	`str`	Extension to use with files saved by this backend.
`preserves_index`	`Literal[True]`	Whether this backend preserves the `pd.MultiIndex` upon (de-)serialisation

Source code in src/pandas_openscm/db/feather.py

@define
class FeatherIndexBackend:
    """
    Index backend that stores the index and file map as feather files

    For details on feather, see https://arrow.apache.org/docs/python/feather.html
    """

    ext: str = ".feather"
    """
    Extension to use with files saved by this backend.
    """

    @property
    def preserves_index(self) -> Literal[True]:
        """
        Whether the `pd.MultiIndex` survives (de-)serialisation

        Always `True` for this backend.
        """
        return True

    @staticmethod
    def load_file_map(file_map_file: Path) -> pd.DataFrame:
        """
        Read the file map from a feather file

        Parameters
        ----------
        file_map_file
            Feather file that holds the file map

        Returns
        -------
        :
            File map, as read by `pd.read_feather`
        """
        file_map_raw = pd.read_feather(file_map_file)
        return file_map_raw

    @staticmethod
    def load_index(index_file: Path) -> pd.DataFrame:
        """
        Read the index from a feather file

        Parameters
        ----------
        index_file
            Feather file that holds the index

        Returns
        -------
        :
            Index, as read by `pd.read_feather`
        """
        index_raw = pd.read_feather(index_file)
        return index_raw

    @staticmethod
    def save_file_map(
        file_map: pd.Series[Path],  # type: ignore # pandas confused about what it supports
        file_map_file: Path,
    ) -> None:
        """
        Write the file map to a feather file

        The values are converted to strings before writing.

        Parameters
        ----------
        file_map
            File map to write

        file_map_file
            Feather file to write the file map to
        """
        # Feather cannot write non-native types
        # (see https://pandas.pydata.org/docs/user_guide/io.html#feather),
        # hence the conversion to strings.
        # The same docs state that feather cannot write indexes.
        # In practice, this index seems to be written without issue,
        # so it is left in place.
        file_map.astype(str).to_frame().to_feather(file_map_file)

    @staticmethod
    def save_index(
        index: pd.DataFrame,
        index_file: Path,
    ) -> None:
        """
        Write the index to a feather file

        Parameters
        ----------
        index
            Index to write

        index_file
            Feather file to write the index to
        """
        index.to_feather(path=index_file)

ext `class-attribute` `instance-attribute` #

ext: str = '.feather'

Extension to use with files saved by this backend.

preserves_index `property` #

preserves_index: Literal[True]

Whether this backend preserves the pd.MultiIndex upon (de-)serialisation

load_file_map `staticmethod` #

load_file_map(file_map_file: Path) -> DataFrame

Load the file map

Parameters:

Name	Type	Description	Default
`file_map_file`	`Path`	File from which to load the file map	required

Returns:

Type	Description
`DataFrame`	Loaded file map

Source code in src/pandas_openscm/db/feather.py

@staticmethod
def load_file_map(file_map_file: Path) -> pd.DataFrame:
    """
    Read the file map from a feather file

    Parameters
    ----------
    file_map_file
        Feather file that holds the file map

    Returns
    -------
    :
        File map, as read by `pd.read_feather`
    """
    file_map_raw = pd.read_feather(file_map_file)
    return file_map_raw

load_index `staticmethod` #

load_index(index_file: Path) -> DataFrame

Load the index

Parameters:

Name	Type	Description	Default
`index_file`	`Path`	File from which to load the index	required

Returns:

Type	Description
`DataFrame`	Loaded index

Source code in src/pandas_openscm/db/feather.py

@staticmethod
def load_index(index_file: Path) -> pd.DataFrame:
    """
    Read the index from a feather file

    Parameters
    ----------
    index_file
        Feather file that holds the index

    Returns
    -------
    :
        Index, as read by `pd.read_feather`
    """
    index_raw = pd.read_feather(index_file)
    return index_raw

save_file_map `staticmethod` #

save_file_map(
    file_map: Series[Path], file_map_file: Path
) -> None

Save the file map to disk

Parameters:

Name	Type	Description	Default
`file_map`	`Series[Path]`	File map to save	required
`file_map_file`	`Path`	File in which to save the file map	required

Source code in src/pandas_openscm/db/feather.py

@staticmethod
def save_file_map(
    file_map: pd.Series[Path],  # type: ignore # pandas confused about what it supports
    file_map_file: Path,
) -> None:
    """
    Write the file map to a feather file

    The values are converted to strings before writing.

    Parameters
    ----------
    file_map
        File map to write

    file_map_file
        Feather file to write the file map to
    """
    # Feather cannot write non-native types
    # (see https://pandas.pydata.org/docs/user_guide/io.html#feather),
    # hence the conversion to strings.
    # The same docs state that feather cannot write indexes.
    # In practice, this index seems to be written without issue,
    # so it is left in place.
    file_map.astype(str).to_frame().to_feather(file_map_file)

save_index `staticmethod` #

save_index(index: DataFrame, index_file: Path) -> None

Save the index to disk

Parameters:

Name	Type	Description	Default
`index`	`DataFrame`	Index to save	required
`index_file`	`Path`	File in which to save the index	required

Source code in src/pandas_openscm/db/feather.py

@staticmethod
def save_index(
    index: pd.DataFrame,
    index_file: Path,
) -> None:
    """
    Write the index to a feather file

    Parameters
    ----------
    index
        Index to write

    index_file
        Feather file to write the index to
    """
    index.to_feather(path=index_file)

InMemoryDataBackend #

In-memory data backend

Methods:

Name	Description
`load_data`	Load a data file
`save_data`	Save data to disk

Attributes:

Name	Type	Description
`data`	`dict[str, DataFrame] \| None`	Data store
`ext`	`str`	Extension to use with files saved by this backend.
`preserves_index`	`Literal[True]`	Whether this backend preserves the index of data upon (de-)serialisation

Source code in src/pandas_openscm/db/in_memory.py

@define
class InMemoryDataBackend:
    """
    Data backend that keeps data in memory rather than serialising it

    Data is held in a dictionary keyed by the string form of the file path.
    """

    ext: str = ".in-mem"
    """
    Extension to use with files saved by this backend.
    """

    data: dict[str, pd.DataFrame] | None = None
    """
    Data store
    """

    @property
    def preserves_index(self) -> Literal[True]:
        """
        Whether the index of data survives (de-)serialisation

        Always `True` for this backend.
        """
        return True

    def load_data(self, data_file: Path) -> pd.DataFrame:
        """
        Fetch the data stored under a file path

        Parameters
        ----------
        data_file
            File path under which the data was saved

        Returns
        -------
        :
            The stored data (the same object that was saved, not a copy)

        Raises
        ------
        TypeError
            Nothing has been saved to this backend yet

        KeyError
            Data has been saved, but not under `data_file`
        """
        if self.data is None:
            msg = (
                "No data has been saved to this in-memory backend yet, "
                f"so {data_file} cannot be loaded"
            )
            raise TypeError(msg)

        return self.data[str(data_file)]

    def save_data(self, data: pd.DataFrame, data_file: Path) -> None:
        """
        Store data in memory under a file path

        An empty file is also created at `data_file`.

        Parameters
        ----------
        data
            Data to store

        data_file
            File path under which to store the data
        """
        if self.data is None:
            self.data = {}

        store_key = str(data_file)
        self.data[store_key] = data
        # Have to do this as, even though it's in-memory,
        # the layer above expects to have files to check, remove etc.
        data_file.touch()

data `class-attribute` `instance-attribute` #

data: dict[str, DataFrame] | None = None

Data store

ext `class-attribute` `instance-attribute` #

ext: str = '.in-mem'

Extension to use with files saved by this backend.

preserves_index `property` #

preserves_index: Literal[True]

Whether this backend preserves the index of data upon (de-)serialisation

load_data #

load_data(data_file: Path) -> DataFrame

Load a data file

Parameters:

Name	Type	Description	Default
`data_file`	`Path`	File from which to load the data	required

Returns:

Type	Description
`DataFrame`	Loaded data

Source code in src/pandas_openscm/db/in_memory.py

def load_data(self, data_file: Path) -> pd.DataFrame:
    """
    Fetch the data stored under a file path

    Parameters
    ----------
    data_file
        File path under which the data was saved

    Returns
    -------
    :
        The stored data (the same object that was saved, not a copy)

    Raises
    ------
    TypeError
        Nothing has been saved to this backend yet

    KeyError
        Data has been saved, but not under `data_file`
    """
    if self.data is None:
        msg = (
            "No data has been saved to this in-memory backend yet, "
            f"so {data_file} cannot be loaded"
        )
        raise TypeError(msg)

    return self.data[str(data_file)]

save_data #

save_data(data: DataFrame, data_file: Path) -> None

Save data to disk

Parameters:

Name	Type	Description	Default
`data`	`DataFrame`	Data to save	required
`data_file`	`Path`	File in which to save the data	required

Source code in src/pandas_openscm/db/in_memory.py

def save_data(self, data: pd.DataFrame, data_file: Path) -> None:
    """
    Store data in memory under a file path

    An empty file is also created at `data_file`.

    Parameters
    ----------
    data
        Data to store

    data_file
        File path under which to store the data
    """
    if self.data is None:
        self.data = {}

    store_key = str(data_file)
    self.data[store_key] = data
    # Have to do this as, even though it's in-memory,
    # the layer above expects to have files to check, remove etc.
    data_file.touch()

InMemoryIndexBackend #

In-memory index backend

Methods:

Name	Description
`load_file_map`	Load the file map
`load_index`	Load the index
`save_file_map`	Save the file map to disk
`save_index`	Save the index to disk

Attributes:

Name	Type	Description
`ext`	`str`	Extension to use with files saved by this backend.
`file_map`	`Series[Path] \| None`	File map store
`index`	`DataFrame \| None`	Index store
`preserves_index`	`Literal[True]`	Whether this backend preserves the `pd.MultiIndex` upon (de-)serialisation

Source code in src/pandas_openscm/db/in_memory.py

@define
class InMemoryIndexBackend:
    """
    Index backend that keeps the index and file map in memory

    Only one index and one file map are held at a time.
    """

    ext: str = ".in-mem"
    """
    Extension to use with files saved by this backend.
    """

    index: pd.DataFrame | None = None
    """Index store"""

    file_map: pd.Series[Path] | None = None  # type: ignore # pandas confused about what it supports
    """File map store"""

    @property
    def preserves_index(self) -> Literal[True]:
        """
        Whether the `pd.MultiIndex` survives (de-)serialisation

        Always `True` for this backend.
        """
        return True

    def load_file_map(self, file_map_file: Path) -> pd.DataFrame:
        """
        Fetch the stored file map

        Parameters
        ----------
        file_map_file
            File associated with the file map.
            Only used in the error message;
            the stored file map is returned whatever its value.

        Returns
        -------
        :
            The stored file map, converted to a `pd.DataFrame`

        Raises
        ------
        TypeError
            No file map has been saved to this backend yet
        """
        if self.file_map is None:
            msg = (
                "No file map has been saved to this in-memory backend yet, "
                f"so {file_map_file} cannot be loaded"
            )
            raise TypeError(msg)

        return self.file_map.to_frame()

    def load_index(self, index_file: Path) -> pd.DataFrame:
        """
        Fetch the stored index

        Parameters
        ----------
        index_file
            File associated with the index.
            Only used in the error message;
            the stored index is returned whatever its value.

        Returns
        -------
        :
            The stored index (the same object that was saved, not a copy)

        Raises
        ------
        TypeError
            No index has been saved to this backend yet
        """
        if self.index is None:
            msg = (
                "No index has been saved to this in-memory backend yet, "
                f"so {index_file} cannot be loaded"
            )
            raise TypeError(msg)

        return self.index

    def save_file_map(
        self,
        file_map: pd.Series[Path],  # type: ignore # pandas confused about what it supports
        file_map_file: Path,
    ) -> None:
        """
        Store the file map in memory

        Any previously stored file map is replaced.
        An empty file is also created at `file_map_file`.

        Parameters
        ----------
        file_map
            File map to store

        file_map_file
            File to create as a marker for the file map
        """
        self.file_map = file_map
        # Have to do this as, even though it's in-memory,
        # the layer above expects to have files to check
        file_map_file.touch()

    def save_index(
        self,
        index: pd.DataFrame,
        index_file: Path,
    ) -> None:
        """
        Store the index in memory

        Any previously stored index is replaced.
        An empty file is also created at `index_file`.

        Parameters
        ----------
        index
            Index to store

        index_file
            File to create as a marker for the index
        """
        self.index = index
        # Have to do this as, even though it's in-memory,
        # the layer above expects to have files to check
        index_file.touch()

ext `class-attribute` `instance-attribute` #

ext: str = '.in-mem'

Extension to use with files saved by this backend.

file_map `class-attribute` `instance-attribute` #

file_map: Series[Path] | None = None

File map store

index `class-attribute` `instance-attribute` #

index: DataFrame | None = None

Index store

preserves_index `property` #

preserves_index: Literal[True]

Whether this backend preserves the pd.MultiIndex upon (de-)serialisation

load_file_map #

load_file_map(file_map_file: Path) -> DataFrame

Load the file map

Parameters:

Name	Type	Description	Default
`file_map_file`	`Path`	File from which to load the file map	required

Returns:

Type	Description
`DataFrame`	Loaded file map

Source code in src/pandas_openscm/db/in_memory.py

def load_file_map(self, file_map_file: Path) -> pd.DataFrame:
    """
    Fetch the stored file map

    Parameters
    ----------
    file_map_file
        File associated with the file map.
        Only used in the error message;
        the stored file map is returned whatever its value.

    Returns
    -------
    :
        The stored file map, converted to a `pd.DataFrame`

    Raises
    ------
    TypeError
        No file map has been saved to this backend yet
    """
    if self.file_map is None:
        msg = (
            "No file map has been saved to this in-memory backend yet, "
            f"so {file_map_file} cannot be loaded"
        )
        raise TypeError(msg)

    return self.file_map.to_frame()

load_index #

load_index(index_file: Path) -> DataFrame

Load the index

Parameters:

Name	Type	Description	Default
`index_file`	`Path`	File from which to load the index	required

Returns:

Type	Description
`DataFrame`	Loaded index

Source code in src/pandas_openscm/db/in_memory.py

def load_index(self, index_file: Path) -> pd.DataFrame:
    """
    Fetch the stored index

    Parameters
    ----------
    index_file
        File associated with the index.
        Only used in the error message;
        the stored index is returned whatever its value.

    Returns
    -------
    :
        The stored index (the same object that was saved, not a copy)

    Raises
    ------
    TypeError
        No index has been saved to this backend yet
    """
    if self.index is None:
        msg = (
            "No index has been saved to this in-memory backend yet, "
            f"so {index_file} cannot be loaded"
        )
        raise TypeError(msg)

    return self.index

save_file_map #

save_file_map(
    file_map: Series[Path], file_map_file: Path
) -> None

Save the file map to disk

Parameters:

Name	Type	Description	Default
`file_map`	`Series[Path]`	File map to save	required
`file_map_file`	`Path`	File in which to save the file map	required

Source code in src/pandas_openscm/db/in_memory.py

def save_file_map(
    self,
    file_map: pd.Series[Path],  # type: ignore # pandas confused about what it supports
    file_map_file: Path,
) -> None:
    """
    Store the file map in memory

    Any previously stored file map is replaced.
    An empty file is also created at `file_map_file`.

    Parameters
    ----------
    file_map
        File map to store

    file_map_file
        File to create as a marker for the file map
    """
    self.file_map = file_map
    # The layer above checks for files even though nothing is written,
    # so make sure the file exists.
    file_map_file.touch()

save_index #

save_index(index: DataFrame, index_file: Path) -> None

Save the index to disk

Parameters:

Name	Type	Description	Default
`index`	`DataFrame`	Index to save	required
`index_file`	`Path`	File in which to save the index	required

Source code in src/pandas_openscm/db/in_memory.py

def save_index(
    self,
    index: pd.DataFrame,
    index_file: Path,
) -> None:
    """
    Store the index in memory

    Any previously stored index is replaced.
    An empty file is also created at `index_file`.

    Parameters
    ----------
    index
        Index to store

    index_file
        File to create as a marker for the index
    """
    self.index = index
    # The layer above checks for files even though nothing is written,
    # so make sure the file exists.
    index_file.touch()

OpenSCMDB #

Database for storing OpenSCM-style data

This class is focussed on backends that use files as their storage. If you had a different database backend, you might make different choices. We haven't thought through those use cases hence aren't sure how much effort would be required to make something truly backend agnostic.

Methods:

Name	Description
`create_reader`	Create a database reader
`default_index_file_lock`	Get default lock for the back-end's index file
`delete`	Delete all data in the database
`from_gzipped_tar_archive`	Initialise from a gzipped tar archive
`get_new_data_file_path`	Get the path in which to write a new data file
`load`	Load data
`load_file_map`	Load the file map
`load_index`	Load the index
`load_metadata`	Load the database's metadata
`save`	Save data into the database
`to_gzipped_tar_archive`	Convert to a gzipped tar archive

Attributes:

Name	Type	Description
`backend_data`	`OpenSCMDBDataBackend`	The backend for (de-)serialising data (from) to disk
`backend_index`	`OpenSCMDBIndexBackend`	The backend for (de-)serialising the database index (from) to disk
`db_dir`	`Path`	Path in which the database is stored
`file_map_file`	`Path`	The file in which the file map is stored
`index_file`	`Path`	The file in which the database's index is stored
`index_file_lock`	`BaseFileLock`	Lock for the index file
`index_file_lock_path`	`Path`	Path to the lock file for the back-end's index file
`is_empty`	`bool`	Whether the database is empty or not

Source code in src/pandas_openscm/db/openscm_db.py

@define
class OpenSCMDB:
    """
    Database for storing OpenSCM-style data

    This class is focussed on backends that use files as their storage.
    If you had a different database backend,
    you might make different choices.
    We haven't thought through those use cases
    hence aren't sure how much effort
    would be required to make something truly backend agnostic.
    """

    backend_data: OpenSCMDBDataBackend = field(kw_only=True)
    """
    The backend for (de-)serialising data (from) to disk
    """

    backend_index: OpenSCMDBIndexBackend = field(kw_only=True)
    """
    The backend for (de-)serialising the database index (from) to disk
    """

    db_dir: Path = field(kw_only=True)
    """
    Path in which the database is stored

    Both the index and the data files will be written in this directory.
    """

    index_file_lock: filelock.BaseFileLock = field(kw_only=True)
    """
    Lock for the index file
    """
    # Note to devs: filelock releases the lock when __del__ is called
    # (i.e. when the lock instance is garbage collected).
    # Hence, you have to keep a reference to this around
    # if you want it to do anything.
    # For a while, we made this a property that created the lock when requested.
    # That was super confusing as, if the reference to the created lock wasn't kept,
    # the lock would immediately be released.

    @index_file_lock.default  # ty: ignore[call-non-callable]
    def default_index_file_lock(self) -> filelock.BaseFileLock:
        """
        Provide the default value for the `index_file_lock` field

        Returns
        -------
        :
            File lock created at `self.index_file_lock_path`

        Raises
        ------
        MissingOptionalDependencyError
            `filelock` could not be imported
        """
        try:
            import filelock  # noqa: PLC0415
        except ImportError as exc:
            raise MissingOptionalDependencyError(
                "default_index_file_lock", requirement="filelock"
            ) from exc
        else:
            lock_path = self.index_file_lock_path
            return filelock.FileLock(lock_path)

    @property
    def file_map_file(self) -> Path:
        """
        Location of the file map

        The file map records which file path
        belongs to each file_id.

        Returns
        -------
        :
            `filemap` plus the index backend's extension, inside `self.db_dir`
        """
        return self.db_dir.joinpath(f"filemap{self.backend_index.ext}")

    @property
    def index_file(self) -> Path:
        """
        Location of the database's index

        Returns
        -------
        :
            `index` plus the index backend's extension, inside `self.db_dir`
        """
        return self.db_dir.joinpath(f"index{self.backend_index.ext}")

    @property
    def index_file_lock_path(self) -> Path:
        """
        Path to the lock file for the back-end's index file

        Returns
        -------
        :
            Sibling of the index file, named like it with `.lock` appended
        """
        index_path = self.index_file
        return index_path.parent / f"{index_path.name}.lock"

    @property
    def is_empty(self) -> bool:
        """
        Whether the database holds no data yet

        Emptiness is decided solely by whether the index file exists.

        Returns
        -------
        :
            `True` if the database is empty, `False` otherwise
        """
        index_exists = self.index_file.exists()
        return not index_exists

    def create_reader(
        self,
        *,
        lock: bool | filelock.BaseFileLock | None = True,
        index_file_lock: filelock.BaseFileLock | None = None,
    ) -> OpenSCMDBReader:
        """
        Create a database reader

        The index and file map are loaded once, up front,
        and handed to the reader.

        Parameters
        ----------
        lock
            Lock to give to the reader.

            If `True`, we create a new lock for the database, such that,
            if the reader is holding the lock,
            no operations can be performed on the database.

            If `False`, the reader is not given any lock.

            Any other value (a lock instance or `None`)
            is passed to the reader unchanged.

        index_file_lock
            Lock for the database's index file

            Used while loading the index and the file map from disk.

            If not supplied, we use [self.index_file_lock][(c)].

        Returns
        -------
        :
            Database reader

        Raises
        ------
        MissingOptionalDependencyError
            `lock` is `True` and `filelock` could not be imported
        """
        if lock is True:
            try:
                import filelock  # noqa: PLC0415
            except ImportError as exc:
                raise MissingOptionalDependencyError(  # noqa: TRY003
                    "create_reader(..., lock=True, ...)", requirement="filelock"
                ) from exc

            # The reader gets its own lock object,
            # pointing at the same lock file as the database's index lock.
            reader_lock = filelock.FileLock(self.index_file_lock_path)
        elif lock is False:
            # `False` means "no lock", which the reader expresses as `None`
            reader_lock = None
        else:
            reader_lock = lock

        return OpenSCMDBReader(
            backend_data=self.backend_data,
            db_dir=self.db_dir,
            db_index=self.load_index(index_file_lock=index_file_lock),
            db_file_map=self.load_file_map(index_file_lock=index_file_lock),
            lock=reader_lock,
        )

    def delete(
        self,
        *,
        index_file_lock: filelock.BaseFileLock | None = None,
        parallel_op_config: ParallelOpConfig | None = None,
        progress: bool = False,
        max_workers: int | None = None,
    ) -> None:
        """
        Delete all data in the database

        Every file in `self.db_dir` whose name ends with
        the data backend's or the index backend's extension is removed.
        The index file lock is held while this happens.

        Parameters
        ----------
        index_file_lock
            Lock for the database's index file

            If not supplied, we use [self.index_file_lock][(c)].

        parallel_op_config
            Configuration for executing the operation in parallel with progress bars

            If not supplied, we use the values of `progress` and `max_workers`.

        progress
            Should progress bar(s) be used to display the progress of the deletion?

            Only used if `parallel_op_config` is `None`.

        max_workers
            Maximum number of workers to use for parallel processing.

            If supplied, we create an instance of
            [concurrent.futures.ThreadPoolExecutor](https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ThreadPoolExecutor)
            with the provided number of workers
            (a thread pool makes sense as deletion is I/O-bound).

            If not supplied, the deletions are executed serially.

            Only used if `parallel_op_config` is `None`.
        """
        lock = self.index_file_lock if index_file_lock is None else index_file_lock

        with lock:
            # A set, so a file matching both patterns
            # (e.g. when both backends share an extension) is only listed once.
            files_to_delete = set(self.db_dir.glob(f"*{self.backend_data.ext}"))
            files_to_delete.update(self.db_dir.glob(f"*{self.backend_index.ext}"))

            delete_files(
                files_to_delete=files_to_delete,
                parallel_op_config=parallel_op_config,
                progress=progress,
                max_workers=max_workers,
            )

    @classmethod
    def from_gzipped_tar_archive(
        cls,
        tar_archive: Path,
        db_dir: Path,
        backend_data: OpenSCMDBDataBackend | None = None,
        backend_index: OpenSCMDBIndexBackend | None = None,
    ) -> OpenSCMDB:
        """
        Initialise from a gzipped tar archive

        This also unpacks the files to disk.
        Only regular files are unpacked and any directory structure
        in the archive is dropped, i.e. all files end up directly in `db_dir`.

        Parameters
        ----------
        tar_archive
            Tar archive from which to initialise

        db_dir
            Directory in which to unpack the database

        backend_data
            Backend to use for handling the data

            If not supplied, it is guessed from the name of the first
            unpacked file that is neither an index nor a file map file.

        backend_index
            Backend to use for handling the index

            If not supplied, it is guessed from the name of the first
            unpacked file whose name starts with `index`.

        Returns
        -------
        :
            Initialised database
        """
        with tarfile.open(tar_archive, "r") as archive:
            regular_members = (m for m in archive.getmembers() if m.isreg())
            for member in regular_members:
                # Drop any directory prefix so the file is written directly into db_dir
                member.name = Path(member.name).name
                archive.extract(member, db_dir)

                file_name = member.name
                if file_name.startswith("index"):
                    if backend_index is None:
                        backend_index = INDEX_BACKENDS.guess_backend(file_name)

                elif backend_data is None and not file_name.startswith("filemap"):
                    backend_data = DATA_BACKENDS.guess_backend(file_name)

        if backend_data is None:  # pragma: no cover
            # Should be impossible to get here
            raise TypeError(backend_data)

        if backend_index is None:  # pragma: no cover
            # Should be impossible to get here
            raise TypeError(backend_index)

        return cls(  # ty: ignore[missing-argument]
            backend_data=backend_data, backend_index=backend_index, db_dir=db_dir
        )

    def get_new_data_file_path(self, file_id: int) -> DBPath:
        """
        Get the path in which to write a new data file

        The file is named after `file_id`, with the data backend's extension,
        and is placed directly in `self.db_dir`.

        Parameters
        ----------
        file_id
            ID to associate with the file

        Returns
        -------
        :
            Information about the path in which to write the new data

        Raises
        ------
        FileExistsError
            A file already exists for the given `file_id`
        """
        candidate = self.db_dir.joinpath(f"{file_id}{self.backend_data.ext}")

        if not candidate.exists():
            return DBPath.from_abs_path_and_db_dir(abs=candidate, db_dir=self.db_dir)

        # Never hand out a path that would clobber an existing file
        raise FileExistsError(candidate)

    def load(  # noqa: PLR0913
        self,
        selector: pd.Index[Any] | pd.MultiIndex | pix.selectors.Selector | None = None,
        *,
        index_file_lock: filelock.BaseFileLock | None = None,
        out_columns_type: type | None = None,
        out_columns_name: str | None = None,
        parallel_op_config: ParallelOpConfig | None = None,
        progress: bool = False,
        max_workers: int | None = None,
    ) -> pd.DataFrame:
        """
        Load data

        The index file lock is held while the file map,
        the index and the data itself are loaded.

        Parameters
        ----------
        selector
            Selector to use to choose the data to load

        index_file_lock
            Lock for the database's index file

            If not supplied, we use [self.index_file_lock][(c)].

        out_columns_type
            Type to set the output columns to.

            If not supplied, we don't set the output columns' type.

        out_columns_name
            The name for the columns in the output.

            If not supplied, we don't set the output columns' name.

            This can also be set with
            [pd.DataFrame.rename_axis][pandas.DataFrame.rename_axis]
            but we provide it here for convenience
            (and in case you couldn't find this trick for ages, like us).

        parallel_op_config
            Configuration for executing the operation in parallel with progress bars

            If not supplied, we use the values of `progress` and `max_workers`.

        progress
            Should progress bar(s) be used to display the progress of the loading?

            Only used if `parallel_op_config` is `None`.

        max_workers
            Maximum number of workers to use for parallel processing.

            If supplied, we create an instance of
            [concurrent.futures.ProcessPoolExecutor](https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ProcessPoolExecutor)
            with the provided number of workers.
            A process pool seems to be the sensible default from our experimentation,
            but it is not a universally better choice.
            If you need something else because of how your database is set up,
            simply pass `parallel_op_config`
            rather than using the shortcut of passing `max_workers`.

            If not supplied, the loading is executed serially.

            Only used if `parallel_op_config` is `None`.

        Returns
        -------
        :
            Loaded data

        Raises
        ------
        EmptyDBError
            The database is empty
        """
        if self.is_empty:
            raise EmptyDBError(self)

        if index_file_lock is None:
            index_file_lock = self.index_file_lock

        # The same lock object is passed on to the nested loaders,
        # which enter it again while we are already holding it here.
        with index_file_lock:
            file_map = self.load_file_map(index_file_lock=index_file_lock)
            index = self.load_index(index_file_lock=index_file_lock)

            res = load_data(
                backend_data=self.backend_data,
                db_index=index,
                db_file_map=file_map,
                db_dir=self.db_dir,
                selector=selector,
                out_columns_type=out_columns_type,
                out_columns_name=out_columns_name,
                parallel_op_config=parallel_op_config,
                progress=progress,
                max_workers=max_workers,
            )

        return res

    def load_file_map(
        self,
        *,
        index_file_lock: filelock.BaseFileLock | None = None,
    ) -> pd.Series[Path]:  # type: ignore # pandas type hints confused about what they support
        """
        Load the file map

        The file map is read from `self.file_map_file`
        while the index file lock is held.

        Parameters
        ----------
        index_file_lock
            Lock for the database's index file

            If not supplied, we use [self.index_file_lock][(c)].

        Returns
        -------
        :
            Map from file ID to file path

        Raises
        ------
        EmptyDBError
            The database is empty
        """
        if self.is_empty:
            raise EmptyDBError(self)

        lock = self.index_file_lock if index_file_lock is None else index_file_lock

        with lock:
            return load_db_file_map(
                backend_index=self.backend_index, file_map_file=self.file_map_file
            )

    def load_index(
        self,
        *,
        index_file_lock: filelock.BaseFileLock | None = None,
    ) -> pd.DataFrame:
        """
        Load the index

        The index is read from `self.index_file`
        while the index file lock is held.

        Parameters
        ----------
        index_file_lock
            Lock for the database's index file

            If not supplied, we use [self.index_file_lock][(c)].

        Returns
        -------
        :
            Database index

        Raises
        ------
        EmptyDBError
            The database is empty
        """
        if self.is_empty:
            raise EmptyDBError(self)

        lock = self.index_file_lock if index_file_lock is None else index_file_lock

        with lock:
            return load_db_index(
                backend_index=self.backend_index,
                index_file=self.index_file,
            )

    def load_metadata(
        self,
        *,
        index_file_lock: filelock.BaseFileLock | None = None,
    ) -> pd.MultiIndex:
        """
        Load the database's metadata

        The metadata is read from `self.index_file`
        while the index file lock is held.

        Parameters
        ----------
        index_file_lock
            Lock for the database's index file

            If not supplied, we use [self.index_file_lock][(c)].

        Returns
        -------
        :
            Loaded metadata

        Raises
        ------
        EmptyDBError
            The database is empty
        """
        # Same emptiness check as the other loading methods
        # (`is_empty` is exactly "the index file does not exist").
        if self.is_empty:
            raise EmptyDBError(self)

        if index_file_lock is None:
            index_file_lock = self.index_file_lock

        with index_file_lock:
            metadata = load_db_metadata(
                backend_index=self.backend_index, index_file=self.index_file
            )

        return metadata

    def save(  # noqa: PLR0913
        self,
        data: pd.DataFrame,
        *,
        index_file_lock: filelock.BaseFileLock | None = None,
        groupby: list[str] | None = None,
        allow_overwrite: bool = False,
        warn_on_partial_overwrite: bool = True,
        progress_grouping: ProgressLike | None = None,
        parallel_op_config_save: ParallelOpConfig | None = None,
        parallel_op_config_delete: ParallelOpConfig | None = None,
        parallel_op_config_rewrite: ParallelOpConfig | None = None,
        progress: bool = False,
        max_workers: int | None = None,
    ) -> None:
        """
        Save data into the database

        The input is validated first, then, while holding the index file lock,
        the steps are executed in this order:
        any needed re-writes of existing files,
        writing of the new data (plus index and file map),
        deletion of files that are no longer needed.

        Parameters
        ----------
        data
            Data to add to the database

            Its index must be a `pd.MultiIndex` without duplicate entries.

        index_file_lock
            Lock for the database's index file

            If not supplied, we use [self.index_file_lock][(c)].

        groupby
            Metadata columns to use to group the data.

            If not supplied, we save all the data in a single file.

        allow_overwrite
            Should overwrites of data that is already in the database be allowed?

            If this is `True`, there is a risk that, if interrupted halfway through,
            you can end up with duplicate data in your database
            or some other odd broken state.

        warn_on_partial_overwrite
            Should a warning be raised if a partial overwrite will occur?

            This is on by default so that users
            are warned about the slow operation of re-writing.

        progress_grouping
            Progress bar to use when grouping the data

            If not supplied, we use the values of `progress` and `max_workers`.

        parallel_op_config_save
            Parallel op configuration for executing save operations

            If not supplied, we use the values of `progress` and `max_workers`.

        parallel_op_config_delete
            Parallel op configuration for executing any needed delete operations

            If not supplied, we use the values of `progress` and `max_workers`.

        parallel_op_config_rewrite
            Parallel op configuration for executing any needed re-write operations

            If not supplied, we use the values of `progress` and `max_workers`.

        progress
            Should progress bar(s) be used to display the progress of the various steps?

            Only used if the corresponding `parallel_op_config_*` variable
            for the operation is `None`.

        max_workers
            Maximum number of workers to use for parallel processing.

            If supplied, we create instances of
            [concurrent.futures.Executor][]
            with the provided number of workers
            (the exact kind of executor depends on the operation).

            If not supplied, the operations are executed serially.

            Only used if the corresponding `parallel_op_config_*` variable
            for the operation is `None`.

        Raises
        ------
        TypeError
            `data.index` is not an instance of `pd.MultiIndex`

        ValueError
            `data` contains rows with the same metadata

        AlreadyInDBError
            `allow_overwrite` is `False`
            and some of `data` is already in the database

        Warns
        -----
        UserWarning
            `warn_on_partial_overwrite` is `True`
            and saving requires re-writing existing files
        """
        # Validate before taking the lock, nothing below touches the disk
        if not isinstance(data.index, pd.MultiIndex):
            msg = (
                "`data.index` must be an instance of `pd.MultiIndex`. "
                f"Received {type(data.index)=}"
            )
            raise TypeError(msg)

        if data.index.duplicated().any():
            # keep=False so that every member of a duplicated group is reported
            duplicate_rows = data.index.duplicated(keep=False)
            duplicates = data.loc[duplicate_rows, :]
            msg = (
                "`data` contains rows with the same metadata. "
                f"duplicates=\n{duplicates}"
            )

            raise ValueError(msg)

        if index_file_lock is None:
            index_file_lock = self.index_file_lock

        with index_file_lock:
            if self.is_empty:
                # Nothing in the database yet:
                # nothing to move or keep and file IDs start at zero
                move_plan = None
                index_non_data = None
                file_map_non_data = None
                min_file_id = 0

            else:
                file_map_db = self.load_file_map(index_file_lock=index_file_lock)
                index_db = self.load_index(index_file_lock=index_file_lock)
                if not allow_overwrite:
                    # Bring both indexes onto the same levels before comparing them
                    data_index_unified, index_db_index_unified = (
                        unify_index_levels_check_index_types(data.index, index_db.index)
                    )
                    overwrite_required = multi_index_match(
                        data_index_unified, index_db_index_unified
                    )

                    if overwrite_required.any():
                        data_to_write_already_in_db = data.loc[overwrite_required, :]
                        raise AlreadyInDBError(
                            already_in_db=data_to_write_already_in_db
                        )

                move_plan = make_move_plan(
                    index_start=index_db,
                    file_map_start=file_map_db,
                    data_to_write=data,
                    get_new_data_file_path=self.get_new_data_file_path,
                    db_dir=self.db_dir,
                )

                # As needed, re-write files without deleting the old files
                if move_plan.rewrite_actions is not None:
                    if warn_on_partial_overwrite:
                        msg = (
                            "Overwriting the data will require re-writing. "
                            "This may be slow. "
                            "If that is an issue, the way to solve it "
                            "is to update your workflow to ensure "
                            "that you are not overwriting data "
                            "or are only overwriting entire files."
                        )
                        warnings.warn(msg)

                    rewrite_files(
                        move_plan.rewrite_actions,
                        backend=self.backend_data,
                        parallel_op_config=parallel_op_config_rewrite,
                        progress=progress,
                        max_workers=max_workers,
                    )

                # Write the new data
                # New file IDs start after the largest ID found in either
                # the file map on disk or the file map of the move plan.
                current_largest_file_id = file_map_db.index.max()
                if not move_plan.moved_file_map.empty:
                    current_largest_file_id = max(
                        move_plan.moved_file_map.index.max(), current_largest_file_id
                    )

                index_non_data = move_plan.moved_index
                file_map_non_data = move_plan.moved_file_map
                min_file_id = current_largest_file_id + 1

            save_data(
                data,
                backend_data=self.backend_data,
                get_new_data_file_path=self.get_new_data_file_path,
                backend_index=self.backend_index,
                index_file=self.index_file,
                file_map_file=self.file_map_file,
                index_non_data=index_non_data,
                file_map_non_data=file_map_non_data,
                min_file_id=min_file_id,
                groupby=groupby,
                progress_grouping=progress_grouping,
                parallel_op_config=parallel_op_config_save,
                progress=progress,
                max_workers=max_workers,
            )

            # As needed, delete files.
            # We delete files last to minimise the risk of losing data
            # (might end up with double if we get interrupted here,
            # but that is better than zero).
            if move_plan is not None and move_plan.delete_paths is not None:
                delete_files(
                    files_to_delete=move_plan.delete_paths,
                    parallel_op_config=parallel_op_config_delete,
                    progress=progress,
                    max_workers=max_workers,
                )

    def to_gzipped_tar_archive(
        self,
        out_file: Path,
        mode: Literal["w:gz", "x:gz"] = "w:gz",
    ) -> Path:
        """
        Pack the database directory into a gzipped tar archive

        The whole of `self.db_dir` is added to the archive
        under the top-level name `db`.

        Parameters
        ----------
        out_file
            File in which to write the output

        mode
            Mode to use to open `out_file`

        Returns
        -------
        :
            Path to the gzipped tar archive

            This is the same as `out_file`, but is returned for convenience.
        """
        with tarfile.open(out_file, mode=mode) as archive:
            archive.add(self.db_dir, arcname="db")

        return out_file

backend_data `class-attribute` `instance-attribute` #

backend_data: OpenSCMDBDataBackend = field(kw_only=True)

The backend for (de-)serialising data (from) to disk

backend_index `class-attribute` `instance-attribute` #

backend_index: OpenSCMDBIndexBackend = field(kw_only=True)

The backend for (de-)serialising the database index (from) to disk

db_dir `class-attribute` `instance-attribute` #

db_dir: Path = field(kw_only=True)

Path in which the database is stored

Both the index and the data files will be written in this directory.

file_map_file `property` #

file_map_file: Path

The file in which the file map is stored

The file map stores the mapping from file_id to file path.

Returns:

Type	Description
`Path`	Path to the file map file

index_file `property` #

index_file: Path

The file in which the database's index is stored

Returns:

Type	Description
`Path`	Path to the index file

index_file_lock `class-attribute` `instance-attribute` #

index_file_lock: BaseFileLock = field(kw_only=True)

Lock for the index file

index_file_lock_path `property` #

index_file_lock_path: Path

Path to the lock file for the back-end's index file

is_empty `property` #

is_empty: bool

Whether the database is empty or not

Returns:

Type	Description
`bool`	`True` if the database is empty, `False` otherwise

create_reader #

create_reader(
    *,
    lock: bool | BaseFileLock | None = True,
    index_file_lock: BaseFileLock | None = None,
) -> OpenSCMDBReader

Create a database reader

Parameters:

Name	Type	Description	Default
`lock`	`bool \| BaseFileLock \| None`	Lock to give to the reader. If `True`, we create a new lock for the database, such that, if the reader is holding the lock, no operations can be performed on the database. If `False`, the reader is not given any lock.	`True`
`index_file_lock`	`BaseFileLock \| None`	Lock for the database's index file. Used while loading the index from disk. If not supplied, we use self.index_file_lock.	`None`

Returns:

Type	Description
`OpenSCMDBReader`	Database reader

Source code in src/pandas_openscm/db/openscm_db.py

def create_reader(
    self,
    *,
    lock: bool | filelock.BaseFileLock | None = True,
    index_file_lock: filelock.BaseFileLock | None = None,
) -> OpenSCMDBReader:
    """
    Create a database reader

    The index and file map are loaded once, up front,
    and handed to the reader.

    Parameters
    ----------
    lock
        Lock to give to the reader.

        If `True`, we create a new lock for the database, such that,
        if the reader is holding the lock,
        no operations can be performed on the database.

        If `False`, the reader is not given any lock.

        Any other value (a lock instance or `None`)
        is passed to the reader unchanged.

    index_file_lock
        Lock for the database's index file

        Used while loading the index and the file map from disk.

        If not supplied, we use [self.index_file_lock][(c)].

    Returns
    -------
    :
        Database reader

    Raises
    ------
    MissingOptionalDependencyError
        `lock` is `True` and `filelock` could not be imported
    """
    if lock is True:
        try:
            import filelock  # noqa: PLC0415
        except ImportError as exc:
            raise MissingOptionalDependencyError(  # noqa: TRY003
                "create_reader(..., lock=True, ...)", requirement="filelock"
            ) from exc

        # The reader gets its own lock object,
        # pointing at the same lock file as the database's index lock.
        reader_lock = filelock.FileLock(self.index_file_lock_path)
    elif lock is False:
        # `False` means "no lock", which the reader expresses as `None`
        reader_lock = None
    else:
        reader_lock = lock

    return OpenSCMDBReader(
        backend_data=self.backend_data,
        db_dir=self.db_dir,
        db_index=self.load_index(index_file_lock=index_file_lock),
        db_file_map=self.load_file_map(index_file_lock=index_file_lock),
        lock=reader_lock,
    )

default_index_file_lock #

default_index_file_lock() -> BaseFileLock

Get default lock for the back-end's index file

Source code in src/pandas_openscm/db/openscm_db.py

@index_file_lock.default  # ty: ignore[call-non-callable]
def default_index_file_lock(self) -> filelock.BaseFileLock:
    """
    Provide the default value for the `index_file_lock` field

    Returns
    -------
    :
        File lock created at `self.index_file_lock_path`

    Raises
    ------
    MissingOptionalDependencyError
        `filelock` could not be imported
    """
    try:
        import filelock  # noqa: PLC0415
    except ImportError as exc:
        raise MissingOptionalDependencyError(
            "default_index_file_lock", requirement="filelock"
        ) from exc
    else:
        lock_path = self.index_file_lock_path
        return filelock.FileLock(lock_path)

delete #

delete(
    *,
    index_file_lock: BaseFileLock | None = None,
    parallel_op_config: ParallelOpConfig | None = None,
    progress: bool = False,
    max_workers: int | None = None,
) -> None

Delete all data in the database

Parameters:

Name	Type	Description	Default
`index_file_lock`	`BaseFileLock \| None`	Lock for the database's index file. If not supplied, we use self.index_file_lock.	`None`
`parallel_op_config`	`ParallelOpConfig \| None`	Configuration for executing the operation in parallel with progress bars. If not supplied, we use the values of `progress` and `max_workers`.	`None`
`progress`	`bool`	Should progress bar(s) be used to display the progress of the deletion? Only used if `parallel_op_config` is `None`.	`False`
`max_workers`	`int \| None`	Maximum number of workers to use for parallel processing. If supplied, we create an instance of concurrent.futures.ThreadPoolExecutor with the provided number of workers (a thread pool makes sense as deletion is I/O-bound). If not supplied, the deletions are executed serially. Only used if `parallel_op_config` is `None`.	`None`

Source code in src/pandas_openscm/db/openscm_db.py

def delete(
    self,
    *,
    index_file_lock: filelock.BaseFileLock | None = None,
    parallel_op_config: ParallelOpConfig | None = None,
    progress: bool = False,
    max_workers: int | None = None,
) -> None:
    """
    Delete all data in the database

    Every file in `self.db_dir` whose name ends with
    the data backend's or the index backend's extension is removed.
    The index file lock is held while this happens.

    Parameters
    ----------
    index_file_lock
        Lock for the database's index file

        If not supplied, we use [self.index_file_lock][(c)].

    parallel_op_config
        Configuration for executing the operation in parallel with progress bars

        If not supplied, we use the values of `progress` and `max_workers`.

    progress
        Should progress bar(s) be used to display the progress of the deletion?

        Only used if `parallel_op_config` is `None`.

    max_workers
        Maximum number of workers to use for parallel processing.

        If supplied, we create an instance of
        [concurrent.futures.ThreadPoolExecutor](https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ThreadPoolExecutor)
        with the provided number of workers
        (a thread pool makes sense as deletion is I/O-bound).

        If not supplied, the deletions are executed serially.

        Only used if `parallel_op_config` is `None`.
    """
    lock = self.index_file_lock if index_file_lock is None else index_file_lock

    with lock:
        # A set, so a file matching both patterns
        # (e.g. when both backends share an extension) is only listed once.
        files_to_delete = set(self.db_dir.glob(f"*{self.backend_data.ext}"))
        files_to_delete.update(self.db_dir.glob(f"*{self.backend_index.ext}"))

        delete_files(
            files_to_delete=files_to_delete,
            parallel_op_config=parallel_op_config,
            progress=progress,
            max_workers=max_workers,
        )

from_gzipped_tar_archive `classmethod` #

from_gzipped_tar_archive(
    tar_archive: Path,
    db_dir: Path,
    backend_data: OpenSCMDBDataBackend | None = None,
    backend_index: OpenSCMDBIndexBackend | None = None,
) -> OpenSCMDB

Initialise from a gzipped tar archive

This also unpacks the files to disk

Parameters:

Name	Type	Description	Default
`tar_archive`	`Path`	Tar archive from which to initialise	required
`db_dir`	`Path`	Directory in which to unpack the database	required
`backend_data`	`OpenSCMDBDataBackend \| None`	Backend to use for handling the data	`None`
`backend_index`	`OpenSCMDBIndexBackend \| None`	Backend to use for handling the index	`None`

Returns:

Type	Description
`OpenSCMDB`	Initialised database

Source code in src/pandas_openscm/db/openscm_db.py

@classmethod
def from_gzipped_tar_archive(
    cls,
    tar_archive: Path,
    db_dir: Path,
    backend_data: OpenSCMDBDataBackend | None = None,
    backend_index: OpenSCMDBIndexBackend | None = None,
) -> OpenSCMDB:
    """
    Initialise from a gzipped tar archive

    This also unpacks the files to disk.
    Only regular files are unpacked and any directory structure
    in the archive is dropped, i.e. all files end up directly in `db_dir`.

    Parameters
    ----------
    tar_archive
        Tar archive from which to initialise

    db_dir
        Directory in which to unpack the database

    backend_data
        Backend to use for handling the data

        If not supplied, it is guessed from the name of the first
        unpacked file that is neither an index nor a file map file.

    backend_index
        Backend to use for handling the index

        If not supplied, it is guessed from the name of the first
        unpacked file whose name starts with `index`.

    Returns
    -------
    :
        Initialised database
    """
    with tarfile.open(tar_archive, "r") as archive:
        regular_members = (m for m in archive.getmembers() if m.isreg())
        for member in regular_members:
            # Drop any directory prefix so the file is written directly into db_dir
            member.name = Path(member.name).name
            archive.extract(member, db_dir)

            file_name = member.name
            if file_name.startswith("index"):
                if backend_index is None:
                    backend_index = INDEX_BACKENDS.guess_backend(file_name)

            elif backend_data is None and not file_name.startswith("filemap"):
                backend_data = DATA_BACKENDS.guess_backend(file_name)

    if backend_data is None:  # pragma: no cover
        # Should be impossible to get here
        raise TypeError(backend_data)

    if backend_index is None:  # pragma: no cover
        # Should be impossible to get here
        raise TypeError(backend_index)

    return cls(  # ty: ignore[missing-argument]
        backend_data=backend_data, backend_index=backend_index, db_dir=db_dir
    )

get_new_data_file_path #

get_new_data_file_path(file_id: int) -> DBPath

Get the path in which to write a new data file

Parameters:

Name	Type	Description	Default
`file_id`	`int`	ID to associate with the file	required

Returns:

Type	Description
`DBPath`	Information about the path in which to write the new data

Raises:

Type	Description
`FileExistsError`	A file already exists for the given `file_id`

Source code in src/pandas_openscm/db/openscm_db.py

def get_new_data_file_path(self, file_id: int) -> DBPath:
    """
    Work out where a new data file should be written

    The file is named after `file_id`, uses the data backend's extension
    and sits directly in the database's directory.

    Parameters
    ----------
    file_id
        ID to associate with the file

    Returns
    -------
    :
        Description of the path in which the new data can be written

    Raises
    ------
    FileExistsError
        There is already a file on disk for the given `file_id`
    """
    candidate = self.db_dir / f"{file_id}{self.backend_data.ext}"

    if not candidate.exists():
        return DBPath.from_abs_path_and_db_dir(abs=candidate, db_dir=self.db_dir)

    raise FileExistsError(candidate)

load #

load(
    selector: Index[Any]
    | MultiIndex
    | Selector
    | None = None,
    *,
    index_file_lock: BaseFileLock | None = None,
    out_columns_type: type | None = None,
    out_columns_name: str | None = None,
    parallel_op_config: ParallelOpConfig | None = None,
    progress: bool = False,
    max_workers: int | None = None,
) -> DataFrame

Load data

Parameters:

Name	Type	Description	Default
`selector`	`Index[Any] \| MultiIndex \| Selector \| None`	Selector to use to choose the data to load	`None`
`index_file_lock`	`BaseFileLock \| None`	Lock for the database's index file If not supplied, we use self.index_file_lock.	`None`
`out_columns_type`	`type \| None`	Type to set the output columns to. If not supplied, we don't set the output columns' type.	`None`
`out_columns_name`	`str \| None`	The name for the columns in the output. If not supplied, we don't set the output columns' name. This can also be set with pd.DataFrame.rename_axis but we provide it here for convenience (and in case you couldn't find this trick for ages, like us).	`None`
`parallel_op_config`	`ParallelOpConfig \| None`	Configuration for executing the operation in parallel with progress bars If not supplied, we use the values of `progress` and `max_workers`.	`None`
`progress`	`bool`	Should progress bar(s) be used to display the progress of the loading? Only used if `parallel_op_config` is `None`.	`False`
`max_workers`	`int \| None`	Maximum number of workers to use for parallel processing. If supplied, we create an instance of concurrent.futures.ProcessPoolExecutor with the provided number of workers. A process pool seems to be the sensible default from our experimentation, but it is not a universally better choice. If you need something else because of how your database is set up, simply pass `parallel_op_config` rather than using the shortcut of passing `max_workers`. If not supplied, the loading is executed serially. Only used if `parallel_op_config` is `None`.	`None`

Returns:

Type	Description
`DataFrame`	Loaded data

Raises:

Type	Description
`EmptyDBError`	The database is empty

Source code in src/pandas_openscm/db/openscm_db.py

def load(  # noqa: PLR0913
    self,
    selector: pd.Index[Any] | pd.MultiIndex | pix.selectors.Selector | None = None,
    *,
    index_file_lock: filelock.BaseFileLock | None = None,
    out_columns_type: type | None = None,
    out_columns_name: str | None = None,
    parallel_op_config: ParallelOpConfig | None = None,
    progress: bool = False,
    max_workers: int | None = None,
) -> pd.DataFrame:
    """
    Load data from the database

    The file map and the index are read while holding the index file lock,
    and the data is loaded before the lock is released.

    Parameters
    ----------
    selector
        Selector to use to choose the data to load

    index_file_lock
        Lock for the database's index file

        If not supplied, we use [self.index_file_lock][(c)].

    out_columns_type
        Type to set the output columns to.

        If not supplied, we don't set the output columns' type.

    out_columns_name
        The name for the columns in the output.

        If not supplied, we don't set the output columns' name.

        This can also be set with
        [pd.DataFrame.rename_axis][pandas.DataFrame.rename_axis]
        but we provide it here for convenience
        (and in case you couldn't find this trick for ages, like us).

    parallel_op_config
        Configuration for executing the operation in parallel with progress bars

        If not supplied, we use the values of `progress` and `max_workers`.

    progress
        Should progress bar(s) be used to display the progress of the loading?

        Only used if `parallel_op_config` is `None`.

    max_workers
        Maximum number of workers to use for parallel processing.

        If supplied, we create an instance of
        [concurrent.futures.ProcessPoolExecutor](https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ProcessPoolExecutor)
        with the provided number of workers.
        A process pool seems to be the sensible default from our experimentation,
        but it is not a universally better choice.
        If you need something else because of how your database is set up,
        simply pass `parallel_op_config`
        rather than using the shortcut of passing `max_workers`.

        If not supplied, the loading is executed serially.

        Only used if `parallel_op_config` is `None`.

    Returns
    -------
    :
        Loaded data

    Raises
    ------
    EmptyDBError
        The database is empty
    """
    if self.is_empty:
        raise EmptyDBError(self)

    lock = self.index_file_lock if index_file_lock is None else index_file_lock

    with lock:
        # The same lock is handed on so the helpers re-use it
        db_file_map = self.load_file_map(index_file_lock=lock)
        db_index = self.load_index(index_file_lock=lock)

        return load_data(
            backend_data=self.backend_data,
            db_index=db_index,
            db_file_map=db_file_map,
            db_dir=self.db_dir,
            selector=selector,
            out_columns_type=out_columns_type,
            out_columns_name=out_columns_name,
            parallel_op_config=parallel_op_config,
            progress=progress,
            max_workers=max_workers,
        )

load_file_map #

load_file_map(
    *, index_file_lock: BaseFileLock | None = None
) -> Series[Path]

Load the file map

Parameters:

Name	Type	Description	Default
`index_file_lock`	`BaseFileLock \| None`	Lock for the database's index file If not supplied, we use self.index_file_lock.	`None`

Returns:

Type	Description
`Series[Path]`	Map from file ID to file path

Raises:

Type	Description
`EmptyDBError`	The database is empty

Source code in src/pandas_openscm/db/openscm_db.py

def load_file_map(
    self,
    *,
    index_file_lock: filelock.BaseFileLock | None = None,
) -> pd.Series[Path]:  # type: ignore # pandas type hints confused about what they support
    """
    Load the database's file map

    The file map is read from `self.file_map_file`
    while holding the index file lock.

    Parameters
    ----------
    index_file_lock
        Lock for the database's index file

        If not supplied, we use [self.index_file_lock][(c)].

    Returns
    -------
    :
        Map from file ID to file path

    Raises
    ------
    EmptyDBError
        The database is empty
    """
    if self.is_empty:
        raise EmptyDBError(self)

    lock = self.index_file_lock if index_file_lock is None else index_file_lock

    with lock:
        return load_db_file_map(
            backend_index=self.backend_index, file_map_file=self.file_map_file
        )

load_index #

load_index(
    *, index_file_lock: BaseFileLock | None = None
) -> DataFrame

Load the index

Parameters:

Name	Type	Description	Default
`index_file_lock`	`BaseFileLock \| None`	Lock for the database's index file If not supplied, we use self.index_file_lock.	`None`

Returns:

Type	Description
`DataFrame`	Database index

Raises:

Type	Description
`EmptyDBError`	The database is empty

Source code in src/pandas_openscm/db/openscm_db.py

def load_index(
    self,
    *,
    index_file_lock: filelock.BaseFileLock | None = None,
) -> pd.DataFrame:
    """
    Load the database's index

    The index is read from `self.index_file`
    while holding the index file lock.

    Parameters
    ----------
    index_file_lock
        Lock for the database's index file

        If not supplied, we use [self.index_file_lock][(c)].

    Returns
    -------
    :
        Database index

    Raises
    ------
    EmptyDBError
        The database is empty
    """
    if self.is_empty:
        raise EmptyDBError(self)

    lock = self.index_file_lock if index_file_lock is None else index_file_lock

    with lock:
        return load_db_index(
            backend_index=self.backend_index,
            index_file=self.index_file,
        )

load_metadata #

load_metadata(
    *, index_file_lock: BaseFileLock | None = None
) -> MultiIndex

Load the database's metadata

Parameters:

Name	Type	Description	Default
`index_file_lock`	`BaseFileLock \| None`	Lock for the database's index file If not supplied, we use self.index_file_lock.	`None`

Returns:

Type	Description
`MultiIndex`	Loaded metadata

Source code in src/pandas_openscm/db/openscm_db.py

def load_metadata(
    self,
    *,
    index_file_lock: filelock.BaseFileLock | None = None,
) -> pd.MultiIndex:
    """
    Load the metadata of the data stored in the database

    The metadata is read from `self.index_file`
    while holding the index file lock.

    Parameters
    ----------
    index_file_lock
        Lock for the database's index file

        If not supplied, we use [self.index_file_lock][(c)].

    Returns
    -------
    :
        Loaded metadata

    Raises
    ------
    EmptyDBError
        The database's index file does not exist
    """
    index_file_missing = not self.index_file.exists()
    if index_file_missing:
        raise EmptyDBError(self)

    lock = self.index_file_lock if index_file_lock is None else index_file_lock

    with lock:
        return load_db_metadata(
            backend_index=self.backend_index, index_file=self.index_file
        )

save #

save(
    data: DataFrame,
    *,
    index_file_lock: BaseFileLock | None = None,
    groupby: list[str] | None = None,
    allow_overwrite: bool = False,
    warn_on_partial_overwrite: bool = True,
    progress_grouping: ProgressLike | None = None,
    parallel_op_config_save: ParallelOpConfig | None = None,
    parallel_op_config_delete: ParallelOpConfig
    | None = None,
    parallel_op_config_rewrite: ParallelOpConfig
    | None = None,
    progress: bool = False,
    max_workers: int | None = None,
) -> None

Save data into the database

Parameters:

Name	Type	Description	Default
`data`	`DataFrame`	Data to add to the database	required
`index_file_lock`	`BaseFileLock \| None`	Lock for the database's index file If not supplied, we use self.index_file_lock.	`None`
`groupby`	`list[str] \| None`	Metadata columns to use to group the data. If not supplied, we save all the data in a single file.	`None`
`allow_overwrite`	`bool`	Should overwrites of data that is already in the database be allowed? If this is `True`, there is a risk that, if interrupted halfway through, you can end up with duplicate data in your database or some other odd broken state.	`False`
`warn_on_partial_overwrite`	`bool`	Should a warning be raised if a partial overwrite will occur? This is on by default so that users are warned about the slow operation of re-writing.	`True`
`progress_grouping`	`ProgressLike \| None`	Progress bar to use when grouping the data If not supplied, we use the values of `progress` and `max_workers`.	`None`
`parallel_op_config_save`	`ParallelOpConfig \| None`	Parallel op configuration for executing save operations If not supplied, we use the values of `progress` and `max_workers`.	`None`
`parallel_op_config_delete`	`ParallelOpConfig \| None`	Parallel op configuration for executing any needed delete operations If not supplied, we use the values of `progress` and `max_workers`.	`None`
`parallel_op_config_rewrite`	`ParallelOpConfig \| None`	Parallel op configuration for executing any needed re-write operations If not supplied, we use the values of `progress` and `max_workers`.	`None`
`progress`	`bool`	Should progress bar(s) be used to display the progress of the various steps? Only used if the corresponding `parallel_op_config_*` variable for the operation is `None`.	`False`
`max_workers`	`int \| None`	Maximum number of workers to use for parallel processing. If supplied, we create instances of concurrent.futures.Executor with the provided number of workers (the exact kind of executor depends on the operation). If not supplied, the operations are executed serially. Only used if the corresponding `parallel_op_config_*` variable for the operation is `None`.	`None`

Source code in src/pandas_openscm/db/openscm_db.py

def save(  # noqa: PLR0913
    self,
    data: pd.DataFrame,
    *,
    index_file_lock: filelock.BaseFileLock | None = None,
    groupby: list[str] | None = None,
    allow_overwrite: bool = False,
    warn_on_partial_overwrite: bool = True,
    progress_grouping: ProgressLike | None = None,
    parallel_op_config_save: ParallelOpConfig | None = None,
    parallel_op_config_delete: ParallelOpConfig | None = None,
    parallel_op_config_rewrite: ParallelOpConfig | None = None,
    progress: bool = False,
    max_workers: int | None = None,
) -> None:
    """
    Save data into the database

    After validating `data`, everything else happens
    while holding the index file lock, in this order:
    any required re-writes of existing files,
    writing of the new data,
    then deletion of any files that are no longer needed.

    Parameters
    ----------
    data
        Data to add to the database

    index_file_lock
        Lock for the database's index file

        If not supplied, we use [self.index_file_lock][(c)].

    groupby
        Metadata columns to use to group the data.

        If not supplied, we save all the data in a single file.

    allow_overwrite
        Should overwrites of data that is already in the database be allowed?

        If this is `True`, there is a risk that, if interrupted halfway through,
        you can end up with duplicate data in your database
        or some other odd broken state.

    warn_on_partial_overwrite
        Should a warning be raised if a partial overwrite will occur?

        This is on by default so that users
        are warned about the slow operation of re-writing.

    progress_grouping
        Progress bar to use when grouping the data

        If not supplied, we use the values of `progress` and `max_workers`.

    parallel_op_config_save
        Parallel op configuration for executing save operations

        If not supplied, we use the values of `progress` and `max_workers`.

    parallel_op_config_delete
        Parallel op configuration for executing any needed delete operations

        If not supplied, we use the values of `progress` and `max_workers`.

    parallel_op_config_rewrite
        Parallel op configuration for executing any needed re-write operations

        If not supplied, we use the values of `progress` and `max_workers`.

    progress
        Should progress bar(s) be used to display the progress of the various steps?

        Only used if the corresponding `parallel_op_config_*` variable
        for the operation is `None`.

    max_workers
        Maximum number of workers to use for parallel processing.

        If supplied, we create instances of
        [concurrent.futures.Executor][]
        with the provided number of workers
        (the exact kind of executor depends on the operation).

        If not supplied, the operations are executed serially.

        Only used if the corresponding `parallel_op_config_*` variable
        for the operation is `None`.

    Raises
    ------
    TypeError
        `data.index` is not a [pd.MultiIndex][pandas.MultiIndex]

    ValueError
        `data` contains rows with the same metadata

    AlreadyInDBError
        `allow_overwrite` is `False`
        and some of `data` matches entries already in the database's index
    """
    # Validate the input before taking the lock or touching the disk
    if not isinstance(data.index, pd.MultiIndex):
        msg = (
            "`data.index` must be an instance of `pd.MultiIndex`. "
            f"Received {type(data.index)=}"
        )
        raise TypeError(msg)

    if data.index.duplicated().any():
        # keep=False flags every member of a duplicated group,
        # so the error message shows all of the clashing rows
        duplicate_rows = data.index.duplicated(keep=False)
        duplicates = data.loc[duplicate_rows, :]
        msg = (
            "`data` contains rows with the same metadata. "
            f"duplicates=\n{duplicates}"
        )

        raise ValueError(msg)

    if index_file_lock is None:
        index_file_lock = self.index_file_lock

    with index_file_lock:
        if self.is_empty:
            # Nothing is stored yet: there is nothing to move
            # and file IDs start from zero
            move_plan = None
            index_non_data = None
            file_map_non_data = None
            min_file_id = 0

        else:
            file_map_db = self.load_file_map(index_file_lock=index_file_lock)
            index_db = self.load_index(index_file_lock=index_file_lock)
            if not allow_overwrite:
                # Refuse to continue if any row of `data`
                # matches an entry of the existing index
                data_index_unified, index_db_index_unified = (
                    unify_index_levels_check_index_types(data.index, index_db.index)
                )
                overwrite_required = multi_index_match(
                    data_index_unified, index_db_index_unified
                )

                if overwrite_required.any():
                    data_to_write_already_in_db = data.loc[overwrite_required, :]
                    raise AlreadyInDBError(
                        already_in_db=data_to_write_already_in_db
                    )

            move_plan = make_move_plan(
                index_start=index_db,
                file_map_start=file_map_db,
                data_to_write=data,
                get_new_data_file_path=self.get_new_data_file_path,
                db_dir=self.db_dir,
            )

            # As needed, re-write files without deleting the old files
            if move_plan.rewrite_actions is not None:
                if warn_on_partial_overwrite:
                    msg = (
                        "Overwriting the data will require re-writing. "
                        "This may be slow. "
                        "If that is an issue, the way to solve it "
                        "is to update your workflow to ensure "
                        "that you are not overwriting data "
                        "or are only overwriting entire files."
                    )
                    warnings.warn(msg)

                rewrite_files(
                    move_plan.rewrite_actions,
                    backend=self.backend_data,
                    parallel_op_config=parallel_op_config_rewrite,
                    progress=progress,
                    max_workers=max_workers,
                )

            # Write the new data
            # The largest file ID is taken over both the file map on disk
            # and the moved file map from the move plan
            current_largest_file_id = file_map_db.index.max()
            if not move_plan.moved_file_map.empty:
                current_largest_file_id = max(
                    move_plan.moved_file_map.index.max(), current_largest_file_id
                )

            index_non_data = move_plan.moved_index
            file_map_non_data = move_plan.moved_file_map
            # Pass on an ID one larger than the largest ID found above
            min_file_id = current_largest_file_id + 1

        save_data(
            data,
            backend_data=self.backend_data,
            get_new_data_file_path=self.get_new_data_file_path,
            backend_index=self.backend_index,
            index_file=self.index_file,
            file_map_file=self.file_map_file,
            index_non_data=index_non_data,
            file_map_non_data=file_map_non_data,
            min_file_id=min_file_id,
            groupby=groupby,
            progress_grouping=progress_grouping,
            parallel_op_config=parallel_op_config_save,
            progress=progress,
            max_workers=max_workers,
        )

        # As needed, delete files.
        # We delete files last to minimise the risk of losing data
        # (might end up with double if we get interrupted here,
        # but that is better than zero).
        if move_plan is not None and move_plan.delete_paths is not None:
            delete_files(
                files_to_delete=move_plan.delete_paths,
                parallel_op_config=parallel_op_config_delete,
                progress=progress,
                max_workers=max_workers,
            )

to_gzipped_tar_archive #

to_gzipped_tar_archive(
    out_file: Path, mode: Literal["w:gz", "x:gz"] = "w:gz"
) -> Path

Convert to a gzipped tar archive

Parameters:

Name	Type	Description	Default
`out_file`	`Path`	File in which to write the output	required
`mode`	`Literal['w:gz', 'x:gz']`	Mode to use to open `out_file`	`'w:gz'`

Returns:

Type	Description
`Path`	Path to the gzipped tar archive This is the same as `out_file`, but is returned for convenience.

Source code in src/pandas_openscm/db/openscm_db.py

def to_gzipped_tar_archive(
    self,
    out_file: Path,
    mode: Literal["w:gz", "x:gz"] = "w:gz",
) -> Path:
    """
    Pack the database's directory into a gzipped tar archive

    The directory is stored in the archive under the name "db".

    Parameters
    ----------
    out_file
        File in which to write the archive

    mode
        Mode with which `out_file` is opened

    Returns
    -------
    :
        Path to the gzipped tar archive

        This is simply `out_file`, handed back for convenience.
    """
    with tarfile.open(out_file, mode) as archive:
        archive.add(name=self.db_dir, arcname="db")

    return out_file

OpenSCMDBDataBackend #

Bases: Protocol

Backend for (de-)serialising data

Designed to be used with OpenSCMDB

Methods:

Name	Description
`load_data`	Load a data file
`save_data`	Save data to disk

Attributes:

Name	Type	Description
`ext`	`str`	Extension to use with data files saved by this backend.
`preserves_index`	`bool`	Whether this backend preserves the index of data upon (de-)serialisation

Source code in src/pandas_openscm/db/interfaces.py

@runtime_checkable
class OpenSCMDBDataBackend(Protocol):
    """
    Backend for (de-)serialising data

    Designed to be used with [OpenSCMDB][(m)]
    """

    ext: str
    """
    Extension to use with data files saved by this backend.
    """

    preserves_index: bool
    """
    Whether this backend preserves the index of data upon (de-)serialisation
    """

    @staticmethod
    def load_data(data_file: Path) -> pd.DataFrame:
        """
        Load a data file

        This is a low-level method
        that just handles the specifics of loading the data from disk.
        Working out the path from which to load the data
        should happen in higher-level functions.

        Parameters
        ----------
        data_file
            File from which to load the data

        Returns
        -------
        :
            Loaded data

        Notes
        -----
        This just loads the data directly from disk.
        If the data had a `pd.MultiIndex` when it was saved,
        this may or may not be restored.
        It is up to the user
        to decide whether to do any `pd.MultiIndex` restoration or not,
        based on their use case and the value of `self.preserves_index`.
        We do not make this choice as converting back to a
        `pd.MultiIndex` can be a very expensive operation,
        and we want to give the user control over any such optimisations.
        """

    @staticmethod
    def save_data(
        data: pd.DataFrame,
        data_file: Path,
    ) -> None:
        """
        Save data to disk

        This is a low-level method
        that just handles the specifics of serialising the data to disk.
        Working out what to save and in what path
        should happen in higher-level functions.

        Parameters
        ----------
        data
            Data to save

        data_file
            File in which to save the data
        """

ext `instance-attribute` #

ext: str

Extension to use with data files saved by this backend.

preserves_index `instance-attribute` #

preserves_index: bool

Whether this backend preserves the index of data upon (de-)serialisation

load_data `staticmethod` #

load_data(data_file: Path) -> DataFrame

Load a data file

This is a low-level method that just handles the specifics of loading the data from disk. Working out the path from which to load the data should happen in higher-level functions.

Parameters:

Name	Type	Description	Default
`data_file`	`Path`	File from which to load the data	required

Returns:

Type	Description
`DataFrame`	Loaded data

Notes

This just loads the data directly from disk. If the data had a pd.MultiIndex when it was saved, this may or may not be restored. It is up to the user to decide whether to do any pd.MultiIndex restoration or not, based on their use case and the value of self.preserves_index. We do not make this choice as converting back to a pd.MultiIndex can be a very expensive operation, and we want to give the user control over any such optimisations.

Source code in src/pandas_openscm/db/interfaces.py

@staticmethod
def load_data(data_file: Path) -> pd.DataFrame:
    """
    Load a data file

    This is a low-level method
    that just handles the specifics of loading the data from disk.
    Working out the path from which to load the data
    should happen in higher-level functions.

    Parameters
    ----------
    data_file
        File from which to load the data

    Returns
    -------
    :
        Loaded data

    Notes
    -----
    This just loads the data directly from disk.
    If the data had a `pd.MultiIndex` when it was saved,
    this may or may not be restored.
    It is up to the user
    to decide whether to do any `pd.MultiIndex` restoration or not,
    based on their use case and the value of `self.preserves_index`.
    We do not make this choice as converting back to a
    `pd.MultiIndex` can be a very expensive operation,
    and we want to give the user control over any such optimisations.
    """

save_data `staticmethod` #

save_data(data: DataFrame, data_file: Path) -> None

Save data to disk

This is a low-level method that just handles the specifics of serialising the data to disk. Working out what to save and in what path should happen in higher-level functions.

Parameters:

Name	Type	Description	Default
`data`	`DataFrame`	Data to save	required
`data_file`	`Path`	File in which to save the data	required

Source code in src/pandas_openscm/db/interfaces.py

@staticmethod
def save_data(
    data: pd.DataFrame,
    data_file: Path,
) -> None:
    """
    Save data to disk

    This is a low-level method
    that just handles the specifics of serialising the data to disk.
    Working out what to save and in what path
    should happen in higher-level functions.

    Parameters
    ----------
    data
        Data to save

    data_file
        File in which to save the data

    Notes
    -----
    This definition has no body beyond its docstring:
    it only declares the signature that a data backend has to provide.
    """

OpenSCMDBIndexBackend #

Bases: Protocol

Backend for (de-)serialising the index (and file map)

Designed to be used with OpenSCMDB

Methods:

Name	Description
`load_file_map`	Load the file map
`load_index`	Load the index
`save_file_map`	Save the file map to disk
`save_index`	Save the index to disk

Attributes:

Name	Type	Description
`ext`	`str`	Extension to use with index files saved by this backend.
`preserves_index`	`bool`	Whether this backend preserves the `pd.MultiIndex` upon (de-)serialisation

Source code in src/pandas_openscm/db/interfaces.py

@runtime_checkable
class OpenSCMDBIndexBackend(Protocol):
    """
    Backend for (de-)serialising the index (and file map)

    Designed to be used with [OpenSCMDB][(m)]
    """

    ext: str
    """
    Extension to use with index files saved by this backend.
    """

    preserves_index: bool
    """
    Whether this backend preserves the `pd.MultiIndex` upon (de-)serialisation
    """

    @staticmethod
    def load_file_map(file_map_file: Path) -> pd.DataFrame:
        """
        Load the file map

        This is a low-level method
        that just handles the specifics of loading the file map from disk.
        Working out the path from which to load the file map
        should happen in higher-level functions.

        Parameters
        ----------
        file_map_file
            File from which to load the file map

        Returns
        -------
        :
            Loaded file map

        Notes
        -----
        This returns a [pd.DataFrame][pandas.DataFrame].
        It is up to the user to cast this to a [pd.Series][pandas.Series]
        if they wish.
        """

    @staticmethod
    def load_index(index_file: Path) -> pd.DataFrame:
        """
        Load the index

        This is a low-level method
        that just handles the specifics of loading the index from disk.
        Working out the path from which to load the index
        should happen in higher-level functions.

        Parameters
        ----------
        index_file
            File from which to load the index

        Returns
        -------
        :
            Loaded index

        Notes
        -----
        This just loads the index directly from disk.
        If the index had a `pd.MultiIndex` when it was saved,
        this may or may not be restored.
        It is up to the user
        to decide whether to do any `pd.MultiIndex` restoration or not,
        based on their use case and the value of `self.preserves_index`.
        We do not make this choice as converting back to a
        `pd.MultiIndex` can be a very expensive operation,
        and we want to give the user control over any such optimisations.
        """

    def save_file_map(
        self,
        file_map: pd.Series[Path],  # type: ignore # pandas confused about what it supports
        file_map_file: Path,
    ) -> None:
        """
        Save the file map to disk

        This is a low-level method
        that just handles the specifics of serialising the file map to disk.
        Working out what to save and in what path
        should happen in higher-level functions.

        Parameters
        ----------
        file_map
            File map to save

        file_map_file
            File in which to save the file map
        """

    def save_index(
        self,
        index: pd.DataFrame,
        index_file: Path,
    ) -> None:
        """
        Save the index to disk

        This is a low-level method
        that just handles the specifics of serialising the index to disk.
        Working out what to save and in what path
        should happen in higher-level functions.

        Parameters
        ----------
        index
            Index to save

        index_file
            File in which to save the index
        """

ext `instance-attribute` #

ext: str

Extension to use with index files saved by this backend.

preserves_index `instance-attribute` #

preserves_index: bool

Whether this backend preserves the pd.MultiIndex upon (de-)serialisation

load_file_map `staticmethod` #

load_file_map(file_map_file: Path) -> DataFrame

Load the file map

This is a low-level method that just handles the specifics of loading the file map from disk. Working out the path from which to load the file map should happen in higher-level functions.

Parameters:

Name	Type	Description	Default
`file_map_file`	`Path`	File from which to load the file map	required

Returns:

Type	Description
`DataFrame`	Loaded file map

Notes

This returns a pd.DataFrame. It is up to the user to cast this to a pd.Series if they wish.

Source code in src/pandas_openscm/db/interfaces.py

@staticmethod
def load_file_map(file_map_file: Path) -> pd.DataFrame:
    """
    Load the file map

    This is a low-level method
    that just handles the specifics of loading the file map from disk.
    Working out the path from which to load the file map
    should happen in higher-level functions.

    Parameters
    ----------
    file_map_file
        File from which to load the file map

    Returns
    -------
    :
        Loaded file map

    Notes
    -----
    This returns a [pd.DataFrame][pandas.DataFrame].
    It is up to the user to cast this to a [pd.Series][pandas.Series]
    if they wish.
    """

load_index `staticmethod` #

load_index(index_file: Path) -> DataFrame

Load the index

This is a low-level method that just handles the specifics of loading the index from disk. Working out the path from which to load the index should happen in higher-level functions.

Parameters:

Name	Type	Description	Default
`index_file`	`Path`	File from which to load the index	required

Returns:

Type	Description
`DataFrame`	Loaded index

Notes

This just loads the index directly from disk. If the index had a pd.MultiIndex when it was saved, this may or may not be restored. It is up to the user to decide whether to do any pd.MultiIndex restoration or not, based on their use case and the value of self.preserves_index. We do not make this choice as converting back to a pd.MultiIndex can be a very expensive operation, and we want to give the user control over any such optimisations.

Source code in src/pandas_openscm/db/interfaces.py

@staticmethod
def load_index(index_file: Path) -> pd.DataFrame:
    """
    Load the index

    This is a low-level method
    that just handles the specifics of loading the index from disk.
    Working out the path from which to load the index
    should happen in higher-level functions.

    Parameters
    ----------
    index_file
        File from which to load the index

    Returns
    -------
    :
        Loaded index

    Notes
    -----
    This just loads the index directly from disk.
    If the index had a `pd.MultiIndex` when it was saved,
    this may or may not be restored.
    It is up to the user
    to decide whether to do any `pd.MultiIndex` restoration or not,
    based on their use case and the value of `self.preserves_index`.
    We do not make this choice as converting back to a
    `pd.MultiIndex` can be a very expensive operation,
    and we want to give the user control over any such optimisations.
    """

save_file_map #

save_file_map(
    file_map: Series[Path], file_map_file: Path
) -> None

Save the file map to disk

This is a low-level method that just handles the specifics of serialising the file map to disk. Working out what to save and in what path should happen in higher-level functions.

Parameters:

Name	Type	Description	Default
`file_map`	`Series[Path]`	File map to save	required
`file_map_file`	`Path`	File in which to save the file map	required

Source code in src/pandas_openscm/db/interfaces.py

def save_file_map(
    self,
    file_map: pd.Series[Path],  # type: ignore # pandas confused about what it supports
    file_map_file: Path,
) -> None:
    """
    Save the file map to disk

    This is a low-level method
    that just handles the specifics of serialising the file map to disk.
    Working out what to save and which path to save it to
    should happen in higher-level functions.

    Parameters
    ----------
    file_map
        File map to save, given as a [pd.Series][pandas.Series] of paths

    file_map_file
        File in which to save the file map
    """

save_index #

save_index(index: DataFrame, index_file: Path) -> None

Save the index to disk

This is a low-level method that just handles the specifics of serialising the index to disk. Working out what to save and in what path should happen in higher-level functions.

Parameters:

Name	Type	Description	Default
`index`	`DataFrame`	Index to save	required
`index_file`	`Path`	File in which to save the index	required

Source code in src/pandas_openscm/db/interfaces.py

def save_index(
    self,
    index: pd.DataFrame,
    index_file: Path,
) -> None:
    """
    Save the index to disk

    This is a low-level method
    that just handles the specifics of serialising the index to disk.
    Working out what to save and which path to save it to
    should happen in higher-level functions.

    Parameters
    ----------
    index
        Index to save, given as a [pd.DataFrame][pandas.DataFrame]

    index_file
        File in which to save the index
    """

netCDFDataBackend #

netCDF data backend

Methods:

Name	Description
`load_data`	Load a data file
`save_data`	Save data to disk

Attributes:

Name	Type	Description
`ext`	`str`	Extension to use with files saved by this backend.
`preserves_index`	`Literal[True]`	Whether this backend preserves the index of data upon (de-)serialisation
`timeseries_dim`	`str`	Name of the timeseries dimension in serialised output

Source code in src/pandas_openscm/db/netcdf.py

@define
class netCDFDataBackend:
    """
    Data backend which (de-)serialises data as netCDF files

    `xarray` is imported inside each method,
    so it is only required when data is actually loaded or saved.
    """

    ext: str = ".nc"
    """
    File extension used for files written by this backend.
    """

    timeseries_dim: str = "ts_id"
    """
    Name given to the timeseries dimension in the serialised output
    """

    @property
    def preserves_index(self) -> Literal[True]:
        """
        Whether the index of the data survives (de-)serialisation

        This is always `True` for this backend.
        """
        return True

    def load_data(self, data_file: Path) -> pd.DataFrame:
        """
        Load data from a netCDF file

        The `"values"` variable of the file provides the data,
        while the metadata (recovered with `metadata_xr_to_df`)
        is re-attached as the index of the result.

        Parameters
        ----------
        data_file
            File from which to load the data

        Returns
        -------
        :
            Loaded data, indexed by its metadata columns

        Raises
        ------
        MissingOptionalDependencyError
            `xarray` cannot be imported
        """
        try:
            import xarray as xr  # noqa: PLC0415
        except ImportError as exc:
            raise MissingOptionalDependencyError(
                "netCDFBackend.load_data", requirement="xarray"
            ) from exc

        dataset = xr.load_dataset(data_file)

        values: pd.DataFrame = dataset["values"].to_pandas()  # type: ignore
        metadata = metadata_xr_to_df(dataset)

        # Pick out the metadata rows using the IDs stored
        # along the timeseries dimension of the file
        timeseries_ids = dataset[self.timeseries_dim].values
        metadata_rows = metadata.loc[timeseries_ids]

        combined = pd.concat([metadata_rows, values], axis="columns")

        # Every metadata column becomes a level of the returned index
        return combined.set_index(metadata.columns.to_list())

    def save_data(self, data: pd.DataFrame, data_file: Path) -> None:
        """
        Save data to a netCDF file

        The index of `data` is written as metadata (via `metadata_df_to_xr`)
        and the values are written as a variable called `"values"`.
        Both share the timeseries dimension `self.timeseries_dim`.

        Parameters
        ----------
        data
            Data to save

        data_file
            File in which to save the data

        Raises
        ------
        MissingOptionalDependencyError
            `xarray` cannot be imported
        """
        try:
            import xarray as xr  # noqa: PLC0415
        except ImportError as exc:
            raise MissingOptionalDependencyError(
                "netCDFBackend.save_data", requirement="xarray"
            ) from exc

        # After resetting, the metadata lives in columns
        # and the new default index doubles as a unique ID per timeseries
        flattened = data.reset_index()
        timeseries_coords = {self.timeseries_dim: flattened.index.values}

        # Unnamed columns fall back to a dimension called "time"
        columns_name = data.columns.name
        time_dim = "time" if columns_name is None else str(columns_name)
        time_coords = {time_dim: data.columns.values}

        metadata_ds = metadata_df_to_xr(
            flattened[data.index.names],
            timeseries_id_coord=xr.Coordinates(timeseries_coords),
            timeseries_dim=self.timeseries_dim,
        )
        values_da = xr.DataArray(
            data,
            dims=[self.timeseries_dim, time_dim],
            coords=xr.Coordinates(timeseries_coords | time_coords),
        )
        values_ds = values_da.to_dataset(name="values")

        merged = xr.merge([metadata_ds, values_ds])
        merged.to_netcdf(data_file)

ext `class-attribute` `instance-attribute` #

ext: str = '.nc'

Extension to use with files saved by this backend.

preserves_index `property` #

preserves_index: Literal[True]

Whether this backend preserves the index of data upon (de-)serialisation

timeseries_dim `class-attribute` `instance-attribute` #

timeseries_dim: str = 'ts_id'

Name of the timeseries dimension in serialised output

load_data #

load_data(data_file: Path) -> DataFrame

Load a data file

Parameters:

Name	Type	Description	Default
`data_file`	`Path`	File from which to load the data	required

Returns:

Type	Description
`DataFrame`	Loaded data

Source code in src/pandas_openscm/db/netcdf.py

def load_data(self, data_file: Path) -> pd.DataFrame:
    """
    Load data from a netCDF file

    The `"values"` variable of the file provides the data,
    while the metadata (recovered with `metadata_xr_to_df`)
    is re-attached as the index of the result.

    Parameters
    ----------
    data_file
        File from which to load the data

    Returns
    -------
    :
        Loaded data, indexed by its metadata columns

    Raises
    ------
    MissingOptionalDependencyError
        `xarray` cannot be imported
    """
    try:
        import xarray as xr  # noqa: PLC0415
    except ImportError as exc:
        raise MissingOptionalDependencyError(
            "netCDFBackend.load_data", requirement="xarray"
        ) from exc

    dataset = xr.load_dataset(data_file)

    values: pd.DataFrame = dataset["values"].to_pandas()  # type: ignore
    metadata = metadata_xr_to_df(dataset)

    # Pick out the metadata rows using the IDs stored
    # along the timeseries dimension of the file
    timeseries_ids = dataset[self.timeseries_dim].values
    metadata_rows = metadata.loc[timeseries_ids]

    combined = pd.concat([metadata_rows, values], axis="columns")

    # Every metadata column becomes a level of the returned index
    return combined.set_index(metadata.columns.to_list())

save_data #

save_data(data: DataFrame, data_file: Path) -> None

Save data to disk

Parameters:

Name	Type	Description	Default
`data`	`DataFrame`	Data to save	required
`data_file`	`Path`	File in which to save the data	required

Source code in src/pandas_openscm/db/netcdf.py

def save_data(self, data: pd.DataFrame, data_file: Path) -> None:
    """
    Save data to a netCDF file

    The index of `data` is written as metadata (via `metadata_df_to_xr`)
    and the values are written as a variable called `"values"`.
    Both share the timeseries dimension `self.timeseries_dim`.

    Parameters
    ----------
    data
        Data to save

    data_file
        File in which to save the data

    Raises
    ------
    MissingOptionalDependencyError
        `xarray` cannot be imported
    """
    try:
        import xarray as xr  # noqa: PLC0415
    except ImportError as exc:
        raise MissingOptionalDependencyError(
            "netCDFBackend.save_data", requirement="xarray"
        ) from exc

    # After resetting, the metadata lives in columns
    # and the new default index doubles as a unique ID per timeseries
    flattened = data.reset_index()
    timeseries_coords = {self.timeseries_dim: flattened.index.values}

    # Unnamed columns fall back to a dimension called "time"
    columns_name = data.columns.name
    time_dim = "time" if columns_name is None else str(columns_name)
    time_coords = {time_dim: data.columns.values}

    metadata_ds = metadata_df_to_xr(
        flattened[data.index.names],
        timeseries_id_coord=xr.Coordinates(timeseries_coords),
        timeseries_dim=self.timeseries_dim,
    )
    values_da = xr.DataArray(
        data,
        dims=[self.timeseries_dim, time_dim],
        coords=xr.Coordinates(timeseries_coords | time_coords),
    )
    values_ds = values_da.to_dataset(name="values")

    merged = xr.merge([metadata_ds, values_ds])
    merged.to_netcdf(data_file)

netCDFIndexBackend #

netCDF index backend

Methods:

Name	Description
`load_file_map`	Load the database's file map
`load_index`	Load the index
`save_file_map`	Save the file map to disk
`save_index`	Save the index to disk

Attributes:

Name	Type	Description
`ext`	`str`	Extension to use with files saved by this backend.
`preserves_index`	`Literal[True]`	Whether this backend preserves the `pd.MultiIndex` upon (de-)serialisation
`timeseries_dim`	`str`	Name of the timeseries dimension in serialised output

Source code in src/pandas_openscm/db/netcdf.py

@define
class netCDFIndexBackend:
    """
    Index backend which (de-)serialises the index and file map as netCDF files

    `xarray` is imported inside the methods that use it directly,
    so it is only required when those methods are actually called.
    """

    ext: str = ".nc"
    """
    File extension used for files written by this backend.
    """

    timeseries_dim: str = "ts_id"
    """
    Name given to the timeseries dimension in the serialised output
    """

    @property
    def preserves_index(self) -> Literal[True]:
        """
        Whether the `pd.MultiIndex` survives (de-)serialisation

        This is always `True` for this backend.
        """
        return True

    @staticmethod
    def load_file_map(file_map_file: Path) -> pd.DataFrame:
        """
        Load the file map of the database

        Parameters
        ----------
        file_map_file
            File from which to load the file map

        Returns
        -------
        :
            Loaded file map

        Raises
        ------
        MissingOptionalDependencyError
            `xarray` cannot be imported

        TypeError
            Converting the loaded dataset to pandas
            gave a `pd.Series` rather than a `pd.DataFrame`
        """
        try:
            import xarray as xr  # noqa: PLC0415
        except ImportError as exc:
            raise MissingOptionalDependencyError(
                "netCDFBackend.load_file_map", requirement="xarray"
            ) from exc

        dataset = xr.load_dataset(file_map_file)
        file_map = dataset.to_pandas()
        if isinstance(file_map, pd.Series):  # pragma: no cover
            raise TypeError(file_map)

        return file_map

    @staticmethod
    def load_index(index_file: Path) -> pd.DataFrame:
        """
        Load the index

        Parameters
        ----------
        index_file
            File from which to load the index

        Returns
        -------
        :
            Loaded index, with every column except `"file_id"` set as the index

        Raises
        ------
        MissingOptionalDependencyError
            `xarray` cannot be imported

        Notes
        -----
        The index levels are selected with `pd.Index.difference`,
        which attempts to sort its result by default.
        The order of the levels therefore follows that sorting
        rather than the order of the stored columns.
        """
        try:
            import xarray as xr  # noqa: PLC0415
        except ImportError as exc:
            raise MissingOptionalDependencyError(
                "netCDFBackend.load_index", requirement="xarray"
            ) from exc

        dataset = xr.load_dataset(index_file)
        flat = metadata_xr_to_df(dataset)

        # "file_id" is the only column which stays a regular column
        level_columns = flat.columns.difference(["file_id"]).to_list()

        return flat.set_index(level_columns)

    @staticmethod
    def save_file_map(
        file_map: pd.Series[Path],  # type: ignore # pandas confused about what it supports
        file_map_file: Path,
    ) -> None:
        """
        Save the file map to disk

        The values of the file map are converted to strings before writing.

        Parameters
        ----------
        file_map
            File map to save

        file_map_file
            File in which to save the file map

        Raises
        ------
        MissingOptionalDependencyError
            `xarray` cannot be imported
        """
        try:
            import xarray as xr  # noqa: PLC0415
        except ImportError as exc:
            raise MissingOptionalDependencyError(
                "netCDFBackend.save_file_map", requirement="xarray"
            ) from exc

        file_map_as_str = file_map.astype(str)
        xr.DataArray.from_series(file_map_as_str).to_netcdf(file_map_file)

    def save_index(
        self,
        index: pd.DataFrame,
        index_file: Path,
    ) -> None:
        """
        Save the index to disk

        Parameters
        ----------
        index
            Index to save

        index_file
            File in which to save the index
        """
        # The index has to be reset before it can be serialised to disk
        flat_index = index.reset_index()

        # The dimension gets its own name here
        # because the timeseries IDs in the index
        # won't necessarily line up with those in the data file(s).
        # Users never see the two side-by-side, so this should not matter,
        # but we keep them distinct just in case.
        index_dim = f"{self.timeseries_dim}_index"

        serialisable = metadata_df_to_xr(flat_index, timeseries_dim=index_dim)
        serialisable.to_netcdf(index_file)

ext `class-attribute` `instance-attribute` #

ext: str = '.nc'

Extension to use with files saved by this backend.

preserves_index `property` #

preserves_index: Literal[True]

Whether this backend preserves the pd.MultiIndex upon (de-)serialisation

timeseries_dim `class-attribute` `instance-attribute` #

timeseries_dim: str = 'ts_id'

Name of the timeseries dimension in serialised output

load_file_map `staticmethod` #

load_file_map(file_map_file: Path) -> DataFrame

Load the database's file map

Parameters:

Name	Type	Description	Default
`file_map_file`	`Path`	File from which to load the file map	required

Returns:

Type	Description
`DataFrame`	Loaded file map

Source code in src/pandas_openscm/db/netcdf.py

@staticmethod
def load_file_map(file_map_file: Path) -> pd.DataFrame:
    """
    Load the file map of the database

    Parameters
    ----------
    file_map_file
        File from which to load the file map

    Returns
    -------
    :
        Loaded file map

    Raises
    ------
    MissingOptionalDependencyError
        `xarray` cannot be imported

    TypeError
        Converting the loaded dataset to pandas
        gave a `pd.Series` rather than a `pd.DataFrame`
    """
    try:
        import xarray as xr  # noqa: PLC0415
    except ImportError as exc:
        raise MissingOptionalDependencyError(
            "netCDFBackend.load_file_map", requirement="xarray"
        ) from exc

    dataset = xr.load_dataset(file_map_file)
    file_map = dataset.to_pandas()
    if isinstance(file_map, pd.Series):  # pragma: no cover
        raise TypeError(file_map)

    return file_map

load_index `staticmethod` #

load_index(index_file: Path) -> DataFrame

Load the index

Parameters:

Name	Type	Description	Default
`index_file`	`Path`	File from which to load the index	required

Returns:

Type	Description
`DataFrame`	Loaded index

Source code in src/pandas_openscm/db/netcdf.py

@staticmethod
def load_index(index_file: Path) -> pd.DataFrame:
    """
    Load the index

    Parameters
    ----------
    index_file
        File from which to load the index

    Returns
    -------
    :
        Loaded index, with every column except `"file_id"` set as the index

    Raises
    ------
    MissingOptionalDependencyError
        `xarray` cannot be imported

    Notes
    -----
    The index levels are selected with `pd.Index.difference`,
    which attempts to sort its result by default.
    The order of the levels therefore follows that sorting
    rather than the order of the stored columns.
    """
    try:
        import xarray as xr  # noqa: PLC0415
    except ImportError as exc:
        raise MissingOptionalDependencyError(
            "netCDFBackend.load_index", requirement="xarray"
        ) from exc

    dataset = xr.load_dataset(index_file)
    flat = metadata_xr_to_df(dataset)

    # "file_id" is the only column which stays a regular column
    level_columns = flat.columns.difference(["file_id"]).to_list()

    return flat.set_index(level_columns)

save_file_map `staticmethod` #

save_file_map(
    file_map: Series[Path], file_map_file: Path
) -> None

Save the file map to disk

Parameters:

Name	Type	Description	Default
`file_map`	`Series[Path]`	File map to save	required
`file_map_file`	`Path`	File in which to save the file map	required

Source code in src/pandas_openscm/db/netcdf.py

@staticmethod
def save_file_map(
    file_map: pd.Series[Path],  # type: ignore # pandas confused about what it supports
    file_map_file: Path,
) -> None:
    """
    Save the file map to disk

    The values of the file map are converted to strings before writing.

    Parameters
    ----------
    file_map
        File map to save

    file_map_file
        File in which to save the file map

    Raises
    ------
    MissingOptionalDependencyError
        `xarray` cannot be imported
    """
    try:
        import xarray as xr  # noqa: PLC0415
    except ImportError as exc:
        raise MissingOptionalDependencyError(
            "netCDFBackend.save_file_map", requirement="xarray"
        ) from exc

    file_map_as_str = file_map.astype(str)
    xr.DataArray.from_series(file_map_as_str).to_netcdf(file_map_file)

save_index #

save_index(index: DataFrame, index_file: Path) -> None

Save the index to disk

Parameters:

Name	Type	Description	Default
`index`	`DataFrame`	Index to save	required
`index_file`	`Path`	File in which to save the index	required

Source code in src/pandas_openscm/db/netcdf.py

def save_index(
    self,
    index: pd.DataFrame,
    index_file: Path,
) -> None:
    """
    Save the index to disk

    Parameters
    ----------
    index
        Index to save

    index_file
        File in which to save the index
    """
    # The index has to be reset before it can be serialised to disk
    flat_index = index.reset_index()

    # The dimension gets its own name here
    # because the timeseries IDs in the index
    # won't necessarily line up with those in the data file(s).
    # Users never see the two side-by-side, so this should not matter,
    # but we keep them distinct just in case.
    index_dim = f"{self.timeseries_dim}_index"

    serialisable = metadata_df_to_xr(flat_index, timeseries_dim=index_dim)
    serialisable.to_netcdf(index_file)

pandas_openscm.db#

DATA_BACKENDS module-attribute #

INDEX_BACKENDS module-attribute #

AlreadyInDBError #

__init__ #

CSVDataBackend #

ext class-attribute instance-attribute #

preserves_index property #

load_data staticmethod #

save_data staticmethod #

CSVIndexBackend #

ext class-attribute instance-attribute #

preserves_index property #

load_file_map staticmethod #

load_index staticmethod #

save_file_map staticmethod #

save_index staticmethod #

EmptyDBError #

__init__ #

FeatherDataBackend #

ext class-attribute instance-attribute #

preserves_index property #

load_data staticmethod #

save_data staticmethod #

FeatherIndexBackend #

ext class-attribute instance-attribute #

preserves_index property #

load_file_map staticmethod #

load_index staticmethod #

save_file_map staticmethod #

save_index staticmethod #

InMemoryDataBackend #

data class-attribute instance-attribute #

ext class-attribute instance-attribute #

preserves_index property #

load_data #

save_data #

InMemoryIndexBackend #

ext class-attribute instance-attribute #

file_map class-attribute instance-attribute #

index class-attribute instance-attribute #

preserves_index property #

load_file_map #

load_index #

save_file_map #

save_index #

OpenSCMDB #

backend_data class-attribute instance-attribute #

backend_index class-attribute instance-attribute #

db_dir class-attribute instance-attribute #

file_map_file property #

index_file property #

index_file_lock class-attribute instance-attribute #

index_file_lock_path property #

is_empty property #

create_reader #

default_index_file_lock #

delete #

from_gzipped_tar_archive classmethod #

get_new_data_file_path #

load #

load_file_map #

load_index #

load_metadata #

save #

to_gzipped_tar_archive #

OpenSCMDBDataBackend #

ext instance-attribute #

preserves_index instance-attribute #

load_data staticmethod #

save_data staticmethod #

OpenSCMDBIndexBackend #

ext instance-attribute #

preserves_index instance-attribute #

load_file_map staticmethod #

load_index staticmethod #

save_file_map #

save_index #

netCDFDataBackend #

ext class-attribute instance-attribute #

DATA_BACKENDS `module-attribute` #

INDEX_BACKENDS `module-attribute` #

init #

ext `class-attribute` `instance-attribute` #

preserves_index `property` #

load_data `staticmethod` #

save_data `staticmethod` #

ext `class-attribute` `instance-attribute` #

preserves_index `property` #

load_file_map `staticmethod` #

load_index `staticmethod` #

save_file_map `staticmethod` #

save_index `staticmethod` #

init #

ext `class-attribute` `instance-attribute` #

preserves_index `property` #

load_data `staticmethod` #

save_data `staticmethod` #

ext `class-attribute` `instance-attribute` #

preserves_index `property` #

load_file_map `staticmethod` #

load_index `staticmethod` #

save_file_map `staticmethod` #

save_index `staticmethod` #

data `class-attribute` `instance-attribute` #

ext `class-attribute` `instance-attribute` #

preserves_index `property` #

ext `class-attribute` `instance-attribute` #

file_map `class-attribute` `instance-attribute` #

index `class-attribute` `instance-attribute` #

preserves_index `property` #

backend_data `class-attribute` `instance-attribute` #

backend_index `class-attribute` `instance-attribute` #

db_dir `class-attribute` `instance-attribute` #

file_map_file `property` #

index_file `property` #

index_file_lock `class-attribute` `instance-attribute` #

index_file_lock_path `property` #

is_empty `property` #

from_gzipped_tar_archive `classmethod` #

ext `instance-attribute` #

preserves_index `instance-attribute` #

load_data `staticmethod` #

save_data `staticmethod` #

ext `instance-attribute` #

preserves_index `instance-attribute` #

load_file_map `staticmethod` #

load_index `staticmethod` #

ext `class-attribute` `instance-attribute` #

preserves_index `property` #

timeseries_dim `class-attribute` `instance-attribute` #

ext `class-attribute` `instance-attribute` #

preserves_index `property` #

timeseries_dim `class-attribute` `instance-attribute` #

load_file_map `staticmethod` #

load_index `staticmethod` #

save_file_map `staticmethod` #