Skip to content

pandas_openscm.indexing#

Helpers for working with pandas

Long-term, these should really live in either pandas_indexing or pandas, but they are fine here for now.

Functions:

Name Description
index_name_aware_lookup

Perform a look up with an index, being aware of the index's name.

index_name_aware_match

Perform a match with an index, being aware of the index's name.

mi_loc

Select data, being slightly smarter than the default pandas.DataFrame.loc.

multi_index_lookup

Perform a multi-index look up

multi_index_match

Perform a multi-index match

index_name_aware_lookup #

index_name_aware_lookup(
    pandas_obj: P, locator: Index[Any]
) -> P

Perform a look up with an index, being aware of the index's name.

For the problem this is solving, see index_name_aware_match.

Parameters:

Name Type Description Default
pandas_obj P

Pandas object in which to find matches

required
locator Index[Any]

Locator to use for finding matches

required

Returns:

Type Description
P

Rows of pandas_obj that are in locator, given locator.name.

Examples:

>>> import numpy as np
>>> import pandas as pd
>>>
>>> base = pd.DataFrame(
...     data=np.arange(8).reshape((4, 2)),
...     columns=[2000, 2020],
...     index=pd.MultiIndex.from_tuples(
...         (
...             ("ma", "sa", 1),
...             ("ma", "sb", 2),
...             ("mb", "sa", 4),
...             ("mb", "sb", 3),
...         ),
...         names=["model", "scenario", "id"],
...     ),
... )
>>>
>>> # A locator that lines up with the third level only
>>> loc = pd.Index([1, 3], name="id")
>>> index_name_aware_lookup(base, loc)
                   2000  2020
model scenario id
ma    sa       1      0     1
mb    sb       3      6     7
Source code in src/pandas_openscm/indexing.py
def index_name_aware_lookup(pandas_obj: P, locator: pd.Index[Any]) -> P:
    """
    Select rows using a named index as the locator.

    The name of `locator` decides which level of `pandas_obj`'s index
    the values of `locator` are compared against.
    For the problem this is solving, see [index_name_aware_match][(m).].

    Parameters
    ----------
    pandas_obj
        Pandas object in which to find matches

        Its index must be a [pandas.MultiIndex][].

    locator
        Locator to use for finding matches

    Returns
    -------
    :
        Rows of `pandas_obj` that are in `locator`, given `locator.name`.

    Raises
    ------
    TypeError
        The index of `pandas_obj` is not a [pandas.MultiIndex][]

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>>
    >>> base = pd.DataFrame(
    ...     data=np.arange(8).reshape((4, 2)),
    ...     columns=[2000, 2020],
    ...     index=pd.MultiIndex.from_tuples(
    ...         (
    ...             ("ma", "sa", 1),
    ...             ("ma", "sb", 2),
    ...             ("mb", "sa", 4),
    ...             ("mb", "sb", 3),
    ...         ),
    ...         names=["model", "scenario", "id"],
    ...     ),
    ... )
    >>>
    >>> # A locator that lines up with the third level only
    >>> loc = pd.Index([1, 3], name="id")
    >>> index_name_aware_lookup(base, loc)
                       2000  2020
    model scenario id
    ma    sa       1      0     1
    mb    sb       3      6     7
    """
    obj_index = pandas_obj.index
    if isinstance(obj_index, pd.MultiIndex):
        # Boolean mask over the rows, built from the level named by `locator`
        row_mask = index_name_aware_match(obj_index, locator)
        return pandas_obj.loc[row_mask]

    # Anything other than a MultiIndex is outside this helper's remit
    raise TypeError(
        "This function is only intended to be used "
        "when `pandas_obj`'s index is an instance of `MultiIndex`. "
        f"Received {type(pandas_obj.index)=}"
    )

index_name_aware_match #

index_name_aware_match(
    idx: MultiIndex, locator: Index[Any]
) -> NDArray[bool]

Perform a match with an index, being aware of the index's name.

This works, even if the index being looked up is not the first index.

Parameters:

Name Type Description Default
idx MultiIndex

Index in which to find matches

required
locator Index[Any]

Locator to use for finding matches

required

Returns:

Type Description
NDArray[bool]

Location of the rows in idx which are in locator, given locator.name.

Examples:

>>> import pandas as pd
>>>
>>> base = pd.MultiIndex.from_tuples(
...     (
...         ("ma", "sa", 1),
...         ("ma", "sb", 2),
...         ("mb", "sa", 1),
...         ("mb", "sb", 3),
...     ),
...     names=["model", "scenario", "id"],
... )
>>>
>>> # A locator that lines up with the third level only
>>> loc = pd.Index([1, 3], name="id")
>>> index_name_aware_match(base, loc)
array([ True, False,  True,  True])
Source code in src/pandas_openscm/indexing.py
def index_name_aware_match(
    idx: pd.MultiIndex, locator: pd.Index[Any]
) -> np.typing.NDArray[np.bool]:
    """
    Find the rows of a multi-index that match a named index.

    The level of `idx` to compare against is picked using `locator.name`,
    so this works even if that level is not the first level of `idx`.

    Parameters
    ----------
    idx
        Index in which to find matches

    locator
        Locator to use for finding matches

    Returns
    -------
    :
        Location of the rows in `idx` which are in `locator`, given `locator.name`.

    Examples
    --------
    >>> import pandas as pd
    >>>
    >>> base = pd.MultiIndex.from_tuples(
    ...     (
    ...         ("ma", "sa", 1),
    ...         ("ma", "sb", 2),
    ...         ("mb", "sa", 1),
    ...         ("mb", "sb", 3),
    ...     ),
    ...     names=["model", "scenario", "id"],
    ... )
    >>>
    >>> # A locator that lines up with the third level only
    >>> loc = pd.Index([1, 3], name="id")
    >>> index_name_aware_match(base, loc)
    array([ True, False,  True,  True])
    """
    # The locator's name selects the level, its values are what we look for
    level_to_match = locator.name
    wanted_values = locator.values

    return idx.isin(wanted_values, level=level_to_match)

mi_loc #

mi_loc(
    pandas_obj: P,
    locator: Index[Any] | MultiIndex | Selector,
) -> P

Select data, being slightly smarter than the default pandas.DataFrame.loc.

Parameters:

Name Type Description Default
pandas_obj P

Pandas object on which to do the .loc operation

required
locator Index[Any] | MultiIndex | Selector

Locator to apply

If this is a multi-index, we use multi_index_lookup to ensure correct alignment.

If this is an index that has a name, we use the name to ensure correct alignment.

required

Returns:

Type Description
P

Selected data

Notes

If you have pandas_indexing installed, you can get the same (perhaps even better) functionality using something like the following instead:

...
pandas_obj.loc[pandas_indexing.isin(locator)]
...
Source code in src/pandas_openscm/indexing.py
def mi_loc(
    pandas_obj: P,
    locator: pd.Index[Any] | pd.MultiIndex | pix.selectors.Selector,
) -> P:
    """
    Select data, being slightly smarter than the default [pandas.DataFrame.loc][].

    Parameters
    ----------
    pandas_obj
        Pandas object on which to do the `.loc` operation

    locator
        Locator to apply

        If this is a multi-index, we use
        [multi_index_lookup][(m).] to ensure correct alignment.

        If this is an index that has a name,
        we use the name to ensure correct alignment.

        Anything else is handed to `.loc` unchanged.

    Returns
    -------
    :
        Selected data

    Notes
    -----
    If you have [pandas_indexing][] installed,
    you can get the same (perhaps even better) functionality
    using something like the following instead

    ```python
    ...
    pandas_obj.loc[pandas_indexing.isin(locator)]
    ...
    ```
    """
    # Multi-index locators must be aligned level-by-level
    if isinstance(locator, pd.MultiIndex):
        return multi_index_lookup(pandas_obj, locator)

    # A named plain index tells us which level it refers to
    if isinstance(locator, pd.Index) and locator.name is not None:
        return index_name_aware_lookup(pandas_obj, locator)

    # Everything else (unnamed indexes, selectors) goes straight to `.loc`
    return cast(P, pandas_obj.loc[cast(Any, locator)])

multi_index_lookup #

multi_index_lookup(
    pandas_obj: P, locator: MultiIndex
) -> P

Perform a multi-index look up

For the problem this is solving, see multi_index_match.

Parameters:

Name Type Description Default
pandas_obj P

Pandas object in which to find matches

required
locator MultiIndex

Locator to use for finding matches

required

Returns:

Type Description
P

Rows of pandas_obj that are in locator.

Examples:

>>> import numpy as np
>>> import pandas as pd
>>>
>>> base = pd.DataFrame(
...     data=np.arange(8).reshape((4, 2)),
...     columns=[2000, 2020],
...     index=pd.MultiIndex.from_tuples(
...         (
...             ("ma", "sa", 1),
...             ("ma", "sb", 2),
...             ("mb", "sa", 4),
...             ("mb", "sb", 3),
...         ),
...         names=["model", "scenario", "id"],
...     ),
... )
>>>
>>> # A locator that lines up with the second and third level only
>>> loc_first_level = pd.MultiIndex.from_tuples(
...     (
...         ("sa", 1),
...         ("sb", 3),
...     ),
...     names=["scenario", "id"],
... )
>>> multi_index_lookup(base, loc_first_level)
                   2000  2020
model scenario id
ma    sa       1      0     1
mb    sb       3      6     7
Source code in src/pandas_openscm/indexing.py
def multi_index_lookup(pandas_obj: P, locator: pd.MultiIndex) -> P:
    """
    Select rows using a multi-index as the locator

    The levels of `locator` are matched to the levels of `pandas_obj`'s index
    by name.
    For the problem this is solving, see [multi_index_match][(m).].

    Parameters
    ----------
    pandas_obj
        Pandas object in which to find matches

        Its index must be a [pandas.MultiIndex][].

    locator
        Locator to use for finding matches

    Returns
    -------
    :
        Rows of `pandas_obj` that are in `locator`.

    Raises
    ------
    TypeError
        The index of `pandas_obj` is not a [pandas.MultiIndex][]

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>>
    >>> base = pd.DataFrame(
    ...     data=np.arange(8).reshape((4, 2)),
    ...     columns=[2000, 2020],
    ...     index=pd.MultiIndex.from_tuples(
    ...         (
    ...             ("ma", "sa", 1),
    ...             ("ma", "sb", 2),
    ...             ("mb", "sa", 4),
    ...             ("mb", "sb", 3),
    ...         ),
    ...         names=["model", "scenario", "id"],
    ...     ),
    ... )
    >>>
    >>> # A locator that lines up with the second and third level only
    >>> loc_scenario_id = pd.MultiIndex.from_tuples(
    ...     (
    ...         ("sa", 1),
    ...         ("sb", 3),
    ...     ),
    ...     names=["scenario", "id"],
    ... )
    >>> multi_index_lookup(base, loc_scenario_id)
                       2000  2020
    model scenario id
    ma    sa       1      0     1
    mb    sb       3      6     7
    """
    obj_index = pandas_obj.index
    if isinstance(obj_index, pd.MultiIndex):
        # Boolean mask over the rows, aligned by level name
        row_mask = multi_index_match(obj_index, locator)
        return pandas_obj.loc[row_mask]

    # Anything other than a MultiIndex is outside this helper's remit
    raise TypeError(
        "This function is only intended to be used "
        "when `pandas_obj`'s index is an instance of `MultiIndex`. "
        f"Received {type(pandas_obj.index)=}"
    )

multi_index_match #

multi_index_match(
    idx: MultiIndex, locator: MultiIndex
) -> NDArray[bool]

Perform a multi-index match

This works, even if the levels of the locator are not the same as the levels of the index in which to match.

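Arguably, this should be moved to pandas_indexing or pandas. Relevant issues:

- pandas#55279 (https://github.com/pandas-dev/pandas/issues/55279)
- pandas-indexing#64 (https://github.com/coroa/pandas-indexing/issues/64)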

Parameters:

Name Type Description Default
idx MultiIndex

Index in which to find matches

required
locator MultiIndex

Locator to use for finding matches

required

Returns:

Type Description
NDArray[bool]

Location of the rows in idx which are in locator.

Raises:

Type Description
KeyError

locator has levels which are not in idx

Examples:

>>> import pandas as pd
>>> base = pd.MultiIndex.from_tuples(
...     (
...         ("ma", "sa", 1),
...         ("ma", "sb", 2),
...         ("mb", "sa", 1),
...         ("mb", "sb", 3),
...     ),
...     names=["model", "scenario", "id"],
... )
>>>
>>> # A locator that lines up with the multi-index levels exactly
>>> loc_simple = pd.MultiIndex.from_tuples(
...     (
...         ("ma", "sa", 1),
...         ("mb", "sa", 1),
...     ),
...     names=["model", "scenario", "id"],
... )
>>> multi_index_match(base, loc_simple)
array([ True, False,  True, False])
>>>
>>> # A locator that lines up with the first level only
>>> loc_first_level = pd.MultiIndex.from_tuples(
...     (("ma",),),
...     names=["model"],
... )
>>> multi_index_match(base, loc_first_level)
array([ True,  True, False, False])
>>>
>>> # A locator that lines up with the second level only
>>> loc_first_level = pd.MultiIndex.from_tuples(
...     (("sa",),),
...     names=["scenario"],
... )
>>> multi_index_match(base, loc_first_level)
array([ True, False,  True, False])
>>>
>>> # A locator that lines up with the second and third level only
>>> loc_first_level = pd.MultiIndex.from_tuples(
...     (("sb", 3),),
...     names=["scenario", "id"],
... )
>>> multi_index_match(base, loc_first_level)
array([False, False, False,  True])
Source code in src/pandas_openscm/indexing.py
def multi_index_match(
    idx: pd.MultiIndex, locator: pd.MultiIndex
) -> np.typing.NDArray[np.bool]:
    """
    Find the rows of a multi-index that match another multi-index

    The levels of `locator` are lined up with the levels of `idx` by name,
    so `locator` may hold only some of the levels of `idx`
    and they may be in a different order.

    Arguably, this should be moved to
    [pandas_indexing](https://github.com/coroa/pandas-indexing)
    or [pandas](https://github.com/pandas-dev/pandas).
    Relevant issues:

    - [pandas#55279](https://github.com/pandas-dev/pandas/issues/55279)
    - [pandas-indexing#64](https://github.com/coroa/pandas-indexing/issues/64)

    Parameters
    ----------
    idx
        Index in which to find matches

    locator
        Locator to use for finding matches

    Returns
    -------
    :
        Location of the rows in `idx` which are in `locator`.

    Raises
    ------
    KeyError
        `locator` has levels which are not in `idx`

    Examples
    --------
    >>> import pandas as pd
    >>> base = pd.MultiIndex.from_tuples(
    ...     (
    ...         ("ma", "sa", 1),
    ...         ("ma", "sb", 2),
    ...         ("mb", "sa", 1),
    ...         ("mb", "sb", 3),
    ...     ),
    ...     names=["model", "scenario", "id"],
    ... )
    >>>
    >>> # A locator that lines up with the multi-index levels exactly
    >>> loc_simple = pd.MultiIndex.from_tuples(
    ...     (
    ...         ("ma", "sa", 1),
    ...         ("mb", "sa", 1),
    ...     ),
    ...     names=["model", "scenario", "id"],
    ... )
    >>> multi_index_match(base, loc_simple)
    array([ True, False,  True, False])
    >>>
    >>> # A locator that lines up with the first level only
    >>> loc_first_level = pd.MultiIndex.from_tuples(
    ...     (("ma",),),
    ...     names=["model"],
    ... )
    >>> multi_index_match(base, loc_first_level)
    array([ True,  True, False, False])
    >>>
    >>> # A locator that lines up with the second level only
    >>> loc_second_level = pd.MultiIndex.from_tuples(
    ...     (("sa",),),
    ...     names=["scenario"],
    ... )
    >>> multi_index_match(base, loc_second_level)
    array([ True, False,  True, False])
    >>>
    >>> # A locator that lines up with the second and third level only
    >>> loc_second_third_levels = pd.MultiIndex.from_tuples(
    ...     (("sb", 3),),
    ...     names=["scenario", "id"],
    ... )
    >>> multi_index_match(base, loc_second_third_levels)
    array([False, False, False,  True])
    """
    wanted_levels = list(locator.names)
    available_levels = list(idx.names)

    # Put the locator's levels first (in the locator's order),
    # followed by whatever levels of `idx` the locator does not mention
    leftover_levels = [name for name in available_levels if name not in wanted_levels]
    target_order = [*wanted_levels, *leftover_levels]

    try:
        aligned: pd.MultiIndex = idx.reorder_levels(target_order)
    except KeyError as exc:
        missing = [name for name in wanted_levels if name not in available_levels]
        if not missing:
            # A KeyError we cannot explain, let it through untouched
            raise  # pragma: no cover

        msg = (
            f"The following levels in `locator` are not in `idx`: {missing}. "
            f"{locator.names=} {idx.names=}"
        )
        raise KeyError(msg) from exc

    return aligned.isin(locator)