pandas_openscm.comparison#

Tools that support comparisons between pd.DataFrame's

Functions:

Name	Description
`compare_close`	Compare two pd.DataFrame's

compare_close #

compare_close(
    left: DataFrame,
    right: DataFrame,
    left_name: str,
    right_name: str,
    isclose: Callable[
        [
            NP_ARRAY_OF_FLOAT_OR_INT,
            NP_ARRAY_OF_FLOAT_OR_INT,
        ],
        NP_ARRAY_OF_BOOL,
    ] = np.isclose,
    future_stack: bool = True,
) -> DataFrame

Compare two pd.DataFrame's

This is like pd.DataFrame.compare except you can specify the function to determine whether values are close or not.

Parameters:

Name	Type	Description	Default
`left`	`DataFrame`	First pd.DataFrame to compare	required
`right`	`DataFrame`	Other pd.DataFrame to compare	required
`left_name`	`str`	Name of `left` to use in the result	required
`right_name`	`str`	Name of `right` to use in the result	required
`isclose`	`Callable[[NP_ARRAY_OF_FLOAT_OR_INT, NP_ARRAY_OF_FLOAT_OR_INT], NP_ARRAY_OF_BOOL]`	Function to use to determine whether values are close (Hint: use functools.partial to specify a custom tolerance with np.isclose.)	`np.isclose`
`future_stack`	`bool`	Passed to the `stack` calls of `left` and `right`	`True`

Returns:

Type	Description
`DataFrame`	The comparison between `left` and `right` at the provided tolerance. Only indexes where `left` and `right` differ are returned, i.e. if the result is empty, `left` and `right` are equal for all indexes.

Examples:

>>> import pandas as pd
>>> left = pd.DataFrame(
...     [[1.0, 2.0, 3.0], [1.1, 1.2, 1.3], [-1.1, 0.0, 0.5]],
...     columns=pd.Index([2.0, 4.0, 10.0], name="time"),
...     index=pd.MultiIndex.from_tuples(
...         [("v1", "kg"), ("v2", "m"), ("v3", "yr")], names=["variable", "unit"]
...     ),
... )
>>> left
time           2.0   4.0   10.0
variable unit
v1       kg     1.0   2.0   3.0
v2       m      1.1   1.2   1.3
v3       yr    -1.1   0.0   0.5
>>>
>>> right = pd.DataFrame(
...     [[1.1, 2.1, 3.1], [1.11, 1.2, 1.31], [-1.12, 0.0000001, 0.5]],
...     columns=pd.Index([2.0, 4.0, 10.0], name="time"),
...     index=pd.MultiIndex.from_tuples(
...         [("v1", "kg"), ("v2", "m"), ("v3", "yr")], names=["variable", "unit"]
...     ),
... )
>>> right
time           2.0           4.0   10.0
variable unit
v1       kg    1.10  2.100000e+00  3.10
v2       m     1.11  1.200000e+00  1.31
v3       yr   -1.12  1.000000e-07  0.50

>>>
>>> # Default tolerances are quite tight
>>> compare_close(left, right, "left", "right")
                    left         right
variable unit time
v1       kg   2.0    1.0  1.100000e+00
              4.0    2.0  2.100000e+00
              10.0   3.0  3.100000e+00
v2       m    2.0    1.1  1.110000e+00
              10.0   1.3  1.310000e+00
v3       yr   2.0   -1.1 -1.120000e+00
              4.0    0.0  1.000000e-07
>>>
>>> from functools import partial
>>> import numpy as np
>>>
>>> # We can use `functools.partial` to loosen the tolerances
>>> compare_close(
...     left, right, "left", "right", isclose=partial(np.isclose, atol=0.01)
... )
                    left  right
variable unit time
v1       kg   2.0    1.0   1.10
              4.0    2.0   2.10
              10.0   3.0   3.10
v3       yr   2.0   -1.1  -1.12
>>>
>>> compare_close(
...     left,
...     right,
...     # Note you can also change the displayed names
...     left_name="Bill",
...     right_name="Ben",
...     isclose=partial(np.isclose, rtol=0.1),
... )
                         Bill           Ben
variable unit time
v3       yr   4.0         0.0  1.000000e-07
>>>
>>> # If we make the tolerance sufficiently loose,
>>> # all points are considered equal
>>> # and the result is empty.
>>> loose_comparison = compare_close(
...     left,
...     right,
...     "left",
...     "right",
...     isclose=partial(np.isclose, rtol=0.1, atol=0.001),
... )
>>> loose_comparison.empty
True

Source code in src/pandas_openscm/comparison.py

def compare_close(  # noqa: PLR0913
    left: pd.DataFrame,
    right: pd.DataFrame,
    left_name: str,
    right_name: str,
    isclose: Callable[
        [NP_ARRAY_OF_FLOAT_OR_INT, NP_ARRAY_OF_FLOAT_OR_INT], NP_ARRAY_OF_BOOL
    ] = np.isclose,
    future_stack: bool = True,
) -> pd.DataFrame:
    """
    Compare two [pd.DataFrame][pandas.DataFrame]'s

    This is like [pd.DataFrame.compare][pandas.DataFrame.compare]
    except you can specify the function to determine
    whether values are close or not.

    Parameters
    ----------
    left
        First [pd.DataFrame][pandas.DataFrame] to compare

    right
        Other [pd.DataFrame][pandas.DataFrame] to compare

    left_name
        Name of `left` to use in the result

    right_name
        Name of `right` to use in the result

    isclose
        Function to use to determine whether values are close

        (Hint: use [functools.partial][] to specify a custom
        tolerance with [np.isclose][numpy.isclose].)

    future_stack
        Passed to the `stack` calls of `left` and `right`

    Returns
    -------
    :
        The comparison between `left` and `right` at the provided tolerance

        Only indexes where `left` and `right` differ are returned,
        i.e. if the result is empty, `left` and `right` are equal for all indexes.

    Examples
    --------
    >>> import pandas as pd
    >>> left = pd.DataFrame(
    ...     [[1.0, 2.0, 3.0], [1.1, 1.2, 1.3], [-1.1, 0.0, 0.5]],
    ...     columns=pd.Index([2.0, 4.0, 10.0], name="time"),
    ...     index=pd.MultiIndex.from_tuples(
    ...         [("v1", "kg"), ("v2", "m"), ("v3", "yr")], names=["variable", "unit"]
    ...     ),
    ... )
    >>> left
    time           2.0   4.0   10.0
    variable unit
    v1       kg     1.0   2.0   3.0
    v2       m      1.1   1.2   1.3
    v3       yr    -1.1   0.0   0.5
    >>>
    >>> right = pd.DataFrame(
    ...     [[1.1, 2.1, 3.1], [1.11, 1.2, 1.31], [-1.12, 0.0000001, 0.5]],
    ...     columns=pd.Index([2.0, 4.0, 10.0], name="time"),
    ...     index=pd.MultiIndex.from_tuples(
    ...         [("v1", "kg"), ("v2", "m"), ("v3", "yr")], names=["variable", "unit"]
    ...     ),
    ... )
    >>> right
    time           2.0           4.0   10.0
    variable unit
    v1       kg    1.10  2.100000e+00  3.10
    v2       m     1.11  1.200000e+00  1.31
    v3       yr   -1.12  1.000000e-07  0.50

    >>>
    >>> # Default tolerances are quite tight
    >>> compare_close(left, right, "left", "right")
                        left         right
    variable unit time
    v1       kg   2.0    1.0  1.100000e+00
                  4.0    2.0  2.100000e+00
                  10.0   3.0  3.100000e+00
    v2       m    2.0    1.1  1.110000e+00
                  10.0   1.3  1.310000e+00
    v3       yr   2.0   -1.1 -1.120000e+00
                  4.0    0.0  1.000000e-07
    >>>
    >>> from functools import partial
    >>> import numpy as np
    >>>
    >>> # We can use `functools.partial` to loosen the tolerances
    >>> compare_close(
    ...     left, right, "left", "right", isclose=partial(np.isclose, atol=0.01)
    ... )
                        left  right
    variable unit time
    v1       kg   2.0    1.0   1.10
                  4.0    2.0   2.10
                  10.0   3.0   3.10
    v3       yr   2.0   -1.1  -1.12
    >>>
    >>> compare_close(
    ...     left,
    ...     right,
    ...     # Note you can also change the displayed names
    ...     left_name="Bill",
    ...     right_name="Ben",
    ...     isclose=partial(np.isclose, rtol=0.1),
    ... )
                             Bill           Ben
    variable unit time
    v3       yr   4.0         0.0  1.000000e-07
    >>>
    >>> # If we make the tolerance sufficiently loose,
    >>> # all points are considered equal
    >>> # and the result is empty.
    >>> loose_comparison = compare_close(
    ...     left,
    ...     right,
    ...     "left",
    ...     "right",
    ...     isclose=partial(np.isclose, rtol=0.1, atol=0.001),
    ... )
    >>> loose_comparison.empty
    True
    """
    left_stacked = left.stack(future_stack=future_stack)  # type: ignore # pandas-stubs confused
    if not isinstance(left_stacked, pd.Series):
        msg = (
            f"left ({left_name=}) "
            "is not a `pd.Series` after stacking, this will not work"
        )
        raise TypeError(msg)

    left_stacked.name = left_name

    right_stacked = right.stack(future_stack=future_stack)  # type: ignore # pandas-stubs confused
    if not isinstance(right_stacked, pd.Series):
        msg = (
            f"right ({right_name=}) "
            "is not a `pd.Series` after stacking, this will not work"
        )
        raise TypeError(msg)

    right_stacked.name = right_name

    left_stacked_aligned, right_stacked_aligned = left_stacked.align(right_stacked)
    differences_locator = ~isclose(
        left_stacked_aligned.values,  # type: ignore
        right_stacked_aligned.values,  # type: ignore
    )

    res = pd.concat(
        [
            left_stacked_aligned[differences_locator],
            right_stacked_aligned[differences_locator],
        ],
        axis="columns",
    )

    return res