
"""Inference wrapper classes for segregation measures."""

__author__ = "Renan X. Cortes <renanc@ucr.edu> Sergio J. Rey <sergio.rey@ucr.edu> and Elijah Knaap <elijah.knaap@ucr.edu>"

import multiprocessing
import warnings

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from scipy import stats
from tqdm.auto import tqdm

from .._base import MultiGroupIndex
from .comparative import (DUAL_SIMULATORS, _estimate_counterfac_difference,
                          _estimate_random_label_difference,
                          _generate_counterfactual, _prepare_random_label)
from .randomization import SIMULATORS, simulate_null


def _infer_segregation(
    seg_class,
    iterations_under_null=500,
    null_approach="systematic",
    two_tailed=True,
    index_kwargs=None,
    n_jobs=-1,
    backend="loky",
    null_value=0,
):
    """Compare segregation statistic against a simulated null distribution.

    Parameters
    ----------
    seg_class : segregation.singlegroup or segregation.multigroup object
        fitted segregation index class
    iterations_under_null : int
        number of iterations under the null hypothesis
    null_approach : str
        Which counterfactual approach to use when generating null hypothesis distribution. See Notes.

        * ``systematic``:
        assumes that every group has the same probability with restricted conditional probabilities
        p_0_j = p_1_j = p_j = n_j/n (multinomial distribution).

        * ``bootstrap``:
        generates bootstrap replications of the units with replacement, of the same size as the
        original data. This procedure creates a confidence interval for the index statistic to
        test whether the null value lies within.

        * ``evenness``:
        assumes that each spatial unit has the same global probability of drawing elements from the
        minority group of the fixed total unit population (binomial distribution). 

        * ``person_permutation``:
        randomly allocates individuals into units keeping the total population of each unit
        equal to the original.

        * ``geographic_permutation``:
        randomly allocates the units over space keeping the original values.

        * ``systematic_permutation``:
        assumes absence of systematic segregation and randomly allocates the units over
        space.

        * ``even_permutation``:
        assumes the same global probability of drawing elements from the minority group in
        each spatial unit and randomly allocates the units over space.

    two_tailed : boolean
        If True, the p_value is two-tailed. Otherwise, it is right one-tailed. The one-tailed
        p_value attribute might not be appropriate for some measures, unlike the two-tailed one.
        Therefore, it is better to rely on the est_sim attribute.
    n_jobs: int, optional
        number of cores to use for estimation. If -1 all available cpus will be used
    backend: str, optional
        which backend to use with joblib. Options include "loky", "multiprocessing", or "threading"
    index_kwargs : dict, optional
        additional keyword arguments passed to the index class

    Attributes
    ----------
    p_value : float
        Pseudo One or Two-Tailed p-value estimated from the simulations
    est_sim : numpy array
       Estimates of the segregation measure under the null hypothesis
    statistic : float
        The value of the segregation index being tested

    """
    if null_approach not in SIMULATORS.keys():
        raise ValueError(f"null_approach must one of {list(SIMULATORS.keys())}")

    if not isinstance(two_tailed, bool):
        raise TypeError("two_tailed must be a boolean")

    point_estimation = seg_class.statistic

    # if using the bootstrap test, we're testing the null estimate against the index's distribution
    # in all other cases, we test the index value against a null distribution
    if null_approach == "bootstrap":
        point_estimation = null_value

    aux = str(type(seg_class))
    _class_name = aux[
        1 + aux.rfind(".") : -2
    ]  # 'rfind' finds the last occurrence of a pattern in a string

    Estimates_Stars = simulate_null(
        iterations=iterations_under_null,
        sim_func=SIMULATORS[null_approach],
        seg_class=seg_class,
        index_kwargs=index_kwargs,
        n_jobs=n_jobs,
        backend=backend,
    ).values

    # Check for, and remove, simulated estimates that resulted in NaN or infinite values
    if any((np.isinf(Estimates_Stars) | np.isnan(Estimates_Stars))):
        warnings.warn(
            "Some estimates resulted in NaN or infinite values for estimations under null hypothesis. "
            "These values will be removed for the final results."
        )
        Estimates_Stars = Estimates_Stars[
            ~(np.isinf(Estimates_Stars) | np.isnan(Estimates_Stars))
        ]

    if not two_tailed:
        # use the (possibly reduced) number of simulated estimates rather than
        # iterations_under_null, since NaN/inf draws may have been removed above
        p_value = (Estimates_Stars > point_estimation).sum() / len(Estimates_Stars)
    else:
        aux1 = (point_estimation < Estimates_Stars).sum()
        aux2 = (point_estimation > Estimates_Stars).sum()
        p_value = 2 * np.array([aux1, aux2]).min() / len(Estimates_Stars)

    return p_value, Estimates_Stars, point_estimation, _class_name
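

# A minimal, self-contained sketch of the two-tailed pseudo p-value logic used
# above, run against a synthetic null distribution. The normal draws and the
# observed value 0.35 are illustrative assumptions, not output from any real
# segregation index:
def _example_two_tailed_p(seed=0):
    rng = np.random.default_rng(seed)
    null_estimates = rng.normal(loc=0.30, scale=0.02, size=500)  # stand-in null
    observed = 0.35  # hypothetical observed statistic
    lower = (observed < null_estimates).sum()
    upper = (observed > null_estimates).sum()
    # doubling the smaller tail count mirrors the two-tailed branch above
    return 2 * min(lower, upper) / len(null_estimates)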


class SingleValueTest:
    """Statistical inference for a single segregation measure.

    Parameters
    ----------
    seg_class : segregation.singlegroup or segregation.multigroup object
        fitted segregation index class
    iterations_under_null : int
        number of iterations under the null hypothesis
    null_approach : str
        Which counterfactual approach to use when generating the null hypothesis
        distribution. One of the following:

        * ``bootstrap``:
        Generate bootstrap replications of the units with replacement, of the same size as
        the original data, to create a distribution of the segregation index. Then the
        `null_value` argument is tested against this distribution. The null_value may be 0,
        or may be estimated empirically using the `simulate_null` function.

        * ``systematic``:
        assumes that every group has the same probability with restricted conditional
        probabilities p_0_j = p_1_j = p_j = n_j/n (multinomial distribution).

        * ``evenness``:
        Generate a distribution of segregation indices under the assumption of evenness,
        which assumes that each spatial unit has the same global probability of drawing
        elements from the minority group of the fixed total unit population (binomial
        distribution). Then test the observed segregation index against this distribution.

        * ``person_permutation``:
        Generate a distribution of segregation indices under the assumption of
        individual-level randomization, which randomly allocates individuals into units
        keeping the total population of each unit equal to the original. Then test the
        observed segregation index against this distribution.

        * ``geographic_permutation``:
        Generate a distribution of segregation indices under the assumption of geographic
        unit-level randomization, which randomly allocates the units over space keeping
        the original values. Then test the observed segregation index against this
        distribution.

        * ``systematic_permutation``:
        Generate a distribution of segregation indices under the assumption of systematic
        randomization, then randomly allocate units over space. Then test the observed
        segregation index against this distribution.

        * ``even_permutation``:
        Generate a distribution of segregation indices under the assumption of evenness,
        then randomly allocate the units over space. Then test the observed segregation
        index against this distribution.

    two_tailed : boolean
        If True, the p_value is two-tailed. Otherwise, it is right one-tailed. The
        one-tailed p_value attribute might not be appropriate for some measures, unlike
        the two-tailed one. Therefore, it is better to rely on the est_sim attribute.
    n_jobs : int, optional
        number of cores to use for estimation. If -1 all available cpus will be used
    backend : str, optional
        which backend to use with joblib. Options include "loky", "multiprocessing", or
        "threading"
    index_kwargs : dict, optional
        additional keyword arguments passed to the index class

    Attributes
    ----------
    p_value : float
        Pseudo one- or two-tailed p-value estimated from the simulations
    est_sim : numpy array
        Estimates of the segregation measure under the null hypothesis
    statistic : float
        The value of the segregation index being tested

    Notes
    -----
    1) The different approaches for the null hypothesis directly affect the results of
    the inference depending on the combination of the index type of seg_class and the
    chosen null_approach. Therefore, the user needs to be aware of how these approaches
    affect the data generation process of the simulations in order to draw meaningful
    conclusions. For example, the Modified Dissimilarity (ModifiedDissim) and Modified
    Gini (ModifiedGiniSeg) indexes rely exactly on the distance from evenness through
    sampling; therefore, the "evenness" null approach would not be the most appropriate
    for these indexes.

    Examples
    --------
    Several examples can be found at
    https://github.com/pysal/segregation/blob/master/notebooks/inference_wrappers_example.ipynb.
    """
    def __init__(
        self,
        seg_class,
        iterations_under_null=500,
        null_approach="systematic",
        two_tailed=True,
        n_jobs=-1,
        **kwargs,
    ):
        aux = _infer_segregation(
            seg_class,
            iterations_under_null,
            null_approach,
            two_tailed,
            n_jobs=n_jobs,
            **kwargs,
        )
        self.p_value = aux[0]
        self.est_sim = aux[1]
        self.statistic = aux[2]
        self._class_name = aux[3]
[docs] def plot(self, color="darkblue", kde=True, ax=None, **kwargs): """Plot the distribution of simulated values and the observed index being tested. Parameters ---------- color : str, optional color of histogram, by default 'darkblue' kde : bool, optional Whether to plot the kernel density estimate along with the histogram, by default True ax : matplotlib.axes, optional axes object to plot onto, by default None kwargs : seaborn.histplot argument, optional additional keyword arguments passed to seaborn's histplot function Returns ------- matplotlib.axes pyplot axes object """ try: import matplotlib.pyplot as plt import seaborn as sns except ImportError: warnings.warn("This method relies on importing `matplotlib` and `seaborn`") f = sns.histplot(self.est_sim, color=color, kde=kde, ax=ax, **kwargs) plt.axvline(self.statistic, color="red") plt.title("{} (Value = {})".format(self._class_name, round(self.statistic, 3))) return f


def _compare_segregation(
    seg_class_1,
    seg_class_2,
    iterations,
    null_approach,
    index_kwargs_1,
    index_kwargs_2,
    n_jobs,
    backend,
):
    """Perform comparative inference for two segregation measures.

    Parameters
    ----------
    seg_class_1 : segregation.singlegroup or segregation.multigroup class
        a fitted segregation class to be compared to seg_class_2
    seg_class_2 : segregation.singlegroup or segregation.multigroup class
        a fitted segregation class to be compared to seg_class_1
    iterations : int
        number of iterations to simulate observations in a null distribution
    null_approach : str
        Which type of null hypothesis the inference will iterate. One of the following:

        * ``random_label``:
        Randomly assign each spatial unit to a region, then recalculate segregation
        indices and take the difference.

        * ``bootstrap``:
        Use bootstrap resampling to generate distributions of the segregation index for
        each index in the comparison, then use a two-sample t-test to compare differences
        in the mean of each distribution.

        * ``composition``:
        Generate counterfactual estimates for each region using the sim_composition
        approach. On each iteration, generate a synthetic dataset for each region where
        each unit has a 50% chance of belonging to the original data or the
        counterfactual data. Recalculate segregation indices on the synthetic datasets.

        * ``share``:
        Generate counterfactual estimates for each region using the sim_share approach.
        On each iteration, generate a synthetic dataset for each region where each unit
        has a 50% chance of belonging to the original data or the counterfactual data.
        Recalculate segregation indices on the synthetic datasets.

        * ``dual_composition``:
        Generate counterfactual estimates for each region using the sim_dual_composition
        approach. On each iteration, generate a synthetic dataset for each region where
        each unit has a 50% chance of belonging to the original data or the
        counterfactual data. Recalculate segregation indices on the synthetic datasets.

        * ``person_permutation``:
        Use the simulate_person_permutation approach to randomly reallocate the combined
        population across both regions, then recalculate segregation indices.

    n_jobs : int, optional
        number of cores to use for estimation. If -1 all available cpus will be used
    backend : str, optional
        which backend to use with joblib. Options include "loky", "multiprocessing", or
        "threading"
    index_kwargs_1 : dict, optional
        extra parameters to pass to segregation index 1.
    index_kwargs_2 : dict, optional
        extra parameters to pass to segregation index 2.

    Attributes
    ----------
    p_value : float
        Two-tailed p-value
    est_sim : numpy array
        Estimates of the segregation measure differences under the null hypothesis
    est_point_diff : float
        Observed difference between the segregation measures

    Notes
    -----
    This function performs inference to compare two segregation measures. These can be
    either two measures of the same locations at two different points in time, or two
    different locations at the same point in time. The null hypothesis is
    H0: Segregation_1 is not different from Segregation_2.

    Based on Rey, Sergio J., and Myrna L. Sastré-Gutiérrez. "Interregional inequality
    dynamics in Mexico." Spatial Economic Analysis 5.3 (2010): 277-298.
    """
    if not index_kwargs_1:
        index_kwargs_1 = {}
    if not index_kwargs_2:
        index_kwargs_2 = {}
    if n_jobs == -1:
        n_jobs = multiprocessing.cpu_count()

    if null_approach not in [
        "random_label",
        "composition",
        "share",
        "dual_composition",
        "person_permutation",
        "bootstrap",
    ]:
        raise ValueError(
            f"null_approach must be one of "
            f"{list(DUAL_SIMULATORS.keys()) + ['random_label', 'person_permutation', 'bootstrap']}"
        )

    if type(seg_class_1) != type(seg_class_2):
        raise TypeError("seg_class_1 and seg_class_2 must be the same type/class.")

    point_estimation = seg_class_1.statistic - seg_class_2.statistic

    aux = str(type(seg_class_1))
    _class_name = aux[
        1 + aux.rfind(".") : -2
    ]  # 'rfind' finds the last occurrence of a pattern in a string

    data_1 = seg_class_1.data.copy()
    data_2 = seg_class_2.data.copy()

    if null_approach == "bootstrap":
        boot1 = SingleValueTest(
            seg_class_1,
            iterations_under_null=iterations,
            null_approach="bootstrap",
            n_jobs=n_jobs,
            backend=backend,
            **index_kwargs_1,
        ).est_sim
        boot2 = SingleValueTest(
            seg_class_2,
            iterations_under_null=iterations,
            null_approach="bootstrap",
            n_jobs=n_jobs,
            backend=backend,
            **index_kwargs_2,
        ).est_sim
        # test statistic follows from <http://dx.doi.org/10.1016/j.jeconom.2008.11.004>, page 34
        tt = (boot1.mean() - boot2.mean()) / np.sqrt(
            np.std(boot1) ** 2 + np.std(boot2) ** 2
        )
        # p-value from <https://docs.scipy.org/doc/scipy/reference/tutorial/stats.html>
        p_value = stats.t.sf(np.abs(tt), iterations - 1) * 2
        estimates = (boot1, boot2)
        return p_value, estimates, point_estimation, _class_name

    if null_approach in ["random_label", "person_permutation"]:
        if isinstance(seg_class_1, MultiGroupIndex):
            groups = seg_class_1.groups
        else:
            groups = None

        stacked = _prepare_random_label(seg_class_1, seg_class_2)
        estimates = Parallel(n_jobs=n_jobs, backend=backend)(
            delayed(_estimate_random_label_difference)(
                (
                    stacked,
                    seg_class_1._function,
                    index_kwargs_1,
                    index_kwargs_2,
                    seg_class_1.index_type,
                    groups,
                    null_approach,
                )
            )
            for i in tqdm(range(iterations))
        )

    if null_approach in [
        "composition",
        "share",
        "dual_composition",
    ]:
        if isinstance(seg_class_1, MultiGroupIndex):
            raise ValueError("Not implemented for MultiGroup indexes.")

        counterfac_df1, counterfac_df2 = _generate_counterfactual(
            data_1,
            data_2,
            seg_class_1.group_pop_var,
            seg_class_1.total_pop_var,
            seg_class_2.group_pop_var,
            seg_class_2.total_pop_var,
            null_approach,
        )

        if null_approach in ["share", "dual_composition"]:
            data_1[seg_class_1.total_pop_var] = counterfac_df1[
                "counterfactual_total_pop"
            ]
            data_2[seg_class_2.total_pop_var] = counterfac_df2[
                "counterfactual_total_pop"
            ]

        estimates = Parallel(n_jobs=n_jobs, backend=backend)(
            delayed(_estimate_counterfac_difference)(
                (
                    data_1,
                    data_2,
                    seg_class_1.group_pop_var,
                    seg_class_1.total_pop_var,
                    seg_class_2.group_pop_var,
                    seg_class_2.total_pop_var,
                    index_kwargs_1,
                    index_kwargs_2,
                    null_approach,
                    seg_class_1._function,
                    counterfac_df1,
                    counterfac_df2,
                )
            )
            for i in tqdm(range(iterations))
        )

    estimates = pd.Series(estimates).dropna()
    if len(estimates) < iterations:
        warnings.warn("Some observations were removed for NA values")

    # Two-tailed p-value
    # Note: the null distribution can be located far from zero, so this is the
    # appropriate way to calculate the two-tailed p-value.
    aux1 = (point_estimation < estimates).sum()
    aux2 = (point_estimation > estimates).sum()
    p_value = 2 * np.array([aux1, aux2]).min() / len(estimates)

    return p_value, estimates, point_estimation, _class_name


class TwoValueTest:
    """Perform comparative inference for two segregation measures.

    Parameters
    ----------
    seg_class_1 : segregation.singlegroup or segregation.multigroup class
        a fitted segregation class to be compared to seg_class_2
    seg_class_2 : segregation.singlegroup or segregation.multigroup class
        a fitted segregation class to be compared to seg_class_1
    iterations_under_null : int
        number of iterations to simulate observations in a null distribution
    null_approach : str
        Which type of null hypothesis the inference will iterate. One of the following:

        * ``random_label``:
        Randomly assign each spatial unit to a region, then recalculate segregation
        indices and take their difference. Repeat this process `iterations_under_null`
        times to generate a reference distribution. Then test the observed difference
        against this distribution.

        * ``bootstrap``:
        Use bootstrap resampling to generate distributions of each segregation index in
        the comparison, then use a two-sample t-test to compare differences between the
        distribution means.

        * ``composition``:
        Generate counterfactual estimates for each region using the sim_composition
        approach. On each iteration, generate a synthetic dataset for each region where
        each unit has a 50% chance of belonging to the original data or the
        counterfactual data. Recalculate segregation indices on the synthetic datasets.

        * ``share``:
        Generate counterfactual estimates for each region using the sim_share approach.
        On each iteration, generate a synthetic dataset for each region where each unit
        has a 50% chance of belonging to the original data or the counterfactual data.
        Recalculate segregation indices on the synthetic datasets. Then follow the
        random labeling method on these synthetic data.

        * ``dual_composition``:
        Generate counterfactual estimates for each region using the sim_dual_composition
        approach. On each iteration, generate a synthetic dataset for each region where
        each unit has a 50% chance of belonging to the original data or the
        counterfactual data. Then follow the random labeling method on these synthetic
        data.

        * ``person_permutation``:
        Use the simulate_person_permutation approach to randomly reallocate the combined
        population across both regions, then recalculate segregation indices.

    n_jobs : int, optional
        number of cores to use for estimation. If -1 all available cpus will be used
    backend : str, optional
        which backend to use with joblib. Options include "loky", "multiprocessing", or
        "threading"
    index_kwargs_1 : dict, optional
        extra parameters to pass to segregation index 1.
    index_kwargs_2 : dict, optional
        extra parameters to pass to segregation index 2.

    Attributes
    ----------
    p_value : float
        Two-tailed p-value
    est_sim : numpy array
        Estimates of the segregation measure differences under the null hypothesis
    est_point_diff : float
        Observed difference between the segregation measures

    Notes
    -----
    This class performs inference to compare two segregation measures. These can be
    either two measures of the same locations at two different points in time, or two
    different locations at the same point in time. The null hypothesis is
    H0: Segregation_1 is not different from Segregation_2.

    Based on Rey, Sergio J., and Myrna L. Sastré-Gutiérrez. "Interregional inequality
    dynamics in Mexico." Spatial Economic Analysis 5.3 (2010): 277-298.

    Examples
    --------
    Several examples can be found at
    https://github.com/pysal/segregation/blob/master/notebooks/inference_wrappers_example.ipynb.
    """
    def __init__(
        self,
        seg_class_1,
        seg_class_2,
        iterations_under_null=500,
        null_approach="random_label",
        n_jobs=-1,
        backend="loky",
        index_kwargs_1=None,
        index_kwargs_2=None,
        **kwargs,
    ):
        aux = _compare_segregation(
            seg_class_1,
            seg_class_2,
            iterations=iterations_under_null,
            null_approach=null_approach,
            n_jobs=n_jobs,
            backend=backend,
            index_kwargs_1=index_kwargs_1,
            index_kwargs_2=index_kwargs_2,
        )
        self.p_value = aux[0]
        self.est_sim = aux[1]
        self.est_point_diff = aux[2]
        self._class_name = aux[3]
        self._null_approach = null_approach
[docs] def plot(self, color="darkblue", color2="darkred", kde=True, ax=None, **kwargs): """Plot the distribution of simulated values and the index value being tested. Parameters ---------- color : str, optional histogram color, by default 'darkblue' color2: str, optional, by default "darkred" Color for second histogram. Only relevant for bootstrap test kde : bool, optional Whether to plot the kernel density estimate along with the histogram, by default True ax : matplotlib.axes, optional axes object to plot onto, by default None kwargs : seaborn.histplot argument, optional additional keyword arguments passed to seaborn's histplot function Returns ------- matplotlib.axes pyplot axes object """ try: import matplotlib.pyplot as plt import seaborn as sns except ImportError: warnings.warn("This method relies on importing `matplotlib` and `seaborn`") if self._null_approach == "bootstrap": ax = sns.histplot(self.est_sim[0], color=color, kde=kde, ax=ax, **kwargs) ax = sns.histplot(self.est_sim[1], color=color2, kde=kde, ax=ax, **kwargs) plt.title( "{} (Diff. value = {})".format( self._class_name, round(self.est_point_diff, 3) ) ) else: ax = sns.histplot(self.est_sim, color=color, kde=kde, ax=ax, **kwargs) plt.axvline(self.est_point_diff, color="red") plt.title( "{} (Diff. value = {})".format( self._class_name, round(self.est_point_diff, 3) ) ) return ax