# Source code for segregation.dynamics.divergence_profile

import numpy as np
import pandas as pd

from scipy.spatial.distance import pdist, squareform
from scipy.special import rel_entr as relative_entropy

from ..network import compute_travel_cost_matrix
from warnings import warn



# [docs]
def compute_divergence_profiles(
    gdf, groups, metric="euclidean", network=None, distance_matrix=None
):
    """
    Compute a Kullback-Leibler (KL) divergence profile for every observation.

    For each observation, the remaining observations are ordered by distance
    and aggregated outward one at a time. At every step the KL divergence
    between the group composition of the aggregated area and the group
    composition of the total population is recorded.

    Parameters
    ----------
    gdf : pandas.DataFrame or geopandas.GeoDataFrame, required
        dataframe holding the group counts for each location of interest.
        A ``geometry`` attribute is only accessed by this function when
        neither `network` nor `distance_matrix` is given (centroids are then
        used to compute pairwise distances).
    groups : list, required
        list of columns on dataframe holding population totals for each group
    metric : str (optional; 'euclidean' by default)
        Distance metric for calculating pairwise distances,
        Accepts any inputs to `scipy.spatial.distance.pdist`.
        Ignored if passing a network or distance matrix
    network : pandana.Network object (optional, None by default)
        A pandana Network object used to compute distance between observations.
        Takes precedence over `distance_matrix` if both are given.
    distance_matrix : array-like (optional; None by default)
        precomputed distances between observations in the dataset; row ``i``
        holds the distances from observation ``i`` to every observation, in
        the same order as the rows of `gdf`. Converted with `numpy.asarray`.

    Returns
    -------
    aux : pandas.DataFrame
        Concatenation of one dataframe per observation (the per-profile
        integer index is kept, so index values repeat). Columns:

        observation : index label (from `gdf`) of the profile's origin.
        distance : distance from the origin to the most recently added
            observation, in ascending order.
        divergence : KL divergence between the aggregated population and the
            total population; it reaches zero on the final row of each
            profile, where the aggregate equals the total population.
        population_covered : the population count within the aggregated
            population.

    Notes
    -----
    Steps at which the aggregated population is zero produce NaN divergence
    (the proportions are a 0 / 0 division).
    """
    # Store the observation index to return with the results
    indices = gdf.index.copy()
    group_counts = gdf[groups].values

    # Explicit ``is not None`` checks: the truth value of a multi-element
    # numpy array is ambiguous and raises ValueError.
    if network is not None:
        if metric != "network":
            warn(
                f"metric set to {metric} but a pandana.Network object was passed. "
                "Using network distances instead. "
                "If you wish to use a scipy distance matrix, do not include a `network` argument"
            )
        dist_matrix = compute_travel_cost_matrix(gdf, gdf, network).values
    elif distance_matrix is not None:
        if metric != "precomputed":
            warn(
                f"metric set to {metric} but a distance_matrix argument was passed. Using precomputed distances instead"
            )
        # Coerce so that row-wise iteration and fancy indexing below work for
        # nested lists and DataFrames as well as arrays.
        dist_matrix = np.asarray(distance_matrix)
    else:
        # Centroids are only needed when distances are derived from geometry.
        centroids = gdf.geometry.centroid
        coordinates = np.column_stack((centroids.x, centroids.y))
        dist_matrix = squareform(pdist(coordinates, metric=metric))

    # r: group proportions of the total population (identical for every
    # profile, so computed once outside the loop).
    total_pop_by_group = np.sum(group_counts, axis=0, keepdims=True)
    r_total_proportions = total_pop_by_group / np.sum(group_counts)

    results = []

    for i, distances in enumerate(dist_matrix):
        # Aggregate outward from observation i, nearest first.
        sorted_indices = np.argsort(distances)
        cumul_pop_by_group = np.cumsum(group_counts[sorted_indices], axis=0)
        cumul_pop = np.sum(cumul_pop_by_group, axis=1)

        # q: group proportions within each successively larger aggregate.
        q_cumul_proportions = cumul_pop_by_group / cumul_pop[:, np.newaxis]

        # KL divergence of q from r, summed over groups.
        kl_divergence = relative_entropy(q_cumul_proportions, r_total_proportions).sum(
            axis=1
        )

        results.append(
            pd.DataFrame(
                {
                    "observation": indices[i],
                    "distance": distances[sorted_indices],
                    "divergence": kl_divergence,
                    "population_covered": cumul_pop,
                }
            )
        )

    aux = pd.concat(results)

    return aux