Source code for segregation.dynamics.divergence_profile

import numpy as np
import pandas as pd

from scipy.spatial.distance import pdist, squareform
from scipy.special import rel_entr as relative_entropy

from ..network import compute_travel_cost_matrix
from warnings import warn


def compute_divergence_profiles(
    gdf, groups, metric="euclidean", network=None, distance_matrix=None
):
    """Compute a Kullback-Leibler (KL) divergence profile for every observation.

    For each observation, the remaining observations are ordered by distance
    and aggregated outward one at a time. At each step the group composition
    of the aggregated area is compared against the composition of the total
    population using KL divergence. The resulting sequence of divergence
    values is that observation's profile; the last value of each profile is
    zero because the aggregate then equals the total population.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame or pandas.DataFrame
        Table holding the population counts. A ``geometry`` column is only
        read when neither ``network`` nor ``distance_matrix`` is given
        (centroids are then used to compute pairwise distances).
    groups : list
        Column names in ``gdf`` holding the population total of each group.
    metric : str, optional
        Distance metric forwarded to ``scipy.spatial.distance.pdist``
        ('euclidean' by default). Ignored when ``network`` or
        ``distance_matrix`` is passed; a warning is issued in that case
        unless ``metric`` is ``"network"`` / ``"precomputed"`` respectively.
    network : pandana.Network, optional
        Network passed to ``compute_travel_cost_matrix`` to obtain distances
        between observations. Takes precedence over ``distance_matrix``.
    distance_matrix : array-like, optional
        Square ``(n, n)`` matrix of distances between the ``n`` rows of
        ``gdf``, in the same row order as ``gdf``.

    Returns
    -------
    pandas.DataFrame
        Concatenation of one profile per observation, with columns

        - ``observation``: index label of the profile's origin in ``gdf``
        - ``distance``: distance from the origin to the most recently
          added observation (ascending within a profile)
        - ``divergence``: KL divergence between the aggregated and the
          total population
        - ``population_covered``: population count within the aggregate

        Each profile keeps its own 0..n-1 row index. Steps whose cumulative
        population is zero yield NaN divergence (0 / 0 proportions).

    Raises
    ------
    ValueError
        If ``distance_matrix`` is not a square matrix with one row per
        observation in ``gdf``.
    """
    # Store the observation index to return with the results
    indices = gdf.index.copy()
    counts = gdf[groups].values
    n_obs = counts.shape[0]

    # Pick the distance source: network > precomputed matrix > pdist on centroids.
    # Identity checks are required here: the truth value of a multi-element
    # numpy array is ambiguous and raises ValueError.
    if network is not None:
        if metric != "network":
            warn(
                f"metric set to {metric} but a pandana.Network object was passed. "
                "Using network distances instead. "
                "If you wish to use a scipy distance matrix, do not include a "
                "`network` argument"
            )
        dist_matrix = compute_travel_cost_matrix(gdf, gdf, network).values
    elif distance_matrix is not None:
        if metric != "precomputed":
            warn(
                f"metric set to {metric} but a distance_matrix argument was passed. "
                "Using precomputed distances instead"
            )
        dist_matrix = np.asarray(distance_matrix)
        if dist_matrix.shape != (n_obs, n_obs):
            raise ValueError(
                "distance_matrix must be a square array of shape "
                f"({n_obs}, {n_obs}); got {dist_matrix.shape}"
            )
    else:
        # Geometry is only needed on this path, so plain DataFrames remain
        # usable together with a network or a precomputed matrix.
        centroids = gdf.geometry.centroid
        coordinates = np.column_stack((centroids.x, centroids.y))
        dist_matrix = squareform(pdist(coordinates, metric=metric))

    # Reference distribution r: group shares of the total population.
    # It does not depend on the origin observation, so compute it once.
    total_pop_by_group = np.sum(counts, axis=0, keepdims=True)
    r_total_proportions = total_pop_by_group / np.sum(counts)

    results = []
    for i, distances in enumerate(dist_matrix):
        # Aggregate observations outward from observation i, nearest first
        sorted_indices = np.argsort(distances)
        cumul_pop_by_group = np.cumsum(counts[sorted_indices], axis=0)
        obs_cumul_pop = np.sum(cumul_pop_by_group, axis=1)

        # Local distribution q: group shares within each growing aggregate
        q_cumul_proportions = cumul_pop_by_group / obs_cumul_pop[:, np.newaxis]

        # KL divergence D(q || r) for every aggregation step
        kl_divergence = relative_entropy(
            q_cumul_proportions, r_total_proportions
        ).sum(axis=1)

        # The scalar observation label is broadcast against the array columns
        results.append(
            pd.DataFrame(
                {
                    "observation": indices[i],
                    "distance": distances[sorted_indices],
                    "divergence": kl_divergence,
                    "population_covered": obs_cumul_pop,
                }
            )
        )

    return pd.concat(results)