Source code for esda.geary_local

import numpy as np
import pandas as pd
from libpysal import weights
from sklearn.base import BaseEstimator

from esda.crand import _prepare_univariate
from esda.crand import crand as _crand_plus
from esda.crand import njit as _njit



[docs]
class Geary_Local(BaseEstimator):  # noqa: N801
    """Local Geary - Univariate"""


[docs]
    def __init__(
        self,
        connectivity=None,
        labels=False,
        sig=0.05,
        permutations=999,
        n_jobs=1,
        keep_simulations=True,
        seed=None,
        island_weight=0,
        drop_islands=True,
    ):
        """
        Initialize a Local_Geary estimator

        Parameters
        ----------
        connectivity     : W | Graph
                           spatial weights instance as W or Graph aligned with y
        labels           : boolean
                           (default=False)
                           If True use, label if an observation
                           belongs to an outlier, cluster, other,
                           or non-significant group. 1 = outlier,
                           2 = cluster, 3 = other, 4 = non-significant.
                           Note that this is not the exact same as the
                           cluster map produced by GeoDa.
        sig              : float
                           (default=0.05)
                           Default significance threshold used for
                           creation of labels groups.
        permutations     : int
                           (default=999)
                           number of random permutations for calculation
                           of pseudo p_values
        n_jobs           : int
                           (default=1)
                           Number of cores to be used in the conditional
                           randomisation. If -1, all available cores are used.
        keep_simulations : Boolean
                           (default=True)
                           If True, the entire matrix of replications under
                           the null is stored in memory and accessible;
                           otherwise, replications are not saved
        seed             : None/int
                           Seed to ensure reproducibility of conditional
                           randomizations. Must be set here, and not outside
                           of the function, since numba does not correctly
                           interpret external seeds nor
                           numpy.random.RandomState instances.
        island_weight :
            value to use as a weight for the "fake" neighbor for every island.
            If numpy.nan, will propagate to the final local statistic depending
            on the `stat_func`. If 0, then the lag is always zero for islands.
        drop_islands : bool (default True)
            Whether or not to preserve islands as entries in the adjacency
            list. By default, observations with no neighbors do not appear
            in the adjacency list. If islands are kept, they are coded as
            self-neighbors with zero weight. See ``libpysal.weights.to_adjlist()``.

        Attributes
        ----------
        localG          : numpy array
                          array containing the observed univariate
                          Local Geary values.
        p_sim           : numpy array
                          array containing the simulated
                          p-values for each unit.
        labs            : numpy array
                          array containing the labels for if each observation.
        """

        self.connectivity = connectivity
        self.labels = labels
        self.sig = sig
        self.permutations = permutations
        self.n_jobs = n_jobs
        self.keep_simulations = keep_simulations
        self.seed = seed
        self.island_weight = island_weight
        self.drop_islands = drop_islands



[docs]
    def fit(self, x):
        """
        Parameters
        ----------
        x                : numpy.ndarray
                           array containing continuous data

        Returns
        -------
        the fitted estimator.

        Notes
        -----
        Technical details and derivations can be found in :cite:`Anselin1995`.

        Examples
        --------
        Guerry data replication GeoDa tutorial
        >>> import libpysal as lp
        >>> import geopandas as gpd
        >>> guerry = lp.examples.load_example('Guerry')
        >>> guerry_ds = gpd.read_file(guerry.get_path('Guerry.shp'))
        >>> w = libpysal.weights.Queen.from_dataframe(guerry_ds)
        >>> y = guerry_ds['Donatns']
        >>> lG = Local_Geary(connectivity=w).fit(y)
        >>> lG.localG[0:5]
        >>> lG.p_sim[0:5]
        """
        x = np.asarray(x).flatten()

        w = self.connectivity
        if isinstance(w, weights.W):
            w.transform = "r"
        else:
            w = w.transform("r")

        permutations = self.permutations
        sig = self.sig
        keep_simulations = self.keep_simulations
        n_jobs = self.n_jobs

        self.localG = self._statistic(x, w, self.drop_islands)

        if permutations:
            self.p_sim, self.rlocalG = _crand_plus(
                z=(x - np.mean(x)) / np.std(x),
                w=w,
                observed=self.localG,
                permutations=permutations,
                keep=keep_simulations,
                n_jobs=n_jobs,
                stat_func=_local_geary,
                island_weight=self.island_weight,
            )

        if self.labels:
            Eij_mean = np.mean(self.localG)
            x_mean = np.mean(x)
            # Create empty vector to fill
            self.labs = np.empty(len(x)) * np.nan
            # Outliers
            locg_lt_eij = self.localG < Eij_mean
            p_leq_sig = self.p_sim <= sig
            self.labs[locg_lt_eij & (x > x_mean) & p_leq_sig] = 1
            # Clusters
            self.labs[locg_lt_eij & (x < x_mean) & p_leq_sig] = 2
            # Other
            self.labs[(self.localG > Eij_mean) & p_leq_sig] = 3
            # Non-significant
            self.labs[self.p_sim > sig] = 4

        return self


    @staticmethod
    def _statistic(x, w, drop_islands):
        # Caclulate z-scores for x
        zscore_x = (x - np.mean(x)) / np.std(x)
        # Create focal (xi) and neighbor (zi) values
        if isinstance(w, weights.W):
            adj_list = w.to_adjlist(remove_symmetric=False, drop_islands=drop_islands)
            zseries = pd.Series(zscore_x, index=w.id_order)
        else:
            adj_list = w.adjacency
            if drop_islands:
                adj_list = adj_list.drop(w.isolates)
            adj_list = adj_list.reset_index()
            zseries = pd.Series(zscore_x, index=w.unique_ids)
        zi = zseries.loc[adj_list.focal].values
        zj = zseries.loc[adj_list.neighbor].values
        # Carry out local Geary calculation
        gs = adj_list.weight.values * (zi - zj) ** 2
        # Reorganize data
        adj_list_gs = pd.DataFrame(adj_list.focal.values, gs).reset_index()
        adj_list_gs.columns = ["gs", "ID"]
        adj_list_gs = adj_list_gs.groupby(by="ID").sum()
        if isinstance(w, weights.W):
            # Rearrange data based on w id order
            adj_list_gs["w_order"] = w.id_order
            adj_list_gs.sort_values(by="w_order", inplace=True)

        localG = adj_list_gs.gs.values

        return localG



# --------------------------------------------------------------
# Conditional Randomization Function Implementations
# --------------------------------------------------------------

# Note: does not using the scaling parameter


@_njit(fastmath=True)
def _local_geary(i, z, permuted_ids, weights_i, scaling):  # noqa: ARG001
    other_weights = weights_i[1:]
    zi, zrand = _prepare_univariate(i, z, permuted_ids, other_weights)
    return (zi - zrand) ** 2 @ other_weights