import numpy as np
import pandas as pd
from libpysal import weights
from scipy import stats
from sklearn.base import BaseEstimator
from sklearn.utils import check_array
class Geary_Local_MV(BaseEstimator): # noqa: N801
"""Local Geary - Multivariate"""
def __init__(self, connectivity=None, permutations=999, drop_islands=True):
"""
        Initialize a Geary_Local_MV estimator
Parameters
----------
        connectivity : W | Graph
            spatial weights instance as W or Graph, aligned with the variables
            passed to ``fit``
permutations : int
(default=999)
number of random permutations for calculation
of pseudo p_values
        drop_islands : bool (default=True)
            Whether to drop observations with no neighbors (islands) from the
            adjacency list. If islands are kept, they are coded as
            self-neighbors with zero weight. See
            ``libpysal.weights.W.to_adjlist()``.
Attributes
----------
localG : numpy array
array containing the observed multivariate
Local Geary values.
p_sim : numpy array
array containing the simulated
p-values for each unit.
"""
self.connectivity = connectivity
self.permutations = permutations
self.drop_islands = drop_islands
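    # Construction sketch (assumes `w` is a libpysal weights.W or graph.Graph
    # built elsewhere, e.g. Queen contiguity as in the fit() docstring):
    #   lG_mv = Geary_Local_MV(connectivity=w, permutations=999)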
def fit(self, variables):
"""
Parameters
----------
        variables : numpy.ndarray
            (n_variables, n_observations) array containing continuous data
Returns
-------
        self : Geary_Local_MV
            the fitted estimator.
Notes
-----
Technical details and derivations can be found in :cite:`Anselin1995`.
Examples
--------
        Guerry data, replicating the GeoDa tutorial

        >>> import libpysal
        >>> import geopandas as gpd
        >>> guerry = libpysal.examples.load_example('Guerry')
        >>> guerry_ds = gpd.read_file(guerry.get_path('Guerry.shp'))
        >>> w = libpysal.weights.Queen.from_dataframe(guerry_ds)
>>> x1 = guerry_ds['Donatns']
>>> x2 = guerry_ds['Suicids']
        >>> lG_mv = Geary_Local_MV(connectivity=w).fit([x1, x2])
>>> lG_mv.localG[0:5]
>>> lG_mv.p_sim[0:5]
"""
self.variables = check_array(
variables,
accept_sparse=False,
dtype="float",
force_all_finite=True,
estimator=self,
)
w = self.connectivity
if isinstance(w, weights.W):
w.transform = "r"
else:
w = w.transform("r")
self.n = len(variables[0])
self.w = w
permutations = self.permutations
        # Calculate z-scores for input variables
# to be used in _statistic and _crand
zvariables = stats.zscore(variables, axis=1)
stat = self._statistic_w if isinstance(w, weights.W) else self._statistic_g
self.localG = stat(variables, zvariables, w, self.drop_islands)
if permutations:
self._crand(zvariables)
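            # Two-sided pseudo p-value: count permuted statistics at least as
            # extreme as the observed value, fold counts to the smaller tail,
            # and apply the standard (+1) / (permutations + 1) correction.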
sim = np.transpose(self.Gs)
above = sim >= self.localG
larger = above.sum(0)
low_extreme = (permutations - larger) < larger
larger[low_extreme] = permutations - larger[low_extreme]
self.p_sim = (larger + 1.0) / (permutations + 1.0)
return self
@staticmethod
def _stat(zseries, adj_list, variables):
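        # For each focal/neighbor pair (i, j) in the adjacency list and each
        # variable, compute w_ij * (z_i - z_j) ** 2; summing over neighbors
        # and averaging over variables is done by the callers.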
focal = [zseries[i].loc[adj_list.focal].values for i in range(len(variables))]
neighbor = [
zseries[i].loc[adj_list.neighbor].values for i in range(len(variables))
]
# Carry out local Geary calculation
gs = adj_list.weight.values * (np.array(focal) - np.array(neighbor)) ** 2
return gs
def _statistic_w(self, variables, zvariables, w, drop_islands):
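        # localG_i = (1 / k) * sum over variables and neighbors j of
        # w_ij * (z_i - z_j) ** 2, with z-scores computed per variable.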
# Define denominator adjustment
k = len(variables)
# Create focal and neighbor values
adj_list = w.to_adjlist(remove_symmetric=False, drop_islands=drop_islands)
zseries = [pd.Series(i, index=w.id_order) for i in zvariables]
gs = self._stat(zseries, adj_list, variables)
# Reorganize data
temp = pd.DataFrame(gs).T
adj_list_gs = temp.groupby(by=adj_list.focal.values).sum()
# Rearrange data based on w id order
adj_list_gs["w_order"] = w.id_order
adj_list_gs.sort_values(by="w_order", inplace=True)
adj_list_gs.drop(columns=["w_order"], inplace=True)
localG = np.array(adj_list_gs.sum(axis=1, numeric_only=True) / k)
return localG
def _statistic_g(self, variables, zvariables, w, drop_islands):
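        # Graph-based counterpart of _statistic_w: the same computation, but
        # the adjacency list and observation ids come from the Graph API.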
# Define denominator adjustment
k = len(variables)
# Create focal and neighbor values
if drop_islands:
adj_list = w.adjacency.drop(w.isolates).reset_index()
else:
adj_list = w.adjacency.reset_index()
zseries = [pd.Series(i, index=w.unique_ids) for i in zvariables]
gs = self._stat(zseries, adj_list, variables)
# Reorganize data
temp = pd.DataFrame(gs).T
adj_list_gs = temp.groupby(by=adj_list.focal.values).sum().reindex(w.unique_ids)
localG = np.array(adj_list_gs.sum(axis=1, numeric_only=True) / k)
return localG
def _crand(self, zvariables):
"""
        conditional randomization

        For observation i with ni neighbors, the candidate set cannot include
        i (we don't want i being a neighbor of i), so we have to sample
        without replacement from a set of ids that does not include i. numpy
        does not directly support this kind of repeated sampling and it is
        expensive to implement, so instead we omit i from the original ids,
        permute the ids, and take the first ni elements of the permuted ids
        as the neighbors of i in each randomization.
"""
nvars = self.variables.shape[0]
Gs = np.zeros((self.n, self.permutations))
prange = list(range(self.permutations))
k = (
self.w.max_neighbors + 1
if isinstance(self.w, weights.W)
else self.w.cardinalities.max() + 1
)
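        # k caps the number of candidate ids drawn per permutation: no unit
        # has more than (k - 1) neighbors, so k draws always suffice.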
nn = self.n - 1
rids = np.array([np.random.permutation(nn)[0:k] for i in prange])
ids = np.arange(self.w.n)
if isinstance(self.w, weights.W):
ido = self.w.id_order
w = [self.w.weights[ido[i]] for i in ids]
wc = [self.w.cardinalities[ido[i]] for i in ids]
else:
w = list(self.w.weights.values())
wc = self.w.cardinalities.tolist()
for i in range(self.w.n):
idsi = ids[ids != i]
np.random.shuffle(idsi)
vars_rand = []
for j in range(nvars):
vars_rand.append(zvariables[j][idsi[rids[:, 0 : wc[i]]]])
# vars rand as tmp
# Calculate diff
diff = []
for z in range(nvars):
_diff = np.array((zvariables[z][i] - vars_rand[z]) ** 2 * w[i])
diff.append(_diff.sum(1) / nvars)
# add up differences
temp = np.array([sum(x) for x in zip(*diff, strict=True)])
# Assign to object to be returned
Gs[i] = temp
self.Gs = Gs
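

# A minimal usage sketch mirroring the docstring example; it assumes the
# optional geopandas dependency and the libpysal 'Guerry' example dataset
# are available locally.
if __name__ == "__main__":
    import geopandas as gpd
    import libpysal

    guerry = libpysal.examples.load_example("Guerry")
    guerry_ds = gpd.read_file(guerry.get_path("Guerry.shp"))
    w = libpysal.weights.Queen.from_dataframe(guerry_ds)

    lG_mv = Geary_Local_MV(connectivity=w).fit(
        [guerry_ds["Donatns"], guerry_ds["Suicids"]]
    )
    print(lG_mv.localG[0:5])
    print(lG_mv.p_sim[0:5])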