"""
Getis and Ord G statistic for spatial autocorrelation
"""
__author__ = "Sergio J. Rey <srey@asu.edu>, Myunghwa Hwang <mhwang4@gmail.com> "
__all__ = ["G", "G_Local"]
import warnings
import numpy as np
from scipy import stats
from libpysal.weights.spatial_lag import lag_spatial as slag
from libpysal.weights.util import fill_diagonal
from .crand import _prepare_univariate
from .crand import crand as _crand_plus
from .crand import njit as _njit
from .tabular import _univariate_handler
PERMUTATIONS = 999
[docs]
class G(object):
"""
Global G Autocorrelation Statistic
Parameters
----------
y : array (n,1)
Attribute values
w : W
DistanceBand W spatial weights based on distance band
permutations : int
the number of random permutations for calculating pseudo p_values
Attributes
----------
y : array
original variable
w : W
DistanceBand W spatial weights based on distance band
permutation : int
the number of permutations
G : float
the value of statistic
EG : float
the expected value of statistic
VG : float
the variance of G under normality assumption
z_norm : float
standard normal test statistic
p_norm : float
p-value under normality assumption (one-sided)
sim : array
(if permutations > 0)
vector of G values for permutated samples
p_sim : float
p-value based on permutations (one-sided)
null: spatial randomness
alternative: the observed G is extreme it is either
extremely high or extremely low
EG_sim : float
average value of G from permutations
VG_sim : float
variance of G from permutations
seG_sim : float
standard deviation of G under permutations.
z_sim : float
standardized G based on permutations
p_z_sim : float
p-value based on standard normal approximation from permutations (one-sided)
Notes
-----
Moments are based on normality assumption.
For technical details see :cite:`Getis_2010` and :cite:`Ord_2010`.
Examples
--------
>>> import libpysal
>>> import numpy
>>> numpy.random.seed(10)
Preparing a point data set
>>> points = [(10, 10), (20, 10), (40, 10), (15, 20), (30, 20), (30, 30)]
Creating a weights object from points
>>> w = libpysal.weights.DistanceBand(points,threshold=15)
>>> w.transform = "B"
Preparing a variable
>>> y = numpy.array([2, 3, 3.2, 5, 8, 7])
Applying Getis and Ord G test
>>> from esda.getisord import G
>>> g = G(y,w)
Examining the results
>>> round(g.G, 3)
0.557
>>> round(g.p_norm, 3)
0.173
"""
[docs]
def __init__(self, y, w, permutations=PERMUTATIONS):
y = np.asarray(y).flatten()
self.n = len(y)
self.y = y
w.transform = "B"
self.w = w
self.permutations = permutations
self.__moments()
self.y2 = y * y
y = y.reshape(
len(y), 1
) # Ensure that y is an n by 1 vector, otherwise y*y.T == y*y
self.den_sum = (y * y.T).sum() - (y * y).sum()
self.G = self.__calc(self.y)
self.z_norm = (self.G - self.EG) / np.sqrt(self.VG)
self.p_norm = 1.0 - stats.norm.cdf(np.abs(self.z_norm))
if permutations:
sim = [
self.__calc(np.random.permutation(self.y)) for i in range(permutations)
]
self.sim = sim = np.array(sim)
above = sim >= self.G
larger = sum(above)
if (self.permutations - larger) < larger:
larger = self.permutations - larger
self.p_sim = (larger + 1.0) / (permutations + 1.0)
self.EG_sim = sum(sim) / permutations
self.seG_sim = sim.std()
self.VG_sim = self.seG_sim**2
self.z_sim = (self.G - self.EG_sim) / self.seG_sim
self.p_z_sim = 1.0 - stats.norm.cdf(np.abs(self.z_sim))
def __moments(self):
y = self.y
n = self.n
w = self.w
n2 = n * n
s0 = w.s0
self.EG = s0 / (n * (n - 1))
s02 = s0 * s0
s1 = w.s1
s2 = w.s2
b0 = (n2 - 3 * n + 3) * s1 - n * s2 + 3 * s02
b1 = (-1.0) * ((n2 - n) * s1 - 2 * n * s2 + 6 * s02)
b2 = (-1.0) * (2 * n * s1 - (n + 3) * s2 + 6 * s02)
b3 = 4 * (n - 1) * s1 - 2 * (n + 1) * s2 + 8 * s02
b4 = s1 - s2 + s02
self.b0 = b0
self.b1 = b1
self.b2 = b2
self.b3 = b3
self.b4 = b4
y2 = y * y
y3 = y * y2
y4 = y2 * y2
EG2 = b0 * (sum(y2) ** 2) + b1 * sum(y4) + b2 * (sum(y) ** 2) * sum(y2)
EG2 += b3 * sum(y) * sum(y3) + b4 * (sum(y) ** 4)
EG2NUM = EG2
EG2DEN = ((sum(y) ** 2 - sum(y2)) ** 2) * n * (n - 1) * (n - 2) * (n - 3)
self.EG2 = EG2NUM / EG2DEN
self.VG = self.EG2 - self.EG**2
def __calc(self, y):
yl = slag(self.w, y)
self.num = y * yl
return self.num.sum() / self.den_sum
@property
def _statistic(self):
"""Standardized accessor for esda statistics"""
return self.G
[docs]
@classmethod
def by_col(
cls, df, cols, w=None, inplace=False, pvalue="sim", outvals=None, **stat_kws
):
"""
Function to compute a G statistic on a dataframe
Parameters
----------
df : pandas.DataFrame
a pandas dataframe with a geometry column
cols : string or list of string
name or list of names of columns to use to compute the statistic
w : pysal weights object
a weights object aligned with the dataframe. If not provided, this
is searched for in the dataframe's metadata
inplace : bool
a boolean denoting whether to operate on the dataframe inplace or to
return a series contaning the results of the computation. If
operating inplace, the derived columns will be named 'column_g'
pvalue : string
a string denoting which pvalue should be returned. Refer to the
the G statistic's documentation for available p-values
outvals : list of strings
list of arbitrary attributes to return as columns from the G statistic
**stat_kws : dict
options to pass to the underlying statistic. For this, see the
documentation for the G statistic.
Returns
--------
If inplace, None, and operation is conducted on dataframe
in memory. Otherwise, returns a copy of the dataframe with
the relevant columns attached.
"""
msg = (
"The `.by_cols()` methods are deprecated and will be "
"removed in a future version of `esda`."
)
warnings.warn(msg, FutureWarning)
return _univariate_handler(
df,
cols,
w=w,
inplace=inplace,
pvalue=pvalue,
outvals=outvals,
stat=cls,
swapname=cls.__name__.lower(),
**stat_kws,
)
[docs]
class G_Local(object):
"""
Generalized Local G Autocorrelation
Parameters
----------
y : array
variable
w : W
DistanceBand, weights instance that is based on threshold distance
and is assumed to be aligned with y
transform : {'R', 'B'}
the type of w, either 'B' (binary) or 'R' (row-standardized)
permutations : int
the number of random permutations for calculating
pseudo p values
star : boolean or float
whether or not to include focal observation in sums (default: False)
if the row-transformed weight is provided, then this is the default
value to use within the spatial lag. Generally, weights should be
provided in binary form, and standardization/self-weighting will be
handled by the function itself.
island_weight:
value to use as a weight for the "fake" neighbor for every island.
If numpy.nan, will propagate to the final local statistic depending
on the `stat_func`. If 0, then the lag is always zero for islands.
Attributes
----------
y : array
original variable
w : DistanceBand W
original weights object
permutations : int
the number of permutations
Gs : array
of floats, the value of the orginal G statistic in Getis & Ord (1992)
EGs : float
expected value of Gs under normality assumption
the values is scalar, since the expectation is identical
across all observations
VGs : array
of floats, variance values of Gs under normality assumption
Zs : array
of floats, standardized Gs
p_norm : array
of floats, p-value under normality assumption (one-sided)
for two-sided tests, this value should be multiplied by 2
sim : array
of arrays of floats (if permutations>0), vector of I values
for permutated samples
p_sim : array
of floats, p-value based on permutations (one-sided)
null - spatial randomness
alternative - the observed G is extreme
(it is either extremely high or extremely low)
EG_sim : array
of floats, average value of G from permutations
VG_sim : array
of floats, variance of G from permutations
seG_sim : array
of floats, standard deviation of G under permutations.
z_sim : array
of floats, standardized G based on permutations
p_z_sim : array
of floats, p-value based on standard normal approximation from
permutations (one-sided)
Notes
-----
To compute moments of Gs under normality assumption,
PySAL considers w is either binary or row-standardized.
For binary weights object, the weight value for self is 1
For row-standardized weights object, the weight value for self is
1/(the number of its neighbors + 1).
For technical details see :cite:`Getis_2010` and :cite:`Ord_2010`.
Examples
--------
>>> import libpysal
>>> import numpy
>>> numpy.random.seed(10)
Preparing a point data set
>>> points = [(10, 10), (20, 10), (40, 10), (15, 20), (30, 20), (30, 30)]
Creating a weights object from points
>>> w = libpysal.weights.DistanceBand(points,threshold=15)
Preparing a variable
>>> y = numpy.array([2, 3, 3.2, 5, 8, 7])
Applying Getis and Ord local G test using a binary weights object
>>> from esda.getisord import G_Local
>>> lg = G_Local(y,w,transform='B')
Examining the results
>>> lg.Zs
array([-1.0136729 , -0.04361589, 1.31558703, -0.31412676, 1.15373986,
1.77833941])
>>> round(lg.p_sim[0], 3)
0.101
p-value based on standard normal approximation from permutations
>>> round(lg.p_z_sim[0], 3)
0.154
>>> numpy.random.seed(10)
Applying Getis and Ord local G* test using a binary weights object
>>> lg_star = G_Local(y,w,transform='B',star=True)
Examining the results
>>> lg_star.Zs
array([-1.39727626, -0.28917762, 0.65064964, -0.28917762, 1.23452088,
2.02424331])
>>> round(lg_star.p_sim[0], 3)
0.101
>>> numpy.random.seed(12345)
Applying Getis and Ord local G test using a row-standardized weights object
>>> lg = G_Local(y,w,transform='R')
Examining the results
>>> lg.Zs
array([-0.62074534, -0.01780611, 1.31558703, -0.12824171, 0.28843496,
1.77833941])
>>> round(lg.p_sim[0], 3)
0.103
>>> numpy.random.seed(10)
Applying Getis and Ord local G* test using a row-standardized weights object
>>> lg_star = G_Local(y,w,transform='R',star=True)
Examining the results
>>> lg_star.Zs
array([-0.62488094, -0.09144599, 0.41150696, -0.09144599, 0.24690418,
1.28024388])
>>> round(lg_star.p_sim[0], 3)
0.101
"""
[docs]
def __init__(
self,
y,
w,
transform="R",
permutations=PERMUTATIONS,
star=False,
keep_simulations=True,
n_jobs=-1,
seed=None,
island_weight=0,
):
y = np.asarray(y).flatten()
self.n = len(y)
self.y = y
w, star = _infer_star_and_structure_w(w, star, transform)
w.transform = transform
self.w_transform = transform
self.w = w
self.permutations = permutations
self.star = star
self.calc()
self.p_norm = stats.norm.sf(np.abs(self.Zs))
if permutations:
self.p_sim, self.rGs = _crand_plus(
y,
w,
self.Gs,
permutations,
keep_simulations,
n_jobs=n_jobs,
stat_func=_g_local_star_crand if star else _g_local_crand,
scaling=y.sum(),
seed=seed,
island_weight=island_weight,
)
if keep_simulations:
self.sim = sim = self.rGs.T
self.EG_sim = sim.mean(axis=0)
self.seG_sim = sim.std(axis=0)
self.VG_sim = self.seG_sim * self.seG_sim
self.z_sim = (self.Gs - self.EG_sim) / self.seG_sim
self.p_z_sim = stats.norm.sf(np.abs(self.z_sim))
def __crand(self, keep_simulations):
y = self.y
if keep_simulations:
rGs = np.zeros((self.n, self.permutations))
larger = np.zeros((self.n,))
n_1 = self.n - 1
rid = list(range(n_1))
prange = list(range(self.permutations))
k = self.w.max_neighbors + 1
rids = np.array([np.random.permutation(rid)[0:k] for i in prange])
ids = np.arange(self.w.n)
wc = self.__getCardinalities()
if self.w_transform == "r":
den = np.array(wc) + self.star
else:
den = np.ones(self.w.n)
for i in range(self.w.n):
idsi = ids[ids != i]
np.random.shuffle(idsi)
yi_star = y[i] * self.star
wci = wc[i]
rGs_i = (y[idsi[rids[:, 0:wci]]]).sum(1) + yi_star
rGs_i = (np.array(rGs_i) / den[i]) / (self.y_sum - (1 - self.star) * y[i])
if keep_simulations:
rGs[i] = rGs_i
larger[i] = (rGs_i >= self.Gs[i]).sum()
if keep_simulations:
self.rGs = rGs
below = (self.permutations - larger) < larger
larger[below] = self.permutations - larger[below]
self.p_sim = (larger + 1) / (self.permutations + 1)
def __getCardinalities(self):
ido = self.w.id_order
self.wc = np.array([self.w.cardinalities[ido[i]] for i in range(self.n)])
return self.wc
[docs]
def calc(self):
w = self.w
W = w.sparse
self.y_sum = self.y.sum()
y = self.y
remove_self = not self.star
N = self.w.n - remove_self
statistic = (W @ y) / (y.sum() - y * remove_self)
# ----------------------------------------------------#
# compute moments necessary for analytical inference #
# ----------------------------------------------------#
empirical_mean = (y.sum() - y * remove_self) / N
# variance looks complex, yes, but it obtains from E[x^2] - E[x]^2.
# So, break it down to allow subtraction of the self-neighbor.
mean_of_squares = ((y**2).sum() - (y**2) * remove_self) / N
empirical_variance = mean_of_squares - empirical_mean**2
# Since we have corrected the diagonal, this should work
cardinality = np.asarray(W.sum(axis=1)).squeeze()
expected_value = cardinality / N
expected_variance = cardinality * (N - cardinality)
expected_variance /= N - 1
expected_variance *= 1 / N**2
expected_variance *= empirical_variance / (empirical_mean**2)
z_scores = (statistic - expected_value) / np.sqrt(expected_variance)
self.Gs = statistic
self.EGs = expected_value
self.VGs = expected_variance
self.Zs = z_scores
@property
def _statistic(self):
"""Standardized accessor for esda statistics"""
return self.Gs
[docs]
@classmethod
def by_col(
cls, df, cols, w=None, inplace=False, pvalue="sim", outvals=None, **stat_kws
):
"""
Function to compute a G_Local statistic on a dataframe
Parameters
----------
df : pandas.DataFrame
a pandas dataframe with a geometry column
cols : string or list of string
name or list of names of columns to use to compute the statistic
w : pysal weights object
a weights object aligned with the dataframe. If not provided, this
is searched for in the dataframe's metadata
inplace : bool
a boolean denoting whether to operate on the dataframe inplace or to
return a series contaning the results of the computation. If
operating inplace, the derived columns will be named 'column_g_local'
pvalue : string
a string denoting which pvalue should be returned. Refer to the
the G_Local statistic's documentation for available p-values
outvals : list of strings
list of arbitrary attributes to return as columns from the
G_Local statistic
**stat_kws : dict
options to pass to the underlying statistic. For this, see the
documentation for the G_Local statistic.
Returns
-------
pandas.DataFrame
If inplace, None, and operation is conducted on dataframe
in memory. Otherwise, returns a copy of the dataframe with
the relevant columns attached.
"""
msg = (
"The `.by_col()` methods are deprecated and will be "
"removed in a future version of `esda`."
)
warnings.warn(msg, FutureWarning)
return _univariate_handler(
df,
cols,
w=w,
inplace=inplace,
pvalue=pvalue,
outvals=outvals,
stat=cls,
swapname=cls.__name__.lower(),
**stat_kws,
)
def _infer_star_and_structure_w(weights, star, transform):
assert transform.lower() in ("r", "b"), (
f'Transforms must be binary "b" or row-standardized "r".'
f"Recieved: {transform}"
)
adj_matrix = weights.sparse
diagonal = adj_matrix.diagonal()
zero_diagonal = (diagonal == 0).all()
# Gi has a zero diagonal, Gi* has a nonzero diagonal
star = (not zero_diagonal) if star is None else star
# Want zero diagonal but do not have it
if (not zero_diagonal) & (star is False):
weights = fill_diagonal(weights, 0)
# Want nonzero diagonal and have it
elif (not zero_diagonal) & (star is True):
weights = weights
# Want zero diagonal and have it
elif zero_diagonal & (star is False):
weights = weights
# Want nonzero diagonal and do not have it
elif zero_diagonal & (star is True):
# if the input is binary or requested transform is binary,
# set the diagonal to 1.
if transform.lower() == "b" or weights.transform.lower() == "b":
weights = fill_diagonal(weights, 1)
# if we know the target is row-standardized, use the row max
# this works successfully for effectively binary but "O"-transformed input
elif transform.lower() == "r":
# This warning is presented in the documentation as well
warnings.warn(
"Gi* requested, but (a) weights are already row-standardized,"
" (b) no weights are on the diagonal, and"
" (c) no default value supplied to star. Assuming that the"
" self-weight is equivalent to the maximum weight in the"
" row. To use a different default (like, .5), set `star=.5`,"
" or use libpysal.weights.fill_diagonal() to set the diagonal"
" values of your weights matrix and use `star=None` in Gi_Local."
)
weights = fill_diagonal(
weights, np.asarray(adj_matrix.max(axis=1).todense()).flatten()
)
else: # star was something else, so try to fill the weights with it
try:
weights = fill_diagonal(weights, star)
except TypeError:
raise TypeError(
f"Type of star ({type(star)}) not understood."
f" Must be an integer, boolean, float, or numpy.ndarray."
)
star = (weights.sparse.diagonal() > 0).any()
weights.transform = transform
return weights, star
# --------------------------------------------------------------
# Conditional Randomization Function Implementations
# --------------------------------------------------------------
@_njit(fastmath=True)
def _g_local_crand(i, z, permuted_ids, weights_i, scaling):
other_weights = weights_i[1:]
zi, zrand = _prepare_univariate(i, z, permuted_ids, other_weights)
return (zrand @ other_weights) / (scaling - zi)
@_njit(fastmath=True)
def _g_local_star_crand(i, z, permuted_ids, weights_i, scaling):
self_weight = weights_i[0]
other_weights = weights_i[1:]
zi, zrand = _prepare_univariate(i, z, permuted_ids, other_weights)
return (zrand @ other_weights + self_weight * zi) / scaling
if __name__ == "__main__":
import geopandas
import numpy
from libpysal import examples, weights
import esda
df = geopandas.read_file(examples.get_path("NAT.shp"))
w = weights.Rook.from_dataframe(df)
for transform in ("r", "b"):
for star in (True, False):
test = esda.getisord.G_Local(df.GI89, w, transform=transform, star=star)
out = test._calc2()
(
statistic,
expected_value,
expected_variance,
z_scores,
empirical_mean,
empirical_variance,
) = out
numpy.testing.assert_allclose(statistic, test.Gs)
numpy.testing.assert_allclose(expected_value, test.EGs)
numpy.testing.assert_allclose(expected_variance, test.VGs)
numpy.testing.assert_allclose(z_scores, test.Zs)
numpy.testing.assert_allclose(empirical_mean, test.yl_mean)
numpy.testing.assert_allclose(empirical_variance, test.s2)
# Also check that the None configuration works
test = esda.getisord.G_Local(df.GI89, w, star=None)