Source code for spreg.skater_reg

"""Skater Regression classes."""

__author__ = "Luc Anselin anselin@uchicago.edu, Pedro Amaral pedroamaral@cedeplar.ufmg.br, Levi Wolf levi.john.wolf@bristol.ac.uk"

from scipy.sparse import csgraph as cg
from scipy.optimize import OptimizeWarning
from scipy.spatial.distance import cdist
from collections import namedtuple
from warnings import warn
from libpysal.weights import w_subset
from .utils import set_endog
import time
import numpy as np
import copy

try:
    from sklearn.metrics import euclidean_distances
except ImportError:
    from scipy.spatial.distance import pdist, cdist, squareform

    def euclidean_distances(X, Y=None):
        """
        fallback function to compute pairwise euclidean distances
        for a single input, or point-to-point euclidean distances
        for two inputs.
        """
        if Y is None:
            return squareform(pdist(X))
        else:
            return cdist(X, Y)
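
# A quick sanity sketch of the fallback above (values illustrative only):
# both code paths return an (n, n) matrix of pairwise distances for a single
# input and an (n, m) matrix of point-to-point distances for two inputs.
#
#   >>> pts = np.array([[0.0, 0.0], [3.0, 4.0]])
#   >>> euclidean_distances(pts)
#   array([[0., 5.],
#          [5., 0.]])
#   >>> euclidean_distances(pts, np.array([[0.0, 0.0]]))
#   array([[0.],
#          [5.]])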


__all__ = ["Skater_reg"]

deletion = namedtuple("deletion", ("in_node", "out_node", "score"))


class Skater_reg(object):
    """
    Initialize the Skater_reg algorithm based on :cite:`Anselin2021`.
    The function can currently estimate OLS, from spreg or statsmodels,
    and Spatial Lag models from spreg.
    The fit method performs estimation and returns a Skater_reg object.

    Parameters
    ----------
    dissimilarity : a callable distance metric.
                    Default: sklearn.metrics.pairwise.euclidean_distances
    affinity      : a callable affinity metric between 0,1.
                    Will be inverted to provide a dissimilarity metric.
    reduction     : the reduction applied over all clusters
                    to provide the map score.
                    Default: numpy.sum
    center        : way to compute the center of each region in attribute space.
                    Default: numpy.mean

    NOTE: Optimization occurs with respect to a *dissimilarity* metric, so the
    reduction should yield some kind of score where larger values are *less
    desirable* than smaller values. Typically, this means we use addition.

    Attributes
    ----------
    metric                   : callable
                               dissimilarity metric used to compare observations,
                               built from `dissimilarity` or from an inverted
                               `affinity`
    reduction                : callable
                               reduction applied over all clusters to provide
                               the map score
    center                   : callable
                               way to compute the center of each region in
                               attribute space
    current_labels_          : array
                               (N,) flat vector with the region label of each
                               observation, set after fit
    minimum_spanning_forest_ : sparse matrix
                               (N,N) adjacency matrix of the pruned minimum
                               spanning forest, set after fit
    _trace                   : list
                               intermediate labelings and deletions stored while
                               the tree is pruned, when fit is called with
                               trace=True

    Examples
    --------
    >>> import libpysal as ps
    >>> import numpy as np
    >>> import spreg
    >>> from spreg.skater_reg import Skater_reg
    >>> data = ps.io.open(ps.examples.get_path('columbus.dbf'))
    >>> y = np.array(data.by_col('HOVAL')).reshape((-1,1))
    >>> x_var = ['INC','CRIME']
    >>> x = np.array([data.by_col(name) for name in x_var]).T
    >>> w = ps.weights.Queen.from_shapefile(ps.examples.get_path("columbus.shp"))
    >>> x_std = (x - np.mean(x,axis=0)) / np.std(x,axis=0)
    >>> results = Skater_reg().fit(3, w, x_std, {'reg':spreg.OLS,'y':y,'x':x}, quorum=10, trace=False)
    >>> results.current_labels_
    array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
           0, 1, 0, 0, 2, 2, 0, 0, 1, 0, 2, 1, 2, 1, 2, 0, 1, 0, 0, 1, 2, 2,
           2, 1, 0, 2, 2], dtype=int32)
    """
    def __init__(
        self,
        dissimilarity=euclidean_distances,
        affinity=None,
        reduction=np.sum,
        center=np.mean,
    ):
        if affinity is not None:
            # invert the (0,1] affinity to an unbounded
            # positive dissimilarity
            metric = lambda x: -np.log(affinity(x))
        else:
            metric = dissimilarity
        self.metric = metric
        self.reduction = reduction
        self.center = center
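
    # A sketch of the affinity inversion above, assuming a hypothetical
    # Gaussian-kernel affinity built from this module's euclidean_distances;
    # affinities near 1 (similar) map to dissimilarities near 0, and
    # affinities near 0 map to large positive dissimilarities:
    #
    #   >>> gauss_affinity = lambda X: np.exp(-euclidean_distances(X) ** 2)
    #   >>> model = Skater_reg(affinity=gauss_affinity)
    #   >>> model.metric(np.array([[0.0], [1.0]]))  # equals -log(affinity)
    #   array([[-0.,  1.],
    #          [ 1., -0.]])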
    def __repr__(self):
        return "Skater_reg_object(metric = {}, reduction = {}, center = {})".format(
            self.metric, self.reduction, self.center
        )
    def fit(
        self,
        n_clusters,
        W,
        data=None,
        data_reg=None,
        quorum=-np.inf,
        trace=True,
        islands="increase",
        verbose=False,
        model_family="spreg",
    ):
        """
        Method that fits a model with a particular estimation routine.

        Parameters
        ----------
        n_clusters      : int
                          number of clusters wanted
        W               : pysal W object
                          expressing the neighbor relationships between
                          observations. Should be symmetric and binary, so
                          Queen/Rook, DistanceBand, or a symmetrized KNN.
        data            : np.ndarray of (N,P) shape
                          with N observations and P features. This is the data
                          that is used to evaluate the similarity between each
                          observation.
        data_reg        : dict containing:
                          'reg': a callable regression method (ex. OLS or GM_Lag
                          from spreg, or OLS from statsmodels)
                          'y': np.ndarray of (N,1) shape with N observations on
                          the dependent variable for the regression
                          'x': np.ndarray of (N,k) shape with N observations and
                          k columns containing the explanatory variables
                          (constant must not be included)
                          'w': pysal W object to be used in the regression
                          (optional)
        quorum          : int
                          minimum size of each region.
        trace           : bool
                          denoting whether to store intermediate labelings as
                          the tree gets pruned
        islands         : string
                          describing what to do with islands. If "ignore", will
                          discover `n_clusters` regions, treating islands as
                          their own regions. If "increase", will discover
                          `n_clusters` regions, treating islands as separate
                          from n_clusters.
        verbose         : bool/int
                          describing how much output to provide to the user, in
                          terms of print statements or progressbars.
        model_family    : string
                          describing the family of estimation method used for
                          the regression. Must be either 'spreg' (default) or
                          'statsmodels'

        Returns
        -------
        self            : Skater_reg object.
        """
        if trace:
            self._trace = []
        if data is None:
            attribute_kernel = np.ones((W.n, W.n))
            data = np.ones((W.n, 1))
        else:
            attribute_kernel = self.metric(data)
        W.transform = "b"
        W = W.sparse
        start = time.time()

        super_verbose = verbose > 1
        start_W = time.time()
        dissim = W.multiply(attribute_kernel)
        dissim.eliminate_zeros()
        end_W = time.time() - start_W

        if super_verbose:
            print("Computing Affinity Kernel took {:.2f}s".format(end_W))

        tree_time = time.time()
        MSF = cg.minimum_spanning_tree(dissim)
        tree_time = time.time() - tree_time
        if super_verbose:
            print("Computing initial MST took {:.2f}s".format(tree_time))

        initial_component_time = time.time()
        current_n_subtrees, current_labels = cg.connected_components(
            MSF, directed=False
        )
        initial_component_time = time.time() - initial_component_time

        if super_verbose:
            print(
                "Computing connected components took {:.2f}s.".format(
                    initial_component_time
                )
            )

        if current_n_subtrees > 1:
            island_warnings = [
                "Increasing `n_clusters` from {} to {} in order to account for islands.".format(
                    n_clusters, n_clusters + current_n_subtrees
                ),
                "Counting islands towards the remaining {} clusters.".format(
                    n_clusters - current_n_subtrees
                ),
            ]
            ignoring_islands = int(islands.lower() == "ignore")
            chosen_warning = island_warnings[ignoring_islands]
            warn(
                "By default, the graph is disconnected! {}".format(chosen_warning),
                OptimizeWarning,
                stacklevel=2,
            )
            if not ignoring_islands:
                n_clusters += current_n_subtrees
            _, island_populations = np.unique(current_labels, return_counts=True)
            if (island_populations < quorum).any():
                raise ValueError(
                    "Islands must be larger than the quorum. If not, drop the "
                    "small islands and solve for clusters in the remaining field."
                )
        if trace:
            self._trace.append(([], deletion(np.nan, np.nan, np.inf)))
            if super_verbose:
                print(self._trace[-1])
        trees_scores = None
        prev_score = np.inf
        while current_n_subtrees < n_clusters:  # while we don't have enough regions
            (
                best_deletion,
                trees_scores,
                new_MSF,
                current_n_subtrees,
                current_labels,
            ) = self.find_cut(
                MSF,
                data,
                data_reg,
                current_n_subtrees,
                current_labels,
                quorum=quorum,
                trees_scores=trees_scores,
                labels=None,
                target_label=None,
                verbose=verbose,
                model_family=model_family,
            )

            if np.isfinite(best_deletion.score):  # if our search succeeds
                # accept the best move as *the* move
                if super_verbose:
                    print("cut made {}...".format(best_deletion))
                if best_deletion.score > prev_score:
                    raise ValueError(
                        "The score increased with the number of clusters. "
                        "Please check your data."
                    )
                prev_score = best_deletion.score
                MSF = new_MSF
            else:  # otherwise, it means the MSF admits no further cuts
                prev_n_subtrees, _ = cg.connected_components(MSF, directed=False)
                warn(
                    "MSF contains no valid moves after finding {} subtrees. "
                    "Decrease the size of your quorum to find the remaining "
                    "{} subtrees.".format(
                        prev_n_subtrees, n_clusters - prev_n_subtrees
                    ),
                    OptimizeWarning,
                    stacklevel=2,
                )
            if trace:
                self._trace.append((current_labels, best_deletion))

        self.current_labels_ = current_labels
        self.minimum_spanning_forest_ = MSF
        self._elapsed_time = time.time() - start
        return self
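
    # Usage sketch for ``fit`` with a spreg spatial lag model, reusing the
    # variable names from the class docstring example (y, x, x_std, w).
    # Passing 'w' in ``data_reg`` makes the scoring step re-subset the
    # weights for each candidate region via ``w_subset``:
    #
    #   >>> results = Skater_reg().fit(
    #   ...     3, w, x_std,
    #   ...     {'reg': spreg.GM_Lag, 'y': y, 'x': x, 'w': w},
    #   ...     quorum=10, trace=False)
    #   >>> results.current_labels_  # one region label per observation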
    def score_spreg(
        self,
        data=None,
        data_reg=None,
        all_labels=None,
        quorum=-np.inf,
        current_labels=None,
        current_tree=None,
    ):
        """
        This yields a score for the data using methods from the spreg library,
        given the labels provided. If no labels are provided, and the object
        has been fit, then the labels discovered from the previous fit are
        used.

        If a quorum is not passed, it is assumed to be irrelevant.

        If a quorum is passed and the labels do not meet quorum, the score is
        inf.

        data            : (N,P) array of data on which to compute the score of
                          the regions expressed in labels
        data_reg        : dict containing:
                          'reg': a callable spreg regression method (ex. OLS or
                          GM_Lag)
                          'y': np.ndarray of (N,1) shape with N observations on
                          the dependent variable for the regression
                          'x': np.ndarray of (N,k) shape with N observations and
                          k columns containing the explanatory variables
                          (constant must not be included)
                          'w': pysal W object to be used in the regression
                          (optional)
        all_labels      : (N,) flat vector of labels expressing the
                          classification of each observation into a region
                          considering the cut under evaluation.
        quorum          : int expressing the minimum size of regions. Can be
                          -inf if there is no lower bound. Any region below
                          quorum makes the score inf.
        current_labels  : (N,) flat vector of labels expressing the
                          classification of each observation into a region not
                          considering the cut under evaluation.
        current_tree    : integer indicating the label of the tree currently
                          being considered for division
        """
        labels, subtree_quorums = self._prep_score(
            all_labels, current_tree, current_labels
        )
        if (subtree_quorums < quorum).any():
            return np.inf, None
        set_labels = set(labels)
        if data_reg is not None:
            kargs = {
                k: v
                for k, v in data_reg.items()
                if k not in ["reg", "y", "x", "w", "x_nd"]
            }
            trees_scores = {}
            l_arrays = np.array(all_labels)
            w_regi_i = None
            for l in set_labels:
                x = data_reg["x"][all_labels == l]
                if np.linalg.matrix_rank(x) < x.shape[1]:
                    # drop columns flagged as collinear by near-zero
                    # diagonal entries of the QR factor R
                    x = np.delete(
                        x, np.nonzero(np.abs(np.diag(np.linalg.qr(x)[1])) < 1e-10), 1
                    )
                if "w" not in data_reg:
                    try:
                        x = np.hstack((np.ones((x.shape[0], 1)), x))
                        reg = data_reg["reg"](
                            y=data_reg["y"][all_labels == l], x=x, **kargs
                        )
                    except np.linalg.LinAlgError:
                        x = np.delete(x, np.nonzero(np.ptp(x, axis=0) == 0), 1)
                        x = np.hstack((np.ones((x.shape[0], 1)), x))
                        reg = data_reg["reg"](
                            y=data_reg["y"][all_labels == l], x=x, **kargs
                        )
                else:
                    regi_ids = list(np.where(l_arrays == l)[0])
                    w_ids = list(map(data_reg["w"].id_order.__getitem__, regi_ids))
                    w_regi_i = w_subset(data_reg["w"], w_ids, silence_warnings=True)
                    try:
                        x = np.hstack((np.ones((x.shape[0], 1)), x))
                        reg = data_reg["reg"](
                            y=data_reg["y"][all_labels == l], x=x, w=w_regi_i, **kargs
                        )
                    except np.linalg.LinAlgError:
                        x = np.delete(x, np.nonzero(np.ptp(x, axis=0) == 0), 1)
                        reg = data_reg["reg"](
                            y=data_reg["y"][all_labels == l], x=x, w=w_regi_i, **kargs
                        )
                trees_scores[l] = np.sum(reg.u ** 2)
            score = sum(trees_scores.values())
        else:
            part_scores, score, trees_scores = self._data_reg_none(
                data, all_labels, set_labels
            )
        return score, trees_scores
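
    # The score above is the sum, over regions, of each region's sum of
    # squared residuals (``reg.u`` holds the residuals of a fitted spreg
    # model). A minimal sketch of one region's contribution, mirroring the
    # loop body and assuming ``y``, ``x`` and a hypothetical boolean mask
    # ``in_region`` selecting that region's observations:
    #
    #   >>> x_r = np.hstack((np.ones((x[in_region].shape[0], 1)), x[in_region]))
    #   >>> reg = spreg.OLS(y=y[in_region], x=x_r)
    #   >>> float(np.sum(reg.u ** 2))  # this region's trees_scores entry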
    def score_stats(
        self,
        data=None,
        data_reg=None,
        all_labels=None,
        quorum=-np.inf,
        current_labels=None,
        current_tree=None,
    ):
        """
        This yields a score for the data using methods from the statsmodels
        library, given the labels provided. If no labels are provided, and the
        object has been fit, then the labels discovered from the previous fit
        are used.

        If a quorum is not passed, it is assumed to be irrelevant.

        If a quorum is passed and the labels do not meet quorum, the score is
        inf.

        data            : (N,P) array of data on which to compute the score of
                          the regions expressed in labels
        data_reg        : dict containing:
                          'reg': a callable statsmodels regression method
                          (ex. OLS)
                          'y': np.ndarray of (N,1) shape with N observations on
                          the dependent variable for the regression
                          'x': np.ndarray of (N,k) shape with N observations and
                          k columns containing the explanatory variables
                          (constant must not be included)
                          'w': pysal W object to be used in the regression
                          (optional)
        all_labels      : (N,) flat vector of labels expressing the
                          classification of each observation into a region
                          considering the cut under evaluation.
        quorum          : int expressing the minimum size of regions. Can be
                          -inf if there is no lower bound. Any region below
                          quorum makes the score inf.
        current_labels  : (N,) flat vector of labels expressing the
                          classification of each observation into a region not
                          considering the cut under evaluation.
        current_tree    : integer indicating the label of the tree currently
                          being considered for division

        NOTE: Optimization occurs with respect to a *dissimilarity* metric, so
        the problem *minimizes* the map dissimilarity. So, lower scores are
        better.
        """
        labels, subtree_quorums = self._prep_score(
            all_labels, current_tree, current_labels
        )
        if (subtree_quorums < quorum).any():
            return np.inf, None
        set_labels = set(labels)
        if data_reg is not None:
            kargs = {
                k: v
                for k, v in data_reg.items()
                if k not in ["reg", "y", "x", "w", "x_nd"]
            }
            trees_scores = {}
            l_arrays = np.array(all_labels)
            w_regi_i = None
            for l in set_labels:
                x = data_reg["x"][all_labels == l]
                if np.linalg.matrix_rank(x) < x.shape[1]:
                    # drop columns flagged as collinear by near-zero
                    # diagonal entries of the QR factor R
                    x = np.delete(
                        x, np.nonzero(np.abs(np.diag(np.linalg.qr(x)[1])) < 1e-10), 1
                    )
                try:
                    x = np.hstack((np.ones((x.shape[0], 1)), x))
                    reg = data_reg["reg"](
                        data_reg["y"][all_labels == l], x, **kargs
                    ).fit()
                except np.linalg.LinAlgError:
                    x = np.delete(x, np.nonzero(np.ptp(x, axis=0) == 0), 1)
                    x = np.hstack((np.ones((x.shape[0], 1)), x))
                    reg = data_reg["reg"](
                        data_reg["y"][all_labels == l], x, **kargs
                    ).fit()
                trees_scores[l] = np.sum(reg.resid ** 2)
            score = sum(trees_scores.values())
        else:
            part_scores, score, trees_scores = self._data_reg_none(
                data, all_labels, set_labels
            )
        return score, trees_scores
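
    # The statsmodels path mirrors score_spreg, but calls the estimator
    # positionally, calls ``.fit()``, and reads residuals from ``.resid``.
    # A minimal sketch for one region, assuming ``y``, ``x`` and a
    # hypothetical boolean mask ``in_region``:
    #
    #   >>> import statsmodels.api as sm
    #   >>> x_r = np.hstack((np.ones((x[in_region].shape[0], 1)), x[in_region]))
    #   >>> reg = sm.OLS(y[in_region], x_r).fit()
    #   >>> float(np.sum(reg.resid ** 2))  # this region's trees_scores entry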
    def _prep_score(self, all_labels, current_tree, current_labels):
        if all_labels is None:
            try:
                all_labels = self.current_labels_
            except AttributeError:
                raise ValueError(
                    "Labels not provided and MSF_Prune object has not been fit to data yet."
                )
        labels = all_labels
        if current_tree is not None:
            labels = all_labels[current_labels == current_tree]
        _, subtree_quorums = np.unique(labels, return_counts=True)
        return labels, subtree_quorums

    def _data_reg_none(self, data, all_labels, set_labels):
        assert data.shape[0] == len(all_labels), (
            "Length of label array ({}) does not match "
            "length of data ({})! ".format(all_labels.shape[0], data.shape[0])
        )
        part_scores = [
            self.reduction(
                self.metric(
                    X=data[all_labels == l],
                    Y=self.center(data[all_labels == l], axis=0).reshape(1, -1),
                )
            )
            for l in set_labels
        ]

        score = self.reduction(part_scores).item()
        trees_scores = {l: part_scores[i] for i, l in enumerate(set_labels)}
        return part_scores, score, trees_scores
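
    # When no ``data_reg`` is passed, the score is purely attribute-based:
    # for each region, the reduction (default ``np.sum``) of each member's
    # distance to the region's center (default ``np.mean``). A sketch of a
    # single region's part score under the class defaults, assuming ``data``,
    # ``all_labels`` and a region label ``l``:
    #
    #   >>> region = data[all_labels == l]
    #   >>> center = region.mean(axis=0).reshape(1, -1)
    #   >>> part_score = euclidean_distances(region, center).sum()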
    def find_cut(
        self,
        MSF,
        data=None,
        data_reg=None,
        current_n_subtrees=None,
        current_labels=None,
        quorum=-np.inf,
        trees_scores=None,
        labels=None,
        target_label=None,
        make=False,
        verbose=False,
        model_family="spreg",
    ):
        """
        Find the best cut from the MSF.

        MSF                 : (N,N) scipy sparse matrix with zero elements
                              removed. Represents the adjacency matrix for the
                              minimum spanning forest. Constructed from
                              sparse.csgraph.sparse_from_dense or using
                              MSF.eliminate_zeros(). You MUST remove zero
                              entries for this to work, otherwise they are
                              considered no-cost paths.
        data                : (N,p) attribute matrix. If not provided, replaced
                              with (N,1) vector of ones.
        data_reg            : optional dict containing:
                              'reg': a callable spreg or statsmodels regression
                              method (ex. OLS or GM_Lag)
                              'y': np.ndarray of (N,1) shape with N observations
                              on the dependent variable for the regression
                              'x': np.ndarray of (N,k) shape with N observations
                              and k columns containing the explanatory variables
                              (constant must not be included)
                              'w': pysal W object to be used in the regression
                              (optional)
        current_n_subtrees  : integer indicating the current number of subtrees.
        current_labels      : (N,) flat vector of labels expressing the
                              classification of each observation into a region
                              not considering the cut under evaluation.
        quorum              : int denoting the minimum number of elements in
                              the region
        trees_scores        : dictionary mapping each subtree's label to its
                              current score.
        labels              : (N,) flat vector of labels for each point.
                              Represents the "cluster labels" for disconnected
                              components of the graph.
        target_label        : int from the labels array to subset the MSF. If
                              passed along with `labels`, then a cut will be
                              found that is restricted to that subset of the
                              MSF.
        make                : bool, whether or not to modify the input MSF in
                              order to make the best cut that was found.
        verbose             : bool/int, denoting how much output to provide to
                              the user, in terms of print statements or
                              progressbars.
        model_family        : string describing the family of estimation method
                              used for the regression. Must be either 'spreg'
                              or 'statsmodels'.

        Returns a namedtuple with in_node, out_node, and score.
        """
        if data is None:
            data = np.ones(MSF.shape)
        if (labels is None) != (target_label is None):
            raise ValueError(
                "Both labels and target_label must be supplied! Only {} provided.".format(
                    ["labels", "target_label"][int(target_label is None)]
                )
            )
        if verbose:
            try:
                from tqdm import tqdm
            except ImportError:

                def tqdm(noop, desc=""):
                    return noop

        else:

            def tqdm(noop, desc=""):
                return noop

        zero_in = (labels is not None) and (target_label is not None)
        best_deletion = deletion(np.nan, np.nan, np.inf)
        best_d_score = -np.inf
        try:
            old_score = sum(trees_scores.values())
        except:  # trees_scores is None before the first cut
            pass
        best_scores = {}
        current_list = current_labels.tolist()
        for in_node, out_node in tqdm(
            np.vstack(MSF.nonzero()).T, desc="finding cut..."
        ):  # iterate over MSF edges
            if zero_in:
                if labels[in_node] != target_label:
                    continue

            local_MSF = copy.deepcopy(MSF)
            # delete a candidate edge
            local_MSF[in_node, out_node] = 0
            local_MSF.eliminate_zeros()
            current_tree = current_labels[in_node]

            # get the connected components
            local_n_subtrees, local_labels = cg.connected_components(
                local_MSF, directed=False
            )

            if local_n_subtrees <= current_n_subtrees:
                raise Exception("Malformed MSF!")

            # compute the score of these components
            if model_family == "spreg":
                new_score, new_trees_scores = self.score_spreg(
                    data, data_reg, local_labels, quorum, current_labels, current_tree
                )
            elif model_family == "statsmodels":
                new_score, new_trees_scores = self.score_stats(
                    data, data_reg, local_labels, quorum, current_labels, current_tree
                )
            else:
                raise ValueError("Model family must be either spreg or statsmodels.")

            if np.isfinite(new_score):
                try:
                    d_score = trees_scores[current_tree] - new_score
                    score = old_score - d_score
                except:  # trees_scores is None before the first cut
                    d_score = -new_score
                    score = new_score
                # if the d_score is greater than the best score and quorum is met
                if d_score > best_d_score:
                    best_deletion = deletion(in_node, out_node, score)
                    best_d_score = d_score
                    try:
                        for i in set(current_labels):
                            best_scores[
                                local_labels[current_list.index(i)]
                            ] = trees_scores[i]
                        for i in new_trees_scores:
                            best_scores[i] = new_trees_scores[i]
                    except:  # trees_scores is None before the first cut
                        best_scores = new_trees_scores
                    best_MSF = local_MSF
                    best_labels = local_labels
        try:
            return best_deletion, best_scores, best_MSF, local_n_subtrees, best_labels
        except UnboundLocalError:  # in case no solution is found
            return deletion(None, None, np.inf), np.inf, None, np.inf, None
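
    # One iteration of the loop above as a standalone sketch: drop a single
    # MSF edge and confirm that the number of connected components grows.
    # ``MSF`` is assumed to come from ``cg.minimum_spanning_tree`` with zeros
    # eliminated, as in ``fit``:
    #
    #   >>> in_node, out_node = np.vstack(MSF.nonzero()).T[0]
    #   >>> local_MSF = copy.deepcopy(MSF)
    #   >>> local_MSF[in_node, out_node] = 0
    #   >>> local_MSF.eliminate_zeros()
    #   >>> cg.connected_components(local_MSF, directed=False)[0]  # one more subtree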
def _const_x(x):
    # drop constant columns, then prepend an intercept column
    x = np.delete(x, np.nonzero(np.ptp(x, axis=0) == 0), 1)
    x = np.hstack((np.ones((x.shape[0], 1)), x))
    return x