Source code for spopt.region.region_k_means

"""
Region k-means


K-means with the constraint that all clusters form a spatially connected component.
"""

__author__ = "Serge Rey"
__email__ = "sjsrey@gmail.com"


from collections import defaultdict

import numpy

from ..BaseClass import BaseSpOptHeuristicSolver
from .base import (
    _centroid,
    _closest,
    _seeds,
    move_ok,
    ok_moves,
    region_neighbors,
    w_to_g,
)


def region_k_means(X, n_clusters, w, drop_islands=True, seed=0):  # noqa: N803
    """Solve the region-K-means problem with the constraint
    that each cluster forms a spatially connected component.

    Parameters
    ----------

    X : {numpy.ndarray, list}
        The observations to cluster shaped ``(n_samples, n_features)``.
    n_clusters : int
        The number of clusters to form.
    w : libpysal.weights.W
        Weights object created from given data.
    drop_islands : bool
        Drop observations that are islands (``True``) or keep them (``False``).
        Default is ``True``.
    seed : int
        Random state to pass into ``_seeds()``. Default is ``0``.

    Returns
    -------

    label : numpy.ndarray
        Integer array with shape ``(n_samples,)``, where ``label[i]`` is the
        code or index of the centroid the ``i``th observation is closest to.
    centroid : numpy.ndarray
       Floating point array of centroids in the shape of ``(k, n_features)``
       found at the last iteration of ``region_k_means``.
    iters : int
        The number of iterations for the reassignment phase.

    """

    data = X
    a_list = w.to_adjlist(remove_symmetric=False, drop_islands=drop_islands)
    areas = numpy.arange(w.n).astype(int)
    k = n_clusters
    seeds = _seeds(areas, k, seed)

    # initial assignment phase
    label = numpy.array([-1] * w.n).astype(int)
    for i, seed in enumerate(seeds):
        label[seed] = i
    to_assign = areas[label == -1]
    while to_assign.size > 0:
        assignments = defaultdict(list)
        for rid in range(k):
            region = areas[label == rid]
            neighbors = region_neighbors(a_list, region)
            neighbors = [j for j in neighbors if j in to_assign]
            if neighbors:
                d_min = numpy.inf
                centroid = data[region].mean(axis=0)
                for neighbor in neighbors:
                    d = ((data[neighbor] - centroid) ** 2).sum()
                    if d < d_min:
                        idx = neighbor
                        d_min = d
                assignments[idx].append([rid, d_min])
        for key in assignments:
            assignment = assignments[key]
            if len(assignment) == 1:
                r, d = assignment[0]
                label[key] = r
            else:
                d_min = numpy.inf
                for match in assignment:
                    r, d = match
                    if d < d_min:
                        idx = r
                        d_min = d
                label[key] = idx

        to_assign = areas[label == -1]

    # reassignment phase
    g = w_to_g(w)

    iters = 1

    # want to loop this until candidates is empty
    regions = [areas[label == r].tolist() for r in range(k)]
    centroid = _centroid(regions, data)
    closest = numpy.array(_closest(data, centroid))
    candidates = areas[closest != label]
    candidates = ok_moves(candidates, regions, label, closest, g, w, areas)
    while candidates:
        area = candidates.pop()
        # need to check move doesn't break component
        source = areas[label == label[area]]
        destination = areas[label == closest[area]]
        if move_ok(area, source, destination, g, w):
            # make move and update assignments, centroids, closest, candidates
            label[area] = closest[area]
            regions = [areas[label == r].tolist() for r in range(k)]
            centroid = _centroid(regions, data)
            closest = numpy.array(_closest(data, centroid))
            candidates = areas[closest != label]
            candidates = ok_moves(candidates, regions, label, closest, g, w, areas)
        iters += 1

    return centroid, label, iters



[docs]
class RegionKMeansHeuristic(BaseSpOptHeuristicSolver):
    """Solve the region-K-means problem with the constraint
    that each cluster forms a spatially connected component.


    Parameters
    ----------

    data : {numpy.ndarray, list}, required
        The observations to cluster shaped ``(n_samples, n_features)``.
    n_clusters : int
        The number of clusters to form.
    w : libpysal.weights.W, required
        Weights object created from given data.
    drop_islands : bool
        Drop observations that are islands (``True``) or keep them (``False``).
        Default is ``True``.
    seed : int
        Random state to pass into ``_seeds()``. Default is ``0``.

    Attributes
    ----------

    labels_ : numpy.array
        Region IDs for observations.
    centroids_ : numpy.ndarray
       Floating point array of centroids in the shape of ``(k, n_features)``
       found at the last iteration of ``region_k_means``.
    iters_ : int
        The number of iterations for the reassignment phase.

    """


[docs]
    def __init__(self, data, n_clusters, w, drop_islands=True, seed=0):
        self.data = data
        self.w = w
        self.n_clusters = n_clusters
        self.drop_islands = drop_islands
        self.seed = seed



[docs]
    def solve(self):
        """Solve the region k-means heuristic."""
        centroid, label, iters = region_k_means(
            self.data,
            self.n_clusters,
            self.w,
            drop_islands=self.drop_islands,
            seed=self.seed,
        )

        self.labels_ = label
        self.centroids_ = centroid
        self.iters_ = iters