Source code for spopt.region.spenc

# ruff: noqa: C408, B006, N806


from ..BaseClass import BaseSpOptHeuristicSolver
from .spenclib import SPENC



[docs]
class Spenc(BaseSpOptHeuristicSolver):
    """
    Spatially encouraged spectral clustering found in :cite:`wolf2018`.

    Apply clustering to a projection of the normalized laplacian, using
    spatial information to constrain the clustering. In practice Spectral
    Clustering is very useful when the structure of the individual clusters
    is highly non-convex or more generally when a measure of the center and
    spread of the cluster is not a suitable description of the complete cluster.
    For instance when clusters are nested circles on the 2D plan.

    Spatially-Encouraged Spectral Clustering (*SPENC*) is useful for when
    there may be highly non-convex clusters or clusters with irregular
    topology in a geographic context. If a binary weights matrix is provided
    during fit, this method can be used to find weighted normalized graph cuts.

    When calling ``fit``, an affinity matrix is constructed using either
    kernel function such the Gaussian (aka RBF) kernel of the euclidean
    distanced :math:`d(X, X)`::

        numpy.exp(-gamma * d(X,X) ** 2)

    or a :math:`k`-nearest neighbors connectivity matrix. Alternatively,
    using ``precomputed``, a user-provided affinity matrix can be used.
    Read more in the ``scikit-learn`` user guide on spectral clustering.

    """


[docs]
    def __init__(
        self,
        gdf,
        w,
        attrs_name,
        n_clusters=5,
        random_state=None,
        gamma=1,
        eigen_solver=None,
        n_init=10,
        affinity="rbf",
        n_neighbors=10,
        eigen_tol=1e-9,
        assign_labels="discretize",
        degree=3,
        coef0=1,
        kernel_params=None,
        n_jobs=1,
    ):
        """

        Parameters
        ----------

        gdf : geopandas.GeoDataFrame
            Input data.
        w : libpywal.weights.W
            Spatial weights matrix.
        attrs_name : list
            Strings for attribute names from columns in ``gdf``.
        n_clusters : int (default 5)
            The number of clusters to form.
        random_state : int or numpy.random.RandomState (default None)
            A pseudo random number generator used for the initialization of the lobpcg
            eigen vectors decomposition when ``eigen_solver='amg'`` and by the
            :math:`k`-Means initialization.  If ``int``, ``random_state`` is the seed
            used by the random number generator; If ``numpy.random.RandomState``,
            ``random_state`` is the random number generator; If ``None``,
            the random number generator is the numpy.random.RandomState
            instance used by ``numpy.random``.
        gamma : int, float (default 1)
            Kernel coefficient for rbf, poly, sigmoid, laplacian and chi2 kernels.
            Ignored for ``affinity='nearest_neighbors'``.
        eigen_solver : str (default None)
            The eigenvalue decomposition strategy to use. Valid values include
            ``{'arpack', 'lobpcg', 'amg'}``. AMG requires ``pyamg`` to be installed,
            which may be faster on very large, sparse problems, but may also lead to
            instabilities. *Note* – ``eigen_solver`` is ignored unless fitting using
            the ``breakme`` flag in the ``.fit()`` method (so do not use then).
        n_init : int (default 10)
            The number of times the :math:`k`-means algorithm will be run with
            different centroid seeds. The final results will be the best output of
            ``n_init`` consecutive runs in terms of inertia.
        affinity : str, array-like, callable (default 'rbf')
            If a ``str``, valid values include
            ``{'nearest_neighbors', 'precomputed', 'rbf'}`` or one of the kernels
            supported by ``sklearn.metrics.pairwise_kernels``. Only kernels that
            produce similarity scores (non-negative values that increase with
            similarity) should be used. *This property is not checked
            by the clustering algorithm*.
        n_neighbors : int (default 10)
            The number of neighbors to use when constructing the affinity matrix using
            the nearest neighbors method. Ignored for ``affinity='rbf'``.
        eigen_tol : float (default 1e-7)
            Stopping criterion for eigen decomposition of the Laplacian matrix
            when using ``'arpack'`` as the ``eigen_solver``.
        assign_labels : str (default 'discretize')
            The strategy to use to assign labels in the embedding
            space. There are three ways to assign labels after the laplacian
            embedding: ``{'kmeans', 'discretize', 'hierarchical'}``:

            * ``'kmeans'`` can be applied and is a popular choice. But it can also be sensitive to initialization.
            * ``'discretize'`` is another approach which is less sensitive to random initialization, and which usually finds better clusters.
            * ``'hierarchical'`` decomposition repeatedly bi-partitions the graph, instead of finding the decomposition all at once, as suggested in :cite:`shi_malik_2000`.

        degree : float (default 3)
            Degree of the polynomial affinity kernel. Ignored by other kernels.
        coef0 : float (default 1)
            Zero coefficient for polynomial and sigmoid affinity kernels.
            Ignored by other kernels.
        kernel_params : dict (default None)
            Parameters (keyword arguments) and values for affinity kernel passed as
            callable object. Ignored by other affinity kernels.
        n_jobs : int (default 1)
            The number of parallel jobs to run for the nearest-neighbors
            affinity kernel, if used. If ``-1``, then the number of jobs
            is set to the number of CPU cores.

        Attributes
        ----------

        affinity_matrix_ : array-like
            Affinity matrix used for clustering in the shape of
            ``(n_samples, n_samples)``. Available only if after calling ``fit``.
        labels_ : list
            Cluster labels of each point or area.

        Notes
        -----

        If you have an affinity matrix, such as a distance matrix,
        for which ``0`` means identical elements, and high values mean
        very dissimilar elements, it can be transformed in a
        similarity matrix that is well suited for the algorithm by
        applying the Gaussian (RBF, heat) kernel::

            numpy.exp(-dist_matrix ** 2 / (2. * delta ** 2))

        Where ``delta`` is a free parameter representing the width of the Gaussian
        kernel.

        Another alternative is to take a symmetric version of the
        :math:`k`-nearest neighbors connectivity matrix of the points/areas.

        References
        ----------

        * :cite:`shi_malik_2000` Normalized cuts and image segmentation, 2000 Jianbo Shi, Jitendra Malik – https://doi.org/10.1109/34.868688
        * :cite:`von2007tutorial` A Tutorial on Spectral Clustering, 2007 Ulrike von Luxburg – https://doi.org/10.1007/s11222-007-9033-z
        * :cite:`yu_shi_2003` Multiclass spectral clustering, 2003 Stella X. Yu, Jianbo Shi – https://doi.org/10.1109/ICCV.2003.1238361

        """  # noqa: E501

        self.gdf = gdf
        self.w = w
        self.attrs_name = attrs_name
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state
        self.eigen_solver = eigen_solver
        self.n_init = n_init
        self.affinity = affinity
        self.n_neighbors = n_neighbors
        self.eigen_tol = eigen_tol
        self.assign_labels = assign_labels
        self.degree = degree
        self.coef0 = coef0
        self.kernel_params = kernel_params
        self.n_jobs = n_jobs



[docs]
    def solve(self, fit_kwargs=dict()):
        """Solve the spenc.

        Parameters
        ----------

        fit_kwargs : dict
            Keyword arguments passed into ``spenclib.abstracts.SPENC.fit()``.

        """

        data = self.gdf
        X = data[self.attrs_name].values
        model = SPENC(
            n_clusters=self.n_clusters,
            random_state=self.random_state,
            gamma=self.gamma,
            eigen_solver=self.eigen_solver,
            n_init=self.n_init,
            affinity=self.affinity,
            n_neighbors=self.n_neighbors,
            eigen_tol=self.eigen_tol,
            assign_labels=self.assign_labels,
            degree=self.degree,
            coef0=self.coef0,
            kernel_params=self.kernel_params,
            n_jobs=self.n_jobs,
        )
        model.fit(X, self.w.sparse, **fit_kwargs)
        self.labels_ = model.labels_