Clustering and Regions

The previous notebook provided several illustrations of the power of visualization in the analysis of spatial data. This power stems from visualizations ability to tap into our human pattern recognition machinery.

In this notebook we introduce methods for regionalization and clustering.

Imports

import pandas as pd
import geopandas as gpd
import libpysal as lp
import matplotlib.pyplot as plt
import rasterio as rio
import numpy as np
import contextily as ctx
import shapely.geometry as geom
%matplotlib inline

neighborhoods = gpd.read_file('../data/neighborhoods.gpkg')
# was created in previous notebook with neighborhoods.to_file('data/neighborhoods.gpkg')
listings = gpd.read_file('../data/listings.gpkg')
# was created in previous notebook with listings.to_file('data/neighborhoods.gpkg')

listings['price'] = listings.price.str.replace('$','').str.replace(',','_').astype(float)

neighborhoods.head()

summaries = gpd.sjoin(listings[['price', 'accommodates', 'geometry']], neighborhoods, op='within')\
                .eval('price_per_head = price / accommodates')\
                .groupby('index_right')\
                .agg(dict(price_per_head='median',
                          price='median'))

neighborhoods['median_pph'] = summaries.price_per_head
neighborhoods['median_pri'] = summaries.price

f,ax = plt.subplots(1,2,figsize=(8,3))
neighborhoods.plot(column='median_pri', ax=ax[0])
neighborhoods.plot(column='median_pph', ax=ax[1])
ax[0].set_title('Median Price')
ax[1].set_title('Median Price per Head')

import libpysal

wq = libpysal.weights.Queen.from_dataframe(neighborhoods)

from sklearn.cluster import KMeans, AgglomerativeClustering

kmeans = KMeans(n_clusters=5)

import numpy
numpy.random.seed(0)
cluster_variables = ['median_pri']
k5cls = kmeans.fit(neighborhoods[cluster_variables])

k5cls.labels_

# Assign labels into a column
neighborhoods['k5cls'] = k5cls.labels_
# Setup figure and ax
f, ax = plt.subplots(1, figsize=(9, 9))
# Plot unique values choropleth including a legend and with no boundary lines
neighborhoods.plot(column='k5cls', categorical=True, legend=True, linewidth=0, ax=ax)
# Remove axis
ax.set_axis_off()
# Keep axes proportionate
plt.axis('equal')
# Add title
plt.title(r'Price Clusters (k-means, $k=5$)')
# Display the map
plt.show()

k5sizes = neighborhoods.groupby('k5cls').size()
k5sizes

k5means = neighborhoods.groupby('k5cls')[cluster_variables].mean()

k5means.T

import numpy
numpy.random.seed(0)

ward5 = AgglomerativeClustering(linkage='ward', n_clusters=5)
ward5.fit(neighborhoods[cluster_variables])
neighborhoods['ward5'] = ward5.labels_

ward5sizes = neighborhoods.groupby('ward5').size()
ward5sizes

ward5means = neighborhoods.groupby('ward5')[cluster_variables].mean()
ward5means.T

# Setup figure and ax
f, ax = plt.subplots(1, figsize=(9, 9))
# Plot unique values choropleth including a legend and with no boundary lines
neighborhoods.plot(column='ward5', categorical=True, legend=True, linewidth=0, ax=ax)
# Remove axis
ax.set_axis_off()
# Keep axes proportionate
plt.axis('equal')
# Add title
plt.title('Price Clusters (AHC, $k=5$)')
# Display the map
plt.show()

# Setup figure and ax
f, axs = plt.subplots(1, 2, figsize=(12, 6))

ax = axs[0]
# Plot unique values choropleth including a legend and with no boundary lines
neighborhoods.plot(column='ward5', categorical=True, cmap='Set2', 
                   legend=True, linewidth=0, ax=ax)
# Remove axis
ax.set_axis_off()
# Keep axes proportionate
ax.axis('equal')
# Add title
ax.set_title('K-Means solution ($k=5$)')

ax = axs[1]
# Plot unique values choropleth including a legend and with no boundary lines
neighborhoods.plot(column='k5cls', categorical=True, cmap='Set3',
                   legend=True, linewidth=0, ax=ax)
# Remove axis
ax.set_axis_off()
# Keep axes proportionate
ax.axis('equal')
# Add title
ax.set_title('AHC solution ($k=5$)')

# Display the map
plt.show()

numpy.random.seed(123456)
model = AgglomerativeClustering(linkage='ward',
                                connectivity=wq.sparse,
                                n_clusters=5)
model.fit(neighborhoods[cluster_variables])

neighborhoods['ward5wq'] = model.labels_
# Setup figure and ax
f, ax = plt.subplots(1, figsize=(9, 9))
# Plot unique values choropleth including a legend and with no boundary lines
neighborhoods.plot(column='ward5wq', categorical=True, legend=True, linewidth=0, ax=ax)
# Remove axis
ax.set_axis_off()
# Keep axes proportionate
plt.axis('equal')
# Add title
plt.title(r'Price Regions (Ward, $k=5$, Queen Contiguity)')
# Display the map
plt.show()

Spatial Clustering Based on Points

listings.shape

clipped_thiessens = gpd.read_file('../data/thiessens.gpkg')

clipped_thiessens.shape

wtq = libpysal.weights.Queen.from_dataframe(clipped_thiessens)

wtq.n

numpy.random.seed(123456)

logprice = numpy.log(listings[['price']]+1)

model = AgglomerativeClustering(linkage='ward',
                                            connectivity=wtq.sparse,
                                            n_clusters=5)
model.fit(logprice)

listings['region'] = model.labels_

numpy.unique(model.labels_, return_counts=True)

listings.sort_values('region').plot(column='region', marker='.')

region_sizes = listings.groupby('region').size()
region_sizes

model = AgglomerativeClustering(linkage='ward',n_clusters=5)
model.fit(logprice)

listings['region5'] = model.labels_

listings.groupby('region5').size()

region1 = listings[listings.region5==1]

region1.plot()

for region in range(5):
    region_ = listings[listings.region5==region]
    region_.plot()
    print(region_.shape)

region_.shape