Clustering and Regions
The previous notebook provided several illustrations of the power of visualization in the analysis of spatial data. This power stems from visualizations ability to tap into our human pattern recognition machinery.
In this notebook we introduce methods for regionalization and clustering.
Imports
import pandas as pd
import geopandas as gpd
import libpysal as lp
import matplotlib.pyplot as plt
import rasterio as rio
import numpy as np
import contextily as ctx
import shapely.geometry as geom
%matplotlib inline
neighborhoods = gpd.read_file('../data/neighborhoods.gpkg')
# was created in previous notebook with neighborhoods.to_file('data/neighborhoods.gpkg')
listings = gpd.read_file('../data/listings.gpkg')
# was created in previous notebook with listings.to_file('data/neighborhoods.gpkg')
listings['price'] = listings.price.str.replace('$','').str.replace(',','_').astype(float)
neighborhoods.head()
summaries = gpd.sjoin(listings[['price', 'accommodates', 'geometry']], neighborhoods, op='within')\
.eval('price_per_head = price / accommodates')\
.groupby('index_right')\
.agg(dict(price_per_head='median',
price='median'))
neighborhoods['median_pph'] = summaries.price_per_head
neighborhoods['median_pri'] = summaries.price
f,ax = plt.subplots(1,2,figsize=(8,3))
neighborhoods.plot(column='median_pri', ax=ax[0])
neighborhoods.plot(column='median_pph', ax=ax[1])
ax[0].set_title('Median Price')
ax[1].set_title('Median Price per Head')
import libpysal
wq = libpysal.weights.Queen.from_dataframe(neighborhoods)
from sklearn.cluster import KMeans, AgglomerativeClustering
kmeans = KMeans(n_clusters=5)
import numpy
numpy.random.seed(0)
cluster_variables = ['median_pri']
k5cls = kmeans.fit(neighborhoods[cluster_variables])
k5cls.labels_
# Assign labels into a column
neighborhoods['k5cls'] = k5cls.labels_
# Setup figure and ax
f, ax = plt.subplots(1, figsize=(9, 9))
# Plot unique values choropleth including a legend and with no boundary lines
neighborhoods.plot(column='k5cls', categorical=True, legend=True, linewidth=0, ax=ax)
# Remove axis
ax.set_axis_off()
# Keep axes proportionate
plt.axis('equal')
# Add title
plt.title(r'Price Clusters (k-means, $k=5$)')
# Display the map
plt.show()
k5sizes = neighborhoods.groupby('k5cls').size()
k5sizes
k5means = neighborhoods.groupby('k5cls')[cluster_variables].mean()
k5means.T
import numpy
numpy.random.seed(0)
ward5 = AgglomerativeClustering(linkage='ward', n_clusters=5)
ward5.fit(neighborhoods[cluster_variables])
neighborhoods['ward5'] = ward5.labels_
ward5sizes = neighborhoods.groupby('ward5').size()
ward5sizes
ward5means = neighborhoods.groupby('ward5')[cluster_variables].mean()
ward5means.T
# Setup figure and ax
f, ax = plt.subplots(1, figsize=(9, 9))
# Plot unique values choropleth including a legend and with no boundary lines
neighborhoods.plot(column='ward5', categorical=True, legend=True, linewidth=0, ax=ax)
# Remove axis
ax.set_axis_off()
# Keep axes proportionate
plt.axis('equal')
# Add title
plt.title('Price Clusters (AHC, $k=5$)')
# Display the map
plt.show()
# Setup figure and ax
f, axs = plt.subplots(1, 2, figsize=(12, 6))
ax = axs[0]
# Plot unique values choropleth including a legend and with no boundary lines
neighborhoods.plot(column='ward5', categorical=True, cmap='Set2',
legend=True, linewidth=0, ax=ax)
# Remove axis
ax.set_axis_off()
# Keep axes proportionate
ax.axis('equal')
# Add title
ax.set_title('K-Means solution ($k=5$)')
ax = axs[1]
# Plot unique values choropleth including a legend and with no boundary lines
neighborhoods.plot(column='k5cls', categorical=True, cmap='Set3',
legend=True, linewidth=0, ax=ax)
# Remove axis
ax.set_axis_off()
# Keep axes proportionate
ax.axis('equal')
# Add title
ax.set_title('AHC solution ($k=5$)')
# Display the map
plt.show()
numpy.random.seed(123456)
model = AgglomerativeClustering(linkage='ward',
connectivity=wq.sparse,
n_clusters=5)
model.fit(neighborhoods[cluster_variables])
neighborhoods['ward5wq'] = model.labels_
# Setup figure and ax
f, ax = plt.subplots(1, figsize=(9, 9))
# Plot unique values choropleth including a legend and with no boundary lines
neighborhoods.plot(column='ward5wq', categorical=True, legend=True, linewidth=0, ax=ax)
# Remove axis
ax.set_axis_off()
# Keep axes proportionate
plt.axis('equal')
# Add title
plt.title(r'Price Regions (Ward, $k=5$, Queen Contiguity)')
# Display the map
plt.show()
Spatial Clustering Based on Points
listings.shape
clipped_thiessens = gpd.read_file('../data/thiessens.gpkg')
clipped_thiessens.shape
wtq = libpysal.weights.Queen.from_dataframe(clipped_thiessens)
wtq.n
numpy.random.seed(123456)
logprice = numpy.log(listings[['price']]+1)
model = AgglomerativeClustering(linkage='ward',
connectivity=wtq.sparse,
n_clusters=5)
model.fit(logprice)
listings['region'] = model.labels_
numpy.unique(model.labels_, return_counts=True)
listings.sort_values('region').plot(column='region', marker='.')
region_sizes = listings.groupby('region').size()
region_sizes
model = AgglomerativeClustering(linkage='ward',n_clusters=5)
model.fit(logprice)
listings['region5'] = model.labels_
listings.groupby('region5').size()
region1 = listings[listings.region5==1]
region1.plot()
for region in range(5):
region_ = listings[listings.region5==region]
region_.plot()
print(region_.shape)
region_.shape