"""Model-based methods for areal interpolation."""
import numpy as np
import statsmodels.formula.api as smf
from statsmodels.genmod.families import Gaussian, NegativeBinomial, Poisson
from ..dasymetric import _fast_append_profile_in_gdf
from ..util.util import _check_presence_of_crs
__all__ = ["glm"]
[docs]
def glm(
source_df=None,
target_df=None,
raster="nlcd_2011",
raster_codes=None,
variable=None,
formula=None,
likelihood="poisson",
force_crs_match=True,
return_model=False,
):
"""Train a generalized linear model to predict polygon attributes based on the collection of pixel values they contain.
Parameters
----------
source_df : geopandas.GeoDataFrame, required
geodataframe containing source original data to be represented by another geometry
target_df : geopandas.GeoDataFrame, required
geodataframe containing target boundaries that will be used to represent the source data
raster : str, required (default="nlcd_2011")
path to raster file that will be used to input data to the regression model.
i.e. a coefficients refer to the relationship between pixel counts and population counts.
Defaults to 2011 NLCD
raster_codes : list, required (default =[21, 22, 23, 24, 41, 42, 52])
list of integers that represent different types of raster cells. If no formula is given,
the model will be fit from a linear combination of the logged count of each cell type
listed here. Defaults to [21, 22, 23, 24, 41, 42, 52] which
are informative land type cells from the NLCD
variable : str, required
name of the variable (column) to be modeled from the `source_df`
formula : str, optional
patsy-style model formula that specifies the model. Raster codes should be prefixed with
"Type_", e.g. `"n_total_pop ~ -1 + np.log1p(Type_21) + np.log1p(Type_22)`
likelihood : str, {'poisson', 'gaussian', 'neg_binomial'} (default = "poisson")
the likelihood function used in the model
force_crs_match : bool
whether to coerce geodataframe and raster to the same CRS
return model : bool
whether to return the fitted model in addition to the interpolated geodataframe.
If true, this will return (geodataframe, model)
Returns
--------
interpolated : geopandas.GeoDataFrame
a new geopandas dataframe with boundaries from `target_df` and modeled attribute
data from the `source_df`. If `return_model` is true, the function will also return
the fitted regression model for further diagnostics
"""
source_df = source_df.copy()
target_df = target_df.copy()
_check_presence_of_crs(source_df)
liks = {"poisson": Poisson, "gaussian": Gaussian, "neg_binomial": NegativeBinomial}
if likelihood not in liks.keys():
raise ValueError(f"likelihood must one of {liks.keys()}")
if not raster_codes:
raster_codes = [21, 22, 23, 24, 41, 42, 52]
raster_codes = ["Type_" + str(i) for i in raster_codes]
if not formula:
formula = (
variable
+ "~ -1 +"
+ "+".join(["np.log1p(" + code + ")" for code in raster_codes])
)
profiled_df = _fast_append_profile_in_gdf(
source_df[[source_df.geometry.name, variable]], raster, force_crs_match
)
results = smf.glm(formula, data=profiled_df, family=liks[likelihood]()).fit()
out = target_df[[target_df.geometry.name]]
temp = _fast_append_profile_in_gdf(
out[[out.geometry.name]], raster, force_crs_match
)
out[variable] = results.predict(temp.drop(columns=[temp.geometry.name]).fillna(0))
if return_model:
return out, results
return out