from scipy import sparse as sp
import numpy as np


def spcategorical(n_cat_ids):
    """
    Return a sparse dummy (indicator) matrix for a vector of categorical labels.

    Each observation becomes one row holding a single 1.0 in the column of
    its category. Columns follow the sorted order of the unique labels, as
    produced by ``np.unique``.

    Parameters
    ----------
    n_cat_ids : array
        A 1d vector of the categorical labels for n observations. Inputs
        with extra singleton dimensions (e.g. an n x 1 column vector) are
        flattened before use.

    Returns
    -------
    dummy : scipy.sparse.csr_matrix
        An n x k sparse matrix of dummy (indicator/binary) variables for
        the categorical data, where k is the number of unique labels.

    Raises
    ------
    IndexError
        If ``n_cat_ids`` is a scalar or has more than one non-singleton
        dimension.
    """
    labels = np.asarray(n_cat_ids)
    squeezed = np.squeeze(labels)
    if labels.ndim == 0 or squeezed.ndim > 1:
        raise IndexError("The index %s is not understood" % (n_cat_ids,))
    # reshape(-1) also restores a one-element vector that squeeze() turned
    # into a 0-d array.
    labels = squeezed.reshape(-1)

    # return_inverse gives, for every observation, the position of its label
    # within the sorted unique labels -- exactly the CSR column index --
    # without a Python-level search per row.
    cat_set, index = np.unique(labels, return_inverse=True)
    n = labels.shape[0]
    # One stored value per row, so row i owns the slice [i, i + 1).
    indptr = np.arange(n + 1, dtype=int)
    return sp.csr_matrix((np.ones(n), index, indptr),
                         shape=(n, len(cat_set)))


# NOTE: If n_cat_ids is already composed of integers, and those integers are
# the n x 1 vector of origins or destinations in OD pairs for which we want to
# build fixed effects, then there is no need to create the index variable,
# which is probably the most expensive step in this function. Instead,
# n_cat_ids can be passed directly to the csr_matrix constructor and some
# speed-ups can be achieved (this presumes the integers are usable as column
# indices as they are). In the case where the origin/destination ids are not
# integers but strings, a further speed-up may be possible by altering the
# algorithm so that the index is built in chunks (say, per origin/destination)
# rather than for each row of the n x 1 n_cat_ids array.