# sparse_categorical_bottleneck
from scipy import sparse as sp
import numpy as np

def spcategorical(n_cat_ids):
    """
    Return a sparse dummy matrix given an array of categorical variables.

    Parameters
    ----------
    n_cat_ids    : array
                   A 1d vector of the categorical labels for n observations.
                   Inputs with extra length-1 axes (e.g. an n x 1 column) are
                   squeezed down to 1d before use.

    Returns
    -------
    dummy        : scipy.sparse.csr_matrix
                   An n x k sparse matrix of dummy (indicator/binary)
                   variables, where k is the number of unique labels. Row i
                   holds a single 1.0 in the column of observation i's label;
                   columns follow the sorted order returned by np.unique.

    Raises
    ------
    IndexError
        If n_cat_ids cannot be reduced to a 1d vector of labels.
    """
    ids = np.asarray(n_cat_ids)
    labels = np.squeeze(ids)
    # A lone observation squeezes down to 0-d; keep it as a length-1 vector.
    if labels.ndim == 0 and ids.ndim > 0:
        labels = labels.reshape(1)
    if labels.ndim != 1:
        # Wrap in a tuple so tuple inputs are not unpacked by the % operator.
        raise IndexError("The index %s is not understood" % (n_cat_ids,))

    # return_inverse yields, for every observation, the position of its label
    # in the sorted unique set -- the same column index the per-row lookup
    # produced, but computed in a single vectorized pass.
    cat_set, index = np.unique(labels, return_inverse=True)
    n = labels.shape[0]
    # Exactly one stored value per row, so the row pointer is 0, 1, ..., n.
    indptr = np.arange(n + 1, dtype=int)
    # State the shape explicitly rather than having it inferred from the data.
    return sp.csr_matrix((np.ones(n), index.ravel(), indptr),
                         shape=(n, len(cat_set)))

# If the variable, n_cat_ids, is already composed of integers and the integers are the n x 1 vector of
# origins or destinations in OD pairs for which we want to build fixed effects, then there is no need to
# create the index variable, which probably takes the most time within this function. Instead, n_cat_ids can
# be passed directly to the csr matrix constructor and some speed-ups can be achieved. In the case where the
# origin/destination ids are not integers but are strings, a speed-up may be possible by altering the algorithm
# so that the index is built in chunks (say, each origin/destination) rather than for each row of the n x 1
# n_cat_ids array, as is done in creating the index variable.