# sparse_categorical
import numpy as np
from scipy import sparse as sp
from statsmodels.tools.tools import categorical
from datetime import datetime as dt
def spcategorical(data):
    """
    Return a sparse dummy (indicator) matrix for a categorical variable.

    Parameters
    ----------
    data : array
        A 1d vector of the categorical variable. Singleton axes are
        squeezed away first, so a column vector of shape (n, 1) is
        accepted as well.

    Returns
    -------
    dummy_matrix
        A sparse matrix of dummy (indicator/binary) float variables for the
        categorical data, with one row per observation and one column per
        distinct value; columns follow the sorted order of ``np.unique``.

    Raises
    ------
    IndexError
        If ``data`` cannot be reduced to a 1d vector.
    """
    values = np.asarray(data)
    if values.ndim != 1:
        # Drop singleton axes; atleast_1d keeps a lone observation as a
        # length-1 vector instead of letting it collapse to a 0-d scalar.
        values = np.atleast_1d(np.squeeze(values))
    if values.ndim != 1:
        raise IndexError("The data of shape %s is not understood; expected "
                         "a 1d vector" % (np.shape(data),))

    # inverse[i] is the position of values[i] among the sorted unique
    # categories, i.e. the dummy column that observation i belongs to.
    categories, inverse = np.unique(values, return_inverse=True)
    n_obs = values.shape[0]

    # Build the (category x observation) indicator matrix in a single call
    # instead of stacking one sparse row per category, then transpose so
    # that observations are the rows.
    indicators = sp.csr_matrix(
        (np.ones(n_obs, dtype=float), (inverse, np.arange(n_obs))),
        shape=(len(categories), n_obs),
    )
    return indicators.T
# Sanity check on a random sample: compare the sparse dummies against the
# dense ones produced by statsmodels' ``categorical``.
data = np.random.randint(1, 100, 10000)
dummies_match = np.allclose(spcategorical(data).toarray(),
                            categorical(data, drop=True))

# Time the construction of dummies for the tiled (``o``) and repeated (``d``)
# index vectors of length n * n, including stacking them side by side.
s = dt.now()
n = 3000
o = np.tile(np.arange(n), n)
o_dums = spcategorical(o)
d = np.repeat(np.arange(n), n)
d_dums = spcategorical(d)
all_dums = sp.hstack((o_dums, d_dums))
e = dt.now()
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(e - s)

print(spcategorical(data).toarray().shape)