Search
sparse_categorical
import numpy as np
from scipy import sparse as sp
from statsmodels.tools.tools import categorical
from datetime import datetime as dt
def spcategorical(data):
    '''
    Returns a sparse dummy matrix given an array of categorical variables.

    Parameters
    ----------
    data : array_like
        A 1d vector of the categorical variable.  Inputs whose extra axes
        all have length one (e.g. a column vector of shape (n, 1)) are
        flattened to 1d first.

    Returns
    --------
    dummy_matrix : scipy.sparse.csc_matrix
        A sparse matrix of dummy (indicator/binary) float variables for the
        categorical data.  It has one row per observation and one column per
        distinct value, with columns ordered like ``np.unique(data)``.

    Raises
    ------
    IndexError
        If ``data`` is a scalar or has more than one axis of length
        other than one.

    '''
    values = np.asarray(data)
    # Accept anything that is a vector once length-one axes are dropped;
    # unlike a bare np.squeeze test this also keeps one-element vectors.
    if values.ndim == 0 or sum(dim != 1 for dim in values.shape) > 1:
        raise IndexError(
            "data must be a 1d vector of categories, got shape %s"
            % (values.shape,))
    values = values.reshape(-1)
    n_obs = values.shape[0]

    # codes[i] is the position of values[i] among the sorted unique levels,
    # i.e. the column that holds the single 1.0 of row i.
    levels, codes = np.unique(values, return_inverse=True)
    codes = np.asarray(codes).reshape(-1)

    # Every row stores exactly one entry, so the CSR arrays can be written
    # down directly instead of comparing against and stacking one level at
    # a time, which costs O(levels * n) time and dense temporaries.
    dummy = sp.csr_matrix(
        (np.ones(n_obs, dtype=float), codes, np.arange(n_obs + 1)),
        shape=(n_obs, levels.shape[0]))
    # Column-compressed output, as a transposed row-wise stack would give.
    return dummy.tocsc()
# Sanity check: the sparse dummies should match the dense dummies produced by
# statsmodels' categorical() on the same random integer labels.
data = np.random.randint(1, 100, 10000)
sparse_dummies = spcategorical(np.array(data)).toarray()
dense_dummies = categorical(np.array(data), drop=True)
np.allclose(sparse_dummies, dense_dummies)
True
# Time the construction of two large dummy matrices (n * n observations with
# n distinct values each) and their horizontal concatenation.
s = dt.now()
n = 3000
# o cycles through 0..n-1 a total of n times (tiled labels).
o = np.tile(np.arange(n), n)
o_dums = spcategorical(np.array(o))
# d holds each of 0..n-1 repeated n times in a row (blocked labels).
d = np.repeat(np.arange(n), n)
d_dums = spcategorical(np.array(d))
# The stacked result is discarded here; only the elapsed time is of interest.
sp.hstack((o_dums, d_dums))
e = dt.now()
# Call form of print behaves the same on Python 2 and Python 3.
print(e - s)
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-4-a64b538a6ade> in <module>()
      2 n = 3000
      3 o = np.tile(np.arange(n),n)
----> 4 o_dums = spcategorical(np.array(o))
      5 n = 3000
      6 d = np.repeat(np.arange(n),n)

<ipython-input-2-68702ba242f4> in spcategorical(data)
     18         tmp_dummy = sp.csr_matrix((0, len(data)))
     19         for each in tmp_arr[:, None]:
---> 20             row = sp.csr_matrix((each == data).astype(float))
     21             tmp_dummy = sp.vstack([tmp_dummy, row])
     22         tmp_dummy = tmp_dummy.T

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scipy/sparse/compressed.pyc in __init__(self, arg1, shape, dtype, copy)
     67                         self.format)
     68             from .coo import coo_matrix
---> 69             self._set_self(self.__class__(coo_matrix(arg1, dtype=dtype)))
     70 
     71         # Read matrix dimensions given, if any

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scipy/sparse/coo.pyc in __init__(self, arg1, shape, dtype, copy)
    197                     self.shape = M.shape
    198 
--> 199                 self.row, self.col = M.nonzero()
    200                 self.data = M[self.row, self.col]
    201                 self.has_canonical_format = True

KeyboardInterrupt: 
# Place the two dummy matrices side by side and display the combined result.
all_dums = sp.hstack([o_dums, d_dums])
all_dums
<9000000x6000 sparse matrix of type '<type 'numpy.float64'>'
	with 18000000 stored elements in Compressed Sparse Column format>
# Shape of the dense dummy matrix: (observations, distinct values).
# Call form of print behaves the same on Python 2 and Python 3.
print(spcategorical(np.array(data)).toarray().shape)
(10000, 99)