"""
Selects features according to the coefficients of an L1-penalized (LASSO) logistic regression.
"""
import pandas as pd
from sklearn.base import BaseEstimator
class LassoSelector(BaseEstimator):
    """
    Selects the features of a dataframe that have the largest-weighted
    coefficients according to an L1-penalized linear model.

    This class is made to be used after ``cat2vec.lasso_gridsearch``.
    It inherits from :class:`sklearn.base.BaseEstimator` to allow
    grid-searching the best ``n_features`` using a pipeline and a
    baseline classifier.

    Parameters
    ----------
    lasso_coefs : pd.DataFrame
        each row is the name of a category and its coef weight in the LASSO
        model
    feature_col : str
        name of the feature col (ie name of the categorical variable)
    coef_col : str
        name of the column of the LASSO coefficients in lasso_coefs dataframe
    n_features : int, optional (default=64)
        number of top features to keep

    Examples
    --------
    >>> dico = {'coef': [0, 4.5, 1.2, 0.3], \
                'colnames': ['feat1', 'feat2', 'feat3', 'feat4']}
    >>> df = pd.DataFrame(dico)

    Keeps only feat2 and feat3:

    >>> selector = LassoSelector(df, 'colnames', 'coef', n_features=2)
    >>> X = [[0, 0, 1, 0], [1, 1, 0, 0], [0, 1, 0, 0]]
    >>> selector.transform(X)
    [[0, 1], [1, 0], [1, 0]]
    """
def __init__(self, lasso_coefs, feature_col, coef_col,
n_features=64):
self.n_features = n_features
self.feature_col = feature_col
self.lasso_coefs = lasso_coefs
self.coef_col = coef_col
[docs] def fit(self, X, y):
return self