Source code for clintk.cat2vec.feature_selection

"""
selects parameters with L1 logistic regression
"""
import pandas as pd

from sklearn.base import BaseEstimator


class LassoSelector(BaseEstimator):
    """Select the most heavily weighted features of an L1-penalized model.

    This class is meant to be used after ``cat2vec.lasso_gridsearch``: it
    keeps the features of a dataframe whose coefficients have the largest
    absolute weights according to an L1-penalized linear model.

    It inherits from ``sklearn.base.BaseEstimator`` so that the best
    ``n_features`` can be gridsearched with a pipeline and a baseline
    classifier.

    Parameters
    ----------
    lasso_coefs : pd.DataFrame
        each row holds the name of a category and its coefficient weight
        in the LASSO model
    feature_col : str
        name of the feature column (i.e. the name of the categorical
        variable) in ``lasso_coefs``
    coef_col : str
        name of the column of the LASSO coefficients in ``lasso_coefs``
    n_features : int
        number of top features to keep

    Examples
    --------
    >>> dico = {'coef': [0, 4.5, 1.2, 0.3],
    ...         'colnames': ['feat1', 'feat2', 'feat3', 'feat4']}
    >>> df = pd.DataFrame(dico)
    >>> X = pd.DataFrame([[0, 0, 1, 0], [1, 1, 0, 0], [0, 1, 0, 0]],
    ...                  columns=['feat1', 'feat2', 'feat3', 'feat4'])
    >>> # keeps only feat2 and feat3, the two largest coefficients
    >>> selector = LassoSelector(df, 'colnames', 'coef', n_features=2)
    >>> selector.fit(X).transform(X)
    array([[0, 1],
           [1, 0],
           [1, 0]])
    """

    def __init__(self, lasso_coefs, feature_col, coef_col, n_features=64):
        self.lasso_coefs = lasso_coefs
        self.feature_col = feature_col
        self.coef_col = coef_col
        self.n_features = n_features
    def fit(self, X, y=None):
        # nothing to fit: the coefficients are given at construction time
        return self
    def transform(self, X):
        """Keep only the top ``n_features`` columns of X.

        Parameters
        ----------
        X : pd.DataFrame
            contains only the features

        Returns
        -------
        ndarray
            values of the ``n_features`` features with the largest
            absolute LASSO coefficients
        """
        # rank the features by the absolute value of their coefficient
        self.lasso_coefs['abs_coef'] = abs(self.lasso_coefs[self.coef_col])
        self.lasso_coefs.sort_values(['abs_coef'], ascending=False,
                                     inplace=True)

        # keep the names of the top n_features according to the LASSO
        coefs_to_keep = self.lasso_coefs.iloc[:self.n_features, :]
        coefs_to_keep = coefs_to_keep[self.feature_col]

        return X[coefs_to_keep.values].values
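

# ---------------------------------------------------------------------------
# Usage sketch, not part of the library. A minimal, hypothetical way to build
# ``lasso_coefs``: here an L1-penalized LogisticRegression from scikit-learn
# stands in for the output of ``cat2vec.lasso_gridsearch``, and the data,
# column names and hyper-parameters are made up for the demo.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import numpy as np
    from sklearn.linear_model import LogisticRegression

    rng = np.random.RandomState(0)
    X = pd.DataFrame(rng.randint(0, 2, size=(100, 4)),
                     columns=['feat1', 'feat2', 'feat3', 'feat4'])
    y = X['feat2'].values | X['feat3'].values  # target tied to two features

    # fit the L1-penalized model and collect one coefficient per column
    l1_model = LogisticRegression(penalty='l1', solver='liblinear')
    l1_model.fit(X, y)
    coefs = pd.DataFrame({'colnames': X.columns,
                          'coef': l1_model.coef_.ravel()})

    # keep the two most heavily weighted features
    selector = LassoSelector(coefs, 'colnames', 'coef', n_features=2)
    print(selector.fit(X).transform(X).shape)  # e.g. (100, 2)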
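
    # -----------------------------------------------------------------------
    # Gridsearching the best ``n_features``, as the class docstring suggests:
    # a sketch assuming a scikit-learn Pipeline with a baseline classifier.
    # The step names and the candidate grid are illustrative, not prescribed.
    # -----------------------------------------------------------------------
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline

    pipe = Pipeline([('select', LassoSelector(coefs, 'colnames', 'coef')),
                     ('clf', LogisticRegression(solver='liblinear'))])
    search = GridSearchCV(pipe, {'select__n_features': [1, 2, 3]}, cv=3)
    search.fit(X, y)
    print(search.best_params_)  # best number of features on this toy data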