"""
The objective of this script is to select the most informative categories of a
high-cardinality categorical feature using LASSO (L1) penalization.
For the moment, only binary/continuous targets with logistic regression are
implemented.
"""
import pandas as pd
import numpy as np
from clintk.cat2vec import tools
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
def lr_coefficients(path, features, targets, key, output_path, **kwargs):
"""
Performs categorical variable selection using L1-penalized logistic
regression model
It only supports binary or continuous target for the moment
Parameters
----------
path : str
input path or url for the dataframe
features : str
column name of the categorical column
targets : str
name of the target column in the df
key : str
key to group categorical variables
output_path : str
path to save the coefficients in a csv file
kwargs
keyword arguments for the hyperparameter grid
Returns
-------
array
the coefficients of the L1-logistic regression
Examples
--------
>>> lr_coefficients('input.csv', 'medication_name', 'target', \
solver=['liblinear', 'saga'], C=np.logspace(-6, 2, 10))
"""
    df = pd.read_csv(path, sep=';')
    # normalize the raw category labels (clintk helper)
    df[features] = tools.normalize_cat(df[features], 'strings')
    dummies = pd.get_dummies(df[[key, features, targets]],
                             columns=[features])
    # keep the target only once per group to avoid replicating it
    agg_dic = {targets: 'first'}
    for colname in list(dummies.columns)[2:]:
        # sum the dummy variables so each group gets the count of every
        # category instead of a single 0/1 indicator
        agg_dic[colname] = 'sum'
    dummies_group = dummies.groupby(by=key, as_index=False).agg(agg_dic)
    y = dummies_group[targets]
    X = dummies_group.iloc[:, 2:]
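    # Illustrative (hypothetical) example of the grouping above: with
    # key='patient_id' and features='medication_name', the raw rows
    # (patient_1, 'aspirin') and (patient_1, 'ibuprofen') collapse into a
    # single row for patient_1 with medication_name_aspirin=1,
    # medication_name_ibuprofen=1 and the target kept once.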
    # the keyword arguments directly define the hyperparameter grid
    param_grid = dict(kwargs)
    # use metrics suited to imbalanced datasets
    scoring = {'AUC': 'roc_auc', 'Precision': 'precision'}
    cv = StratifiedShuffleSplit(n_splits=3, test_size=0.2)
    # the solver must support the L1 penalty (e.g. 'liblinear' or 'saga'),
    # typically passed through the hyperparameter grid
    grid = GridSearchCV(LogisticRegression(penalty='l1', n_jobs=8),
                        param_grid=param_grid, scoring=scoring,
                        cv=cv, refit='AUC',
                        n_jobs=8, verbose=5)
    grid.fit(X, y)
    print(grid.cv_results_)
    print('Best score for LASSO: {}\nobtained with the following '
          'parameters: {}'.format(grid.best_score_, grid.best_params_))
    lr = grid.best_estimator_
    colnames = np.array(dummies_group.columns)[2:]
    df_coefs = pd.DataFrame({'features': colnames,
                             'coef': lr.coef_.ravel()})
    df_coefs.to_csv(output_path,
                    sep=';',
                    encoding='utf-8')
    return lr.coef_
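

if __name__ == '__main__':
    # Minimal usage sketch, assuming an 'input.csv' file with
    # 'medication_name', 'target' and 'patient_id' columns; all file and
    # column names here are hypothetical placeholders.
    coefficients = lr_coefficients('input.csv', 'medication_name', 'target',
                                   'patient_id', 'lasso_coefficients.csv',
                                   solver=['liblinear', 'saga'],
                                   C=np.logspace(-6, 2, 10))
    print(coefficients)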