Source code for clintk.utils.outliers

"""
Scripts to remove the outliers and na values from the different tables

To be used for the values that are mistyped

"""
import pandas as pd
import numpy as np


[docs]class OutlierRemover(object):
    """ removes outliers and replaces them by value given in dic_path or by 
    imputing the column mean value
    
    Parameters
    ----------
    dic_path: str   
        path to the dictionary containing outliers information
    
    inplace: bool, default=True
        True to perform the transformation inplace
        False to do it on a copy of the dataframe
        
    """
    def __init__(self, dic_path, inplace=True):
        self.path = dic_path
        self.inplace = inplace

    # for sklearn pipeline compatibility
[docs]    def fit(self, X, y=None):
        return self

[docs]    def transform(self, X, y=None):
        return impute_df(X, self.path, self.inplace)


[docs]def impute_col(X, lbound, ubound, impute):
    """ imputes missing and mistyped values of one col of the dataframe

    Parameters
    ----------
    X : iterable, array-like
        column to which we want to impute missing values


        name of the column

    lbound : float
        lower bound for normal values

    ubound : float
        upper bound for normal values

    impute : float or None
        if float is given, replaces outlier by the given value
        if None, the mean value is returned


    Returns
    -------
    df.Series
        df.col_name except its wrong values are imputed according
        to strategy
    """
    impute_value = impute or np.mean(X)
    res = []

    for row in X:
        # check bounds + nan
        if (row < lbound) | (row > ubound) | (row == np.nan):
            res.append(impute_value)
        else:
            res.append(row)

    return pd.Series(res)


[docs]def impute_df(df, dic_path, inplace):
    """ cleans the df from missing/mistyped values

    Parameters
    ----------
    df : pd.DataFrame

    dic_path : str
        path containing name of the columns to clean and the upper/lower
        limits to consider point as outlier and optionnal third value is the
        imputing value

    inplace  : bool
        if True, performs the transformation inline
    Returns
    -------
    pd.DataFrame

    """
    if not inplace:
        # make a copy of df
        df = df[::]
    dic = {}

    with open(dic_path, 'r') as f: # add encoding?
        for line in f:
            print(line, type(line))
            key_values = line.split()
            try:
                dic[key_values[0]] = (float(key_values[1]),
                                      float(key_values[2]),
                                      float(key_values[3]))
            except IndexError:
                dic[key_values[0]] = (float(key_values[1]),
                                      float(key_values[2]))
    print(dic)
    for col in dic:
        if col in list(df.columns):
            series = df[col]
            try:
                series_clean = impute_col(series,
                                          dic[col][0],
                                          dic[col][1],
                                          dic[col][2])
            except IndexError:
                print("Default value not passed - using mean")
                series_clean = impute_col(series,
                                          dic[col][0],
                                          dic[col][1],
                                          None)
            df[col] = series_clean

        else:
            pass

    return df