Source code for clintk.utils.fold

"""
As data may come from different sources, it is best to retrieve all the bases 
into one single dataframe that would enables fetching the features very 
easily, as well as the dates at which the events/measures occured.

Doing so allows to retrieve the full timelines of the patients and 
therefore complete various tasks.  

The objective of this module is to parse the databases available in order to
have each one of them organized as

key1 | key2 | feature_name | value | date

"""
import pandas as pd

from multiprocessing.pool import Pool


[docs]class Folder: """ This object enables "unfolding" the features of a DataFrame, which means for a df that has 5 feature columns for instance, the unfolding would result in two feature columns: one is for the feature name and the other is the feature value. All the attributes are column names to indicate how to unfold the dataframe Parameters ---------- key1 : str indicator of the primary key indicator key2 : str, (optionnal?) secondary key features : list column names that contain the features date : str name of the date column, n_jobs : int number of CPUs to use for computation. If -1, all the available cores are used """ def __init__(self, key1, key2, features, date, n_jobs=1): self.key1 = key1 self.key2 = key2 self.features = features self.date = date self.n_jobs = n_jobs
[docs] def fold(self, df_base): """ Parameters ---------- df_base : pandas DataFrame Returns ------- pandas.DataFrame columns are [key1, key2, feature, value, date] where feature contains the features names and values are the values. Examples -------- >>> df = pd.DataFrame({'id1': [1, 2, 3], 'id2': ['id1', 'id2', 'id3'], ... 'feature_a': [0, 0.3, 1.4], ... 'date': ["12122012", "12122012","12122012"]}) >>> folder = fold.Folder('id1', 'id2', ['feature_a'], 'date') >>> folded = folder.fold(df) >>> print(folded) id1 id2 feature value date 0 1 id1 feature_a 0.0 12122012 1 2 id2 feature_a 0.3 12122012 2 3 id3 feature_a 1.4 12122012 For several features: >>> df['feature_b'] = [-1, 1, 0] >>> folder = fold.Folder('id1', 'id2', ['feature_a', 'feature_b'], ... 'date') >>> folded = folder.fold(df) >>> print(folded) id1 id2 feature value date 0 1 id1 feature_a 0.0 12122012 1 1 id1 feature_b -1.0 12122012 2 2 id2 feature_a 0.3 12122012 3 2 id2 feature_b 1.0 12122012 4 3 id3 feature_a 1.4 12122012 5 3 id3 feature_b 0.0 12122012 """ columns = [self.key1, self.key2, 'feature', 'value', 'date'] if self.n_jobs == -1: pool = Pool() else: pool = Pool(self.n_jobs) if len(self.features) > 1: dicts = pool.map(self._fold_several_features, df_base.iterrows()) pool.close() pool.join() merged_dict = {k: [] for k in columns} for key in dicts[0]: for dico in dicts: merged_dict[key] += dico[key] # only one feature else: dicts = pool.map(self._fold_one_feature, df_base.iterrows()) pool.close() pool.join() merged_dict = {k: [d[k] for d in dicts] for k in dicts[0]} return pd.DataFrame(merged_dict)
def _fold_several_features(self, row): """ folding function for tables with several features Parameters ---------- row : pandas.series current row during iteration over the base dataframe Returns ------- dico: dict duplication of the row """ # fetching value of the row, dropping index _, row = row dico = {self.key1: [], self.key2: [], 'feature': [], 'value': [], 'date': []} for cur_feat in self.features: dico[self.key1] += [row[self.key1]] dico[self.key2] += [row[self.key2]] dico['feature'] += [cur_feat] dico['value'] += [row[cur_feat]] dico['date'] += [row[self.date]] return dico def _fold_one_feature(self, row): """ folding function for one feature Parameters ---------- row : pd.Series Returns ------- dico : dict """ _, row = row dico = {self.key1: row[self.key1], self.key2: row[self.key2], 'feature': self.features[0], 'value': row[self.features[0]], 'date': row[self.date]} return dico