Source code for clintk.cat2vec.neural_embedding

"""
Embedding high cardinality categorical variables with distributed
representations

The first embedder relies on the `Word2Vec` algorithm to learn vector
representations of words in a corpus.

.. [1] "Distributed Representations of Words and Phrases and their
  Compositionality", Mikolov et al, Advances in Neural Information Processing
  Systems 26, pp 3111--3119, 2013.


The second one is based on `transfer learning
<https://en.wikipedia.org/wiki/Transfer_learning>`_: we train a fully
connected neural network on a predictive task (only binary classification is
supported for now) so that the upper layers learn higher-level
representations of the categories.
After training, we can extract the category vectors from the embedding space.
"""
import pandas as pd

from gensim.models import Word2Vec, KeyedVectors
from sklearn.base import BaseEstimator
from keras.models import Sequential, clone_model
from keras.layers import Dense, Dropout


from ..text2vec import tools


class W2VVectorizer(object):
    """ vectorizes categories with a word2vec model

    @deprecated

    Parameters
    ----------
    group_key : str
        name of the column to group by

    category_col : str
        name of the column containing the categorical variables

    size : int, default=128
        dimension of the embedding vectors

    min_count : int, default=1
        minimum number of occurrences for a category to be included in the
        model

    sg : int {0, 1}, default=1
        1 for the skip-gram word2vec model (better suited to small datasets),
        0 for CBOW

    window : int, default=3
        size of the context window

    strategy : str {'tokens', 'strings'}, default='tokens'
        if 'tokens', categories containing several words are split,
        otherwise each category is treated as a single word

    seed : int, default=0
        random seed passed to Word2Vec for reproducibility
    """
    def __init__(self, group_key, category_col, size=128, min_count=1, sg=1,
                 window=3, strategy='tokens', seed=0):
        self.key = group_key
        self.cat_col = category_col
        self.size = size
        self.min_count = min_count
        self.sg = sg
        self.window = window
        self.strategy = strategy
        self.seed = seed
        self.w2v_ = None

    def fit(self, X, y=None):
        """ fits the model by grouping categories by group_key in order to
        embed categories as text

        Parameters
        ----------
        X : pd.DataFrame
            input data containing the `group_key` and `category_col` columns

        y : None
            unused, kept for scikit-learn API compatibility

        Returns
        -------
        self
        """
        df_grouped = X.groupby(self.key).agg({self.cat_col: ' '.join})
        df_grouped[self.cat_col] = df_grouped[self.cat_col] \
            .apply(lambda s: s.split(' '))

        self.w2v_ = Word2Vec(df_grouped[self.cat_col],
                             size=self.size,
                             window=self.window,
                             min_count=self.min_count,
                             sg=self.sg,
                             seed=self.seed).wv

        return self

    def transform(self, X, y=None):
        """ transforms each category into the average of its word vectors

        Parameters
        ----------
        X : pd.DataFrame
            input data containing the `category_col` column

        y : None
            unused, kept for scikit-learn API compatibility

        Returns
        -------
        pd.DataFrame
            X with an additional `category_col + '_embedded'` column holding
            the embedding vectors
        """
        categories = X[self.cat_col].apply(lambda s: s.split(' '))
        vectors = categories.apply(lambda cat: tools.avg_document(self.w2v_,
                                                                  cat))
        X[self.cat_col + '_embedded'] = vectors

        return X

    def fit_pretrained(self, path, **kwargs):
        """ fits the model using pretrained word embeddings from
        https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md

        Parameters
        ----------
        path : str
            path to the wiki.lg.vec file

        **kwargs
            additional arguments passed to KeyedVectors.load_word2vec_format

        Returns
        -------
        self
        """
        self.w2v_ = KeyedVectors.load_word2vec_format(path, **kwargs)

        return self
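

# Hypothetical usage sketch of W2VVectorizer, added for illustration only: the
# toy DataFrame and its 'visit_id' / 'drug' columns are made-up names, not
# part of the clintk API.
def _example_w2v_vectorizer():
    """ fits a W2VVectorizer on a toy DataFrame and returns the transform """
    df = pd.DataFrame({'visit_id': [1, 1, 2, 2],
                       'drug': ['aspirin', 'heparin', 'aspirin', 'insulin']})
    vectorizer = W2VVectorizer(group_key='visit_id', category_col='drug',
                               size=16, window=2, min_count=1)
    vectorizer.fit(df)
    # transform adds a 'drug_embedded' column holding the averaged vectors
    return vectorizer.transform(df)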


class NeuralEmbedder(BaseEstimator):
    """ Trains an MLP classifier to learn a distributed representation of
    categories

    Only available for binary targets

    @TODO optimizer argument should be able to receive keras.Optimizer class
    @TODO + batch_size + validation set ?

    Parameters
    ----------
    input_dim : tuple, (int, int)
        input_dim[0] : number of units in the input layer
        input_dim[1] : dimension of the input layer (= number of features)

    layers : tuple
        the ith element represents the number of neurons in the ith hidden
        layer, similar to sklearn's MLP

    activation : str, default='relu'
        activation function of the intermediate layers

    output : str, default='sigmoid'
        output activation function, only supports sigmoid for binary
        classification

    optimizer : str, default='adam'
        optimization algorithm for backpropagation,
        see https://keras.io/optimizers for available algorithms

    loss : str, default='binary_crossentropy'
        loss computed for optimization, see https://keras.io/losses

    dropout : float, default=0.5
        dropout rate applied after the input and hidden layers

    metrics : list, default=['acc', 'mae']
        metrics used during training and testing

    epochs : int, default=20
        number of epochs
    """
    def __init__(self, input_dim, layers, activation='relu', output='sigmoid',
                 optimizer='adam', loss='binary_crossentropy', dropout=0.5,
                 metrics=['acc', 'mae'], epochs=20):
        self.optimizer = optimizer
        self.loss = loss
        self.metrics = metrics
        self.epochs = epochs

        # indicator of training state
        self.fit_ = None

        self.model = Sequential()
        # input layer
        self.model.add(Dense(input_dim[0], activation=activation,
                             input_dim=input_dim[1]))
        self.model.add(Dropout(dropout))
        # stacking the hidden layers
        for units in layers:
            self.model.add(Dense(units, activation=activation))
            self.model.add(Dropout(dropout))
        # output layer
        self.model.add(Dense(1, activation=output))

    def fit(self, X, y):
        """ trains the model on the input data

        Parameters
        ----------
        X : iterable
            feature matrix

        y : iterable
            target vector (possibly one-hot-encoded?)

        Returns
        -------
        keras.callbacks.History
            record of training loss values and metrics values at successive
            epochs, as well as validation loss values and validation metrics
            values (if applicable)
        """
        self.model.compile(optimizer=self.optimizer,
                           loss=self.loss,
                           metrics=self.metrics)
        hist = self.model.fit(X, y, epochs=self.epochs)
        self.fit_ = True

        return hist

    def transform(self, X):
        """ Transforms X into the distributed representation learned by fit

        Parameters
        ----------
        X : iterable
            feature matrix to embed

        Returns
        -------
        numpy array
            X projected into the embedding space
        """
        model_cut = clone_model(self.model)
        # clone_model only copies the architecture, so the trained weights
        # must be copied over explicitly
        model_cut.set_weights(self.model.get_weights())
        # removing output layer + last dropout
        # @TODO change method (sub optimal and inelegant)
        model_cut.pop()
        model_cut.pop()

        return model_cut.predict(X)
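

# Hypothetical smoke test for NeuralEmbedder, added for illustration only: it
# trains on random binary data and projects it into the last hidden layer.
# Shapes and hyperparameters are made up, not prescribed by the module.
def _example_neural_embedder():
    """ fits a NeuralEmbedder on random binary data and returns the embedding """
    import numpy as np

    n_samples, n_features = 200, 40
    X = np.random.randint(0, 2, size=(n_samples, n_features))
    y = np.random.randint(0, 2, size=n_samples)

    embedder = NeuralEmbedder(input_dim=(32, n_features), layers=(16, 8),
                              epochs=2)
    embedder.fit(X, y)
    # the returned array has shape (n_samples, layers[-1]), i.e. (200, 8)
    return embedder.transform(X)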