"""
object classes for sklearn pipeline compatibility
"""
import numpy as np
from .tools import avg_corpus
from gensim.models import KeyedVectors, Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.base import BaseEstimator
[docs]class Text2Vector(BaseEstimator):
""" implementation of Doc2Vec model adapted to sklearn for
hyperparameters tuning
"""
def __init__(self, n_components=128, dm=1, window=3):
self.n_components = n_components
self.dm = dm
self.window = window
self.d2v_model_ = None
[docs] def fit(self, reports, y=None, **kwargs):
""" tags reports (for gensim's model consistence) and trains Doc2Vec
model on the corpus
Parameters
----------
reports : iterable of iterables
list of tokenized reports
y : not used, default=None
**kwargs
additionnal arguments to pass to gensim.Word2Vec (see appropriate
documentation for details)
Returns
-------
"""
tagged_docs = [TaggedDocument(j, 'doc_{}'.format(i))
for i, j in enumerate(reports)]
# self.d2v_model_ = self.d2v(tagged_docs)
self.d2v_model_ = Doc2Vec(tagged_docs, vector_size=self.n_components,
dm=self.dm, window=self.window,
**kwargs)
return self
[docs] def transform(self, reports):
""" transforms reports in embedding space based on previously trained
Doc2Vec model
Parameters
----------
reports : iterable of iterables
list of tokenized reports
Returns
-------
np.ndarray
vectorized reports
"""
return np.array([self.d2v_model_.infer_vector(document) for document
in reports])
[docs]class AverageWords2Vector(BaseEstimator):
""" trains a unsupervised word2vec model, and then fold
text data according to it
This function is only for convenience in using word2vec in a pipeline
Parameters
----------
n_components : int, default=128
dimension of the embedding vector
"""
def __init__(self,
n_components=128):
self.n_components = n_components
self.w2v_ = None
[docs] def fit(self, parsed_reports, y=None, **kwargs):
""" Trains the word2vec model with given corpus
as input
Parameters
----------
parsed_reports : iterable of iterables
contains parsed tokenized reports
y : None
**kwargs
additionnal arguments to pass to gensim.Word2Vec (see appropriate
documentation for details)
Returns
-------
"""
self.w2v_ = Word2Vec(parsed_reports, size=self.n_components,
**kwargs).wv
return self
[docs] def fit_pretrained(self, path, **kwargs):
""" fits a pretrained model from
https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
Parameters
----------
path : str
path to the model
Returns
-------
"""
self.w2v_ = KeyedVectors.load_word2vec_format(path, **kwargs)
return self