"""
clustering of word embeddings
@TODO documentation of the module
"""
import numpy as np
from sklearn.base import BaseEstimator
from gensim.models import Word2Vec, KeyedVectors
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
class WordClustering(BaseEstimator):
""" theme-affinity vectorization of documents
w2v_size : int, default=128
size of the hidden layer in the embedding Word2Vec model
n_clusters : int, default=30
number of clusters, to the number of output parameters for the
vectorization.
It is advised to set `n_clusters` to the approximate number of
lexical fields
clustering : sklearn.cluster instace, default=KMeans(n_clusters=30)
clustering algorithm
The number of clusters must be equal to `n_clusters`
pretrained : bool, default=False
False to train a new w2v model
True to use a model already trained
model_path : str, default=None
path to the trained w2v model
Only used when `pretrained` is set to True
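
    Examples
    --------
    A minimal usage sketch; ``tokenized_docs`` stands for a hypothetical
    iterable of token lists, e.g. ``[["the", "price", "rose"], ...]``::

        wc = WordClustering(w2v_size=64, n_clusters=10)
        wc.fit(tokenized_docs)
        clusters = wc.get_clusters_words()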
"""
    def __init__(self,
                 w2v_size=128,
                 n_clusters=30,
                 clustering=None,
                 pretrained=False,
                 model_path=None):
        self.w2v_size = w2v_size
        self.n_clusters = n_clusters
        # avoid a shared mutable default argument: build one KMeans
        # instance per WordClustering object
        if clustering is None:
            clustering = KMeans(n_clusters=n_clusters)
        self.clustering = clustering
        self.pretrained = pretrained
        self.model_path = model_path
        # vocabulary
        self.vocabulary_ = None
        # distributed representation of the words
        self.word_vectors_ = None
        # cluster id for each word
        self.cluster_ids_ = None
        # force the clustering algorithm to produce `n_clusters` clusters
        self.clustering.set_params(n_clusters=n_clusters)
    def fit(self, X=None, y=None, **fit_params):
""" train w2v and clustering models
Parameters
----------
X : iterable of iterable, defaul=None
corpus of tokenized documents if `pretrained`=False
else, X=None and the pretrained model is used
y : None
fit_params : additionnal parameters for word2vec algorithm
Returns
-------
self
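
        Examples
        --------
        A sketch of the pretrained path; ``"vectors.w2v"`` is a
        hypothetical file in word2vec format::

            wc = WordClustering(pretrained=True, model_path="vectors.w2v")
            wc.fit()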
"""
        if self.pretrained:
            # a loaded KeyedVectors object exposes `.vocab` directly and
            # has no `.wv` attribute, so keep the same interface in both
            # branches
            wv = KeyedVectors.load_word2vec_format(self.model_path)
        else:
            wv = Word2Vec(X, size=self.w2v_size, **fit_params).wv
        self.vocabulary_ = list(wv.vocab)
        self.word_vectors_ = wv[self.vocabulary_]
        self.cluster_ids_ = self.clustering.fit_predict(self.word_vectors_)
        return self
    def get_clusters_words(self):
""" return the words in each cluster
Returns
-------
dict
keys are cluster ids, values are lists of words
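
        Examples
        --------
        On a fitted instance ``wc`` (see the class-level example)::

            clusters = wc.get_clusters_words()
            # keys are cluster ids as strings, e.g. clusters["0"]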
"""
words_cluster = {}
        for cluster_id in np.unique(self.cluster_ids_):
            words_cluster[str(cluster_id)] = []
for i, word in enumerate(self.vocabulary_):
label = str(self.cluster_ids_[i])
words_cluster[label].append(word)
return words_cluster
def embed_corpus(X, n_clusters, clustering, **kwargs):
""" transforms X into vector of cluster affinities
..deprecated use `WordClustering` object instead
Parameters
----------
X : iterable of iterable, (length=n)
corpus of document
clustering : sklearn.cluster object
instanciated clustering algorithm
Returns
-------
np.ndarray, shape=(n, n_clusters)
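
    Examples
    --------
    A minimal sketch; ``docs`` stands for a hypothetical list of token
    lists::

        vectors, cluster_ids = embed_corpus(
            docs, n_clusters=10, clustering=KMeans(n_clusters=10))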
"""
    # fit
    w2v = Word2Vec(X, size=128, **kwargs)
    vocab = w2v.wv.vocab
    # take the vectors in vocabulary-index order so that
    # `vocab[word].index` addresses the matching row of `cluster_ids`
    word_vectors = w2v.wv.vectors
    pca_word_vectors = PCA(n_components=0.9).fit_transform(word_vectors)
    # clustering = AgglomerativeClustering(n_clusters, affinity='euclidean')
    cluster_ids = clustering.fit_predict(pca_word_vectors)
    # transform
    vectors = []
    for x in X:
        vector = np.zeros(n_clusters)
        count = 0
        for t in x:
            try:
                word_id = vocab[t].index
                word_cluster = cluster_ids[word_id]
                vector[word_cluster] += 1
                count += 1
            except KeyError:
                # out-of-vocabulary token: ignore it
                pass
        # guard against documents with no in-vocabulary token
        vectors.append(vector / count if count else vector)
    return np.array(vectors), cluster_ids