# Source code for clintk.text2vec.tools

"""

"""
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import FrenchStemmer


def avg_document(model, document):
    """Compute the average word2vec vector of the words in ``document``.

    Parameters
    ----------
    model : word2vec.KeyedVectors instance
        Trained embedding model; must expose ``vocab``, ``vector_size``
        and item access (``model[word]``).
    document : list
        Tokenized document to fold into a single vector.

    Returns
    -------
    avg : np.ndarray of shape (model.vector_size,)
        Mean of the vectors of the in-vocabulary words, or the zero
        vector when no word of ``document`` is in the model vocabulary.
    """
    vocab = model.vocab
    n_features = model.vector_size
    # Accumulate a running sum instead of np.vstack-ing inside the loop:
    # vstack copies the whole accumulated array on every iteration,
    # making the original implementation quadratic in document length.
    total = np.zeros((n_features,), dtype='float64')
    count = 0
    for word in document:
        if word in vocab:
            total += model[word]
            count += 1
    if count > 0:
        return total / count
    # No in-vocabulary word: return the zero vector, as the original did.
    return total
def avg_corpus(model, corpus):
    """Fold every document of ``corpus`` into its average word vector.

    Parameters
    ----------
    model : gensim.word2vec.Word2Vec instance
        Trained word2vec model.
    corpus : iterable of iterables
        Each element is a tokenized document.

    Returns
    -------
    np.ndarray
        One row per document, holding that document's average vector
        in the embedding space.
    """
    vectors = []
    for document in corpus:
        vectors.append(avg_document(model, document))
    return np.array(vectors)
def text_normalize(text, stop_words, stem=False):
    """Tokenize and clean ``text`` to optimize later vectorization.

    Performs French tokenization, stop-word removal and (optionally)
    stemming. Stemming for French is not accurate enough yet.
    @TODO lemmatization for French + adapt stemmer for other languages.

    Parameters
    ----------
    text : string
        Text to normalize.
    stop_words : list
        Additional stop words to remove, on top of NLTK's French list.
    stem : bool, optional (default False)
        If True, stems the remaining tokens with ``FrenchStemmer`` to
        fetch the meaning of the words. However, this functionality
        does not perform well with French.

    Returns
    -------
    list of str
        The cleansed, normalized tokens of ``text``.
        (The original docstring said "string", but the function has
        always returned the token list.)
    """
    # Set membership is O(1) per token; the original list was O(len(sw)).
    sw = set(stopwords.words('french')) | set(stop_words)
    tokens = word_tokenize(text, 'french')
    tokens_filter = [word for word in tokens if word not in sw]
    if stem:
        stemmer = FrenchStemmer()
        tokens_filter = [stemmer.stem(word) for word in tokens_filter]
    return tokens_filter