# Source code for clintk.text2vec.tools

"""

"""
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import FrenchStemmer


def avg_document(model, document):
    """Compute the average word2vec vector of the words in ``document``.

    Parameters
    ----------
    model : word2vec.KeyedVectors instance
        Trained embedding model; must expose ``vocab``, ``vector_size``
        and item access (``model[word]``).
    document : list
        Tokenized document to fold into a single vector.

    Returns
    -------
    avg : np.ndarray of shape (model.vector_size,)
        Mean of the vectors of the in-vocabulary words, or the zero
        vector when no word of ``document`` is in the model vocabulary.
    """
    vocab = model.vocab
    n_features = model.vector_size
    # Accumulate a running sum instead of np.vstack-ing inside the loop:
    # vstack copies the whole accumulated array on every iteration,
    # making the original implementation quadratic in document length.
    total = np.zeros((n_features,), dtype='float64')
    count = 0
    for word in document:
        if word in vocab:
            total += model[word]
            count += 1
    if count > 0:
        return total / count
    # No in-vocabulary word: return the zero vector, as the original did.
    return total
def avg_corpus(model, corpus):
    """Fold every document of ``corpus`` into its average word vector.

    Parameters
    ----------
    model : gensim.word2vec.Word2Vec instance
        Trained word2vec model.
    corpus : iterable of iterables
        Each element is a tokenized document.

    Returns
    -------
    np.ndarray
        One row per document, holding that document's average vector
        in the embedding space.
    """
    vectors = []
    for document in corpus:
        vectors.append(avg_document(model, document))
    return np.array(vectors)
def text_normalize(text, stop_words, stem=False):
    """Tokenize and clean ``text`` to optimize later vectorization.

    Performs French tokenization, stop-word removal and (optionally)
    stemming. Stemming for French is not accurate enough yet.
    @TODO lemmatization for French + adapt stemmer for other languages.

    Parameters
    ----------
    text : string
        Text to normalize.
    stop_words : list
        Additional stop words to remove, on top of NLTK's French list.
    stem : bool, optional (default False)
        If True, stems the remaining tokens with ``FrenchStemmer`` to
        fetch the meaning of the words. However, this functionality
        does not perform well with French.

    Returns
    -------
    list of str
        The cleansed, normalized tokens of ``text``.
        (The original docstring said "string", but the function has
        always returned the token list.)
    """
    # Set membership is O(1) per token; the original list was O(len(sw)).
    sw = set(stopwords.words('french')) | set(stop_words)
    tokens = word_tokenize(text, 'french')
    tokens_filter = [word for word in tokens if word not in sw]
    if stem:
        stemmer = FrenchStemmer()
        tokens_filter = [stemmer.stem(word) for word in tokens_filter]
    return tokens_filter