Source code for clintk.cat2vec.tools

"""
sample script for categorical encoding
"""

import re
import unidecode
import pandas as pd

import clintk.text2vec.tools


def normalize_cat(X, strat='tokens'):
    """Normalize category names into lowercase ASCII text.

    Each entry is transliterated with ``unidecode``, lowercased, has every
    run of non-alphanumeric characters (underscore included) replaced by a
    single space, and is then passed to
    ``clintk.text2vec.tools.text_normalize`` with the ``['sw']`` option.

    Parameters
    ----------
    X : iterable
        Category names. Entries on which ``unidecode`` raises
        ``AttributeError`` are converted with ``str()`` first.

    strat : str, ('tokens', 'strings'), default='tokens'
        if 'tokens', words in a category are kept split, joined by spaces
        (use this for embedding categories by an NLP approach)
        if 'strings', words are joined by underscores so that each category
        is considered as a single word

    Returns
    -------
    pandas.Series
        same size as input, each entry corresponding to the normalized
        category name

    Raises
    ------
    ValueError
        if `strat` is neither 'tokens' nor 'strings'
    """
    # Validate up front: previously an unknown strategy fell through every
    # branch and silently returned None after doing all the work.
    if strat == 'tokens':
        separator = ' '
    elif strat == 'strings':
        separator = '_'
    else:
        raise ValueError(
            "strat must be 'tokens' or 'strings', got {!r}".format(strat))

    # Compiled once, outside the loop; raw string avoids the invalid
    # escape-sequence warning for '\W'.
    non_word = re.compile(r'[\W_]+')

    normalized = []
    for x in X:
        try:
            text = unidecode.unidecode(x)
        except AttributeError:
            # unidecode could not handle the entry as-is: stringify it first
            text = unidecode.unidecode(str(x))

        cleaned = non_word.sub(' ', text.lower())
        # NOTE(review): 'sw' presumably enables stop-word removal in
        # text_normalize -- confirm against clintk.text2vec.tools.
        normalized.append(
            clintk.text2vec.tools.text_normalize(cleaned, ['sw']))

    # text_normalize's result is treated as an iterable of word strings.
    return pd.Series(normalized).apply(lambda words: separator.join(words))