Source code for clintk.text_parser.parser

"""
object to parse text reports, compatible with scikit-learn transformer API

The format of typical reports to be parsed can be found in data/ directory of
this repo. `ReportsParser` enables choosing custom :

* section delimiters with `headers` attribute
* tags that dont contain informative texte (style tag for instance) with
  `remove_tags`
* additional stop words, that may be specific to a corpus or a task

@TODO add examples
@TODO change remove_sections into sections_to_keep
"""
import pandas as pd

from bs4 import BeautifulSoup
from sklearn.base import BaseEstimator
from .section_manager import reduce_dic
from .parser_utils import main_parser, clean_string
from clintk.text2vec.tools import text_normalize
from multiprocessing.pool import Pool



class ReportsParser(BaseEstimator):
    """ A parser for html-like text reports

    Parameters
    ----------
    strategy : string, default='strings'
        defines the type of object returned by the transformation.
        If 'strings', each line of the returned df is a string;
        'strings' is to be used for CountVectorizer and TfidfVectorizer.
        If 'tokens', the string is split into a list of words;
        'tokens' is to be used for gensim's Word2Vec and Doc2Vec models.

    sections : tuple or None, default=None
        tuple containing the names of the sections to keep;
        if None, keep all the sections

    remove_tags : list, default=['h4', 'table', 'link', 'style']
        list of tags to remove from the html page

    col_name : str, default='report'
        name of the column that contains the reports when X is a DataFrame

    headers : str or None, default='h3'
        name of the html tag that delimits the sections in the document

    is_html : bool, default=True
        boolean indicating whether the structure of the reports is strictly
        html or not. Check documentation usage for details.

    stop_words : list, default=[]
        additional words to remove from the text, specific to the kind of
        parsed document

    verbose : bool, default=False
        enables verbose output during parsing

    norm : bool, default=True
        whether to normalise the text (stop-word removal, lemmatization,
        etc.)

    n_jobs : int, default=1
        number of CPU cores to use; if -1, all available cores are used

    See Also
    --------
    .text_parser module : which contains the core functions to parse each
        text
    """
    def __init__(self, strategy='strings', sections=None,
                 remove_tags=['h4', 'table', 'link', 'style'],
                 col_name='report', headers='h3', is_html=True,
                 stop_words=[], norm=True, verbose=False, n_jobs=1):
        self.strategy = strategy
        self.sections = sections
        self.remove_tags = remove_tags
        self.headers = headers
        self.is_html = is_html
        self.col_name = col_name
        self.verbose = verbose
        self.norm = norm
        self.stop_words = stop_words
        self.n_jobs = n_jobs
    def fit(self, X, y=None):
        # the parser is stateless: fit is a no-op kept for scikit-learn
        # API compatibility
        return self
    def transform(self, X):
        """ Parses the reports in input

        Parameters
        ----------
        X : pd.Series or pd.DataFrame
            each entry is a string defining a report

        Returns
        -------
        pd.Series
            each entry is either a string or a list of words, depending on
            the strategy
        """
        if isinstance(X, pd.DataFrame):
            # keep only the column that contains the reports
            X = X[self.col_name]

        # parse the documents in parallel; Pool() with no argument uses all
        # available cores
        if self.n_jobs == -1:
            pool = Pool()
        else:
            pool = Pool(self.n_jobs)

        res = pool.map(self._fetch_doc, X)
        pool.close()
        pool.join()

        return pd.Series(res)
    def _fetch_doc(self, html):
        """ Parses one html document using `self` parameters

        The method is protected as it only exists to facilitate the
        serialization of the main loop in `transform`.

        Parameters
        ----------
        html : str

        Returns
        -------
        str or list of str, depending on `self.strategy`
        """
        if self.headers is None:
            # no section delimiter: keep the plain text of the whole page
            text = clean_string(BeautifulSoup(str(html), 'html.parser').text)
        else:
            # parse the html and split it into sections delimited by
            # self.headers, then keep only the requested sections
            dico = main_parser(html, self.is_html, self.verbose,
                               self.remove_tags, headers=self.headers)
            text = reduce_dic(dico, self.sections).strip()

        if self.norm:
            # text_normalize returns the text as a list of normalized tokens
            text = text_normalize(text, self.stop_words, stem=False)

        if self.strategy == 'strings':
            # a single string per report
            return ' '.join(text) if self.norm else text
        else:
            # 'tokens': a list of words per report
            return text if self.norm else text.split(' ')
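

# A minimal usage sketch, not part of the library: the report string and
# section names below are made up, and the printed output assumes the
# behaviour of main_parser / text_normalize described above.
if __name__ == '__main__':
    reports = pd.Series([
        '<h3>history</h3><p>Patient admitted in 2017</p>'
        '<h3>conclusion</h3><p>Stable disease</p>',
    ])

    # one string per report, ready for CountVectorizer / TfidfVectorizer
    string_parser = ReportsParser(strategy='strings', headers='h3')
    print(string_parser.fit(reports).transform(reports))

    # one list of tokens per report, ready for gensim's Word2Vec / Doc2Vec
    token_parser = ReportsParser(strategy='tokens', headers='h3')
    print(token_parser.fit(reports).transform(reports))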