Source code for clintk.text_parser.parser_utils

"""
This module contains the functions used to parse one report, i.e. the
functions that split the HTML text into a dictionary of sections.

Only `main_parser` is used in practice, since all the other functions
are auxiliary. Moreover, they should not be used as-is, since they are
wrapped in the `ReportsParser` object for convenience.

"""
import re

from bs4 import BeautifulSoup
from unidecode import unidecode


def main_parser(text, is_html, verbose, remove, headers):
    """ takes as input the string of the report and splits it into sections

    Parameters
    ----------
    text : str
        the report, as HTML or plain text
    is_html : bool
        set True if `text` is actually structured as HTML
    verbose : bool
        True for logging
    remove : list
        names of the tags to remove because they contain useless information
    headers : str
        name of the tags that delimit the sections

    Returns
    -------
    dict
        keys are section names, values are the contents of the sections
    """
    try:
        soup = BeautifulSoup(text, 'html.parser')
        # re-parse the prettified markup to normalise the tree
        soup = BeautifulSoup(soup.prettify(), 'html.parser')
    except TypeError:
        if verbose:
            print('{} can not be parsed'.format(text))
        soup = BeautifulSoup('', 'html.parser')

    clean_soup(soup, remove, verbose)
    return parse_soup(soup, is_html, verbose, headers)
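
# Illustrative sketch (not part of the original module): how `main_parser`
# splits a toy HTML report into sections. The sample report below is made up,
# and the output shown is what the function is expected to return on it.
#
#     >>> report = ('<h3>Diagnosis</h3><p>Stable disease.</p>'
#     ...           '<h3>Plan</h3><p>Continue treatment.</p>')
#     >>> main_parser(report, is_html=True, verbose=False, remove=[],
#     ...             headers='h3')
#     {'diagnosis': 'stable disease', 'plan': 'continue treatment'}
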
def text_between_tags(tag1, tag2, is_html):
    """ fetches the text between `tag1` and `tag2`

    The soup should already be cleansed of useless tags such as span

    Parameters
    ----------
    tag1 : bs4.Tag
        tag at which the text starts
    tag2 : bs4.Tag
        tag at which the text stops
    is_html : bool
        True if the text is actually structured as HTML

    Returns
    -------
    str
        all the text between tag1 and tag2
    """
    if is_html:
        res = tag1.text
        next_tag = tag1.find_next()
        # iterates over tags to append text to res
        while next_tag != tag2:
            res += next_tag.text + ' '
            next_tag = next_tag.find_next()
        return clean_string(res)
    else:
        # in plain text mode the content lives in the siblings of the tags
        res = tag1.next_sibling.strip()
        next_tag = tag1.find_next()
        while next_tag != tag2:
            res += next_tag.next_sibling.strip() + ' '
            next_tag = next_tag.find_next()
        return clean_string(res)
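
# Illustrative sketch (not part of the original module): extracting the text
# that sits between two header tags of a pre-parsed soup.
#
#     >>> soup = BeautifulSoup('<h3>A</h3><p>first section</p><h3>B</h3>',
#     ...                      'html.parser')
#     >>> start, stop = soup.find_all('h3')
#     >>> text_between_tags(start.find_next(), stop, is_html=True)
#     'first section'
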
def last_tag_text(final_tag, is_html):
    """ fetches the text from the last tag to the end of the document

    Parameters
    ----------
    final_tag : bs4.Tag
        the last section header of the document
    is_html : bool
        True if the text is actually structured as HTML

    Returns
    -------
    str
        content of the last section
    """
    if is_html:
        res = ''
        cur_tag = final_tag.find_next()
        # appends the text of every remaining tag
        while cur_tag is not None:
            res += cur_tag.text + ' '
            cur_tag = cur_tag.find_next()
        return clean_string(res)
    else:
        return clean_string(final_tag.next_sibling)
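
# Illustrative sketch (not part of the original module): collecting the text
# that follows the last header of a document.
#
#     >>> soup = BeautifulSoup('<h3>Conclusion</h3><p>End of report.</p>',
#     ...                      'html.parser')
#     >>> last_tag_text(soup.find('h3'), is_html=True)
#     'end of report'
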
def parse_soup(soup, is_html, verbose, headers='h3'):
    """ splits the soup between headers and returns a dictionary

    Parameters
    ----------
    soup : BeautifulSoup
    is_html : bool
        True if the text is exactly in HTML format
    verbose : bool
        whether to print information about the parsing
    headers : str
        name of the tags that delimit the sections

    Returns
    -------
    dict
        keys are section names, values are section contents
    """
    res_dic = {}
    header_list = list(soup.find_all(headers))

    # each section spans from one header to the next
    for index, header in enumerate(header_list[:-1]):
        try:
            if is_html:
                new_text = text_between_tags(header.find_next(),
                                             header_list[index + 1],
                                             is_html)
            else:
                new_text = text_between_tags(header,
                                             header_list[index + 1],
                                             is_html)
            key = header.text
            res_dic[clean_string(key)] = new_text
        except AttributeError as e:
            if verbose:
                print('{} occurred at {}'.format(e, soup.name))

    # the last section runs from the last header to the end of the document
    try:
        final_text = last_tag_text(header_list[-1], is_html)
        final_key = header_list[-1].text
        res_dic[clean_string(final_key)] = final_text
    except IndexError as e:
        if verbose:
            print('{} current report {} is empty'.format(e, soup.name))

    if verbose:
        print('Document {} has been parsed'.format(soup.name))

    return res_dic
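
# Illustrative sketch (not part of the original module): `parse_soup` maps
# each header to the text that follows it, up to the next header.
#
#     >>> soup = BeautifulSoup('<h3>History</h3><p>none</p>'
#     ...                      '<h3>Exam</h3><p>normal</p>', 'html.parser')
#     >>> parse_soup(soup, is_html=True, verbose=False, headers='h3')
#     {'history': 'none', 'exam': 'normal'}
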
def clean_soup(soup, remove, verbose):
    """ removes the tags listed in `remove` from the soup

    The transformation is done in place.

    Parameters
    ----------
    soup : BeautifulSoup instance
    remove : list
        names of the tags to remove from the soup
    verbose : bool
        controls logging

    Returns
    -------
    None
        the input soup is modified in place
    """
    # remove the first <span style="color: red"> tag, which holds the
    # color legend
    try:
        soup.find('span', attrs={'style': "color: red"}).extract()
    except AttributeError:
        if verbose:
            print('No legend found')

    # remove every occurrence of the tags listed in `remove`
    for tag in remove:
        cont = True
        while cont:
            try:
                soup.find(tag).extract()
            except AttributeError:
                if verbose:
                    print('No tag {} in the soup {}'.format(tag, soup.name))
                cont = False
    return
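
# Illustrative sketch (not part of the original module): `clean_soup` strips
# the red legend span and every occurrence of the tags listed in `remove`.
#
#     >>> soup = BeautifulSoup('<span style="color: red">legend</span>'
#     ...                      '<h3>Title</h3><br/><p>text</p>', 'html.parser')
#     >>> clean_soup(soup, remove=['br'], verbose=False)
#     >>> soup.find('span') is None and soup.find('br') is None
#     True
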
def clean_string(s):
    """ removes non-alphanumeric characters from the string `s` and
    returns it lowercased

    @TODO change function name to `to_alpha_num`

    Parameters
    ----------
    s : str

    Returns
    -------
    str
        the string with only alphanumeric characters, lowercased
    """
    try:
        s_decoded = unidecode(s).replace('\n', '').replace('  ', ' ')
        pattern = re.compile(r'[\W_]+')
        return pattern.sub(' ', s_decoded).lower().strip()
    except Exception:
        # non-string input (e.g. None): fall back to an empty string
        return s or ''
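
# Illustrative sketch (not part of the original module): `clean_string`
# transliterates accents, drops non-alphanumeric characters and lowercases.
#
#     >>> clean_string('Évaluation Clinique :')
#     'evaluation clinique'
#     >>> clean_string(None)
#     ''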