ColossalAI/applications/ColossalQA/colossalqa/text_splitter/utils.py

import re


def remove_format(text: str) -> str:
    # if the accout of \t, \r, \v, \f is less than 3, replace \t, \r, \v, \f with space
    if len(re.findall(r"\s", text.replace(" ", ""))) > 3:
        # in case this is a line of a table
        return text
    return re.sub(r"\s", " ", text)


# remove newlines
def get_cleaned_paragraph(s: str) -> str:
    text = str(s)
    text = re.sub(r"\n{3,}", r"\n", text)  # replace \n\n\n... with \n
    text = re.sub("\n\n", "", text)
    lines = text.split("\n")
    lines_remove_format = [remove_format(line) for line in lines]
    return lines_remove_format