# DB-GPT/dbgpt/util/string_utils.py

import re
import string
from typing import Dict


def is_all_chinese(text):
    """Determine whether the string consists purely of Chinese characters.

    Only code points in the range U+4E00..U+9FA5 count as Chinese here.

    Args:
        text: The string to check.

    Returns:
        True if ``text`` is non-empty and every character is in the range
        above, otherwise False.
    """
    # fullmatch is used instead of "^...$" with re.match because "$" also
    # matches just before a trailing newline, which would accept "中文\n".
    return re.fullmatch(r"[\u4e00-\u9fa5]+", text) is not None


def contains_chinese(text):
    """Report whether ``text`` holds at least one Chinese character.

    A character counts as Chinese when it lies in the range U+4E00..U+9FA5.

    Args:
        text: The string to scan.

    Returns:
        True if any such character occurs anywhere in ``text``, else False.
    """
    first_hit = re.search(r"[\u4e00-\u9fa5]", text)
    # A match object is always truthy; a failed search yields None.
    return bool(first_hit)


def is_number_chinese(text):
    """Determine whether the string consists only of digits and Chinese.

    Digits and Chinese characters (U+4E00..U+9FA5) may appear in any order
    and any mix; a string of only digits or only Chinese also qualifies.

    Args:
        text: The string to check.

    Returns:
        True if ``text`` is non-empty and every character is a digit or a
        Chinese character, otherwise False.
    """
    # fullmatch is used instead of "^...$" with re.match because "$" also
    # matches just before a trailing newline, which would accept "中文1\n".
    return re.fullmatch(r"[\d\u4e00-\u9fa5]+", text) is not None


def is_chinese_include_number(text):
    """Determine whether the string is Chinese, optionally with digits.

    The string must start with a Chinese character (U+4E00..U+9FA5); the
    remaining characters may be any mix of digits and Chinese characters.

    Args:
        text: The string to check.

    Returns:
        True if ``text`` starts with a Chinese character and contains only
        Chinese characters and digits, otherwise False.
    """
    # fullmatch is used instead of "^...$" with re.match because "$" also
    # matches just before a trailing newline, which would accept "中文1\n".
    return re.fullmatch(r"[\u4e00-\u9fa5]+[\d\u4e00-\u9fa5]*", text) is not None


def is_scientific_notation(string):
    """Check whether a value's text form is a number, optionally with an exponent.

    The value is converted with ``str()`` first, so numeric objects can be
    passed directly. The exponent part is optional, which means plain
    integers and decimals such as ``"123"`` or ``".5"`` are accepted too.

    Args:
        string: The value to check; any object accepted by ``str()``.

    Returns:
        True if the text is an optionally signed decimal number with an
        optional ``e``/``E`` exponent, otherwise False.
    """
    # Regular expression for scientific notation: optional sign, digits with
    # an optional decimal point, then an optional signed exponent.
    pattern = r"[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?"
    # fullmatch is used instead of "^...$" with re.match because "$" also
    # matches just before a trailing newline, which would accept "1e5\n".
    return re.fullmatch(pattern, str(string)) is not None


def extract_content(long_string, s1, s2, is_include: bool = False) -> Dict[int, str]:
    """Extract every piece of text enclosed between ``s1`` and ``s2``.

    Each occurrence of ``s1`` is paired with the next ``s2`` after it.
    Occurrences of ``s1`` with no closing ``s2`` are ignored.

    Args:
        long_string: The text to search.
        s1: The opening delimiter.
        s2: The closing delimiter.
        is_include: If True, each extracted value includes both delimiters.
            In this mode the search for ``s2`` starts one character past
            the end of ``s1``, so an ``s2`` immediately adjacent to ``s1``
            is skipped.

    Returns:
        A dict mapping the index of each ``s1`` occurrence to its extracted
        text. Empty extractions are omitted.
    """
    match_map: Dict[int, str] = {}
    # Where the search for the closing delimiter begins, relative to s1.
    search_offset = len(s1) + (1 if is_include else 0)
    start_index = long_string.find(s1)
    while start_index != -1:
        end_index = long_string.find(s2, start_index + search_offset)
        if end_index == -1:
            # No closing delimiter from here on. Using -1 as a slice bound
            # would silently return wrong text, and later s1 occurrences
            # search from even further right, so none of them can match.
            break
        if is_include:
            extracted_content = long_string[start_index : end_index + len(s2)]
        else:
            extracted_content = long_string[start_index + len(s1) : end_index]
        if extracted_content:
            match_map[start_index] = extracted_content
        # Advance by one so overlapping/nested openings are also visited.
        start_index = long_string.find(s1, start_index + 1)
    return match_map


def extract_content_open_ending(long_string, s1, s2, is_include: bool = False):
    """Extract text between ``s1`` and ``s2``, tolerating a missing ``s2``.

    Each occurrence of ``s1`` is paired with the next ``s2`` after it. When
    no closing ``s2`` follows, the extraction runs to the end of
    ``long_string`` (an "open ending").

    Args:
        long_string: The text to search.
        s1: The opening delimiter.
        s2: The closing delimiter.
        is_include: If True, each extracted value includes the opening
            delimiter and, when present, the closing one. In this mode the
            search for ``s2`` starts one character past the end of ``s1``,
            so an ``s2`` immediately adjacent to ``s1`` is skipped.

    Returns:
        A dict mapping the index of each ``s1`` occurrence to its extracted
        text. Empty extractions are omitted.
    """
    match_map = {}
    # Where the search for the closing delimiter begins, relative to s1.
    search_offset = len(s1) + (1 if is_include else 0)
    start_index = long_string.find(s1)
    while start_index != -1:
        # Look for the closing delimiter only after the opening one, so that
        # identical delimiters (e.g. code fences) do not match themselves.
        end_index = long_string.find(s2, start_index + search_offset)
        if end_index == -1:
            # Open ending: take everything up to the end of the text.
            end_index = len(long_string)
        if is_include:
            # Slicing past the end is harmless when the ending is open.
            extracted_content = long_string[start_index : end_index + len(s2)]
        else:
            extracted_content = long_string[start_index + len(s1) : end_index]
        if extracted_content:
            match_map[start_index] = extracted_content
        # Advance by one so overlapping/nested openings are also visited.
        start_index = long_string.find(s1, start_index + 1)
    return match_map


def str_to_bool(s):
    """Interpret a string as a boolean flag.

    Matching is case-insensitive. A value is true when it begins with
    "true" or is exactly one of "true", "t", "1", "yes", "y". Every other
    value, including the explicit false spellings, yields False.

    Args:
        s: The string to interpret.

    Returns:
        True for a recognised truthy spelling, otherwise False.
    """
    lowered = s.lower()
    if lowered.startswith("true"):
        return True
    return lowered in ("true", "t", "1", "yes", "y")


def _to_str(x, charset="utf8", errors="strict"):
    if x is None or isinstance(x, str):
        return x

    if isinstance(x, bytes):
        return x.decode(charset, errors)

    return str(x)


def remove_trailing_punctuation(s):
    """Remove trailing punctuation from a string.

    Both ASCII punctuation (``string.punctuation``) and a fixed set of
    common Chinese punctuation marks are stripped from the end of ``s``.
    Punctuation elsewhere in the string is left untouched.

    Args:
        s: The string to clean. Falsy values (``""`` or ``None``) are
            returned unchanged.

    Returns:
        ``s`` without any trailing punctuation characters.
    """
    if not s:
        # Nothing to strip; hand back "" or None as given.
        return s
    chinese_punctuation = "。，！？；：“”‘’（）【】—…《》"
    # A single rstrip pass replaces the per-character re-slicing loop, which
    # copied the string once for every stripped character.
    return s.rstrip(string.punctuation + chinese_punctuation)