DB-GPT/dbgpt/util/string_utils.py
明天 b124ecc10b
feat: (0.6)New UI (#1855)
Co-authored-by: 夏姜 <wenfengjiang.jwf@digital-engine.com>
Co-authored-by: aries_ckt <916701291@qq.com>
Co-authored-by: wb-lh513319 <wb-lh513319@alibaba-inc.com>
Co-authored-by: csunny <cfqsunny@163.com>
2024-08-21 17:37:45 +08:00

132 lines
3.6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
import string
from typing import Dict
def is_all_chinese(text):
    """Determine whether the string is pure Chinese.

    Args:
        text: The string to test.

    Returns:
        True when ``text`` is non-empty and every character lies in the
        range U+4E00..U+9FA5; False otherwise (including the empty string).
    """
    # fullmatch instead of match + "^...$": "$" also matches just before a
    # trailing newline, which would let "中文\n" pass as pure Chinese.
    return re.fullmatch(r"[\u4e00-\u9fa5]+", text) is not None
def contains_chinese(text):
    """Tell whether at least one character of ``text`` is in U+4E00..U+9FA5.

    Args:
        text: The string to scan.

    Returns:
        True if any such character occurs anywhere in ``text``, else False.
    """
    first_hit = re.search(r"[\u4e00-\u9fa5]", text)
    if first_hit is None:
        return False
    return True
def is_number_chinese(text):
    """Determine whether the string consists only of digits and Chinese.

    Args:
        text: The string to test.

    Returns:
        True when ``text`` is non-empty and every character is either a
        decimal digit (``\\d``) or lies in U+4E00..U+9FA5; False otherwise.
    """
    # fullmatch instead of match + "^...$": "$" also matches just before a
    # trailing newline, so "12\n" used to be accepted.
    return re.fullmatch(r"[\d\u4e00-\u9fa5]+", text) is not None
def is_chinese_include_number(text):
    """Determine whether the string is Chinese, optionally containing digits.

    The string must start with at least one character in U+4E00..U+9FA5;
    after that, any mix of decimal digits and characters from the same range
    is allowed.

    Args:
        text: The string to test.

    Returns:
        True when ``text`` matches the shape described above, else False.
    """
    # fullmatch instead of match + "^...$": "$" also matches just before a
    # trailing newline, so "中文\n" used to be accepted.
    return re.fullmatch(r"[\u4e00-\u9fa5]+[\d\u4e00-\u9fa5]*", text) is not None
def is_scientific_notation(string):
    """Check whether a value is written as a decimal or scientific number.

    The value is converted with ``str()`` first, so non-string inputs such as
    floats are accepted. Note that the exponent part is optional in the
    pattern, so plain decimals like ``"123"`` or ``".5"`` also return True.

    Args:
        string: The value to test. (The parameter name shadows the ``string``
            module inside this function; it is kept for caller compatibility.)

    Returns:
        True when the text is an optionally signed decimal number with an
        optional ``e``/``E`` exponent, else False.
    """
    # Optional sign, optional integer part, optional dot, mandatory digits,
    # optional exponent.
    pattern = r"[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?"
    # fullmatch instead of match + "^...$": "$" also matches just before a
    # trailing newline, so "1e5\n" used to be accepted.
    return re.fullmatch(pattern, str(string)) is not None
def extract_content(long_string, s1, s2, is_include: bool = False) -> Dict[int, str]:
    """Extract every piece of text enclosed between ``s1`` and ``s2``.

    Each occurrence of ``s1`` is paired with the next ``s2`` that follows it.
    Occurrences of ``s1`` that have no closing ``s2`` are ignored.

    Args:
        long_string: The text to scan.
        s1: The opening delimiter.
        s2: The closing delimiter.
        is_include: When True the returned fragments include both delimiters;
            when False only the text between them is returned.

    Returns:
        A dict mapping the index of each ``s1`` occurrence to the extracted,
        non-empty fragment.
    """
    match_map = {}
    # Existing behavior kept as-is: in inclusive mode the search for s2 starts
    # one character later, so at least one character must sit between the
    # delimiters for a match.
    search_offset = len(s1) + (1 if is_include else 0)
    start_index = long_string.find(s1)
    while start_index != -1:
        end_index = long_string.find(s2, start_index + search_offset)
        if end_index == -1:
            # No closing delimiter. Slicing with -1 would return junk (e.g.
            # everything but the last character). Any later s1 starts further
            # right and therefore cannot find an s2 either, so stop here.
            break
        if is_include:
            extracted_content = long_string[start_index : end_index + len(s2)]
        else:
            extracted_content = long_string[start_index + len(s1) : end_index]
        if extracted_content:
            match_map[start_index] = extracted_content
        # Advance by one character only, so nested/overlapping openers are
        # still visited.
        start_index = long_string.find(s1, start_index + 1)
    return match_map
def extract_content_open_ending(long_string, s1, s2, is_include: bool = False):
    """Extract text between ``s1`` and ``s2``, tolerating a missing ``s2``.

    Works like a delimiter-based extractor, except that an ``s1`` without a
    following ``s2`` is treated as running to the end of ``long_string``.

    Args:
        long_string: The text to scan.
        s1: The opening delimiter.
        s2: The closing delimiter.
        is_include: When True the returned fragments include the delimiters
            (the closing one only if it is present); when False only the text
            between them is returned.

    Returns:
        A dict mapping the index of each ``s1`` occurrence to the extracted,
        non-empty fragment.
    """
    match_map = {}
    # Existing behavior kept as-is: in inclusive mode the search for s2 starts
    # one character later than in exclusive mode.
    search_offset = len(s1) + (1 if is_include else 0)
    start_index = long_string.find(s1)
    while start_index != -1:
        # Search once, from the position where the content begins. The former
        # pre-check (`find(s2, start_index) <= 0`) mistook a hit at index 0
        # for "not found" and could see s2 inside s1 itself (e.g. when both
        # delimiters are "```"), producing -1 slices.
        end_index = long_string.find(s2, start_index + search_offset)
        if end_index == -1:
            # Open ending: take everything up to the end of the text.
            end_index = len(long_string)
        if is_include:
            # For an open ending the slice bound exceeds len(long_string);
            # slicing clamps it, so the fragment simply runs to the end.
            extracted_content = long_string[start_index : end_index + len(s2)]
        else:
            extracted_content = long_string[start_index + len(s1) : end_index]
        if extracted_content:
            match_map[start_index] = extracted_content
        start_index = long_string.find(s1, start_index + 1)
    return match_map
def str_to_bool(s):
    """Convert a textual flag to a boolean.

    The comparison is case-insensitive. ``"true"``, ``"t"``, ``"1"``,
    ``"yes"`` and ``"y"`` are True, as is any text that starts with
    ``"true"``. Everything else is False, including ``"false"``, ``"f"``,
    ``"0"``, ``"no"``, ``"n"``, the empty string and unrecognized text.

    Args:
        s: The string to interpret.

    Returns:
        The boolean value represented by ``s``.
    """
    # Lower-case once instead of once per branch.
    lowered = s.lower()
    # The former explicit false-token branch and the final else both returned
    # False, so a single expression covers every case.
    return lowered in ("true", "t", "1", "yes", "y") or lowered.startswith("true")
def _to_str(x, charset="utf8", errors="strict"):
if x is None or isinstance(x, str):
return x
if isinstance(x, bytes):
return x.decode(charset, errors)
return str(x)
# Full-width / CJK punctuation, written as escapes so the characters cannot be
# silently dropped or confused with look-alikes by editors and web tooling.
# NOTE(review): the literals were empty strings in the reviewed copy, which
# made the set useless; this list of 18 marks is a reconstruction — confirm it
# against the upstream file.
_CHINESE_PUNCTUATION = (
    "\u3002"  # ideographic full stop
    "\uff0c"  # full-width comma
    "\uff01"  # full-width exclamation mark
    "\uff1f"  # full-width question mark
    "\uff1b"  # full-width semicolon
    "\uff1a"  # full-width colon
    "\u201c"  # left double quotation mark
    "\u201d"  # right double quotation mark
    "\u2018"  # left single quotation mark
    "\u2019"  # right single quotation mark
    "\uff08"  # full-width left parenthesis
    "\uff09"  # full-width right parenthesis
    "\u3010"  # left black lenticular bracket
    "\u3011"  # right black lenticular bracket
    "\u2014"  # em dash
    "\u2026"  # horizontal ellipsis
    "\u300a"  # left double angle bracket
    "\u300b"  # right double angle bracket
)


def remove_trailing_punctuation(s):
    """Remove trailing punctuation from a string.

    Both ASCII punctuation (``string.punctuation``) and common full-width /
    CJK punctuation marks are stripped from the right-hand end.

    Args:
        s: The string to trim. Falsy values (``""``, ``None``) are returned
            unchanged.

    Returns:
        ``s`` without any trailing punctuation characters.
    """
    if not s:
        return s
    # rstrip treats its argument as a set of characters, which is exactly the
    # "drop the last char while it is punctuation" loop.
    return s.rstrip(string.punctuation + _CHINESE_PUNCTUATION)