from typing import Callable, List


def split_text_keep_separator(text: str, separator: str) -> List[str]:
    """Split text with separator, keeping the separator at the start of each
    split after the first."""
    parts = text.split(separator)
    # Re-attach the separator to every part except the first, then drop
    # empty strings (e.g. when the text begins with the separator).
    result = [separator + s if i > 0 else s for i, s in enumerate(parts)]
    return [s for s in result if s]
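
# Illustrative usage (example added for clarity, not part of the original
# module): the separator ends up at the start of each split after the first.
#   split_text_keep_separator("a\nb\nc", "\n") == ["a", "\nb", "\nc"]
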
def split_by_sep(sep: str, keep_sep: bool = True) -> Callable[[str], List[str]]:
    """Split text by separator."""
    if keep_sep:
        return lambda text: split_text_keep_separator(text, sep)
    else:
        return lambda text: text.split(sep)
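
# Illustrative usage (added for clarity):
#   split_by_sep(",")("x,y") == ["x", ",y"]
#   split_by_sep(",", keep_sep=False)("x,y") == ["x", "y"]
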
def split_by_char() -> Callable[[str], List[str]]:
    """Split text by character."""
    return lambda text: list(text)
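
# Illustrative usage (added for clarity):
#   split_by_char()("abc") == ["a", "b", "c"]
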
def split_by_sentence_tokenizer() -> Callable[[str], List[str]]:
    """Split text into sentences using nltk's punkt tokenizer."""
    import os

    import nltk

    from llama_index.utils import get_cache_dir

    cache_dir = get_cache_dir()
    nltk_data_dir = os.environ.get("NLTK_DATA", cache_dir)

    # Update the nltk data path so that nltk can find the downloaded data.
    if nltk_data_dir not in nltk.data.path:
        nltk.data.path.append(nltk_data_dir)

    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt", download_dir=nltk_data_dir)

    tokenizer = nltk.tokenize.PunktSentenceTokenizer()

    # Get the spans, then return the sentences.
    # Use the start index of each span; instead of a span's own end,
    # use the start of the next span if available, so that the whitespace
    # between sentences stays attached to the preceding sentence.
    def split(text: str) -> List[str]:
        spans = list(tokenizer.span_tokenize(text))
        sentences = []
        for i, span in enumerate(spans):
            start = span[0]
            if i < len(spans) - 1:
                end = spans[i + 1][0]
            else:
                end = len(text)
            sentences.append(text[start:end])

        return sentences

    return split
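
# Illustrative usage (added for clarity; requires nltk and downloads the
# punkt model on first use). Note the trailing whitespace kept with each
# sentence, since splits run to the start of the next span:
#   split = split_by_sentence_tokenizer()
#   split("Hello world. How are you?") == ["Hello world. ", "How are you?"]
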
def split_by_regex(regex: str) -> Callable[[str], List[str]]:
    """Split text by regex."""
    import re

    return lambda text: re.findall(regex, text)
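
# Illustrative usage (added for clarity): re.findall returns the matches
# themselves, so any text not matched by the regex is dropped.
#   split_by_regex(r"[^.]+\.?")("a. b. c") == ["a.", " b.", " c"]
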
def split_by_phrase_regex() -> Callable[[str], List[str]]:
    """Split text by phrase regex.

    This regular expression splits sentences into phrases, where each
    phrase is a sequence of one or more non-comma, non-period, and
    non-semicolon characters (the CJK full stop 。 also counts as a
    period), followed by an optional comma, period, or semicolon. The
    delimiters themselves are kept at the end of each phrase rather
    than returned as separate items.
    """
    regex = "[^,.;。]+[,.;。]?"
    return split_by_regex(regex)
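
# Illustrative usage (added for clarity): delimiters stay attached to the
# end of each phrase.
#   split_by_phrase_regex()("hello, world; bye.") == ["hello,", " world;", " bye."]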