mirror of
				https://github.com/csunny/DB-GPT.git
				synced 2025-10-25 11:35:41 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			81 lines
		
	
	
		
			2.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			81 lines
		
	
	
		
			2.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from typing import Callable, List
 | |
| 
 | |
| 
 | |
| def split_text_keep_separator(text: str, separator: str) -> List[str]:
 | |
|     """Split text with separator and keep the separator at the end of each split."""
 | |
|     parts = text.split(separator)
 | |
|     result = [separator + s if i > 0 else s for i, s in enumerate(parts)]
 | |
|     return [s for s in result if s]
 | |
| 
 | |
| 
 | |
| def split_by_sep(sep: str, keep_sep: bool = True) -> Callable[[str], List[str]]:
 | |
|     """Split text by separator."""
 | |
|     if keep_sep:
 | |
|         return lambda text: split_text_keep_separator(text, sep)
 | |
|     else:
 | |
|         return lambda text: text.split(sep)
 | |
| 
 | |
| 
 | |
| def split_by_char() -> Callable[[str], List[str]]:
 | |
|     """Split text by character."""
 | |
|     return lambda text: list(text)
 | |
| 
 | |
| 
 | |
| def split_by_sentence_tokenizer() -> Callable[[str], List[str]]:
 | |
|     import os
 | |
| 
 | |
|     import nltk
 | |
|     from llama_index.utils import get_cache_dir
 | |
| 
 | |
|     cache_dir = get_cache_dir()
 | |
|     nltk_data_dir = os.environ.get("NLTK_DATA", cache_dir)
 | |
| 
 | |
|     # update nltk path for nltk so that it finds the data
 | |
|     if nltk_data_dir not in nltk.data.path:
 | |
|         nltk.data.path.append(nltk_data_dir)
 | |
| 
 | |
|     try:
 | |
|         nltk.data.find("tokenizers/punkt")
 | |
|     except LookupError:
 | |
|         nltk.download("punkt", download_dir=nltk_data_dir)
 | |
| 
 | |
|     tokenizer = nltk.tokenize.PunktSentenceTokenizer()
 | |
| 
 | |
|     # get the spans and then return the sentences
 | |
|     # using the start index of each span
 | |
|     # instead of using end, use the start of the next span if available
 | |
|     def split(text: str) -> List[str]:
 | |
|         spans = list(tokenizer.span_tokenize(text))
 | |
|         sentences = []
 | |
|         for i, span in enumerate(spans):
 | |
|             start = span[0]
 | |
|             if i < len(spans) - 1:
 | |
|                 end = spans[i + 1][0]
 | |
|             else:
 | |
|                 end = len(text)
 | |
|             sentences.append(text[start:end])
 | |
| 
 | |
|         return sentences
 | |
| 
 | |
|     return split
 | |
| 
 | |
| 
 | |
| def split_by_regex(regex: str) -> Callable[[str], List[str]]:
 | |
|     """Split text by regex."""
 | |
|     import re
 | |
| 
 | |
|     return lambda text: re.findall(regex, text)
 | |
| 
 | |
| 
 | |
| def split_by_phrase_regex() -> Callable[[str], List[str]]:
 | |
|     """Split text by phrase regex.
 | |
| 
 | |
|     This regular expression will split the sentences into phrases,
 | |
|     where each phrase is a sequence of one or more non-comma,
 | |
|     non-period, and non-semicolon characters, followed by an optional comma,
 | |
|     period, or semicolon. The regular expression will also capture the
 | |
|     delimiters themselves as separate items in the list of phrases.
 | |
|     """
 | |
|     regex = "[^,.;。]+[,.;。]?"
 | |
|     return split_by_regex(regex)
 |