from typing import Callable, List def split_text_keep_separator(text: str, separator: str) -> List[str]: """Split text with separator and keep the separator at the end of each split.""" parts = text.split(separator) result = [separator + s if i > 0 else s for i, s in enumerate(parts)] return [s for s in result if s] def split_by_sep(sep: str, keep_sep: bool = True) -> Callable[[str], List[str]]: """Split text by separator.""" if keep_sep: return lambda text: split_text_keep_separator(text, sep) else: return lambda text: text.split(sep) def split_by_char() -> Callable[[str], List[str]]: """Split text by character.""" return lambda text: list(text) def split_by_sentence_tokenizer() -> Callable[[str], List[str]]: import os import nltk from llama_index.utils import get_cache_dir cache_dir = get_cache_dir() nltk_data_dir = os.environ.get("NLTK_DATA", cache_dir) # update nltk path for nltk so that it finds the data if nltk_data_dir not in nltk.data.path: nltk.data.path.append(nltk_data_dir) try: nltk.data.find("tokenizers/punkt") except LookupError: nltk.download("punkt", download_dir=nltk_data_dir) tokenizer = nltk.tokenize.PunktSentenceTokenizer() # get the spans and then return the sentences # using the start index of each span # instead of using end, use the start of the next span if available def split(text: str) -> List[str]: spans = list(tokenizer.span_tokenize(text)) sentences = [] for i, span in enumerate(spans): start = span[0] if i < len(spans) - 1: end = spans[i + 1][0] else: end = len(text) sentences.append(text[start:end]) return sentences return split def split_by_regex(regex: str) -> Callable[[str], List[str]]: """Split text by regex.""" import re return lambda text: re.findall(regex, text) def split_by_phrase_regex() -> Callable[[str], List[str]]: """Split text by phrase regex. This regular expression will split the sentences into phrases, where each phrase is a sequence of one or more non-comma, non-period, and non-semicolon characters, followed by an optional comma, period, or semicolon. The regular expression will also capture the delimiters themselves as separate items in the list of phrases. """ regex = "[^,.;。]+[,.;。]?" return split_by_regex(regex)