refactor: RAG Refactor (#985)
Co-authored-by: Aralhi <xiaoping0501@gmail.com>
Co-authored-by: csunny <cfqsunny@163.com>
dbgpt/util/splitter_utils.py (new file, 81 lines)
@@ -0,0 +1,81 @@
from typing import Callable, List


def split_text_keep_separator(text: str, separator: str) -> List[str]:
    """Split text with separator and keep the separator at the end of each split."""
    parts = text.split(separator)
    result = [separator + s if i > 0 else s for i, s in enumerate(parts)]
    return [s for s in result if s]


def split_by_sep(sep: str, keep_sep: bool = True) -> Callable[[str], List[str]]:
    """Split text by separator."""
    if keep_sep:
        return lambda text: split_text_keep_separator(text, sep)
    else:
        return lambda text: text.split(sep)


def split_by_char() -> Callable[[str], List[str]]:
    """Split text by character."""
    return lambda text: list(text)


def split_by_sentence_tokenizer() -> Callable[[str], List[str]]:
    import os

    import nltk

    from llama_index.utils import get_cache_dir

    cache_dir = get_cache_dir()
    nltk_data_dir = os.environ.get("NLTK_DATA", cache_dir)

    # update nltk path for nltk so that it finds the data
    if nltk_data_dir not in nltk.data.path:
        nltk.data.path.append(nltk_data_dir)

    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt", download_dir=nltk_data_dir)

    tokenizer = nltk.tokenize.PunktSentenceTokenizer()

    # get the spans and then return the sentences
    # using the start index of each span
    # instead of using end, use the start of the next span if available
    def split(text: str) -> List[str]:
        spans = list(tokenizer.span_tokenize(text))
        sentences = []
        for i, span in enumerate(spans):
            start = span[0]
            if i < len(spans) - 1:
                end = spans[i + 1][0]
            else:
                end = len(text)
            sentences.append(text[start:end])

        return sentences

    return split


def split_by_regex(regex: str) -> Callable[[str], List[str]]:
    """Split text by regex."""
    import re

    return lambda text: re.findall(regex, text)


def split_by_phrase_regex() -> Callable[[str], List[str]]:
    """Split text by phrase regex.

    This regular expression will split the sentences into phrases,
    where each phrase is a sequence of one or more non-comma,
    non-period, and non-semicolon characters, followed by an optional comma,
    period, or semicolon. The regular expression will also capture the
    delimiters themselves as separate items in the list of phrases.
    """
    regex = "[^,.;。]+[,.;。]?"
    return split_by_regex(regex)
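
A minimal usage sketch (not part of the commit) for the separator-, character-, and regex-based factories above. Each factory returns a plain callable from str to List[str]; the sample text and the printed results in the comments are illustrative assumptions.

# Hypothetical example; assumes DB-GPT is installed so that
# dbgpt.util.splitter_utils (the file added in this commit) is importable.
from dbgpt.util.splitter_utils import (
    split_by_char,
    split_by_phrase_regex,
    split_by_sep,
)

text = "Hello world. How are you? I am fine; thanks."

# Split on ". " and keep the separator attached to the start of each later chunk.
split_fn = split_by_sep(". ", keep_sep=True)
print(split_fn(text))
# ['Hello world', '. How are you? I am fine; thanks.']

# Split into individual characters.
print(split_by_char()("abc"))
# ['a', 'b', 'c']

# Split into phrases at commas, periods, and semicolons (delimiters kept attached).
print(split_by_phrase_regex()(text))
# ['Hello world.', ' How are you? I am fine;', ' thanks.']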
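
A similarly hedged sketch for split_by_sentence_tokenizer(), the only factory with external dependencies: it imports nltk and llama_index.utils.get_cache_dir, and downloads the NLTK "punkt" model on first use if it is not found under NLTK_DATA or the llama_index cache directory. The sample text is an illustrative assumption.

# Hypothetical example; requires `nltk` and `llama_index` to be installed,
# and network access the first time if the "punkt" tokenizer data is missing.
from dbgpt.util.splitter_utils import split_by_sentence_tokenizer

sentence_split = split_by_sentence_tokenizer()
chunks = sentence_split(
    "DB-GPT refactors its RAG pipeline. Splitters are plain callables."
)
for chunk in chunks:
    print(repr(chunk))
# Typically prints two chunks; trailing whitespace stays with the earlier
# sentence because each chunk runs to the start of the next detected span:
# 'DB-GPT refactors its RAG pipeline. '
# 'Splitters are plain callables.'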