support oceanbase as an optional vector database (#1435)

Signed-off-by: shanhaikang.shk <shanhaikang.shk@oceanbase.com>
2025-09-04 18:40:10 +00:00 · 2024-04-24 16:08:30 +08:00
parent 91c1371234
commit 6520367623
10 changed files with 975 additions and 12 deletions
--- a/dbgpt/rag/text_splitter/tests/test_splitters.py
+++ b/dbgpt/rag/text_splitter/tests/test_splitters.py
@@ -25,11 +25,11 @@ def test_md_header_text_splitter() -> None:
    output = markdown_splitter.split_text(markdown_document)
    expected_output = [
        Chunk(
-            content="{'Header 1': 'dbgpt', 'Header 2': 'description'}, my name is dbgpt",
+            content='"dbgpt-description": my name is dbgpt',
            metadata={"Header 1": "dbgpt", "Header 2": "description"},
        ),
        Chunk(
-            content="{'Header 1': 'dbgpt', 'Header 2': 'content'}, my name is aries",
+            content='"dbgpt-content": my name is aries',
            metadata={"Header 1": "dbgpt", "Header 2": "content"},
        ),
    ]
--- a/dbgpt/rag/text_splitter/text_splitter.py
+++ b/dbgpt/rag/text_splitter/text_splitter.py
@@ -515,7 +515,8 @@ class MarkdownHeaderTextSplitter(TextSplitter):
                aggregated_chunks[-1]["content"] += "  \n" + line["content"]
            else:
                # Otherwise, append the current line to the aggregated list
-                line["content"] = f"{line['metadata']}, " + line["content"]
+                subtitles = "-".join((list(line["metadata"].values())))
+                line["content"] = f'"{subtitles}": ' + line["content"]
                aggregated_chunks.append(line)

        return [
@@ -557,16 +558,28 @@ class MarkdownHeaderTextSplitter(TextSplitter):
        # header_stack: List[Dict[str, Union[int, str]]] = []
        header_stack: List[HeaderType] = []
        initial_metadata: Dict[str, str] = {}
+        # Determine whether a line is within a markdown code block.
+        in_code_block = False
        for line in lines:
            stripped_line = line.strip()
+            # A code frame starts with "```"
+            with_code_frame = stripped_line.startswith("```") and (
+                stripped_line != "```"
+            )
+            if (not in_code_block) and with_code_frame:
+                in_code_block = True
            # Check each line against each of the header types (e.g., #, ##)
            for sep, name in self.headers_to_split_on:
                # Check if line starts with a header that we intend to split on
-                if stripped_line.startswith(sep) and (
-                    # Header with no text OR header is followed by space
-                    # Both are valid conditions that sep is being used a header
-                    len(stripped_line) == len(sep)
-                    or stripped_line[len(sep)] == " "
+                if (
+                    (not in_code_block)
+                    and stripped_line.startswith(sep)
+                    and (
+                        # Header with no text OR header is followed by space
+                        # Both are valid conditions that sep is being used as a header
+                        len(stripped_line) == len(sep)
+                        or stripped_line[len(sep)] == " "
+                    )
                ):
                    # Ensure we are tracking the header as metadata
                    if name is not None:
@@ -620,6 +633,10 @@ class MarkdownHeaderTextSplitter(TextSplitter):
                    )
                    current_content.clear()

+            # Code block ends
+            if in_code_block and stripped_line == "```":
+                in_code_block = False
+
            current_metadata = initial_metadata.copy()
        if current_content:
            lines_with_metadata.append(