feat: add GraphRAG framework and integrate TuGraph (#1506)

Co-authored-by: KingSkyLi <15566300566@163.com>
Co-authored-by: aries_ckt <916701291@qq.com>
Co-authored-by: Fangyin Cheng <staneyffer@gmail.com>
Author: Florian
Date: 2024-05-16 15:39:50 +08:00
Committed by: GitHub
Parent: 593e974405
Commit: a9087c3853
133 changed files with 10139 additions and 6631 deletions


@@ -1,27 +1,25 @@
"""Vector store base class."""
import logging
import math
import time
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Dict, List, Optional
from typing import Any, List, Optional
from dbgpt._private.pydantic import BaseModel, ConfigDict, Field, model_to_dict
from dbgpt._private.pydantic import ConfigDict, Field
from dbgpt.core import Chunk, Embeddings
from dbgpt.core.awel.flow import Parameter
from dbgpt.rag.index.base import IndexStoreBase, IndexStoreConfig
from dbgpt.storage.vector_store.filters import MetadataFilters
from dbgpt.util.i18n_utils import _
logger = logging.getLogger(__name__)
_COMMON_PARAMETERS = [
Parameter.build_from(
_("Collection Name"),
"name",
str,
description=_(
"The name of vector store, if not set, will use the default " "name."
"The name of vector store, if not set, will use the default name."
),
optional=True,
default="dbgpt_collection",
@@ -31,7 +29,7 @@ _COMMON_PARAMETERS = [
"user",
str,
description=_(
"The user of vector store, if not set, will use the default " "user."
"The user of vector store, if not set, will use the default user."
),
optional=True,
default=None,
@@ -84,99 +82,26 @@ _COMMON_PARAMETERS = [
]
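For reference, the entries elided by the hunk markers in _COMMON_PARAMETERS all follow the same pattern as the two shown above. A minimal, hypothetical sketch of one such entry (the "Password" parameter below is illustrative, not copied from this commit):

from dbgpt.core.awel.flow import Parameter
from dbgpt.util.i18n_utils import _

# Hypothetical extra entry mirroring the pattern of the parameters above.
_PASSWORD_PARAMETER = Parameter.build_from(
    _("Password"),
    "password",
    str,
    description=_(
        "The password of vector store, if not set, will use the default password."
    ),
    optional=True,
    default=None,
)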
class VectorStoreConfig(BaseModel):
class VectorStoreConfig(IndexStoreConfig):
"""Vector store config."""
model_config = ConfigDict(arbitrary_types_allowed=True)
model_config = ConfigDict(arbitrary_types_allowed=True, extra="allow")
name: str = Field(
default="dbgpt_collection",
description="The name of vector store, if not set, will use the default name.",
)
user: Optional[str] = Field(
default=None,
description="The user of vector store, if not set, will use the default user.",
)
password: Optional[str] = Field(
default=None,
description="The password of vector store, if not set, will use the default "
"password.",
)
embedding_fn: Optional[Embeddings] = Field(
default=None,
description="The embedding function of vector store, if not set, will use the "
"default embedding function.",
)
max_chunks_once_load: int = Field(
default=10,
description="The max number of chunks to load at once. If your document is "
"large, you can set this value to a larger number to speed up the loading "
"process. Default is 10.",
)
max_threads: int = Field(
default=1,
description="The max number of threads to use. Default is 1. If you set this "
"bigger than 1, please make sure your vector store is thread-safe.",
description=(
"The password of vector store, if not set, will use the default password."
),
)
def to_dict(self, **kwargs) -> Dict[str, Any]:
"""Convert to dict."""
return model_to_dict(self, **kwargs)
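A standalone sketch of what the new extra="allow" setting buys: store-specific keys can ride along on the config without being declared up front. Plain pydantic v2 is used here and DemoVectorStoreConfig is a stand-in; the real class inherits name, embedding_fn and the loading limits from IndexStoreConfig.

from typing import Optional
from pydantic import BaseModel, ConfigDict, Field

class DemoVectorStoreConfig(BaseModel):
    # Mirrors the new model_config: unknown keys are kept instead of rejected.
    model_config = ConfigDict(arbitrary_types_allowed=True, extra="allow")

    user: Optional[str] = Field(default=None, description="Vector store user.")
    password: Optional[str] = Field(default=None, description="Vector store password.")

cfg = DemoVectorStoreConfig(user="dbgpt", host="127.0.0.1", port=7687)
print(cfg.user)          # "dbgpt"
print(cfg.model_dump())  # extra keys "host" and "port" are preserved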
class VectorStoreBase(ABC):
class VectorStoreBase(IndexStoreBase, ABC):
"""Vector store base class."""
@abstractmethod
def load_document(self, chunks: List[Chunk]) -> List[str]:
"""Load document in vector database.
Args:
chunks(List[Chunk]): document chunks.
Return:
List[str]: chunk ids.
"""
def load_document_with_limit(
self, chunks: List[Chunk], max_chunks_once_load: int = 10, max_threads: int = 1
) -> List[str]:
"""Load document in vector database with specified limit.
Args:
chunks(List[Chunk]): Document chunks.
max_chunks_once_load(int): Max number of chunks to load at once.
max_threads(int): Max number of threads to use.
Return:
List[str]: Chunk ids.
"""
# Group the chunks into chunks of size max_chunks
chunk_groups = [
chunks[i : i + max_chunks_once_load]
for i in range(0, len(chunks), max_chunks_once_load)
]
logger.info(
f"Loading {len(chunks)} chunks in {len(chunk_groups)} groups with "
f"{max_threads} threads."
)
ids = []
loaded_cnt = 0
start_time = time.time()
with ThreadPoolExecutor(max_workers=max_threads) as executor:
tasks = []
for chunk_group in chunk_groups:
tasks.append(executor.submit(self.load_document, chunk_group))
for future in tasks:
success_ids = future.result()
ids.extend(success_ids)
loaded_cnt += len(success_ids)
logger.info(f"Loaded {loaded_cnt} chunks, total {len(chunks)} chunks.")
logger.info(
f"Loaded {len(chunks)} chunks in {time.time() - start_time} seconds"
)
return ids
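A dependency-free sketch of the batching behaviour above (which this commit moves to IndexStoreBase): chunks are split into groups of max_chunks_once_load and each group is loaded in a worker thread. The function and the fake loader below are illustrative, not part of the commit.

from concurrent.futures import ThreadPoolExecutor
from typing import Callable, List

def load_with_limit(
    chunks: List[str],
    load_group: Callable[[List[str]], List[str]],
    max_chunks_once_load: int = 10,
    max_threads: int = 1,
) -> List[str]:
    # Same grouping as above: ceil(len(chunks) / max_chunks_once_load) groups.
    groups = [
        chunks[i : i + max_chunks_once_load]
        for i in range(0, len(chunks), max_chunks_once_load)
    ]
    ids: List[str] = []
    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        for future in [executor.submit(load_group, g) for g in groups]:
            ids.extend(future.result())
    return ids

# 25 chunks with the default limit of 10 -> 3 groups (10, 10, 5), 25 ids back.
print(len(load_with_limit([f"chunk-{i}" for i in range(25)], lambda g: list(g))))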
def filter_by_score_threshold(
self, chunks: List[Chunk], score_threshold: float
) -> List[Chunk]:
@@ -207,63 +132,11 @@ class VectorStoreBase(ABC):
)
return candidates_chunks
@abstractmethod
def similar_search(
self, text: str, topk: int, filters: Optional[MetadataFilters] = None
) -> List[Chunk]:
"""Similar search in vector database.
Args:
text(str): The query text.
topk(int): The number of similar documents to return.
filters(Optional[MetadataFilters]): metadata filters.
Return:
List[Chunk]: The similar documents.
"""
pass
@abstractmethod
def similar_search_with_scores(
self,
text,
topk,
score_threshold: float,
filters: Optional[MetadataFilters] = None,
) -> List[Chunk]:
"""Similar search with scores in vector database.
Args:
text(str): The query text.
topk(int): The number of similar documents to return.
score_threshold(float): Optional, a floating point value between 0 and 1.
filters(Optional[MetadataFilters]): metadata filters.
Return:
List[Chunk]: The similar documents.
"""
@abstractmethod
def vector_name_exists(self) -> bool:
"""Whether vector name exists."""
return False
@abstractmethod
def delete_by_ids(self, ids: str):
"""Delete vectors by ids.
Args:
ids(str): The ids of vectors to delete, separated by comma.
"""
@abstractmethod
def delete_vector_name(self, vector_name: str):
"""Delete vector by name.
Args:
vector_name(str): The name of vector to delete.
"""
pass
def convert_metadata_filters(self, filters: MetadataFilters) -> Any:
"""Convert metadata filters to vector store filters.
@@ -285,3 +158,14 @@ class VectorStoreBase(ABC):
def _default_relevance_score_fn(self, distance: float) -> float:
"""Return a similarity score on a scale [0, 1]."""
return 1.0 - distance / math.sqrt(2)
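A worked example of the default relevance mapping above (the formula itself is unchanged by this commit): a distance of 0 maps to a score of 1.0 and a distance of sqrt(2) maps to 0.0.

import math

def default_relevance_score_fn(distance: float) -> float:
    # Same formula as _default_relevance_score_fn above.
    return 1.0 - distance / math.sqrt(2)

print(default_relevance_score_fn(0.0))                      # 1.0
print(round(default_relevance_score_fn(1.0), 4))            # 0.2929
print(round(default_relevance_score_fn(math.sqrt(2)), 4))   # 0.0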
async def aload_document(self, chunks: List[Chunk]) -> List[str]: # type: ignore
"""Load document in index database.
Args:
chunks(List[Chunk]): document chunks.
Return:
List[str]: chunk ids.
"""
raise NotImplementedError
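Since aload_document raises NotImplementedError by default, a concrete store has to override it. A hypothetical sketch of one way to do that, delegating to a synchronous load_document in a worker thread; MyVectorStore and its ids are placeholders, not part of this commit.

import asyncio
from typing import List

class MyVectorStore:  # stand-in for a concrete VectorStoreBase subclass
    def load_document(self, chunks: List[str]) -> List[str]:
        # Pretend each chunk gets an id derived from its position.
        return [f"id-{i}" for i, _ in enumerate(chunks)]

    async def aload_document(self, chunks: List[str]) -> List[str]:
        # Run the blocking load in a worker thread so the event loop stays free.
        return await asyncio.get_running_loop().run_in_executor(
            None, self.load_document, chunks
        )

print(asyncio.run(MyVectorStore().aload_document(["chunk a", "chunk b"])))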