fix(ChatKnowledge): add aload_document (#1548)

2025-09-12 20:53:48 +00:00 · 2024-05-23 11:59:34 +08:00
parent 7f55aa4b6e
commit 83d7e9d82d
14 changed files with 180 additions and 238 deletions
--- a/dbgpt/rag/assembler/embedding.py
+++ b/dbgpt/rag/assembler/embedding.py
@@ -1,9 +1,11 @@
 """Embedding Assembler."""
+from concurrent.futures import ThreadPoolExecutor
 from typing import Any, List, Optional

 from dbgpt.core import Chunk, Embeddings
 from dbgpt.storage.vector_store.connector import VectorStoreConnector

+from ...util.executor_utils import blocking_func_to_async
 from ..assembler.base import BaseAssembler
 from ..chunk_manager import ChunkParameters
 from ..embedding.embedding_factory import DefaultEmbeddingFactory
@@ -98,6 +100,41 @@ class EmbeddingAssembler(BaseAssembler):
            embeddings=embeddings,
        )

+    @classmethod
+    async def aload_from_knowledge(
+        cls,
+        knowledge: Knowledge,
+        vector_store_connector: VectorStoreConnector,
+        chunk_parameters: Optional[ChunkParameters] = None,
+        embedding_model: Optional[str] = None,
+        embeddings: Optional[Embeddings] = None,
+        executor: Optional[ThreadPoolExecutor] = None,
+    ) -> "EmbeddingAssembler":
+        """Asynchronously build an EmbeddingAssembler from a knowledge datasource.
+
+        Args:
+            knowledge: (Knowledge) Knowledge datasource.
+            vector_store_connector: (VectorStoreConnector) VectorStoreConnector to use.
+            chunk_parameters: (Optional[ChunkParameters]) ChunkManager to use for
+                chunking.
+            embedding_model: (Optional[str]) Embedding model to use.
+            embeddings: (Optional[Embeddings]) Embeddings to use.
+            executor: (Optional[ThreadPoolExecutor]) Executor to use, new if None.
+
+        Returns:
+             EmbeddingAssembler
+        """
+        executor = executor or ThreadPoolExecutor()  # fall back to a fresh pool
+        return await blocking_func_to_async(
+            executor,
+            cls,
+            knowledge,
+            vector_store_connector,
+            chunk_parameters,
+            embedding_model,
+            embeddings,
+        )
+
    def persist(self) -> List[str]:
        """Persist chunks into vector store.

--- a/dbgpt/rag/index/base.py
+++ b/dbgpt/rag/index/base.py
@@ -8,6 +8,7 @@ from typing import Any, Dict, List, Optional
 from dbgpt._private.pydantic import BaseModel, ConfigDict, Field, model_to_dict
 from dbgpt.core import Chunk, Embeddings
 from dbgpt.storage.vector_store.filters import MetadataFilters
+from dbgpt.util.executor_utils import blocking_func_to_async

 logger = logging.getLogger(__name__)

@@ -46,6 +47,10 @@ class IndexStoreConfig(BaseModel):
 class IndexStoreBase(ABC):
    """Index store base class."""

+    def __init__(self, executor: Optional[ThreadPoolExecutor] = None):
+        """Init index store, keeping `executor` or a new ThreadPoolExecutor."""
+        self._executor = executor or ThreadPoolExecutor()
+
    @abstractmethod
    def load_document(self, chunks: List[Chunk]) -> List[str]:
        """Load document in index database.
@@ -143,6 +148,27 @@ class IndexStoreBase(ABC):
        )
        return ids

+    async def aload_document_with_limit(
+        self, chunks: List[Chunk], max_chunks_once_load: int = 10, max_threads: int = 1
+    ) -> List[str]:
+        """Async wrapper: run load_document_with_limit through self._executor.
+
+        Args:
+            chunks(List[Chunk]): Document chunks.
+            max_chunks_once_load(int): Max number of chunks to load at once (default 10).
+            max_threads(int): Max number of threads to use (default 1).
+
+        Return:
+            List[str]: Chunk ids.
+        """
+        return await blocking_func_to_async(
+            self._executor,
+            self.load_document_with_limit,
+            chunks,
+            max_chunks_once_load,
+            max_threads,
+        )
+
    def similar_search(
        self, text: str, topk: int, filters: Optional[MetadataFilters] = None
    ) -> List[Chunk]: