feat: add GraphRAG framework and integrate TuGraph (#1506)

Co-authored-by: KingSkyLi <15566300566@163.com>
Co-authored-by: aries_ckt <916701291@qq.com>
Co-authored-by: Fangyin Cheng <staneyffer@gmail.com>
Author: Florian
Date: 2024-05-16 15:39:50 +08:00
Committed by: GitHub
Parent: 593e974405
Commit: a9087c3853
133 changed files with 10139 additions and 6631 deletions


@@ -1,27 +1,25 @@
"""Vector store base class."""
import logging
import math
import time
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Dict, List, Optional
from typing import Any, List, Optional
from dbgpt._private.pydantic import BaseModel, ConfigDict, Field, model_to_dict
from dbgpt._private.pydantic import ConfigDict, Field
from dbgpt.core import Chunk, Embeddings
from dbgpt.core.awel.flow import Parameter
from dbgpt.rag.index.base import IndexStoreBase, IndexStoreConfig
from dbgpt.storage.vector_store.filters import MetadataFilters
from dbgpt.util.i18n_utils import _
logger = logging.getLogger(__name__)
_COMMON_PARAMETERS = [
Parameter.build_from(
_("Collection Name"),
"name",
str,
description=_(
"The name of vector store, if not set, will use the default " "name."
"The name of vector store, if not set, will use the default name."
),
optional=True,
default="dbgpt_collection",
@@ -31,7 +29,7 @@ _COMMON_PARAMETERS = [
"user",
str,
description=_(
"The user of vector store, if not set, will use the default " "user."
"The user of vector store, if not set, will use the default user."
),
optional=True,
default=None,
@@ -84,99 +82,26 @@ _COMMON_PARAMETERS = [
]
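For reference, the entries elided by the hunk markers in _COMMON_PARAMETERS all follow the same pattern as the two shown above. A minimal, hypothetical sketch of one such entry (the "Password" parameter below is illustrative, not copied from this commit):

from dbgpt.core.awel.flow import Parameter
from dbgpt.util.i18n_utils import _

# Hypothetical extra entry mirroring the pattern of the parameters above.
_PASSWORD_PARAMETER = Parameter.build_from(
    _("Password"),
    "password",
    str,
    description=_(
        "The password of vector store, if not set, will use the default password."
    ),
    optional=True,
    default=None,
)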
class VectorStoreConfig(BaseModel):
class VectorStoreConfig(IndexStoreConfig):
"""Vector store config."""
model_config = ConfigDict(arbitrary_types_allowed=True)
model_config = ConfigDict(arbitrary_types_allowed=True, extra="allow")
name: str = Field(
default="dbgpt_collection",
description="The name of vector store, if not set, will use the default name.",
)
user: Optional[str] = Field(
default=None,
description="The user of vector store, if not set, will use the default user.",
)
password: Optional[str] = Field(
default=None,
description="The password of vector store, if not set, will use the default "
"password.",
)
embedding_fn: Optional[Embeddings] = Field(
default=None,
description="The embedding function of vector store, if not set, will use the "
"default embedding function.",
)
max_chunks_once_load: int = Field(
default=10,
description="The max number of chunks to load at once. If your document is "
"large, you can set this value to a larger number to speed up the loading "
"process. Default is 10.",
)
max_threads: int = Field(
default=1,
description="The max number of threads to use. Default is 1. If you set this "
"bigger than 1, please make sure your vector store is thread-safe.",
description=(
"The password of vector store, if not set, will use the default password."
),
)
def to_dict(self, **kwargs) -> Dict[str, Any]:
"""Convert to dict."""
return model_to_dict(self, **kwargs)
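A standalone sketch of what the new extra="allow" setting buys: store-specific keys can ride along on the config without being declared up front. Plain pydantic v2 is used here and DemoVectorStoreConfig is a stand-in; the real class inherits name, embedding_fn and the loading limits from IndexStoreConfig.

from typing import Optional
from pydantic import BaseModel, ConfigDict, Field

class DemoVectorStoreConfig(BaseModel):
    # Mirrors the new model_config: unknown keys are kept instead of rejected.
    model_config = ConfigDict(arbitrary_types_allowed=True, extra="allow")

    user: Optional[str] = Field(default=None, description="Vector store user.")
    password: Optional[str] = Field(default=None, description="Vector store password.")

cfg = DemoVectorStoreConfig(user="dbgpt", host="127.0.0.1", port=7687)
print(cfg.user)          # "dbgpt"
print(cfg.model_dump())  # extra keys "host" and "port" are preserved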
class VectorStoreBase(ABC):
class VectorStoreBase(IndexStoreBase, ABC):
"""Vector store base class."""
@abstractmethod
def load_document(self, chunks: List[Chunk]) -> List[str]:
"""Load document in vector database.
Args:
chunks(List[Chunk]): document chunks.
Return:
List[str]: chunk ids.
"""
def load_document_with_limit(
self, chunks: List[Chunk], max_chunks_once_load: int = 10, max_threads: int = 1
) -> List[str]:
"""Load document in vector database with specified limit.
Args:
chunks(List[Chunk]): Document chunks.
max_chunks_once_load(int): Max number of chunks to load at once.
max_threads(int): Max number of threads to use.
Return:
List[str]: Chunk ids.
"""
# Group the chunks into chunks of size max_chunks
chunk_groups = [
chunks[i : i + max_chunks_once_load]
for i in range(0, len(chunks), max_chunks_once_load)
]
logger.info(
f"Loading {len(chunks)} chunks in {len(chunk_groups)} groups with "
f"{max_threads} threads."
)
ids = []
loaded_cnt = 0
start_time = time.time()
with ThreadPoolExecutor(max_workers=max_threads) as executor:
tasks = []
for chunk_group in chunk_groups:
tasks.append(executor.submit(self.load_document, chunk_group))
for future in tasks:
success_ids = future.result()
ids.extend(success_ids)
loaded_cnt += len(success_ids)
logger.info(f"Loaded {loaded_cnt} chunks, total {len(chunks)} chunks.")
logger.info(
f"Loaded {len(chunks)} chunks in {time.time() - start_time} seconds"
)
return ids
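A dependency-free sketch of the batching behaviour above (which this commit moves to IndexStoreBase): chunks are split into groups of max_chunks_once_load and each group is loaded in a worker thread. The function and the fake loader below are illustrative, not part of the commit.

from concurrent.futures import ThreadPoolExecutor
from typing import Callable, List

def load_with_limit(
    chunks: List[str],
    load_group: Callable[[List[str]], List[str]],
    max_chunks_once_load: int = 10,
    max_threads: int = 1,
) -> List[str]:
    # Same grouping as above: ceil(len(chunks) / max_chunks_once_load) groups.
    groups = [
        chunks[i : i + max_chunks_once_load]
        for i in range(0, len(chunks), max_chunks_once_load)
    ]
    ids: List[str] = []
    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        for future in [executor.submit(load_group, g) for g in groups]:
            ids.extend(future.result())
    return ids

# 25 chunks with the default limit of 10 -> 3 groups (10, 10, 5), 25 ids back.
print(len(load_with_limit([f"chunk-{i}" for i in range(25)], lambda g: list(g))))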
def filter_by_score_threshold(
self, chunks: List[Chunk], score_threshold: float
) -> List[Chunk]:
@@ -207,63 +132,11 @@ class VectorStoreBase(ABC):
)
return candidates_chunks
@abstractmethod
def similar_search(
self, text: str, topk: int, filters: Optional[MetadataFilters] = None
) -> List[Chunk]:
"""Similar search in vector database.
Args:
text(str): The query text.
topk(int): The number of similar documents to return.
filters(Optional[MetadataFilters]): metadata filters.
Return:
List[Chunk]: The similar documents.
"""
pass
@abstractmethod
def similar_search_with_scores(
self,
text,
topk,
score_threshold: float,
filters: Optional[MetadataFilters] = None,
) -> List[Chunk]:
"""Similar search with scores in vector database.
Args:
text(str): The query text.
topk(int): The number of similar documents to return.
score_threshold(float): Optional, a floating point value between 0 and 1.
filters(Optional[MetadataFilters]): metadata filters.
Return:
List[Chunk]: The similar documents.
"""
@abstractmethod
def vector_name_exists(self) -> bool:
"""Whether vector name exists."""
return False
@abstractmethod
def delete_by_ids(self, ids: str):
"""Delete vectors by ids.
Args:
ids(str): The ids of vectors to delete, separated by comma.
"""
@abstractmethod
def delete_vector_name(self, vector_name: str):
"""Delete vector by name.
Args:
vector_name(str): The name of vector to delete.
"""
pass
def convert_metadata_filters(self, filters: MetadataFilters) -> Any:
"""Convert metadata filters to vector store filters.
@@ -285,3 +158,14 @@ class VectorStoreBase(ABC):
def _default_relevance_score_fn(self, distance: float) -> float:
"""Return a similarity score on a scale [0, 1]."""
return 1.0 - distance / math.sqrt(2)
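A worked example of the default relevance mapping above (the formula itself is unchanged by this commit): a distance of 0 maps to a score of 1.0 and a distance of sqrt(2) maps to 0.0.

import math

def default_relevance_score_fn(distance: float) -> float:
    # Same formula as _default_relevance_score_fn above.
    return 1.0 - distance / math.sqrt(2)

print(default_relevance_score_fn(0.0))                      # 1.0
print(round(default_relevance_score_fn(1.0), 4))            # 0.2929
print(round(default_relevance_score_fn(math.sqrt(2)), 4))   # 0.0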
async def aload_document(self, chunks: List[Chunk]) -> List[str]: # type: ignore
"""Load document in index database.
Args:
chunks(List[Chunk]): document chunks.
Return:
List[str]: chunk ids.
"""
raise NotImplementedError
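Since aload_document raises NotImplementedError by default, a concrete store has to override it. A hypothetical sketch of one way to do that, delegating to a synchronous load_document in a worker thread; MyVectorStore and its ids are placeholders, not part of this commit.

import asyncio
from typing import List

class MyVectorStore:  # stand-in for a concrete VectorStoreBase subclass
    def load_document(self, chunks: List[str]) -> List[str]:
        # Pretend each chunk gets an id derived from its position.
        return [f"id-{i}" for i, _ in enumerate(chunks)]

    async def aload_document(self, chunks: List[str]) -> List[str]:
        # Run the blocking load in a worker thread so the event loop stays free.
        return await asyncio.get_running_loop().run_in_executor(
            None, self.load_document, chunks
        )

print(asyncio.run(MyVectorStore().aload_document(["chunk a", "chunk b"])))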