feat(rag): Support RAG SDK (#1322)

This commit is contained in:
Fangyin Cheng
2024-03-22 15:36:57 +08:00
committed by GitHub
parent e65732d6e4
commit 8a17099dd2
69 changed files with 1332 additions and 558 deletions

View File

@@ -0,0 +1,16 @@
"""Assembler Module For RAG.
The Assembler is a module that is responsible for assembling the knowledge.
"""
from .base import BaseAssembler # noqa: F401
from .db_schema import DBSchemaAssembler # noqa: F401
from .embedding import EmbeddingAssembler # noqa: F401
from .summary import SummaryAssembler # noqa: F401
__all__ = [
"BaseAssembler",
"DBSchemaAssembler",
"EmbeddingAssembler",
"SummaryAssembler",
]

View File

@@ -0,0 +1,75 @@
"""Base Assembler."""
from abc import ABC, abstractmethod
from typing import Any, List, Optional
from dbgpt.core import Chunk
from dbgpt.util.tracer import root_tracer
from ..chunk_manager import ChunkManager, ChunkParameters
from ..extractor.base import Extractor
from ..knowledge.base import Knowledge
from ..retriever.base import BaseRetriever
class BaseAssembler(ABC):
    """Common base for RAG assemblers.

    Loads documents from a ``Knowledge`` datasource and splits them into
    chunks at construction time; subclasses decide how chunks are persisted
    and retrieved.
    """

    def __init__(
        self,
        knowledge: Knowledge,
        chunk_parameters: Optional[ChunkParameters] = None,
        extractor: Optional[Extractor] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize with Assembler arguments.

        Args:
            knowledge(Knowledge): Knowledge datasource.
            chunk_parameters: (Optional[ChunkParameters]) parameters used by the
                ChunkManager for chunking; a default instance is created when
                omitted.
            extractor(Optional[Extractor]): Extractor to use for summarization.
        """
        self._knowledge = knowledge
        self._chunk_parameters = chunk_parameters or ChunkParameters()
        self._extractor = extractor
        self._chunk_manager = ChunkManager(
            knowledge=self._knowledge, chunk_parameter=self._chunk_parameters
        )
        self._chunks: List[Chunk] = []

        # Trace metadata describing what is about to be loaded.
        has_knowledge = bool(self._knowledge)
        metadata = {
            "knowledge_cls": (
                self._knowledge.__class__.__name__ if has_knowledge else None
            ),
            "knowledge_type": (
                self._knowledge.type().value if has_knowledge else None
            ),
            # Not every Knowledge implementation carries a `_path` attribute.
            "path": getattr(self._knowledge, "_path", None) if has_knowledge else None,
            "chunk_parameters": self._chunk_parameters.dict(),
        }
        with root_tracer.start_span("BaseAssembler.load_knowledge", metadata=metadata):
            self.load_knowledge(self._knowledge)

    def load_knowledge(self, knowledge: Optional[Knowledge] = None) -> None:
        """Run the load pipeline: fetch documents, then split them into chunks."""
        if not knowledge:
            raise ValueError("knowledge must be provided.")
        with root_tracer.start_span("BaseAssembler.knowledge.load"):
            documents = knowledge.load()
        with root_tracer.start_span("BaseAssembler.chunk_manager.split"):
            self._chunks = self._chunk_manager.split(documents)

    @abstractmethod
    def as_retriever(self, **kwargs: Any) -> BaseRetriever:
        """Return a retriever."""

    @abstractmethod
    def persist(self) -> List[str]:
        """Persist chunks.

        Returns:
            List[str]: List of persisted chunk ids.
        """

    def get_chunks(self) -> List[Chunk]:
        """Return the chunks produced by the load pipeline."""
        return self._chunks

View File

@@ -0,0 +1,135 @@
"""DBSchemaAssembler."""
from typing import Any, List, Optional
from dbgpt.core import Chunk, Embeddings
from dbgpt.datasource.base import BaseConnector
from dbgpt.storage.vector_store.connector import VectorStoreConnector
from ..assembler.base import BaseAssembler
from ..chunk_manager import ChunkParameters
from ..embedding.embedding_factory import DefaultEmbeddingFactory
from ..knowledge.datasource import DatasourceKnowledge
from ..retriever.db_schema import DBSchemaRetriever
class DBSchemaAssembler(BaseAssembler):
    """DBSchemaAssembler.

    Load database schema knowledge from a connector and persist it into a
    vector store for schema retrieval.

    Example:
        .. code-block:: python

            from dbgpt.datasource.rdbms.conn_sqlite import SQLiteTempConnector
            from dbgpt.serve.rag.assembler.db_struct import DBSchemaAssembler
            from dbgpt.storage.vector_store.connector import VectorStoreConnector
            from dbgpt.storage.vector_store.chroma_store import ChromaVectorConfig

            connection = SQLiteTempConnector.create_temporary_db()
            assembler = DBSchemaAssembler.load_from_connection(
                connector=connection,
                embedding_model=embedding_model_path,
            )
            assembler.persist()
            # get db struct retriever
            retriever = assembler.as_retriever(top_k=3)
    """

    def __init__(
        self,
        connector: BaseConnector,
        vector_store_connector: VectorStoreConnector,
        chunk_parameters: Optional[ChunkParameters] = None,
        embedding_model: Optional[str] = None,
        embeddings: Optional[Embeddings] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize with DBSchema Assembler arguments.

        Args:
            connector: (BaseConnector) BaseConnector connection.
            vector_store_connector: (VectorStoreConnector) VectorStoreConnector to use.
            chunk_parameters: (Optional[ChunkParameters]) ChunkParameters to use for
                chunking.
            embedding_model: (Optional[str]) Embedding model to use; used to build
                default embeddings when ``embeddings`` is not supplied.
            embeddings: (Optional[Embeddings]) Embeddings to use.
        """
        knowledge = DatasourceKnowledge(connector)
        self._connector = connector
        self._vector_store_connector = vector_store_connector
        self._embedding_model = embedding_model
        if self._embedding_model and not embeddings:
            embeddings = DefaultEmbeddingFactory(
                default_model_name=self._embedding_model
            ).create(self._embedding_model)

        # Only inject the embedding function if the vector store does not
        # already have one configured.
        if (
            embeddings
            and self._vector_store_connector.vector_store_config.embedding_fn is None
        ):
            self._vector_store_connector.vector_store_config.embedding_fn = embeddings

        super().__init__(
            knowledge=knowledge,
            chunk_parameters=chunk_parameters,
            **kwargs,
        )

    @classmethod
    def load_from_connection(
        cls,
        connector: BaseConnector,
        vector_store_connector: VectorStoreConnector,
        chunk_parameters: Optional[ChunkParameters] = None,
        embedding_model: Optional[str] = None,
        embeddings: Optional[Embeddings] = None,
        **kwargs: Any,
    ) -> "DBSchemaAssembler":
        """Load database schema embedding into a vector store from a connection.

        Args:
            connector: (BaseConnector) BaseConnector connection.
            vector_store_connector: (VectorStoreConnector) VectorStoreConnector to use.
            chunk_parameters: (Optional[ChunkParameters]) ChunkParameters to use for
                chunking.
            embedding_model: (Optional[str]) Embedding model to use.
            embeddings: (Optional[Embeddings]) Embeddings to use.

        Returns:
            DBSchemaAssembler
        """
        # Forward extra keyword arguments so constructor options (e.g. an
        # extractor) remain reachable through this factory.
        return cls(
            connector=connector,
            vector_store_connector=vector_store_connector,
            embedding_model=embedding_model,
            chunk_parameters=chunk_parameters,
            embeddings=embeddings,
            **kwargs,
        )

    # NOTE: ``get_chunks`` is inherited from BaseAssembler; the previous
    # override duplicated it verbatim (and mis-documented it as chunk ids).

    def persist(self) -> List[str]:
        """Persist chunks into vector store.

        Returns:
            List[str]: List of persisted chunk ids.
        """
        return self._vector_store_connector.load_document(self._chunks)

    def _extract_info(self, chunks) -> List[Chunk]:
        """Extract info from chunks (no extraction needed for schema chunks)."""
        return []

    def as_retriever(self, top_k: int = 4, **kwargs) -> DBSchemaRetriever:
        """Create DBSchemaRetriever.

        Args:
            top_k(int): Number of schema chunks to retrieve; default 4.

        Returns:
            DBSchemaRetriever
        """
        return DBSchemaRetriever(
            top_k=top_k,
            connector=self._connector,
            is_embeddings=True,
            vector_store_connector=self._vector_store_connector,
        )

View File

@@ -0,0 +1,124 @@
"""Embedding Assembler."""
from typing import Any, List, Optional
from dbgpt.core import Chunk, Embeddings
from dbgpt.storage.vector_store.connector import VectorStoreConnector
from ..assembler.base import BaseAssembler
from ..chunk_manager import ChunkParameters
from ..embedding.embedding_factory import DefaultEmbeddingFactory
from ..knowledge.base import Knowledge
from ..retriever.embedding import EmbeddingRetriever
class EmbeddingAssembler(BaseAssembler):
    """Embedding Assembler.

    Load a knowledge datasource, chunk it, and persist the chunks into a
    vector store for embedding-based retrieval.

    Example:
        .. code-block:: python

            from dbgpt.rag.assembler import EmbeddingAssembler

            pdf_path = "path/to/document.pdf"
            knowledge = KnowledgeFactory.from_file_path(pdf_path)
            assembler = EmbeddingAssembler.load_from_knowledge(
                knowledge=knowledge,
                embedding_model="text2vec",
            )
    """

    def __init__(
        self,
        knowledge: Knowledge,
        vector_store_connector: VectorStoreConnector,
        chunk_parameters: Optional[ChunkParameters] = None,
        embedding_model: Optional[str] = None,
        embeddings: Optional[Embeddings] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize with Embedding Assembler arguments.

        Args:
            knowledge: (Knowledge) Knowledge datasource.
            vector_store_connector: (VectorStoreConnector) VectorStoreConnector to use.
            chunk_parameters: (Optional[ChunkParameters]) ChunkParameters to use for
                chunking.
            embedding_model: (Optional[str]) Embedding model to use; used to build
                default embeddings when ``embeddings`` is not supplied.
            embeddings: (Optional[Embeddings]) Embeddings to use.

        Raises:
            ValueError: If ``knowledge`` is None.
        """
        if knowledge is None:
            raise ValueError("knowledge datasource must be provided.")
        self._vector_store_connector = vector_store_connector
        self._embedding_model = embedding_model
        if self._embedding_model and not embeddings:
            embeddings = DefaultEmbeddingFactory(
                default_model_name=self._embedding_model
            ).create(self._embedding_model)

        # Only inject the embedding function if the vector store does not
        # already have one configured.
        if (
            embeddings
            and self._vector_store_connector.vector_store_config.embedding_fn is None
        ):
            self._vector_store_connector.vector_store_config.embedding_fn = embeddings

        super().__init__(
            knowledge=knowledge,
            chunk_parameters=chunk_parameters,
            **kwargs,
        )

    @classmethod
    def load_from_knowledge(
        cls,
        knowledge: Knowledge,
        vector_store_connector: VectorStoreConnector,
        chunk_parameters: Optional[ChunkParameters] = None,
        embedding_model: Optional[str] = None,
        embeddings: Optional[Embeddings] = None,
        **kwargs: Any,
    ) -> "EmbeddingAssembler":
        """Load document embedding into a vector store from a knowledge datasource.

        Args:
            knowledge: (Knowledge) Knowledge datasource.
            vector_store_connector: (VectorStoreConnector) VectorStoreConnector to use.
            chunk_parameters: (Optional[ChunkParameters]) ChunkParameters to use for
                chunking.
            embedding_model: (Optional[str]) Embedding model to use.
            embeddings: (Optional[Embeddings]) Embeddings to use.

        Returns:
            EmbeddingAssembler
        """
        # Forward extra keyword arguments so constructor options (e.g. an
        # extractor) remain reachable through this factory.
        return cls(
            knowledge=knowledge,
            vector_store_connector=vector_store_connector,
            chunk_parameters=chunk_parameters,
            embedding_model=embedding_model,
            embeddings=embeddings,
            **kwargs,
        )

    def persist(self) -> List[str]:
        """Persist chunks into vector store.

        Returns:
            List[str]: List of persisted chunk ids.
        """
        return self._vector_store_connector.load_document(self._chunks)

    def _extract_info(self, chunks) -> List[Chunk]:
        """Extract info from chunks (no extraction needed for embedding flow)."""
        return []

    def as_retriever(self, top_k: int = 4, **kwargs) -> EmbeddingRetriever:
        """Create a retriever.

        Args:
            top_k(int): Number of chunks to retrieve; default 4.

        Returns:
            EmbeddingRetriever
        """
        return EmbeddingRetriever(
            top_k=top_k, vector_store_connector=self._vector_store_connector
        )

View File

@@ -0,0 +1,131 @@
"""Summary Assembler."""
import os
from typing import Any, List, Optional
from dbgpt.core import Chunk, LLMClient
from ..assembler.base import BaseAssembler
from ..chunk_manager import ChunkParameters
from ..extractor.base import Extractor
from ..knowledge.base import Knowledge
from ..retriever.base import BaseRetriever
class SummaryAssembler(BaseAssembler):
    """Summary Assembler.

    Load a knowledge datasource and summarize its chunks with an LLM-backed
    extractor.

    Example:
        .. code-block:: python

            pdf_path = "../../../DB-GPT/docs/docs/awel.md"
            OPEN_AI_KEY = "{your_api_key}"
            OPEN_AI_BASE = "{your_api_base}"
            llm_client = OpenAILLMClient(api_key=OPEN_AI_KEY, api_base=OPEN_AI_BASE)
            knowledge = KnowledgeFactory.from_file_path(pdf_path)
            chunk_parameters = ChunkParameters(chunk_strategy="CHUNK_BY_SIZE")
            assembler = SummaryAssembler.load_from_knowledge(
                knowledge=knowledge,
                chunk_parameters=chunk_parameters,
                llm_client=llm_client,
                model_name="gpt-3.5-turbo",
            )
            summary = await assembler.generate_summary()
    """

    def __init__(
        self,
        knowledge: Knowledge,
        chunk_parameters: Optional[ChunkParameters] = None,
        model_name: Optional[str] = None,
        llm_client: Optional[LLMClient] = None,
        extractor: Optional[Extractor] = None,
        language: Optional[str] = "en",
        **kwargs: Any,
    ) -> None:
        """Initialize with Summary Assembler arguments.

        Args:
            knowledge: (Knowledge) Knowledge datasource.
            chunk_parameters: (Optional[ChunkParameters]) ChunkParameters to use
                for chunking.
            model_name: (Optional[str]) llm model to use. Falls back to the
                ``LLM_MODEL`` environment variable when not given.
            llm_client: (Optional[LLMClient]) LLMClient to use.
            extractor: (Optional[Extractor]) Extractor to use for summarization.
                When omitted, a ``SummaryExtractor`` is built from ``llm_client``
                and ``model_name``.
            language: (Optional[str]) The language of the prompt. Defaults to "en".

        Raises:
            ValueError: If ``knowledge`` is missing, or if no ``extractor`` is
                given and ``llm_client`` or ``model_name`` cannot be resolved.
        """
        if knowledge is None:
            raise ValueError("knowledge datasource must be provided.")
        model_name = model_name or os.getenv("LLM_MODEL")
        if not extractor:
            # Deferred import: only needed when the caller did not supply a
            # custom extractor.
            from ..extractor.summary import SummaryExtractor

            if not llm_client:
                raise ValueError("llm_client must be provided.")
            if not model_name:
                raise ValueError("model_name must be provided.")
            extractor = SummaryExtractor(
                llm_client=llm_client,
                model_name=model_name,
                language=language,
            )
        # An extractor is now guaranteed: either supplied by the caller or
        # constructed above (the previous `if not extractor: raise` re-check
        # was unreachable).
        self._extractor: Extractor = extractor
        super().__init__(
            knowledge=knowledge,
            chunk_parameters=chunk_parameters,
            extractor=self._extractor,
            **kwargs,
        )

    @classmethod
    def load_from_knowledge(
        cls,
        knowledge: Knowledge,
        chunk_parameters: Optional[ChunkParameters] = None,
        model_name: Optional[str] = None,
        llm_client: Optional[LLMClient] = None,
        extractor: Optional[Extractor] = None,
        language: Optional[str] = "en",
        **kwargs: Any,
    ) -> "SummaryAssembler":
        """Create a SummaryAssembler from a knowledge datasource.

        Args:
            knowledge: (Knowledge) Knowledge datasource.
            chunk_parameters: (Optional[ChunkParameters]) ChunkParameters to use
                for chunking.
            model_name: (Optional[str]) llm model to use.
            llm_client: (Optional[LLMClient]) LLMClient to use.
            extractor: (Optional[Extractor]) Extractor to use for summarization.
            language: (Optional[str]) The language of the prompt. Defaults to "en".

        Returns:
            SummaryAssembler
        """
        return cls(
            knowledge=knowledge,
            chunk_parameters=chunk_parameters,
            model_name=model_name,
            llm_client=llm_client,
            extractor=extractor,
            language=language,
            **kwargs,
        )

    async def generate_summary(self) -> str:
        """Generate a summary over all loaded chunks."""
        return await self._extractor.aextract(self._chunks)

    def persist(self) -> List[str]:
        """Persist chunks into store.

        Raises:
            NotImplementedError: The summary assembler does not persist chunks.
        """
        raise NotImplementedError

    def _extract_info(self, chunks) -> List[Chunk]:
        """Extract info from chunks (not used by this assembler)."""
        return []

    def as_retriever(self, **kwargs: Any) -> BaseRetriever:
        """Return a retriever.

        Raises:
            NotImplementedError: The summary assembler has no retriever.
        """
        raise NotImplementedError

View File

View File

@@ -0,0 +1,76 @@
from unittest.mock import MagicMock
import pytest
from dbgpt.datasource.rdbms.conn_sqlite import SQLiteTempConnector
from dbgpt.rag.assembler.embedding import EmbeddingAssembler
from dbgpt.rag.chunk_manager import ChunkParameters, SplitterType
from dbgpt.rag.embedding.embedding_factory import EmbeddingFactory
from dbgpt.rag.knowledge.base import Knowledge
from dbgpt.rag.text_splitter.text_splitter import CharacterTextSplitter
from dbgpt.storage.vector_store.connector import VectorStoreConnector
@pytest.fixture
def mock_db_connection():
    """Temporary SQLite database seeded with a small ``user`` table."""
    conn = SQLiteTempConnector.create_temporary_db()
    user_rows = [
        (1, "Tom", 10),
        (2, "Jerry", 16),
        (3, "Jack", 18),
        (4, "Alice", 20),
        (5, "Bob", 22),
    ]
    tables = {
        "user": {
            "columns": {
                "id": "INTEGER PRIMARY KEY",
                "name": "TEXT",
                "age": "INTEGER",
            },
            "data": user_rows,
        }
    }
    conn.create_temp_tables(tables)
    return conn
@pytest.fixture
def mock_chunk_parameters():
    """ChunkParameters stub; concrete attributes are assigned per-test."""
    params = MagicMock(spec=ChunkParameters)
    return params
@pytest.fixture
def mock_embedding_factory():
    """EmbeddingFactory stub used to fabricate embeddings."""
    factory = MagicMock(spec=EmbeddingFactory)
    return factory
@pytest.fixture
def mock_vector_store_connector():
    """VectorStoreConnector stub; no real vector store is touched."""
    connector = MagicMock(spec=VectorStoreConnector)
    return connector
@pytest.fixture
def mock_knowledge():
    """Knowledge datasource stub."""
    knowledge = MagicMock(spec=Knowledge)
    return knowledge
def test_load_knowledge(
    mock_db_connection,
    mock_knowledge,
    mock_chunk_parameters,
    mock_embedding_factory,
    mock_vector_store_connector,
):
    """Assembling a mocked knowledge source should yield no chunks."""
    # Configure the chunking stub with a user-defined splitter.
    mock_chunk_parameters.chunk_strategy = "CHUNK_BY_SIZE"
    mock_chunk_parameters.text_splitter = CharacterTextSplitter()
    mock_chunk_parameters.splitter_type = SplitterType.USER_DEFINE

    assembler = EmbeddingAssembler(
        knowledge=mock_knowledge,
        chunk_parameters=mock_chunk_parameters,
        embeddings=mock_embedding_factory.create(),
        vector_store_connector=mock_vector_store_connector,
    )
    assembler.load_knowledge(knowledge=mock_knowledge)

    assert len(assembler._chunks) == 0

View File

@@ -0,0 +1,68 @@
from unittest.mock import MagicMock
import pytest
from dbgpt.datasource.rdbms.conn_sqlite import SQLiteTempConnector
from dbgpt.rag.assembler.db_schema import DBSchemaAssembler
from dbgpt.rag.chunk_manager import ChunkParameters, SplitterType
from dbgpt.rag.embedding.embedding_factory import EmbeddingFactory
from dbgpt.rag.text_splitter.text_splitter import CharacterTextSplitter
from dbgpt.storage.vector_store.connector import VectorStoreConnector
@pytest.fixture
def mock_db_connection():
    """Temporary SQLite database seeded with a small ``user`` table."""
    conn = SQLiteTempConnector.create_temporary_db()
    user_rows = [
        (1, "Tom", 10),
        (2, "Jerry", 16),
        (3, "Jack", 18),
        (4, "Alice", 20),
        (5, "Bob", 22),
    ]
    tables = {
        "user": {
            "columns": {
                "id": "INTEGER PRIMARY KEY",
                "name": "TEXT",
                "age": "INTEGER",
            },
            "data": user_rows,
        }
    }
    conn.create_temp_tables(tables)
    return conn
@pytest.fixture
def mock_chunk_parameters():
    """ChunkParameters stub; concrete attributes are assigned per-test."""
    params = MagicMock(spec=ChunkParameters)
    return params
@pytest.fixture
def mock_embedding_factory():
    """EmbeddingFactory stub used to fabricate embeddings."""
    factory = MagicMock(spec=EmbeddingFactory)
    return factory
@pytest.fixture
def mock_vector_store_connector():
    """VectorStoreConnector stub; no real vector store is touched."""
    connector = MagicMock(spec=VectorStoreConnector)
    return connector
def test_load_knowledge(
    mock_db_connection,
    mock_chunk_parameters,
    mock_embedding_factory,
    mock_vector_store_connector,
):
    """Assembling a real temp-DB connection should yield one schema chunk."""
    # Configure the chunking stub with a user-defined splitter.
    mock_chunk_parameters.chunk_strategy = "CHUNK_BY_SIZE"
    mock_chunk_parameters.text_splitter = CharacterTextSplitter()
    mock_chunk_parameters.splitter_type = SplitterType.USER_DEFINE

    assembler = DBSchemaAssembler(
        connector=mock_db_connection,
        chunk_parameters=mock_chunk_parameters,
        embeddings=mock_embedding_factory.create(),
        vector_store_connector=mock_vector_store_connector,
    )

    assert len(assembler._chunks) == 1