feat(ChatKnowledge): ChatKnowledge Support Keyword Retrieve (#1624)

Aries-ckt committed 2024-06-13 13:49:17 +08:00 (committed by GitHub)
Co-authored-by: Fangyin Cheng <staneyffer@gmail.com>
parent 162e2c9b1c
commit 58d08780d6
86 changed files with 948 additions and 440 deletions


@@ -76,14 +76,12 @@ from dbgpt.core.awel import DAG
from dbgpt.rag import ChunkParameters
from dbgpt.rag.knowledge import KnowledgeType
from dbgpt.rag.operators import EmbeddingAssemblerOperator, KnowledgeOperator
-from dbgpt.storage.vector_store.chroma_store import ChromaVectorConfig
-from dbgpt.storage.vector_store.connector import VectorStoreConnector
+from dbgpt.storage.vector_store.chroma_store import ChromaStore, ChromaVectorConfig
# Delete old vector store directory(/tmp/awel_rag_test_vector_store)
shutil.rmtree("/tmp/awel_rag_test_vector_store", ignore_errors=True)
-vector_connector = VectorStoreConnector.from_default(
-    "Chroma",
+vector_store = ChromaStore(
vector_store_config=ChromaVectorConfig(
name="test_vstore",
persist_path="/tmp/awel_rag_test_vector_store",
@@ -95,7 +93,7 @@ with DAG("load_knowledge_dag") as knowledge_dag:
# Load knowledge from URL
knowledge_task = KnowledgeOperator(knowledge_type=KnowledgeType.URL.name)
assembler_task = EmbeddingAssemblerOperator(
-        vector_store_connector=vector_connector,
+        index_store=vector_store,
chunk_parameters=ChunkParameters(chunk_strategy="CHUNK_BY_SIZE")
)
knowledge_task >> assembler_task
@@ -116,7 +114,7 @@ from dbgpt.rag.operators import EmbeddingRetrieverOperator
with DAG("retriever_dag") as retriever_dag:
retriever_task = EmbeddingRetrieverOperator(
top_k=3,
-        vector_store_connector=vector_connector,
+        index_store=vector_store,
)
content_task = MapOperator(lambda cks: "\n".join(c.content for c in cks))
retriever_task >> content_task
@@ -218,7 +216,7 @@ with DAG("llm_rag_dag") as rag_dag:
input_task = InputOperator(input_source=InputSource.from_callable())
retriever_task = EmbeddingRetrieverOperator(
top_k=3,
-        vector_store_connector=vector_connector,
+        index_store=vector_store,
)
content_task = MapOperator(lambda cks: "\n".join(c.content for c in cks))
@@ -256,10 +254,10 @@ from dbgpt.core.awel import DAG, MapOperator, InputOperator, JoinOperator, Input
from dbgpt.core.operators import PromptBuilderOperator, RequestBuilderOperator
from dbgpt.rag import ChunkParameters
from dbgpt.rag.knowledge import KnowledgeType
-from dbgpt.rag.operators import EmbeddingAssemblerOperator, KnowledgeOperator, EmbeddingRetrieverOperator
+from dbgpt.rag.operators import (
+    EmbeddingAssemblerOperator,
+    KnowledgeOperator,
+    EmbeddingRetrieverOperator,
+)
from dbgpt.rag.embedding import DefaultEmbeddingFactory
-from dbgpt.storage.vector_store.chroma_store import ChromaVectorConfig
-from dbgpt.storage.vector_store.connector import VectorStoreConnector
+from dbgpt.storage.vector_store.chroma_store import ChromaStore, ChromaVectorConfig
from dbgpt.model.operators import LLMOperator
from dbgpt.model.proxy import OpenAILLMClient
@@ -273,8 +271,7 @@ llm_client = OpenAILLMClient()
# Delete old vector store directory(/tmp/awel_rag_test_vector_store)
shutil.rmtree("/tmp/awel_rag_test_vector_store", ignore_errors=True)
-vector_connector = VectorStoreConnector.from_default(
-    "Chroma",
+vector_store = ChromaStore(
vector_store_config=ChromaVectorConfig(
name="test_vstore",
persist_path="/tmp/awel_rag_test_vector_store",
@@ -286,7 +283,7 @@ with DAG("load_knowledge_dag") as knowledge_dag:
# Load knowledge from URL
knowledge_task = KnowledgeOperator(knowledge_type=KnowledgeType.URL.name)
assembler_task = EmbeddingAssemblerOperator(
-        vector_store_connector=vector_connector,
+        index_store=vector_store,
chunk_parameters=ChunkParameters(chunk_strategy="CHUNK_BY_SIZE")
)
knowledge_task >> assembler_task
@@ -294,7 +291,6 @@ with DAG("load_knowledge_dag") as knowledge_dag:
chunks = asyncio.run(assembler_task.call("https://docs.dbgpt.site/docs/latest/awel/"))
print(f"Chunk length: {len(chunks)}\n")
prompt = """Based on the known information below, provide users with professional and concise answers to their questions.
If the answer cannot be obtained from the provided content, please say:
"The information provided in the knowledge base is not sufficient to answer this question.".
@@ -305,17 +301,17 @@ It is forbidden to make up information randomly. When answering, it is best to s
{question}
"""
with DAG("llm_rag_dag") as rag_dag:
input_task = InputOperator(input_source=InputSource.from_callable())
retriever_task = EmbeddingRetrieverOperator(
top_k=3,
-        vector_store_connector=vector_connector,
+        index_store=vector_store,
)
content_task = MapOperator(lambda cks: "\n".join(c.content for c in cks))
-    merge_task = JoinOperator(lambda context, question: {"context": context, "question": question})
+    merge_task = JoinOperator(
+        lambda context, question: {"context": context, "question": question})
prompt_task = PromptBuilderOperator(prompt)
# The model is gpt-3.5-turbo, you can replace it with other models.
req_build_task = RequestBuilderOperator(model="gpt-3.5-turbo")


@@ -84,19 +84,19 @@ import shutil
from dbgpt.core.awel import DAG, InputOperator
from dbgpt.rag import ChunkParameters
from dbgpt.rag.operators import DBSchemaAssemblerOperator
-from dbgpt.storage.vector_store.chroma_store import ChromaVectorConfig
-from dbgpt.storage.vector_store.connector import VectorStoreConnector
+from dbgpt.storage.vector_store.chroma_store import ChromaVectorConfig, ChromaStore
# Delete old vector store directory(/tmp/awel_with_data_vector_store)
shutil.rmtree("/tmp/awel_with_data_vector_store", ignore_errors=True)
-vector_connector = VectorStoreConnector.from_default(
-    "Chroma",
-    vector_store_config=ChromaVectorConfig(
-        name="db_schema_vector_store",
-        persist_path="/tmp/awel_with_data_vector_store",
-    ),
-    embedding_fn=embeddings
-)
+vector_store = ChromaStore(
+    ChromaVectorConfig(
+        embedding_fn=embeddings,
+        name="db_schema_vector_store",
+        persist_path="/tmp/awel_with_data_vector_store",
+    )
+)
with DAG("load_schema_dag") as load_schema_dag:
@@ -104,7 +104,7 @@ with DAG("load_schema_dag") as load_schema_dag:
# Load database schema to vector store
assembler_task = DBSchemaAssemblerOperator(
connector=db_conn,
-        vector_store_connector=vector_connector,
+        index_store=vector_store,
chunk_parameters=ChunkParameters(chunk_strategy="CHUNK_BY_SIZE")
)
input_task >> assembler_task
@@ -124,7 +124,7 @@ with DAG("retrieve_schema_dag") as retrieve_schema_dag:
# Retrieve database schema from vector store
retriever_task = DBSchemaRetrieverOperator(
top_k=1,
-        vector_store_connector=vector_connector,
+        index_store=vector_store,
)
input_task >> retriever_task
@@ -244,7 +244,7 @@ with DAG("chat_data_dag") as chat_data_dag:
input_task = InputOperator(input_source=InputSource.from_callable())
retriever_task = DBSchemaRetrieverOperator(
top_k=1,
-        vector_store_connector=vector_connector,
+        index_store=vector_store,
)
content_task = MapOperator(lambda cks: [c.content for c in cks])
merge_task = JoinOperator(lambda table_info, ext_dict: {"table_info": table_info, **ext_dict})
@@ -456,8 +456,7 @@ from dbgpt.model.proxy import OpenAILLMClient
from dbgpt.rag import ChunkParameters
from dbgpt.rag.embedding import DefaultEmbeddingFactory
from dbgpt.rag.operators import DBSchemaAssemblerOperator, DBSchemaRetrieverOperator
-from dbgpt.storage.vector_store.chroma_store import ChromaVectorConfig
-from dbgpt.storage.vector_store.connector import VectorStoreConnector
+from dbgpt.storage.vector_store.chroma_store import ChromaVectorConfig, ChromaStore
# Delete old vector store directory(/tmp/awel_with_data_vector_store)
shutil.rmtree("/tmp/awel_with_data_vector_store", ignore_errors=True)
@@ -488,13 +487,14 @@ db_conn.create_temp_tables(
}
)
-vector_connector = VectorStoreConnector.from_default(
-    "Chroma",
-    vector_store_config=ChromaVectorConfig(
-        name="db_schema_vector_store",
-        persist_path="/tmp/awel_with_data_vector_store",
-    ),
-    embedding_fn=embeddings,
-)
+vector_store = ChromaStore(
+    ChromaVectorConfig(
+        embedding_fn=embeddings,
+        name="db_schema_vector_store",
+        persist_path="/tmp/awel_with_data_vector_store",
+    )
+)
antv_charts = [
@@ -627,7 +627,7 @@ with DAG("load_schema_dag") as load_schema_dag:
# Load database schema to vector store
assembler_task = DBSchemaAssemblerOperator(
connector=db_conn,
-        vector_store_connector=vector_connector,
+        index_store=vector_store,
chunk_parameters=ChunkParameters(chunk_strategy="CHUNK_BY_SIZE"),
)
input_task >> assembler_task
@@ -635,12 +635,11 @@ with DAG("load_schema_dag") as load_schema_dag:
chunks = asyncio.run(assembler_task.call())
print(chunks)
with DAG("chat_data_dag") as chat_data_dag:
input_task = InputOperator(input_source=InputSource.from_callable())
retriever_task = DBSchemaRetrieverOperator(
top_k=1,
-        vector_store_connector=vector_connector,
+        index_store=vector_store,
)
content_task = MapOperator(lambda cks: [c.content for c in cks])
merge_task = JoinOperator(
@@ -653,11 +652,11 @@ with DAG("chat_data_dag") as chat_data_dag:
db_query_task = DatasourceOperator(connector=db_conn)
(
        input_task
        >> MapOperator(lambda x: x["user_input"])
        >> retriever_task
        >> content_task
        >> merge_task
)
input_task >> merge_task
merge_task >> prompt_task >> req_build_task >> llm_task >> sql_parse_task


@@ -129,19 +129,20 @@ To maintain compatibility with existing conventional RAG frameworks, we continue
```python
from dbgpt.model.proxy.llms.chatgpt import OpenAILLMClient
-from dbgpt.storage.vector_store.base import VectorStoreConfig
-from dbgpt.storage.vector_store.connector import VectorStoreConnector
+from dbgpt.storage.knowledge_graph.knowledge_graph import (
+    BuiltinKnowledgeGraph,
+    BuiltinKnowledgeGraphConfig,
+)
-def _create_vector_connector():
-    """Create vector connector."""
-    return VectorStoreConnector(
-        vector_store_type="KnowledgeGraph",
-        vector_store_config=VectorStoreConfig(
-            name="graph_rag_test_kg",
+def _create_kg_connector():
+    """Create knowledge graph connector."""
+    return BuiltinKnowledgeGraph(
+        config=BuiltinKnowledgeGraphConfig(
+            name="graph_rag_test",
embedding_fn=None,
llm_client=OpenAILLMClient(),
model_name="gpt-4"
)
model_name="gpt-4",
),
)
```
@@ -162,13 +163,13 @@ from dbgpt.rag.knowledge import KnowledgeFactory
async def main():
file_path = os.path.join(ROOT_PATH, "examples/test_files/tranformers_story.md")
knowledge = KnowledgeFactory.from_file_path(file_path)
-    vector_connector = _create_kg_connector()
+    graph_store = _create_kg_connector()
chunk_parameters = ChunkParameters(chunk_strategy="CHUNK_BY_SIZE")
# get embedding assembler
assembler = EmbeddingAssembler.load_from_knowledge(
knowledge=knowledge,
chunk_parameters=chunk_parameters,
-        vector_store_connector=vector_connector,
+        index_store=graph_store,
)
assembler.persist()
# get embeddings retriever
@@ -178,7 +179,7 @@ async def main():
score_threshold=0.3
)
print(f"embedding rag example results:{chunks}")
-    vector_connector.delete_vector_name("graph_rag_test")
+    graph_store.delete_vector_name("graph_rag_test")
```


@@ -0,0 +1,132 @@
# Keyword Search RAG User Manual
In this example, we will show how to use the Full Text Search RAG framework in DB-GPT. Implementing RAG with traditional full-text search can, to some extent, alleviate the uncertainty and interpretability issues introduced by vector database retrieval.
You can refer to the Python example file `DB-GPT/examples/rag/keyword_rag_example.py` in the source code. This example demonstrates how to load knowledge from a document and persist it in a full-text store. Subsequently, it recalls knowledge relevant to your question by searching for keywords in the full-text store.
### The Constraints of Vector Retrieval
While vector retrieval offers clear advantages, the technology does have some constraints:
- Computationally Intensive - Generating vectors for entire corpora of documents and querying based on vector similarity requires significantly more processing power than keyword indexing and matching. Latency can be an issue if systems are not properly optimized.
- Requires Massive Training Data - The semantic connections made by models like BERT rely on being trained on massive, diverse datasets over long periods. This data may not be readily available for specialized corpora, limiting the quality of vectors.
- Less Effective for Precise Keyword Queries - Vector search adds little benefit when queries contain clear, precise keywords and intent. Searching for "apple fruit" would likely return poorer results than just "apple" because the vector focuses on overall meaning more than keywords.
### How to Choose Between Vector Retrieval and Keyword Retrieval?
When is vector search preferable to keyword search, and vice versa? Here are some best practices for when to use each:
When to Use Vector Search:
- Early-stage research, when query intent is vague or broad
- The need is to grasp concepts and subject matter more than exact keywords
- Exploring a topic with loose information needs
- User search queries are more conversational

The semantic capabilities of vector search allow it to shine in these use cases. It can point users in the right direction even with limited keywords or a limited understanding of a topic.
When to Use Keyword Search:
- Looking for something ultra-specific and already understand the topic
- Research is narrowly focused with clear objectives
- Queries contain unique proper nouns like brand names
- Needs require fast results more than exhaustive relevancy
For precise or time-sensitive queries, keyword search will target the exact terms efficiently. Vector search may meander with unnecessary semantic expansion.
The search method should align with the user's intent and specificity needs. Vector search for exploration, keyword search for precision. With both available, users get the best of both worlds.
### Install Dependencies
First, you need to install the `dbgpt` library.
```bash
pip install "dbgpt[rag]>=0.5.8"
```
### Prepare Full Text Search Engine
`Elasticsearch` is the distributed search and analytics engine at the heart of the Elastic Stack. Logstash and Beats facilitate collecting, aggregating, and enriching your data and storing it in Elasticsearch. Kibana enables you to interactively explore, visualize, and share insights into your data and manage and monitor the stack. Elasticsearch is where the indexing, search, and analysis magic happens.
For an introduction, refer to https://www.elastic.co/guide/en/elasticsearch/reference/current/elasticsearch-intro.html.
To install Elasticsearch, refer to https://www.elastic.co/guide/en/elasticsearch/reference/current/install-elasticsearch.html.
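If you don't have a cluster available, one quick way to bring up a single-node Elasticsearch for local testing is Docker. This is a sketch, assuming Docker is installed; the image tag is illustrative, and `ELASTIC_PASSWORD` should match the password you configure in the next step:
```bash
# Single-node Elasticsearch for local testing (illustrative image tag).
# ELASTIC_PASSWORD must match ELASTICSEARCH_PASSWORD in your .env file.
docker run -d --name dbgpt-es \
  -p 9200:9200 \
  -e "discovery.type=single-node" \
  -e "ELASTIC_PASSWORD=dbgpt" \
  -e "xpack.security.http.ssl.enabled=false" \
  docker.elastic.co/elasticsearch/elasticsearch:8.13.4
```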
### Keyword Search Configuration
Set the variables below in your `.env` file so that DB-GPT knows how to connect to the full-text search engine storage.
```
ELASTICSEARCH_URL=localhost
ELASTICSEARCH_PORT=9200
ELASTICSEARCH_USERNAME=elastic
ELASTICSEARCH_PASSWORD=dbgpt
```
### Load into Full Text Search Engine
When using an `Elasticsearch` full-text engine as the underlying knowledge storage platform, it is necessary to build a document inverted index to facilitate the archiving and retrieval of documents.
The following code demonstrates how to create a connection to the Elasticsearch search engine.
```python
from dbgpt.storage.full_text.elasticsearch import ElasticDocumentConfig, \
ElasticDocumentStore
def _create_es_connector():
"""Create es connector."""
config = ElasticDocumentConfig(
name="keyword_rag_test",
uri="localhost",
port="9200",
user="elastic",
password="dbgpt",
)
return ElasticDocumentStore(config)
```
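Before wiring the store into an assembler, it can help to verify that the host and credentials actually reach the cluster. Below is a minimal sketch using the official `elasticsearch` Python client; this client is a separate dependency and is not required by the example itself:
```python
from elasticsearch import Elasticsearch

# Same host and credentials as the ElasticDocumentConfig above.
es = Elasticsearch("http://localhost:9200", basic_auth=("elastic", "dbgpt"))
print(es.info())  # Raises an error if the cluster is unreachable or auth fails.
```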
### Keyword Retrieve from Full Text Search Engine
Keyword retrieval is a simple and efficient way to recall relevant information from a large number of documents. It is backed by the Elasticsearch full-text search engine: the user inputs a query, and the most relevant documents are retrieved based on keyword matches.
```python
import asyncio
import os
from dbgpt.configs.model_config import ROOT_PATH
from dbgpt.rag import ChunkParameters
from dbgpt.rag.assembler import EmbeddingAssembler
from dbgpt.rag.knowledge import KnowledgeFactory
async def main():
file_path = os.path.join(ROOT_PATH, "docs/docs/awel/awel.md")
knowledge = KnowledgeFactory.from_file_path(file_path)
keyword_store = _create_es_connector()
chunk_parameters = ChunkParameters(chunk_strategy="CHUNK_BY_SIZE")
# get embedding assembler
assembler = EmbeddingAssembler.load_from_knowledge(
knowledge=knowledge,
chunk_parameters=chunk_parameters,
index_store=keyword_store,
)
assembler.persist()
# get embeddings retriever
retriever = assembler.as_retriever(3)
chunks = await retriever.aretrieve_with_scores("what is awel talk about", 0.3)
print(f"keyword rag example results:{chunks}")
```
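Each retrieved result is a chunk of the original document. As a rough sketch, assuming the chunk objects expose `content` and `score` fields as in the AWEL examples above, you could preview the recalled text like this:
```python
def show_chunks(chunks):
    """Print a short preview of each recalled chunk."""
    for i, chunk in enumerate(chunks, 1):
        # `score` reflects the full-text relevance returned by Elasticsearch.
        print(f"[{i}] score={chunk.score:.3f} | {chunk.content[:80]}...")
```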
### Chat Knowledge via Keyword RAG
Here we demonstrate how to chat with your knowledge base through Keyword RAG on the web page.
First, create a knowledge base using the `Full Text` type. Upload the knowledge documents and wait for the slicing to complete.
<p align="left">
<img src={'/img/chat_knowledge/keyword_rag/create_keyword_rag.jpg'} width="1000px"/>
</p>
Start chatting with the knowledge base, backed by Keyword RAG.
<p align="left">
<img src={'/img/chat_knowledge/keyword_rag/keyword_search_chat.jpg'} width="1000px"/>
</p>


@@ -477,7 +477,10 @@ const sidebars = {
{
type: 'doc',
id: 'cookbook/rag/graph_rag_app_develop',
-      }
+      },{
+        type: 'doc',
+        id: 'cookbook/rag/keyword_rag_app_develop',
+      },
],
},
{

Binary file not shown (new image; 225 KiB).

Binary file not shown (new image; 241 KiB).