From 20e7ccc83148bdef45a4c70202f0af64f89ffd38 Mon Sep 17 00:00:00 2001
From: "Kevin.Shin"
Date: Thu, 6 Jun 2024 13:45:50 +0800
Subject: [PATCH] FIX:1598 Use PageTextSplitter for DatasourceKnowledge (#1599)

Co-authored-by: shenk-b
Co-authored-by: aries_ckt <916701291@qq.com>
---
 dbgpt/rag/knowledge/datasource.py      | 10 ++++++++++
 dbgpt/rag/summary/db_summary_client.py |  4 +++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/dbgpt/rag/knowledge/datasource.py b/dbgpt/rag/knowledge/datasource.py
index cbafe3d41..78ae045dd 100644
--- a/dbgpt/rag/knowledge/datasource.py
+++ b/dbgpt/rag/knowledge/datasource.py
@@ -52,6 +52,7 @@ class DatasourceKnowledge(Knowledge):
         return [
             ChunkStrategy.CHUNK_BY_SIZE,
             ChunkStrategy.CHUNK_BY_SEPARATOR,
+            ChunkStrategy.CHUNK_BY_PAGE,
         ]
 
     @classmethod
@@ -63,3 +64,12 @@ class DatasourceKnowledge(Knowledge):
     def document_type(cls) -> DocumentType:
         """Return document type."""
         return DocumentType.DATASOURCE
+
+    @classmethod
+    def default_chunk_strategy(cls) -> ChunkStrategy:
+        """Return default chunk strategy.
+
+        Returns:
+            ChunkStrategy: default chunk strategy
+        """
+        return ChunkStrategy.CHUNK_BY_PAGE
diff --git a/dbgpt/rag/summary/db_summary_client.py b/dbgpt/rag/summary/db_summary_client.py
index 29eb0e046..e77560477 100644
--- a/dbgpt/rag/summary/db_summary_client.py
+++ b/dbgpt/rag/summary/db_summary_client.py
@@ -101,8 +101,10 @@ class DBSummaryClient:
             from dbgpt.rag.assembler.db_schema import DBSchemaAssembler
 
             db_assembler = DBSchemaAssembler.load_from_connection(
-                connector=db_summary_client.db, vector_store_connector=vector_connector
+                connector=db_summary_client.db,
+                vector_store_connector=vector_connector,
             )
+
             if len(db_assembler.get_chunks()) > 0:
                 db_assembler.persist()
             else: