mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-07-28 06:17:14 +00:00
feat(ChatKnowledge):chunk add enable_merge parameter (#1014)
Co-authored-by: Aralhi <xiaoping0501@gmail.com>
This commit is contained in:
parent
fd30588e55
commit
ca83443c48
@ -7,7 +7,7 @@ from dbgpt.model import DefaultLLMClient
|
|||||||
from dbgpt.rag.chunk import Chunk
|
from dbgpt.rag.chunk import Chunk
|
||||||
from dbgpt.rag.chunk_manager import ChunkParameters
|
from dbgpt.rag.chunk_manager import ChunkParameters
|
||||||
from dbgpt.rag.embedding.embedding_factory import EmbeddingFactory
|
from dbgpt.rag.embedding.embedding_factory import EmbeddingFactory
|
||||||
from dbgpt.rag.knowledge.base import KnowledgeType
|
from dbgpt.rag.knowledge.base import KnowledgeType, ChunkStrategy
|
||||||
from dbgpt.rag.knowledge.factory import KnowledgeFactory
|
from dbgpt.rag.knowledge.factory import KnowledgeFactory
|
||||||
from dbgpt.rag.text_splitter.text_splitter import (
|
from dbgpt.rag.text_splitter.text_splitter import (
|
||||||
RecursiveCharacterTextSplitter,
|
RecursiveCharacterTextSplitter,
|
||||||
@ -234,7 +234,7 @@ class KnowledgeService:
|
|||||||
f" doc:{doc.doc_name} status is {doc.status}, can not sync"
|
f" doc:{doc.doc_name} status is {doc.status}, can not sync"
|
||||||
)
|
)
|
||||||
chunk_parameters = sync_request.chunk_parameters
|
chunk_parameters = sync_request.chunk_parameters
|
||||||
if "Automatic" == chunk_parameters.chunk_strategy:
|
if chunk_parameters.chunk_strategy != ChunkStrategy.CHUNK_BY_SIZE.name:
|
||||||
space_context = self.get_space_context(space_name)
|
space_context = self.get_space_context(space_name)
|
||||||
chunk_parameters.chunk_size = (
|
chunk_parameters.chunk_size = (
|
||||||
CFG.KNOWLEDGE_CHUNK_SIZE
|
CFG.KNOWLEDGE_CHUNK_SIZE
|
||||||
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -0,0 +1 @@
|
|||||||
|
self.__BUILD_MANIFEST=function(s,c,a,e,t,n,f,d,k,b,h,i,u,j){return{__rewrites:{beforeFiles:[],afterFiles:[],fallback:[]},"/":["static/chunks/29107295-90b90cb30c825230.js",s,c,e,a,f,d,k,h,"static/chunks/861-78929b4f98dbbfd6.js","static/chunks/161-96143606b49cf4a1.js","static/chunks/pages/index-ba9785759e4fc934.js"],"/_error":["static/chunks/pages/_error-dee72aff9b2e2c12.js"],"/agent":[s,c,a,t,f,n,"static/chunks/pages/agent-a2599efbeb46e056.js"],"/chat":["static/chunks/pages/chat-47a20abbae16e858.js"],"/chat/[scene]/[id]":["static/chunks/pages/chat/[scene]/[id]-8df445f91cde33fa.js"],"/database":[s,c,e,a,t,n,k,b,"static/chunks/643-d8f53f40dd3c5b40.js","static/chunks/pages/database-d36f41810fc357a6.js"],"/knowledge":[i,s,c,a,t,f,n,k,u,b,h,"static/chunks/10-f02ccef88f814547.js","static/chunks/pages/knowledge-f3c914cac944c089.js"],"/knowledge/chunk":[s,e,t,d,n,"static/chunks/pages/knowledge/chunk-652744b9d90c26c9.js"],"/models":[i,s,c,e,a,j,b,"static/chunks/pages/models-1145859ba0e2f20a.js"],"/prompt":[s,c,e,a,j,d,u,"static/chunks/346-b0aea1c99abd6f1e.js","static/chunks/607-2dedaf19149304c0.js","static/chunks/pages/prompt-fca5ed813d5018b1.js"],sortedPages:["/","/_app","/_error","/agent","/chat","/chat/[scene]/[id]","/database","/knowledge","/knowledge/chunk","/models","/prompt"]}}("static/chunks/113-15fc0b8bd2b5b9a1.js","static/chunks/17-d6c52cecd9ecc451.js","static/chunks/479-33b3ebe9be79a971.js","static/chunks/9-bb2c54d5c06ba4bf.js","static/chunks/442-197e6cbc1e54109a.js","static/chunks/813-cce9482e33f2430c.js","static/chunks/553-a89ad624ca0f1ffa.js","static/chunks/810-84757da754c6f3fc.js","static/chunks/411-b5d3e7f64bee2335.js","static/chunks/928-74244889bd7f2699.js","static/chunks/234-42f62dc360b2d9e4.js","static/chunks/75fc9c18-a784766a129ec5fb.js","static/chunks/45-9ff739c09925ea35.js","static/chunks/947-5980a3ff49069ddd.js"),self.__BUILD_MANIFEST_CB&&self.__BUILD_MANIFEST_CB();
|
@ -1 +0,0 @@
|
|||||||
self.__BUILD_MANIFEST=function(s,c,a,e,t,n,d,f,b,k,h,i,u,j){return{__rewrites:{beforeFiles:[],afterFiles:[],fallback:[]},"/":["static/chunks/29107295-90b90cb30c825230.js",s,c,e,a,d,f,b,h,"static/chunks/861-78929b4f98dbbfd6.js","static/chunks/161-96143606b49cf4a1.js","static/chunks/pages/index-ba9785759e4fc934.js"],"/_error":["static/chunks/pages/_error-dee72aff9b2e2c12.js"],"/agent":[s,c,a,t,d,n,"static/chunks/pages/agent-a2599efbeb46e056.js"],"/chat":["static/chunks/pages/chat-47a20abbae16e858.js"],"/chat/[scene]/[id]":["static/chunks/pages/chat/[scene]/[id]-8df445f91cde33fa.js"],"/database":[s,c,e,a,t,n,b,k,"static/chunks/643-d8f53f40dd3c5b40.js","static/chunks/pages/database-d36f41810fc357a6.js"],"/knowledge":[i,s,c,a,t,d,n,b,u,k,h,"static/chunks/450-bd680f0e37e9b4b9.js","static/chunks/pages/knowledge-b9300e7addf1931f.js"],"/knowledge/chunk":[s,e,t,f,n,"static/chunks/pages/knowledge/chunk-652744b9d90c26c9.js"],"/models":[i,s,c,e,a,j,k,"static/chunks/pages/models-1145859ba0e2f20a.js"],"/prompt":[s,c,e,a,j,f,u,"static/chunks/346-b0aea1c99abd6f1e.js","static/chunks/607-2dedaf19149304c0.js","static/chunks/pages/prompt-fca5ed813d5018b1.js"],sortedPages:["/","/_app","/_error","/agent","/chat","/chat/[scene]/[id]","/database","/knowledge","/knowledge/chunk","/models","/prompt"]}}("static/chunks/113-15fc0b8bd2b5b9a1.js","static/chunks/17-d6c52cecd9ecc451.js","static/chunks/479-33b3ebe9be79a971.js","static/chunks/9-bb2c54d5c06ba4bf.js","static/chunks/442-197e6cbc1e54109a.js","static/chunks/813-cce9482e33f2430c.js","static/chunks/553-a89ad624ca0f1ffa.js","static/chunks/810-84757da754c6f3fc.js","static/chunks/411-b5d3e7f64bee2335.js","static/chunks/928-74244889bd7f2699.js","static/chunks/234-42f62dc360b2d9e4.js","static/chunks/75fc9c18-a784766a129ec5fb.js","static/chunks/45-9ff739c09925ea35.js","static/chunks/947-5980a3ff49069ddd.js"),self.__BUILD_MANIFEST_CB&&self.__BUILD_MANIFEST_CB();
|
|
17
dbgpt/app/static/_next/static/chunks/10-f02ccef88f814547.js
Normal file
17
dbgpt/app/static/_next/static/chunks/10-f02ccef88f814547.js
Normal file
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -45,6 +45,10 @@ class ChunkParameters(BaseModel):
|
|||||||
default="\n",
|
default="\n",
|
||||||
description="chunk separator",
|
description="chunk separator",
|
||||||
)
|
)
|
||||||
|
enable_merge: bool = Field(
|
||||||
|
default=None,
|
||||||
|
description="enable chunk merge by chunk_size.",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class ChunkManager:
|
class ChunkManager:
|
||||||
@ -134,4 +138,5 @@ class ChunkManager:
|
|||||||
chunk_size=self._chunk_parameters.chunk_size,
|
chunk_size=self._chunk_parameters.chunk_size,
|
||||||
chunk_overlap=self._chunk_parameters.chunk_overlap,
|
chunk_overlap=self._chunk_parameters.chunk_overlap,
|
||||||
separator=self._chunk_parameters.separator,
|
separator=self._chunk_parameters.separator,
|
||||||
|
enable_merge=self._chunk_parameters.enable_merge,
|
||||||
)
|
)
|
||||||
|
@ -47,8 +47,18 @@ class ChunkStrategy(Enum):
|
|||||||
CHUNK_BY_SIZE = (
|
CHUNK_BY_SIZE = (
|
||||||
RecursiveCharacterTextSplitter,
|
RecursiveCharacterTextSplitter,
|
||||||
[
|
[
|
||||||
{"param_name": "chunk_size", "param_type": "int", "default_value": 512},
|
{
|
||||||
{"param_name": "chunk_overlap", "param_type": "int", "default_value": 50},
|
"param_name": "chunk_size",
|
||||||
|
"param_type": "int",
|
||||||
|
"default_value": 512,
|
||||||
|
"description": "The size of the data chunks used in processing.",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"param_name": "chunk_overlap",
|
||||||
|
"param_type": "int",
|
||||||
|
"default_value": 50,
|
||||||
|
"description": "The amount of overlap between adjacent data chunks.",
|
||||||
|
},
|
||||||
],
|
],
|
||||||
"chunk size",
|
"chunk size",
|
||||||
"split document by chunk size",
|
"split document by chunk size",
|
||||||
@ -56,13 +66,33 @@ class ChunkStrategy(Enum):
|
|||||||
CHUNK_BY_PAGE = (PageTextSplitter, [], "page", "split document by page")
|
CHUNK_BY_PAGE = (PageTextSplitter, [], "page", "split document by page")
|
||||||
CHUNK_BY_PARAGRAPH = (
|
CHUNK_BY_PARAGRAPH = (
|
||||||
ParagraphTextSplitter,
|
ParagraphTextSplitter,
|
||||||
[{"param_name": "separator", "param_type": "string", "default_value": "\n"}],
|
[
|
||||||
|
{
|
||||||
|
"param_name": "separator",
|
||||||
|
"param_type": "string",
|
||||||
|
"default_value": "\\n",
|
||||||
|
"description": "paragraph separator",
|
||||||
|
}
|
||||||
|
],
|
||||||
"paragraph",
|
"paragraph",
|
||||||
"split document by paragraph",
|
"split document by paragraph",
|
||||||
)
|
)
|
||||||
CHUNK_BY_SEPARATOR = (
|
CHUNK_BY_SEPARATOR = (
|
||||||
SeparatorTextSplitter,
|
SeparatorTextSplitter,
|
||||||
[{"param_name": "separator", "param_type": "string", "default_value": "\n"}],
|
[
|
||||||
|
{
|
||||||
|
"param_name": "separator",
|
||||||
|
"param_type": "string",
|
||||||
|
"default_value": "\\n",
|
||||||
|
"description": "chunk separator",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"param_name": "enable_merge",
|
||||||
|
"param_type": "boolean",
|
||||||
|
"default_value": False,
|
||||||
|
"description": "Whether to merge according to the chunk_size after splitting by the separator.",
|
||||||
|
},
|
||||||
|
],
|
||||||
"separator",
|
"separator",
|
||||||
"split document by separator",
|
"split document by separator",
|
||||||
)
|
)
|
||||||
@ -80,6 +110,7 @@ class ChunkStrategy(Enum):
|
|||||||
self.description = description
|
self.description = description
|
||||||
|
|
||||||
def match(self, *args, **kwargs):
|
def match(self, *args, **kwargs):
|
||||||
|
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
||||||
return self.value[0](*args, **kwargs)
|
return self.value[0](*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
@ -682,6 +682,7 @@ class SeparatorTextSplitter(CharacterTextSplitter):
|
|||||||
|
|
||||||
def __init__(self, separator: str = "\n", filters: list = [], **kwargs: Any):
|
def __init__(self, separator: str = "\n", filters: list = [], **kwargs: Any):
|
||||||
"""Create a new TextSplitter."""
|
"""Create a new TextSplitter."""
|
||||||
|
self._merge = kwargs.pop("enable_merge") or False
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self._separator = separator
|
self._separator = separator
|
||||||
self._filter = filters
|
self._filter = filters
|
||||||
@ -696,7 +697,9 @@ class SeparatorTextSplitter(CharacterTextSplitter):
|
|||||||
splits = text.split(separator)
|
splits = text.split(separator)
|
||||||
else:
|
else:
|
||||||
splits = list(text)
|
splits = list(text)
|
||||||
|
if self._merge:
|
||||||
return self._merge_splits(splits, separator, chunk_overlap=0, **kwargs)
|
return self._merge_splits(splits, separator, chunk_overlap=0, **kwargs)
|
||||||
|
return list(filter(None, text.split(separator)))
|
||||||
|
|
||||||
|
|
||||||
class PageTextSplitter(TextSplitter):
|
class PageTextSplitter(TextSplitter):
|
||||||
|
@ -61,7 +61,6 @@ const resources = {
|
|||||||
recall_score: 'recall_score',
|
recall_score: 'recall_score',
|
||||||
Set_a_threshold_score: 'Set a threshold score for the retrieval of similar vectors',
|
Set_a_threshold_score: 'Set a threshold score for the retrieval of similar vectors',
|
||||||
recall_type: 'recall_type',
|
recall_type: 'recall_type',
|
||||||
Recall_Type: 'recall type',
|
|
||||||
model: 'model',
|
model: 'model',
|
||||||
A_model_used: 'A model used to create vector representations of text or other data',
|
A_model_used: 'A model used to create vector representations of text or other data',
|
||||||
Automatic: 'Automatic',
|
Automatic: 'Automatic',
|
||||||
@ -239,12 +238,11 @@ const resources = {
|
|||||||
Please_select_a_file: '请上传一个文件',
|
Please_select_a_file: '请上传一个文件',
|
||||||
Please_input_the_text: '请输入文本',
|
Please_input_the_text: '请输入文本',
|
||||||
Embedding: '嵌入',
|
Embedding: '嵌入',
|
||||||
topk: '球',
|
topk: 'TopK',
|
||||||
the_top_k_vectors: '基于相似度得分的前 k 个向量',
|
the_top_k_vectors: '基于相似度得分的前 k 个向量',
|
||||||
recall_score: '召回分数',
|
recall_score: '召回分数',
|
||||||
Set_a_threshold_score: '设置相似向量检索的阈值分数',
|
Set_a_threshold_score: '设置相似向量检索的阈值分数',
|
||||||
recall_type: '回忆类型',
|
recall_type: '召回类型',
|
||||||
Recall_Type: '回忆类型',
|
|
||||||
model: '模型',
|
model: '模型',
|
||||||
A_model_used: '用于创建文本或其他数据的矢量表示的模型',
|
A_model_used: '用于创建文本或其他数据的矢量表示的模型',
|
||||||
Automatic: '自动切片',
|
Automatic: '自动切片',
|
||||||
|
@ -47,7 +47,7 @@ export default function ArgumentsModal({ space, argumentsShow, setArgumentsShow
|
|||||||
</Form.Item>
|
</Form.Item>
|
||||||
</Col>
|
</Col>
|
||||||
<Col span={12}>
|
<Col span={12}>
|
||||||
<Form.Item<IArguments> tooltip={t(`Recall_Type`)} rules={[{ required: true }]} label={t('recall_type')} name={['embedding', 'recall_type']}>
|
<Form.Item<IArguments> tooltip={t(`recall_type`)} rules={[{ required: true }]} label={t('recall_type')} name={['embedding', 'recall_type']}>
|
||||||
<Input className="mb-5 h-12" />
|
<Input className="mb-5 h-12" />
|
||||||
</Form.Item>
|
</Form.Item>
|
||||||
</Col>
|
</Col>
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
import { IChunkStrategyResponse } from '@/types/knowledge';
|
import { IChunkStrategyResponse } from '@/types/knowledge';
|
||||||
import { Alert, Form, FormListFieldData, Input, InputNumber, Radio, RadioChangeEvent } from 'antd';
|
import { Alert, Checkbox, Form, FormListFieldData, Input, InputNumber, Radio, RadioChangeEvent } from 'antd';
|
||||||
import { useState } from 'react';
|
import { useState } from 'react';
|
||||||
import { useTranslation } from 'react-i18next';
|
import { useTranslation } from 'react-i18next';
|
||||||
const { TextArea } = Input;
|
const { TextArea } = Input;
|
||||||
@ -25,7 +25,7 @@ export default function StrategyForm({ strategies, docType, fileName, field }: I
|
|||||||
const [selectedStrategy, setSelectedStrategy] = useState<string>();
|
const [selectedStrategy, setSelectedStrategy] = useState<string>();
|
||||||
const { t } = useTranslation();
|
const { t } = useTranslation();
|
||||||
const DEFAULT_STRATEGY = {
|
const DEFAULT_STRATEGY = {
|
||||||
strategy: t('Automatic'),
|
strategy: 'Automatic',
|
||||||
name: t('Automatic'),
|
name: t('Automatic'),
|
||||||
desc: t('Automatic_desc'),
|
desc: t('Automatic_desc'),
|
||||||
};
|
};
|
||||||
@ -50,17 +50,30 @@ export default function StrategyForm({ strategies, docType, fileName, field }: I
|
|||||||
{parameters?.map((param) => (
|
{parameters?.map((param) => (
|
||||||
<Form.Item
|
<Form.Item
|
||||||
key={`param_${param.param_name}`}
|
key={`param_${param.param_name}`}
|
||||||
label={`${param.param_name}: ${param.param_type}`}
|
label={param.param_name}
|
||||||
name={[field!.name, 'chunk_parameters', param.param_name]}
|
name={[field!.name, 'chunk_parameters', param.param_name]}
|
||||||
rules={[{ required: true, message: t('Please_input_the_name') }]}
|
rules={[{ required: true, message: t('Please_input_the_name') }]}
|
||||||
initialValue={param.default_value}
|
initialValue={param.default_value}
|
||||||
|
valuePropName={param.param_type === 'boolean' ? 'checked' : 'value'}
|
||||||
|
tooltip={param.description}
|
||||||
>
|
>
|
||||||
{param.param_type === 'int' ? <InputNumber className="w-full" min={1} /> : <TextArea className="w-full" rows={2} maxLength={6} />}
|
{renderParamByType(param.param_type)}
|
||||||
</Form.Item>
|
</Form.Item>
|
||||||
))}
|
))}
|
||||||
</div>
|
</div>
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function renderParamByType(type: string) {
|
||||||
|
switch (type) {
|
||||||
|
case 'int':
|
||||||
|
return <InputNumber className="w-full" min={1} />;
|
||||||
|
case 'string':
|
||||||
|
return <TextArea className="w-full" rows={2} />;
|
||||||
|
case 'boolean':
|
||||||
|
return <Checkbox />;
|
||||||
|
}
|
||||||
|
}
|
||||||
return (
|
return (
|
||||||
<>
|
<>
|
||||||
<Form.Item name={[field!.name, 'chunk_parameters', 'chunk_strategy']} initialValue={DEFAULT_STRATEGY.strategy}>
|
<Form.Item name={[field!.name, 'chunk_parameters', 'chunk_strategy']} initialValue={DEFAULT_STRATEGY.strategy}>
|
||||||
|
@ -82,6 +82,7 @@ export type IStrategyParameter = {
|
|||||||
param_name: string;
|
param_name: string;
|
||||||
param_type: string;
|
param_type: string;
|
||||||
default_value?: string | number;
|
default_value?: string | number;
|
||||||
|
description: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
export type IChunkStrategyResponse = {
|
export type IChunkStrategyResponse = {
|
||||||
|
Loading…
Reference in New Issue
Block a user