mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-07-22 11:51:42 +00:00
feat(ChatKnowledge):chunk add enable_merge parameter (#1014)
Co-authored-by: Aralhi <xiaoping0501@gmail.com>
This commit is contained in:
parent
fd30588e55
commit
ca83443c48
@ -7,7 +7,7 @@ from dbgpt.model import DefaultLLMClient
|
||||
from dbgpt.rag.chunk import Chunk
|
||||
from dbgpt.rag.chunk_manager import ChunkParameters
|
||||
from dbgpt.rag.embedding.embedding_factory import EmbeddingFactory
|
||||
from dbgpt.rag.knowledge.base import KnowledgeType
|
||||
from dbgpt.rag.knowledge.base import KnowledgeType, ChunkStrategy
|
||||
from dbgpt.rag.knowledge.factory import KnowledgeFactory
|
||||
from dbgpt.rag.text_splitter.text_splitter import (
|
||||
RecursiveCharacterTextSplitter,
|
||||
@ -234,7 +234,7 @@ class KnowledgeService:
|
||||
f" doc:{doc.doc_name} status is {doc.status}, can not sync"
|
||||
)
|
||||
chunk_parameters = sync_request.chunk_parameters
|
||||
if "Automatic" == chunk_parameters.chunk_strategy:
|
||||
if chunk_parameters.chunk_strategy != ChunkStrategy.CHUNK_BY_SIZE.name:
|
||||
space_context = self.get_space_context(space_name)
|
||||
chunk_parameters.chunk_size = (
|
||||
CFG.KNOWLEDGE_CHUNK_SIZE
|
||||
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -0,0 +1 @@
|
||||
self.__BUILD_MANIFEST=function(s,c,a,e,t,n,f,d,k,b,h,i,u,j){return{__rewrites:{beforeFiles:[],afterFiles:[],fallback:[]},"/":["static/chunks/29107295-90b90cb30c825230.js",s,c,e,a,f,d,k,h,"static/chunks/861-78929b4f98dbbfd6.js","static/chunks/161-96143606b49cf4a1.js","static/chunks/pages/index-ba9785759e4fc934.js"],"/_error":["static/chunks/pages/_error-dee72aff9b2e2c12.js"],"/agent":[s,c,a,t,f,n,"static/chunks/pages/agent-a2599efbeb46e056.js"],"/chat":["static/chunks/pages/chat-47a20abbae16e858.js"],"/chat/[scene]/[id]":["static/chunks/pages/chat/[scene]/[id]-8df445f91cde33fa.js"],"/database":[s,c,e,a,t,n,k,b,"static/chunks/643-d8f53f40dd3c5b40.js","static/chunks/pages/database-d36f41810fc357a6.js"],"/knowledge":[i,s,c,a,t,f,n,k,u,b,h,"static/chunks/10-f02ccef88f814547.js","static/chunks/pages/knowledge-f3c914cac944c089.js"],"/knowledge/chunk":[s,e,t,d,n,"static/chunks/pages/knowledge/chunk-652744b9d90c26c9.js"],"/models":[i,s,c,e,a,j,b,"static/chunks/pages/models-1145859ba0e2f20a.js"],"/prompt":[s,c,e,a,j,d,u,"static/chunks/346-b0aea1c99abd6f1e.js","static/chunks/607-2dedaf19149304c0.js","static/chunks/pages/prompt-fca5ed813d5018b1.js"],sortedPages:["/","/_app","/_error","/agent","/chat","/chat/[scene]/[id]","/database","/knowledge","/knowledge/chunk","/models","/prompt"]}}("static/chunks/113-15fc0b8bd2b5b9a1.js","static/chunks/17-d6c52cecd9ecc451.js","static/chunks/479-33b3ebe9be79a971.js","static/chunks/9-bb2c54d5c06ba4bf.js","static/chunks/442-197e6cbc1e54109a.js","static/chunks/813-cce9482e33f2430c.js","static/chunks/553-a89ad624ca0f1ffa.js","static/chunks/810-84757da754c6f3fc.js","static/chunks/411-b5d3e7f64bee2335.js","static/chunks/928-74244889bd7f2699.js","static/chunks/234-42f62dc360b2d9e4.js","static/chunks/75fc9c18-a784766a129ec5fb.js","static/chunks/45-9ff739c09925ea35.js","static/chunks/947-5980a3ff49069ddd.js"),self.__BUILD_MANIFEST_CB&&self.__BUILD_MANIFEST_CB();
|
@ -1 +0,0 @@
|
||||
self.__BUILD_MANIFEST=function(s,c,a,e,t,n,d,f,b,k,h,i,u,j){return{__rewrites:{beforeFiles:[],afterFiles:[],fallback:[]},"/":["static/chunks/29107295-90b90cb30c825230.js",s,c,e,a,d,f,b,h,"static/chunks/861-78929b4f98dbbfd6.js","static/chunks/161-96143606b49cf4a1.js","static/chunks/pages/index-ba9785759e4fc934.js"],"/_error":["static/chunks/pages/_error-dee72aff9b2e2c12.js"],"/agent":[s,c,a,t,d,n,"static/chunks/pages/agent-a2599efbeb46e056.js"],"/chat":["static/chunks/pages/chat-47a20abbae16e858.js"],"/chat/[scene]/[id]":["static/chunks/pages/chat/[scene]/[id]-8df445f91cde33fa.js"],"/database":[s,c,e,a,t,n,b,k,"static/chunks/643-d8f53f40dd3c5b40.js","static/chunks/pages/database-d36f41810fc357a6.js"],"/knowledge":[i,s,c,a,t,d,n,b,u,k,h,"static/chunks/450-bd680f0e37e9b4b9.js","static/chunks/pages/knowledge-b9300e7addf1931f.js"],"/knowledge/chunk":[s,e,t,f,n,"static/chunks/pages/knowledge/chunk-652744b9d90c26c9.js"],"/models":[i,s,c,e,a,j,k,"static/chunks/pages/models-1145859ba0e2f20a.js"],"/prompt":[s,c,e,a,j,f,u,"static/chunks/346-b0aea1c99abd6f1e.js","static/chunks/607-2dedaf19149304c0.js","static/chunks/pages/prompt-fca5ed813d5018b1.js"],sortedPages:["/","/_app","/_error","/agent","/chat","/chat/[scene]/[id]","/database","/knowledge","/knowledge/chunk","/models","/prompt"]}}("static/chunks/113-15fc0b8bd2b5b9a1.js","static/chunks/17-d6c52cecd9ecc451.js","static/chunks/479-33b3ebe9be79a971.js","static/chunks/9-bb2c54d5c06ba4bf.js","static/chunks/442-197e6cbc1e54109a.js","static/chunks/813-cce9482e33f2430c.js","static/chunks/553-a89ad624ca0f1ffa.js","static/chunks/810-84757da754c6f3fc.js","static/chunks/411-b5d3e7f64bee2335.js","static/chunks/928-74244889bd7f2699.js","static/chunks/234-42f62dc360b2d9e4.js","static/chunks/75fc9c18-a784766a129ec5fb.js","static/chunks/45-9ff739c09925ea35.js","static/chunks/947-5980a3ff49069ddd.js"),self.__BUILD_MANIFEST_CB&&self.__BUILD_MANIFEST_CB();
|
17
dbgpt/app/static/_next/static/chunks/10-f02ccef88f814547.js
Normal file
17
dbgpt/app/static/_next/static/chunks/10-f02ccef88f814547.js
Normal file
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -45,6 +45,10 @@ class ChunkParameters(BaseModel):
|
||||
default="\n",
|
||||
description="chunk separator",
|
||||
)
|
||||
enable_merge: bool = Field(
|
||||
default=None,
|
||||
description="enable chunk merge by chunk_size.",
|
||||
)
|
||||
|
||||
|
||||
class ChunkManager:
|
||||
@ -134,4 +138,5 @@ class ChunkManager:
|
||||
chunk_size=self._chunk_parameters.chunk_size,
|
||||
chunk_overlap=self._chunk_parameters.chunk_overlap,
|
||||
separator=self._chunk_parameters.separator,
|
||||
enable_merge=self._chunk_parameters.enable_merge,
|
||||
)
|
||||
|
@ -47,8 +47,18 @@ class ChunkStrategy(Enum):
|
||||
CHUNK_BY_SIZE = (
|
||||
RecursiveCharacterTextSplitter,
|
||||
[
|
||||
{"param_name": "chunk_size", "param_type": "int", "default_value": 512},
|
||||
{"param_name": "chunk_overlap", "param_type": "int", "default_value": 50},
|
||||
{
|
||||
"param_name": "chunk_size",
|
||||
"param_type": "int",
|
||||
"default_value": 512,
|
||||
"description": "The size of the data chunks used in processing.",
|
||||
},
|
||||
{
|
||||
"param_name": "chunk_overlap",
|
||||
"param_type": "int",
|
||||
"default_value": 50,
|
||||
"description": "The amount of overlap between adjacent data chunks.",
|
||||
},
|
||||
],
|
||||
"chunk size",
|
||||
"split document by chunk size",
|
||||
@ -56,13 +66,33 @@ class ChunkStrategy(Enum):
|
||||
CHUNK_BY_PAGE = (PageTextSplitter, [], "page", "split document by page")
|
||||
CHUNK_BY_PARAGRAPH = (
|
||||
ParagraphTextSplitter,
|
||||
[{"param_name": "separator", "param_type": "string", "default_value": "\n"}],
|
||||
[
|
||||
{
|
||||
"param_name": "separator",
|
||||
"param_type": "string",
|
||||
"default_value": "\\n",
|
||||
"description": "paragraph separator",
|
||||
}
|
||||
],
|
||||
"paragraph",
|
||||
"split document by paragraph",
|
||||
)
|
||||
CHUNK_BY_SEPARATOR = (
|
||||
SeparatorTextSplitter,
|
||||
[{"param_name": "separator", "param_type": "string", "default_value": "\n"}],
|
||||
[
|
||||
{
|
||||
"param_name": "separator",
|
||||
"param_type": "string",
|
||||
"default_value": "\\n",
|
||||
"description": "chunk separator",
|
||||
},
|
||||
{
|
||||
"param_name": "enable_merge",
|
||||
"param_type": "boolean",
|
||||
"default_value": False,
|
||||
"description": "Whether to merge according to the chunk_size after splitting by the separator.",
|
||||
},
|
||||
],
|
||||
"separator",
|
||||
"split document by separator",
|
||||
)
|
||||
@ -80,6 +110,7 @@ class ChunkStrategy(Enum):
|
||||
self.description = description
|
||||
|
||||
def match(self, *args, **kwargs):
|
||||
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
||||
return self.value[0](*args, **kwargs)
|
||||
|
||||
|
||||
|
@ -682,6 +682,7 @@ class SeparatorTextSplitter(CharacterTextSplitter):
|
||||
|
||||
def __init__(self, separator: str = "\n", filters: list = [], **kwargs: Any):
|
||||
"""Create a new TextSplitter."""
|
||||
self._merge = kwargs.pop("enable_merge") or False
|
||||
super().__init__(**kwargs)
|
||||
self._separator = separator
|
||||
self._filter = filters
|
||||
@ -696,7 +697,9 @@ class SeparatorTextSplitter(CharacterTextSplitter):
|
||||
splits = text.split(separator)
|
||||
else:
|
||||
splits = list(text)
|
||||
return self._merge_splits(splits, separator, chunk_overlap=0, **kwargs)
|
||||
if self._merge:
|
||||
return self._merge_splits(splits, separator, chunk_overlap=0, **kwargs)
|
||||
return list(filter(None, text.split(separator)))
|
||||
|
||||
|
||||
class PageTextSplitter(TextSplitter):
|
||||
|
@ -61,7 +61,6 @@ const resources = {
|
||||
recall_score: 'recall_score',
|
||||
Set_a_threshold_score: 'Set a threshold score for the retrieval of similar vectors',
|
||||
recall_type: 'recall_type',
|
||||
Recall_Type: 'recall type',
|
||||
model: 'model',
|
||||
A_model_used: 'A model used to create vector representations of text or other data',
|
||||
Automatic: 'Automatic',
|
||||
@ -239,12 +238,11 @@ const resources = {
|
||||
Please_select_a_file: '请上传一个文件',
|
||||
Please_input_the_text: '请输入文本',
|
||||
Embedding: '嵌入',
|
||||
topk: '球',
|
||||
topk: 'TopK',
|
||||
the_top_k_vectors: '基于相似度得分的前 k 个向量',
|
||||
recall_score: '召回分数',
|
||||
Set_a_threshold_score: '设置相似向量检索的阈值分数',
|
||||
recall_type: '回忆类型',
|
||||
Recall_Type: '回忆类型',
|
||||
recall_type: '召回类型',
|
||||
model: '模型',
|
||||
A_model_used: '用于创建文本或其他数据的矢量表示的模型',
|
||||
Automatic: '自动切片',
|
||||
|
@ -47,7 +47,7 @@ export default function ArgumentsModal({ space, argumentsShow, setArgumentsShow
|
||||
</Form.Item>
|
||||
</Col>
|
||||
<Col span={12}>
|
||||
<Form.Item<IArguments> tooltip={t(`Recall_Type`)} rules={[{ required: true }]} label={t('recall_type')} name={['embedding', 'recall_type']}>
|
||||
<Form.Item<IArguments> tooltip={t(`recall_type`)} rules={[{ required: true }]} label={t('recall_type')} name={['embedding', 'recall_type']}>
|
||||
<Input className="mb-5 h-12" />
|
||||
</Form.Item>
|
||||
</Col>
|
||||
|
@ -1,5 +1,5 @@
|
||||
import { IChunkStrategyResponse } from '@/types/knowledge';
|
||||
import { Alert, Form, FormListFieldData, Input, InputNumber, Radio, RadioChangeEvent } from 'antd';
|
||||
import { Alert, Checkbox, Form, FormListFieldData, Input, InputNumber, Radio, RadioChangeEvent } from 'antd';
|
||||
import { useState } from 'react';
|
||||
import { useTranslation } from 'react-i18next';
|
||||
const { TextArea } = Input;
|
||||
@ -25,7 +25,7 @@ export default function StrategyForm({ strategies, docType, fileName, field }: I
|
||||
const [selectedStrategy, setSelectedStrategy] = useState<string>();
|
||||
const { t } = useTranslation();
|
||||
const DEFAULT_STRATEGY = {
|
||||
strategy: t('Automatic'),
|
||||
strategy: 'Automatic',
|
||||
name: t('Automatic'),
|
||||
desc: t('Automatic_desc'),
|
||||
};
|
||||
@ -50,17 +50,30 @@ export default function StrategyForm({ strategies, docType, fileName, field }: I
|
||||
{parameters?.map((param) => (
|
||||
<Form.Item
|
||||
key={`param_${param.param_name}`}
|
||||
label={`${param.param_name}: ${param.param_type}`}
|
||||
label={param.param_name}
|
||||
name={[field!.name, 'chunk_parameters', param.param_name]}
|
||||
rules={[{ required: true, message: t('Please_input_the_name') }]}
|
||||
initialValue={param.default_value}
|
||||
valuePropName={param.param_type === 'boolean' ? 'checked' : 'value'}
|
||||
tooltip={param.description}
|
||||
>
|
||||
{param.param_type === 'int' ? <InputNumber className="w-full" min={1} /> : <TextArea className="w-full" rows={2} maxLength={6} />}
|
||||
{renderParamByType(param.param_type)}
|
||||
</Form.Item>
|
||||
))}
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
function renderParamByType(type: string) {
|
||||
switch (type) {
|
||||
case 'int':
|
||||
return <InputNumber className="w-full" min={1} />;
|
||||
case 'string':
|
||||
return <TextArea className="w-full" rows={2} />;
|
||||
case 'boolean':
|
||||
return <Checkbox />;
|
||||
}
|
||||
}
|
||||
return (
|
||||
<>
|
||||
<Form.Item name={[field!.name, 'chunk_parameters', 'chunk_strategy']} initialValue={DEFAULT_STRATEGY.strategy}>
|
||||
|
@ -82,6 +82,7 @@ export type IStrategyParameter = {
|
||||
param_name: string;
|
||||
param_type: string;
|
||||
default_value?: string | number;
|
||||
description: string;
|
||||
};
|
||||
|
||||
export type IChunkStrategyResponse = {
|
||||
|
Loading…
Reference in New Issue
Block a user