feat(ChatKnowledge):chunk add enable_merge parameter (#1014)

Co-authored-by: Aralhi <xiaoping0501@gmail.com>
This commit is contained in:
Aries-ckt 2024-01-04 10:04:41 +08:00 committed by GitHub
parent fd30588e55
commit ca83443c48
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
26 changed files with 98 additions and 41 deletions

View File

@ -7,7 +7,7 @@ from dbgpt.model import DefaultLLMClient
from dbgpt.rag.chunk import Chunk
from dbgpt.rag.chunk_manager import ChunkParameters
from dbgpt.rag.embedding.embedding_factory import EmbeddingFactory
from dbgpt.rag.knowledge.base import KnowledgeType
from dbgpt.rag.knowledge.base import KnowledgeType, ChunkStrategy
from dbgpt.rag.knowledge.factory import KnowledgeFactory
from dbgpt.rag.text_splitter.text_splitter import (
RecursiveCharacterTextSplitter,
@ -234,7 +234,7 @@ class KnowledgeService:
f" doc:{doc.doc_name} status is {doc.status}, can not sync"
)
chunk_parameters = sync_request.chunk_parameters
if "Automatic" == chunk_parameters.chunk_strategy:
if chunk_parameters.chunk_strategy != ChunkStrategy.CHUNK_BY_SIZE.name:
space_context = self.get_space_context(space_name)
chunk_parameters.chunk_size = (
CFG.KNOWLEDGE_CHUNK_SIZE

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1 @@
self.__BUILD_MANIFEST=function(s,c,a,e,t,n,f,d,k,b,h,i,u,j){return{__rewrites:{beforeFiles:[],afterFiles:[],fallback:[]},"/":["static/chunks/29107295-90b90cb30c825230.js",s,c,e,a,f,d,k,h,"static/chunks/861-78929b4f98dbbfd6.js","static/chunks/161-96143606b49cf4a1.js","static/chunks/pages/index-ba9785759e4fc934.js"],"/_error":["static/chunks/pages/_error-dee72aff9b2e2c12.js"],"/agent":[s,c,a,t,f,n,"static/chunks/pages/agent-a2599efbeb46e056.js"],"/chat":["static/chunks/pages/chat-47a20abbae16e858.js"],"/chat/[scene]/[id]":["static/chunks/pages/chat/[scene]/[id]-8df445f91cde33fa.js"],"/database":[s,c,e,a,t,n,k,b,"static/chunks/643-d8f53f40dd3c5b40.js","static/chunks/pages/database-d36f41810fc357a6.js"],"/knowledge":[i,s,c,a,t,f,n,k,u,b,h,"static/chunks/10-f02ccef88f814547.js","static/chunks/pages/knowledge-f3c914cac944c089.js"],"/knowledge/chunk":[s,e,t,d,n,"static/chunks/pages/knowledge/chunk-652744b9d90c26c9.js"],"/models":[i,s,c,e,a,j,b,"static/chunks/pages/models-1145859ba0e2f20a.js"],"/prompt":[s,c,e,a,j,d,u,"static/chunks/346-b0aea1c99abd6f1e.js","static/chunks/607-2dedaf19149304c0.js","static/chunks/pages/prompt-fca5ed813d5018b1.js"],sortedPages:["/","/_app","/_error","/agent","/chat","/chat/[scene]/[id]","/database","/knowledge","/knowledge/chunk","/models","/prompt"]}}("static/chunks/113-15fc0b8bd2b5b9a1.js","static/chunks/17-d6c52cecd9ecc451.js","static/chunks/479-33b3ebe9be79a971.js","static/chunks/9-bb2c54d5c06ba4bf.js","static/chunks/442-197e6cbc1e54109a.js","static/chunks/813-cce9482e33f2430c.js","static/chunks/553-a89ad624ca0f1ffa.js","static/chunks/810-84757da754c6f3fc.js","static/chunks/411-b5d3e7f64bee2335.js","static/chunks/928-74244889bd7f2699.js","static/chunks/234-42f62dc360b2d9e4.js","static/chunks/75fc9c18-a784766a129ec5fb.js","static/chunks/45-9ff739c09925ea35.js","static/chunks/947-5980a3ff49069ddd.js"),self.__BUILD_MANIFEST_CB&&self.__BUILD_MANIFEST_CB();

View File

@ -1 +0,0 @@
self.__BUILD_MANIFEST=function(s,c,a,e,t,n,d,f,b,k,h,i,u,j){return{__rewrites:{beforeFiles:[],afterFiles:[],fallback:[]},"/":["static/chunks/29107295-90b90cb30c825230.js",s,c,e,a,d,f,b,h,"static/chunks/861-78929b4f98dbbfd6.js","static/chunks/161-96143606b49cf4a1.js","static/chunks/pages/index-ba9785759e4fc934.js"],"/_error":["static/chunks/pages/_error-dee72aff9b2e2c12.js"],"/agent":[s,c,a,t,d,n,"static/chunks/pages/agent-a2599efbeb46e056.js"],"/chat":["static/chunks/pages/chat-47a20abbae16e858.js"],"/chat/[scene]/[id]":["static/chunks/pages/chat/[scene]/[id]-8df445f91cde33fa.js"],"/database":[s,c,e,a,t,n,b,k,"static/chunks/643-d8f53f40dd3c5b40.js","static/chunks/pages/database-d36f41810fc357a6.js"],"/knowledge":[i,s,c,a,t,d,n,b,u,k,h,"static/chunks/450-bd680f0e37e9b4b9.js","static/chunks/pages/knowledge-b9300e7addf1931f.js"],"/knowledge/chunk":[s,e,t,f,n,"static/chunks/pages/knowledge/chunk-652744b9d90c26c9.js"],"/models":[i,s,c,e,a,j,k,"static/chunks/pages/models-1145859ba0e2f20a.js"],"/prompt":[s,c,e,a,j,f,u,"static/chunks/346-b0aea1c99abd6f1e.js","static/chunks/607-2dedaf19149304c0.js","static/chunks/pages/prompt-fca5ed813d5018b1.js"],sortedPages:["/","/_app","/_error","/agent","/chat","/chat/[scene]/[id]","/database","/knowledge","/knowledge/chunk","/models","/prompt"]}}("static/chunks/113-15fc0b8bd2b5b9a1.js","static/chunks/17-d6c52cecd9ecc451.js","static/chunks/479-33b3ebe9be79a971.js","static/chunks/9-bb2c54d5c06ba4bf.js","static/chunks/442-197e6cbc1e54109a.js","static/chunks/813-cce9482e33f2430c.js","static/chunks/553-a89ad624ca0f1ffa.js","static/chunks/810-84757da754c6f3fc.js","static/chunks/411-b5d3e7f64bee2335.js","static/chunks/928-74244889bd7f2699.js","static/chunks/234-42f62dc360b2d9e4.js","static/chunks/75fc9c18-a784766a129ec5fb.js","static/chunks/45-9ff739c09925ea35.js","static/chunks/947-5980a3ff49069ddd.js"),self.__BUILD_MANIFEST_CB&&self.__BUILD_MANIFEST_CB();

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -45,6 +45,10 @@ class ChunkParameters(BaseModel):
default="\n",
description="chunk separator",
)
enable_merge: bool = Field(
default=None,
description="enable chunk merge by chunk_size.",
)
class ChunkManager:
@ -134,4 +138,5 @@ class ChunkManager:
chunk_size=self._chunk_parameters.chunk_size,
chunk_overlap=self._chunk_parameters.chunk_overlap,
separator=self._chunk_parameters.separator,
enable_merge=self._chunk_parameters.enable_merge,
)

View File

@ -47,8 +47,18 @@ class ChunkStrategy(Enum):
CHUNK_BY_SIZE = (
RecursiveCharacterTextSplitter,
[
{"param_name": "chunk_size", "param_type": "int", "default_value": 512},
{"param_name": "chunk_overlap", "param_type": "int", "default_value": 50},
{
"param_name": "chunk_size",
"param_type": "int",
"default_value": 512,
"description": "The size of the data chunks used in processing.",
},
{
"param_name": "chunk_overlap",
"param_type": "int",
"default_value": 50,
"description": "The amount of overlap between adjacent data chunks.",
},
],
"chunk size",
"split document by chunk size",
@ -56,13 +66,33 @@ class ChunkStrategy(Enum):
CHUNK_BY_PAGE = (PageTextSplitter, [], "page", "split document by page")
CHUNK_BY_PARAGRAPH = (
ParagraphTextSplitter,
[{"param_name": "separator", "param_type": "string", "default_value": "\n"}],
[
{
"param_name": "separator",
"param_type": "string",
"default_value": "\\n",
"description": "paragraph separator",
}
],
"paragraph",
"split document by paragraph",
)
CHUNK_BY_SEPARATOR = (
SeparatorTextSplitter,
[{"param_name": "separator", "param_type": "string", "default_value": "\n"}],
[
{
"param_name": "separator",
"param_type": "string",
"default_value": "\\n",
"description": "chunk separator",
},
{
"param_name": "enable_merge",
"param_type": "boolean",
"default_value": False,
"description": "Whether to merge according to the chunk_size after splitting by the separator.",
},
],
"separator",
"split document by separator",
)
@ -80,6 +110,7 @@ class ChunkStrategy(Enum):
self.description = description
def match(self, *args, **kwargs):
    """Build this strategy's splitter, leaving out keyword options that are unset.

    Keyword arguments whose value is ``None`` are dropped rather than
    forwarded; positional arguments are passed through unchanged.

    Returns:
        The result of calling ``self.value[0]`` with the remaining arguments.
    """
    provided = {}
    for option, setting in kwargs.items():
        if setting is not None:
            provided[option] = setting
    factory = self.value[0]
    return factory(*args, **provided)

View File

@ -682,6 +682,7 @@ class SeparatorTextSplitter(CharacterTextSplitter):
def __init__(self, separator: str = "\n", filters: list = [], **kwargs: Any):
    """Create a new TextSplitter.

    Args:
        separator: Separator string, stored on the instance as ``_separator``.
        filters: Stored on the instance as ``_filter``.
        **kwargs: Remaining options, forwarded to the parent constructor.
            An optional ``enable_merge`` entry is consumed here and is not
            forwarded.
    """
    # ``enable_merge`` is optional: callers may leave it out entirely (for
    # example when ``None``-valued options are stripped before construction),
    # so pop with a default instead of raising KeyError. ``or False`` also
    # maps an explicit ``None`` to ``False``.
    self._merge = kwargs.pop("enable_merge", None) or False
    super().__init__(**kwargs)
    self._separator = separator
    self._filter = filters
@ -696,7 +697,9 @@ class SeparatorTextSplitter(CharacterTextSplitter):
splits = text.split(separator)
else:
splits = list(text)
return self._merge_splits(splits, separator, chunk_overlap=0, **kwargs)
if self._merge:
return self._merge_splits(splits, separator, chunk_overlap=0, **kwargs)
return list(filter(None, text.split(separator)))
class PageTextSplitter(TextSplitter):

View File

@ -61,7 +61,6 @@ const resources = {
recall_score: 'recall_score',
Set_a_threshold_score: 'Set a threshold score for the retrieval of similar vectors',
recall_type: 'recall_type',
Recall_Type: 'recall type',
model: 'model',
A_model_used: 'A model used to create vector representations of text or other data',
Automatic: 'Automatic',
@ -239,12 +238,11 @@ const resources = {
Please_select_a_file: '请上传一个文件',
Please_input_the_text: '请输入文本',
Embedding: '嵌入',
topk: '',
topk: 'TopK',
the_top_k_vectors: '基于相似度得分的前 k 个向量',
recall_score: '召回分数',
Set_a_threshold_score: '设置相似向量检索的阈值分数',
recall_type: '回忆类型',
Recall_Type: '回忆类型',
recall_type: '召回类型',
model: '模型',
A_model_used: '用于创建文本或其他数据的矢量表示的模型',
Automatic: '自动切片',

View File

@ -47,7 +47,7 @@ export default function ArgumentsModal({ space, argumentsShow, setArgumentsShow
</Form.Item>
</Col>
<Col span={12}>
<Form.Item<IArguments> tooltip={t(`Recall_Type`)} rules={[{ required: true }]} label={t('recall_type')} name={['embedding', 'recall_type']}>
<Form.Item<IArguments> tooltip={t(`recall_type`)} rules={[{ required: true }]} label={t('recall_type')} name={['embedding', 'recall_type']}>
<Input className="mb-5 h-12" />
</Form.Item>
</Col>

View File

@ -1,5 +1,5 @@
import { IChunkStrategyResponse } from '@/types/knowledge';
import { Alert, Form, FormListFieldData, Input, InputNumber, Radio, RadioChangeEvent } from 'antd';
import { Alert, Checkbox, Form, FormListFieldData, Input, InputNumber, Radio, RadioChangeEvent } from 'antd';
import { useState } from 'react';
import { useTranslation } from 'react-i18next';
const { TextArea } = Input;
@ -25,7 +25,7 @@ export default function StrategyForm({ strategies, docType, fileName, field }: I
const [selectedStrategy, setSelectedStrategy] = useState<string>();
const { t } = useTranslation();
const DEFAULT_STRATEGY = {
strategy: t('Automatic'),
strategy: 'Automatic',
name: t('Automatic'),
desc: t('Automatic_desc'),
};
@ -50,17 +50,30 @@ export default function StrategyForm({ strategies, docType, fileName, field }: I
{parameters?.map((param) => (
<Form.Item
key={`param_${param.param_name}`}
label={`${param.param_name}: ${param.param_type}`}
label={param.param_name}
name={[field!.name, 'chunk_parameters', param.param_name]}
rules={[{ required: true, message: t('Please_input_the_name') }]}
initialValue={param.default_value}
valuePropName={param.param_type === 'boolean' ? 'checked' : 'value'}
tooltip={param.description}
>
{param.param_type === 'int' ? <InputNumber className="w-full" min={1} /> : <TextArea className="w-full" rows={2} maxLength={6} />}
{renderParamByType(param.param_type)}
</Form.Item>
))}
</div>
);
}
/**
 * Pick the input control for a strategy parameter based on its declared type.
 *
 * @param type - The parameter's `param_type` value.
 * @returns A number input for `'int'`, a checkbox for `'boolean'`, and a
 *   text area for `'string'` or any type not listed here.
 */
function renderParamByType(type: string) {
  switch (type) {
    case 'int':
      return <InputNumber className="w-full" min={1} />;
    case 'boolean':
      return <Checkbox />;
    case 'string':
    default:
      // Unknown types fall back to free text so the field always has a control.
      return <TextArea className="w-full" rows={2} />;
  }
}
return (
<>
<Form.Item name={[field!.name, 'chunk_parameters', 'chunk_strategy']} initialValue={DEFAULT_STRATEGY.strategy}>

View File

@ -82,6 +82,7 @@ export type IStrategyParameter = {
param_name: string;
param_type: string;
default_value?: string | number;
description: string;
};
export type IChunkStrategyResponse = {