feat(ChatKnowledge):chunk add enable_merge parameter (#1014)

Co-authored-by: Aralhi <xiaoping0501@gmail.com>
This commit is contained in:
Aries-ckt 2024-01-04 10:04:41 +08:00 committed by GitHub
parent fd30588e55
commit ca83443c48
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
26 changed files with 98 additions and 41 deletions

View File

@ -7,7 +7,7 @@ from dbgpt.model import DefaultLLMClient
from dbgpt.rag.chunk import Chunk from dbgpt.rag.chunk import Chunk
from dbgpt.rag.chunk_manager import ChunkParameters from dbgpt.rag.chunk_manager import ChunkParameters
from dbgpt.rag.embedding.embedding_factory import EmbeddingFactory from dbgpt.rag.embedding.embedding_factory import EmbeddingFactory
from dbgpt.rag.knowledge.base import KnowledgeType from dbgpt.rag.knowledge.base import KnowledgeType, ChunkStrategy
from dbgpt.rag.knowledge.factory import KnowledgeFactory from dbgpt.rag.knowledge.factory import KnowledgeFactory
from dbgpt.rag.text_splitter.text_splitter import ( from dbgpt.rag.text_splitter.text_splitter import (
RecursiveCharacterTextSplitter, RecursiveCharacterTextSplitter,
@ -234,7 +234,7 @@ class KnowledgeService:
f" doc:{doc.doc_name} status is {doc.status}, can not sync" f" doc:{doc.doc_name} status is {doc.status}, can not sync"
) )
chunk_parameters = sync_request.chunk_parameters chunk_parameters = sync_request.chunk_parameters
if "Automatic" == chunk_parameters.chunk_strategy: if chunk_parameters.chunk_strategy != ChunkStrategy.CHUNK_BY_SIZE.name:
space_context = self.get_space_context(space_name) space_context = self.get_space_context(space_name)
chunk_parameters.chunk_size = ( chunk_parameters.chunk_size = (
CFG.KNOWLEDGE_CHUNK_SIZE CFG.KNOWLEDGE_CHUNK_SIZE

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1 @@
self.__BUILD_MANIFEST=function(s,c,a,e,t,n,f,d,k,b,h,i,u,j){return{__rewrites:{beforeFiles:[],afterFiles:[],fallback:[]},"/":["static/chunks/29107295-90b90cb30c825230.js",s,c,e,a,f,d,k,h,"static/chunks/861-78929b4f98dbbfd6.js","static/chunks/161-96143606b49cf4a1.js","static/chunks/pages/index-ba9785759e4fc934.js"],"/_error":["static/chunks/pages/_error-dee72aff9b2e2c12.js"],"/agent":[s,c,a,t,f,n,"static/chunks/pages/agent-a2599efbeb46e056.js"],"/chat":["static/chunks/pages/chat-47a20abbae16e858.js"],"/chat/[scene]/[id]":["static/chunks/pages/chat/[scene]/[id]-8df445f91cde33fa.js"],"/database":[s,c,e,a,t,n,k,b,"static/chunks/643-d8f53f40dd3c5b40.js","static/chunks/pages/database-d36f41810fc357a6.js"],"/knowledge":[i,s,c,a,t,f,n,k,u,b,h,"static/chunks/10-f02ccef88f814547.js","static/chunks/pages/knowledge-f3c914cac944c089.js"],"/knowledge/chunk":[s,e,t,d,n,"static/chunks/pages/knowledge/chunk-652744b9d90c26c9.js"],"/models":[i,s,c,e,a,j,b,"static/chunks/pages/models-1145859ba0e2f20a.js"],"/prompt":[s,c,e,a,j,d,u,"static/chunks/346-b0aea1c99abd6f1e.js","static/chunks/607-2dedaf19149304c0.js","static/chunks/pages/prompt-fca5ed813d5018b1.js"],sortedPages:["/","/_app","/_error","/agent","/chat","/chat/[scene]/[id]","/database","/knowledge","/knowledge/chunk","/models","/prompt"]}}("static/chunks/113-15fc0b8bd2b5b9a1.js","static/chunks/17-d6c52cecd9ecc451.js","static/chunks/479-33b3ebe9be79a971.js","static/chunks/9-bb2c54d5c06ba4bf.js","static/chunks/442-197e6cbc1e54109a.js","static/chunks/813-cce9482e33f2430c.js","static/chunks/553-a89ad624ca0f1ffa.js","static/chunks/810-84757da754c6f3fc.js","static/chunks/411-b5d3e7f64bee2335.js","static/chunks/928-74244889bd7f2699.js","static/chunks/234-42f62dc360b2d9e4.js","static/chunks/75fc9c18-a784766a129ec5fb.js","static/chunks/45-9ff739c09925ea35.js","static/chunks/947-5980a3ff49069ddd.js"),self.__BUILD_MANIFEST_CB&&self.__BUILD_MANIFEST_CB();

View File

@ -1 +0,0 @@
self.__BUILD_MANIFEST=function(s,c,a,e,t,n,d,f,b,k,h,i,u,j){return{__rewrites:{beforeFiles:[],afterFiles:[],fallback:[]},"/":["static/chunks/29107295-90b90cb30c825230.js",s,c,e,a,d,f,b,h,"static/chunks/861-78929b4f98dbbfd6.js","static/chunks/161-96143606b49cf4a1.js","static/chunks/pages/index-ba9785759e4fc934.js"],"/_error":["static/chunks/pages/_error-dee72aff9b2e2c12.js"],"/agent":[s,c,a,t,d,n,"static/chunks/pages/agent-a2599efbeb46e056.js"],"/chat":["static/chunks/pages/chat-47a20abbae16e858.js"],"/chat/[scene]/[id]":["static/chunks/pages/chat/[scene]/[id]-8df445f91cde33fa.js"],"/database":[s,c,e,a,t,n,b,k,"static/chunks/643-d8f53f40dd3c5b40.js","static/chunks/pages/database-d36f41810fc357a6.js"],"/knowledge":[i,s,c,a,t,d,n,b,u,k,h,"static/chunks/450-bd680f0e37e9b4b9.js","static/chunks/pages/knowledge-b9300e7addf1931f.js"],"/knowledge/chunk":[s,e,t,f,n,"static/chunks/pages/knowledge/chunk-652744b9d90c26c9.js"],"/models":[i,s,c,e,a,j,k,"static/chunks/pages/models-1145859ba0e2f20a.js"],"/prompt":[s,c,e,a,j,f,u,"static/chunks/346-b0aea1c99abd6f1e.js","static/chunks/607-2dedaf19149304c0.js","static/chunks/pages/prompt-fca5ed813d5018b1.js"],sortedPages:["/","/_app","/_error","/agent","/chat","/chat/[scene]/[id]","/database","/knowledge","/knowledge/chunk","/models","/prompt"]}}("static/chunks/113-15fc0b8bd2b5b9a1.js","static/chunks/17-d6c52cecd9ecc451.js","static/chunks/479-33b3ebe9be79a971.js","static/chunks/9-bb2c54d5c06ba4bf.js","static/chunks/442-197e6cbc1e54109a.js","static/chunks/813-cce9482e33f2430c.js","static/chunks/553-a89ad624ca0f1ffa.js","static/chunks/810-84757da754c6f3fc.js","static/chunks/411-b5d3e7f64bee2335.js","static/chunks/928-74244889bd7f2699.js","static/chunks/234-42f62dc360b2d9e4.js","static/chunks/75fc9c18-a784766a129ec5fb.js","static/chunks/45-9ff739c09925ea35.js","static/chunks/947-5980a3ff49069ddd.js"),self.__BUILD_MANIFEST_CB&&self.__BUILD_MANIFEST_CB();

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -45,6 +45,10 @@ class ChunkParameters(BaseModel):
default="\n", default="\n",
description="chunk separator", description="chunk separator",
) )
enable_merge: bool = Field(
default=None,
description="enable chunk merge by chunk_size.",
)
class ChunkManager: class ChunkManager:
@ -134,4 +138,5 @@ class ChunkManager:
chunk_size=self._chunk_parameters.chunk_size, chunk_size=self._chunk_parameters.chunk_size,
chunk_overlap=self._chunk_parameters.chunk_overlap, chunk_overlap=self._chunk_parameters.chunk_overlap,
separator=self._chunk_parameters.separator, separator=self._chunk_parameters.separator,
enable_merge=self._chunk_parameters.enable_merge,
) )

View File

@ -47,8 +47,18 @@ class ChunkStrategy(Enum):
CHUNK_BY_SIZE = ( CHUNK_BY_SIZE = (
RecursiveCharacterTextSplitter, RecursiveCharacterTextSplitter,
[ [
{"param_name": "chunk_size", "param_type": "int", "default_value": 512}, {
{"param_name": "chunk_overlap", "param_type": "int", "default_value": 50}, "param_name": "chunk_size",
"param_type": "int",
"default_value": 512,
"description": "The size of the data chunks used in processing.",
},
{
"param_name": "chunk_overlap",
"param_type": "int",
"default_value": 50,
"description": "The amount of overlap between adjacent data chunks.",
},
], ],
"chunk size", "chunk size",
"split document by chunk size", "split document by chunk size",
@ -56,13 +66,33 @@ class ChunkStrategy(Enum):
CHUNK_BY_PAGE = (PageTextSplitter, [], "page", "split document by page") CHUNK_BY_PAGE = (PageTextSplitter, [], "page", "split document by page")
CHUNK_BY_PARAGRAPH = ( CHUNK_BY_PARAGRAPH = (
ParagraphTextSplitter, ParagraphTextSplitter,
[{"param_name": "separator", "param_type": "string", "default_value": "\n"}], [
{
"param_name": "separator",
"param_type": "string",
"default_value": "\\n",
"description": "paragraph separator",
}
],
"paragraph", "paragraph",
"split document by paragraph", "split document by paragraph",
) )
CHUNK_BY_SEPARATOR = ( CHUNK_BY_SEPARATOR = (
SeparatorTextSplitter, SeparatorTextSplitter,
[{"param_name": "separator", "param_type": "string", "default_value": "\n"}], [
{
"param_name": "separator",
"param_type": "string",
"default_value": "\\n",
"description": "chunk separator",
},
{
"param_name": "enable_merge",
"param_type": "boolean",
"default_value": False,
"description": "Whether to merge according to the chunk_size after splitting by the separator.",
},
],
"separator", "separator",
"split document by separator", "split document by separator",
) )
@ -80,6 +110,7 @@ class ChunkStrategy(Enum):
self.description = description self.description = description
def match(self, *args, **kwargs): def match(self, *args, **kwargs):
kwargs = {k: v for k, v in kwargs.items() if v is not None}
return self.value[0](*args, **kwargs) return self.value[0](*args, **kwargs)

View File

@ -682,6 +682,7 @@ class SeparatorTextSplitter(CharacterTextSplitter):
def __init__(self, separator: str = "\n", filters: list = [], **kwargs: Any):
    """Create a new TextSplitter.

    Args:
        separator: string the text is split on.
        filters: stored on the instance as ``_filter``.
        **kwargs: forwarded to the parent splitter's ``__init__``. An optional
            ``enable_merge`` entry is consumed here and is NOT forwarded.
    """
    # `enable_merge` is optional: callers may omit it entirely (e.g. when
    # unset/None options are dropped before construction). Popping with a
    # default avoids the KeyError a bare `pop("enable_merge")` raises in that
    # case; a missing, None, or falsy value all mean "do not merge".
    self._merge = kwargs.pop("enable_merge", None) or False
    super().__init__(**kwargs)
    self._separator = separator
    self._filter = filters
@ -696,7 +697,9 @@ class SeparatorTextSplitter(CharacterTextSplitter):
splits = text.split(separator) splits = text.split(separator)
else: else:
splits = list(text) splits = list(text)
return self._merge_splits(splits, separator, chunk_overlap=0, **kwargs) if self._merge:
return self._merge_splits(splits, separator, chunk_overlap=0, **kwargs)
return list(filter(None, text.split(separator)))
class PageTextSplitter(TextSplitter): class PageTextSplitter(TextSplitter):

View File

@ -61,7 +61,6 @@ const resources = {
recall_score: 'recall_score', recall_score: 'recall_score',
Set_a_threshold_score: 'Set a threshold score for the retrieval of similar vectors', Set_a_threshold_score: 'Set a threshold score for the retrieval of similar vectors',
recall_type: 'recall_type', recall_type: 'recall_type',
Recall_Type: 'recall type',
model: 'model', model: 'model',
A_model_used: 'A model used to create vector representations of text or other data', A_model_used: 'A model used to create vector representations of text or other data',
Automatic: 'Automatic', Automatic: 'Automatic',
@ -239,12 +238,11 @@ const resources = {
Please_select_a_file: '请上传一个文件', Please_select_a_file: '请上传一个文件',
Please_input_the_text: '请输入文本', Please_input_the_text: '请输入文本',
Embedding: '嵌入', Embedding: '嵌入',
topk: '', topk: 'TopK',
the_top_k_vectors: '基于相似度得分的前 k 个向量', the_top_k_vectors: '基于相似度得分的前 k 个向量',
recall_score: '召回分数', recall_score: '召回分数',
Set_a_threshold_score: '设置相似向量检索的阈值分数', Set_a_threshold_score: '设置相似向量检索的阈值分数',
recall_type: '回忆类型', recall_type: '召回类型',
Recall_Type: '回忆类型',
model: '模型', model: '模型',
A_model_used: '用于创建文本或其他数据的矢量表示的模型', A_model_used: '用于创建文本或其他数据的矢量表示的模型',
Automatic: '自动切片', Automatic: '自动切片',

View File

@ -47,7 +47,7 @@ export default function ArgumentsModal({ space, argumentsShow, setArgumentsShow
</Form.Item> </Form.Item>
</Col> </Col>
<Col span={12}> <Col span={12}>
<Form.Item<IArguments> tooltip={t(`Recall_Type`)} rules={[{ required: true }]} label={t('recall_type')} name={['embedding', 'recall_type']}> <Form.Item<IArguments> tooltip={t(`recall_type`)} rules={[{ required: true }]} label={t('recall_type')} name={['embedding', 'recall_type']}>
<Input className="mb-5 h-12" /> <Input className="mb-5 h-12" />
</Form.Item> </Form.Item>
</Col> </Col>

View File

@ -1,5 +1,5 @@
import { IChunkStrategyResponse } from '@/types/knowledge'; import { IChunkStrategyResponse } from '@/types/knowledge';
import { Alert, Form, FormListFieldData, Input, InputNumber, Radio, RadioChangeEvent } from 'antd'; import { Alert, Checkbox, Form, FormListFieldData, Input, InputNumber, Radio, RadioChangeEvent } from 'antd';
import { useState } from 'react'; import { useState } from 'react';
import { useTranslation } from 'react-i18next'; import { useTranslation } from 'react-i18next';
const { TextArea } = Input; const { TextArea } = Input;
@ -25,7 +25,7 @@ export default function StrategyForm({ strategies, docType, fileName, field }: I
const [selectedStrategy, setSelectedStrategy] = useState<string>(); const [selectedStrategy, setSelectedStrategy] = useState<string>();
const { t } = useTranslation(); const { t } = useTranslation();
const DEFAULT_STRATEGY = { const DEFAULT_STRATEGY = {
strategy: t('Automatic'), strategy: 'Automatic',
name: t('Automatic'), name: t('Automatic'),
desc: t('Automatic_desc'), desc: t('Automatic_desc'),
}; };
@ -50,17 +50,30 @@ export default function StrategyForm({ strategies, docType, fileName, field }: I
{parameters?.map((param) => ( {parameters?.map((param) => (
<Form.Item <Form.Item
key={`param_${param.param_name}`} key={`param_${param.param_name}`}
label={`${param.param_name}: ${param.param_type}`} label={param.param_name}
name={[field!.name, 'chunk_parameters', param.param_name]} name={[field!.name, 'chunk_parameters', param.param_name]}
rules={[{ required: true, message: t('Please_input_the_name') }]} rules={[{ required: true, message: t('Please_input_the_name') }]}
initialValue={param.default_value} initialValue={param.default_value}
valuePropName={param.param_type === 'boolean' ? 'checked' : 'value'}
tooltip={param.description}
> >
{param.param_type === 'int' ? <InputNumber className="w-full" min={1} /> : <TextArea className="w-full" rows={2} maxLength={6} />} {renderParamByType(param.param_type)}
</Form.Item> </Form.Item>
))} ))}
</div> </div>
); );
} }
/**
 * Chooses the form control used to edit a chunk-strategy parameter.
 *
 * @param type - the parameter's declared `param_type` (`'int'`, `'string'`, `'boolean'`).
 * @returns a numeric input for `'int'`, a checkbox for `'boolean'`, and a
 *   text area for `'string'` or any type this function does not recognise.
 */
function renderParamByType(type: string) {
  switch (type) {
    case 'int':
      return <InputNumber className="w-full" min={1} />;
    case 'boolean':
      return <Checkbox />;
    case 'string':
    default:
      // Fall back to free text for unknown types so the field always has an
      // editable control instead of rendering nothing.
      return <TextArea className="w-full" rows={2} />;
  }
}
return ( return (
<> <>
<Form.Item name={[field!.name, 'chunk_parameters', 'chunk_strategy']} initialValue={DEFAULT_STRATEGY.strategy}> <Form.Item name={[field!.name, 'chunk_parameters', 'chunk_strategy']} initialValue={DEFAULT_STRATEGY.strategy}>

View File

@ -82,6 +82,7 @@ export type IStrategyParameter = {
param_name: string; param_name: string;
param_type: string; param_type: string;
default_value?: string | number; default_value?: string | number;
description: string;
}; };
export type IChunkStrategyResponse = { export type IChunkStrategyResponse = {