mirror of
https://github.com/csunny/DB-GPT.git
synced 2025-09-13 05:01:25 +00:00
feat(ChatKnowledge):chunk add enable_merge parameter (#1014)
Co-authored-by: Aralhi <xiaoping0501@gmail.com>
This commit is contained in:
@@ -45,6 +45,10 @@ class ChunkParameters(BaseModel):
|
||||
default="\n",
|
||||
description="chunk separator",
|
||||
)
|
||||
enable_merge: bool = Field(
|
||||
default=None,
|
||||
description="enable chunk merge by chunk_size.",
|
||||
)
|
||||
|
||||
|
||||
class ChunkManager:
|
||||
@@ -134,4 +138,5 @@ class ChunkManager:
|
||||
chunk_size=self._chunk_parameters.chunk_size,
|
||||
chunk_overlap=self._chunk_parameters.chunk_overlap,
|
||||
separator=self._chunk_parameters.separator,
|
||||
enable_merge=self._chunk_parameters.enable_merge,
|
||||
)
|
||||
|
@@ -47,8 +47,18 @@ class ChunkStrategy(Enum):
|
||||
CHUNK_BY_SIZE = (
|
||||
RecursiveCharacterTextSplitter,
|
||||
[
|
||||
{"param_name": "chunk_size", "param_type": "int", "default_value": 512},
|
||||
{"param_name": "chunk_overlap", "param_type": "int", "default_value": 50},
|
||||
{
|
||||
"param_name": "chunk_size",
|
||||
"param_type": "int",
|
||||
"default_value": 512,
|
||||
"description": "The size of the data chunks used in processing.",
|
||||
},
|
||||
{
|
||||
"param_name": "chunk_overlap",
|
||||
"param_type": "int",
|
||||
"default_value": 50,
|
||||
"description": "The amount of overlap between adjacent data chunks.",
|
||||
},
|
||||
],
|
||||
"chunk size",
|
||||
"split document by chunk size",
|
||||
@@ -56,13 +66,33 @@ class ChunkStrategy(Enum):
|
||||
CHUNK_BY_PAGE = (PageTextSplitter, [], "page", "split document by page")
|
||||
CHUNK_BY_PARAGRAPH = (
|
||||
ParagraphTextSplitter,
|
||||
[{"param_name": "separator", "param_type": "string", "default_value": "\n"}],
|
||||
[
|
||||
{
|
||||
"param_name": "separator",
|
||||
"param_type": "string",
|
||||
"default_value": "\\n",
|
||||
"description": "paragraph separator",
|
||||
}
|
||||
],
|
||||
"paragraph",
|
||||
"split document by paragraph",
|
||||
)
|
||||
CHUNK_BY_SEPARATOR = (
|
||||
SeparatorTextSplitter,
|
||||
[{"param_name": "separator", "param_type": "string", "default_value": "\n"}],
|
||||
[
|
||||
{
|
||||
"param_name": "separator",
|
||||
"param_type": "string",
|
||||
"default_value": "\\n",
|
||||
"description": "chunk separator",
|
||||
},
|
||||
{
|
||||
"param_name": "enable_merge",
|
||||
"param_type": "boolean",
|
||||
"default_value": False,
|
||||
"description": "Whether to merge according to the chunk_size after splitting by the separator.",
|
||||
},
|
||||
],
|
||||
"separator",
|
||||
"split document by separator",
|
||||
)
|
||||
@@ -80,6 +110,7 @@ class ChunkStrategy(Enum):
|
||||
self.description = description
|
||||
|
||||
# Build an instance from the first element of this enum member's value
# (in the members visible here, that element is a text-splitter class).
# Keyword arguments whose value is None are dropped before the call, so only
# options that were actually set are forwarded to the constructor.
def match(self, *args, **kwargs):
|
||||
# Discard None-valued keyword arguments; everything else passes through.
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
||||
# Call value[0] with the positional args and the filtered keyword args.
return self.value[0](*args, **kwargs)
|
||||
|
||||
|
||||
|
@@ -682,6 +682,7 @@ class SeparatorTextSplitter(CharacterTextSplitter):
|
||||
|
||||
def __init__(self, separator: str = "\n", filters: list = [], **kwargs: Any):
    """Create a new TextSplitter.

    Args:
        separator: Separator string, stored on the instance as ``_separator``.
        filters: Stored on the instance as ``_filter``.
        **kwargs: Remaining options, forwarded to the parent constructor.
            ``enable_merge`` is consumed here and is never forwarded.
    """
    # ``enable_merge`` is optional. Popping it with a default avoids the
    # KeyError the bare ``pop("enable_merge")`` raised whenever a caller
    # did not supply the option; None or any other falsy value means "off".
    self._merge = kwargs.pop("enable_merge", False) or False
    super().__init__(**kwargs)
    self._separator = separator
    # NOTE(review): ``filters`` has a mutable default; it is only stored
    # here, so the shared default is safe as long as nothing mutates it.
    self._filter = filters
|
||||
@@ -696,7 +697,9 @@ class SeparatorTextSplitter(CharacterTextSplitter):
|
||||
splits = text.split(separator)
|
||||
else:
|
||||
splits = list(text)
|
||||
return self._merge_splits(splits, separator, chunk_overlap=0, **kwargs)
|
||||
if self._merge:
|
||||
return self._merge_splits(splits, separator, chunk_overlap=0, **kwargs)
|
||||
return list(filter(None, text.split(separator)))
|
||||
|
||||
|
||||
class PageTextSplitter(TextSplitter):
|
||||
|
Reference in New Issue
Block a user