feat(ChatKnowledge):chunk add enable_merge parameter (#1014)

Co-authored-by: Aralhi <xiaoping0501@gmail.com>
This commit is contained in:
Aries-ckt
2024-01-04 10:04:41 +08:00
committed by GitHub
parent fd30588e55
commit ca83443c48
26 changed files with 98 additions and 41 deletions

View File

@@ -45,6 +45,10 @@ class ChunkParameters(BaseModel):
default="\n",
description="chunk separator",
)
# Flag asking for chunks to be merged up to chunk_size after splitting.
# The default is None rather than False, so an unset value stays
# distinguishable from an explicit False when it is forwarded to a splitter.
# NOTE(review): the annotation is `bool` while the default is None — confirm
# whether Optional[bool] is the intended type.
enable_merge: bool = Field(
default=None,
description="enable chunk merge by chunk_size.",
)
class ChunkManager:
@@ -134,4 +138,5 @@ class ChunkManager:
chunk_size=self._chunk_parameters.chunk_size,
chunk_overlap=self._chunk_parameters.chunk_overlap,
separator=self._chunk_parameters.separator,
enable_merge=self._chunk_parameters.enable_merge,
)

View File

@@ -47,8 +47,18 @@ class ChunkStrategy(Enum):
CHUNK_BY_SIZE = (
RecursiveCharacterTextSplitter,
[
{"param_name": "chunk_size", "param_type": "int", "default_value": 512},
{"param_name": "chunk_overlap", "param_type": "int", "default_value": 50},
{
"param_name": "chunk_size",
"param_type": "int",
"default_value": 512,
"description": "The size of the data chunks used in processing.",
},
{
"param_name": "chunk_overlap",
"param_type": "int",
"default_value": 50,
"description": "The amount of overlap between adjacent data chunks.",
},
],
"chunk size",
"split document by chunk size",
@@ -56,13 +66,33 @@ class ChunkStrategy(Enum):
CHUNK_BY_PAGE = (PageTextSplitter, [], "page", "split document by page")
CHUNK_BY_PARAGRAPH = (
ParagraphTextSplitter,
[{"param_name": "separator", "param_type": "string", "default_value": "\n"}],
[
{
"param_name": "separator",
"param_type": "string",
"default_value": "\\n",
"description": "paragraph separator",
}
],
"paragraph",
"split document by paragraph",
)
CHUNK_BY_SEPARATOR = (
SeparatorTextSplitter,
[{"param_name": "separator", "param_type": "string", "default_value": "\n"}],
[
{
"param_name": "separator",
"param_type": "string",
"default_value": "\\n",
"description": "chunk separator",
},
{
"param_name": "enable_merge",
"param_type": "boolean",
"default_value": False,
"description": "Whether to merge according to the chunk_size after splitting by the separator.",
},
],
"separator",
"split document by separator",
)
@@ -80,6 +110,7 @@ class ChunkStrategy(Enum):
self.description = description
def match(self, *args, **kwargs):
    """Build an instance of the splitter class held by this enum member.

    Positional arguments are forwarded unchanged. Keyword arguments whose
    value is ``None`` are dropped before the call; every other keyword
    argument (including falsy ones such as ``0`` or ``""``) is forwarded.
    """
    supplied = {}
    for name, value in kwargs.items():
        if value is not None:
            supplied[name] = value
    # The first element of the member's value tuple is the splitter class.
    splitter_factory = self.value[0]
    return splitter_factory(*args, **supplied)

View File

@@ -682,6 +682,7 @@ class SeparatorTextSplitter(CharacterTextSplitter):
def __init__(self, separator: str = "\n", filters: list = [], **kwargs: Any):
    """Create a new TextSplitter.

    Args:
        separator: Separator string, stored on the instance as
            ``_separator``.
        filters: Stored on the instance as ``_filter``.
        **kwargs: Remaining keyword arguments, forwarded to the parent
            constructor once the optional ``enable_merge`` flag has been
            removed from them.
    """
    # ``enable_merge`` is optional. Without a default, ``pop`` raises
    # KeyError whenever the caller omits the key. The trailing ``or False``
    # keeps an explicit ``enable_merge=None`` meaning "do not merge".
    self._merge = kwargs.pop("enable_merge", False) or False
    super().__init__(**kwargs)
    self._separator = separator
    # NOTE: the default ``filters`` list is a single object shared by every
    # instance created without an explicit ``filters`` argument; treat
    # ``self._filter`` as read-only.
    self._filter = filters
@@ -696,7 +697,9 @@ class SeparatorTextSplitter(CharacterTextSplitter):
splits = text.split(separator)
else:
splits = list(text)
return self._merge_splits(splits, separator, chunk_overlap=0, **kwargs)
if self._merge:
return self._merge_splits(splits, separator, chunk_overlap=0, **kwargs)
return list(filter(None, text.split(separator)))
class PageTextSplitter(TextSplitter):