text-splitters[patch]: Extend TextSplitter:keep_separator functionality (#21130)

**Description:** Added extra functionality to the `CharacterTextSplitter` and
`TextSplitter` classes.
The user can now choose whether the separator is appended to the end of the
previous chunk (`keep_separator='end'`) or prepended to the start of the next
chunk (`keep_separator='start'`).
Previously, the separator was always prepended to the next chunk; this remains
the behavior for `keep_separator=True`.
  
**Issue:** Fixes #20908

---------

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
This commit is contained in:
Christos Boulmpasakos
2024-05-22 23:17:45 +03:00
committed by GitHub
parent b859765752
commit c3bcfad66d
3 changed files with 59 additions and 6 deletions

View File

@@ -1,7 +1,7 @@
from __future__ import annotations
import re
from typing import Any, List, Optional
from typing import Any, List, Literal, Optional, Union
from langchain_text_splitters.base import Language, TextSplitter
@@ -29,17 +29,25 @@ class CharacterTextSplitter(TextSplitter):
def _split_text_with_regex(
text: str, separator: str, keep_separator: bool
text: str, separator: str, keep_separator: Union[bool, Literal["start", "end"]]
) -> List[str]:
# Now that we have the separator, split the text
if separator:
if keep_separator:
# The parentheses in the pattern keep the delimiters in the result.
_splits = re.split(f"({separator})", text)
splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]
splits = (
([_splits[i] + _splits[i + 1] for i in range(0, len(_splits) - 1, 2)])
if keep_separator == "end"
else ([_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)])
)
if len(_splits) % 2 == 0:
splits += _splits[-1:]
splits = [_splits[0]] + splits
splits = (
(splits + [_splits[-1]])
if keep_separator == "end"
else ([_splits[0]] + splits)
)
else:
splits = re.split(separator, text)
else: