diff --git a/libs/text-splitters/langchain_text_splitters/spacy.py b/libs/text-splitters/langchain_text_splitters/spacy.py index d83aea9a001..447a3e42960 100644 --- a/libs/text-splitters/langchain_text_splitters/spacy.py +++ b/libs/text-splitters/langchain_text_splitters/spacy.py @@ -20,6 +20,8 @@ class SpacyTextSplitter(TextSplitter): separator: str = "\n\n", pipeline: str = "en_core_web_sm", max_length: int = 1_000_000, + *, + strip_whitespace: bool = True, **kwargs: Any, ) -> None: """Initialize the spacy text splitter.""" @@ -28,10 +30,14 @@ class SpacyTextSplitter(TextSplitter): pipeline, max_length=max_length ) self._separator = separator + self._strip_whitespace = strip_whitespace def split_text(self, text: str) -> List[str]: """Split incoming text and return chunks.""" - splits = (s.text for s in self._tokenizer(text).sents) + splits = ( + s.text if self._strip_whitespace else s.text_with_ws + for s in self._tokenizer(text).sents + ) return self._merge_splits(splits, self._separator) diff --git a/libs/text-splitters/tests/integration_tests/test_nlp_text_splitters.py b/libs/text-splitters/tests/integration_tests/test_nlp_text_splitters.py index 402d01655d9..b19e8bf86ec 100644 --- a/libs/text-splitters/tests/integration_tests/test_nlp_text_splitters.py +++ b/libs/text-splitters/tests/integration_tests/test_nlp_text_splitters.py @@ -37,3 +37,16 @@ def test_spacy_text_splitter(pipeline: str) -> None: output = splitter.split_text(text) expected_output = [f"This is sentence one.{separator}And this is sentence two."] assert output == expected_output + + +@pytest.mark.parametrize("pipeline", ["sentencizer", "en_core_web_sm"]) +def test_spacy_text_splitter_strip_whitespace(pipeline: str) -> None: + """Test that strip_whitespace=False preserves whitespace after sentences.""" + text = "This is sentence one. And this is sentence two." 
+ separator = "|||" + splitter = SpacyTextSplitter( + separator=separator, pipeline=pipeline, strip_whitespace=False + ) + output = splitter.split_text(text) + expected_output = [f"This is sentence one. {separator}And this is sentence two."] + assert output == expected_output