text-splitters[patch]: Modified SpacyTextSplitter to fully keep whitespace when strip_whitespace is false (#23272)

Previously, regardless of whether strip_whitespace was set to true or
false, the SpacyTextSplitter class always used `sent.text` to get each
sentence, which drops trailing whitespace. I modified this to include a
ternary such that if strip_whitespace is false, it uses
`sent.text_with_ws` instead. I also modified the pyproject.toml to include
the spacy pipeline package and to lock the numpy version, as higher
versions break spacy.

- **Issue:** N/A
- **Dependencies:** None
This commit is contained in:
Matthew DeGenaro
2024-09-02 17:15:56 -04:00
committed by GitHub
parent 3145995ed9
commit 66828f4ecc
2 changed files with 20 additions and 1 deletions

View File

@@ -37,3 +37,16 @@ def test_spacy_text_splitter(pipeline: str) -> None:
output = splitter.split_text(text)
expected_output = [f"This is sentence one.{separator}And this is sentence two."]
assert output == expected_output
@pytest.mark.parametrize("pipeline", ["sentencizer", "en_core_web_sm"])
def test_spacy_text_splitter_strip_whitespace(pipeline: str) -> None:
    """Test that whitespace is kept when ``strip_whitespace`` is False.

    The two sentences are merged into a single chunk joined by the custom
    separator; the space that follows the first sentence in the input must
    still be present right before the separator in the output.
    """
    separator = "|||"
    text = "This is sentence one. And this is sentence two."
    # Note the space kept between "one." and the separator.
    expected_output = [f"This is sentence one. {separator}And this is sentence two."]

    splitter = SpacyTextSplitter(
        pipeline=pipeline, separator=separator, strip_whitespace=False
    )

    assert splitter.split_text(text) == expected_output