mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-03 03:59:42 +00:00
text-splitters[patch]: Modified SpacyTextSplitter to fully keep whitespace when strip_whitespace is false (#23272)
Previously, regardless of whether strip_whitespace was set to true or false, the strip text method in the SpacyTextSplitter class used `sent.text` to get the sentence. I modified this to include a ternary such that, if strip_whitespace is false, it uses `sent.text_with_ws` instead. I also modified the project.toml to include the spacy pipeline package and to lock the numpy version, as higher versions break spacy. - **Issue:** N/A - **Dependencies:** None
This commit is contained in:
@@ -37,3 +37,16 @@ def test_spacy_text_splitter(pipeline: str) -> None:
|
||||
output = splitter.split_text(text)
|
||||
expected_output = [f"This is sentence one.{separator}And this is sentence two."]
|
||||
assert output == expected_output
|
||||
|
||||
|
||||
@pytest.mark.parametrize("pipeline", ["sentencizer", "en_core_web_sm"])
def test_spacy_text_splitter_strip_whitespace(pipeline: str) -> None:
    """Check sentence splitting when ``strip_whitespace`` is disabled.

    The splitter is built with ``strip_whitespace=False``; the expected
    output keeps the space that follows the first sentence, placing it
    directly before the separator.
    """
    separator = "|||"
    sentence_splitter = SpacyTextSplitter(
        separator=separator, pipeline=pipeline, strip_whitespace=False
    )
    chunks = sentence_splitter.split_text(
        "This is sentence one. And this is sentence two."
    )
    # Note the retained space between "one." and the separator.
    assert chunks == [
        f"This is sentence one. {separator}And this is sentence two."
    ]
|
||||
|
Reference in New Issue
Block a user