Mirror of https://github.com/hwchase17/langchain.git, synced 2025-11-04 02:03:32 +00:00
As in #23188, this enables `pydocstyle` linting in the `text-splitters` package to enforce Google-style docstrings. Each resulting linting error was handled case by case: resolved, suppressed, or ignored, and missing docstrings were added. Fixes one of the checklist items from #25154, similar to #25939 in the `core` package. Ran `make format`, `make lint` and `make test` from the root of the `text-splitters` package to ensure no issues remain.

---------

Co-authored-by: Erick Friis <erick@langchain.dev>
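For reference, a minimal sketch of the Google docstring convention that `pydocstyle`'s D rules check for; the function below is purely illustrative and not part of the package:

def join_chunks(chunks: list[str], separator: str) -> str:
    """Join text chunks into a single string.

    Args:
        chunks: The individual text chunks to merge.
        separator: String inserted between chunks.

    Returns:
        The merged text.
    """
    return separator.join(chunks)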
		
			
				
	
	
		
from __future__ import annotations

from typing import Any, List

from langchain_text_splitters.base import TextSplitter


class SpacyTextSplitter(TextSplitter):
    """Splitting text using Spacy package.

    By default, Spacy's `en_core_web_sm` model is used, with a default
    max_length of 1,000,000 (the maximum number of characters the model
    accepts; it can be increased for large files). For faster, but
    potentially less accurate, splitting, you can use
    `pipeline='sentencizer'`.
    """

    def __init__(
        self,
        separator: str = "\n\n",
        pipeline: str = "en_core_web_sm",
        max_length: int = 1_000_000,
        *,
        strip_whitespace: bool = True,
        **kwargs: Any,
    ) -> None:
        """Initialize the spacy text splitter."""
        super().__init__(**kwargs)
        self._tokenizer = _make_spacy_pipeline_for_splitting(
            pipeline, max_length=max_length
        )
        self._separator = separator
        self._strip_whitespace = strip_whitespace

    def split_text(self, text: str) -> List[str]:
        """Split incoming text and return chunks."""
        splits = (
            s.text if self._strip_whitespace else s.text_with_ws
            for s in self._tokenizer(text).sents
        )
        return self._merge_splits(splits, self._separator)


def _make_spacy_pipeline_for_splitting(
    pipeline: str, *, max_length: int = 1_000_000
) -> Any:  # avoid importing spacy
    try:
        import spacy
    except ImportError:
        raise ImportError(
            "Spacy is not installed, please install it with `pip install spacy`."
        )
    if pipeline == "sentencizer":
        from spacy.lang.en import English

        # A blank English pipeline with only the rule-based sentencizer:
        # faster than a full model, but potentially less accurate.
        sentencizer: Any = English()
        sentencizer.add_pipe("sentencizer")
    else:
        # Load the full model, dropping components not needed for sentence
        # segmentation, and raise the character limit for large documents.
        sentencizer = spacy.load(pipeline, exclude=["ner", "tagger"])
        sentencizer.max_length = max_length
    return sentencizer
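A minimal usage sketch, assuming `langchain-text-splitters` and `spacy` are installed and that `SpacyTextSplitter` is importable from the package root; the sample text and chunk sizes are illustrative:

from langchain_text_splitters import SpacyTextSplitter

# Default pipeline loads en_core_web_sm, so the model must be downloaded first
# (python -m spacy download en_core_web_sm).
splitter = SpacyTextSplitter(chunk_size=200, chunk_overlap=0)
chunks = splitter.split_text(
    "Spacy segments text into sentences. Each sentence becomes a split. "
    "Splits are then merged into chunks of at most chunk_size characters."
)
print(chunks)

# Faster, rule-based alternative that needs no model download,
# at the cost of potentially less accurate sentence boundaries.
fast_splitter = SpacyTextSplitter(pipeline="sentencizer")
print(fast_splitter.split_text("One sentence here. Another sentence there."))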