From 2f8756d5bd7fd2d25a6ce51e1a1a3d197e1231dc Mon Sep 17 00:00:00 2001
From: "open-swe[bot]"
Date: Thu, 31 Jul 2025 00:10:16 +0000
Subject: [PATCH] Apply patch [skip ci]

---
 .../sentence_transformers.py | 21 +++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/libs/text-splitters/langchain_text_splitters/sentence_transformers.py b/libs/text-splitters/langchain_text_splitters/sentence_transformers.py
index 33e27a53cb5..d2f214e578b 100644
--- a/libs/text-splitters/langchain_text_splitters/sentence_transformers.py
+++ b/libs/text-splitters/langchain_text_splitters/sentence_transformers.py
@@ -19,24 +19,32 @@ class SentenceTransformersTokenTextSplitter(TextSplitter):
         super().__init__(**kwargs, chunk_overlap=chunk_overlap)
 
         try:
-            from sentence_transformers import SentenceTransformer
+            from transformers import AutoConfig, AutoTokenizer
         except ImportError:
             msg = (
-                "Could not import sentence_transformers python package. "
+                "Could not import transformers python package. "
                 "This is needed in order to for SentenceTransformersTokenTextSplitter. "
-                "Please install it with `pip install sentence-transformers`."
+                "Please install it with `pip install transformers`."
             )
             raise ImportError(msg)
 
         self.model_name = model_name
-        self._model = SentenceTransformer(self.model_name)
-        self.tokenizer = self._model.tokenizer
+        # Load tokenizer and config from transformers
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        self._config = AutoConfig.from_pretrained(self.model_name)
         self._initialize_chunk_configuration(tokens_per_chunk=tokens_per_chunk)
 
     def _initialize_chunk_configuration(
         self, *, tokens_per_chunk: Optional[int]
     ) -> None:
-        self.maximum_tokens_per_chunk = self._model.max_seq_length
+        # Get max_seq_length from config, fallback to max_position_embeddings
+        if hasattr(self._config, "max_seq_length"):
+            self.maximum_tokens_per_chunk = self._config.max_seq_length
+        elif hasattr(self._config, "max_position_embeddings"):
+            self.maximum_tokens_per_chunk = self._config.max_position_embeddings
+        else:
+            # Default fallback for models without explicit max length
+            self.maximum_tokens_per_chunk = 512
 
         if tokens_per_chunk is None:
             self.tokens_per_chunk = self.maximum_tokens_per_chunk
@@ -102,3 +110,4 @@ class SentenceTransformersTokenTextSplitter(TextSplitter):
             truncation="do_not_truncate",
         )
         return cast("list[int]", token_ids_with_start_and_end_token_ids)
+
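
Usage sketch (not part of the patch): after this change the splitter needs only the `transformers` package at construction time. The snippet below is a minimal illustration; the top-level import path and the model name are assumptions for the example, not taken from this diff.

# Hedged example: exercises the patched code path, which now loads the tokenizer
# and config via AutoTokenizer/AutoConfig instead of SentenceTransformer.
from langchain_text_splitters import SentenceTransformersTokenTextSplitter

splitter = SentenceTransformersTokenTextSplitter(
    model_name="sentence-transformers/all-mpnet-base-v2",  # assumed model name, for illustration only
    tokens_per_chunk=128,
    chunk_overlap=16,
)
chunks = splitter.split_text("Lorem ipsum dolor sit amet. " * 200)
# maximum_tokens_per_chunk now comes from the model config
# (max_seq_length, else max_position_embeddings, else 512).
print(len(chunks), splitter.maximum_tokens_per_chunk)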