From c2d09a5186b45c703500a99c1d918de49d0b35c3 Mon Sep 17 00:00:00 2001 From: GustavoSept <136198618+GustavoSept@users.noreply.github.com> Date: Wed, 24 Apr 2024 21:32:40 -0300 Subject: [PATCH] experimental[patch]: Makes regex customizable in text_splitter.py (SemanticChunker class) (#20485) - **Description:** Currently, the regex is static (`r"(?<=[.?!])\s+"`), which is only useful for certain use cases. This change makes it a constructor parameter of SemanticChunker (`sentence_split_regex`), which adds flexibility without making the class more complex (the default regex is unchanged). - **Issue:** Not applicable (I searched; no one seems to have created this issue yet). - **Dependencies:** None. _If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, hwchase17._ --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> Co-authored-by: Bagatur --- libs/experimental/langchain_experimental/text_splitter.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/libs/experimental/langchain_experimental/text_splitter.py b/libs/experimental/langchain_experimental/text_splitter.py index d53049e2326..1d0c462fa2a 100644 --- a/libs/experimental/langchain_experimental/text_splitter.py +++ b/libs/experimental/langchain_experimental/text_splitter.py @@ -112,12 +112,14 @@ class SemanticChunker(BaseDocumentTransformer): breakpoint_threshold_type: BreakpointThresholdType = "percentile", breakpoint_threshold_amount: Optional[float] = None, number_of_chunks: Optional[int] = None, + sentence_split_regex: str = r"(?<=[.?!])\s+", ): self._add_start_index = add_start_index self.embeddings = embeddings self.buffer_size = buffer_size self.breakpoint_threshold_type = breakpoint_threshold_type self.number_of_chunks = number_of_chunks + self.sentence_split_regex = sentence_split_regex if breakpoint_threshold_amount is None: self.breakpoint_threshold_amount = BREAKPOINT_DEFAULTS[ breakpoint_threshold_type ] @@ -189,8 +191,8 @@ class 
SemanticChunker(BaseDocumentTransformer): self, text: str, ) -> List[str]: - # Splitting the essay on '.', '?', and '!' - single_sentences_list = re.split(r"(?<=[.?!])\s+", text) + # Splitting the essay (by default on '.', '?', and '!') + single_sentences_list = re.split(self.sentence_split_regex, text) # having len(single_sentences_list) == 1 would cause the following # np.percentile to fail.