From c2d09a5186b45c703500a99c1d918de49d0b35c3 Mon Sep 17 00:00:00 2001 From: GustavoSept <136198618+GustavoSept@users.noreply.github.com> Date: Wed, 24 Apr 2024 21:32:40 -0300 Subject: [PATCH] experimental[patch]: Makes regex customizable in text_splitter.py (SemanticChunker class) (#20485) - **Description:** Currently, the regex is static (`r"(?<=[.?!])\s+"`), which is only useful for certain use cases. This change makes it a constructor parameter of SemanticChunker (`sentence_split_regex`), which adds flexibility without making the class more complex (the default regex is unchanged). - **Issue:** Not applicable (I searched; no one seems to have created this issue yet). - **Dependencies:** None. _If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, hwchase17._ --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> Co-authored-by: Bagatur --- libs/experimental/langchain_experimental/text_splitter.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/libs/experimental/langchain_experimental/text_splitter.py b/libs/experimental/langchain_experimental/text_splitter.py index d53049e2326..1d0c462fa2a 100644 --- a/libs/experimental/langchain_experimental/text_splitter.py +++ b/libs/experimental/langchain_experimental/text_splitter.py @@ -112,12 +112,14 @@ class SemanticChunker(BaseDocumentTransformer): breakpoint_threshold_type: BreakpointThresholdType = "percentile", breakpoint_threshold_amount: Optional[float] = None, number_of_chunks: Optional[int] = None, + sentence_split_regex: str = r"(?<=[.?!])\s+", ): self._add_start_index = add_start_index self.embeddings = embeddings self.buffer_size = buffer_size self.breakpoint_threshold_type = breakpoint_threshold_type self.number_of_chunks = number_of_chunks + self.sentence_split_regex = sentence_split_regex if breakpoint_threshold_amount is None: self.breakpoint_threshold_amount = BREAKPOINT_DEFAULTS[ breakpoint_threshold_type ] @@ -189,8 +191,8 @@ class 
SemanticChunker(BaseDocumentTransformer): self, text: str, ) -> List[str]: - # Splitting the essay on '.', '?', and '!' - single_sentences_list = re.split(r"(?<=[.?!])\s+", text) + # Splitting the essay (by default on '.', '?', and '!') + single_sentences_list = re.split(self.sentence_split_regex, text) # having len(single_sentences_list) == 1 would cause the following # np.percentile to fail.