From 789db7398b54e622bf3441c8456c348bc6d98290 Mon Sep 17 00:00:00 2001 From: Ben <4490928+BenStuk@users.noreply.github.com> Date: Mon, 17 Mar 2025 23:32:33 +0000 Subject: [PATCH] text-splitters: Add JSFrameworkTextSplitter for Handling JavaScript Framework Code (#28972) ## Description This pull request introduces a new text splitter, `JSFrameworkTextSplitter`, to the LangChain library. The `JSFrameworkTextSplitter` extends the `RecursiveCharacterTextSplitter` to handle JavaScript framework code effectively, including React (JSX), Vue, and Svelte. It identifies and utilizes framework-specific component tags and syntax elements as splitting points, alongside standard JavaScript syntax. This ensures that code is divided at natural boundaries, enhancing the parsing and processing of JavaScript and framework-specific code. ### Key Features - Supports React (JSX), Vue, and Svelte frameworks. - Identifies and uses framework-specific tags and syntax elements as natural splitting points. - Extends the existing `RecursiveCharacterTextSplitter` for seamless integration. ## Issue No specific issue addressed. ## Dependencies No additional dependencies required. 
--------- Co-authored-by: ccurme --- .../langchain_text_splitters/__init__.py | 2 + .../langchain_text_splitters/jsx.py | 98 ++++++++++++ .../tests/unit_tests/test_text_splitters.py | 139 ++++++++++++++++++ 3 files changed, 239 insertions(+) create mode 100644 libs/text-splitters/langchain_text_splitters/jsx.py diff --git a/libs/text-splitters/langchain_text_splitters/__init__.py b/libs/text-splitters/langchain_text_splitters/__init__.py index 5ecd1626adb..2bcc8d0731e 100644 --- a/libs/text-splitters/langchain_text_splitters/__init__.py +++ b/libs/text-splitters/langchain_text_splitters/__init__.py @@ -36,6 +36,7 @@ from langchain_text_splitters.html import ( HTMLSemanticPreservingSplitter, ) from langchain_text_splitters.json import RecursiveJsonSplitter +from langchain_text_splitters.jsx import JSFrameworkTextSplitter from langchain_text_splitters.konlpy import KonlpyTextSplitter from langchain_text_splitters.latex import LatexTextSplitter from langchain_text_splitters.markdown import ( @@ -60,6 +61,7 @@ __all__ = [ "RecursiveCharacterTextSplitter", "RecursiveJsonSplitter", "LatexTextSplitter", + "JSFrameworkTextSplitter", "PythonCodeTextSplitter", "KonlpyTextSplitter", "SpacyTextSplitter", diff --git a/libs/text-splitters/langchain_text_splitters/jsx.py b/libs/text-splitters/langchain_text_splitters/jsx.py new file mode 100644 index 00000000000..fc13a58c5f7 --- /dev/null +++ b/libs/text-splitters/langchain_text_splitters/jsx.py @@ -0,0 +1,98 @@ +import re +from typing import Any, List, Optional + +from langchain_text_splitters import RecursiveCharacterTextSplitter + + +class JSFrameworkTextSplitter(RecursiveCharacterTextSplitter): + """Text splitter that handles React (JSX), Vue, and Svelte code. + + This splitter extends RecursiveCharacterTextSplitter to handle + React (JSX), Vue, and Svelte code by: + 1. Detecting and extracting custom component tags from the text + 2. 
Using those tags as additional separators along with standard JS syntax + + The splitter combines: + - Custom component tags as separators (e.g. None: + """Initialize the JS Framework text splitter. + + Args: + separators: Optional list of custom separator strings to use + chunk_size: Maximum size of chunks to return + chunk_overlap: Overlap in characters between chunks + **kwargs: Additional arguments to pass to parent class + """ + super().__init__(chunk_size=chunk_size, chunk_overlap=chunk_overlap, **kwargs) + self._separators = separators or [] + + def split_text(self, text: str) -> List[str]: + """Split text into chunks. + + This method splits the text into chunks by: + - Extracting unique opening component tags using regex + - Creating separators list with extracted tags and JS separators + - Splitting the text using the separators by calling the parent class method + + Args: + text: String containing code to split + + Returns: + List of text chunks split on component and JS boundaries + """ + # Extract unique opening component tags using regex + # Regex to match opening tags, excluding self-closing tags + opening_tags = re.findall(r"<\s*([a-zA-Z0-9]+)[^>]*>", text) + + component_tags = [] + for tag in opening_tags: + if tag not in component_tags: + component_tags.append(tag) + component_separators = [f"<{tag}" for tag in component_tags] + + js_separators = [ + "\nexport ", + " export ", + "\nfunction ", + "\nasync function ", + " async function ", + "\nconst ", + "\nlet ", + "\nvar ", + "\nclass ", + " class ", + "\nif ", + " if ", + "\nfor ", + " for ", + "\nwhile ", + " while ", + "\nswitch ", + " switch ", + "\ncase ", + " case ", + "\ndefault ", + " default ", + ] + separators = ( + self._separators + + js_separators + + component_separators + + ["<>", "\n\n", "&&\n", "||\n"] + ) + self._separators = separators + chunks = super().split_text(text) + return chunks diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py 
b/libs/text-splitters/tests/unit_tests/test_text_splitters.py index 9ee5409062f..900924aef8f 100644 --- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py +++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py @@ -23,6 +23,7 @@ from langchain_text_splitters.html import ( HTMLSemanticPreservingSplitter, ) from langchain_text_splitters.json import RecursiveJsonSplitter +from langchain_text_splitters.jsx import JSFrameworkTextSplitter from langchain_text_splitters.markdown import ( ExperimentalMarkdownSyntaxTextSplitter, MarkdownHeaderTextSplitter, @@ -413,6 +414,144 @@ def test_python_text_splitter() -> None: assert splits == expected_splits +FAKE_JSX_TEXT = """ +import React from 'react'; +import OtherComponent from './OtherComponent'; + +function MyComponent() { + const [count, setCount] = React.useState(0); + + const handleClick = () => { + setCount(count + 1); + }; + + return ( +
+

Counter: {count}

+ + +
+ ); +} + +export default MyComponent; +""" + + +def test_jsx_text_splitter() -> None: + splitter = JSFrameworkTextSplitter(chunk_size=30, chunk_overlap=0) + splits = splitter.split_text(FAKE_JSX_TEXT) + + expected_splits = [ + "\nimport React from 'react';\n" + "import OtherComponent from './OtherComponent';\n", + "\nfunction MyComponent() {\n const [count, setCount] = React.useState(0);", + "\n\n const handleClick = () => {\n setCount(count + 1);\n };", + "return (", + "
", + "

Counter: {count}

\n ", + "\n ", + "\n
\n );\n}\n", + "export default MyComponent;", + ] + assert [s.strip() for s in splits] == [s.strip() for s in expected_splits] + + +FAKE_VUE_TEXT = """ + + + + + +""" + + +def test_vue_text_splitter() -> None: + splitter = JSFrameworkTextSplitter(chunk_size=30, chunk_overlap=0) + splits = splitter.split_text(FAKE_VUE_TEXT) + + expected_splits = [ + "", + "", + "", + ] + assert [s.strip() for s in splits] == [s.strip() for s in expected_splits] + + +FAKE_SVELTE_TEXT = """ + + +
+

Counter App

+ +
+ + +""" + + +def test_svelte_text_splitter() -> None: + splitter = JSFrameworkTextSplitter(chunk_size=30, chunk_overlap=0) + splits = splitter.split_text(FAKE_SVELTE_TEXT) + + expected_splits = [ + "", + "
", + "

Counter App

", + "\n
", + "", + ] + assert [s.strip() for s in splits] == [s.strip() for s in expected_splits] + + CHUNK_SIZE = 16