diff --git a/libs/text-splitters/langchain_text_splitters/__init__.py b/libs/text-splitters/langchain_text_splitters/__init__.py index 5ecd1626adb..2bcc8d0731e 100644 --- a/libs/text-splitters/langchain_text_splitters/__init__.py +++ b/libs/text-splitters/langchain_text_splitters/__init__.py @@ -36,6 +36,7 @@ from langchain_text_splitters.html import ( HTMLSemanticPreservingSplitter, ) from langchain_text_splitters.json import RecursiveJsonSplitter +from langchain_text_splitters.jsx import JSFrameworkTextSplitter from langchain_text_splitters.konlpy import KonlpyTextSplitter from langchain_text_splitters.latex import LatexTextSplitter from langchain_text_splitters.markdown import ( @@ -60,6 +61,7 @@ __all__ = [ "RecursiveCharacterTextSplitter", "RecursiveJsonSplitter", "LatexTextSplitter", + "JSFrameworkTextSplitter", "PythonCodeTextSplitter", "KonlpyTextSplitter", "SpacyTextSplitter", diff --git a/libs/text-splitters/langchain_text_splitters/jsx.py b/libs/text-splitters/langchain_text_splitters/jsx.py new file mode 100644 index 00000000000..fc13a58c5f7 --- /dev/null +++ b/libs/text-splitters/langchain_text_splitters/jsx.py @@ -0,0 +1,98 @@ +import re +from typing import Any, List, Optional + +from langchain_text_splitters import RecursiveCharacterTextSplitter + + +class JSFrameworkTextSplitter(RecursiveCharacterTextSplitter): + """Text splitter that handles React (JSX), Vue, and Svelte code. + + This splitter extends RecursiveCharacterTextSplitter to handle + React (JSX), Vue, and Svelte code by: + 1. Detecting and extracting custom component tags from the text + 2. Using those tags as additional separators along with standard JS syntax + + The splitter combines: + - Custom component tags as separators (e.g. None: + """Initialize the JS Framework text splitter. 
+
+        Args:
+            separators: Optional list of custom separator strings to use
+            chunk_size: Maximum size of chunks to return
+            chunk_overlap: Overlap in characters between chunks
+            **kwargs: Additional arguments to pass to parent class
+        """
+        super().__init__(chunk_size=chunk_size, chunk_overlap=chunk_overlap, **kwargs)
+        self._separators = separators or []
+
+    def split_text(self, text: str) -> List[str]:
+        """Split text into chunks.
+
+        This method splits the text into chunks by:
+        - Extracting unique opening component tags using regex
+        - Creating separators list with extracted tags and JS separators
+        - Splitting the text using the separators by calling the parent class method
+
+        The combined separator list is installed on the instance only for the
+        duration of the call. The separators stored by ``__init__`` are put
+        back afterwards, so repeated calls neither accumulate separators nor
+        carry component tags over from one text to the next.
+
+        Args:
+            text: String containing code to split
+
+        Returns:
+            List of text chunks split on component and JS boundaries
+        """
+        # Heuristic tag detection: matches any ``<name ...>`` sequence,
+        # self-closing tags included. Closing tags (``</name>``) and empty
+        # fragments (``<>``) are not matched, because an alphanumeric name
+        # must follow ``<`` (optionally after whitespace).
+        opening_tags = re.findall(r"<\s*([a-zA-Z0-9]+)[^>]*>", text)
+
+        # dict.fromkeys de-duplicates while keeping first-seen order.
+        component_separators = [f"<{tag}" for tag in dict.fromkeys(opening_tags)]
+
+        js_separators = [
+            "\nexport ",
+            " export ",
+            "\nfunction ",
+            "\nasync function ",
+            " async function ",
+            "\nconst ",
+            "\nlet ",
+            "\nvar ",
+            "\nclass ",
+            " class ",
+            "\nif ",
+            " if ",
+            "\nfor ",
+            " for ",
+            "\nwhile ",
+            " while ",
+            "\nswitch ",
+            " switch ",
+            "\ncase ",
+            " case ",
+            "\ndefault ",
+            " default ",
+        ]
+
+        # The separators reach the parent call only through
+        # ``self._separators`` (no separator argument is passed), so the
+        # combined list is swapped in temporarily. Keeping a reference to the
+        # configured list lets ``finally`` restore it even if splitting raises.
+        configured_separators = self._separators
+        self._separators = (
+            configured_separators
+            + js_separators
+            + component_separators
+            + ["<>", "\n\n", "&&\n", "||\n"]
+        )
+        try:
+            return super().split_text(text)
+        finally:
+            self._separators = configured_separators
diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py
index 9ee5409062f..900924aef8f 100644
--- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py
+++ 
b/libs/text-splitters/tests/unit_tests/test_text_splitters.py @@ -23,6 +23,7 @@ from langchain_text_splitters.html import ( HTMLSemanticPreservingSplitter, ) from langchain_text_splitters.json import RecursiveJsonSplitter +from langchain_text_splitters.jsx import JSFrameworkTextSplitter from langchain_text_splitters.markdown import ( ExperimentalMarkdownSyntaxTextSplitter, MarkdownHeaderTextSplitter, @@ -413,6 +414,144 @@ def test_python_text_splitter() -> None: assert splits == expected_splits +FAKE_JSX_TEXT = """ +import React from 'react'; +import OtherComponent from './OtherComponent'; + +function MyComponent() { + const [count, setCount] = React.useState(0); + + const handleClick = () => { + setCount(count + 1); + }; + + return ( +
+

Counter: {count}

+ + +
+ ); +} + +export default MyComponent; +""" + + +def test_jsx_text_splitter() -> None: + splitter = JSFrameworkTextSplitter(chunk_size=30, chunk_overlap=0) + splits = splitter.split_text(FAKE_JSX_TEXT) + + expected_splits = [ + "\nimport React from 'react';\n" + "import OtherComponent from './OtherComponent';\n", + "\nfunction MyComponent() {\n const [count, setCount] = React.useState(0);", + "\n\n const handleClick = () => {\n setCount(count + 1);\n };", + "return (", + "
", + "

Counter: {count}

\n ", + "\n ", + "\n
\n );\n}\n", + "export default MyComponent;", + ] + assert [s.strip() for s in splits] == [s.strip() for s in expected_splits] + + +FAKE_VUE_TEXT = """ + + + + + +""" + + +def test_vue_text_splitter() -> None: + splitter = JSFrameworkTextSplitter(chunk_size=30, chunk_overlap=0) + splits = splitter.split_text(FAKE_VUE_TEXT) + + expected_splits = [ + "", + "", + "", + ] + assert [s.strip() for s in splits] == [s.strip() for s in expected_splits] + + +FAKE_SVELTE_TEXT = """ + + +
+

Counter App

+ +
+ + +""" + + +def test_svelte_text_splitter() -> None: + splitter = JSFrameworkTextSplitter(chunk_size=30, chunk_overlap=0) + splits = splitter.split_text(FAKE_SVELTE_TEXT) + + expected_splits = [ + "", + "
", + "

Counter App

", + "\n
", + "", + ] + assert [s.strip() for s in splits] == [s.strip() for s in expected_splits] + + CHUNK_SIZE = 16