text-splitters: Add JSFrameworkTextSplitter for Handling JavaScript Framework Code (#28972)

## Description
This pull request introduces a new text splitter,
`JSFrameworkTextSplitter`, to the LangChain library. The
`JSFrameworkTextSplitter` extends the `RecursiveCharacterTextSplitter`
to handle JavaScript framework code effectively, including React (JSX),
Vue, and Svelte. It identifies and utilizes framework-specific component
tags and syntax elements as splitting points, alongside standard
JavaScript syntax. This ensures that code is divided at natural
boundaries, enhancing the parsing and processing of JavaScript and
framework-specific code.

### Key Features
- Supports React (JSX), Vue, and Svelte frameworks.
- Identifies and uses framework-specific tags and syntax elements as
natural splitting points.
- Extends the existing `RecursiveCharacterTextSplitter` for seamless
integration.

## Issue
No specific issue addressed.

## Dependencies
No additional dependencies required.

---------

Co-authored-by: ccurme <chester.curme@gmail.com>
This commit is contained in:
Ben 2025-03-17 23:32:33 +00:00 committed by GitHub
parent 9b48e4c2b0
commit 789db7398b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 239 additions and 0 deletions

View File

@ -36,6 +36,7 @@ from langchain_text_splitters.html import (
HTMLSemanticPreservingSplitter,
)
from langchain_text_splitters.json import RecursiveJsonSplitter
from langchain_text_splitters.jsx import JSFrameworkTextSplitter
from langchain_text_splitters.konlpy import KonlpyTextSplitter
from langchain_text_splitters.latex import LatexTextSplitter
from langchain_text_splitters.markdown import (
@ -60,6 +61,7 @@ __all__ = [
"RecursiveCharacterTextSplitter",
"RecursiveJsonSplitter",
"LatexTextSplitter",
"JSFrameworkTextSplitter",
"PythonCodeTextSplitter",
"KonlpyTextSplitter",
"SpacyTextSplitter",

View File

@ -0,0 +1,98 @@
import re
from typing import Any, List, Optional
from langchain_text_splitters import RecursiveCharacterTextSplitter
class JSFrameworkTextSplitter(RecursiveCharacterTextSplitter):
    """Text splitter that handles React (JSX), Vue, and Svelte code.

    This splitter extends RecursiveCharacterTextSplitter by:

    1. Detecting the component/element tags that open in the text being split
    2. Using those tags as additional separators along with standard JS syntax

    The separator list used for a given text is assembled, in this order, from:

    - Caller-supplied separators, if any
    - JavaScript syntax elements (export, function, const, if, etc)
    - One ``<tag`` prefix per distinct tag found in the text
      (e.g. ``<Component``, ``<div``)
    - ``<>``, blank lines, and lines ending in ``&&`` / ``||``

    This allows chunks to break at natural boundaries in
    React, Vue, and Svelte component code.
    """

    # Captures the name of every opening or self-closing tag (``<div>``,
    # ``<Foo />``). Closing tags such as ``</div>`` never match because ``/``
    # is not allowed at the start of the captured name.
    _OPENING_TAG_PATTERN = re.compile(r"<\s*([a-zA-Z0-9]+)[^>]*>")

    # JavaScript keywords used as split points. Kept as an immutable tuple so
    # the shared constant cannot be modified through an instance.
    _JS_SEPARATORS = (
        "\nexport ",
        " export ",
        "\nfunction ",
        "\nasync function ",
        " async function ",
        "\nconst ",
        "\nlet ",
        "\nvar ",
        "\nclass ",
        " class ",
        "\nif ",
        " if ",
        "\nfor ",
        " for ",
        "\nwhile ",
        " while ",
        "\nswitch ",
        " switch ",
        "\ncase ",
        " case ",
        "\ndefault ",
        " default ",
    )

    def __init__(
        self,
        separators: Optional[List[str]] = None,
        chunk_size: int = 2000,
        chunk_overlap: int = 0,
        **kwargs: Any,
    ) -> None:
        """Initialize the JS Framework text splitter.

        Args:
            separators: Optional list of custom separator strings. They are
                placed ahead of the built-in separators in the list.
            chunk_size: Maximum size of chunks to return
            chunk_overlap: Overlap in characters between chunks
            **kwargs: Additional arguments to pass to parent class
        """
        super().__init__(chunk_size=chunk_size, chunk_overlap=chunk_overlap, **kwargs)
        # Replace whatever the parent stored with the caller's separators only;
        # the JS and tag separators are added per call in ``split_text``.
        # Copy so later changes to the caller's list do not affect this splitter.
        self._separators = list(separators or [])

    def split_text(self, text: str) -> List[str]:
        """Split text into chunks.

        This method splits the text into chunks by:

        - Extracting the distinct opening component tags using regex
        - Building a separators list from the custom separators, the JS
          separators and the extracted tags
        - Splitting the text with those separators via the parent class method

        The configured separators are left untouched, so each call depends
        only on its own ``text`` and not on previously split inputs.

        Args:
            text: String containing code to split

        Returns:
            List of text chunks split on component and JS boundaries
        """
        # dict.fromkeys drops repeated tag names while keeping first-seen order.
        tag_names = dict.fromkeys(self._OPENING_TAG_PATTERN.findall(text))
        component_separators = [f"<{tag}" for tag in tag_names]

        configured_separators = self._separators
        # The parent implementation is driven through ``self._separators``, so
        # the per-text list is installed only for the duration of the call.
        self._separators = [
            *configured_separators,
            *self._JS_SEPARATORS,
            *component_separators,
            "<>",
            "\n\n",
            "&&\n",
            "||\n",
        ]
        try:
            return super().split_text(text)
        finally:
            # Restore the original list; otherwise every call would append
            # another copy of the JS separators and leak tags between inputs.
            self._separators = configured_separators

View File

@ -23,6 +23,7 @@ from langchain_text_splitters.html import (
HTMLSemanticPreservingSplitter,
)
from langchain_text_splitters.json import RecursiveJsonSplitter
from langchain_text_splitters.jsx import JSFrameworkTextSplitter
from langchain_text_splitters.markdown import (
ExperimentalMarkdownSyntaxTextSplitter,
MarkdownHeaderTextSplitter,
@ -413,6 +414,144 @@ def test_python_text_splitter() -> None:
assert splits == expected_splits
# React (JSX) fixture: a small function component with state, a click handler
# and a nested self-closing component. Input for test_jsx_text_splitter.
FAKE_JSX_TEXT = """
import React from 'react';
import OtherComponent from './OtherComponent';
function MyComponent() {
const [count, setCount] = React.useState(0);
const handleClick = () => {
setCount(count + 1);
};
return (
<div>
<h1>Counter: {count}</h1>
<button onClick={handleClick}>
Increment
</button>
<OtherComponent />
</div>
);
}
export default MyComponent;
"""
def test_jsx_text_splitter() -> None:
    """Split the JSX fixture with a 30-character chunk size and no overlap."""
    wanted = [
        "\nimport React from 'react';\n"
        "import OtherComponent from './OtherComponent';\n",
        "\nfunction MyComponent() {\n const [count, setCount] = React.useState(0);",
        "\n\n const handleClick = () => {\n setCount(count + 1);\n };",
        "return (",
        "<div>",
        "<h1>Counter: {count}</h1>\n ",
        "<button onClick={handleClick}>\n Increment\n </button>\n ",
        "<OtherComponent />\n </div>\n );\n}\n",
        "export default MyComponent;",
    ]

    jsx_splitter = JSFrameworkTextSplitter(chunk_size=30, chunk_overlap=0)
    produced = jsx_splitter.split_text(FAKE_JSX_TEXT)

    # Chunks are compared after trimming, so surrounding whitespace is ignored.
    assert list(map(str.strip, produced)) == list(map(str.strip, wanted))
# Vue single-file-component fixture with <template>, <script> and <style>
# sections. Input for test_vue_text_splitter.
FAKE_VUE_TEXT = """
<template>
<div>
<h1>{{ title }}</h1>
<button @click="increment">
Count is: {{ count }}
</button>
</div>
</template>
<script>
export default {
data() {
return {
title: 'Counter App',
count: 0
}
},
methods: {
increment() {
this.count++
}
}
}
</script>
<style>
button {
color: blue;
}
</style>
"""
def test_vue_text_splitter() -> None:
    """Split the Vue fixture with a 30-character chunk size and no overlap."""
    wanted = [
        "<template>",
        "<div>",
        "<h1>{{ title }}</h1>",
        '<button @click="increment">\n Count is: {{ count }}\n'
        " </button>\n </div>\n</template>",
        "<script>",
        "export",
        " default {\n data() {\n return {\n title: 'Counter App',\n "
        "count: 0\n }\n },\n methods: {\n increment() {\n "
        "this.count++\n }\n }\n}\n</script>",
        "<style>\nbutton {\n color: blue;\n}\n</style>",
    ]

    vue_splitter = JSFrameworkTextSplitter(chunk_size=30, chunk_overlap=0)
    produced = vue_splitter.split_text(FAKE_VUE_TEXT)

    # Chunks are compared after trimming, so surrounding whitespace is ignored.
    assert list(map(str.strip, produced)) == list(map(str.strip, wanted))
# Svelte component fixture with <script>, markup (<main>) and <style>
# sections. Input for test_svelte_text_splitter.
FAKE_SVELTE_TEXT = """
<script>
let count = 0
function increment() {
count += 1
}
</script>
<main>
<h1>Counter App</h1>
<button on:click={increment}>
Count is: {count}
</button>
</main>
<style>
button {
color: blue;
}
</style>
"""
def test_svelte_text_splitter() -> None:
    """Split the Svelte fixture with a 30-character chunk size and no overlap."""
    wanted = [
        "<script>\n let count = 0",
        "\n\n function increment() {\n count += 1\n }\n</script>",
        "<main>",
        "<h1>Counter App</h1>",
        "<button on:click={increment}>\n Count is: {count}\n </button>\n</main>",
        "<style>\n button {\n color: blue;\n }\n</style>",
    ]

    svelte_splitter = JSFrameworkTextSplitter(chunk_size=30, chunk_overlap=0)
    produced = svelte_splitter.split_text(FAKE_SVELTE_TEXT)

    # Chunks are compared after trimming, so surrounding whitespace is ignored.
    assert list(map(str.strip, produced)) == list(map(str.strip, wanted))
CHUNK_SIZE = 16