mirror of
https://github.com/hwchase17/langchain.git
synced 2025-04-28 11:55:21 +00:00
## Description This pull request introduces a new text splitter, `JSFrameworkTextSplitter`, to the Langchain library. The `JSFrameworkTextSplitter` extends the `RecursiveCharacterTextSplitter` to handle JavaScript framework code effectively, including React (JSX), Vue, and Svelte. It identifies and utilizes framework-specific component tags and syntax elements as splitting points, alongside standard JavaScript syntax. This ensures that code is divided at natural boundaries, enhancing the parsing and processing of JavaScript and framework-specific code. ### Key Features - Supports React (JSX), Vue, and Svelte frameworks. - Identifies and uses framework-specific tags and syntax elements as natural splitting points. - Extends the existing `RecursiveCharacterTextSplitter` for seamless integration. ## Issue No specific issue addressed. ## Dependencies No additional dependencies required. --------- Co-authored-by: ccurme <chester.curme@gmail.com>
82 lines
2.3 KiB
Python
82 lines
2.3 KiB
Python
"""**Text Splitters** are classes for splitting text.

**Class hierarchy:**

.. code-block::

    BaseDocumentTransformer --> TextSplitter --> <name>TextSplitter # Example: CharacterTextSplitter
                                                 RecursiveCharacterTextSplitter --> <name>TextSplitter

Note: **MarkdownHeaderTextSplitter** and **HTMLHeaderTextSplitter** do not derive from TextSplitter.


**Main helpers:**

.. code-block::

    Document, Tokenizer, Language, LineType, HeaderType

"""  # noqa: E501

from langchain_text_splitters.base import (
    Language,
    TextSplitter,
    Tokenizer,
    TokenTextSplitter,
    split_text_on_tokens,
)
from langchain_text_splitters.character import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
)
from langchain_text_splitters.html import (
    ElementType,
    HTMLHeaderTextSplitter,
    HTMLSectionSplitter,
    HTMLSemanticPreservingSplitter,
)
from langchain_text_splitters.json import RecursiveJsonSplitter
from langchain_text_splitters.jsx import JSFrameworkTextSplitter
from langchain_text_splitters.konlpy import KonlpyTextSplitter
from langchain_text_splitters.latex import LatexTextSplitter
from langchain_text_splitters.markdown import (
    ExperimentalMarkdownSyntaxTextSplitter,
    HeaderType,
    LineType,
    MarkdownHeaderTextSplitter,
    MarkdownTextSplitter,
)
from langchain_text_splitters.nltk import NLTKTextSplitter
from langchain_text_splitters.python import PythonCodeTextSplitter
from langchain_text_splitters.sentence_transformers import (
    SentenceTransformersTokenTextSplitter,
)
from langchain_text_splitters.spacy import SpacyTextSplitter

# Public API of the package: every name imported above is re-exported here,
# and nothing else. Keep this list in sync when adding or removing an import.
__all__ = [
    "TokenTextSplitter",
    "TextSplitter",
    "Tokenizer",
    "Language",
    "RecursiveCharacterTextSplitter",
    "RecursiveJsonSplitter",
    "LatexTextSplitter",
    "JSFrameworkTextSplitter",
    "PythonCodeTextSplitter",
    "KonlpyTextSplitter",
    "SpacyTextSplitter",
    "NLTKTextSplitter",
    "split_text_on_tokens",
    "SentenceTransformersTokenTextSplitter",
    "ElementType",
    "HeaderType",
    "LineType",
    "HTMLHeaderTextSplitter",
    "HTMLSectionSplitter",
    "HTMLSemanticPreservingSplitter",
    "MarkdownHeaderTextSplitter",
    "MarkdownTextSplitter",
    "CharacterTextSplitter",
    "ExperimentalMarkdownSyntaxTextSplitter",
]