mirror of
				https://github.com/hwchase17/langchain.git
				synced 2025-11-04 10:10:09 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			98 lines
		
	
	
		
			3.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			98 lines
		
	
	
		
			3.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
import re
 | 
						|
from typing import Any, Optional
 | 
						|
 | 
						|
from langchain_text_splitters import RecursiveCharacterTextSplitter
 | 
						|
 | 
						|
 | 
						|
class JSFrameworkTextSplitter(RecursiveCharacterTextSplitter):
    """Text splitter that handles React (JSX), Vue, and Svelte code.

    This splitter extends RecursiveCharacterTextSplitter to handle
    React (JSX), Vue, and Svelte code by:
    1. Detecting and extracting custom component tags from the text
    2. Using those tags as additional separators along with standard JS syntax

    The splitter combines, in this priority order:
    - Caller-supplied separators (the ``separators`` constructor argument)
    - JavaScript syntax elements (function, const, if, etc)
    - Custom component tags as separators (e.g. <Component, <div)
    - Fragment tags, blank lines and trailing ``&&`` / ``||`` line breaks

    This allows chunks to break at natural boundaries in
    React, Vue, and Svelte component code.
    """

    # Matches any tag of the form ``<name ...>`` and captures ``name``.
    # Self-closing tags such as ``<Foo />`` ARE matched (the ``/`` is consumed
    # by ``[^>]*``); closing tags (``</name>``) and fragments (``<>``) are not,
    # because the tag name must start with an alphanumeric character.
    _OPENING_TAG_PATTERN = re.compile(r"<\s*([a-zA-Z0-9]+)[^>]*>")

    # JavaScript keyword boundaries, tried after any caller-supplied separators.
    _JS_SEPARATORS = (
        "\nexport ",
        " export ",
        "\nfunction ",
        "\nasync function ",
        " async function ",
        "\nconst ",
        "\nlet ",
        "\nvar ",
        "\nclass ",
        " class ",
        "\nif ",
        " if ",
        "\nfor ",
        " for ",
        "\nwhile ",
        " while ",
        "\nswitch ",
        " switch ",
        "\ncase ",
        " case ",
        "\ndefault ",
        " default ",
    )

    # Lowest-priority separators, appended after the component tags.
    _TRAILING_SEPARATORS = ("<>", "\n\n", "&&\n", "||\n")

    def __init__(
        self,
        separators: Optional[list[str]] = None,
        chunk_size: int = 2000,
        chunk_overlap: int = 0,
        **kwargs: Any,
    ) -> None:
        """Initialize the JS Framework text splitter.

        Args:
            separators: Optional list of custom separator strings to use.
                They are placed ahead of the built-in JS and component-tag
                separators on every call to ``split_text``.
            chunk_size: Maximum size of chunks to return
            chunk_overlap: Overlap in characters between chunks
            **kwargs: Additional arguments to pass to parent class
        """
        super().__init__(chunk_size=chunk_size, chunk_overlap=chunk_overlap, **kwargs)
        # Only the caller-supplied separators are stored; the full list is
        # rebuilt per text in ``split_text`` because it depends on the tags
        # found in that text.
        self._separators = separators or []

    def split_text(self, text: str) -> list[str]:
        """Split text into chunks.

        This method splits the text into chunks by:
        - Extracting unique component tags using regex
        - Creating separators list with extracted tags and JS separators
        - Splitting the text using the separators by calling the parent class method

        The separator list is specific to ``text``; the instance's configured
        separators are restored afterwards, so repeated calls are independent
        of one another.

        Args:
            text: String containing code to split

        Returns:
            List of text chunks split on component and JS boundaries
        """
        # dict.fromkeys de-duplicates while preserving first-seen order.
        tag_names = dict.fromkeys(self._OPENING_TAG_PATTERN.findall(text))
        component_separators = [f"<{tag}" for tag in tag_names]

        configured_separators = self._separators
        # The parent implementation reads ``self._separators``, so install the
        # per-call list only for the duration of the call. Leaving it in place
        # (as a plain assignment would) makes the list grow on every call and
        # leaks tags from previously split texts into later splits.
        self._separators = [
            *configured_separators,
            *self._JS_SEPARATORS,
            *component_separators,
            *self._TRAILING_SEPARATORS,
        ]
        try:
            return super().split_text(text)
        finally:
            self._separators = configured_separators
 |