diff --git a/docs/docs/how_to/code_splitter.ipynb b/docs/docs/how_to/code_splitter.ipynb index 7e23955882d..22dba423eb0 100644 --- a/docs/docs/how_to/code_splitter.ipynb +++ b/docs/docs/how_to/code_splitter.ipynb @@ -54,7 +54,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9e4144de-d925-4d4c-91c3-685ef8baa57c", + "id": "2bb9c73f-9d00-4a19-a81f-cab2f0fd921a", "metadata": {}, "outputs": [], "source": [ @@ -63,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "id": "a9e37aa1", "metadata": {}, "outputs": [], @@ -718,8 +718,44 @@ "php_splitter = RecursiveCharacterTextSplitter.from_language(\n", " language=Language.PHP, chunk_size=50, chunk_overlap=0\n", ")\n", - "haskell_docs = php_splitter.create_documents([PHP_CODE])\n", - "haskell_docs" + "php_docs = php_splitter.create_documents([PHP_CODE])\n", + "php_docs" + ] + }, + { + "cell_type": "markdown", + "id": "e9fa62c1", + "metadata": {}, + "source": [ + "## PowerShell\n", + "Here's an example using the PowerShell text splitter:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e6893ad", + "metadata": {}, + "outputs": [], + "source": [ + "POWERSHELL_CODE = \"\"\"\n", + "$directoryPath = Get-Location\n", + "\n", + "$items = Get-ChildItem -Path $directoryPath\n", + "\n", + "$files = $items | Where-Object { -not $_.PSIsContainer }\n", + "\n", + "$sortedFiles = $files | Sort-Object LastWriteTime\n", + "\n", + "foreach ($file in $sortedFiles) {\n", + " Write-Output (\"Name: \" + $file.Name + \" | Last Write Time: \" + $file.LastWriteTime)\n", + "}\n", + "\"\"\"\n", + "powershell_splitter = RecursiveCharacterTextSplitter.from_language(\n", + " language=Language.POWERSHELL, chunk_size=100, chunk_overlap=0\n", + ")\n", + "powershell_docs = powershell_splitter.create_documents([POWERSHELL_CODE])\n", + "powershell_docs" ] } ], @@ -739,7 +775,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.5" + "version": "3.10.4" } }, "nbformat": 4, diff --git a/libs/text-splitters/langchain_text_splitters/base.py b/libs/text-splitters/langchain_text_splitters/base.py index 36de4bca09f..0e0a49c182d 100644 --- a/libs/text-splitters/langchain_text_splitters/base.py +++ b/libs/text-splitters/langchain_text_splitters/base.py @@ -294,6 +294,7 @@ class Language(str, Enum): PERL = "perl" HASKELL = "haskell" ELIXIR = "elixir" + POWERSHELL = "powershell" @dataclass(frozen=True) diff --git a/libs/text-splitters/langchain_text_splitters/character.py b/libs/text-splitters/langchain_text_splitters/character.py index 3d8a8601c14..12f69484c15 100644 --- a/libs/text-splitters/langchain_text_splitters/character.py +++ b/libs/text-splitters/langchain_text_splitters/character.py @@ -659,6 +659,30 @@ class RecursiveCharacterTextSplitter(TextSplitter): " ", "", ] + elif language == Language.POWERSHELL: + return [ + # Split along function definitions + "\nfunction ", + # Split along parameter declarations (escape parentheses) + "\nparam ", + # Split along control flow statements + "\nif ", + "\nforeach ", + "\nfor ", + "\nwhile ", + "\nswitch ", + # Split along class definitions (for PowerShell 5.0 and above) + "\nclass ", + # Split along try-catch-finally blocks + "\ntry ", + "\ncatch ", + "\nfinally ", + # Split by normal lines and empty spaces + "\n\n", + "\n", + " ", + "", + ] elif language in Language._value2member_map_: raise ValueError(f"Language {language} is not implemented yet!") else: diff --git a/libs/text-splitters/tests/unit_tests/test_text_splitters.py b/libs/text-splitters/tests/unit_tests/test_text_splitters.py index 22294805228..1faee31cd0d 100644 --- a/libs/text-splitters/tests/unit_tests/test_text_splitters.py +++ b/libs/text-splitters/tests/unit_tests/test_text_splitters.py @@ -1974,3 +1974,56 @@ def test_split_json_many_calls() -> None: assert chunk0 == chunk0_output assert chunk1 == chunk1_output + + +def test_powershell_code_splitter_short_code() -> None: + splitter = RecursiveCharacterTextSplitter.from_language( + Language.POWERSHELL, chunk_size=60, chunk_overlap=0 + ) + code = """ +# Check if a file exists +$filePath = "C:\\temp\\file.txt" +if (Test-Path $filePath) { + # File exists +} else { + # File does not exist +} + """ + + chunks = splitter.split_text(code) + assert chunks == [ + '# Check if a file exists\n$filePath = "C:\\temp\\file.txt"', + "if (Test-Path $filePath) {\n # File exists\n} else {", + "# File does not exist\n}", + ] + + +def test_powershell_code_splitter_longer_code() -> None: + splitter = RecursiveCharacterTextSplitter.from_language( + Language.POWERSHELL, chunk_size=60, chunk_overlap=0 + ) + code = """ +# Get a list of all processes and export to CSV +$processes = Get-Process +$processes | Export-Csv -Path "C:\\temp\\processes.csv" -NoTypeInformation + +# Read the CSV file and display its content +$csvContent = Import-Csv -Path "C:\\temp\\processes.csv" +$csvContent | ForEach-Object { + $_.ProcessName +} + +# End of script + """ + + chunks = splitter.split_text(code) + assert chunks == [ + "# Get a list of all processes and export to CSV", + "$processes = Get-Process", + '$processes | Export-Csv -Path "C:\\temp\\processes.csv"', + "-NoTypeInformation", + "# Read the CSV file and display its content", + '$csvContent = Import-Csv -Path "C:\\temp\\processes.csv"', + "$csvContent | ForEach-Object {\n $_.ProcessName\n}", + "# End of script", + ]