mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-19 05:13:46 +00:00
langchain: text_splitters: Added PowerShell support (#24582)
- **Description:** Added PowerShell support to the text splitters' `Language` enum, including the relevant docs update. - **Issue:** None - **Dependencies:** None --------- Co-authored-by: tzitman <tamir.zitman@intel.com> Co-authored-by: Chester Curme <chester.curme@gmail.com>
This commit is contained in:
parent
187ee96f7a
commit
b3e1378f2b
@ -54,7 +54,7 @@
|
|||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "9e4144de-d925-4d4c-91c3-685ef8baa57c",
|
"id": "2bb9c73f-9d00-4a19-a81f-cab2f0fd921a",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -63,7 +63,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 4,
|
||||||
"id": "a9e37aa1",
|
"id": "a9e37aa1",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -718,8 +718,44 @@
|
|||||||
"php_splitter = RecursiveCharacterTextSplitter.from_language(\n",
|
"php_splitter = RecursiveCharacterTextSplitter.from_language(\n",
|
||||||
" language=Language.PHP, chunk_size=50, chunk_overlap=0\n",
|
" language=Language.PHP, chunk_size=50, chunk_overlap=0\n",
|
||||||
")\n",
|
")\n",
|
||||||
"haskell_docs = php_splitter.create_documents([PHP_CODE])\n",
|
"php_docs = php_splitter.create_documents([PHP_CODE])\n",
|
||||||
"haskell_docs"
|
"php_docs"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "e9fa62c1",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## PowerShell\n",
|
||||||
|
"Here's an example using the PowerShell text splitter:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "7e6893ad",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"POWERSHELL_CODE = \"\"\"\n",
|
||||||
|
"$directoryPath = Get-Location\n",
|
||||||
|
"\n",
|
||||||
|
"$items = Get-ChildItem -Path $directoryPath\n",
|
||||||
|
"\n",
|
||||||
|
"$files = $items | Where-Object { -not $_.PSIsContainer }\n",
|
||||||
|
"\n",
|
||||||
|
"$sortedFiles = $files | Sort-Object LastWriteTime\n",
|
||||||
|
"\n",
|
||||||
|
"foreach ($file in $sortedFiles) {\n",
|
||||||
|
" Write-Output (\"Name: \" + $file.Name + \" | Last Write Time: \" + $file.LastWriteTime)\n",
|
||||||
|
"}\n",
|
||||||
|
"\"\"\"\n",
|
||||||
|
"powershell_splitter = RecursiveCharacterTextSplitter.from_language(\n",
|
||||||
|
" language=Language.POWERSHELL, chunk_size=100, chunk_overlap=0\n",
|
||||||
|
")\n",
|
||||||
|
"powershell_docs = powershell_splitter.create_documents([POWERSHELL_CODE])\n",
|
||||||
|
"powershell_docs"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -739,7 +775,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.10.5"
|
"version": "3.10.4"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -294,6 +294,7 @@ class Language(str, Enum):
|
|||||||
PERL = "perl"
|
PERL = "perl"
|
||||||
HASKELL = "haskell"
|
HASKELL = "haskell"
|
||||||
ELIXIR = "elixir"
|
ELIXIR = "elixir"
|
||||||
|
POWERSHELL = "powershell"
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
|
@ -659,6 +659,30 @@ class RecursiveCharacterTextSplitter(TextSplitter):
|
|||||||
" ",
|
" ",
|
||||||
"",
|
"",
|
||||||
]
|
]
|
||||||
|
elif language == Language.POWERSHELL:
|
||||||
|
return [
|
||||||
|
# Split along function definitions
|
||||||
|
"\nfunction ",
|
||||||
|
# Split along parameter declarations
|
||||||
|
"\nparam ",
|
||||||
|
# Split along control flow statements
|
||||||
|
"\nif ",
|
||||||
|
"\nforeach ",
|
||||||
|
"\nfor ",
|
||||||
|
"\nwhile ",
|
||||||
|
"\nswitch ",
|
||||||
|
# Split along class definitions (for PowerShell 5.0 and above)
|
||||||
|
"\nclass ",
|
||||||
|
# Split along try-catch-finally blocks
|
||||||
|
"\ntry ",
|
||||||
|
"\ncatch ",
|
||||||
|
"\nfinally ",
|
||||||
|
# Split by blank lines, single newlines, and spaces
|
||||||
|
"\n\n",
|
||||||
|
"\n",
|
||||||
|
" ",
|
||||||
|
"",
|
||||||
|
]
|
||||||
elif language in Language._value2member_map_:
|
elif language in Language._value2member_map_:
|
||||||
raise ValueError(f"Language {language} is not implemented yet!")
|
raise ValueError(f"Language {language} is not implemented yet!")
|
||||||
else:
|
else:
|
||||||
|
@ -1974,3 +1974,56 @@ def test_split_json_many_calls() -> None:
|
|||||||
|
|
||||||
assert chunk0 == chunk0_output
|
assert chunk0 == chunk0_output
|
||||||
assert chunk1 == chunk1_output
|
assert chunk1 == chunk1_output
|
||||||
|
|
||||||
|
|
||||||
|
def test_powershell_code_splitter_short_code() -> None:
|
||||||
|
splitter = RecursiveCharacterTextSplitter.from_language(
|
||||||
|
Language.POWERSHELL, chunk_size=60, chunk_overlap=0
|
||||||
|
)
|
||||||
|
code = """
|
||||||
|
# Check if a file exists
|
||||||
|
$filePath = "C:\\temp\\file.txt"
|
||||||
|
if (Test-Path $filePath) {
|
||||||
|
# File exists
|
||||||
|
} else {
|
||||||
|
# File does not exist
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
|
||||||
|
chunks = splitter.split_text(code)
|
||||||
|
assert chunks == [
|
||||||
|
'# Check if a file exists\n$filePath = "C:\\temp\\file.txt"',
|
||||||
|
"if (Test-Path $filePath) {\n # File exists\n} else {",
|
||||||
|
"# File does not exist\n}",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_powershell_code_splitter_longer_code() -> None:
|
||||||
|
splitter = RecursiveCharacterTextSplitter.from_language(
|
||||||
|
Language.POWERSHELL, chunk_size=60, chunk_overlap=0
|
||||||
|
)
|
||||||
|
code = """
|
||||||
|
# Get a list of all processes and export to CSV
|
||||||
|
$processes = Get-Process
|
||||||
|
$processes | Export-Csv -Path "C:\\temp\\processes.csv" -NoTypeInformation
|
||||||
|
|
||||||
|
# Read the CSV file and display its content
|
||||||
|
$csvContent = Import-Csv -Path "C:\\temp\\processes.csv"
|
||||||
|
$csvContent | ForEach-Object {
|
||||||
|
$_.ProcessName
|
||||||
|
}
|
||||||
|
|
||||||
|
# End of script
|
||||||
|
"""
|
||||||
|
|
||||||
|
chunks = splitter.split_text(code)
|
||||||
|
assert chunks == [
|
||||||
|
"# Get a list of all processes and export to CSV",
|
||||||
|
"$processes = Get-Process",
|
||||||
|
'$processes | Export-Csv -Path "C:\\temp\\processes.csv"',
|
||||||
|
"-NoTypeInformation",
|
||||||
|
"# Read the CSV file and display its content",
|
||||||
|
'$csvContent = Import-Csv -Path "C:\\temp\\processes.csv"',
|
||||||
|
"$csvContent | ForEach-Object {\n $_.ProcessName\n}",
|
||||||
|
"# End of script",
|
||||||
|
]
|
||||||
|
Loading…
Reference in New Issue
Block a user