langchain : text_splitters Added PowerShell (#24582)

- **Description:** Added PowerShell as a supported language for text splitters,
  including the relevant documentation update.
  - **Issue:** None
  - **Dependencies:** None

---------

Co-authored-by: tzitman <tamir.zitman@intel.com>
Co-authored-by: Chester Curme <chester.curme@gmail.com>
This commit is contained in:
Tamir Zitman
2024-07-30 19:13:52 +03:00
committed by GitHub
parent 187ee96f7a
commit b3e1378f2b
4 changed files with 119 additions and 5 deletions

View File

@@ -294,6 +294,7 @@ class Language(str, Enum):
PERL = "perl"
HASKELL = "haskell"
ELIXIR = "elixir"
POWERSHELL = "powershell"
@dataclass(frozen=True)

View File

@@ -659,6 +659,30 @@ class RecursiveCharacterTextSplitter(TextSplitter):
" ",
"",
]
elif language == Language.POWERSHELL:
return [
# Split along function definitions
"\nfunction ",
# Split along parameter declarations
"\nparam ",
# Split along control flow statements
"\nif ",
"\nforeach ",
"\nfor ",
"\nwhile ",
"\nswitch ",
# Split along class definitions (for PowerShell 5.0 and above)
"\nclass ",
# Split along try-catch-finally blocks
"\ntry ",
"\ncatch ",
"\nfinally ",
# Split by normal lines and empty spaces
"\n\n",
"\n",
" ",
"",
]
elif language in Language._value2member_map_:
raise ValueError(f"Language {language} is not implemented yet!")
else:

View File

@@ -1974,3 +1974,56 @@ def test_split_json_many_calls() -> None:
assert chunk0 == chunk0_output
assert chunk1 == chunk1_output
def test_powershell_code_splitter_short_code() -> None:
# Checks the chunks produced for a short PowerShell snippet (a variable
# assignment followed by an if/else block) when the splitter is built with
# Language.POWERSHELL, chunk_size=60 and chunk_overlap=0.
splitter = RecursiveCharacterTextSplitter.from_language(
Language.POWERSHELL, chunk_size=60, chunk_overlap=0
)
# NOTE(review): the expected chunks below contain a space before
# "# File exists" that this input literal does not show; indentation inside
# these literals may have been lost in this view -- confirm against the
# original file before relying on the exact whitespace.
code = """
# Check if a file exists
$filePath = "C:\\temp\\file.txt"
if (Test-Path $filePath) {
# File exists
} else {
# File does not exist
}
"""
chunks = splitter.split_text(code)
# Three chunks are expected, with no leading or trailing blank lines.
assert chunks == [
# The leading comment and the assignment stay together in one chunk.
'# Check if a file exists\n$filePath = "C:\\temp\\file.txt"',
# The `if` branch up to and including the `} else {` line.
"if (Test-Path $filePath) {\n # File exists\n} else {",
# The body of the else branch and the closing brace.
"# File does not exist\n}",
]
def test_powershell_code_splitter_longer_code() -> None:
# Checks the chunks produced for a longer PowerShell script (CSV export,
# CSV import and a ForEach-Object pipeline) when the splitter is built with
# Language.POWERSHELL, chunk_size=60 and chunk_overlap=0.
splitter = RecursiveCharacterTextSplitter.from_language(
Language.POWERSHELL, chunk_size=60, chunk_overlap=0
)
# NOTE(review): the expected chunks below contain a space before
# "$_.ProcessName" that this input literal does not show; indentation inside
# these literals may have been lost in this view -- confirm against the
# original file before relying on the exact whitespace.
code = """
# Get a list of all processes and export to CSV
$processes = Get-Process
$processes | Export-Csv -Path "C:\\temp\\processes.csv" -NoTypeInformation
# Read the CSV file and display its content
$csvContent = Import-Csv -Path "C:\\temp\\processes.csv"
$csvContent | ForEach-Object {
$_.ProcessName
}
# End of script
"""
chunks = splitter.split_text(code)
# Eight chunks are expected; most source lines end up in their own chunk.
assert chunks == [
"# Get a list of all processes and export to CSV",
"$processes = Get-Process",
# The Export-Csv line is longer than chunk_size (60), so it is expected
# to be split in two, with the trailing switch in its own chunk.
'$processes | Export-Csv -Path "C:\\temp\\processes.csv"',
"-NoTypeInformation",
"# Read the CSV file and display its content",
'$csvContent = Import-Csv -Path "C:\\temp\\processes.csv"',
# The ForEach-Object script block is expected to stay in a single chunk.
"$csvContent | ForEach-Object {\n $_.ProcessName\n}",
"# End of script",
]