feat(text-splitters): add Visual Basic 6 support (#31173)

### **Description**

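Add Visual Basic 6 support to the text splitters: a new `Language.VISUALBASIC6` enum member, VB6-specific separators in `RecursiveCharacterTextSplitter`, a notebook example, and a unit test.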

---

### **Issue**

No specific issue addressed.

---

### **Dependencies**

No additional dependencies required.

---------

Co-authored-by: Mason Daugherty <mason@langchain.dev>
This commit is contained in:
Fabio Fontana 2025-07-14 15:51:16 +02:00 committed by GitHub
parent 7e146a185b
commit fd168e1c11
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 261 additions and 93 deletions

View File

@ -63,7 +63,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 1,
"id": "a9e37aa1",
"metadata": {},
"outputs": [],
@ -84,7 +84,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"id": "e21a2434",
"metadata": {},
"outputs": [
@ -114,10 +114,13 @@
" 'c',\n",
" 'lua',\n",
" 'perl',\n",
" 'haskell']"
" 'haskell',\n",
" 'elixir',\n",
" 'powershell',\n",
" 'visualbasic6']"
]
},
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
@ -136,7 +139,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 6,
"id": "c92fb913",
"metadata": {},
"outputs": [
@ -146,7 +149,7 @@
"['\\nclass ', '\\ndef ', '\\n\\tdef ', '\\n\\n', '\\n', ' ', '']"
]
},
"execution_count": 3,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@ -168,18 +171,18 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 7,
"id": "a58512b9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='def hello_world():\\n print(\"Hello, World!\")'),\n",
" Document(page_content='# Call the function\\nhello_world()')]"
"[Document(metadata={}, page_content='def hello_world():\\n print(\"Hello, World!\")'),\n",
" Document(metadata={}, page_content='# Call the function\\nhello_world()')]"
]
},
"execution_count": 5,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@ -210,18 +213,18 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 8,
"id": "7db0d486",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='function helloWorld() {\\n console.log(\"Hello, World!\");\\n}'),\n",
" Document(page_content='// Call the function\\nhelloWorld();')]"
"[Document(metadata={}, page_content='function helloWorld() {\\n console.log(\"Hello, World!\");\\n}'),\n",
" Document(metadata={}, page_content='// Call the function\\nhelloWorld();')]"
]
},
"execution_count": 6,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@ -254,19 +257,19 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 9,
"id": "aee738a4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='function helloWorld(): void {'),\n",
" Document(page_content='console.log(\"Hello, World!\");\\n}'),\n",
" Document(page_content='// Call the function\\nhelloWorld();')]"
"[Document(metadata={}, page_content='function helloWorld(): void {'),\n",
" Document(metadata={}, page_content='console.log(\"Hello, World!\");\\n}'),\n",
" Document(metadata={}, page_content='// Call the function\\nhelloWorld();')]"
]
},
"execution_count": 7,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@ -300,7 +303,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 10,
"id": "ac9295d3",
"metadata": {},
"outputs": [],
@ -321,7 +324,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 11,
"id": "bfa1771b-d4b0-48f8-a949-5537cd1df0dd",
"metadata": {},
"outputs": [
@ -337,7 +340,7 @@
" Document(metadata={}, page_content='are extremely open to contributions.')]"
]
},
"execution_count": 3,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@ -362,7 +365,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 12,
"id": "77d1049d",
"metadata": {},
"outputs": [],
@ -389,38 +392,38 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 13,
"id": "4dbc47e1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='\\\\documentclass{article}\\n\\n\\x08egin{document}\\n\\n\\\\maketitle'),\n",
" Document(page_content='\\\\section{Introduction}'),\n",
" Document(page_content='Large language models (LLMs) are a type of machine learning'),\n",
" Document(page_content='model that can be trained on vast amounts of text data to'),\n",
" Document(page_content='generate human-like language. In recent years, LLMs have'),\n",
" Document(page_content='made significant advances in a variety of natural language'),\n",
" Document(page_content='processing tasks, including language translation, text'),\n",
" Document(page_content='generation, and sentiment analysis.'),\n",
" Document(page_content='\\\\subsection{History of LLMs}'),\n",
" Document(page_content='The earliest LLMs were developed in the 1980s and 1990s,'),\n",
" Document(page_content='but they were limited by the amount of data that could be'),\n",
" Document(page_content='processed and the computational power available at the'),\n",
" Document(page_content='time. In the past decade, however, advances in hardware and'),\n",
" Document(page_content='software have made it possible to train LLMs on massive'),\n",
" Document(page_content='datasets, leading to significant improvements in'),\n",
" Document(page_content='performance.'),\n",
" Document(page_content='\\\\subsection{Applications of LLMs}'),\n",
" Document(page_content='LLMs have many applications in industry, including'),\n",
" Document(page_content='chatbots, content creation, and virtual assistants. They'),\n",
" Document(page_content='can also be used in academia for research in linguistics,'),\n",
" Document(page_content='psychology, and computational linguistics.'),\n",
" Document(page_content='\\\\end{document}')]"
"[Document(metadata={}, page_content='\\\\documentclass{article}\\n\\n\\x08egin{document}\\n\\n\\\\maketitle'),\n",
" Document(metadata={}, page_content='\\\\section{Introduction}'),\n",
" Document(metadata={}, page_content='Large language models (LLMs) are a type of machine learning'),\n",
" Document(metadata={}, page_content='model that can be trained on vast amounts of text data to'),\n",
" Document(metadata={}, page_content='generate human-like language. In recent years, LLMs have'),\n",
" Document(metadata={}, page_content='made significant advances in a variety of natural language'),\n",
" Document(metadata={}, page_content='processing tasks, including language translation, text'),\n",
" Document(metadata={}, page_content='generation, and sentiment analysis.'),\n",
" Document(metadata={}, page_content='\\\\subsection{History of LLMs}'),\n",
" Document(metadata={}, page_content='The earliest LLMs were developed in the 1980s and 1990s,'),\n",
" Document(metadata={}, page_content='but they were limited by the amount of data that could be'),\n",
" Document(metadata={}, page_content='processed and the computational power available at the'),\n",
" Document(metadata={}, page_content='time. In the past decade, however, advances in hardware and'),\n",
" Document(metadata={}, page_content='software have made it possible to train LLMs on massive'),\n",
" Document(metadata={}, page_content='datasets, leading to significant improvements in'),\n",
" Document(metadata={}, page_content='performance.'),\n",
" Document(metadata={}, page_content='\\\\subsection{Applications of LLMs}'),\n",
" Document(metadata={}, page_content='LLMs have many applications in industry, including'),\n",
" Document(metadata={}, page_content='chatbots, content creation, and virtual assistants. They'),\n",
" Document(metadata={}, page_content='can also be used in academia for research in linguistics,'),\n",
" Document(metadata={}, page_content='psychology, and computational linguistics.'),\n",
" Document(metadata={}, page_content='\\\\end{document}')]"
]
},
"execution_count": 11,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@ -445,7 +448,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 14,
"id": "0fc78794",
"metadata": {},
"outputs": [],
@ -479,29 +482,29 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 15,
"id": "e3e3fca1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='<!DOCTYPE html>\\n<html>'),\n",
" Document(page_content='<head>\\n <title>🦜️🔗 LangChain</title>'),\n",
" Document(page_content='<style>\\n body {\\n font-family: Aria'),\n",
" Document(page_content='l, sans-serif;\\n }\\n h1 {'),\n",
" Document(page_content='color: darkblue;\\n }\\n </style>\\n </head'),\n",
" Document(page_content='>'),\n",
" Document(page_content='<body>'),\n",
" Document(page_content='<div>\\n <h1>🦜️🔗 LangChain</h1>'),\n",
" Document(page_content='<p>⚡ Building applications with LLMs through composability ⚡'),\n",
" Document(page_content='</p>\\n </div>'),\n",
" Document(page_content='<div>\\n As an open-source project in a rapidly dev'),\n",
" Document(page_content='eloping field, we are extremely open to contributions.'),\n",
" Document(page_content='</div>\\n </body>\\n</html>')]"
"[Document(metadata={}, page_content='<!DOCTYPE html>\\n<html>'),\n",
" Document(metadata={}, page_content='<head>\\n <title>🦜️🔗 LangChain</title>'),\n",
" Document(metadata={}, page_content='<style>\\n body {\\n font-family: Aria'),\n",
" Document(metadata={}, page_content='l, sans-serif;\\n }\\n h1 {'),\n",
" Document(metadata={}, page_content='color: darkblue;\\n }\\n </style>\\n </head'),\n",
" Document(metadata={}, page_content='>'),\n",
" Document(metadata={}, page_content='<body>'),\n",
" Document(metadata={}, page_content='<div>\\n <h1>🦜️🔗 LangChain</h1>'),\n",
" Document(metadata={}, page_content='<p>⚡ Building applications with LLMs through composability ⚡'),\n",
" Document(metadata={}, page_content='</p>\\n </div>'),\n",
" Document(metadata={}, page_content='<div>\\n As an open-source project in a rapidly dev'),\n",
" Document(metadata={}, page_content='eloping field, we are extremely open to contributions.'),\n",
" Document(metadata={}, page_content='</div>\\n </body>\\n</html>')]"
]
},
"execution_count": 13,
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@ -525,18 +528,18 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 16,
"id": "49a1df11",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='pragma solidity ^0.8.20;'),\n",
" Document(page_content='contract HelloWorld {\\n function add(uint a, uint b) pure public returns(uint) {\\n return a + b;\\n }\\n}')]"
"[Document(metadata={}, page_content='pragma solidity ^0.8.20;'),\n",
" Document(metadata={}, page_content='contract HelloWorld {\\n function add(uint a, uint b) pure public returns(uint) {\\n return a + b;\\n }\\n}')]"
]
},
"execution_count": 14,
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
@ -569,21 +572,21 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 17,
"id": "1524ae0f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='using System;'),\n",
" Document(page_content='class Program\\n{\\n static void Main()\\n {\\n int age = 30; // Change the age value as needed'),\n",
" Document(page_content='// Categorize the age without any console output\\n if (age < 18)\\n {\\n // Age is under 18'),\n",
" Document(page_content='}\\n else if (age >= 18 && age < 65)\\n {\\n // Age is an adult\\n }\\n else\\n {'),\n",
" Document(page_content='// Age is a senior citizen\\n }\\n }\\n}')]"
"[Document(metadata={}, page_content='using System;'),\n",
" Document(metadata={}, page_content='class Program\\n{\\n static void Main()\\n {\\n int age = 30; // Change the age value as needed'),\n",
" Document(metadata={}, page_content='// Categorize the age without any console output\\n if (age < 18)\\n {\\n // Age is under 18'),\n",
" Document(metadata={}, page_content='}\\n else if (age >= 18 && age < 65)\\n {\\n // Age is an adult\\n }\\n else\\n {'),\n",
" Document(metadata={}, page_content='// Age is a senior citizen\\n }\\n }\\n}')]"
]
},
"execution_count": 15,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@ -631,20 +634,20 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 18,
"id": "688185b5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='main :: IO ()'),\n",
" Document(page_content='main = do\\n putStrLn \"Hello, World!\"\\n-- Some'),\n",
" Document(page_content='sample functions\\nadd :: Int -> Int -> Int\\nadd x y'),\n",
" Document(page_content='= x + y')]"
"[Document(metadata={}, page_content='main :: IO ()'),\n",
" Document(metadata={}, page_content='main = do\\n putStrLn \"Hello, World!\"\\n-- Some'),\n",
" Document(metadata={}, page_content='sample functions\\nadd :: Int -> Int -> Int\\nadd x y'),\n",
" Document(metadata={}, page_content='= x + y')]"
]
},
"execution_count": 3,
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
@ -676,23 +679,23 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 19,
"id": "90c66e7e-87a5-4a81-bece-7949aabf2369",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='<?php\\nnamespace foo;'),\n",
" Document(page_content='class Hello {'),\n",
" Document(page_content='public function __construct() { }\\n}'),\n",
" Document(page_content='function hello() {\\n echo \"Hello World!\";\\n}'),\n",
" Document(page_content='interface Human {\\n public function breath();\\n}'),\n",
" Document(page_content='trait Foo { }\\nenum Color\\n{\\n case Red;'),\n",
" Document(page_content='case Blue;\\n}')]"
"[Document(metadata={}, page_content='<?php\\nnamespace foo;'),\n",
" Document(metadata={}, page_content='class Hello {'),\n",
" Document(metadata={}, page_content='public function __construct() { }\\n}'),\n",
" Document(metadata={}, page_content='function hello() {\\n echo \"Hello World!\";\\n}'),\n",
" Document(metadata={}, page_content='interface Human {\\n public function breath();\\n}'),\n",
" Document(metadata={}, page_content='trait Foo { }\\nenum Color\\n{\\n case Red;'),\n",
" Document(metadata={}, page_content='case Blue;\\n}')]"
]
},
"execution_count": 2,
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
@ -733,10 +736,25 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 20,
"id": "7e6893ad",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"[Document(metadata={}, page_content='$directoryPath = Get-Location\\n\\n$items = Get-ChildItem -Path $directoryPath'),\n",
" Document(metadata={}, page_content='$files = $items | Where-Object { -not $_.PSIsContainer }'),\n",
" Document(metadata={}, page_content='$sortedFiles = $files | Sort-Object LastWriteTime'),\n",
" Document(metadata={}, page_content='foreach ($file in $sortedFiles) {'),\n",
" Document(metadata={}, page_content='Write-Output (\"Name: \" + $file.Name + \" | Last Write Time: \" + $file.LastWriteTime)\\n}')]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"POWERSHELL_CODE = \"\"\"\n",
"$directoryPath = Get-Location\n",
@ -757,11 +775,58 @@
"powershell_docs = powershell_splitter.create_documents([POWERSHELL_CODE])\n",
"powershell_docs"
]
},
{
"cell_type": "markdown",
"id": "3ef77730",
"metadata": {},
"source": [
"## Visual Basic 6"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "1dc3c740",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(metadata={}, page_content='Option Explicit'),\n",
" Document(metadata={}, page_content='Public Sub HelloWorld()\\n MsgBox \"Hello, World!\"\\nEnd Sub'),\n",
" Document(metadata={}, page_content='Private Function Add(a As Integer, b As Integer) As Integer\\n Add = a + b\\nEnd Function')]"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"VISUALBASIC6_CODE = \"\"\"Option Explicit\n",
"\n",
"Public Sub HelloWorld()\n",
" MsgBox \"Hello, World!\"\n",
"End Sub\n",
"\n",
"Private Function Add(a As Integer, b As Integer) As Integer\n",
" Add = a + b\n",
"End Function\n",
"\"\"\"\n",
"visualbasic6_splitter = RecursiveCharacterTextSplitter.from_language(\n",
" Language.VISUALBASIC6,\n",
" chunk_size=128,\n",
" chunk_overlap=0,\n",
")\n",
"visualbasic6_docs = visualbasic6_splitter.create_documents([VISUALBASIC6_CODE])\n",
"visualbasic6_docs"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "langchain",
"language": "python",
"name": "python3"
},
@ -775,7 +840,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
"version": "3.10.16"
}
},
"nbformat": 4,

View File

@ -316,6 +316,7 @@ class Language(str, Enum):
HASKELL = "haskell"
ELIXIR = "elixir"
POWERSHELL = "powershell"
VISUALBASIC6 = "visualbasic6"
@dataclass(frozen=True)

View File

@ -734,6 +734,32 @@ class RecursiveCharacterTextSplitter(TextSplitter):
" ",
"",
]
if language == Language.VISUALBASIC6:
vis = r"(?:Public|Private|Friend|Global|Static)\s+"
return [
# Split along definitions
rf"\n(?!End\s){vis}?Sub\s+",
rf"\n(?!End\s){vis}?Function\s+",
rf"\n(?!End\s){vis}?Property\s+(?:Get|Let|Set)\s+",
rf"\n(?!End\s){vis}?Type\s+",
rf"\n(?!End\s){vis}?Enum\s+",
# Split along control flow statements
r"\n(?!End\s)If\s+",
r"\nElseIf\s+",
r"\nElse\s+",
r"\nSelect\s+Case\s+",
r"\nCase\s+",
r"\nFor\s+",
r"\nDo\s+",
r"\nWhile\s+",
r"\nWith\s+",
# Split by the normal type of lines
r"\n\n",
r"\n",
" ",
"",
]
if language in Language._value2member_map_:
msg = f"Language {language} is not implemented yet!"
raise ValueError(msg)

View File

@ -3043,6 +3043,82 @@ $csvContent | ForEach-Object {
]
# Sample Visual Basic 6 source used as the input of
# test_visualbasic6_code_splitter.
# The literal is not a raw string, so "\\" below becomes a single backslash
# in the resulting text.
FAKE_VISUALBASIC6_TEXT = """
Option Explicit
Public Function SumTwoIntegers(ByVal a As Integer, ByVal b As Integer) As Integer
SumTwoIntegers = a + b
End Function
Public Sub Main()
Dim i As Integer
Dim limit As Integer
i = 0
limit = 50
While i < limit
i = SumTwoIntegers(i, 1)
If i = limit \\ 2 Then
MsgBox "Halfway there! i = " & i
End If
Wend
MsgBox "Done! Final value of i: " & i
End Sub
"""
def test_visualbasic6_code_splitter() -> None:
    """Check the chunks produced for Visual Basic 6 source.

    Builds a ``RecursiveCharacterTextSplitter`` for ``Language.VISUALBASIC6``
    with ``chunk_size=CHUNK_SIZE`` and no overlap, splits
    ``FAKE_VISUALBASIC6_TEXT`` and compares the result with the exact,
    ordered list of expected chunks.
    """
    expected_chunks = [
        "Option Explicit",
        "Public Function",
        "SumTwoIntegers(",
        "ByVal",
        "a As Integer,",
        "ByVal b As",
        "Integer) As",
        "Integer",
        "SumTwoIntegers",
        "= a + b",
        "End Function",
        "Public Sub",
        "Main()",
        "Dim i As",
        "Integer",
        "Dim limit",
        "As Integer",
        "i = 0",
        "limit = 50",
        "While i <",
        "limit",
        "i =",
        "SumTwoIntegers(",
        "i,",
        "1)",
        "If i =",
        "limit \\ 2 Then",
        'MsgBox "Halfway',
        'there! i = " &',
        "i",
        "End If",
        "Wend",
        "MsgBox",
        '"Done! Final',
        'value of i: " &',
        "i",
        "End Sub",
    ]
    vb6_splitter = RecursiveCharacterTextSplitter.from_language(
        Language.VISUALBASIC6,
        chunk_size=CHUNK_SIZE,
        chunk_overlap=0,
    )
    assert vb6_splitter.split_text(FAKE_VISUALBASIC6_TEXT) == expected_chunks
def custom_iframe_extractor(iframe_tag: Any) -> str:
    """Format an iframe tag as a Markdown-style link to its ``src``.

    Args:
        iframe_tag: Object exposing ``get(key, default)``; its ``"src"``
            entry is read, falling back to an empty string when absent.

    Returns:
        The string ``[iframe:<src>](<src>)``.
    """
    link_target = iframe_tag.get("src", "")
    return f"[iframe:{link_target}]({link_target})"