feat(text-splitters): add Visual Basic 6 support (#31173)

### **Description**

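Add Visual Basic 6 support to the text splitters: a new `Language.VISUALBASIC6` option for `RecursiveCharacterTextSplitter.from_language`, plus a "Visual Basic 6" usage example in the code-splitter notebook.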

---

### **Issue**

No specific issue addressed.

---

### **Dependencies**

No additional dependencies required.

---------

Co-authored-by: Mason Daugherty <mason@langchain.dev>
This commit is contained in:
Fabio Fontana
2025-07-14 15:51:16 +02:00
committed by GitHub
parent 7e146a185b
commit fd168e1c11
4 changed files with 261 additions and 93 deletions

View File

@@ -63,7 +63,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 1,
"id": "a9e37aa1",
"metadata": {},
"outputs": [],
@@ -84,7 +84,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"id": "e21a2434",
"metadata": {},
"outputs": [
@@ -114,10 +114,13 @@
" 'c',\n",
" 'lua',\n",
" 'perl',\n",
" 'haskell']"
" 'haskell',\n",
" 'elixir',\n",
" 'powershell',\n",
" 'visualbasic6']"
]
},
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
@@ -136,7 +139,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 6,
"id": "c92fb913",
"metadata": {},
"outputs": [
@@ -146,7 +149,7 @@
"['\\nclass ', '\\ndef ', '\\n\\tdef ', '\\n\\n', '\\n', ' ', '']"
]
},
"execution_count": 3,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -168,18 +171,18 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 7,
"id": "a58512b9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='def hello_world():\\n print(\"Hello, World!\")'),\n",
" Document(page_content='# Call the function\\nhello_world()')]"
"[Document(metadata={}, page_content='def hello_world():\\n print(\"Hello, World!\")'),\n",
" Document(metadata={}, page_content='# Call the function\\nhello_world()')]"
]
},
"execution_count": 5,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -210,18 +213,18 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 8,
"id": "7db0d486",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='function helloWorld() {\\n console.log(\"Hello, World!\");\\n}'),\n",
" Document(page_content='// Call the function\\nhelloWorld();')]"
"[Document(metadata={}, page_content='function helloWorld() {\\n console.log(\"Hello, World!\");\\n}'),\n",
" Document(metadata={}, page_content='// Call the function\\nhelloWorld();')]"
]
},
"execution_count": 6,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -254,19 +257,19 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 9,
"id": "aee738a4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='function helloWorld(): void {'),\n",
" Document(page_content='console.log(\"Hello, World!\");\\n}'),\n",
" Document(page_content='// Call the function\\nhelloWorld();')]"
"[Document(metadata={}, page_content='function helloWorld(): void {'),\n",
" Document(metadata={}, page_content='console.log(\"Hello, World!\");\\n}'),\n",
" Document(metadata={}, page_content='// Call the function\\nhelloWorld();')]"
]
},
"execution_count": 7,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -300,7 +303,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 10,
"id": "ac9295d3",
"metadata": {},
"outputs": [],
@@ -321,7 +324,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 11,
"id": "bfa1771b-d4b0-48f8-a949-5537cd1df0dd",
"metadata": {},
"outputs": [
@@ -337,7 +340,7 @@
" Document(metadata={}, page_content='are extremely open to contributions.')]"
]
},
"execution_count": 3,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@@ -362,7 +365,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 12,
"id": "77d1049d",
"metadata": {},
"outputs": [],
@@ -389,38 +392,38 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 13,
"id": "4dbc47e1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='\\\\documentclass{article}\\n\\n\\x08egin{document}\\n\\n\\\\maketitle'),\n",
" Document(page_content='\\\\section{Introduction}'),\n",
" Document(page_content='Large language models (LLMs) are a type of machine learning'),\n",
" Document(page_content='model that can be trained on vast amounts of text data to'),\n",
" Document(page_content='generate human-like language. In recent years, LLMs have'),\n",
" Document(page_content='made significant advances in a variety of natural language'),\n",
" Document(page_content='processing tasks, including language translation, text'),\n",
" Document(page_content='generation, and sentiment analysis.'),\n",
" Document(page_content='\\\\subsection{History of LLMs}'),\n",
" Document(page_content='The earliest LLMs were developed in the 1980s and 1990s,'),\n",
" Document(page_content='but they were limited by the amount of data that could be'),\n",
" Document(page_content='processed and the computational power available at the'),\n",
" Document(page_content='time. In the past decade, however, advances in hardware and'),\n",
" Document(page_content='software have made it possible to train LLMs on massive'),\n",
" Document(page_content='datasets, leading to significant improvements in'),\n",
" Document(page_content='performance.'),\n",
" Document(page_content='\\\\subsection{Applications of LLMs}'),\n",
" Document(page_content='LLMs have many applications in industry, including'),\n",
" Document(page_content='chatbots, content creation, and virtual assistants. They'),\n",
" Document(page_content='can also be used in academia for research in linguistics,'),\n",
" Document(page_content='psychology, and computational linguistics.'),\n",
" Document(page_content='\\\\end{document}')]"
"[Document(metadata={}, page_content='\\\\documentclass{article}\\n\\n\\x08egin{document}\\n\\n\\\\maketitle'),\n",
" Document(metadata={}, page_content='\\\\section{Introduction}'),\n",
" Document(metadata={}, page_content='Large language models (LLMs) are a type of machine learning'),\n",
" Document(metadata={}, page_content='model that can be trained on vast amounts of text data to'),\n",
" Document(metadata={}, page_content='generate human-like language. In recent years, LLMs have'),\n",
" Document(metadata={}, page_content='made significant advances in a variety of natural language'),\n",
" Document(metadata={}, page_content='processing tasks, including language translation, text'),\n",
" Document(metadata={}, page_content='generation, and sentiment analysis.'),\n",
" Document(metadata={}, page_content='\\\\subsection{History of LLMs}'),\n",
" Document(metadata={}, page_content='The earliest LLMs were developed in the 1980s and 1990s,'),\n",
" Document(metadata={}, page_content='but they were limited by the amount of data that could be'),\n",
" Document(metadata={}, page_content='processed and the computational power available at the'),\n",
" Document(metadata={}, page_content='time. In the past decade, however, advances in hardware and'),\n",
" Document(metadata={}, page_content='software have made it possible to train LLMs on massive'),\n",
" Document(metadata={}, page_content='datasets, leading to significant improvements in'),\n",
" Document(metadata={}, page_content='performance.'),\n",
" Document(metadata={}, page_content='\\\\subsection{Applications of LLMs}'),\n",
" Document(metadata={}, page_content='LLMs have many applications in industry, including'),\n",
" Document(metadata={}, page_content='chatbots, content creation, and virtual assistants. They'),\n",
" Document(metadata={}, page_content='can also be used in academia for research in linguistics,'),\n",
" Document(metadata={}, page_content='psychology, and computational linguistics.'),\n",
" Document(metadata={}, page_content='\\\\end{document}')]"
]
},
"execution_count": 11,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@@ -445,7 +448,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 14,
"id": "0fc78794",
"metadata": {},
"outputs": [],
@@ -479,29 +482,29 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 15,
"id": "e3e3fca1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='<!DOCTYPE html>\\n<html>'),\n",
" Document(page_content='<head>\\n <title>🦜️🔗 LangChain</title>'),\n",
" Document(page_content='<style>\\n body {\\n font-family: Aria'),\n",
" Document(page_content='l, sans-serif;\\n }\\n h1 {'),\n",
" Document(page_content='color: darkblue;\\n }\\n </style>\\n </head'),\n",
" Document(page_content='>'),\n",
" Document(page_content='<body>'),\n",
" Document(page_content='<div>\\n <h1>🦜️🔗 LangChain</h1>'),\n",
" Document(page_content='<p>⚡ Building applications with LLMs through composability ⚡'),\n",
" Document(page_content='</p>\\n </div>'),\n",
" Document(page_content='<div>\\n As an open-source project in a rapidly dev'),\n",
" Document(page_content='eloping field, we are extremely open to contributions.'),\n",
" Document(page_content='</div>\\n </body>\\n</html>')]"
"[Document(metadata={}, page_content='<!DOCTYPE html>\\n<html>'),\n",
" Document(metadata={}, page_content='<head>\\n <title>🦜️🔗 LangChain</title>'),\n",
" Document(metadata={}, page_content='<style>\\n body {\\n font-family: Aria'),\n",
" Document(metadata={}, page_content='l, sans-serif;\\n }\\n h1 {'),\n",
" Document(metadata={}, page_content='color: darkblue;\\n }\\n </style>\\n </head'),\n",
" Document(metadata={}, page_content='>'),\n",
" Document(metadata={}, page_content='<body>'),\n",
" Document(metadata={}, page_content='<div>\\n <h1>🦜️🔗 LangChain</h1>'),\n",
" Document(metadata={}, page_content='<p>⚡ Building applications with LLMs through composability ⚡'),\n",
" Document(metadata={}, page_content='</p>\\n </div>'),\n",
" Document(metadata={}, page_content='<div>\\n As an open-source project in a rapidly dev'),\n",
" Document(metadata={}, page_content='eloping field, we are extremely open to contributions.'),\n",
" Document(metadata={}, page_content='</div>\\n </body>\\n</html>')]"
]
},
"execution_count": 13,
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@@ -525,18 +528,18 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 16,
"id": "49a1df11",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='pragma solidity ^0.8.20;'),\n",
" Document(page_content='contract HelloWorld {\\n function add(uint a, uint b) pure public returns(uint) {\\n return a + b;\\n }\\n}')]"
"[Document(metadata={}, page_content='pragma solidity ^0.8.20;'),\n",
" Document(metadata={}, page_content='contract HelloWorld {\\n function add(uint a, uint b) pure public returns(uint) {\\n return a + b;\\n }\\n}')]"
]
},
"execution_count": 14,
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
@@ -569,21 +572,21 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 17,
"id": "1524ae0f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='using System;'),\n",
" Document(page_content='class Program\\n{\\n static void Main()\\n {\\n int age = 30; // Change the age value as needed'),\n",
" Document(page_content='// Categorize the age without any console output\\n if (age < 18)\\n {\\n // Age is under 18'),\n",
" Document(page_content='}\\n else if (age >= 18 && age < 65)\\n {\\n // Age is an adult\\n }\\n else\\n {'),\n",
" Document(page_content='// Age is a senior citizen\\n }\\n }\\n}')]"
"[Document(metadata={}, page_content='using System;'),\n",
" Document(metadata={}, page_content='class Program\\n{\\n static void Main()\\n {\\n int age = 30; // Change the age value as needed'),\n",
" Document(metadata={}, page_content='// Categorize the age without any console output\\n if (age < 18)\\n {\\n // Age is under 18'),\n",
" Document(metadata={}, page_content='}\\n else if (age >= 18 && age < 65)\\n {\\n // Age is an adult\\n }\\n else\\n {'),\n",
" Document(metadata={}, page_content='// Age is a senior citizen\\n }\\n }\\n}')]"
]
},
"execution_count": 15,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -631,20 +634,20 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 18,
"id": "688185b5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='main :: IO ()'),\n",
" Document(page_content='main = do\\n putStrLn \"Hello, World!\"\\n-- Some'),\n",
" Document(page_content='sample functions\\nadd :: Int -> Int -> Int\\nadd x y'),\n",
" Document(page_content='= x + y')]"
"[Document(metadata={}, page_content='main :: IO ()'),\n",
" Document(metadata={}, page_content='main = do\\n putStrLn \"Hello, World!\"\\n-- Some'),\n",
" Document(metadata={}, page_content='sample functions\\nadd :: Int -> Int -> Int\\nadd x y'),\n",
" Document(metadata={}, page_content='= x + y')]"
]
},
"execution_count": 3,
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
@@ -676,23 +679,23 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 19,
"id": "90c66e7e-87a5-4a81-bece-7949aabf2369",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='<?php\\nnamespace foo;'),\n",
" Document(page_content='class Hello {'),\n",
" Document(page_content='public function __construct() { }\\n}'),\n",
" Document(page_content='function hello() {\\n echo \"Hello World!\";\\n}'),\n",
" Document(page_content='interface Human {\\n public function breath();\\n}'),\n",
" Document(page_content='trait Foo { }\\nenum Color\\n{\\n case Red;'),\n",
" Document(page_content='case Blue;\\n}')]"
"[Document(metadata={}, page_content='<?php\\nnamespace foo;'),\n",
" Document(metadata={}, page_content='class Hello {'),\n",
" Document(metadata={}, page_content='public function __construct() { }\\n}'),\n",
" Document(metadata={}, page_content='function hello() {\\n echo \"Hello World!\";\\n}'),\n",
" Document(metadata={}, page_content='interface Human {\\n public function breath();\\n}'),\n",
" Document(metadata={}, page_content='trait Foo { }\\nenum Color\\n{\\n case Red;'),\n",
" Document(metadata={}, page_content='case Blue;\\n}')]"
]
},
"execution_count": 2,
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
@@ -733,10 +736,25 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 20,
"id": "7e6893ad",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"[Document(metadata={}, page_content='$directoryPath = Get-Location\\n\\n$items = Get-ChildItem -Path $directoryPath'),\n",
" Document(metadata={}, page_content='$files = $items | Where-Object { -not $_.PSIsContainer }'),\n",
" Document(metadata={}, page_content='$sortedFiles = $files | Sort-Object LastWriteTime'),\n",
" Document(metadata={}, page_content='foreach ($file in $sortedFiles) {'),\n",
" Document(metadata={}, page_content='Write-Output (\"Name: \" + $file.Name + \" | Last Write Time: \" + $file.LastWriteTime)\\n}')]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"POWERSHELL_CODE = \"\"\"\n",
"$directoryPath = Get-Location\n",
@@ -757,11 +775,58 @@
"powershell_docs = powershell_splitter.create_documents([POWERSHELL_CODE])\n",
"powershell_docs"
]
},
{
"cell_type": "markdown",
"id": "3ef77730",
"metadata": {},
"source": [
"## Visual Basic 6"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "1dc3c740",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(metadata={}, page_content='Option Explicit'),\n",
" Document(metadata={}, page_content='Public Sub HelloWorld()\\n MsgBox \"Hello, World!\"\\nEnd Sub'),\n",
" Document(metadata={}, page_content='Private Function Add(a As Integer, b As Integer) As Integer\\n Add = a + b\\nEnd Function')]"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"VISUALBASIC6_CODE = \"\"\"Option Explicit\n",
"\n",
"Public Sub HelloWorld()\n",
" MsgBox \"Hello, World!\"\n",
"End Sub\n",
"\n",
"Private Function Add(a As Integer, b As Integer) As Integer\n",
" Add = a + b\n",
"End Function\n",
"\"\"\"\n",
"visualbasic6_splitter = RecursiveCharacterTextSplitter.from_language(\n",
" Language.VISUALBASIC6,\n",
" chunk_size=128,\n",
" chunk_overlap=0,\n",
")\n",
"visualbasic6_docs = visualbasic6_splitter.create_documents([VISUALBASIC6_CODE])\n",
"visualbasic6_docs"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "langchain",
"language": "python",
"name": "python3"
},
@@ -775,7 +840,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
"version": "3.10.16"
}
},
"nbformat": 4,