diff --git a/docs/docs/how_to/code_splitter.ipynb b/docs/docs/how_to/code_splitter.ipynb index 3da6622478b..efa4fde9a79 100644 --- a/docs/docs/how_to/code_splitter.ipynb +++ b/docs/docs/how_to/code_splitter.ipynb @@ -63,7 +63,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "id": "a9e37aa1", "metadata": {}, "outputs": [], @@ -84,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "e21a2434", "metadata": {}, "outputs": [ @@ -114,10 +114,13 @@ " 'c',\n", " 'lua',\n", " 'perl',\n", - " 'haskell']" + " 'haskell',\n", + " 'elixir',\n", + " 'powershell',\n", + " 'visualbasic6']" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -136,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "id": "c92fb913", "metadata": {}, "outputs": [ @@ -146,7 +149,7 @@ "['\\nclass ', '\\ndef ', '\\n\\tdef ', '\\n\\n', '\\n', ' ', '']" ] }, - "execution_count": 3, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -168,18 +171,18 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "id": "a58512b9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Document(page_content='def hello_world():\\n print(\"Hello, World!\")'),\n", - " Document(page_content='# Call the function\\nhello_world()')]" + "[Document(metadata={}, page_content='def hello_world():\\n print(\"Hello, World!\")'),\n", + " Document(metadata={}, page_content='# Call the function\\nhello_world()')]" ] }, - "execution_count": 5, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -210,18 +213,18 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "id": "7db0d486", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Document(page_content='function helloWorld() {\\n console.log(\"Hello, World!\");\\n}'),\n", - " Document(page_content='// Call the function\\nhelloWorld();')]" + "[Document(metadata={}, page_content='function helloWorld() {\\n console.log(\"Hello, World!\");\\n}'),\n", + " Document(metadata={}, page_content='// Call the function\\nhelloWorld();')]" ] }, - "execution_count": 6, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -254,19 +257,19 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "id": "aee738a4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Document(page_content='function helloWorld(): void {'),\n", - " Document(page_content='console.log(\"Hello, World!\");\\n}'),\n", - " Document(page_content='// Call the function\\nhelloWorld();')]" + "[Document(metadata={}, page_content='function helloWorld(): void {'),\n", + " Document(metadata={}, page_content='console.log(\"Hello, World!\");\\n}'),\n", + " Document(metadata={}, page_content='// Call the function\\nhelloWorld();')]" ] }, - "execution_count": 7, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -300,7 +303,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 10, "id": "ac9295d3", "metadata": {}, "outputs": [], @@ -321,7 +324,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 11, "id": "bfa1771b-d4b0-48f8-a949-5537cd1df0dd", "metadata": {}, "outputs": [ @@ -337,7 +340,7 @@ " Document(metadata={}, page_content='are extremely open to contributions.')]" ] }, - "execution_count": 3, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -362,7 +365,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "id": "77d1049d", "metadata": {}, "outputs": [], @@ -389,38 +392,38 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "id": "4dbc47e1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Document(page_content='\\\\documentclass{article}\\n\\n\\x08egin{document}\\n\\n\\\\maketitle'),\n", - " Document(page_content='\\\\section{Introduction}'),\n", - " Document(page_content='Large language models (LLMs) are a type of machine learning'),\n", - " Document(page_content='model that can be trained on vast amounts of text data to'),\n", - " Document(page_content='generate human-like language. In recent years, LLMs have'),\n", - " Document(page_content='made significant advances in a variety of natural language'),\n", - " Document(page_content='processing tasks, including language translation, text'),\n", - " Document(page_content='generation, and sentiment analysis.'),\n", - " Document(page_content='\\\\subsection{History of LLMs}'),\n", - " Document(page_content='The earliest LLMs were developed in the 1980s and 1990s,'),\n", - " Document(page_content='but they were limited by the amount of data that could be'),\n", - " Document(page_content='processed and the computational power available at the'),\n", - " Document(page_content='time. In the past decade, however, advances in hardware and'),\n", - " Document(page_content='software have made it possible to train LLMs on massive'),\n", - " Document(page_content='datasets, leading to significant improvements in'),\n", - " Document(page_content='performance.'),\n", - " Document(page_content='\\\\subsection{Applications of LLMs}'),\n", - " Document(page_content='LLMs have many applications in industry, including'),\n", - " Document(page_content='chatbots, content creation, and virtual assistants. They'),\n", - " Document(page_content='can also be used in academia for research in linguistics,'),\n", - " Document(page_content='psychology, and computational linguistics.'),\n", - " Document(page_content='\\\\end{document}')]" + "[Document(metadata={}, page_content='\\\\documentclass{article}\\n\\n\\x08egin{document}\\n\\n\\\\maketitle'),\n", + " Document(metadata={}, page_content='\\\\section{Introduction}'),\n", + " Document(metadata={}, page_content='Large language models (LLMs) are a type of machine learning'),\n", + " Document(metadata={}, page_content='model that can be trained on vast amounts of text data to'),\n", + " Document(metadata={}, page_content='generate human-like language. In recent years, LLMs have'),\n", + " Document(metadata={}, page_content='made significant advances in a variety of natural language'),\n", + " Document(metadata={}, page_content='processing tasks, including language translation, text'),\n", + " Document(metadata={}, page_content='generation, and sentiment analysis.'),\n", + " Document(metadata={}, page_content='\\\\subsection{History of LLMs}'),\n", + " Document(metadata={}, page_content='The earliest LLMs were developed in the 1980s and 1990s,'),\n", + " Document(metadata={}, page_content='but they were limited by the amount of data that could be'),\n", + " Document(metadata={}, page_content='processed and the computational power available at the'),\n", + " Document(metadata={}, page_content='time. In the past decade, however, advances in hardware and'),\n", + " Document(metadata={}, page_content='software have made it possible to train LLMs on massive'),\n", + " Document(metadata={}, page_content='datasets, leading to significant improvements in'),\n", + " Document(metadata={}, page_content='performance.'),\n", + " Document(metadata={}, page_content='\\\\subsection{Applications of LLMs}'),\n", + " Document(metadata={}, page_content='LLMs have many applications in industry, including'),\n", + " Document(metadata={}, page_content='chatbots, content creation, and virtual assistants. They'),\n", + " Document(metadata={}, page_content='can also be used in academia for research in linguistics,'),\n", + " Document(metadata={}, page_content='psychology, and computational linguistics.'),\n", + " Document(metadata={}, page_content='\\\\end{document}')]" ] }, - "execution_count": 11, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -445,7 +448,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "id": "0fc78794", "metadata": {}, "outputs": [], @@ -479,29 +482,29 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "id": "e3e3fca1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Document(page_content='\\n'),\n", - " Document(page_content='\\n 🦜️🔗 LangChain'),\n", - " Document(page_content='\\n None: + splitter = RecursiveCharacterTextSplitter.from_language( + Language.VISUALBASIC6, + chunk_size=CHUNK_SIZE, + chunk_overlap=0, + ) + chunks = splitter.split_text(FAKE_VISUALBASIC6_TEXT) + + assert chunks == [ + "Option Explicit", + "Public Function", + "SumTwoIntegers(", + "ByVal", + "a As Integer,", + "ByVal b As", + "Integer) As", + "Integer", + "SumTwoIntegers", + "= a + b", + "End Function", + "Public Sub", + "Main()", + "Dim i As", + "Integer", + "Dim limit", + "As Integer", + "i = 0", + "limit = 50", + "While i <", + "limit", + "i =", + "SumTwoIntegers(", + "i,", + "1)", + "If i =", + "limit \\ 2 Then", + 'MsgBox "Halfway', + 'there! i = " &', + "i", + "End If", + "Wend", + "MsgBox", + '"Done! Final', + 'value of i: " &', + "i", + "End Sub", + ] + + def custom_iframe_extractor(iframe_tag: Any) -> str: iframe_src = iframe_tag.get("src", "") return f"[iframe:{iframe_src}]({iframe_src})"