From b738ccd91eea03abf6ed11444cac297fe8c6b61b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fynn=20Fl=C3=BCgge?= Date: Fri, 29 Sep 2023 01:41:51 +0200 Subject: [PATCH] chore: add support for TypeScript code splitting (#11160) - **Description:** Adds typescript language to `TextSplitter` --------- Co-authored-by: Jacob Lee --- .../text_splitters/code_splitter.mdx | 32 +++++++++++++++++++ libs/langchain/langchain/text_splitter.py | 27 ++++++++++++++++ .../tests/unit_tests/test_text_splitter.py | 27 ++++++++++++++++ 3 files changed, 86 insertions(+) diff --git a/docs/snippets/modules/data_connection/document_transformers/text_splitters/code_splitter.mdx b/docs/snippets/modules/data_connection/document_transformers/text_splitters/code_splitter.mdx index 503ca52a537..e7a2db3d06c 100644 --- a/docs/snippets/modules/data_connection/document_transformers/text_splitters/code_splitter.mdx +++ b/docs/snippets/modules/data_connection/document_transformers/text_splitters/code_splitter.mdx @@ -18,6 +18,7 @@ from langchain.text_splitter import ( 'go', 'java', 'js', + 'ts', 'php', 'proto', 'python', @@ -107,6 +108,37 @@ js_docs +## TS +Here's an example using the TS text splitter: + + +```python +TS_CODE = """ +function helloWorld(): void { + console.log("Hello, World!"); +} + +// Call the function +helloWorld(); +""" + +ts_splitter = RecursiveCharacterTextSplitter.from_language( + language=Language.TS, chunk_size=60, chunk_overlap=0 +) +ts_docs = ts_splitter.create_documents([TS_CODE]) +ts_docs +``` + + + +``` + [Document(page_content='function helloWorld(): void {', metadata={}), + Document(page_content='console.log("Hello, World!");\n}', metadata={}), + Document(page_content='// Call the function\nhelloWorld();', metadata={})] +``` + + + ## Markdown Here's an example using the Markdown text splitter: diff --git a/libs/langchain/langchain/text_splitter.py b/libs/langchain/langchain/text_splitter.py index 2e5f7021f3c..12664014be0 100644 --- a/libs/langchain/langchain/text_splitter.py +++ 
b/libs/langchain/langchain/text_splitter.py @@ -615,6 +615,7 @@ class Language(str, Enum): GO = "go" JAVA = "java" JS = "js" + TS = "ts" PHP = "php" PROTO = "proto" PYTHON = "python" @@ -782,6 +783,32 @@ class RecursiveCharacterTextSplitter(TextSplitter): " ", "", ] + elif language == Language.TS: + return [ + "\nenum ", + "\ninterface ", + "\nnamespace ", + "\ntype ", + # Split along class definitions + "\nclass ", + # Split along function definitions + "\nfunction ", + "\nconst ", + "\nlet ", + "\nvar ", + # Split along control flow statements + "\nif ", + "\nfor ", + "\nwhile ", + "\nswitch ", + "\ncase ", + "\ndefault ", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] elif language == Language.PHP: return [ # Split along function definitions diff --git a/libs/langchain/tests/unit_tests/test_text_splitter.py b/libs/langchain/tests/unit_tests/test_text_splitter.py index 717b28c242a..ce4680c20e4 100644 --- a/libs/langchain/tests/unit_tests/test_text_splitter.py +++ b/libs/langchain/tests/unit_tests/test_text_splitter.py @@ -472,6 +472,33 @@ helloWorld(); ] +def test_typescript_code_splitter() -> None: + splitter = RecursiveCharacterTextSplitter.from_language( + Language.TS, chunk_size=CHUNK_SIZE, chunk_overlap=0 + ) + code = """ +function helloWorld(): void { + console.log("Hello, World!"); +} + +// Call the function +helloWorld(); + """ + chunks = splitter.split_text(code) + assert chunks == [ + "function", + "helloWorld():", + "void {", + 'console.log("He', + "llo,", + 'World!");', + "}", + "// Call the", + "function", + "helloWorld();", + ] + + def test_java_code_splitter() -> None: splitter = RecursiveCharacterTextSplitter.from_language( Language.JAVA, chunk_size=CHUNK_SIZE, chunk_overlap=0