mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-07 05:30:39 +00:00
chore: add kotlin code splitter (#11364)
<!-- Thank you for contributing to LangChain! Replace this entire comment with: - **Description:** a description of the change, - **Issue:** the issue # it fixes (if applicable), - **Dependencies:** any dependencies required for this change, - **Tag maintainer:** for a quicker response, tag the relevant maintainer (see below), - **Twitter handle:** we announce bigger features on Twitter. If your PR gets announced, and you'd like a mention, we'll gladly shout you out! Please make sure your PR is passing linting and testing before submitting. Run `make format`, `make lint` and `make test` to check this locally. See contribution guidelines for more information on how to write/run tests, lint, etc: https://github.com/langchain-ai/langchain/blob/master/.github/CONTRIBUTING.md If you're adding a new integration, please include: 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/extras` directory. If no one reviews your PR within a few days, please @-mention one of @baskaryan, @eyurtsev, @hwchase17. -->

- **Description:** Adds Kotlin support to the text splitter: a `Language.KOTLIN` enum member, Kotlin-specific separators in `RecursiveCharacterTextSplitter`, and a unit test (`test_kotlin_code_splitter`).

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
parent
b93a08079e
commit
0a4baca291
@ -17,6 +17,7 @@ from langchain.text_splitter import (
|
|||||||
['cpp',
|
['cpp',
|
||||||
'go',
|
'go',
|
||||||
'java',
|
'java',
|
||||||
|
'kotlin',
|
||||||
'js',
|
'js',
|
||||||
'ts',
|
'ts',
|
||||||
'php',
|
'php',
|
||||||
|
@ -614,6 +614,7 @@ class Language(str, Enum):
|
|||||||
CPP = "cpp"
|
CPP = "cpp"
|
||||||
GO = "go"
|
GO = "go"
|
||||||
JAVA = "java"
|
JAVA = "java"
|
||||||
|
KOTLIN = "kotlin"
|
||||||
JS = "js"
|
JS = "js"
|
||||||
TS = "ts"
|
TS = "ts"
|
||||||
PHP = "php"
|
PHP = "php"
|
||||||
@ -762,6 +763,32 @@ class RecursiveCharacterTextSplitter(TextSplitter):
|
|||||||
" ",
|
" ",
|
||||||
"",
|
"",
|
||||||
]
|
]
|
||||||
|
elif language == Language.KOTLIN:
|
||||||
|
return [
|
||||||
|
# Split along class definitions
|
||||||
|
"\nclass ",
|
||||||
|
# Split along method definitions
|
||||||
|
"\npublic ",
|
||||||
|
"\nprotected ",
|
||||||
|
"\nprivate ",
|
||||||
|
"\ninternal ",
|
||||||
|
"\ncompanion ",
|
||||||
|
"\nfun ",
|
||||||
|
"\nval ",
|
||||||
|
"\nvar ",
|
||||||
|
# Split along control flow statements
|
||||||
|
"\nif ",
|
||||||
|
"\nfor ",
|
||||||
|
"\nwhile ",
|
||||||
|
"\nwhen ",
|
||||||
|
"\ncase ",
|
||||||
|
"\nelse ",
|
||||||
|
# Split by the normal type of lines
|
||||||
|
"\n\n",
|
||||||
|
"\n",
|
||||||
|
" ",
|
||||||
|
"",
|
||||||
|
]
|
||||||
elif language == Language.JS:
|
elif language == Language.JS:
|
||||||
return [
|
return [
|
||||||
# Split along function definitions
|
# Split along function definitions
|
||||||
|
@ -525,6 +525,38 @@ public class HelloWorld {
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_kotlin_code_splitter() -> None:
|
||||||
|
splitter = RecursiveCharacterTextSplitter.from_language(
|
||||||
|
Language.KOTLIN, chunk_size=CHUNK_SIZE, chunk_overlap=0
|
||||||
|
)
|
||||||
|
code = """
|
||||||
|
class HelloWorld {
|
||||||
|
companion object {
|
||||||
|
@JvmStatic
|
||||||
|
fun main(args: Array<String>) {
|
||||||
|
println("Hello, World!")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
chunks = splitter.split_text(code)
|
||||||
|
assert chunks == [
|
||||||
|
"class",
|
||||||
|
"HelloWorld {",
|
||||||
|
"companion",
|
||||||
|
"object {",
|
||||||
|
"@JvmStatic",
|
||||||
|
"fun",
|
||||||
|
"main(args:",
|
||||||
|
"Array<String>)",
|
||||||
|
"{",
|
||||||
|
'println("Hello,',
|
||||||
|
'World!")',
|
||||||
|
"}\n }",
|
||||||
|
"}",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def test_csharp_code_splitter() -> None:
|
def test_csharp_code_splitter() -> None:
|
||||||
splitter = RecursiveCharacterTextSplitter.from_language(
|
splitter = RecursiveCharacterTextSplitter.from_language(
|
||||||
Language.CSHARP, chunk_size=CHUNK_SIZE, chunk_overlap=0
|
Language.CSHARP, chunk_size=CHUNK_SIZE, chunk_overlap=0
|
||||||
|
Loading…
Reference in New Issue
Block a user