mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-05 04:38:26 +00:00
chore: add kotlin code splitter (#11364)
<!-- Thank you for contributing to LangChain! Replace this entire comment with: - **Description:** a description of the change, - **Issue:** the issue # it fixes (if applicable), - **Dependencies:** any dependencies required for this change, - **Tag maintainer:** for a quicker response, tag the relevant maintainer (see below), - **Twitter handle:** we announce bigger features on Twitter. If your PR gets announced, and you'd like a mention, we'll gladly shout you out! Please make sure your PR is passing linting and testing before submitting. Run `make format`, `make lint` and `make test` to check this locally. See contribution guidelines for more information on how to write/run tests, lint, etc: https://github.com/langchain-ai/langchain/blob/master/.github/CONTRIBUTING.md If you're adding a new integration, please include: 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/extras` directory. If no one reviews your PR within a few days, please @-mention one of @baskaryan, @eyurtsev, @hwchase17. -->
- **Description:** Adds Kotlin support to `RecursiveCharacterTextSplitter`: a new `Language.KOTLIN` enum member, a Kotlin-specific separator list, and a unit test for the splitter.
---------
Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
parent
b93a08079e
commit
0a4baca291
@ -17,6 +17,7 @@ from langchain.text_splitter import (
|
||||
['cpp',
|
||||
'go',
|
||||
'java',
|
||||
'kotlin',
|
||||
'js',
|
||||
'ts',
|
||||
'php',
|
||||
|
@ -614,6 +614,7 @@ class Language(str, Enum):
|
||||
CPP = "cpp"
|
||||
GO = "go"
|
||||
JAVA = "java"
|
||||
KOTLIN = "kotlin"
|
||||
JS = "js"
|
||||
TS = "ts"
|
||||
PHP = "php"
|
||||
@ -762,6 +763,32 @@ class RecursiveCharacterTextSplitter(TextSplitter):
|
||||
" ",
|
||||
"",
|
||||
]
|
||||
elif language == Language.KOTLIN:
|
||||
return [
|
||||
# Split along class definitions
|
||||
"\nclass ",
|
||||
# Split along method definitions
|
||||
"\npublic ",
|
||||
"\nprotected ",
|
||||
"\nprivate ",
|
||||
"\ninternal ",
|
||||
"\ncompanion ",
|
||||
"\nfun ",
|
||||
"\nval ",
|
||||
"\nvar ",
|
||||
# Split along control flow statements
|
||||
"\nif ",
|
||||
"\nfor ",
|
||||
"\nwhile ",
|
||||
"\nwhen ",
|
||||
"\ncase ",
|
||||
"\nelse ",
|
||||
# Split by the normal type of lines
|
||||
"\n\n",
|
||||
"\n",
|
||||
" ",
|
||||
"",
|
||||
]
|
||||
elif language == Language.JS:
|
||||
return [
|
||||
# Split along function definitions
|
||||
|
@ -525,6 +525,38 @@ public class HelloWorld {
|
||||
]
|
||||
|
||||
|
||||
def test_kotlin_code_splitter() -> None:
    """Check that a small Kotlin class is split into the expected chunks.

    Builds a splitter via ``RecursiveCharacterTextSplitter.from_language``
    for ``Language.KOTLIN`` with the module-level ``CHUNK_SIZE`` and no
    overlap, splits a "Hello, World!" class, and compares the result with a
    hard-coded list of chunks.
    """
    splitter = RecursiveCharacterTextSplitter.from_language(
        Language.KOTLIN, chunk_size=CHUNK_SIZE, chunk_overlap=0
    )
    # NOTE(review): the indentation of this sample was lost in the scraped
    # view and has been reconstructed as 4 spaces per nesting level; confirm
    # against the upstream test before relying on the exact bytes.
    code = """
class HelloWorld {
    companion object {
        @JvmStatic
        fun main(args: Array<String>) {
            println("Hello, World!")
        }
    }
}
    """
    chunks = splitter.split_text(code)
    # NOTE(review): these expectations presume CHUNK_SIZE is 16 — confirm
    # against the constant defined at the top of the test module.
    assert chunks == [
        "class",
        "HelloWorld {",
        "companion",
        "object {",
        "@JvmStatic",
        "fun",
        "main(args:",
        "Array<String>)",
        "{",
        'println("Hello,',
        'World!")',
        # Two closing braces end up in one chunk, so the inner brace keeps
        # the indentation it has in the sample above.
        "}\n    }",
        "}",
    ]
|
||||
|
||||
|
||||
def test_csharp_code_splitter() -> None:
|
||||
splitter = RecursiveCharacterTextSplitter.from_language(
|
||||
Language.CSHARP, chunk_size=CHUNK_SIZE, chunk_overlap=0
|
||||
|
Loading…
Reference in New Issue
Block a user