mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-02 13:08:57 +00:00
Feature/adding csharp support to textsplitter (#10350)
**Description:** Adding C# language support for `RecursiveCharacterTextSplitter`
**Issue:** N/A
**Dependencies:** N/A

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
3e5a143625
commit
4258c23867
@ -627,6 +627,7 @@ class Language(str, Enum):
|
||||
LATEX = "latex"
|
||||
HTML = "html"
|
||||
SOL = "sol"
|
||||
CSHARP = "csharp"
|
||||
|
||||
|
||||
class RecursiveCharacterTextSplitter(TextSplitter):
|
||||
@ -1002,6 +1003,43 @@ class RecursiveCharacterTextSplitter(TextSplitter):
|
||||
"<title",
|
||||
"",
|
||||
]
|
||||
elif language == Language.CSHARP:
|
||||
return [
|
||||
"\ninterface ",
|
||||
"\nenum ",
|
||||
"\nimplements ",
|
||||
"\ndelegate ",
|
||||
"\nevent ",
|
||||
# Split along class definitions
|
||||
"\nclass ",
|
||||
"\nabstract ",
|
||||
# Split along method definitions
|
||||
"\npublic ",
|
||||
"\nprotected ",
|
||||
"\nprivate ",
|
||||
"\nstatic ",
|
||||
"\nreturn ",
|
||||
# Split along control flow statements
|
||||
"\nif ",
|
||||
"\ncontinue ",
|
||||
"\nfor ",
|
||||
"\nforeach ",
|
||||
"\nwhile ",
|
||||
"\nswitch ",
|
||||
"\nbreak ",
|
||||
"\ncase ",
|
||||
"\nelse ",
|
||||
# Split by exceptions
|
||||
"\ntry ",
|
||||
"\nthrow ",
|
||||
"\nfinally ",
|
||||
"\ncatch ",
|
||||
# Split by the normal type of lines
|
||||
"\n\n",
|
||||
"\n",
|
||||
" ",
|
||||
"",
|
||||
]
|
||||
elif language == Language.SOL:
|
||||
return [
|
||||
# Split along compiler information definitions
|
||||
@ -1032,6 +1070,7 @@ class RecursiveCharacterTextSplitter(TextSplitter):
|
||||
" ",
|
||||
"",
|
||||
]
|
||||
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Language {language} is not supported! "
|
||||
|
@ -498,6 +498,73 @@ public class HelloWorld {
|
||||
]
|
||||
|
||||
|
||||
def test_csharp_code_splitter() -> None:
|
||||
splitter = RecursiveCharacterTextSplitter.from_language(
|
||||
Language.CSHARP, chunk_size=CHUNK_SIZE, chunk_overlap=0
|
||||
)
|
||||
code = """
|
||||
using System;
|
||||
class Program
|
||||
{
|
||||
static void Main()
|
||||
{
|
||||
int age = 30; // Change the age value as needed
|
||||
|
||||
// Categorize the age without any console output
|
||||
if (age < 18)
|
||||
{
|
||||
// Age is under 18
|
||||
}
|
||||
else if (age >= 18 && age < 65)
|
||||
{
|
||||
// Age is an adult
|
||||
}
|
||||
else
|
||||
{
|
||||
// Age is a senior citizen
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
chunks = splitter.split_text(code)
|
||||
assert chunks == [
|
||||
"using System;",
|
||||
"class Program\n{",
|
||||
"static void",
|
||||
"Main()",
|
||||
"{",
|
||||
"int age",
|
||||
"= 30; // Change",
|
||||
"the age value",
|
||||
"as needed",
|
||||
"//",
|
||||
"Categorize the",
|
||||
"age without any",
|
||||
"console output",
|
||||
"if (age",
|
||||
"< 18)",
|
||||
"{",
|
||||
"//",
|
||||
"Age is under 18",
|
||||
"}",
|
||||
"else if",
|
||||
"(age >= 18 &&",
|
||||
"age < 65)",
|
||||
"{",
|
||||
"//",
|
||||
"Age is an adult",
|
||||
"}",
|
||||
"else",
|
||||
"{",
|
||||
"//",
|
||||
"Age is a senior",
|
||||
"citizen",
|
||||
"}\n }",
|
||||
"}",
|
||||
]
|
||||
|
||||
|
||||
def test_cpp_code_splitter() -> None:
|
||||
splitter = RecursiveCharacterTextSplitter.from_language(
|
||||
Language.CPP, chunk_size=CHUNK_SIZE, chunk_overlap=0
|
||||
|
Loading…
Reference in New Issue
Block a user