text-splitters[minor]: Added Haskell support in langchain.text_splitter module (#16191)

- **Description:** Adds Haskell language support to the `text_splitter`
module.
  - **Dependencies:** None
  - **Twitter handle:** @nisargtr

If no one reviews your PR within a few days, please @-mention one of
@baskaryan, @eyurtsev, @hwchase17.

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
Nisarg Trivedi
2024-03-30 01:47:50 +05:30
committed by GitHub
parent b7344e3347
commit 1252ccce6f
4 changed files with 120 additions and 8 deletions

View File

@@ -291,6 +291,7 @@ class Language(str, Enum):
C = "c"
LUA = "lua"
PERL = "perl"
HASKELL = "haskell"
@dataclass(frozen=True)

View File

@@ -571,7 +571,45 @@ class RecursiveCharacterTextSplitter(TextSplitter):
" ",
"",
]
elif language == Language.HASKELL:
# Separators returned for Haskell source. The list runs from
# Haskell-specific line prefixes down to generic whitespace and finally
# the empty string.
return [
# Split along the `main` entry point, binding/layout keywords
# (let / in / do / where), and lines starting with `::` or `=`
"\nmain :: ",
"\nmain = ",
"\nlet ",
"\nin ",
"\ndo ",
"\nwhere ",
"\n:: ",
"\n= ",
# Split along type declarations
"\ndata ",
"\nnewtype ",
"\ntype ",
# NOTE(review): "\n:: " already appears in the first group above;
# this second occurrence looks redundant — confirm and deduplicate.
"\n:: ",
# Split along module declarations
"\nmodule ",
# Split along import statements
"\nimport ",
# NOTE(review): "\nimport qualified " starts with "\nimport ", which is
# listed two entries earlier — verify against the splitter's matching
# order whether this longer entry can ever take effect.
"\nqualified ",
"\nimport qualified ",
# Split along typeclass declarations
"\nclass ",
"\ninstance ",
# Split along case expressions
"\ncase ",
# Split along guards in function definitions
"\n| ",
# Split along record field declarations
# NOTE(review): "\ndata " duplicates the entry in the type-declaration
# group above — confirm whether it is intentional.
"\ndata ",
"\n= {",
"\n, ",
# Split by the normal type of lines
"\n\n",
"\n",
" ",
"",
]
else:
raise ValueError(
f"Language {language} is not supported! "

View File

@@ -1248,6 +1248,38 @@ def test_solidity_code_splitter() -> None:
]
# Checks that a RecursiveCharacterTextSplitter built via
# from_language(Language.HASKELL, ...) splits a short Haskell snippet into
# exactly the chunks listed in expected_chunks.
def test_haskell_code_splitter() -> None:
# Build the splitter with the module-level CHUNK_SIZE and no overlap, so
# consecutive chunks never share text.
splitter = RecursiveCharacterTextSplitter.from_language(
Language.HASKELL, chunk_size=CHUNK_SIZE, chunk_overlap=0
)
# Sample input: a `main` action plus a small `add` function with a comment.
# NOTE(review): the literal below is shown here without any leading
# indentation, while the comment on expected_chunks refers to indentation —
# confirm the exact whitespace against the repository file, since the
# expected chunks depend on it.
code = """
main :: IO ()
main = do
putStrLn "Hello, World!"
-- Some sample functions
add :: Int -> Int -> Int
add x y = x + y
"""
# Adjusted expected chunks to account for indentation and newlines
expected_chunks = [
"main ::",
"IO ()",
"main = do",
"putStrLn",
'"Hello, World!"',
"--",
"Some sample",
"functions",
"add :: Int ->",
"Int -> Int",
"add x y = x",
"+ y",
]
# The comparison is order-sensitive: chunks must match one-for-one.
chunks = splitter.split_text(code)
assert chunks == expected_chunks
@pytest.mark.requires("lxml")
def test_html_header_text_splitter(tmp_path: Path) -> None:
splitter = HTMLHeaderTextSplitter(