mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-01 02:50:47 +00:00
Community[minor]: Add language parser for Elixir (#22742)
Hi 👋 First off, thanks a ton for your work on this 💚 Really appreciate what you're providing here for the community. ## Description This PR adds a basic language parser for the [Elixir](https://elixir-lang.org/) programming language. The parser code is based upon the approach outlined in https://github.com/langchain-ai/langchain/pull/13318: it uses `tree-sitter` under the hood and aligns with all the other `tree-sitter`-based parsers added in that PR. The `CHUNK_QUERY` I'm using here is probably not the most sophisticated one, but it worked for my application. It's a starting point to provide "core" parsing support for Elixir in LangChain. It enables people to use the language parser in real-world applications, which may then lead to further tweaking of the queries. I consider this PR just the groundwork. - **Dependencies:** requires `tree-sitter` and `tree-sitter-languages` from the extended dependencies - **Twitter handle:** `@bitcrowd` ## Checklist - [x] **PR title**: "package: description" - [x] **Add tests and docs** - [x] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. <!-- If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. -->
This commit is contained in:
@@ -17,6 +17,7 @@
|
||||
"- C++ (*)\n",
|
||||
"- C# (*)\n",
|
||||
"- COBOL\n",
|
||||
"- Elixir\n",
|
||||
"- Go (*)\n",
|
||||
"- Java (*)\n",
|
||||
"- JavaScript (requires package `esprima`)\n",
|
||||
|
@@ -0,0 +1,35 @@
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501
|
||||
TreeSitterSegmenter,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from tree_sitter import Language
|
||||
|
||||
|
||||
# Tree-sitter query selecting the Elixir constructs that are extracted as chunks:
#   @module   - calls whose target is ``defmodule``, ``defprotocol`` or ``defimpl``
#   @function - calls whose target is ``def``, ``defmacro``, ``defmacrop`` or ``defp``
#   @comment  - ``@moduledoc``, ``@typedoc`` and ``@doc`` attributes
# ``@_identifier`` is a helper capture that only feeds the ``#any-of?`` predicates.
# Each predicate argument is whitespace-separated so the list of names is
# unambiguous to read and does not depend on lenient query parsing.
CHUNK_QUERY = """
    [
        (call target: ((identifier) @_identifier
            (#any-of? @_identifier "defmodule" "defprotocol" "defimpl"))) @module
        (call target: ((identifier) @_identifier
            (#any-of? @_identifier "def" "defmacro" "defmacrop" "defp"))) @function
        (unary_operator
            operator: "@"
            operand: (call target: ((identifier) @_identifier
                (#any-of? @_identifier "moduledoc" "typedoc" "doc")))) @comment
    ]
""".strip()
|
||||
|
||||
|
||||
class ElixirSegmenter(TreeSitterSegmenter):
    """Tree-sitter based code segmenter for Elixir sources."""

    def get_language(self) -> "Language":
        """Return the ``elixir`` grammar provided by ``tree_sitter_languages``."""
        # Imported inside the method, so the package is only needed once a
        # grammar is actually requested.
        import tree_sitter_languages

        return tree_sitter_languages.get_language("elixir")

    def get_chunk_query(self) -> str:
        """Return the module-level ``CHUNK_QUERY`` used to pick out chunks."""
        return CHUNK_QUERY

    def make_line_comment(self, text: str) -> str:
        """Return ``text`` rendered as an Elixir line comment (``# `` prefix)."""
        return "# {}".format(text)
|
@@ -10,6 +10,7 @@ from langchain_community.document_loaders.parsers.language.c import CSegmenter
|
||||
from langchain_community.document_loaders.parsers.language.cobol import CobolSegmenter
|
||||
from langchain_community.document_loaders.parsers.language.cpp import CPPSegmenter
|
||||
from langchain_community.document_loaders.parsers.language.csharp import CSharpSegmenter
|
||||
from langchain_community.document_loaders.parsers.language.elixir import ElixirSegmenter
|
||||
from langchain_community.document_loaders.parsers.language.go import GoSegmenter
|
||||
from langchain_community.document_loaders.parsers.language.java import JavaSegmenter
|
||||
from langchain_community.document_loaders.parsers.language.javascript import (
|
||||
@@ -44,6 +45,8 @@ LANGUAGE_EXTENSIONS: Dict[str, str] = {
|
||||
"ts": "ts",
|
||||
"java": "java",
|
||||
"php": "php",
|
||||
"ex": "elixir",
|
||||
"exs": "elixir",
|
||||
}
|
||||
|
||||
LANGUAGE_SEGMENTERS: Dict[str, Any] = {
|
||||
@@ -63,6 +66,7 @@ LANGUAGE_SEGMENTERS: Dict[str, Any] = {
|
||||
"ts": TypeScriptSegmenter,
|
||||
"java": JavaSegmenter,
|
||||
"php": PHPSegmenter,
|
||||
"elixir": ElixirSegmenter,
|
||||
}
|
||||
|
||||
Language = Literal[
|
||||
@@ -89,6 +93,7 @@ Language = Literal[
|
||||
"c",
|
||||
"lua",
|
||||
"perl",
|
||||
"elixir",
|
||||
]
|
||||
|
||||
|
||||
@@ -107,6 +112,7 @@ class LanguageParser(BaseBlobParser):
|
||||
- C++: "cpp" (*)
|
||||
- C#: "csharp" (*)
|
||||
- COBOL: "cobol"
|
||||
- Elixir: "elixir"
|
||||
- Go: "go" (*)
|
||||
- Java: "java" (*)
|
||||
- JavaScript: "js" (requires package `esprima`)
|
||||
|
@@ -0,0 +1,57 @@
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.elixir import ElixirSegmenter
|
||||
|
||||
|
||||
@pytest.mark.requires("tree_sitter", "tree_sitter_languages")
class TestElixirSegmenter(unittest.TestCase):
    """Exercise ``ElixirSegmenter`` against a small Elixir snippet."""

    def setUp(self) -> None:
        """Build the sample source and the outputs expected from it."""
        # Nested lines carry the same one-space indent that appears in
        # ``expected_extracted_code``: the extraction test compares the
        # returned chunks to that list verbatim, so the two must agree.
        self.example_code = "\n".join(
            [
                '@doc "some comment"',
                "def foo do",
                " i = 0",
                "end",
                "",
                "defmodule M do",
                " def hi do",
                " i = 2",
                " end",
                "",
                " defp wave do",
                " :ok",
                " end",
                "end",
            ]
        )

        # One placeholder comment per top-level chunk; the blank line that
        # separates ``def foo`` from ``defmodule M`` is kept as-is.
        self.expected_simplified_code = "\n".join(
            [
                '# Code for: @doc "some comment"',
                "# Code for: def foo do",
                "",
                "# Code for: defmodule M do",
            ]
        )

        self.expected_extracted_code = [
            '@doc "some comment"',
            "def foo do\n i = 0\nend",
            "defmodule M do\n"
            " def hi do\n"
            " i = 2\n"
            " end\n\n"
            " defp wave do\n"
            " :ok\n"
            " end\n"
            "end",
        ]

    def test_is_valid(self) -> None:
        """A well-formed definition is valid; a bare token soup is not."""
        self.assertTrue(ElixirSegmenter("def a do; end").is_valid())
        self.assertFalse(ElixirSegmenter("a b c 1 2 3").is_valid())

    def test_extract_functions_classes(self) -> None:
        """The doc attribute, the function and the module are each one chunk."""
        segmenter = ElixirSegmenter(self.example_code)
        extracted_code = segmenter.extract_functions_classes()
        self.assertEqual(len(extracted_code), 3)
        self.assertEqual(extracted_code, self.expected_extracted_code)

    def test_simplify_code(self) -> None:
        """Each extracted chunk collapses to a single ``# Code for:`` line."""
        segmenter = ElixirSegmenter(self.example_code)
        simplified_code = segmenter.simplify_code()
        self.assertEqual(simplified_code, self.expected_simplified_code)
|
@@ -293,6 +293,7 @@ class Language(str, Enum):
|
||||
LUA = "lua"
|
||||
PERL = "perl"
|
||||
HASKELL = "haskell"
|
||||
ELIXIR = "elixir"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
|
@@ -343,6 +343,30 @@ class RecursiveCharacterTextSplitter(TextSplitter):
|
||||
" ",
|
||||
"",
|
||||
]
|
||||
elif language == Language.ELIXIR:
|
||||
return [
|
||||
# Split along function, macro, protocol, and module definitions
|
||||
"\ndef ",
|
||||
"\ndefp ",
|
||||
"\ndefmodule ",
|
||||
"\ndefprotocol ",
|
||||
"\ndefmacro ",
|
||||
"\ndefmacrop ",
|
||||
# Split along control flow statements
|
||||
"\nif ",
|
||||
"\nunless ",
|
||||
"\nwhile ",
|
||||
"\ncase ",
|
||||
"\ncond ",
|
||||
"\nwith ",
|
||||
"\nfor ",
|
||||
"\ndo ",
|
||||
# Split by the normal type of lines
|
||||
"\n\n",
|
||||
"\n",
|
||||
" ",
|
||||
"",
|
||||
]
|
||||
elif language == Language.RUST:
|
||||
return [
|
||||
# Split along function definitions
|
||||
|
Reference in New Issue
Block a user