From 058a64c563571615e4a022433266fb14c1666490 Mon Sep 17 00:00:00 2001 From: Max Mulatz Date: Mon, 10 Jun 2024 17:56:57 +0200 Subject: [PATCH] Community[minor]: Add language parser for Elixir (#22742) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hi 👋 First off, thanks a ton for your work on this 💚 Really appreciate what you're providing here for the community. ## Description This PR adds a basic language parser for the [Elixir](https://elixir-lang.org/) programming language. The parser code is based upon the approach outlined in https://github.com/langchain-ai/langchain/pull/13318: it's using `tree-sitter` under the hood and aligns with all the other `tree-sitter` based parses added that PR. The `CHUNK_QUERY` I'm using here is probably not the most sophisticated one, but it worked for my application. It's a starting point to provide "core" parsing support for Elixir in LangChain. It enables people to use the language parser out in real world applications which may then lead to further tweaking of the queries. I consider this PR just the ground work. - **Dependencies:** requires `tree-sitter` and `tree-sitter-languages` from the extended dependencies - **Twitter handle:**`@bitcrowd` ## Checklist - [x] **PR title**: "package: description" - [x] **Add tests and docs** - [x] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. 
--- .../document_loaders/source_code.ipynb | 1 + .../parsers/language/elixir.py | 35 ++++++++++++ .../parsers/language/language_parser.py | 6 ++ .../parsers/language/test_elixir.py | 57 +++++++++++++++++++ .../langchain_text_splitters/base.py | 1 + .../langchain_text_splitters/character.py | 24 ++++++++ 6 files changed, 124 insertions(+) create mode 100644 libs/community/langchain_community/document_loaders/parsers/language/elixir.py create mode 100644 libs/community/tests/unit_tests/document_loaders/parsers/language/test_elixir.py diff --git a/docs/docs/integrations/document_loaders/source_code.ipynb b/docs/docs/integrations/document_loaders/source_code.ipynb index 043feaf75da..e05138d6b27 100644 --- a/docs/docs/integrations/document_loaders/source_code.ipynb +++ b/docs/docs/integrations/document_loaders/source_code.ipynb @@ -17,6 +17,7 @@ "- C++ (*)\n", "- C# (*)\n", "- COBOL\n", + "- Elixir\n", "- Go (*)\n", "- Java (*)\n", "- JavaScript (requires package `esprima`)\n", diff --git a/libs/community/langchain_community/document_loaders/parsers/language/elixir.py b/libs/community/langchain_community/document_loaders/parsers/language/elixir.py new file mode 100644 index 00000000000..780209767d8 --- /dev/null +++ b/libs/community/langchain_community/document_loaders/parsers/language/elixir.py @@ -0,0 +1,35 @@ +from typing import TYPE_CHECKING + +from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501 + TreeSitterSegmenter, +) + +if TYPE_CHECKING: + from tree_sitter import Language + + +CHUNK_QUERY = """ + [ + (call target: ((identifier) @_identifier + (#any-of? @_identifier "defmodule" "defprotocol" "defimpl"))) @module + (call target: ((identifier) @_identifier + (#any-of? @_identifier "def" "defmacro" "defmacrop" "defp"))) @function + (unary_operator operator: "@" operand: (call target: ((identifier) @_identifier + (#any-of? 
@_identifier "moduledoc" "typedoc" "doc"
LanguageParser(BaseBlobParser): - C++: "cpp" (*) - C#: "csharp" (*) - COBOL: "cobol" + - Elixir: "elixir" - Go: "go" (*) - Java: "java" (*) - JavaScript: "js" (requires package `esprima`) diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_elixir.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_elixir.py new file mode 100644 index 00000000000..02d6af92656 --- /dev/null +++ b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_elixir.py @@ -0,0 +1,57 @@ +import unittest + +import pytest + +from langchain_community.document_loaders.parsers.language.elixir import ElixirSegmenter + + +@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +class TestElixirSegmenter(unittest.TestCase): + def setUp(self) -> None: + self.example_code = """@doc "some comment" +def foo do + i = 0 +end + +defmodule M do + def hi do + i = 2 + end + + defp wave do + :ok + end +end""" + + self.expected_simplified_code = """# Code for: @doc "some comment" +# Code for: def foo do + +# Code for: defmodule M do""" + + self.expected_extracted_code = [ + '@doc "some comment"', + "def foo do\n i = 0\nend", + "defmodule M do\n" + " def hi do\n" + " i = 2\n" + " end\n\n" + " defp wave do\n" + " :ok\n" + " end\n" + "end", + ] + + def test_is_valid(self) -> None: + self.assertTrue(ElixirSegmenter("def a do; end").is_valid()) + self.assertFalse(ElixirSegmenter("a b c 1 2 3").is_valid()) + + def test_extract_functions_classes(self) -> None: + segmenter = ElixirSegmenter(self.example_code) + extracted_code = segmenter.extract_functions_classes() + self.assertEqual(len(extracted_code), 3) + self.assertEqual(extracted_code, self.expected_extracted_code) + + def test_simplify_code(self) -> None: + segmenter = ElixirSegmenter(self.example_code) + simplified_code = segmenter.simplify_code() + self.assertEqual(simplified_code, self.expected_simplified_code) diff --git a/libs/text-splitters/langchain_text_splitters/base.py 
b/libs/text-splitters/langchain_text_splitters/base.py index bdf7ae7be2d..36de4bca09f 100644 --- a/libs/text-splitters/langchain_text_splitters/base.py +++ b/libs/text-splitters/langchain_text_splitters/base.py @@ -293,6 +293,7 @@ class Language(str, Enum): LUA = "lua" PERL = "perl" HASKELL = "haskell" + ELIXIR = "elixir" @dataclass(frozen=True) diff --git a/libs/text-splitters/langchain_text_splitters/character.py b/libs/text-splitters/langchain_text_splitters/character.py index a492bb01b38..14cccc3c664 100644 --- a/libs/text-splitters/langchain_text_splitters/character.py +++ b/libs/text-splitters/langchain_text_splitters/character.py @@ -343,6 +343,30 @@ class RecursiveCharacterTextSplitter(TextSplitter): " ", "", ] + elif language == Language.ELIXIR: + return [ + # Split along method function and module definiton + "\ndef ", + "\ndefp ", + "\ndefmodule ", + "\ndefprotocol ", + "\ndefmacro ", + "\ndefmacrop ", + # Split along control flow statements + "\nif ", + "\nunless ", + "\nwhile ", + "\ncase ", + "\ncond ", + "\nwith ", + "\nfor ", + "\ndo ", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] elif language == Language.RUST: return [ # Split along function definitions