diff --git a/docs/docs/integrations/document_loaders/source_code.ipynb b/docs/docs/integrations/document_loaders/source_code.ipynb index e05138d6b27..5f05a6a26bd 100644 --- a/docs/docs/integrations/document_loaders/source_code.ipynb +++ b/docs/docs/integrations/document_loaders/source_code.ipynb @@ -30,7 +30,7 @@ "- Scala (*)\n", "- TypeScript (*)\n", "\n", - "Items marked with (*) require the packages `tree_sitter` and `tree_sitter_languages`.\n", + "Items marked with (*) require the packages `tree_sitter` and `tree-sitter-language-pack`.\n", "It is straightforward to add support for additional languages using `tree_sitter`,\n", "although this currently requires modifying LangChain.\n", "\n", @@ -47,9 +47,7 @@ "id": "7fa47b2e", "metadata": {}, "outputs": [], - "source": [ - "%pip install -qU esprima esprima tree_sitter tree_sitter_languages" - ] + "source": "%pip install -qU esprima esprima tree_sitter tree-sitter-language-pack" }, { "cell_type": "code", diff --git a/libs/community/extended_testing_deps.txt b/libs/community/extended_testing_deps.txt index 6ad4f43e867..2b97510a881 100644 --- a/libs/community/extended_testing_deps.txt +++ b/libs/community/extended_testing_deps.txt @@ -91,8 +91,8 @@ tidb-vector>=0.0.3,<1.0.0 timescale-vector==0.0.1 tqdm>=4.48.0 tiktoken>=0.8.0 -tree-sitter>=0.20.2,<0.21 -tree-sitter-languages>=1.8.0,<2 +tree-sitter>=0.23.2,<1 +tree-sitter-language-pack>=0.6.1,<1 upstash-redis>=1.1.0,<2 upstash-ratelimit>=1.1.0,<2 vdms>=0.0.20 diff --git a/libs/community/langchain_community/document_loaders/parsers/language/c.py b/libs/community/langchain_community/document_loaders/parsers/language/c.py index 2db1ec99fca..844785ccddf 100644 --- a/libs/community/langchain_community/document_loaders/parsers/language/c.py +++ b/libs/community/langchain_community/document_loaders/parsers/language/c.py @@ -5,7 +5,7 @@ from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter ) if TYPE_CHECKING: - from tree_sitter import Language + from tree_sitter import Language, Parser CHUNK_QUERY = """ @@ -25,10 +25,15 @@ class CSegmenter(TreeSitterSegmenter): """Code segmenter for C.""" def get_language(self) -> "Language": - from tree_sitter_languages import get_language + from tree_sitter_language_pack import get_language return get_language("c") + def get_parser(self) -> "Parser": + from tree_sitter_language_pack import get_parser + + return get_parser("c") + def get_chunk_query(self) -> str: return CHUNK_QUERY diff --git a/libs/community/langchain_community/document_loaders/parsers/language/cpp.py b/libs/community/langchain_community/document_loaders/parsers/language/cpp.py index 9d09164a846..dbdf1b3ca30 100644 --- a/libs/community/langchain_community/document_loaders/parsers/language/cpp.py +++ b/libs/community/langchain_community/document_loaders/parsers/language/cpp.py @@ -5,7 +5,7 @@ from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter ) if TYPE_CHECKING: - from tree_sitter import Language + from tree_sitter import Language, Parser CHUNK_QUERY = """ @@ -25,10 +25,15 @@ class CPPSegmenter(TreeSitterSegmenter): """Code segmenter for C++.""" def get_language(self) -> "Language": - from tree_sitter_languages import get_language + from tree_sitter_language_pack import get_language return get_language("cpp") + def get_parser(self) -> "Parser": + from tree_sitter_language_pack import get_parser + + return get_parser("cpp") + def get_chunk_query(self) -> str: return CHUNK_QUERY diff --git a/libs/community/langchain_community/document_loaders/parsers/language/csharp.py b/libs/community/langchain_community/document_loaders/parsers/language/csharp.py index a9f809fa00a..e7c9d42b167 100644 --- a/libs/community/langchain_community/document_loaders/parsers/language/csharp.py +++ b/libs/community/langchain_community/document_loaders/parsers/language/csharp.py @@ -5,7 +5,7 @@ from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter ) if TYPE_CHECKING: - from tree_sitter import Language + from tree_sitter import Language, Parser CHUNK_QUERY = """ @@ -25,9 +25,14 @@ class CSharpSegmenter(TreeSitterSegmenter): """Code segmenter for C#.""" def get_language(self) -> "Language": - from tree_sitter_languages import get_language + from tree_sitter_language_pack import get_language - return get_language("c_sharp") + return get_language("csharp") + + def get_parser(self) -> "Parser": + from tree_sitter_language_pack import get_parser + + return get_parser("csharp") def get_chunk_query(self) -> str: return CHUNK_QUERY diff --git a/libs/community/langchain_community/document_loaders/parsers/language/elixir.py b/libs/community/langchain_community/document_loaders/parsers/language/elixir.py index 780209767d8..7da0050b425 100644 --- a/libs/community/langchain_community/document_loaders/parsers/language/elixir.py +++ b/libs/community/langchain_community/document_loaders/parsers/language/elixir.py @@ -5,17 +5,49 @@ from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter ) if TYPE_CHECKING: - from tree_sitter import Language - + from tree_sitter import Language, Parser CHUNK_QUERY = """ [ - (call target: ((identifier) @_identifier - (#any-of? @_identifier "defmodule" "defprotocol" "defimpl"))) @module - (call target: ((identifier) @_identifier - (#any-of? @_identifier "def" "defmacro" "defmacrop" "defp"))) @function - (unary_operator operator: "@" operand: (call target: ((identifier) @_identifier - (#any-of? @_identifier "moduledoc" "typedoc""doc")))) @comment + (unary_operator + operator: "@" + operand: (call + target: (identifier) + (arguments + [ + (string) + (charlist) + (sigil + quoted_start: _ + quoted_end: _ + ) + (boolean) + ] + ) + ) + ) @comment + + (call + target: (identifier) + (arguments (alias)) + ) @module + + (call + target: (identifier) + (arguments + [ + ; zero-arity functions with no parentheses + (identifier) + ; regular function clause + (call target: (identifier)) + ; function clause with a guard clause + (binary_operator + left: (call target: (identifier)) + operator: "when" + ) + ] + ) + ) @function ] """.strip() @@ -24,10 +56,15 @@ class ElixirSegmenter(TreeSitterSegmenter): """Code segmenter for Elixir.""" def get_language(self) -> "Language": - from tree_sitter_languages import get_language + from tree_sitter_language_pack import get_language return get_language("elixir") + def get_parser(self) -> "Parser": + from tree_sitter_language_pack import get_parser + + return get_parser("elixir") + def get_chunk_query(self) -> str: return CHUNK_QUERY diff --git a/libs/community/langchain_community/document_loaders/parsers/language/go.py b/libs/community/langchain_community/document_loaders/parsers/language/go.py index f836ab3ad71..4285dc1e585 100644 --- a/libs/community/langchain_community/document_loaders/parsers/language/go.py +++ b/libs/community/langchain_community/document_loaders/parsers/language/go.py @@ -5,7 +5,7 @@ from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter ) if TYPE_CHECKING: - from tree_sitter import Language + from tree_sitter import Language, Parser CHUNK_QUERY = """ @@ -20,10 +20,15 @@ class GoSegmenter(TreeSitterSegmenter): """Code segmenter for Go.""" def get_language(self) -> "Language": - from tree_sitter_languages import get_language + from tree_sitter_language_pack import get_language return get_language("go") + def get_parser(self) -> "Parser": + from tree_sitter_language_pack import get_parser + + return get_parser("go") + def get_chunk_query(self) -> str: return CHUNK_QUERY diff --git a/libs/community/langchain_community/document_loaders/parsers/language/java.py b/libs/community/langchain_community/document_loaders/parsers/language/java.py index c7293e1ed7f..8c7dcb05a91 100644 --- a/libs/community/langchain_community/document_loaders/parsers/language/java.py +++ b/libs/community/langchain_community/document_loaders/parsers/language/java.py @@ -5,7 +5,7 @@ from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter ) if TYPE_CHECKING: - from tree_sitter import Language + from tree_sitter import Language, Parser CHUNK_QUERY = """ @@ -21,10 +21,15 @@ class JavaSegmenter(TreeSitterSegmenter): """Code segmenter for Java.""" def get_language(self) -> "Language": - from tree_sitter_languages import get_language + from tree_sitter_language_pack import get_language return get_language("java") + def get_parser(self) -> "Parser": + from tree_sitter_language_pack import get_parser + + return get_parser("java") + def get_chunk_query(self) -> str: return CHUNK_QUERY diff --git a/libs/community/langchain_community/document_loaders/parsers/language/kotlin.py b/libs/community/langchain_community/document_loaders/parsers/language/kotlin.py index 6f946f7b4a6..368c40dc9ce 100644 --- a/libs/community/langchain_community/document_loaders/parsers/language/kotlin.py +++ b/libs/community/langchain_community/document_loaders/parsers/language/kotlin.py @@ -5,7 +5,7 @@ from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter ) if TYPE_CHECKING: - from tree_sitter import Language + from tree_sitter import Language, Parser CHUNK_QUERY = """ @@ -20,10 +20,15 @@ class KotlinSegmenter(TreeSitterSegmenter): """Code segmenter for Kotlin.""" def get_language(self) -> "Language": - from tree_sitter_languages import get_language + from tree_sitter_language_pack import get_language return get_language("kotlin") + def get_parser(self) -> "Parser": + from tree_sitter_language_pack import get_parser + + return get_parser("kotlin") + def get_chunk_query(self) -> str: return CHUNK_QUERY diff --git a/libs/community/langchain_community/document_loaders/parsers/language/language_parser.py b/libs/community/langchain_community/document_loaders/parsers/language/language_parser.py index e1d4e5ec664..f2be33af8d6 100644 --- a/libs/community/langchain_community/document_loaders/parsers/language/language_parser.py +++ b/libs/community/langchain_community/document_loaders/parsers/language/language_parser.py @@ -130,7 +130,7 @@ class LanguageParser(BaseBlobParser): - TypeScript: "ts" (*) Items marked with (*) require the packages `tree_sitter` and - `tree_sitter_languages`. It is straightforward to add support for additional + `tree-sitter-language-pack`. It is straightforward to add support for additional languages using `tree_sitter`, although this currently requires modifying LangChain. The language used for parsing can be configured, along with the minimum number of diff --git a/libs/community/langchain_community/document_loaders/parsers/language/lua.py b/libs/community/langchain_community/document_loaders/parsers/language/lua.py index 3e0a762ba4b..1b6ee8ba50c 100644 --- a/libs/community/langchain_community/document_loaders/parsers/language/lua.py +++ b/libs/community/langchain_community/document_loaders/parsers/language/lua.py @@ -5,7 +5,7 @@ from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter ) if TYPE_CHECKING: - from tree_sitter import Language + from tree_sitter import Language, Parser CHUNK_QUERY = """ @@ -22,10 +22,15 @@ class LuaSegmenter(TreeSitterSegmenter): """Code segmenter for Lua.""" def get_language(self) -> "Language": - from tree_sitter_languages import get_language + from tree_sitter_language_pack import get_language return get_language("lua") + def get_parser(self) -> "Parser": + from tree_sitter_language_pack import get_parser + + return get_parser("lua") + def get_chunk_query(self) -> str: return CHUNK_QUERY diff --git a/libs/community/langchain_community/document_loaders/parsers/language/perl.py b/libs/community/langchain_community/document_loaders/parsers/language/perl.py index b68d52cef2b..725ef1a2491 100644 --- a/libs/community/langchain_community/document_loaders/parsers/language/perl.py +++ b/libs/community/langchain_community/document_loaders/parsers/language/perl.py @@ -5,12 +5,12 @@ from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter ) if TYPE_CHECKING: - from tree_sitter import Language + from tree_sitter import Language, Parser CHUNK_QUERY = """ [ - (function_definition) @subroutine + (subroutine_declaration_statement) @subroutine ] """.strip() @@ -19,10 +19,15 @@ class PerlSegmenter(TreeSitterSegmenter): """Code segmenter for Perl.""" def get_language(self) -> "Language": - from tree_sitter_languages import get_language + from tree_sitter_language_pack import get_language return get_language("perl") + def get_parser(self) -> "Parser": + from tree_sitter_language_pack import get_parser + + return get_parser("perl") + def get_chunk_query(self) -> str: return CHUNK_QUERY diff --git a/libs/community/langchain_community/document_loaders/parsers/language/php.py b/libs/community/langchain_community/document_loaders/parsers/language/php.py index e7ec12a5ee8..d50eca1e646 100644 --- a/libs/community/langchain_community/document_loaders/parsers/language/php.py +++ b/libs/community/langchain_community/document_loaders/parsers/language/php.py @@ -5,7 +5,7 @@ from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter ) if TYPE_CHECKING: - from tree_sitter import Language + from tree_sitter import Language, Parser CHUNK_QUERY = """ @@ -24,10 +24,15 @@ class PHPSegmenter(TreeSitterSegmenter): """Code segmenter for PHP.""" def get_language(self) -> "Language": - from tree_sitter_languages import get_language + from tree_sitter_language_pack import get_language return get_language("php") + def get_parser(self) -> "Parser": + from tree_sitter_language_pack import get_parser + + return get_parser("php") + def get_chunk_query(self) -> str: return CHUNK_QUERY diff --git a/libs/community/langchain_community/document_loaders/parsers/language/ruby.py b/libs/community/langchain_community/document_loaders/parsers/language/ruby.py index 767a1f94a4d..34830b74468 100644 --- a/libs/community/langchain_community/document_loaders/parsers/language/ruby.py +++ b/libs/community/langchain_community/document_loaders/parsers/language/ruby.py @@ -5,7 +5,7 @@ from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter ) if TYPE_CHECKING: - from tree_sitter import Language + from tree_sitter import Language, Parser CHUNK_QUERY = """ @@ -21,10 +21,15 @@ class RubySegmenter(TreeSitterSegmenter): """Code segmenter for Ruby.""" def get_language(self) -> "Language": - from tree_sitter_languages import get_language + from tree_sitter_language_pack import get_language return get_language("ruby") + def get_parser(self) -> "Parser": + from tree_sitter_language_pack import get_parser + + return get_parser("ruby") + def get_chunk_query(self) -> str: return CHUNK_QUERY diff --git a/libs/community/langchain_community/document_loaders/parsers/language/rust.py b/libs/community/langchain_community/document_loaders/parsers/language/rust.py index bb73f96bf6d..5528a2b3ad1 100644 --- a/libs/community/langchain_community/document_loaders/parsers/language/rust.py +++ b/libs/community/langchain_community/document_loaders/parsers/language/rust.py @@ -5,7 +5,7 @@ from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter ) if TYPE_CHECKING: - from tree_sitter import Language + from tree_sitter import Language, Parser CHUNK_QUERY = """ @@ -23,10 +23,15 @@ class RustSegmenter(TreeSitterSegmenter): """Code segmenter for Rust.""" def get_language(self) -> "Language": - from tree_sitter_languages import get_language + from tree_sitter_language_pack import get_language return get_language("rust") + def get_parser(self) -> "Parser": + from tree_sitter_language_pack import get_parser + + return get_parser("rust") + def get_chunk_query(self) -> str: return CHUNK_QUERY diff --git a/libs/community/langchain_community/document_loaders/parsers/language/scala.py b/libs/community/langchain_community/document_loaders/parsers/language/scala.py index af62a4e748f..f4624fa3592 100644 --- a/libs/community/langchain_community/document_loaders/parsers/language/scala.py +++ b/libs/community/langchain_community/document_loaders/parsers/language/scala.py @@ -5,7 +5,7 @@ from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter ) if TYPE_CHECKING: - from tree_sitter import Language + from tree_sitter import Language, Parser CHUNK_QUERY = """ @@ -22,10 +22,15 @@ class ScalaSegmenter(TreeSitterSegmenter): """Code segmenter for Scala.""" def get_language(self) -> "Language": - from tree_sitter_languages import get_language + from tree_sitter_language_pack import get_language return get_language("scala") + def get_parser(self) -> "Parser": + from tree_sitter_language_pack import get_parser + + return get_parser("scala") + def get_chunk_query(self) -> str: return CHUNK_QUERY diff --git a/libs/community/langchain_community/document_loaders/parsers/language/sql.py b/libs/community/langchain_community/document_loaders/parsers/language/sql.py index 1c11b7b3637..731c409e442 100644 --- a/libs/community/langchain_community/document_loaders/parsers/language/sql.py +++ b/libs/community/langchain_community/document_loaders/parsers/language/sql.py @@ -5,15 +5,20 @@ from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter ) if TYPE_CHECKING: - from tree_sitter import Language + from tree_sitter import Language, Parser +# CHUNK_QUERY = """ +# [ +# (create_table) @create +# (_select_statement) @select +# (insert) @insert +# (update) @update +# (_delete_statement) @delete +# ] +# """ CHUNK_QUERY = """ [ - (create_table_statement) @create - (select_statement) @select - (insert_statement) @insert - (update_statement) @update - (delete_statement) @delete + (statement) @statement ] """ @@ -28,10 +33,15 @@ class SQLSegmenter(TreeSitterSegmenter): def get_language(self) -> "Language": """Return the SQL language grammar for Tree-sitter.""" - from tree_sitter_languages import get_language + from tree_sitter_language_pack import get_language return get_language("sql") + def get_parser(self) -> "Parser": + from tree_sitter_language_pack import get_parser + + return get_parser("sql") + def get_chunk_query(self) -> str: """Return the Tree-sitter query for SQL segmentation.""" return CHUNK_QUERY diff --git a/libs/community/langchain_community/document_loaders/parsers/language/tree_sitter_segmenter.py b/libs/community/langchain_community/document_loaders/parsers/language/tree_sitter_segmenter.py index 7187cd6b5f5..576fe47371c 100644 --- a/libs/community/langchain_community/document_loaders/parsers/language/tree_sitter_segmenter.py +++ b/libs/community/langchain_community/document_loaders/parsers/language/tree_sitter_segmenter.py @@ -6,7 +6,7 @@ from langchain_community.document_loaders.parsers.language.code_segmenter import ) if TYPE_CHECKING: - from tree_sitter import Language, Parser + from tree_sitter import Language, Node, Parser class TreeSitterSegmenter(CodeSegmenter): @@ -18,12 +18,12 @@ class TreeSitterSegmenter(CodeSegmenter): try: import tree_sitter # noqa: F401 - import tree_sitter_languages # noqa: F401 + import tree_sitter_language_pack # noqa: F401 except ImportError: raise ImportError( - "Could not import tree_sitter/tree_sitter_languages Python packages. " - "Please install them with " - "`pip install tree-sitter tree-sitter-languages`." + "Could not import tree_sitter/tree_sitter_language_pack " + "Python packages. Please install them with " + "`pip install tree-sitter tree-sitter-language-pack`." ) def is_valid(self) -> bool: @@ -35,48 +35,50 @@ class TreeSitterSegmenter(CodeSegmenter): return len(error_query.captures(tree.root_node)) == 0 - def extract_functions_classes(self) -> List[str]: + def _get_top_level_nodes(self) -> List["Node"]: language = self.get_language() query = language.query(self.get_chunk_query()) - parser = self.get_parser() tree = parser.parse(bytes(self.code, encoding="UTF-8")) captures = query.captures(tree.root_node) + top_level_nodes = {} + for node_type, nodes in captures.items(): + for node in nodes: + cursor = node.parent + is_child = False + while cursor is not None: + if cursor.id in top_level_nodes: + is_child = True + break + cursor = cursor.parent + if is_child: + continue + top_level_nodes[node.id] = node - processed_lines = set() - chunks = [] + children = node.children + for child in children: + if child.id in top_level_nodes: + del top_level_nodes[child.id] + children.extend(child.children) + top_level_nodes_list = list(top_level_nodes.values()) + top_level_nodes_list.sort(key=lambda n: n.start_point[0]) + return top_level_nodes_list - for node, name in captures: - start_line = node.start_point[0] - end_line = node.end_point[0] - lines = list(range(start_line, end_line + 1)) - - if any(line in processed_lines for line in lines): - continue - - processed_lines.update(lines) - chunk_text = node.text.decode("UTF-8") - chunks.append(chunk_text) - - return chunks + def extract_functions_classes(self) -> List[str]: + top_level_nodes = self._get_top_level_nodes() + return [ + node.text.decode("UTF-8") + for node in top_level_nodes + if node.text is not None + ] def simplify_code(self) -> str: - language = self.get_language() - query = language.query(self.get_chunk_query()) - - parser = self.get_parser() - tree = parser.parse(bytes(self.code, encoding="UTF-8")) - processed_lines = set() - simplified_lines = self.source_lines[:] - for node, name in query.captures(tree.root_node): + top_level_nodes = self._get_top_level_nodes() + for node in top_level_nodes: start_line = node.start_point[0] end_line = node.end_point[0] - lines = list(range(start_line, end_line + 1)) - if any(line in processed_lines for line in lines): - continue - simplified_lines[start_line] = self.make_line_comment( f"Code for: {self.source_lines[start_line]}" ) @@ -84,16 +86,11 @@ class TreeSitterSegmenter(CodeSegmenter): for line_num in range(start_line + 1, end_line + 1): simplified_lines[line_num] = None # type: ignore - processed_lines.update(lines) - return "\n".join(line for line in simplified_lines if line is not None) + @abstractmethod def get_parser(self) -> "Parser": - from tree_sitter import Parser - - parser = Parser() - parser.set_language(self.get_language()) - return parser + raise NotImplementedError() @abstractmethod def get_language(self) -> "Language": diff --git a/libs/community/langchain_community/document_loaders/parsers/language/typescript.py b/libs/community/langchain_community/document_loaders/parsers/language/typescript.py index ab7158e2e82..5795caf04be 100644 --- a/libs/community/langchain_community/document_loaders/parsers/language/typescript.py +++ b/libs/community/langchain_community/document_loaders/parsers/language/typescript.py @@ -5,7 +5,7 @@ from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter ) if TYPE_CHECKING: - from tree_sitter import Language + from tree_sitter import Language, Parser CHUNK_QUERY = """ @@ -22,10 +22,15 @@ class TypeScriptSegmenter(TreeSitterSegmenter): """Code segmenter for TypeScript.""" def get_language(self) -> "Language": - from tree_sitter_languages import get_language + from tree_sitter_language_pack import get_language return get_language("typescript") + def get_parser(self) -> "Parser": + from tree_sitter_language_pack import get_parser + + return get_parser("typescript") + def get_chunk_query(self) -> str: return CHUNK_QUERY diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_c.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_c.py index aa6cb586889..e249f0565a5 100644 --- a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_c.py +++ b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_c.py @@ -5,7 +5,7 @@ import pytest from langchain_community.document_loaders.parsers.language.c import CSegmenter -@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +@pytest.mark.requires("tree_sitter", "tree_sitter_language_pack") class TestCSegmenter(unittest.TestCase): def setUp(self) -> None: self.example_code = """int main() { diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_cpp.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_cpp.py index a06b7fa7aff..af4c87d3c71 100644 --- a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_cpp.py +++ b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_cpp.py @@ -5,7 +5,7 @@ import pytest from langchain_community.document_loaders.parsers.language.cpp import CPPSegmenter -@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +@pytest.mark.requires("tree_sitter", "tree_sitter_language_pack") class TestCPPSegmenter(unittest.TestCase): def setUp(self) -> None: self.example_code = """int foo() { @@ -55,9 +55,9 @@ auto T::bar() const -> int { def test_extract_functions_classes(self) -> None: segmenter = CPPSegmenter(self.example_code) extracted_code = segmenter.extract_functions_classes() - self.assertEqual(extracted_code, self.expected_extracted_code) + self.assertEqual(self.expected_extracted_code, extracted_code) def test_simplify_code(self) -> None: segmenter = CPPSegmenter(self.example_code) simplified_code = segmenter.simplify_code() - self.assertEqual(simplified_code, self.expected_simplified_code) + self.assertEqual(self.expected_simplified_code, simplified_code) diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_csharp.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_csharp.py index 3f04713f4ce..7a7dbef187a 100644 --- a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_csharp.py +++ b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_csharp.py @@ -5,7 +5,7 @@ import pytest from langchain_community.document_loaders.parsers.language.csharp import CSharpSegmenter -@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +@pytest.mark.requires("tree_sitter", "tree_sitter_language_pack") class TestCSharpSegmenter(unittest.TestCase): def setUp(self) -> None: self.example_code = """namespace World diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_elixir.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_elixir.py index 02d6af92656..1057f2019a7 100644 --- a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_elixir.py +++ b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_elixir.py @@ -5,7 +5,7 @@ import pytest from langchain_community.document_loaders.parsers.language.elixir import ElixirSegmenter -@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +@pytest.mark.requires("tree_sitter", "tree_sitter_language_pack") class TestElixirSegmenter(unittest.TestCase): def setUp(self) -> None: self.example_code = """@doc "some comment" diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_go.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_go.py index e1360e35d97..6e7f1544165 100644 --- a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_go.py +++ b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_go.py @@ -5,7 +5,7 @@ import pytest from langchain_community.document_loaders.parsers.language.go import GoSegmenter -@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +@pytest.mark.requires("tree_sitter", "tree_sitter_language_pack") class TestGoSegmenter(unittest.TestCase): def setUp(self) -> None: self.example_code = """func foo(a int) int { diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_java.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_java.py index 1129ae1a889..25d599a8919 100644 --- a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_java.py +++ b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_java.py @@ -5,7 +5,7 @@ import pytest from langchain_community.document_loaders.parsers.language.java import JavaSegmenter -@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +@pytest.mark.requires("tree_sitter", "tree_sitter_language_pack") class TestJavaSegmenter(unittest.TestCase): def setUp(self) -> None: self.example_code = """class Hello diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_kotlin.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_kotlin.py index 35bde9feb26..9ce8b59c886 100644 --- a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_kotlin.py +++ b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_kotlin.py @@ -5,7 +5,7 @@ import pytest from langchain_community.document_loaders.parsers.language.kotlin import KotlinSegmenter -@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +@pytest.mark.requires("tree_sitter", "tree_sitter_language_pack") class TestKotlinSegmenter(unittest.TestCase): def setUp(self) -> None: self.example_code = """fun foo(a: Int): Int { diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_lua.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_lua.py index dab2ea8474b..efef084d08d 100644 --- a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_lua.py +++ b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_lua.py @@ -5,7 +5,7 @@ import pytest from langchain_community.document_loaders.parsers.language.lua import LuaSegmenter -@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +@pytest.mark.requires("tree_sitter", "tree_sitter_language_pack") class TestLuaSegmenter(unittest.TestCase): def setUp(self) -> None: self.example_code = """function F() diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_perl.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_perl.py index 78e3fa25a00..848a099a665 100644 --- a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_perl.py +++ b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_perl.py @@ -5,7 +5,7 @@ import pytest from langchain_community.document_loaders.parsers.language.perl import PerlSegmenter -@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +@pytest.mark.requires("tree_sitter", "tree_sitter_language_pack") class TestPerlSegmenter(unittest.TestCase): def setUp(self) -> None: self.example_code = """sub Hello { diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_php.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_php.py index c54df82dc7c..071a6b4d466 100644 --- a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_php.py +++ b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_php.py @@ -5,7 +5,7 @@ import pytest from langchain_community.document_loaders.parsers.language.php import PHPSegmenter -@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +@pytest.mark.requires("tree_sitter", "tree_sitter_language_pack") class TestPHPSegmenter(unittest.TestCase): def setUp(self) -> None: self.example_code = """ None: self.example_code = """def foo diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_rust.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_rust.py index 6b35677c305..188ca1ef165 100644 --- a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_rust.py +++ b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_rust.py @@ -5,7 +5,7 @@ import pytest from langchain_community.document_loaders.parsers.language.rust import RustSegmenter -@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +@pytest.mark.requires("tree_sitter", "tree_sitter_language_pack") class TestRustSegmenter(unittest.TestCase): def setUp(self) -> None: self.example_code = """fn foo() -> i32 { diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_scala.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_scala.py index 3fad1aeb806..e314da706ac 100644 --- a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_scala.py +++ b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_scala.py @@ -5,7 +5,7 @@ import pytest from langchain_community.document_loaders.parsers.language.scala import ScalaSegmenter -@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +@pytest.mark.requires("tree_sitter", "tree_sitter_language_pack") class TestScalaSegmenter(unittest.TestCase): def setUp(self) -> None: self.example_code = """def foo() { diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_sql.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_sql.py index 37b22052ea2..be2acd96742 100644 --- a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_sql.py +++ b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_sql.py @@ -5,7 +5,7 @@ import pytest from langchain_community.document_loaders.parsers.language.sql import SQLSegmenter -@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +@pytest.mark.requires("tree_sitter", "tree_sitter_language_pack") class TestSQLSegmenter(unittest.TestCase): """Unit tests for the SQLSegmenter class.""" diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_typescript.py b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_typescript.py index caf4b6ad66c..0212fa39491 100644 --- a/libs/community/tests/unit_tests/document_loaders/parsers/language/test_typescript.py +++ b/libs/community/tests/unit_tests/document_loaders/parsers/language/test_typescript.py @@ -7,7 +7,7 @@ from langchain_community.document_loaders.parsers.language.typescript import ( ) -@pytest.mark.requires("tree_sitter", "tree_sitter_languages") +@pytest.mark.requires("tree_sitter", "tree_sitter_language_pack") class TestTypeScriptSegmenter(unittest.TestCase): def setUp(self) -> None: self.example_code = """function foo(): number