mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-25 13:07:58 +00:00
Framework for supporting more languages in LanguageParser (#13318)
## Description I am submitting this for a school project as part of a team of 5. Other team members are @LeilaChr, @maazh10, @Megabear137, @jelalalamy. This PR also has contributions from community members @Harrolee and @Mario928. Initial context is in the issue we opened (#11229). This pull request adds: - Generic framework for expanding the languages that `LanguageParser` can handle, using the [tree-sitter](https://github.com/tree-sitter/py-tree-sitter#py-tree-sitter) parsing library and existing language-specific parsers written for it - Support for the following additional languages in `LanguageParser`: - C - C++ - C# - Go - Java (contributed by @Mario928 https://github.com/ThatsJustCheesy/langchain/pull/2) - Kotlin - Lua - Perl - Ruby - Rust - Scala - TypeScript (contributed by @Harrolee https://github.com/ThatsJustCheesy/langchain/pull/1) Here is the [design document](https://docs.google.com/document/d/17dB14cKCWAaiTeSeBtxHpoVPGKrsPye8W0o_WClz2kk) if curious, but no need to read it. ## Issues - Closes #11229 - Closes #10996 - Closes #8405 ## Dependencies `tree_sitter` and `tree_sitter_languages` on PyPI. We have tried to add these as optional dependencies. ## Documentation We have updated the list of supported languages, and also added a section to `source_code.ipynb` detailing how to add support for additional languages using our framework. ## Maintainer - @hwchase17 (previously reviewed https://github.com/langchain-ai/langchain/pull/6486) Thanks!! ## Git commits We will gladly squash any/all of our commits (esp merge commits) if necessary. Let us know if this is desirable, or if you will be squash-merging anyway. <!-- Thank you for contributing to LangChain! 
Replace this entire comment with: - **Description:** a description of the change, - **Issue:** the issue # it fixes (if applicable), - **Dependencies:** any dependencies required for this change, - **Tag maintainer:** for a quicker response, tag the relevant maintainer (see below), - **Twitter handle:** we announce bigger features on Twitter. If your PR gets announced, and you'd like a mention, we'll gladly shout you out! Please make sure your PR is passing linting and testing before submitting. Run `make format`, `make lint` and `make test` to check this locally. See contribution guidelines for more information on how to write/run tests, lint, etc: https://github.com/langchain-ai/langchain/blob/master/.github/CONTRIBUTING.md If you're adding a new integration, please include: 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/extras` directory. If no one reviews your PR within a few days, please @-mention one of @baskaryan, @eyurtsev, @hwchase17. --> --------- Co-authored-by: Maaz Hashmi <mhashmi373@gmail.com> Co-authored-by: LeilaChr <87657694+LeilaChr@users.noreply.github.com> Co-authored-by: Jeremy La <jeremylai511@gmail.com> Co-authored-by: Megabear137 <zubair.alnoor27@gmail.com> Co-authored-by: Lee Harrold <lhharrold@sep.com> Co-authored-by: Mario928 <88029051+Mario928@users.noreply.github.com> Co-authored-by: Bagatur <baskaryan@gmail.com> Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
@@ -0,0 +1,36 @@
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501
|
||||
TreeSitterSegmenter,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from tree_sitter import Language
|
||||
|
||||
|
||||
# Tree-sitter query marking the C constructs that are split out as chunks:
# struct, enum and union specifiers that carry a body, and function definitions.
CHUNK_QUERY = """
    [
        (struct_specifier
            body: (field_declaration_list)) @struct
        (enum_specifier
            body: (enumerator_list)) @enum
        (union_specifier
            body: (field_declaration_list)) @union
        (function_definition) @function
    ]
""".strip()


class CSegmenter(TreeSitterSegmenter):
    """Code segmenter for C."""

    def get_language(self) -> "Language":
        """Return the tree-sitter grammar registered under the name ``c``."""
        # Function-scope import: tree_sitter_languages is an optional dependency.
        import tree_sitter_languages

        return tree_sitter_languages.get_language("c")

    def get_chunk_query(self) -> str:
        """Return the module-level query whose captures delimit chunks."""
        return CHUNK_QUERY

    def make_line_comment(self, text: str) -> str:
        """Render ``text`` as a ``//`` line comment."""
        marker = "//"
        return f"{marker} {text}"
|
@@ -0,0 +1,36 @@
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501
|
||||
TreeSitterSegmenter,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from tree_sitter import Language
|
||||
|
||||
|
||||
# Tree-sitter query marking the C++ constructs that are split out as chunks:
# class, struct and union specifiers that carry a body, and function
# definitions.
CHUNK_QUERY = """
    [
        (class_specifier
            body: (field_declaration_list)) @class
        (struct_specifier
            body: (field_declaration_list)) @struct
        (union_specifier
            body: (field_declaration_list)) @union
        (function_definition) @function
    ]
""".strip()


class CPPSegmenter(TreeSitterSegmenter):
    """Code segmenter for C++."""

    def get_language(self) -> "Language":
        """Return the tree-sitter grammar registered under the name ``cpp``."""
        # Function-scope import: tree_sitter_languages is an optional dependency.
        import tree_sitter_languages

        return tree_sitter_languages.get_language("cpp")

    def get_chunk_query(self) -> str:
        """Return the module-level query whose captures delimit chunks."""
        return CHUNK_QUERY

    def make_line_comment(self, text: str) -> str:
        """Render ``text`` as a ``//`` line comment."""
        marker = "//"
        return f"{marker} {text}"
|
@@ -0,0 +1,36 @@
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501
|
||||
TreeSitterSegmenter,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from tree_sitter import Language
|
||||
|
||||
|
||||
# Tree-sitter query marking the C# constructs that are split out as chunks:
# namespace, class, method, interface, enum, struct and record declarations.
CHUNK_QUERY = """
    [
        (namespace_declaration) @namespace
        (class_declaration) @class
        (method_declaration) @method
        (interface_declaration) @interface
        (enum_declaration) @enum
        (struct_declaration) @struct
        (record_declaration) @record
    ]
""".strip()


class CSharpSegmenter(TreeSitterSegmenter):
    """Code segmenter for C#."""

    def get_language(self) -> "Language":
        """Return the tree-sitter grammar registered under ``c_sharp``."""
        # Function-scope import: tree_sitter_languages is an optional dependency.
        import tree_sitter_languages

        return tree_sitter_languages.get_language("c_sharp")

    def get_chunk_query(self) -> str:
        """Return the module-level query whose captures delimit chunks."""
        return CHUNK_QUERY

    def make_line_comment(self, text: str) -> str:
        """Render ``text`` as a ``//`` line comment."""
        marker = "//"
        return f"{marker} {text}"
|
@@ -0,0 +1,31 @@
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501
|
||||
TreeSitterSegmenter,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from tree_sitter import Language
|
||||
|
||||
|
||||
# Tree-sitter query marking the Go constructs that are split out as chunks:
# function declarations and type declarations.
CHUNK_QUERY = """
    [
        (function_declaration) @function
        (type_declaration) @type
    ]
""".strip()


class GoSegmenter(TreeSitterSegmenter):
    """Code segmenter for Go."""

    def get_language(self) -> "Language":
        """Return the tree-sitter grammar registered under the name ``go``."""
        # Function-scope import: tree_sitter_languages is an optional dependency.
        import tree_sitter_languages

        return tree_sitter_languages.get_language("go")

    def get_chunk_query(self) -> str:
        """Return the module-level query whose captures delimit chunks."""
        return CHUNK_QUERY

    def make_line_comment(self, text: str) -> str:
        """Render ``text`` as a ``//`` line comment."""
        marker = "//"
        return f"{marker} {text}"
|
@@ -0,0 +1,32 @@
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501
|
||||
TreeSitterSegmenter,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from tree_sitter import Language
|
||||
|
||||
|
||||
# Tree-sitter query marking the Java constructs that are split out as chunks:
# class, interface and enum declarations.
CHUNK_QUERY = """
    [
        (class_declaration) @class
        (interface_declaration) @interface
        (enum_declaration) @enum
    ]
""".strip()


class JavaSegmenter(TreeSitterSegmenter):
    """Code segmenter for Java."""

    def get_language(self) -> "Language":
        """Return the tree-sitter grammar registered under the name ``java``."""
        # Function-scope import: tree_sitter_languages is an optional dependency.
        import tree_sitter_languages

        return tree_sitter_languages.get_language("java")

    def get_chunk_query(self) -> str:
        """Return the module-level query whose captures delimit chunks."""
        return CHUNK_QUERY

    def make_line_comment(self, text: str) -> str:
        """Render ``text`` as a ``//`` line comment."""
        marker = "//"
        return f"{marker} {text}"
|
@@ -0,0 +1,31 @@
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501
|
||||
TreeSitterSegmenter,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from tree_sitter import Language
|
||||
|
||||
|
||||
# Tree-sitter query marking the Kotlin constructs that are split out as chunks:
# function declarations and class declarations.
CHUNK_QUERY = """
    [
        (function_declaration) @function
        (class_declaration) @class
    ]
""".strip()


class KotlinSegmenter(TreeSitterSegmenter):
    """Code segmenter for Kotlin."""

    def get_language(self) -> "Language":
        """Return the tree-sitter grammar registered under ``kotlin``."""
        # Function-scope import: tree_sitter_languages is an optional dependency.
        import tree_sitter_languages

        return tree_sitter_languages.get_language("kotlin")

    def get_chunk_query(self) -> str:
        """Return the module-level query whose captures delimit chunks."""
        return CHUNK_QUERY

    def make_line_comment(self, text: str) -> str:
        """Render ``text`` as a ``//`` line comment."""
        marker = "//"
        return f"{marker} {text}"
|
@@ -6,28 +6,66 @@ from langchain_core.documents import Document
|
||||
|
||||
from langchain_community.document_loaders.base import BaseBlobParser
|
||||
from langchain_community.document_loaders.blob_loaders import Blob
|
||||
from langchain_community.document_loaders.parsers.language.c import CSegmenter
|
||||
from langchain_community.document_loaders.parsers.language.cobol import CobolSegmenter
|
||||
from langchain_community.document_loaders.parsers.language.cpp import CPPSegmenter
|
||||
from langchain_community.document_loaders.parsers.language.csharp import CSharpSegmenter
|
||||
from langchain_community.document_loaders.parsers.language.go import GoSegmenter
|
||||
from langchain_community.document_loaders.parsers.language.java import JavaSegmenter
|
||||
from langchain_community.document_loaders.parsers.language.javascript import (
|
||||
JavaScriptSegmenter,
|
||||
)
|
||||
from langchain_community.document_loaders.parsers.language.kotlin import KotlinSegmenter
|
||||
from langchain_community.document_loaders.parsers.language.lua import LuaSegmenter
|
||||
from langchain_community.document_loaders.parsers.language.perl import PerlSegmenter
|
||||
from langchain_community.document_loaders.parsers.language.python import PythonSegmenter
|
||||
from langchain_community.document_loaders.parsers.language.ruby import RubySegmenter
|
||||
from langchain_community.document_loaders.parsers.language.rust import RustSegmenter
|
||||
from langchain_community.document_loaders.parsers.language.scala import ScalaSegmenter
|
||||
from langchain_community.document_loaders.parsers.language.typescript import (
|
||||
TypeScriptSegmenter,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from langchain.text_splitter import Language
|
||||
from langchain.langchain.text_splitter import Language
|
||||
|
||||
try:
|
||||
from langchain.text_splitter import Language
|
||||
from langchain.langchain.text_splitter import Language
|
||||
|
||||
LANGUAGE_EXTENSIONS: Dict[str, str] = {
|
||||
"py": Language.PYTHON,
|
||||
"js": Language.JS,
|
||||
"cobol": Language.COBOL,
|
||||
"c": Language.C,
|
||||
"cpp": Language.CPP,
|
||||
"cs": Language.CSHARP,
|
||||
"rb": Language.RUBY,
|
||||
"scala": Language.SCALA,
|
||||
"rs": Language.RUST,
|
||||
"go": Language.GO,
|
||||
"kt": Language.KOTLIN,
|
||||
"lua": Language.LUA,
|
||||
"pl": Language.PERL,
|
||||
"ts": Language.TS,
|
||||
"java": Language.JAVA,
|
||||
}
|
||||
|
||||
LANGUAGE_SEGMENTERS: Dict[str, Any] = {
|
||||
Language.PYTHON: PythonSegmenter,
|
||||
Language.JS: JavaScriptSegmenter,
|
||||
Language.COBOL: CobolSegmenter,
|
||||
Language.C: CSegmenter,
|
||||
Language.CPP: CPPSegmenter,
|
||||
Language.CSHARP: CSharpSegmenter,
|
||||
Language.RUBY: RubySegmenter,
|
||||
Language.RUST: RustSegmenter,
|
||||
Language.SCALA: ScalaSegmenter,
|
||||
Language.GO: GoSegmenter,
|
||||
Language.KOTLIN: KotlinSegmenter,
|
||||
Language.LUA: LuaSegmenter,
|
||||
Language.PERL: PerlSegmenter,
|
||||
Language.TS: TypeScriptSegmenter,
|
||||
Language.JAVA: JavaSegmenter,
|
||||
}
|
||||
except ImportError:
|
||||
LANGUAGE_EXTENSIONS = {}
|
||||
@@ -43,11 +81,34 @@ class LanguageParser(BaseBlobParser):
|
||||
|
||||
This approach can potentially improve the accuracy of QA models over source code.
|
||||
|
||||
Currently, the supported languages for code parsing are Python and JavaScript.
|
||||
The supported languages for code parsing are:
|
||||
|
||||
- C (*)
|
||||
- C++ (*)
|
||||
- C# (*)
|
||||
- COBOL
|
||||
- Go (*)
|
||||
- Java (*)
|
||||
- JavaScript (requires package `esprima`)
|
||||
- Kotlin (*)
|
||||
- Lua (*)
|
||||
- Perl (*)
|
||||
- Python
|
||||
- Ruby (*)
|
||||
- Rust (*)
|
||||
- Scala (*)
|
||||
- TypeScript (*)
|
||||
|
||||
Items marked with (*) require the packages `tree_sitter` and
|
||||
`tree_sitter_languages`. It is straightforward to add support for additional
|
||||
languages using `tree_sitter`, although this currently requires modifying LangChain.
|
||||
|
||||
The language used for parsing can be configured, along with the minimum number of
|
||||
lines required to activate the splitting based on syntax.
|
||||
|
||||
If a language is not explicitly specified, `LanguageParser` will infer one from
|
||||
filename extensions, if present.
|
||||
|
||||
Examples:
|
||||
|
||||
.. code-block:: python
|
||||
|
@@ -0,0 +1,33 @@
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501
|
||||
TreeSitterSegmenter,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from tree_sitter import Language
|
||||
|
||||
|
||||
# Tree-sitter query marking the Lua constructs that are split out as chunks:
# global and local function definition statements whose name is a plain
# identifier. Both patterns share the ``@function`` capture name.
CHUNK_QUERY = """
    [
        (function_definition_statement
            name: (identifier)) @function
        (local_function_definition_statement
            name: (identifier)) @function
    ]
""".strip()


class LuaSegmenter(TreeSitterSegmenter):
    """Code segmenter for Lua."""

    def get_language(self) -> "Language":
        """Return the tree-sitter grammar registered under the name ``lua``."""
        # Function-scope import: tree_sitter_languages is an optional dependency.
        import tree_sitter_languages

        return tree_sitter_languages.get_language("lua")

    def get_chunk_query(self) -> str:
        """Return the module-level query whose captures delimit chunks."""
        return CHUNK_QUERY

    def make_line_comment(self, text: str) -> str:
        """Render ``text`` as a ``--`` line comment."""
        marker = "--"
        return f"{marker} {text}"
|
@@ -0,0 +1,30 @@
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501
|
||||
TreeSitterSegmenter,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from tree_sitter import Language
|
||||
|
||||
|
||||
# Tree-sitter query marking the Perl constructs that are split out as chunks:
# function definitions, captured as ``@subroutine``.
CHUNK_QUERY = """
    [
        (function_definition) @subroutine
    ]
""".strip()


class PerlSegmenter(TreeSitterSegmenter):
    """Code segmenter for Perl."""

    def get_language(self) -> "Language":
        """Return the tree-sitter grammar registered under the name ``perl``."""
        # Function-scope import: tree_sitter_languages is an optional dependency.
        import tree_sitter_languages

        return tree_sitter_languages.get_language("perl")

    def get_chunk_query(self) -> str:
        """Return the module-level query whose captures delimit chunks."""
        return CHUNK_QUERY

    def make_line_comment(self, text: str) -> str:
        """Render ``text`` as a ``#`` line comment."""
        marker = "#"
        return f"{marker} {text}"
|
@@ -0,0 +1,32 @@
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501
|
||||
TreeSitterSegmenter,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from tree_sitter import Language
|
||||
|
||||
|
||||
# Tree-sitter query marking the Ruby constructs that are split out as chunks:
# methods, modules and classes.
CHUNK_QUERY = """
    [
        (method) @method
        (module) @module
        (class) @class
    ]
""".strip()


class RubySegmenter(TreeSitterSegmenter):
    """Code segmenter for Ruby."""

    def get_language(self) -> "Language":
        """Return the tree-sitter grammar registered under the name ``ruby``."""
        # Function-scope import: tree_sitter_languages is an optional dependency.
        import tree_sitter_languages

        return tree_sitter_languages.get_language("ruby")

    def get_chunk_query(self) -> str:
        """Return the module-level query whose captures delimit chunks."""
        return CHUNK_QUERY

    def make_line_comment(self, text: str) -> str:
        """Render ``text`` as a ``#`` line comment."""
        marker = "#"
        return f"{marker} {text}"
|
@@ -0,0 +1,34 @@
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501
|
||||
TreeSitterSegmenter,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from tree_sitter import Language
|
||||
|
||||
|
||||
# Tree-sitter query marking the Rust constructs that are split out as chunks:
# function items that have an identifier name and a block body, struct items
# and trait items.
CHUNK_QUERY = """
    [
        (function_item
            name: (identifier)
            body: (block)) @function
        (struct_item) @struct
        (trait_item) @trait
    ]
""".strip()


class RustSegmenter(TreeSitterSegmenter):
    """Code segmenter for Rust."""

    def get_language(self) -> "Language":
        """Return the tree-sitter grammar registered under the name ``rust``."""
        # Function-scope import: tree_sitter_languages is an optional dependency.
        import tree_sitter_languages

        return tree_sitter_languages.get_language("rust")

    def get_chunk_query(self) -> str:
        """Return the module-level query whose captures delimit chunks."""
        return CHUNK_QUERY

    def make_line_comment(self, text: str) -> str:
        """Render ``text`` as a ``//`` line comment."""
        marker = "//"
        return f"{marker} {text}"
|
@@ -0,0 +1,33 @@
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501
|
||||
TreeSitterSegmenter,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from tree_sitter import Language
|
||||
|
||||
|
||||
# Tree-sitter query marking the Scala constructs that are split out as chunks:
# class, function, object and trait definitions.
CHUNK_QUERY = """
    [
        (class_definition) @class
        (function_definition) @function
        (object_definition) @object
        (trait_definition) @trait
    ]
""".strip()


class ScalaSegmenter(TreeSitterSegmenter):
    """Code segmenter for Scala."""

    def get_language(self) -> "Language":
        """Return the tree-sitter grammar registered under ``scala``."""
        # Function-scope import: tree_sitter_languages is an optional dependency.
        import tree_sitter_languages

        return tree_sitter_languages.get_language("scala")

    def get_chunk_query(self) -> str:
        """Return the module-level query whose captures delimit chunks."""
        return CHUNK_QUERY

    def make_line_comment(self, text: str) -> str:
        """Render ``text`` as a ``//`` line comment."""
        marker = "//"
        return f"{marker} {text}"
|
@@ -0,0 +1,108 @@
|
||||
from abc import abstractmethod
|
||||
from typing import TYPE_CHECKING, List
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.code_segmenter import (
|
||||
CodeSegmenter,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from tree_sitter import Language, Parser
|
||||
|
||||
|
||||
class TreeSitterSegmenter(CodeSegmenter):
    """Abstract class for `CodeSegmenter`s that use the tree-sitter library.

    Concrete subclasses provide three things: the tree-sitter grammar
    (`get_language`), a query whose captures mark the constructs to split out
    (`get_chunk_query`), and the language's line-comment syntax
    (`make_line_comment`).
    """

    def __init__(self, code: str):
        """Store the source text and check that tree-sitter is importable.

        Args:
            code: Source text to segment.

        Raises:
            ImportError: If `tree_sitter` or `tree_sitter_languages` is not
                installed.
        """
        super().__init__(code)
        self.source_lines = self._split_rows(self.code)

        try:
            import tree_sitter  # noqa: F401
            import tree_sitter_languages  # noqa: F401
        except ImportError as e:
            # Chain the original error so the missing module stays visible.
            raise ImportError(
                "Could not import tree_sitter/tree_sitter_languages Python packages. "
                "Please install them with "
                "`pip install tree-sitter tree-sitter-languages`."
            ) from e

    @staticmethod
    def _split_rows(code: str) -> List[str]:
        """Split `code` into rows that line up with tree-sitter row numbers.

        `str.splitlines()` also breaks on form feed, vertical tab, a lone
        carriage return and several Unicode separators, whereas tree-sitter
        only starts a new row after a line feed. Splitting on ``"\\n"`` keeps
        list indices equal to the rows reported by `start_point`/`end_point`.
        For text that uses only ``\\n`` or ``\\r\\n`` line endings the result
        is identical to `splitlines()`.
        """
        rows = code.split("\n")
        # Like splitlines(), do not report an empty row after a final newline.
        if rows[-1] == "":
            rows.pop()
        # Drop the "\r" left over from "\r\n" line endings.
        return [row[:-1] if row.endswith("\r") else row for row in rows]

    def is_valid(self) -> bool:
        """Return True when parsing the code produces no `ERROR` nodes."""
        error_query = self.get_language().query("(ERROR) @error")
        tree = self.get_parser().parse(bytes(self.code, encoding="UTF-8"))

        return len(error_query.captures(tree.root_node)) == 0

    def _select_chunk_nodes(self) -> list:
        """Return the captured nodes to treat as chunks, in capture order.

        A node is skipped when any row it spans is already covered by a node
        selected earlier, so the returned nodes never share a row.
        """
        query = self.get_language().query(self.get_chunk_query())
        tree = self.get_parser().parse(bytes(self.code, encoding="UTF-8"))

        claimed_rows = set()
        selected = []
        for node, _capture_name in query.captures(tree.root_node):
            rows = range(node.start_point[0], node.end_point[0] + 1)
            if not claimed_rows.isdisjoint(rows):
                continue

            claimed_rows.update(rows)
            selected.append(node)

        return selected

    def extract_functions_classes(self) -> List[str]:
        """Return the source text of every selected chunk node."""
        return [node.text.decode("UTF-8") for node in self._select_chunk_nodes()]

    def simplify_code(self) -> str:
        """Return the code with each chunk collapsed to one comment line.

        The first row of every selected chunk is replaced by
        ``make_line_comment("Code for: <original first row>")`` and the
        remaining rows of the chunk are removed. All other rows are kept.
        """
        simplified_lines = self.source_lines[:]
        row_count = len(simplified_lines)

        for node in self._select_chunk_nodes():
            start_line = node.start_point[0]
            end_line = node.end_point[0]

            simplified_lines[start_line] = self.make_line_comment(
                f"Code for: {self.source_lines[start_line]}"
            )

            # Clamp to the stored rows: a node that ends right after the final
            # newline reports an end row that `source_lines` does not hold.
            for line_num in range(start_line + 1, min(end_line + 1, row_count)):
                simplified_lines[line_num] = None  # type: ignore

        return "\n".join(line for line in simplified_lines if line is not None)

    def get_parser(self) -> "Parser":
        """Return a new tree-sitter parser configured with `get_language()`."""
        from tree_sitter import Parser

        parser = Parser()
        parser.set_language(self.get_language())
        return parser

    @abstractmethod
    def get_language(self) -> "Language":
        """Return the tree-sitter grammar for the language being segmented."""
        raise NotImplementedError()  # pragma: no cover

    @abstractmethod
    def get_chunk_query(self) -> str:
        """Return the tree-sitter query whose captures mark the chunks."""
        raise NotImplementedError()  # pragma: no cover

    @abstractmethod
    def make_line_comment(self, text: str) -> str:
        """Return `text` formatted as a line comment in the target language."""
        raise NotImplementedError()  # pragma: no cover
|
@@ -0,0 +1,33 @@
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.tree_sitter_segmenter import ( # noqa: E501
|
||||
TreeSitterSegmenter,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from tree_sitter import Language
|
||||
|
||||
|
||||
# Tree-sitter query marking the TypeScript constructs that are split out as
# chunks: function, class, interface and enum declarations.
CHUNK_QUERY = """
    [
        (function_declaration) @function
        (class_declaration) @class
        (interface_declaration) @interface
        (enum_declaration) @enum
    ]
""".strip()


class TypeScriptSegmenter(TreeSitterSegmenter):
    """Code segmenter for TypeScript."""

    def get_language(self) -> "Language":
        """Return the tree-sitter grammar registered under ``typescript``."""
        # Function-scope import: tree_sitter_languages is an optional dependency.
        import tree_sitter_languages

        return tree_sitter_languages.get_language("typescript")

    def get_chunk_query(self) -> str:
        """Return the module-level query whose captures delimit chunks."""
        return CHUNK_QUERY

    def make_line_comment(self, text: str) -> str:
        """Render ``text`` as a ``//`` line comment."""
        marker = "//"
        return f"{marker} {text}"
|
170
libs/community/poetry.lock
generated
170
libs/community/poetry.lock
generated
@@ -1,4 +1,4 @@
|
||||
# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
|
||||
# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "aenum"
|
||||
@@ -3140,6 +3140,7 @@ files = [
|
||||
{file = "jq-1.6.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:227b178b22a7f91ae88525810441791b1ca1fc71c86f03190911793be15cec3d"},
|
||||
{file = "jq-1.6.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:780eb6383fbae12afa819ef676fc93e1548ae4b076c004a393af26a04b460742"},
|
||||
{file = "jq-1.6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:08ded6467f4ef89fec35b2bf310f210f8cd13fbd9d80e521500889edf8d22441"},
|
||||
{file = "jq-1.6.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:49e44ed677713f4115bd5bf2dbae23baa4cd503be350e12a1c1f506b0687848f"},
|
||||
{file = "jq-1.6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:984f33862af285ad3e41e23179ac4795f1701822473e1a26bf87ff023e5a89ea"},
|
||||
{file = "jq-1.6.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f42264fafc6166efb5611b5d4cb01058887d050a6c19334f6a3f8a13bb369df5"},
|
||||
{file = "jq-1.6.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a67154f150aaf76cc1294032ed588436eb002097dd4fd1e283824bf753a05080"},
|
||||
@@ -3650,7 +3651,7 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "langchain-core"
|
||||
version = "0.1.21"
|
||||
version = "0.1.22"
|
||||
description = "Building applications with LLMs through composability"
|
||||
optional = false
|
||||
python-versions = ">=3.8.1,<4.0"
|
||||
@@ -5968,7 +5969,6 @@ files = [
|
||||
{file = "pymongo-4.6.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b8729dbf25eb32ad0dc0b9bd5e6a0d0b7e5c2dc8ec06ad171088e1896b522a74"},
|
||||
{file = "pymongo-4.6.1-cp312-cp312-win32.whl", hash = "sha256:3177f783ae7e08aaf7b2802e0df4e4b13903520e8380915e6337cdc7a6ff01d8"},
|
||||
{file = "pymongo-4.6.1-cp312-cp312-win_amd64.whl", hash = "sha256:00c199e1c593e2c8b033136d7a08f0c376452bac8a896c923fcd6f419e07bdd2"},
|
||||
{file = "pymongo-4.6.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:6dcc95f4bb9ed793714b43f4f23a7b0c57e4ef47414162297d6f650213512c19"},
|
||||
{file = "pymongo-4.6.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:13552ca505366df74e3e2f0a4f27c363928f3dff0eef9f281eb81af7f29bc3c5"},
|
||||
{file = "pymongo-4.6.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:77e0df59b1a4994ad30c6d746992ae887f9756a43fc25dec2db515d94cf0222d"},
|
||||
{file = "pymongo-4.6.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:3a7f02a58a0c2912734105e05dedbee4f7507e6f1bd132ebad520be0b11d46fd"},
|
||||
@@ -6508,7 +6508,6 @@ files = [
|
||||
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
|
||||
@@ -8123,6 +8122,165 @@ files = [
|
||||
docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"]
|
||||
test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0,<7.5)", "pytest-mock", "pytest-mypy-testing"]
|
||||
|
||||
[[package]]
|
||||
name = "tree-sitter"
|
||||
version = "0.20.4"
|
||||
description = "Python bindings for the Tree-Sitter parsing library"
|
||||
optional = true
|
||||
python-versions = ">=3.3"
|
||||
files = [
|
||||
{file = "tree_sitter-0.20.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:c259b9bcb596e54f54713eb3951226fc834d65289940f4bfdcdf519f08e8e876"},
|
||||
{file = "tree_sitter-0.20.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:88da7e2e4c69881cd63916cc24ae0b809f96aae331da45b418ae6b2d1ed2ca19"},
|
||||
{file = "tree_sitter-0.20.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:66a68b156ba131e9d8dff4a1f72037f4b368cc50c58f18905a91743ae1d1c795"},
|
||||
{file = "tree_sitter-0.20.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae28e25d551f406807011487bdfb9728041e656b30b554fa7f3391ab64ed69f9"},
|
||||
{file = "tree_sitter-0.20.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36b10c9c69e825ba65cf9b0f77668bf33e70d2a5764b64ad6f133f8cc9220f09"},
|
||||
{file = "tree_sitter-0.20.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:7c18c64ddd44b75b7e1660b9793753eda427e4b145b6216d4b2d2e9b200c74f2"},
|
||||
{file = "tree_sitter-0.20.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:e9e9e594bbefb76ad9ea256f5c87eba7591b4758854d3df83ce4df415933a006"},
|
||||
{file = "tree_sitter-0.20.4-cp310-cp310-win32.whl", hash = "sha256:b4755229dc18644fe48bcab974bde09b171fcb6ef625d3cb5ece5c6198f4223e"},
|
||||
{file = "tree_sitter-0.20.4-cp310-cp310-win_amd64.whl", hash = "sha256:f792684cee8a46d9194d9f4223810e54ccc704470c5777538d59fbde0a4c91bf"},
|
||||
{file = "tree_sitter-0.20.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9d22ee75f45836554ee6a11e50dd8f9827941e67c49fce9a0790245b899811a9"},
|
||||
{file = "tree_sitter-0.20.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2a0ffd76dd991ba745bb5d0ba1d583bec85726d3ddef8c9685dc8636a619adde"},
|
||||
{file = "tree_sitter-0.20.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:060d4e5b803be0975f1ac46e54a292eab0701296ccd912f6cdac3f7331e29143"},
|
||||
{file = "tree_sitter-0.20.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:822e02366dbf223697b2b56b8f91aa5b60571f9fe7c998988a381db1c69604e9"},
|
||||
{file = "tree_sitter-0.20.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:527ca72c6a8f60fa719af37fa86f58b7ad0e07b8f74d1c1c7e926c5c888a7e6b"},
|
||||
{file = "tree_sitter-0.20.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a418ca71309ea7052e076f08d623f33f58eae01a8e8cdc1e6d3a01b5b8ddebfe"},
|
||||
{file = "tree_sitter-0.20.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:08c3ba2561b61a83c28ca06a0bce2a5ffcfb6b39f9d27a45e5ebd9cad2bedb7f"},
|
||||
{file = "tree_sitter-0.20.4-cp311-cp311-win32.whl", hash = "sha256:8d04c75a389b2de94952d602264852acff8cd3ed1ccf8a2492a080973d5ddd58"},
|
||||
{file = "tree_sitter-0.20.4-cp311-cp311-win_amd64.whl", hash = "sha256:ba9215c0e7529d9eb370528e5d99b7389d14a7eae94f07d14fa9dab18f267c62"},
|
||||
{file = "tree_sitter-0.20.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:c4c1af5ed4306071d30970c83ec882520a7bf5d8053996dbc4aa5c59238d4990"},
|
||||
{file = "tree_sitter-0.20.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9d70bfa550cf22c9cea9b3c0d18b889fc4f2a7e9dcf1d6cc93f49fa9d4a94954"},
|
||||
{file = "tree_sitter-0.20.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6de537bca0641775d8d175d37303d54998980fc0d997dd9aa89e16b415bf0cc3"},
|
||||
{file = "tree_sitter-0.20.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9b1c0f8c0e3e50267566f5116cdceedf4e23e8c08b55ef3becbe954a11b16e84"},
|
||||
{file = "tree_sitter-0.20.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20ef2ee6d9bb8e21713949e5ff769ed670fe1217f95b7eeb6c675788438c1e6e"},
|
||||
{file = "tree_sitter-0.20.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:b6fd1c881ab0de5faa67168db2d001eee32be5482cb4e0b21b217689a05b6fe4"},
|
||||
{file = "tree_sitter-0.20.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:bf47047420021d50aec529cb66387c90562350b499ddf56ecef1fc8255439e30"},
|
||||
{file = "tree_sitter-0.20.4-cp312-cp312-win32.whl", hash = "sha256:c16b48378041fc9702b6aa3480f2ffa49ca8ea58141a862acd569e5a0679655f"},
|
||||
{file = "tree_sitter-0.20.4-cp312-cp312-win_amd64.whl", hash = "sha256:973e871167079a1b1d7304d361449253efbe2a6974728ad563cf407bd02ddccb"},
|
||||
{file = "tree_sitter-0.20.4-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:9d33a55598dd18a4d8b869a3417de82a4812c3a7dc7e61cb025ece3e9c3e4e96"},
|
||||
{file = "tree_sitter-0.20.4-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7cee6955c2c97fc5927a41c7a8b06647c4b4d9b99b8a1581bf1183435c8cec3e"},
|
||||
{file = "tree_sitter-0.20.4-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5022bea67e479ad212be7c05b983a72e297a013efb4e8ea5b5b4d7da79a9fdef"},
|
||||
{file = "tree_sitter-0.20.4-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:640f60a5b966f0990338f1bf559455c3dcb822bc4329d82b3d42f32a48374dfe"},
|
||||
{file = "tree_sitter-0.20.4-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:0e83f641fe6f27d91bd4d259fff5d35de1567d3f581b9efe9bbd5be50fe4ddc7"},
|
||||
{file = "tree_sitter-0.20.4-cp36-cp36m-win32.whl", hash = "sha256:ce6a85027c66fa3f09d482cc6d41927ea40955f7f33b86aedd26dd932709a2c9"},
|
||||
{file = "tree_sitter-0.20.4-cp36-cp36m-win_amd64.whl", hash = "sha256:fe10779347a6c067af29cb37fd4b75fa96c5cb68f587cc9530b70fe3f2a51a55"},
|
||||
{file = "tree_sitter-0.20.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:28d5f84e34e276887e3a240b60906ca7e2b51e975f3145c3149ceed977a69508"},
|
||||
{file = "tree_sitter-0.20.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c913b65cbe10996116988ac436748f24883b5097e58274223e89bb2c5d1bb1a"},
|
||||
{file = "tree_sitter-0.20.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ecaed46241e071752195a628bb97d2b740f2fde9e34f8a74456a4ea8bb26df88"},
|
||||
{file = "tree_sitter-0.20.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:b641e88a97eab002a1736d93ef5a4beac90ea4fd6e25affd1831319b99f456c9"},
|
||||
{file = "tree_sitter-0.20.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:327c40f439c6155e4eee54c4657e4701a04f5f4816d9defdcb836bf65bf83d21"},
|
||||
{file = "tree_sitter-0.20.4-cp37-cp37m-win32.whl", hash = "sha256:1b7c1d95f006b3de42fbf4045bd00c273d113e372fcb6a5378e74ed120c12032"},
|
||||
{file = "tree_sitter-0.20.4-cp37-cp37m-win_amd64.whl", hash = "sha256:6140d037239a41046f5d34fba5e0374ee697adb4b48b90579c618b5402781c11"},
|
||||
{file = "tree_sitter-0.20.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:f42fd1104efaad8151370f1936e2a488b7337a5d24544a9ab59ba4c4010b1272"},
|
||||
{file = "tree_sitter-0.20.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7859717c5d62ee386b3d036cab8ed0f88f8c027b6b4ae476a55a8c5fb8aab713"},
|
||||
{file = "tree_sitter-0.20.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:fdd361fe1cc68db68b4d85165641275e34b86cc26b2bab932790204fa14824dc"},
|
||||
{file = "tree_sitter-0.20.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9b8d7539075606027b67764543463ff2bc4e52f4158ef6dc419c9f5625aa5383"},
|
||||
{file = "tree_sitter-0.20.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78e76307f05aca6cde72f3307b4d53701f34ae45f2248ceb83d1626051e201fd"},
|
||||
{file = "tree_sitter-0.20.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:dd8c352f4577f61098d06cf3feb7fd214259f41b5036b81003860ed54d16b448"},
|
||||
{file = "tree_sitter-0.20.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:281f3e5382d1bd7fccc88d1afe68c915565bc24f8b8dd4844079d46c7815b8a7"},
|
||||
{file = "tree_sitter-0.20.4-cp38-cp38-win32.whl", hash = "sha256:6a77ac3cdcddd80cdd1fd394318bff99f94f37e08d235aaefccb87e1224946e5"},
|
||||
{file = "tree_sitter-0.20.4-cp38-cp38-win_amd64.whl", hash = "sha256:8eee8adf54033dc48eab84b040f4d7b32355a964c4ae0aae5dfbdc4dbc3364ca"},
|
||||
{file = "tree_sitter-0.20.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e89f6508e30fce05e2c724725d022db30d877817b9d64f933506ffb3a3f4a2c2"},
|
||||
{file = "tree_sitter-0.20.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7fb6286bb1fae663c45ff0700ec88fb9b50a81eed2bae8a291f95fcf8cc19547"},
|
||||
{file = "tree_sitter-0.20.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:11e93f8b4bbae04070416a82257a7ab2eb0afb76e093ae3ea73bd63b792f6846"},
|
||||
{file = "tree_sitter-0.20.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8250725c5f78929aeb2c71db5dca76f1ef448389ca16f9439161f90978bb8478"},
|
||||
{file = "tree_sitter-0.20.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d404a8ca9de9b0843844f0cd4d423f46bc46375ab8afb63b1d8ec01201457ac8"},
|
||||
{file = "tree_sitter-0.20.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0f2422c9ee70ba972dfc3943746e6cf7fc03725a866908950245bda9ccfc7301"},
|
||||
{file = "tree_sitter-0.20.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:21a937942e4729abbe778a609d2c218574436cb351c36fba89ef3c8c6066ec78"},
|
||||
{file = "tree_sitter-0.20.4-cp39-cp39-win32.whl", hash = "sha256:427a9a39360cc1816e28f8182550e478e4ba983595a2565ab9dfe32ea1b03fd7"},
|
||||
{file = "tree_sitter-0.20.4-cp39-cp39-win_amd64.whl", hash = "sha256:7095bb9aff297fa9c6026bf8914fd295997d714d1a6ee9a1edf7282c772f9f64"},
|
||||
{file = "tree_sitter-0.20.4-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:859260b90f0e3867ae840e39f54e830f607b3bc531bc21deeeeaa8a30cbb89ad"},
|
||||
{file = "tree_sitter-0.20.4-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0dfc14be73cf46126660a3aecdd0396e69562ad1a902245225ca7bd29649594e"},
|
||||
{file = "tree_sitter-0.20.4-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ec46355bf3ff23f54d5e365871ffd3e05cfbc65d1b36a8be7c0bcbda30a1d43"},
|
||||
{file = "tree_sitter-0.20.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d933a942fde39876b99c36f12aa3764e4a555ae9366c10ce6cca8c16341c1bbf"},
|
||||
{file = "tree_sitter-0.20.4-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:a7eec3b55135fe851a38fa248c9fd75fc3d58ceb6e1865b795e416e4d598c2a1"},
|
||||
{file = "tree_sitter-0.20.4-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dfc76225529ee14a53e84413480ce81ec3c44eaa0455c140e961c90ac3118ead"},
|
||||
{file = "tree_sitter-0.20.4-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ccf0396e47efffc0b528959a8f2e2346a98297579f867e9e1834c2aad4be829c"},
|
||||
{file = "tree_sitter-0.20.4-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:a15fbabd3bc8e29c48289c156d743e69f5ec72bb125cf44f7adbdaa1937c3da6"},
|
||||
{file = "tree_sitter-0.20.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:36f8adf2126f496cf376b6e4b707cba061c25beb17841727eef6f0e083e53e1f"},
|
||||
{file = "tree_sitter-0.20.4-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:841efb40c116ab0a066924925409a8a4dcffeb39a151c0b2a1c2abe56ad4fb42"},
|
||||
{file = "tree_sitter-0.20.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2051e8a70fd8426f27a43dad71d11929a62ce30a9b1eb65bba0ed79e82481592"},
|
||||
{file = "tree_sitter-0.20.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:99a3c2824d4cfcffd9f961176891426bde2cb36ece5280c61480be93319c23c4"},
|
||||
{file = "tree_sitter-0.20.4-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:72830dc85a10430eca3d56739b7efcd7a05459c8d425f08c1aee6179ab7f13a9"},
|
||||
{file = "tree_sitter-0.20.4-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4992dd226055b6cd0a4f5661c66b799a73d3eff716302e0f7ab06594ee12d49f"},
|
||||
{file = "tree_sitter-0.20.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a66d95bbf92175cdc295d6d77f330942811f02e3aaf3fc64431cb749683b2f7d"},
|
||||
{file = "tree_sitter-0.20.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a25b1087e4f7825b2458dacf5f4b0be2938f78e850e822edca1ff4994b56081a"},
|
||||
{file = "tree_sitter-0.20.4.tar.gz", hash = "sha256:6adb123e2f3e56399bbf2359924633c882cc40ee8344885200bca0922f713be5"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
setuptools = {version = ">=60.0.0", markers = "python_version >= \"3.12\""}
|
||||
|
||||
[[package]]
|
||||
name = "tree-sitter-languages"
|
||||
version = "1.10.2"
|
||||
description = "Binary Python wheels for all tree sitter languages."
|
||||
optional = true
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "tree_sitter_languages-1.10.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5580348f0b20233b1d5431fa178ccd3d07423ca4a3275df02a44608fd72344b9"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:103c7466644486b1e9e03850df46fc6aa12f13ca636c74f173270276220ac80b"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d13db84511c6f1a7dc40383b66deafa74dabd8b877e3d65ab253f3719eccafd6"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57adfa32be7e465b54aa72f915f6c78a2b66b227df4f656b5d4fbd1ca7a92b3f"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c6385e033e460ceb8f33f3f940335f422ef2b763700a04f0089391a68b56153"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:dfa3f38cc5381c5aba01dd7494f59b8a9050e82ff6e06e1233e3a0cbae297e3c"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:9f195155acf47f8bc5de7cee46ecd07b2f5697f007ba89435b51ef4c0b953ea5"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2de330e2ac6d7426ca025a3ec0f10d5640c3682c1d0c7702e812dcfb44b58120"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp310-cp310-win32.whl", hash = "sha256:c9731cf745f135d9770eeba9bb4e2ff4dabc107b5ae9b8211e919f6b9100ea6d"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp310-cp310-win_amd64.whl", hash = "sha256:6dd75851c41d0c3c4987a9b7692d90fa8848706c23115669d8224ffd6571e357"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7eb7d7542b2091c875fe52719209631fca36f8c10fa66970d2c576ae6a1b8289"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6b41bcb00974b1c8a1800c7f1bb476a1d15a0463e760ee24872f2d53b08ee424"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6f370cd7845c6c81df05680d5bd96db8a99d32b56f4728c5d05978911130a853"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a1dc195c88ef4c72607e112a809a69190e096a2e5ebc6201548b3e05fdd169ad"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ae34ac314a7170be24998a0f994c1ac80761d8d4bd126af27ee53a023d3b849"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:01b5742d5f5bd675489486b582bd482215880b26dde042c067f8265a6e925d9c"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:ab1cbc46244d34fd16f21edaa20231b2a57f09f092a06ee3d469f3117e6eb954"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0b1149e7467a4e92b8a70e6005fe762f880f493cf811fc003554b29f04f5e7c8"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp311-cp311-win32.whl", hash = "sha256:049276343962f4696390ee555acc2c1a65873270c66a6cbe5cb0bca83bcdf3c6"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp311-cp311-win_amd64.whl", hash = "sha256:7f3fdd468a577f04db3b63454d939e26e360229b53c80361920aa1ebf2cd7491"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c0f4c8b2734c45859edc7fcaaeaab97a074114111b5ba51ab4ec7ed52104763c"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:eecd3c1244ac3425b7a82ba9125b4ddb45d953bbe61de114c0334fd89b7fe782"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15db3c8510bc39a80147ee7421bf4782c15c09581c1dc2237ea89cefbd95b846"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:92c6487a6feea683154d3e06e6db68c30e0ae749a7ce4ce90b9e4e46b78c85c7"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d2f1cd1d1bdd65332f9c2b67d49dcf148cf1ded752851d159ac3e5ee4f4d260"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:976c8039165b8e12f17a01ddee9f4e23ec6e352b165ad29b44d2bf04e2fbe77e"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:dafbbdf16bf668a580902e1620f4baa1913e79438abcce721a50647564c687b9"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1aeabd3d60d6d276b73cd8f3739d595b1299d123cc079a317f1a5b3c5461e2ca"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp312-cp312-win32.whl", hash = "sha256:fab8ee641914098e8933b87ea3d657bea4dd00723c1ee7038b847b12eeeef4f5"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp312-cp312-win_amd64.whl", hash = "sha256:5e606430d736367e5787fa5a7a0c5a1ec9b85eded0b3596bbc0d83532a40810b"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:838d5b48a7ed7a17658721952c77fda4570d2a069f933502653b17e15a9c39c9"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:987b3c71b1d278c2889e018ee77b8ee05c384e2e3334dec798f8b611c4ab2d1e"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:faa00abcb2c819027df58472da055d22fa7dfcb77c77413d8500c32ebe24d38b"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e102fbbf02322d9201a86a814e79a9734ac80679fdb9682144479044f401a73"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:8f0b87cf1a7b03174ba18dfd81582be82bfed26803aebfe222bd20e444aba003"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c0f1b9af9cb67f0b942b020da9fdd000aad5e92f2383ae0ba7a330b318d31912"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:5a4076c921f7a4d31e643843de7dfe040b65b63a238a5aa8d31d93aabe6572aa"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp37-cp37m-win32.whl", hash = "sha256:fa6391a3a5d83d32db80815161237b67d70576f090ce5f38339206e917a6f8bd"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp37-cp37m-win_amd64.whl", hash = "sha256:55649d3f254585a064121513627cf9788c1cfdadbc5f097f33d5ba750685a4c0"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6f85d1edaa2d22d80d4ea5b6d12b95cf3644017b6c227d0d42854439e02e8893"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:d78feed4a764ef3141cb54bf00fe94d514d8b6e26e09423e23b4c616fcb7938c"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da1aca27531f9dd5308637d76643372856f0f65d0d28677d1bcf4211e8ed1ad0"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1031ea440dafb72237437d754eff8940153a3b051e3d18932ac25e75ce060a15"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99d3249beaef2c9fe558ecc9a97853c260433a849dcc68266d9770d196c2e102"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:59a4450f262a55148fb7e68681522f0c2a2f6b7d89666312a2b32708d8f416e1"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ce74eab0e430370d5e15a96b6c6205f93405c177a8b2e71e1526643b2fb9bab1"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:9b4dd2b6b3d24c85dffe33d6c343448869eaf4f41c19ddba662eb5d65d8808f4"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp38-cp38-win32.whl", hash = "sha256:92d734fb968fe3927a7596d9f0459f81a8fa7b07e16569476b28e27d0d753348"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp38-cp38-win_amd64.whl", hash = "sha256:46a13f7d38f2eeb75f7cf127d1201346093748c270d686131f0cbc50e42870a1"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f8c6a936ae99fdd8857e91f86c11c2f5e507ff30631d141d98132bb7ab2c8638"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c283a61423f49cdfa7b5a5dfbb39221e3bd126fca33479cd80749d4d7a6b7349"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76e60be6bdcff923386a54a5edcb6ff33fc38ab0118636a762024fa2bc98de55"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c00069f9575bd831eabcce2cdfab158dde1ed151e7e5614c2d985ff7d78a7de1"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:475ff53203d8a43ccb19bb322fa2fb200d764001cc037793f1fadd714bb343da"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:26fe7c9c412e4141dea87ea4b3592fd12e385465b5bdab106b0d5125754d4f60"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:8fed27319957458340f24fe14daad467cd45021da034eef583519f83113a8c5e"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3657a491a7f96cc75a3568ddd062d25f3be82b6a942c68801a7b226ff7130181"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp39-cp39-win32.whl", hash = "sha256:33f7d584d01a7a3c893072f34cfc64ec031f3cfe57eebc32da2f8ac046e101a7"},
|
||||
{file = "tree_sitter_languages-1.10.2-cp39-cp39-win_amd64.whl", hash = "sha256:1b944af3ee729fa70fc8ae82224a9ff597cdb63addea084e0ea2fa2b0ec39bb7"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
tree-sitter = "*"
|
||||
|
||||
[[package]]
|
||||
name = "typer"
|
||||
version = "0.9.0"
|
||||
@@ -8998,9 +9156,9 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
|
||||
|
||||
[extras]
|
||||
cli = ["typer"]
|
||||
extended-testing = ["aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "azure-ai-documentintelligence", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "cohere", "databricks-vectorsearch", "datasets", "dgml-utils", "elasticsearch", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "geopandas", "gitpython", "google-cloud-documentai", "gql", "gradientai", "hdbcli", "hologres-vector", "html2text", "httpx", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "nvidia-riva-client", "oci", "openai", "openapi-pydantic", "oracle-ads", "pandas", "pdfminer-six", "pgvector", "praw", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "rdflib", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "upstash-redis", "xata", "xmltodict", "zhipuai"]
|
||||
extended-testing = ["aiosqlite", "aleph-alpha-client", "anthropic", "arxiv", "assemblyai", "atlassian-python-api", "azure-ai-documentintelligence", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "cohere", "databricks-vectorsearch", "datasets", "dgml-utils", "elasticsearch", "esprima", "faiss-cpu", "feedparser", "fireworks-ai", "geopandas", "gitpython", "google-cloud-documentai", "gql", "gradientai", "hdbcli", "hologres-vector", "html2text", "httpx", "javelin-sdk", "jinja2", "jq", "jsonschema", "lxml", "markdownify", "motor", "msal", "mwparserfromhell", "mwxml", "newspaper3k", "numexpr", "nvidia-riva-client", "oci", "openai", "openapi-pydantic", "oracle-ads", "pandas", "pdfminer-six", "pgvector", "praw", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "rapidocr-onnxruntime", "rdflib", "requests-toolbelt", "rspace_client", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "timescale-vector", "tqdm", "tree-sitter", "tree-sitter-languages", "upstash-redis", "xata", "xmltodict", "zhipuai"]
|
||||
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = ">=3.8.1,<4.0"
|
||||
content-hash = "fe633ab7d246239420a26bebf8bcf857edcf0778f75b3eb2a4b1314cb13645c8"
|
||||
content-hash = "e98000541a4991b1d41c9e995a4153ca24745e880afe75af6516574e3fb8b4a2"
|
||||
|
@@ -83,6 +83,8 @@ msal = {version = "^1.25.0", optional = true}
|
||||
databricks-vectorsearch = {version = "^0.21", optional = true}
|
||||
dgml-utils = {version = "^0.3.0", optional = true}
|
||||
datasets = {version = "^2.15.0", optional = true}
|
||||
tree-sitter = {version = "^0.20.2", optional = true}
|
||||
tree-sitter-languages = {version = "^1.8.0", optional = true}
|
||||
azure-ai-documentintelligence = {version = "^1.0.0b1", optional = true}
|
||||
oracle-ads = {version = "^2.9.1", optional = true}
|
||||
zhipuai = {version = "^1.0.7", optional = true}
|
||||
@@ -177,7 +179,6 @@ setuptools = "^67.6.1"
|
||||
langchain-core = {path = "../core", develop = true}
|
||||
|
||||
[tool.poetry.extras]
|
||||
|
||||
cli = ["typer"]
|
||||
|
||||
# An extra used to be able to add extended testing.
|
||||
@@ -249,6 +250,8 @@ extended_testing = [
|
||||
"databricks-vectorsearch",
|
||||
"dgml-utils",
|
||||
"cohere",
|
||||
"tree-sitter",
|
||||
"tree-sitter-languages",
|
||||
"azure-ai-documentintelligence",
|
||||
"oracle-ads",
|
||||
"zhipuai",
|
||||
|
@@ -0,0 +1,53 @@
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.c import CSegmenter
|
||||
|
||||
|
||||
@pytest.mark.requires("tree_sitter", "tree_sitter_languages")
class TestCSegmenter(unittest.TestCase):
    """Check ``CSegmenter`` against a small C snippet.

    The snippet holds one function, one struct, one union and one enum; the
    expectations below list what the segmenter should report for each.
    """

    def setUp(self) -> None:
        """Build the C fixture and the outputs expected from the segmenter."""
        # NOTE(review): the bodies inside the fixture are indented by exactly
        # the amount spelled out in ``expected_extracted_code``. The two must
        # agree character for character because the extraction test compares
        # them with assertEqual -- confirm the indent width against upstream.
        self.example_code = """int main() {
 return 0;
}

struct S {
};

union U {
};

enum Evens {
 Two = 2,
 Four = 4
};"""

        # One "// Code for:" marker per definition, separated by blank lines.
        self.expected_simplified_code = """// Code for: int main() {

// Code for: struct S {

// Code for: union U {

// Code for: enum Evens {"""

        # The expected chunks stop at the closing brace; the trailing ";" of
        # struct/union/enum declarations is not part of them.
        self.expected_extracted_code = [
            "int main() {\n return 0;\n}",
            "struct S {\n}",
            "union U {\n}",
            "enum Evens {\n Two = 2,\n Four = 4\n}",
        ]

    def test_is_valid(self) -> None:
        """A declaration is accepted; a run of bare tokens is rejected."""
        self.assertTrue(CSegmenter("int a;").is_valid())
        self.assertFalse(CSegmenter("a b c 1 2 3").is_valid())

    def test_extract_functions_classes(self) -> None:
        """Each top-level definition is returned as its own chunk, in order."""
        segmenter = CSegmenter(self.example_code)
        extracted_code = segmenter.extract_functions_classes()
        self.assertEqual(extracted_code, self.expected_extracted_code)

    def test_simplify_code(self) -> None:
        """Definitions collapse to single ``// Code for:`` marker lines."""
        segmenter = CSegmenter(self.example_code)
        simplified_code = segmenter.simplify_code()
        self.assertEqual(simplified_code, self.expected_simplified_code)
|
@@ -0,0 +1,63 @@
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.cpp import CPPSegmenter
|
||||
|
||||
|
||||
@pytest.mark.requires("tree_sitter", "tree_sitter_languages")
class TestCPPSegmenter(unittest.TestCase):
    """Check ``CPPSegmenter`` against a small C++ snippet.

    The snippet holds a free function, a class with a declared and a templated
    member, a struct, a union and an out-of-class member definition.
    """

    def setUp(self) -> None:
        """Build the C++ fixture and the outputs expected from the segmenter."""
        # NOTE(review): the bodies inside the fixture are indented by exactly
        # the amount spelled out in ``expected_extracted_code``. The two must
        # agree character for character because the extraction test compares
        # them with assertEqual -- confirm the indent width against upstream.
        self.example_code = """int foo() {
 return 1;
}

class T {
 auto bar() const -> int;
 template<class U>
 void baz(U) {
 }
};

struct S {
};

union U {
};

auto T::bar() const -> int {
 return 1;
}"""

        # One "// Code for:" marker per definition, separated by blank lines.
        self.expected_simplified_code = """// Code for: int foo() {

// Code for: class T {

// Code for: struct S {

// Code for: union U {

// Code for: auto T::bar() const -> int {"""

        # The expected chunks stop at the closing brace; the trailing ";" of
        # class/struct/union declarations is not part of them.
        self.expected_extracted_code = [
            "int foo() {\n return 1;\n}",
            "class T {\n auto bar() const -> int;\n "
            "template<class U>\n void baz(U) {\n }\n}",
            "struct S {\n}",
            "union U {\n}",
            "auto T::bar() const -> int {\n return 1;\n}",
        ]

    def test_is_valid(self) -> None:
        """A declaration is accepted; a run of bare tokens is rejected."""
        self.assertTrue(CPPSegmenter("int a;").is_valid())
        self.assertFalse(CPPSegmenter("a b c 1 2 3").is_valid())

    def test_extract_functions_classes(self) -> None:
        """Each top-level definition is returned as its own chunk, in order."""
        segmenter = CPPSegmenter(self.example_code)
        extracted_code = segmenter.extract_functions_classes()
        self.assertEqual(extracted_code, self.expected_extracted_code)

    def test_simplify_code(self) -> None:
        """Definitions collapse to single ``// Code for:`` marker lines."""
        segmenter = CPPSegmenter(self.example_code)
        simplified_code = segmenter.simplify_code()
        self.assertEqual(simplified_code, self.expected_simplified_code)
|
@@ -0,0 +1,78 @@
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.csharp import CSharpSegmenter
|
||||
|
||||
|
||||
@pytest.mark.requires("tree_sitter", "tree_sitter_languages")
class TestCSharpSegmenter(unittest.TestCase):
    """Check ``CSharpSegmenter`` against a small C# snippet.

    The snippet holds a namespace, a class, an interface, an enum, a struct
    and a record.
    """

    def setUp(self) -> None:
        """Build the C# fixture and the outputs expected from the segmenter."""
        # NOTE(review): the bodies inside the fixture are indented by exactly
        # the amount spelled out in ``expected_extracted_code``. The two must
        # agree character for character because the extraction test compares
        # them with assertEqual -- confirm the indent width against upstream.
        self.example_code = """namespace World
{
}

class Hello
{
 static void Main(string []args)
 {
 System.Console.WriteLine("Hello, world.");
 }
}

interface Human
{
 void breathe();
}

enum Tens
{
 Ten = 10,
 Twenty = 20
}

struct T
{
}

record Person(string FirstName, string LastName, string Id)
{
 internal string Id { get; init; } = Id;
}"""

        # One "// Code for:" marker per definition, separated by blank lines.
        self.expected_simplified_code = """// Code for: namespace World

// Code for: class Hello

// Code for: interface Human

// Code for: enum Tens

// Code for: struct T

// Code for: record Person(string FirstName, string LastName, string Id)"""

        # Each expected chunk runs from the declaration keyword through the
        # matching closing brace.
        self.expected_extracted_code = [
            "namespace World\n{\n}",
            "class Hello\n{\n static void Main(string []args)\n {\n "
            'System.Console.WriteLine("Hello, world.");\n }\n}',
            "interface Human\n{\n void breathe();\n}",
            "enum Tens\n{\n Ten = 10,\n Twenty = 20\n}",
            "struct T\n{\n}",
            "record Person(string FirstName, string LastName, string Id)\n{\n "
            "internal string Id { get; init; } = Id;\n}",
        ]

    def test_is_valid(self) -> None:
        """A declaration is accepted; a run of bare tokens is rejected."""
        self.assertTrue(CSharpSegmenter("int a;").is_valid())
        self.assertFalse(CSharpSegmenter("a b c 1 2 3").is_valid())

    def test_extract_functions_classes(self) -> None:
        """Each top-level definition is returned as its own chunk, in order."""
        segmenter = CSharpSegmenter(self.example_code)
        extracted_code = segmenter.extract_functions_classes()
        self.assertEqual(extracted_code, self.expected_extracted_code)

    def test_simplify_code(self) -> None:
        """Definitions collapse to single ``// Code for:`` marker lines."""
        segmenter = CSharpSegmenter(self.example_code)
        simplified_code = segmenter.simplify_code()
        self.assertEqual(simplified_code, self.expected_simplified_code)
|
@@ -0,0 +1,50 @@
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.go import GoSegmenter
|
||||
|
||||
|
||||
# Marked as requiring the optional tree_sitter / tree_sitter_languages packages.
@pytest.mark.requires("tree_sitter", "tree_sitter_languages")
class TestGoSegmenter(unittest.TestCase):
    """Tests for GoSegmenter: validity, chunk extraction, simplification."""

    def setUp(self) -> None:
        """Build the Go sample and the outputs the segmenter is expected to give."""
        # Indentation inside the sample matters: the extracted chunks are
        # compared verbatim against expected_extracted_code below.
        self.example_code = """func foo(a int) int {
    return a;
}

type T struct {
    a int
    b bool
    c string
}

type S interface {
    bar() float64
}
"""

        # Each top-level definition is expected to collapse to one comment line
        # carrying the definition's first line.
        self.expected_simplified_code = """// Code for: func foo(a int) int {

// Code for: type T struct {

// Code for: type S interface {"""

        # One entry per top-level definition, in source order.
        self.expected_extracted_code = [
            "func foo(a int) int {\n    return a;\n}",
            "type T struct {\n    a int\n    b bool\n    c string\n}",
            "type S interface {\n    bar() float64\n}",
        ]

    def test_is_valid(self) -> None:
        """A simple declaration is accepted; a bare token run is rejected."""
        self.assertTrue(GoSegmenter("var a int;").is_valid())
        self.assertFalse(GoSegmenter("a b c 1 2 3").is_valid())

    def test_extract_functions_classes(self) -> None:
        """Extraction returns every top-level definition verbatim."""
        segmenter = GoSegmenter(self.example_code)
        extracted_code = segmenter.extract_functions_classes()
        self.assertEqual(extracted_code, self.expected_extracted_code)

    def test_simplify_code(self) -> None:
        """Simplification replaces each definition with a `// Code for:` line."""
        segmenter = GoSegmenter(self.example_code)
        simplified_code = segmenter.simplify_code()
        self.assertEqual(simplified_code, self.expected_simplified_code)
|
@@ -0,0 +1,57 @@
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.java import JavaSegmenter
|
||||
|
||||
|
||||
# Marked as requiring the optional tree_sitter / tree_sitter_languages packages.
@pytest.mark.requires("tree_sitter", "tree_sitter_languages")
class TestJavaSegmenter(unittest.TestCase):
    """Tests for JavaSegmenter: validity, chunk extraction, simplification."""

    def setUp(self) -> None:
        """Build the Java sample and the outputs the segmenter is expected to give."""
        # Indentation inside the sample matters: the extracted chunks are
        # compared verbatim against expected_extracted_code below.
        self.example_code = """class Hello
{
    public static void main(String[] args)
    {
        System.out.println("Hello, world.");
    }
}

interface Human
{
    void breathe();
}

enum Tens
{
    TEN,
    TWENTY
}
"""

        # Each top-level definition is expected to collapse to one comment line
        # carrying the definition's first line.
        self.expected_simplified_code = """// Code for: class Hello

// Code for: interface Human

// Code for: enum Tens"""

        # One entry per top-level definition, in source order.
        self.expected_extracted_code = [
            "class Hello\n{\n    "
            "public static void main(String[] args)\n    {\n        "
            'System.out.println("Hello, world.");\n    }\n}',
            "interface Human\n{\n    void breathe();\n}",
            "enum Tens\n{\n    TEN,\n    TWENTY\n}",
        ]

    def test_is_valid(self) -> None:
        """A simple declaration is accepted; a bare token run is rejected."""
        self.assertTrue(JavaSegmenter("int a;").is_valid())
        self.assertFalse(JavaSegmenter("a b c 1 2 3").is_valid())

    def test_extract_functions_classes(self) -> None:
        """Extraction returns every top-level definition verbatim."""
        segmenter = JavaSegmenter(self.example_code)
        extracted_code = segmenter.extract_functions_classes()
        self.assertEqual(extracted_code, self.expected_extracted_code)

    def test_simplify_code(self) -> None:
        """Simplification replaces each definition with a `// Code for:` line."""
        segmenter = JavaSegmenter(self.example_code)
        simplified_code = segmenter.simplify_code()
        self.assertEqual(simplified_code, self.expected_simplified_code)
|
@@ -0,0 +1,60 @@
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.kotlin import KotlinSegmenter
|
||||
|
||||
|
||||
# Marked as requiring the optional tree_sitter / tree_sitter_languages packages.
@pytest.mark.requires("tree_sitter", "tree_sitter_languages")
class TestKotlinSegmenter(unittest.TestCase):
    """Tests for KotlinSegmenter: validity, chunk extraction, simplification."""

    def setUp(self) -> None:
        """Build the Kotlin sample and the outputs the segmenter should give."""
        # Indentation inside the sample matters: the extracted chunks are
        # compared verbatim against expected_extracted_code below.
        self.example_code = """fun foo(a: Int): Int {
    return a
}

class T {
    var a: Int = 0
    var b: Boolean = false
    var c: String = ""
}

interface S {
    fun bar(): Double
}

enum class P {
    A,
    B,
    C
}
"""

        # Each top-level definition is expected to collapse to one comment line
        # carrying the definition's first line.
        self.expected_simplified_code = """// Code for: fun foo(a: Int): Int {

// Code for: class T {

// Code for: interface S {

// Code for: enum class P {"""

        # One entry per top-level definition, in source order.
        self.expected_extracted_code = [
            "fun foo(a: Int): Int {\n    return a\n}",
            "class T {\n    var a: Int = 0\n    var b: Boolean = false\n    "
            'var c: String = ""\n}',
            "interface S {\n    fun bar(): Double\n}",
            "enum class P {\n    A,\n    B,\n    C\n}",
        ]

    def test_is_valid(self) -> None:
        """A simple declaration is accepted; a bare token run is rejected."""
        self.assertTrue(KotlinSegmenter("val a: Int = 5").is_valid())
        self.assertFalse(KotlinSegmenter("a b c 1 2 3").is_valid())

    def test_extract_functions_classes(self) -> None:
        """Extraction returns every top-level definition verbatim."""
        segmenter = KotlinSegmenter(self.example_code)
        extracted_code = segmenter.extract_functions_classes()
        self.assertEqual(extracted_code, self.expected_extracted_code)

    def test_simplify_code(self) -> None:
        """Simplification replaces each definition with a `// Code for:` line."""
        segmenter = KotlinSegmenter(self.example_code)
        simplified_code = segmenter.simplify_code()
        self.assertEqual(simplified_code, self.expected_simplified_code)
|
@@ -0,0 +1,40 @@
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.lua import LuaSegmenter
|
||||
|
||||
|
||||
# Marked as requiring the optional tree_sitter / tree_sitter_languages packages.
@pytest.mark.requires("tree_sitter", "tree_sitter_languages")
class TestLuaSegmenter(unittest.TestCase):
    """Tests for LuaSegmenter: validity, chunk extraction, simplification."""

    def setUp(self) -> None:
        """Build the Lua sample and the outputs the segmenter is expected to give."""
        # Indentation inside the sample matters: the extracted chunks are
        # compared verbatim against expected_extracted_code below.
        self.example_code = """function F()
    print("Hello")
end

local function G()
    print("Goodbye")
end"""

        # Each function is expected to collapse to one `--` comment line
        # carrying the function's first line.
        self.expected_simplified_code = """-- Code for: function F()

-- Code for: local function G()"""

        # One entry per top-level function, in source order.
        self.expected_extracted_code = [
            'function F()\n    print("Hello")\nend',
            'local function G()\n    print("Goodbye")\nend',
        ]

    def test_is_valid(self) -> None:
        """A simple declaration is accepted; a bare token run is rejected."""
        self.assertTrue(LuaSegmenter("local a").is_valid())
        self.assertFalse(LuaSegmenter("a b c 1 2 3").is_valid())

    def test_extract_functions_classes(self) -> None:
        """Extraction returns every top-level function verbatim."""
        segmenter = LuaSegmenter(self.example_code)
        extracted_code = segmenter.extract_functions_classes()
        self.assertEqual(extracted_code, self.expected_extracted_code)

    def test_simplify_code(self) -> None:
        """Simplification replaces each function with a `-- Code for:` line."""
        segmenter = LuaSegmenter(self.example_code)
        simplified_code = segmenter.simplify_code()
        self.assertEqual(simplified_code, self.expected_simplified_code)
|
@@ -0,0 +1,44 @@
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.perl import PerlSegmenter
|
||||
|
||||
|
||||
# Marked as requiring the optional tree_sitter / tree_sitter_languages packages.
@pytest.mark.requires("tree_sitter", "tree_sitter_languages")
class TestPerlSegmenter(unittest.TestCase):
    """Tests for PerlSegmenter: validity, chunk extraction, simplification."""

    def setUp(self) -> None:
        """Build the Perl sample and the outputs the segmenter is expected to give."""
        # Indentation inside the sample matters: the extracted chunks are
        # compared verbatim against expected_extracted_code below.
        self.example_code = """sub Hello {
    print "Hello, World!";
}

sub new {
    my $class = shift;
    my $self = {};
    bless $self, $class;
    return $self;
}"""

        # Each subroutine is expected to collapse to one `#` comment line
        # carrying the subroutine's first line.
        self.expected_simplified_code = """# Code for: sub Hello {

# Code for: sub new {"""

        # One entry per subroutine, in source order.
        self.expected_extracted_code = [
            'sub Hello {\n    print "Hello, World!";\n}',
            "sub new {\n    my $class = shift;\n    my $self = {};\n    "
            "bless $self, $class;\n    return $self;\n}",
        ]

    def test_is_valid(self) -> None:
        """A simple assignment is accepted; a bare token run is rejected."""
        self.assertTrue(PerlSegmenter("$age = 25;").is_valid())
        self.assertFalse(PerlSegmenter("a b c 1 2 3").is_valid())

    def test_extract_functions_classes(self) -> None:
        """Extraction returns every subroutine verbatim."""
        segmenter = PerlSegmenter(self.example_code)
        extracted_code = segmenter.extract_functions_classes()
        self.assertEqual(extracted_code, self.expected_extracted_code)

    def test_simplify_code(self) -> None:
        """Simplification replaces each subroutine with a `# Code for:` line."""
        segmenter = PerlSegmenter(self.example_code)
        simplified_code = segmenter.simplify_code()
        self.assertEqual(simplified_code, self.expected_simplified_code)
|
@@ -0,0 +1,51 @@
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.ruby import RubySegmenter
|
||||
|
||||
|
||||
# Marked as requiring the optional tree_sitter / tree_sitter_languages packages.
@pytest.mark.requires("tree_sitter", "tree_sitter_languages")
class TestRubySegmenter(unittest.TestCase):
    """Tests for RubySegmenter: validity, chunk extraction, simplification."""

    def setUp(self) -> None:
        """Build the Ruby sample and the outputs the segmenter is expected to give."""
        # Indentation inside the sample matters: the extracted chunks are
        # compared verbatim against expected_extracted_code below.
        self.example_code = """def foo
    i = 0
end

module M
    def hi
        i = 2
    end
end

class T
    def bar
        j = 1
    end
end"""

        # Each top-level definition is expected to collapse to one `#` comment
        # line carrying the definition's first line.
        self.expected_simplified_code = """# Code for: def foo

# Code for: module M

# Code for: class T"""

        # One entry per top-level definition; nested methods stay inside
        # their enclosing module/class chunk.
        self.expected_extracted_code = [
            "def foo\n    i = 0\nend",
            "module M\n    def hi\n        i = 2\n    end\nend",
            "class T\n    def bar\n        j = 1\n    end\nend",
        ]

    def test_is_valid(self) -> None:
        """A minimal method is accepted; a bare token run is rejected."""
        self.assertTrue(RubySegmenter("def a; end").is_valid())
        self.assertFalse(RubySegmenter("a b c 1 2 3").is_valid())

    def test_extract_functions_classes(self) -> None:
        """Extraction returns every top-level definition verbatim."""
        segmenter = RubySegmenter(self.example_code)
        extracted_code = segmenter.extract_functions_classes()
        self.assertEqual(extracted_code, self.expected_extracted_code)

    def test_simplify_code(self) -> None:
        """Simplification replaces each definition with a `# Code for:` line."""
        segmenter = RubySegmenter(self.example_code)
        simplified_code = segmenter.simplify_code()
        self.assertEqual(simplified_code, self.expected_simplified_code)
|
@@ -0,0 +1,50 @@
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.rust import RustSegmenter
|
||||
|
||||
|
||||
# Marked as requiring the optional tree_sitter / tree_sitter_languages packages.
@pytest.mark.requires("tree_sitter", "tree_sitter_languages")
class TestRustSegmenter(unittest.TestCase):
    """Tests for RustSegmenter: validity, chunk extraction, simplification."""

    def setUp(self) -> None:
        """Build the Rust sample and the outputs the segmenter is expected to give."""
        # Indentation inside the sample matters: the extracted chunks are
        # compared verbatim against expected_extracted_code below.
        self.example_code = """fn foo() -> i32 {
    return 1;
}

struct T {
    a: i32,
    b: bool,
    c: String
}

trait S {
    fn bar() -> Self
}
"""

        # Each top-level item is expected to collapse to one comment line
        # carrying the item's first line.
        self.expected_simplified_code = """// Code for: fn foo() -> i32 {

// Code for: struct T {

// Code for: trait S {"""

        # One entry per top-level item, in source order.
        self.expected_extracted_code = [
            "fn foo() -> i32 {\n    return 1;\n}",
            "struct T {\n    a: i32,\n    b: bool,\n    c: String\n}",
            "trait S {\n    fn bar() -> Self\n}",
        ]

    def test_is_valid(self) -> None:
        """A simple declaration is accepted; a bare token run is rejected."""
        self.assertTrue(RustSegmenter("let a: i32;").is_valid())
        self.assertFalse(RustSegmenter("a b c 1 2 3").is_valid())

    def test_extract_functions_classes(self) -> None:
        """Extraction returns every top-level item verbatim."""
        segmenter = RustSegmenter(self.example_code)
        extracted_code = segmenter.extract_functions_classes()
        self.assertEqual(extracted_code, self.expected_extracted_code)

    def test_simplify_code(self) -> None:
        """Simplification replaces each item with a `// Code for:` line."""
        segmenter = RustSegmenter(self.example_code)
        simplified_code = segmenter.simplify_code()
        self.assertEqual(simplified_code, self.expected_simplified_code)
|
@@ -0,0 +1,56 @@
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.scala import ScalaSegmenter
|
||||
|
||||
|
||||
# Marked as requiring the optional tree_sitter / tree_sitter_languages packages.
@pytest.mark.requires("tree_sitter", "tree_sitter_languages")
class TestScalaSegmenter(unittest.TestCase):
    """Tests for ScalaSegmenter: validity, chunk extraction, simplification."""

    def setUp(self) -> None:
        """Build the Scala sample and the outputs the segmenter should give."""
        # Indentation and the blank line inside `class S()` matter: the
        # extracted chunks are compared verbatim against
        # expected_extracted_code below.
        self.example_code = """def foo() {
    return 1
}

object T {
    def baz() {
        val x = 1
    }
}

class S() {

}

trait T {
    def P(x: Any): Boolean
}"""

        # Each top-level definition is expected to collapse to one comment line
        # carrying the definition's first line.
        self.expected_simplified_code = """// Code for: def foo() {

// Code for: object T {

// Code for: class S() {

// Code for: trait T {"""

        # One entry per top-level definition; the nested `baz` stays inside
        # the `object T` chunk.
        self.expected_extracted_code = [
            "def foo() {\n    return 1\n}",
            "object T {\n    def baz() {\n        val x = 1\n    }\n}",
            "class S() {\n\n}",
            "trait T {\n    def P(x: Any): Boolean\n}",
        ]

    def test_is_valid(self) -> None:
        """Both inputs are expected to be rejected.

        The first is a `val` with no initializer, the second a bare token run.
        """
        self.assertFalse(ScalaSegmenter("val x").is_valid())
        self.assertFalse(ScalaSegmenter("a b c 1 2 3").is_valid())

    def test_extract_functions_classes(self) -> None:
        """Extraction returns every top-level definition verbatim."""
        segmenter = ScalaSegmenter(self.example_code)
        extracted_code = segmenter.extract_functions_classes()
        self.assertEqual(extracted_code, self.expected_extracted_code)

    def test_simplify_code(self) -> None:
        """Simplification replaces each definition with a `// Code for:` line."""
        segmenter = ScalaSegmenter(self.example_code)
        simplified_code = segmenter.simplify_code()
        self.assertEqual(simplified_code, self.expected_simplified_code)
|
@@ -0,0 +1,67 @@
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.typescript import (
|
||||
TypeScriptSegmenter,
|
||||
)
|
||||
|
||||
|
||||
# Marked as requiring the optional tree_sitter / tree_sitter_languages packages.
@pytest.mark.requires("tree_sitter", "tree_sitter_languages")
class TestTypeScriptSegmenter(unittest.TestCase):
    """Tests for TypeScriptSegmenter: validity, extraction, simplification."""

    def setUp(self) -> None:
        """Build the TypeScript sample and the outputs the segmenter should give."""
        # Indentation inside the sample matters: the extracted chunks are
        # compared verbatim against expected_extracted_code below.
        self.example_code = """function foo(): number
{
    return 1;
}

class Autumn
{
    leafCount = 45;
    reduceTemperature(desiredTemperature: number): number {
        return desiredTemperature * 0.6;
    }
}

interface Season
{
    change(): void;
}

enum Colors
{
    Green = 'green',
    Red = 'red',
}
"""

        # Each top-level definition is expected to collapse to one comment line
        # carrying the definition's first line.
        self.expected_simplified_code = """// Code for: function foo(): number

// Code for: class Autumn

// Code for: interface Season

// Code for: enum Colors"""

        # One entry per top-level definition; the method stays inside the
        # `class Autumn` chunk.
        self.expected_extracted_code = [
            "function foo(): number\n{\n    return 1;\n}",
            "class Autumn\n{\n    leafCount = 45;\n    "
            "reduceTemperature(desiredTemperature: number): number {\n        "
            "return desiredTemperature * 0.6;\n    }\n}",
            "interface Season\n{\n    change(): void;\n}",
            "enum Colors\n{\n    Green = 'green',\n    Red = 'red',\n}",
        ]

    def test_is_valid(self) -> None:
        """A simple declaration is accepted; a bare token run is rejected."""
        self.assertTrue(TypeScriptSegmenter("let a;").is_valid())
        self.assertFalse(TypeScriptSegmenter("a b c 1 2 3").is_valid())

    def test_extract_functions_classes(self) -> None:
        """Extraction returns every top-level definition verbatim."""
        segmenter = TypeScriptSegmenter(self.example_code)
        extracted_code = segmenter.extract_functions_classes()
        self.assertEqual(extracted_code, self.expected_extracted_code)

    def test_simplify_code(self) -> None:
        """Simplification replaces each definition with a `// Code for:` line."""
        segmenter = TypeScriptSegmenter(self.example_code)
        simplified_code = segmenter.simplify_code()
        self.assertEqual(simplified_code, self.expected_simplified_code)
|
Reference in New Issue
Block a user