text-splitters[minor], langchain[minor], community[patch], templates, docs: langchain-text-splitters 0.0.1 (#18346)

This commit is contained in:
Bagatur
2024-02-29 18:33:21 -08:00
committed by GitHub
parent 7891934173
commit 5efb5c099f
226 changed files with 6626 additions and 1980 deletions

View File

@@ -1,6 +1,6 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Any, Dict, Iterator, Optional
from typing import Any, Dict, Iterator, Literal, Optional
from langchain_core.documents import Document
@@ -26,50 +26,67 @@ from langchain_community.document_loaders.parsers.language.typescript import (
TypeScriptSegmenter,
)
if TYPE_CHECKING:
from langchain.text_splitter import Language
# Lookup table from a file-suffix key (written without a leading dot, e.g.
# "py") to a language identifier string. Every value used here also appears
# as a key of the string-keyed LANGUAGE_SEGMENTERS table in this listing.
# NOTE(review): the code that consults this table is outside this view.
LANGUAGE_EXTENSIONS: Dict[str, str] = {
"py": "python",
"js": "js",
"cobol": "cobol",
"c": "c",
"cpp": "cpp",
"cs": "csharp",
"rb": "ruby",
"scala": "scala",
"rs": "rust",
"go": "go",
"kt": "kotlin",
"lua": "lua",
"pl": "perl",
"ts": "ts",
"java": "java",
}
try:
from langchain.text_splitter import Language
# Lookup table from a language identifier string to the segmenter registered
# for that language (e.g. "ts" -> TypeScriptSegmenter).
# NOTE(review): how a segmenter is instantiated and invoked is not shown in
# this view -- confirm against the parser code before relying on it.
LANGUAGE_SEGMENTERS: Dict[str, Any] = {
"python": PythonSegmenter,
"js": JavaScriptSegmenter,
"cobol": CobolSegmenter,
"c": CSegmenter,
"cpp": CPPSegmenter,
"csharp": CSharpSegmenter,
"ruby": RubySegmenter,
"rust": RustSegmenter,
"scala": ScalaSegmenter,
"go": GoSegmenter,
"kotlin": KotlinSegmenter,
"lua": LuaSegmenter,
"perl": PerlSegmenter,
"ts": TypeScriptSegmenter,
"java": JavaSegmenter,
}
# Variant of the extension table whose values are members of `Language`
# (e.g. Language.PYTHON) rather than plain string literals; the annotation is
# still Dict[str, str].
# NOTE(review): in this listing the table comes after a `try:` that imports
# `Language` from langchain.text_splitter, and a later `except ImportError:`
# rebinds the name to {}. Indentation is not preserved here, so confirm the
# nesting against the real file.
LANGUAGE_EXTENSIONS: Dict[str, str] = {
"py": Language.PYTHON,
"js": Language.JS,
"cobol": Language.COBOL,
"c": Language.C,
"cpp": Language.CPP,
"cs": Language.CSHARP,
"rb": Language.RUBY,
"scala": Language.SCALA,
"rs": Language.RUST,
"go": Language.GO,
"kt": Language.KOTLIN,
"lua": Language.LUA,
"pl": Language.PERL,
"ts": Language.TS,
"java": Language.JAVA,
}
# Variant of the segmenter table keyed by members of `Language`
# (e.g. Language.PYTHON) rather than plain string literals.
# NOTE(review): in this listing the table comes after a `try:` that imports
# `Language` from langchain.text_splitter, and a later `except ImportError:`
# rebinds the name to {}. Indentation is not preserved here, so confirm the
# nesting against the real file.
LANGUAGE_SEGMENTERS: Dict[str, Any] = {
Language.PYTHON: PythonSegmenter,
Language.JS: JavaScriptSegmenter,
Language.COBOL: CobolSegmenter,
Language.C: CSegmenter,
Language.CPP: CPPSegmenter,
Language.CSHARP: CSharpSegmenter,
Language.RUBY: RubySegmenter,
Language.RUST: RustSegmenter,
Language.SCALA: ScalaSegmenter,
Language.GO: GoSegmenter,
Language.KOTLIN: KotlinSegmenter,
Language.LUA: LuaSegmenter,
Language.PERL: PerlSegmenter,
Language.TS: TypeScriptSegmenter,
Language.JAVA: JavaSegmenter,
}
except ImportError:
LANGUAGE_EXTENSIONS = {}
LANGUAGE_SEGMENTERS = {}
# Type alias enumerating language identifier strings as a typing.Literal.
# It is a superset of the string keys of LANGUAGE_SEGMENTERS in this listing:
# "php", "proto", "rst", "swift", "markdown", "latex", "html" and "sol" are
# listed here but have no segmenter entry visible.
# NOTE(review): whether those extra identifiers are rejected at runtime is
# decided by code outside this view.
Language = Literal[
"cpp",
"go",
"java",
"kotlin",
"js",
"ts",
"php",
"proto",
"python",
"rst",
"ruby",
"rust",
"scala",
"swift",
"markdown",
"latex",
"html",
"sol",
"csharp",
"cobol",
"c",
"lua",
"perl",
]
class LanguageParser(BaseBlobParser):
@@ -83,21 +100,21 @@ class LanguageParser(BaseBlobParser):
The supported languages for code parsing are:
- C (*)
- C++ (*)
- C# (*)
- COBOL
- Go (*)
- Java (*)
- JavaScript (requires package `esprima`)
- Kotlin (*)
- Lua (*)
- Perl (*)
- Python
- Ruby (*)
- Rust (*)
- Scala (*)
- TypeScript (*)
- C: "c" (*)
- C++: "cpp" (*)
- C#: "csharp" (*)
- COBOL: "cobol"
- Go: "go" (*)
- Java: "java" (*)
- JavaScript: "js" (requires package `esprima`)
- Kotlin: "kotlin" (*)
- Lua: "lua" (*)
- Perl: "perl" (*)
- Python: "python"
- Ruby: "ruby" (*)
- Rust: "rust" (*)
- Scala: "scala" (*)
- TypeScript: "ts" (*)
Items marked with (*) require the packages `tree_sitter` and
`tree_sitter_languages`. It is straightforward to add support for additional
@@ -113,7 +130,6 @@ class LanguageParser(BaseBlobParser):
.. code-block:: python
from langchain.text_splitter.Language
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser
@@ -129,13 +145,12 @@ class LanguageParser(BaseBlobParser):
.. code-block:: python
from langchain.text_splitter import Language
loader = GenericLoader.from_filesystem(
"./code",
glob="**/*",
suffixes=[".py"],
parser=LanguageParser(language=Language.PYTHON)
parser=LanguageParser(language="python")
)
Example instantiations to set number of lines threshold: