mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-10 23:41:28 +00:00
community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463)
Moved the following modules to new package langchain-community in a backwards compatible fashion: ``` mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories 
community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community ``` Moved the following to core ``` mv langchain/langchain/utils/json_schema.py core/langchain_core/utils mv langchain/langchain/utils/html.py core/langchain_core/utils mv langchain/langchain/utils/strings.py core/langchain_core/utils cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py rm langchain/langchain/utils/env.py ``` See .scripts/community_split/script_integrations.sh for all changes
This commit is contained in:
@@ -0,0 +1,5 @@
|
||||
"""Public entry point for the language-aware blob parsers."""
from langchain_community.document_loaders.parsers.language.language_parser import (
    LanguageParser,
)

__all__ = ["LanguageParser"]
|
@@ -0,0 +1,98 @@
|
||||
import re
|
||||
from typing import Callable, List
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.code_segmenter import (
|
||||
CodeSegmenter,
|
||||
)
|
||||
|
||||
|
||||
class CobolSegmenter(CodeSegmenter):
    """Code segmenter for `COBOL`.

    Splits a COBOL source into its paragraphs/sections (found after the
    PROCEDURE DIVISION header) using line-oriented regex heuristics.
    """

    # A paragraph header: a name at column 0 ending with a period, e.g. "MAIN-PARA."
    PARAGRAPH_PATTERN = re.compile(r"^[A-Z0-9\-]+(\s+.*)?\.$", re.IGNORECASE)
    DIVISION_PATTERN = re.compile(
        r"^\s*(IDENTIFICATION|DATA|PROCEDURE|ENVIRONMENT)\s+DIVISION.*$", re.IGNORECASE
    )
    # FIX: the trailing dot was previously unescaped (`SECTION.$`), so ANY single
    # character after "SECTION" matched (e.g. "FOO SECTIONX"). Escape it so only
    # a literal terminating period marks a section header.
    SECTION_PATTERN = re.compile(r"^\s*[A-Z0-9\-]+\s+SECTION\.$", re.IGNORECASE)

    def __init__(self, code: str):
        super().__init__(code)
        # COBOL structure is line oriented, so all processing is per-line.
        self.source_lines: List[str] = self.code.splitlines()

    def is_valid(self) -> bool:
        # Identify presence of any division to validate COBOL code
        return any(self.DIVISION_PATTERN.match(line) for line in self.source_lines)

    def _extract_code(self, start_idx: int, end_idx: int) -> str:
        """Return the joined source lines in ``[start_idx, end_idx)``."""
        return "\n".join(self.source_lines[start_idx:end_idx]).rstrip("\n")

    def _is_relevant_code(self, line: str) -> bool:
        """Check if a line is part of the procedure division or a relevant section."""
        if "PROCEDURE DIVISION" in line.upper():
            return True
        # Add additional conditions for relevant sections if needed
        return False

    def _process_lines(self, func: Callable) -> List[str]:
        """A generic function to process COBOL lines based on provided func.

        ``func(elements, start_idx, end_idx)`` is invoked once per detected
        paragraph/section span; results accumulate in ``elements``.
        """
        elements: List[str] = []
        start_idx = None
        inside_relevant_section = False

        for i, line in enumerate(self.source_lines):
            if self._is_relevant_code(line):
                inside_relevant_section = True

            if inside_relevant_section and (
                self.PARAGRAPH_PATTERN.match(line.strip().split(" ")[0])
                or self.SECTION_PATTERN.match(line.strip())
            ):
                # A new header closes the previous span, if one was open.
                if start_idx is not None:
                    func(elements, start_idx, i)
                start_idx = i

        # Handle the last element if exists
        if start_idx is not None:
            func(elements, start_idx, len(self.source_lines))

        return elements

    def extract_functions_classes(self) -> List[str]:
        """Return the text of each paragraph/section of the procedure division."""

        def extract_func(elements: List[str], start_idx: int, end_idx: int) -> None:
            elements.append(self._extract_code(start_idx, end_idx))

        return self._process_lines(extract_func)

    def simplify_code(self) -> str:
        """Return the code with bodies replaced by an ``* OMITTED CODE *`` marker."""
        simplified_lines: List[str] = []
        inside_relevant_section = False
        omitted_code_added = (
            False  # To track if "* OMITTED CODE *" has been added after the last header
        )

        for line in self.source_lines:
            # NOTE(review): unlike _is_relevant_code, these substring checks are
            # case-sensitive in the original; lowercase division headers are not
            # recognized here. Preserved as-is — confirm whether intended.
            is_header = (
                "PROCEDURE DIVISION" in line
                or "DATA DIVISION" in line
                or "IDENTIFICATION DIVISION" in line
                or self.PARAGRAPH_PATTERN.match(line.strip().split(" ")[0])
                or self.SECTION_PATTERN.match(line.strip())
            )

            if is_header:
                inside_relevant_section = True
                # Reset the flag since we're entering a new section/division or
                # paragraph
                omitted_code_added = False

            if inside_relevant_section:
                if is_header:
                    # Add header and reset the omitted code added flag
                    simplified_lines.append(line)
                elif not omitted_code_added:
                    # Add omitted code comment only if it hasn't been added directly
                    # after the last header
                    simplified_lines.append("* OMITTED CODE *")
                    omitted_code_added = True

        return "\n".join(simplified_lines)
|
@@ -0,0 +1,20 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List
|
||||
|
||||
|
||||
class CodeSegmenter(ABC):
    """Abstract class for the code segmenter."""

    def __init__(self, code: str):
        # Raw source text that concrete segmenters analyse.
        self.code = code

    def is_valid(self) -> bool:
        """Whether ``self.code`` is parseable; subclasses override with real checks."""
        return True

    @abstractmethod
    def extract_functions_classes(self) -> List[str]:
        """Return the source text of each top-level function/class."""
        raise NotImplementedError()  # pragma: no cover

    @abstractmethod
    def simplify_code(self) -> str:
        """Return the code with definition bodies elided."""
        raise NotImplementedError()  # pragma: no cover
@@ -0,0 +1,69 @@
|
||||
from typing import Any, List
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.code_segmenter import (
|
||||
CodeSegmenter,
|
||||
)
|
||||
|
||||
|
||||
class JavaScriptSegmenter(CodeSegmenter):
    """Code segmenter for JavaScript."""

    def __init__(self, code: str):
        super().__init__(code)
        self.source_lines = self.code.splitlines()

        # Fail fast if the optional dependency is missing.
        try:
            import esprima  # noqa: F401
        except ImportError:
            raise ImportError(
                "Could not import esprima Python package. "
                "Please install it with `pip install esprima`."
            )

    def is_valid(self) -> bool:
        """Return True when esprima can parse the code as a script."""
        import esprima

        try:
            esprima.parseScript(self.code)
        except esprima.Error:
            return False
        return True

    def _extract_code(self, node: Any) -> str:
        """Return the source text covered by *node* (esprima loc is 1-based)."""
        first = node.loc.start.line - 1
        last = node.loc.end.line
        return "\n".join(self.source_lines[first:last])

    def extract_functions_classes(self) -> List[str]:
        """Collect the source of every top-level function/class declaration."""
        import esprima

        tree = esprima.parseScript(self.code, loc=True)
        declaration_types = (
            esprima.nodes.FunctionDeclaration,
            esprima.nodes.ClassDeclaration,
        )
        return [
            self._extract_code(node)
            for node in tree.body
            if isinstance(node, declaration_types)
        ]

    def simplify_code(self) -> str:
        """Replace each top-level declaration body with a ``// Code for:`` stub."""
        import esprima

        tree = esprima.parseScript(self.code, loc=True)
        kept = self.source_lines[:]
        declaration_types = (
            esprima.nodes.FunctionDeclaration,
            esprima.nodes.ClassDeclaration,
        )

        for node in tree.body:
            if not isinstance(node, declaration_types):
                continue
            first = node.loc.start.line - 1
            kept[first] = f"// Code for: {kept[first]}"

            # Blank out the remainder of the declaration body.
            for idx in range(first + 1, node.loc.end.line):
                kept[idx] = None  # type: ignore

        return "\n".join(line for line in kept if line is not None)
|
@@ -0,0 +1,158 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Any, Dict, Iterator, Optional
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain_community.document_loaders.base import BaseBlobParser
|
||||
from langchain_community.document_loaders.blob_loaders import Blob
|
||||
from langchain_community.document_loaders.parsers.language.cobol import CobolSegmenter
|
||||
from langchain_community.document_loaders.parsers.language.javascript import (
|
||||
JavaScriptSegmenter,
|
||||
)
|
||||
from langchain_community.document_loaders.parsers.language.python import PythonSegmenter
|
||||
|
||||
if TYPE_CHECKING:
    # Imported for type annotations only; avoids a hard runtime dependency.
    from langchain.text_splitter import Language

try:
    from langchain.text_splitter import Language

    # File extension -> Language enum, used to infer a parser from the source path.
    LANGUAGE_EXTENSIONS: Dict[str, str] = {
        "py": Language.PYTHON,
        "js": Language.JS,
        "cobol": Language.COBOL,
    }

    # Language enum -> concrete CodeSegmenter implementation.
    LANGUAGE_SEGMENTERS: Dict[str, Any] = {
        Language.PYTHON: PythonSegmenter,
        Language.JS: JavaScriptSegmenter,
        Language.COBOL: CobolSegmenter,
    }
except ImportError:
    # langchain's text splitter is unavailable: no language can be inferred or
    # segmented, and LanguageParser falls back to whole-file documents.
    LANGUAGE_EXTENSIONS = {}
    LANGUAGE_SEGMENTERS = {}
|
||||
|
||||
|
||||
class LanguageParser(BaseBlobParser):
    """Parse using the respective programming language syntax.

    Each top-level function and class in the code is loaded into separate documents.
    Furthermore, an extra document is generated, containing the remaining top-level code
    that excludes the already segmented functions and classes.

    This approach can potentially improve the accuracy of QA models over source code.

    Currently, the supported languages for code parsing are Python and JavaScript.

    The language used for parsing can be configured, along with the minimum number of
    lines required to activate the splitting based on syntax.

    Examples:

        .. code-block:: python

            from langchain.text_splitter import Language
            from langchain_community.document_loaders.generic import GenericLoader
            from langchain_community.document_loaders.parsers import LanguageParser

            loader = GenericLoader.from_filesystem(
                "./code",
                glob="**/*",
                suffixes=[".py", ".js"],
                parser=LanguageParser()
            )
            docs = loader.load()

        Example instantiations to manually select the language:

        .. code-block:: python

            from langchain.text_splitter import Language

            loader = GenericLoader.from_filesystem(
                "./code",
                glob="**/*",
                suffixes=[".py"],
                parser=LanguageParser(language=Language.PYTHON)
            )

        Example instantiations to set number of lines threshold:

        .. code-block:: python

            loader = GenericLoader.from_filesystem(
                "./code",
                glob="**/*",
                suffixes=[".py"],
                parser=LanguageParser(parser_threshold=200)
            )
    """

    def __init__(self, language: Optional[Language] = None, parser_threshold: int = 0):
        """
        Language parser that split code using the respective language syntax.

        Args:
            language: If None (default), it will try to infer language from source.
            parser_threshold: Minimum lines needed to activate parsing (0 by default).
        """
        self.language = language
        self.parser_threshold = parser_threshold

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse *blob* into Documents.

        Yields one document per top-level function/class plus a
        "simplified_code" summary document; falls back to a single
        whole-source document when the language is unknown, the blob is
        below the line threshold, or the code fails to parse.
        """
        code = blob.as_string()

        # Infer the language from the file extension when not set explicitly.
        language = self.language or (
            LANGUAGE_EXTENSIONS.get(blob.source.rsplit(".", 1)[-1])
            if isinstance(blob.source, str)
            else None
        )

        if language is None:
            yield Document(
                page_content=code,
                metadata={
                    "source": blob.source,
                },
            )
            return

        if self.parser_threshold >= len(code.splitlines()):
            # Too small to be worth segmenting: emit as one document.
            yield Document(
                page_content=code,
                metadata={
                    "source": blob.source,
                    "language": language,
                },
            )
            return

        # FIX: previously the segmenter class was stored on `self.Segmenter`,
        # mutating shared parser state on every call (not thread-safe, never
        # read elsewhere); a local binding suffices. Also reuse the already
        # decoded `code` instead of calling blob.as_string() a second time.
        segmenter = LANGUAGE_SEGMENTERS[language](code)
        if not segmenter.is_valid():
            yield Document(
                page_content=code,
                metadata={
                    "source": blob.source,
                },
            )
            return

        for functions_classes in segmenter.extract_functions_classes():
            yield Document(
                page_content=functions_classes,
                metadata={
                    "source": blob.source,
                    "content_type": "functions_classes",
                    "language": language,
                },
            )
        yield Document(
            page_content=segmenter.simplify_code(),
            metadata={
                "source": blob.source,
                "content_type": "simplified_code",
                "language": language,
            },
        )
|
@@ -0,0 +1,51 @@
|
||||
import ast
|
||||
from typing import Any, List
|
||||
|
||||
from langchain_community.document_loaders.parsers.language.code_segmenter import (
|
||||
CodeSegmenter,
|
||||
)
|
||||
|
||||
|
||||
class PythonSegmenter(CodeSegmenter):
    """Code segmenter for `Python`."""

    def __init__(self, code: str):
        super().__init__(code)
        self.source_lines = self.code.splitlines()

    def is_valid(self) -> bool:
        """Return True when the code compiles to an AST."""
        try:
            ast.parse(self.code)
        except SyntaxError:
            return False
        return True

    def _extract_code(self, node: Any) -> str:
        """Return the source text spanned by *node* (lineno is 1-based)."""
        first = node.lineno - 1
        last = node.end_lineno
        return "\n".join(self.source_lines[first:last])

    def extract_functions_classes(self) -> List[str]:
        """Collect the source of every top-level function, coroutine or class."""
        module = ast.parse(self.code)
        return [
            self._extract_code(child)
            for child in ast.iter_child_nodes(module)
            if isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef))
        ]

    def simplify_code(self) -> str:
        """Replace each top-level definition body with a ``# Code for:`` stub."""
        module = ast.parse(self.code)
        kept = self.source_lines[:]

        for child in ast.iter_child_nodes(module):
            if not isinstance(
                child, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)
            ):
                continue
            first = child.lineno - 1
            kept[first] = f"# Code for: {kept[first]}"

            assert isinstance(child.end_lineno, int)
            # Drop the remainder of the definition body.
            for idx in range(first + 1, child.end_lineno):
                kept[idx] = None  # type: ignore

        return "\n".join(line for line in kept if line is not None)
|
Reference in New Issue
Block a user