community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463)

Moved the following modules to new package langchain-community in a backwards compatible fashion:

```
mv langchain/langchain/adapters community/langchain_community
mv langchain/langchain/callbacks community/langchain_community/callbacks
mv langchain/langchain/chat_loaders community/langchain_community
mv langchain/langchain/chat_models community/langchain_community
mv langchain/langchain/document_loaders community/langchain_community
mv langchain/langchain/docstore community/langchain_community
mv langchain/langchain/document_transformers community/langchain_community
mv langchain/langchain/embeddings community/langchain_community
mv langchain/langchain/graphs community/langchain_community
mv langchain/langchain/llms community/langchain_community
mv langchain/langchain/memory/chat_message_histories community/langchain_community
mv langchain/langchain/retrievers community/langchain_community
mv langchain/langchain/storage community/langchain_community
mv langchain/langchain/tools community/langchain_community
mv langchain/langchain/utilities community/langchain_community
mv langchain/langchain/vectorstores community/langchain_community
mv langchain/langchain/agents/agent_toolkits community/langchain_community
mv langchain/langchain/cache.py community/langchain_community
mv langchain/langchain/adapters community/langchain_community
mv langchain/langchain/callbacks community/langchain_community/callbacks
mv langchain/langchain/chat_loaders community/langchain_community
mv langchain/langchain/chat_models community/langchain_community
mv langchain/langchain/document_loaders community/langchain_community
mv langchain/langchain/docstore community/langchain_community
mv langchain/langchain/document_transformers community/langchain_community
mv langchain/langchain/embeddings community/langchain_community
mv langchain/langchain/graphs community/langchain_community
mv langchain/langchain/llms community/langchain_community
mv langchain/langchain/memory/chat_message_histories community/langchain_community
mv langchain/langchain/retrievers community/langchain_community
mv langchain/langchain/storage community/langchain_community
mv langchain/langchain/tools community/langchain_community
mv langchain/langchain/utilities community/langchain_community
mv langchain/langchain/vectorstores community/langchain_community
mv langchain/langchain/agents/agent_toolkits community/langchain_community
mv langchain/langchain/cache.py community/langchain_community
```

Moved the following to core
```
mv langchain/langchain/utils/json_schema.py core/langchain_core/utils
mv langchain/langchain/utils/html.py core/langchain_core/utils
mv langchain/langchain/utils/strings.py core/langchain_core/utils
cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py
rm langchain/langchain/utils/env.py
```

See .scripts/community_split/script_integrations.sh for all changes
This commit is contained in:
Bagatur
2023-12-11 13:53:30 -08:00
committed by GitHub
parent c0f4b95aa9
commit ed58eeb9c5
2446 changed files with 171805 additions and 137118 deletions

View File

@@ -0,0 +1,5 @@
from langchain_community.document_loaders.parsers.language.language_parser import (
LanguageParser,
)
__all__ = ["LanguageParser"]

View File

@@ -0,0 +1,98 @@
import re
from typing import Callable, List
from langchain_community.document_loaders.parsers.language.code_segmenter import (
CodeSegmenter,
)
class CobolSegmenter(CodeSegmenter):
"""Code segmenter for `COBOL`."""
PARAGRAPH_PATTERN = re.compile(r"^[A-Z0-9\-]+(\s+.*)?\.$", re.IGNORECASE)
DIVISION_PATTERN = re.compile(
r"^\s*(IDENTIFICATION|DATA|PROCEDURE|ENVIRONMENT)\s+DIVISION.*$", re.IGNORECASE
)
SECTION_PATTERN = re.compile(r"^\s*[A-Z0-9\-]+\s+SECTION.$", re.IGNORECASE)
def __init__(self, code: str):
super().__init__(code)
self.source_lines: List[str] = self.code.splitlines()
def is_valid(self) -> bool:
# Identify presence of any division to validate COBOL code
return any(self.DIVISION_PATTERN.match(line) for line in self.source_lines)
def _extract_code(self, start_idx: int, end_idx: int) -> str:
return "\n".join(self.source_lines[start_idx:end_idx]).rstrip("\n")
def _is_relevant_code(self, line: str) -> bool:
"""Check if a line is part of the procedure division or a relevant section."""
if "PROCEDURE DIVISION" in line.upper():
return True
# Add additional conditions for relevant sections if needed
return False
def _process_lines(self, func: Callable) -> List[str]:
"""A generic function to process COBOL lines based on provided func."""
elements: List[str] = []
start_idx = None
inside_relevant_section = False
for i, line in enumerate(self.source_lines):
if self._is_relevant_code(line):
inside_relevant_section = True
if inside_relevant_section and (
self.PARAGRAPH_PATTERN.match(line.strip().split(" ")[0])
or self.SECTION_PATTERN.match(line.strip())
):
if start_idx is not None:
func(elements, start_idx, i)
start_idx = i
# Handle the last element if exists
if start_idx is not None:
func(elements, start_idx, len(self.source_lines))
return elements
def extract_functions_classes(self) -> List[str]:
def extract_func(elements: List[str], start_idx: int, end_idx: int) -> None:
elements.append(self._extract_code(start_idx, end_idx))
return self._process_lines(extract_func)
def simplify_code(self) -> str:
simplified_lines: List[str] = []
inside_relevant_section = False
omitted_code_added = (
False # To track if "* OMITTED CODE *" has been added after the last header
)
for line in self.source_lines:
is_header = (
"PROCEDURE DIVISION" in line
or "DATA DIVISION" in line
or "IDENTIFICATION DIVISION" in line
or self.PARAGRAPH_PATTERN.match(line.strip().split(" ")[0])
or self.SECTION_PATTERN.match(line.strip())
)
if is_header:
inside_relevant_section = True
# Reset the flag since we're entering a new section/division or
# paragraph
omitted_code_added = False
if inside_relevant_section:
if is_header:
# Add header and reset the omitted code added flag
simplified_lines.append(line)
elif not omitted_code_added:
# Add omitted code comment only if it hasn't been added directly
# after the last header
simplified_lines.append("* OMITTED CODE *")
omitted_code_added = True
return "\n".join(simplified_lines)

View File

@@ -0,0 +1,20 @@
from abc import ABC, abstractmethod
from typing import List
class CodeSegmenter(ABC):
"""Abstract class for the code segmenter."""
def __init__(self, code: str):
self.code = code
def is_valid(self) -> bool:
return True
@abstractmethod
def simplify_code(self) -> str:
raise NotImplementedError() # pragma: no cover
@abstractmethod
def extract_functions_classes(self) -> List[str]:
raise NotImplementedError() # pragma: no cover

View File

@@ -0,0 +1,69 @@
from typing import Any, List
from langchain_community.document_loaders.parsers.language.code_segmenter import (
CodeSegmenter,
)
class JavaScriptSegmenter(CodeSegmenter):
"""Code segmenter for JavaScript."""
def __init__(self, code: str):
super().__init__(code)
self.source_lines = self.code.splitlines()
try:
import esprima # noqa: F401
except ImportError:
raise ImportError(
"Could not import esprima Python package. "
"Please install it with `pip install esprima`."
)
def is_valid(self) -> bool:
import esprima
try:
esprima.parseScript(self.code)
return True
except esprima.Error:
return False
def _extract_code(self, node: Any) -> str:
start = node.loc.start.line - 1
end = node.loc.end.line
return "\n".join(self.source_lines[start:end])
def extract_functions_classes(self) -> List[str]:
import esprima
tree = esprima.parseScript(self.code, loc=True)
functions_classes = []
for node in tree.body:
if isinstance(
node,
(esprima.nodes.FunctionDeclaration, esprima.nodes.ClassDeclaration),
):
functions_classes.append(self._extract_code(node))
return functions_classes
def simplify_code(self) -> str:
import esprima
tree = esprima.parseScript(self.code, loc=True)
simplified_lines = self.source_lines[:]
for node in tree.body:
if isinstance(
node,
(esprima.nodes.FunctionDeclaration, esprima.nodes.ClassDeclaration),
):
start = node.loc.start.line - 1
simplified_lines[start] = f"// Code for: {simplified_lines[start]}"
for line_num in range(start + 1, node.loc.end.line):
simplified_lines[line_num] = None # type: ignore
return "\n".join(line for line in simplified_lines if line is not None)

View File

@@ -0,0 +1,158 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Any, Dict, Iterator, Optional
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseBlobParser
from langchain_community.document_loaders.blob_loaders import Blob
from langchain_community.document_loaders.parsers.language.cobol import CobolSegmenter
from langchain_community.document_loaders.parsers.language.javascript import (
JavaScriptSegmenter,
)
from langchain_community.document_loaders.parsers.language.python import PythonSegmenter
if TYPE_CHECKING:
from langchain.text_splitter import Language
try:
from langchain.text_splitter import Language
LANGUAGE_EXTENSIONS: Dict[str, str] = {
"py": Language.PYTHON,
"js": Language.JS,
"cobol": Language.COBOL,
}
LANGUAGE_SEGMENTERS: Dict[str, Any] = {
Language.PYTHON: PythonSegmenter,
Language.JS: JavaScriptSegmenter,
Language.COBOL: CobolSegmenter,
}
except ImportError:
LANGUAGE_EXTENSIONS = {}
LANGUAGE_SEGMENTERS = {}
class LanguageParser(BaseBlobParser):
"""Parse using the respective programming language syntax.
Each top-level function and class in the code is loaded into separate documents.
Furthermore, an extra document is generated, containing the remaining top-level code
that excludes the already segmented functions and classes.
This approach can potentially improve the accuracy of QA models over source code.
Currently, the supported languages for code parsing are Python and JavaScript.
The language used for parsing can be configured, along with the minimum number of
lines required to activate the splitting based on syntax.
Examples:
.. code-block:: python
from langchain.text_splitter.Language
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser
loader = GenericLoader.from_filesystem(
"./code",
glob="**/*",
suffixes=[".py", ".js"],
parser=LanguageParser()
)
docs = loader.load()
Example instantiations to manually select the language:
.. code-block:: python
from langchain.text_splitter import Language
loader = GenericLoader.from_filesystem(
"./code",
glob="**/*",
suffixes=[".py"],
parser=LanguageParser(language=Language.PYTHON)
)
Example instantiations to set number of lines threshold:
.. code-block:: python
loader = GenericLoader.from_filesystem(
"./code",
glob="**/*",
suffixes=[".py"],
parser=LanguageParser(parser_threshold=200)
)
"""
def __init__(self, language: Optional[Language] = None, parser_threshold: int = 0):
"""
Language parser that split code using the respective language syntax.
Args:
language: If None (default), it will try to infer language from source.
parser_threshold: Minimum lines needed to activate parsing (0 by default).
"""
self.language = language
self.parser_threshold = parser_threshold
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
code = blob.as_string()
language = self.language or (
LANGUAGE_EXTENSIONS.get(blob.source.rsplit(".", 1)[-1])
if isinstance(blob.source, str)
else None
)
if language is None:
yield Document(
page_content=code,
metadata={
"source": blob.source,
},
)
return
if self.parser_threshold >= len(code.splitlines()):
yield Document(
page_content=code,
metadata={
"source": blob.source,
"language": language,
},
)
return
self.Segmenter = LANGUAGE_SEGMENTERS[language]
segmenter = self.Segmenter(blob.as_string())
if not segmenter.is_valid():
yield Document(
page_content=code,
metadata={
"source": blob.source,
},
)
return
for functions_classes in segmenter.extract_functions_classes():
yield Document(
page_content=functions_classes,
metadata={
"source": blob.source,
"content_type": "functions_classes",
"language": language,
},
)
yield Document(
page_content=segmenter.simplify_code(),
metadata={
"source": blob.source,
"content_type": "simplified_code",
"language": language,
},
)

View File

@@ -0,0 +1,51 @@
import ast
from typing import Any, List
from langchain_community.document_loaders.parsers.language.code_segmenter import (
CodeSegmenter,
)
class PythonSegmenter(CodeSegmenter):
"""Code segmenter for `Python`."""
def __init__(self, code: str):
super().__init__(code)
self.source_lines = self.code.splitlines()
def is_valid(self) -> bool:
try:
ast.parse(self.code)
return True
except SyntaxError:
return False
def _extract_code(self, node: Any) -> str:
start = node.lineno - 1
end = node.end_lineno
return "\n".join(self.source_lines[start:end])
def extract_functions_classes(self) -> List[str]:
tree = ast.parse(self.code)
functions_classes = []
for node in ast.iter_child_nodes(tree):
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
functions_classes.append(self._extract_code(node))
return functions_classes
def simplify_code(self) -> str:
tree = ast.parse(self.code)
simplified_lines = self.source_lines[:]
for node in ast.iter_child_nodes(tree):
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
start = node.lineno - 1
simplified_lines[start] = f"# Code for: {simplified_lines[start]}"
assert isinstance(node.end_lineno, int)
for line_num in range(start + 1, node.end_lineno):
simplified_lines[line_num] = None # type: ignore
return "\n".join(line for line in simplified_lines if line is not None)