community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463)

Moved the following modules to the new package langchain-community in a backwards-compatible fashion:

```
mv langchain/langchain/adapters community/langchain_community
mv langchain/langchain/callbacks community/langchain_community/callbacks
mv langchain/langchain/chat_loaders community/langchain_community
mv langchain/langchain/chat_models community/langchain_community
mv langchain/langchain/document_loaders community/langchain_community
mv langchain/langchain/docstore community/langchain_community
mv langchain/langchain/document_transformers community/langchain_community
mv langchain/langchain/embeddings community/langchain_community
mv langchain/langchain/graphs community/langchain_community
mv langchain/langchain/llms community/langchain_community
mv langchain/langchain/memory/chat_message_histories community/langchain_community
mv langchain/langchain/retrievers community/langchain_community
mv langchain/langchain/storage community/langchain_community
mv langchain/langchain/tools community/langchain_community
mv langchain/langchain/utilities community/langchain_community
mv langchain/langchain/vectorstores community/langchain_community
mv langchain/langchain/agents/agent_toolkits community/langchain_community
mv langchain/langchain/cache.py community/langchain_community
mv langchain/langchain/adapters community/langchain_community
mv langchain/langchain/callbacks community/langchain_community/callbacks
mv langchain/langchain/chat_loaders community/langchain_community
mv langchain/langchain/chat_models community/langchain_community
mv langchain/langchain/document_loaders community/langchain_community
mv langchain/langchain/docstore community/langchain_community
mv langchain/langchain/document_transformers community/langchain_community
mv langchain/langchain/embeddings community/langchain_community
mv langchain/langchain/graphs community/langchain_community
mv langchain/langchain/llms community/langchain_community
mv langchain/langchain/memory/chat_message_histories community/langchain_community
mv langchain/langchain/retrievers community/langchain_community
mv langchain/langchain/storage community/langchain_community
mv langchain/langchain/tools community/langchain_community
mv langchain/langchain/utilities community/langchain_community
mv langchain/langchain/vectorstores community/langchain_community
mv langchain/langchain/agents/agent_toolkits community/langchain_community
mv langchain/langchain/cache.py community/langchain_community
```

Moved the following to core:

```
mv langchain/langchain/utils/json_schema.py core/langchain_core/utils
mv langchain/langchain/utils/html.py core/langchain_core/utils
mv langchain/langchain/utils/strings.py core/langchain_core/utils
cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py
rm langchain/langchain/utils/env.py
```

See .scripts/community_split/script_integrations.sh for all changes.
2025-09-10 23:41:28 +00:00 · 2023-12-11 13:53:30 -08:00
parent c0f4b95aa9
commit ed58eeb9c5
2446 changed files with 171805 additions and 137118 deletions
--- a/libs/community/langchain_community/document_loaders/parsers/language/__init__.py
+++ b/libs/community/langchain_community/document_loaders/parsers/language/__init__.py
@@ -0,0 +1,5 @@
+from langchain_community.document_loaders.parsers.language.language_parser import (
+    LanguageParser,
+)
+
+# ``LanguageParser`` is the only name this package re-exports.
+__all__ = ["LanguageParser"]
--- a/libs/community/langchain_community/document_loaders/parsers/language/cobol.py
+++ b/libs/community/langchain_community/document_loaders/parsers/language/cobol.py
@@ -0,0 +1,98 @@
+import re
+from typing import Callable, List
+
+from langchain_community.document_loaders.parsers.language.code_segmenter import (
+    CodeSegmenter,
+)
+
+
+class CobolSegmenter(CodeSegmenter):
+    """Code segmenter for `COBOL`.
+
+    Splits the part of a COBOL program that starts at ``PROCEDURE DIVISION``
+    into one chunk per paragraph or section, and can render an outline of the
+    program in which everything but the headers is replaced by a placeholder.
+    All header detection is case-insensitive.
+    """
+
+    # Paragraph header token such as ``MAIN-PARA.``.
+    PARAGRAPH_PATTERN = re.compile(r"^[A-Z0-9\-]+(\s+.*)?\.$", re.IGNORECASE)
+    # One of the four division headers, e.g. ``PROCEDURE DIVISION.``.
+    DIVISION_PATTERN = re.compile(
+        r"^\s*(IDENTIFICATION|DATA|PROCEDURE|ENVIRONMENT)\s+DIVISION.*$", re.IGNORECASE
+    )
+    # Section header such as ``MAIN SECTION.``. The period is escaped so that only
+    # a literal ``.`` ends the header (``FOO SECTIONX`` must not match).
+    SECTION_PATTERN = re.compile(r"^\s*[A-Z0-9\-]+\s+SECTION\.$", re.IGNORECASE)
+
+    def __init__(self, code: str):
+        """Store ``code`` and pre-split it into ``self.source_lines``."""
+        super().__init__(code)
+        self.source_lines: List[str] = self.code.splitlines()
+
+    def is_valid(self) -> bool:
+        """Return True if at least one line is a COBOL division header."""
+        return any(self.DIVISION_PATTERN.match(line) for line in self.source_lines)
+
+    def _extract_code(self, start_idx: int, end_idx: int) -> str:
+        """Return source lines ``[start_idx, end_idx)`` joined by newlines."""
+        return "\n".join(self.source_lines[start_idx:end_idx]).rstrip("\n")
+
+    def _is_relevant_code(self, line: str) -> bool:
+        """Check if a line is part of the procedure division or a relevant section."""
+        if "PROCEDURE DIVISION" in line.upper():
+            return True
+        # Add additional conditions for relevant sections if needed
+        return False
+
+    def _is_paragraph_or_section(self, line: str) -> bool:
+        """Return True if ``line`` opens a paragraph or a section.
+
+        A paragraph is recognised from the first space-separated token of the
+        stripped line; a section from the whole stripped line.
+        """
+        stripped = line.strip()
+        return bool(
+            self.PARAGRAPH_PATTERN.match(stripped.split(" ")[0])
+            or self.SECTION_PATTERN.match(stripped)
+        )
+
+    def _process_lines(self, func: Callable) -> List[str]:
+        """Walk the source and call ``func`` once per paragraph/section span.
+
+        Args:
+            func: Callable invoked as ``func(elements, start_idx, end_idx)`` for
+                every span found at or after the ``PROCEDURE DIVISION`` line; it
+                is expected to append its result to ``elements``.
+
+        Returns:
+            The ``elements`` list populated by ``func``.
+        """
+        elements: List[str] = []
+        start_idx = None
+        inside_relevant_section = False
+
+        for i, line in enumerate(self.source_lines):
+            if self._is_relevant_code(line):
+                inside_relevant_section = True
+
+            if inside_relevant_section and self._is_paragraph_or_section(line):
+                # A new header closes the span opened by the previous one.
+                if start_idx is not None:
+                    func(elements, start_idx, i)
+                start_idx = i
+
+        # Handle the last element if exists
+        if start_idx is not None:
+            func(elements, start_idx, len(self.source_lines))
+
+        return elements
+
+    def extract_functions_classes(self) -> List[str]:
+        """Return the source text of each paragraph/section, in file order."""
+
+        def extract_func(elements: List[str], start_idx: int, end_idx: int) -> None:
+            elements.append(self._extract_code(start_idx, end_idx))
+
+        return self._process_lines(extract_func)
+
+    def simplify_code(self) -> str:
+        """Return an outline that keeps header lines and elides everything else.
+
+        Header lines (division, section and paragraph headers) are kept
+        verbatim. Each run of non-header lines following a header collapses to
+        a single ``* OMITTED CODE *`` line; lines before the first header are
+        dropped.
+        """
+        simplified_lines: List[str] = []
+        inside_relevant_section = False
+        # Tracks whether "* OMITTED CODE *" was already added after the last header
+        omitted_code_added = False
+
+        for line in self.source_lines:
+            # Compare against the upper-cased line so lowercase sources, which
+            # ``is_valid`` accepts, get their division headers recognised too.
+            upper_line = line.upper()
+            is_header = (
+                "PROCEDURE DIVISION" in upper_line
+                or "DATA DIVISION" in upper_line
+                or "IDENTIFICATION DIVISION" in upper_line
+                or self._is_paragraph_or_section(line)
+            )
+
+            if is_header:
+                inside_relevant_section = True
+                # Entering a new division/section/paragraph: allow a fresh
+                # placeholder for the body that follows.
+                omitted_code_added = False
+                simplified_lines.append(line)
+            elif inside_relevant_section and not omitted_code_added:
+                simplified_lines.append("* OMITTED CODE *")
+                omitted_code_added = True
+
+        return "\n".join(simplified_lines)
--- a/libs/community/langchain_community/document_loaders/parsers/language/code_segmenter.py
+++ b/libs/community/langchain_community/document_loaders/parsers/language/code_segmenter.py
@@ -0,0 +1,20 @@
+from abc import ABC, abstractmethod
+from typing import List
+
+
+class CodeSegmenter(ABC):
+    """Abstract base class for language-specific code segmenters.
+
+    A segmenter is built from the raw source text; concrete subclasses must
+    provide ``simplify_code`` and ``extract_functions_classes``.
+    """
+
+    def __init__(self, code: str):
+        """Keep the source text to work on as ``self.code``."""
+        self.code = code
+
+    @abstractmethod
+    def extract_functions_classes(self) -> List[str]:
+        """Return the extracted code segments as a list of strings."""
+        raise NotImplementedError  # pragma: no cover
+
+    @abstractmethod
+    def simplify_code(self) -> str:
+        """Return the simplified form of the code as a single string."""
+        raise NotImplementedError  # pragma: no cover
+
+    def is_valid(self) -> bool:
+        """Report whether the code can be segmented; the base class always says yes."""
+        return True
--- a/libs/community/langchain_community/document_loaders/parsers/language/javascript.py
+++ b/libs/community/langchain_community/document_loaders/parsers/language/javascript.py
@@ -0,0 +1,69 @@
+from typing import Any, List
+
+from langchain_community.document_loaders.parsers.language.code_segmenter import (
+    CodeSegmenter,
+)
+
+
+class JavaScriptSegmenter(CodeSegmenter):
+    """Code segmenter for JavaScript.
+
+    Uses the ``esprima`` package to parse the code as a script and works on
+    top-level function and class declarations only.
+    """
+
+    def __init__(self, code: str):
+        """Store ``code`` and pre-split it into ``self.source_lines``.
+
+        Raises:
+            ImportError: If the ``esprima`` package is not installed.
+        """
+        super().__init__(code)
+        self.source_lines = self.code.splitlines()
+
+        try:
+            import esprima  # noqa: F401
+        except ImportError as e:
+            # Chain the original error so the real import failure stays visible.
+            raise ImportError(
+                "Could not import esprima Python package. "
+                "Please install it with `pip install esprima`."
+            ) from e
+
+    def is_valid(self) -> bool:
+        """Return True if ``esprima`` can parse the code as a script."""
+        import esprima
+
+        try:
+            esprima.parseScript(self.code)
+            return True
+        except esprima.Error:
+            return False
+
+    def _extract_code(self, node: Any) -> str:
+        """Return the source lines spanned by ``node`` (whole lines, inclusive)."""
+        start = node.loc.start.line - 1
+        end = node.loc.end.line
+        return "\n".join(self.source_lines[start:end])
+
+    def extract_functions_classes(self) -> List[str]:
+        """Return the source of every top-level function and class declaration."""
+        import esprima
+
+        tree = esprima.parseScript(self.code, loc=True)
+        functions_classes = []
+
+        for node in tree.body:
+            if isinstance(
+                node,
+                (esprima.nodes.FunctionDeclaration, esprima.nodes.ClassDeclaration),
+            ):
+                functions_classes.append(self._extract_code(node))
+
+        return functions_classes
+
+    def simplify_code(self) -> str:
+        """Return the code with each top-level declaration collapsed to one line.
+
+        The first line of every top-level function or class declaration is
+        replaced by ``// Code for: <that line>`` and the remaining lines of the
+        declaration are removed; all other lines are kept unchanged.
+        """
+        import esprima
+
+        tree = esprima.parseScript(self.code, loc=True)
+        simplified_lines = self.source_lines[:]
+
+        for node in tree.body:
+            if isinstance(
+                node,
+                (esprima.nodes.FunctionDeclaration, esprima.nodes.ClassDeclaration),
+            ):
+                start = node.loc.start.line - 1
+                # Read the header text from the pristine source: when this
+                # declaration starts on the line where the previous one ended,
+                # that slot in ``simplified_lines`` has already been blanked.
+                simplified_lines[start] = f"// Code for: {self.source_lines[start]}"
+
+                for line_num in range(start + 1, node.loc.end.line):
+                    simplified_lines[line_num] = None  # type: ignore
+
+        return "\n".join(line for line in simplified_lines if line is not None)
--- a/libs/community/langchain_community/document_loaders/parsers/language/language_parser.py
+++ b/libs/community/langchain_community/document_loaders/parsers/language/language_parser.py
@@ -0,0 +1,158 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Dict, Iterator, Optional
+
+from langchain_core.documents import Document
+
+from langchain_community.document_loaders.base import BaseBlobParser
+from langchain_community.document_loaders.blob_loaders import Blob
+from langchain_community.document_loaders.parsers.language.cobol import CobolSegmenter
+from langchain_community.document_loaders.parsers.language.javascript import (
+    JavaScriptSegmenter,
+)
+from langchain_community.document_loaders.parsers.language.python import PythonSegmenter
+
+if TYPE_CHECKING:
+    from langchain.text_splitter import Language
+
+try:
+    from langchain.text_splitter import Language
+except ImportError:
+    # ``langchain`` is unavailable: leave both lookup tables empty.
+    LANGUAGE_EXTENSIONS: Dict[str, str] = {}
+    LANGUAGE_SEGMENTERS: Dict[str, Any] = {}
+else:
+    # File extension (text after the last ".") -> language.
+    LANGUAGE_EXTENSIONS = {
+        "py": Language.PYTHON,
+        "js": Language.JS,
+        "cobol": Language.COBOL,
+    }
+
+    # Language -> segmenter class used to split code of that language.
+    LANGUAGE_SEGMENTERS = {
+        Language.PYTHON: PythonSegmenter,
+        Language.JS: JavaScriptSegmenter,
+        Language.COBOL: CobolSegmenter,
+    }
+
+
+class LanguageParser(BaseBlobParser):
+    """Parse using the respective programming language syntax.
+
+    Each top-level function and class in the code is loaded into separate documents.
+    Furthermore, an extra document is generated, containing the remaining top-level code
+    that excludes the already segmented functions and classes.
+
+    This approach can potentially improve the accuracy of QA models over source code.
+
+    Currently, the supported languages for code parsing are Python, JavaScript and
+    COBOL (for COBOL the segments are paragraphs and sections).
+
+    The language used for parsing can be configured, along with the minimum number of
+    lines required to activate the splitting based on syntax.
+
+    Examples:
+
+        .. code-block:: python
+
+            from langchain.text_splitter import Language
+            from langchain_community.document_loaders.generic import GenericLoader
+            from langchain_community.document_loaders.parsers import LanguageParser
+
+            loader = GenericLoader.from_filesystem(
+                "./code",
+                glob="**/*",
+                suffixes=[".py", ".js"],
+                parser=LanguageParser()
+            )
+            docs = loader.load()
+
+        Example instantiations to manually select the language:
+
+        .. code-block:: python
+
+            from langchain.text_splitter import Language
+
+            loader = GenericLoader.from_filesystem(
+                "./code",
+                glob="**/*",
+                suffixes=[".py"],
+                parser=LanguageParser(language=Language.PYTHON)
+            )
+
+        Example instantiations to set number of lines threshold:
+
+        .. code-block:: python
+
+            loader = GenericLoader.from_filesystem(
+                "./code",
+                glob="**/*",
+                suffixes=[".py"],
+                parser=LanguageParser(parser_threshold=200)
+            )
+    """
+
+    def __init__(self, language: Optional[Language] = None, parser_threshold: int = 0):
+        """
+        Language parser that splits code using the respective language syntax.
+
+        Args:
+            language: If None (default), it will try to infer language from source.
+            parser_threshold: Minimum lines needed to activate parsing (0 by default).
+        """
+        self.language = language
+        self.parser_threshold = parser_threshold
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Lazily split ``blob`` into documents.
+
+        The whole code is yielded as a single document when no language can be
+        determined, when the code has no more lines than ``parser_threshold``,
+        or when the segmenter reports the code as invalid. Otherwise one
+        document per extracted segment is yielded (``content_type``
+        ``"functions_classes"``), followed by one document with the simplified
+        code (``content_type`` ``"simplified_code"``).
+
+        Args:
+            blob: The blob holding the source code.
+
+        Yields:
+            Documents whose metadata always carries ``source``.
+
+        Raises:
+            KeyError: If the language has no entry in ``LANGUAGE_SEGMENTERS``.
+        """
+        code = blob.as_string()
+
+        # An explicitly configured language wins; otherwise use the text after
+        # the last "." of a string source as the extension to look up.
+        language = self.language or (
+            LANGUAGE_EXTENSIONS.get(blob.source.rsplit(".", 1)[-1])
+            if isinstance(blob.source, str)
+            else None
+        )
+
+        if language is None:
+            yield Document(
+                page_content=code,
+                metadata={
+                    "source": blob.source,
+                },
+            )
+            return
+
+        if self.parser_threshold >= len(code.splitlines()):
+            yield Document(
+                page_content=code,
+                metadata={
+                    "source": blob.source,
+                    "language": language,
+                },
+            )
+            return
+
+        # Still stored on the instance for backward compatibility.
+        self.Segmenter = LANGUAGE_SEGMENTERS[language]
+        # Reuse the text read above instead of reading the blob a second time.
+        segmenter = self.Segmenter(code)
+        if not segmenter.is_valid():
+            yield Document(
+                page_content=code,
+                metadata={
+                    "source": blob.source,
+                },
+            )
+            return
+
+        for functions_classes in segmenter.extract_functions_classes():
+            yield Document(
+                page_content=functions_classes,
+                metadata={
+                    "source": blob.source,
+                    "content_type": "functions_classes",
+                    "language": language,
+                },
+            )
+        yield Document(
+            page_content=segmenter.simplify_code(),
+            metadata={
+                "source": blob.source,
+                "content_type": "simplified_code",
+                "language": language,
+            },
+        )
--- a/libs/community/langchain_community/document_loaders/parsers/language/python.py
+++ b/libs/community/langchain_community/document_loaders/parsers/language/python.py
@@ -0,0 +1,51 @@
+import ast
+from typing import Any, List
+
+from langchain_community.document_loaders.parsers.language.code_segmenter import (
+    CodeSegmenter,
+)
+
+
+class PythonSegmenter(CodeSegmenter):
+    """Code segmenter for `Python`.
+
+    Works on top-level functions (sync and async) and classes found with the
+    standard-library ``ast`` module. Decorators are treated as part of the
+    definition they decorate.
+    """
+
+    def __init__(self, code: str):
+        """Store ``code`` and pre-split it into ``self.source_lines``."""
+        super().__init__(code)
+        self.source_lines = self.code.splitlines()
+
+    def is_valid(self) -> bool:
+        """Return True if the code parses as Python."""
+        try:
+            ast.parse(self.code)
+            return True
+        except SyntaxError:
+            return False
+
+    @staticmethod
+    def _first_line_index(node: Any) -> int:
+        """Return the 0-based index of the first source line belonging to ``node``.
+
+        On Python 3.8+ ``node.lineno`` is the ``def``/``class`` line, so the
+        decorator lines above it have to be taken into account explicitly.
+        """
+        decorators = getattr(node, "decorator_list", [])
+        return min([node.lineno, *(d.lineno for d in decorators)]) - 1
+
+    def _extract_code(self, node: Any) -> str:
+        """Return the source lines of ``node``, including its decorators."""
+        start = self._first_line_index(node)
+        end = node.end_lineno
+        return "\n".join(self.source_lines[start:end])
+
+    def extract_functions_classes(self) -> List[str]:
+        """Return the source of every top-level function and class."""
+        tree = ast.parse(self.code)
+        functions_classes = []
+
+        for node in ast.iter_child_nodes(tree):
+            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
+                functions_classes.append(self._extract_code(node))
+
+        return functions_classes
+
+    def simplify_code(self) -> str:
+        """Return the code with each top-level definition collapsed to one line.
+
+        The ``def``/``class`` line of every top-level definition is replaced by
+        ``# Code for: <that line>``; its decorators and body are removed. All
+        other lines are kept unchanged.
+        """
+        tree = ast.parse(self.code)
+        simplified_lines = self.source_lines[:]
+
+        for node in ast.iter_child_nodes(tree):
+            if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
+                header = node.lineno - 1
+                simplified_lines[header] = f"# Code for: {self.source_lines[header]}"
+
+                assert isinstance(node.end_lineno, int)
+                # Drop the decorators above the header and the body below it,
+                # so no orphaned ``@decorator`` lines remain in the output.
+                for line_num in range(self._first_line_index(node), node.end_lineno):
+                    if line_num != header:
+                        simplified_lines[line_num] = None  # type: ignore
+
+        return "\n".join(line for line in simplified_lines if line is not None)