community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463)

Moved the following modules to new package langchain-community in a backwards compatible fashion:

```
mv langchain/langchain/adapters community/langchain_community
mv langchain/langchain/callbacks community/langchain_community/callbacks
mv langchain/langchain/chat_loaders community/langchain_community
mv langchain/langchain/chat_models community/langchain_community
mv langchain/langchain/document_loaders community/langchain_community
mv langchain/langchain/docstore community/langchain_community
mv langchain/langchain/document_transformers community/langchain_community
mv langchain/langchain/embeddings community/langchain_community
mv langchain/langchain/graphs community/langchain_community
mv langchain/langchain/llms community/langchain_community
mv langchain/langchain/memory/chat_message_histories community/langchain_community
mv langchain/langchain/retrievers community/langchain_community
mv langchain/langchain/storage community/langchain_community
mv langchain/langchain/tools community/langchain_community
mv langchain/langchain/utilities community/langchain_community
mv langchain/langchain/vectorstores community/langchain_community
mv langchain/langchain/agents/agent_toolkits community/langchain_community
mv langchain/langchain/cache.py community/langchain_community
mv langchain/langchain/adapters community/langchain_community
mv langchain/langchain/callbacks community/langchain_community/callbacks
mv langchain/langchain/chat_loaders community/langchain_community
mv langchain/langchain/chat_models community/langchain_community
mv langchain/langchain/document_loaders community/langchain_community
mv langchain/langchain/docstore community/langchain_community
mv langchain/langchain/document_transformers community/langchain_community
mv langchain/langchain/embeddings community/langchain_community
mv langchain/langchain/graphs community/langchain_community
mv langchain/langchain/llms community/langchain_community
mv langchain/langchain/memory/chat_message_histories community/langchain_community
mv langchain/langchain/retrievers community/langchain_community
mv langchain/langchain/storage community/langchain_community
mv langchain/langchain/tools community/langchain_community
mv langchain/langchain/utilities community/langchain_community
mv langchain/langchain/vectorstores community/langchain_community
mv langchain/langchain/agents/agent_toolkits community/langchain_community
mv langchain/langchain/cache.py community/langchain_community
```

Moved the following to core:

```
mv langchain/langchain/utils/json_schema.py core/langchain_core/utils
mv langchain/langchain/utils/html.py core/langchain_core/utils
mv langchain/langchain/utils/strings.py core/langchain_core/utils
cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py
rm langchain/langchain/utils/env.py
```

See .scripts/community_split/script_integrations.sh for all changes.
2025-09-08 14:31:55 +00:00 · 2023-12-11 13:53:30 -08:00
parent c0f4b95aa9
commit ed58eeb9c5
2446 changed files with 171805 additions and 137118 deletions
--- a/libs/community/langchain_community/document_loaders/bibtex.py
+++ b/libs/community/langchain_community/document_loaders/bibtex.py
@@ -0,0 +1,111 @@
+import logging
+import re
+from pathlib import Path
+from typing import Any, Iterator, List, Mapping, Optional
+
+from langchain_core.documents import Document
+
+from langchain_community.document_loaders.base import BaseLoader
+from langchain_community.utilities.bibtex import BibtexparserWrapper
+
+logger = logging.getLogger(__name__)
+
+
+class BibtexLoader(BaseLoader):
+    """Load a `bibtex` file.
+
+    Each document represents one entry from the bibtex file.
+
+    If a PDF file is present in the `file` bibtex field, the original PDF
+    is loaded into the document text. If no such file entry is present,
+    the `abstract` field is used instead.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        *,
+        parser: Optional[BibtexparserWrapper] = None,
+        max_docs: Optional[int] = None,
+        max_content_chars: Optional[int] = 4_000,
+        load_extra_metadata: bool = False,
+        file_pattern: str = r"[^:]+\.pdf",
+    ):
+        """Initialize the BibtexLoader.
+
+        Args:
+            file_path: Path to the bibtex file.
+            parser: The parser to use. If None, a default parser is used.
+            max_docs: Max number of associated documents to load. Use -1 means
+                           no limit.
+            max_content_chars: Maximum number of characters to load from the PDF.
+            load_extra_metadata: Whether to load extra metadata from the PDF.
+            file_pattern: Regex pattern to match the file name in the bibtex.
+        """
+        self.file_path = file_path
+        self.parser = parser or BibtexparserWrapper()
+        self.max_docs = max_docs
+        self.max_content_chars = max_content_chars
+        self.load_extra_metadata = load_extra_metadata
+        self.file_regex = re.compile(file_pattern)
+
+    def _load_entry(self, entry: Mapping[str, Any]) -> Optional[Document]:
+        import fitz
+
+        parent_dir = Path(self.file_path).parent
+        # regex is useful for Zotero flavor bibtex files
+        file_names = self.file_regex.findall(entry.get("file", ""))
+        if not file_names:
+            return None
+        texts: List[str] = []
+        for file_name in file_names:
+            try:
+                with fitz.open(parent_dir / file_name) as f:
+                    texts.extend(page.get_text() for page in f)
+            except FileNotFoundError as e:
+                logger.debug(e)
+        content = "\n".join(texts) or entry.get("abstract", "")
+        if self.max_content_chars:
+            content = content[: self.max_content_chars]
+        metadata = self.parser.get_metadata(entry, load_extra=self.load_extra_metadata)
+        return Document(
+            page_content=content,
+            metadata=metadata,
+        )
+
+    def lazy_load(self) -> Iterator[Document]:
+        """Load bibtex file using bibtexparser and get the article texts plus the
+        article metadata.
+        See https://bibtexparser.readthedocs.io/en/master/
+
+        Returns:
+            a list of documents with the document.page_content in text format
+        """
+        try:
+            import fitz  # noqa: F401
+        except ImportError:
+            raise ImportError(
+                "PyMuPDF package not found, please install it with "
+                "`pip install pymupdf`"
+            )
+
+        entries = self.parser.load_bibtex_entries(self.file_path)
+        if self.max_docs:
+            entries = entries[: self.max_docs]
+        for entry in entries:
+            doc = self._load_entry(entry)
+            if doc:
+                yield doc
+
+    def load(self) -> List[Document]:
+        """Load bibtex file documents from the given bibtex file path.
+
+        See https://bibtexparser.readthedocs.io/en/master/
+
+        Args:
+            file_path: the path to the bibtex file
+
+        Returns:
+            a list of documents with the document.page_content in text format
+        """
+        return list(self.lazy_load())