community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463)

Moved the following modules to new package langchain-community in a backwards compatible fashion: ``` mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community ``` Moved the following to core ``` mv langchain/langchain/utils/json_schema.py core/langchain_core/utils mv langchain/langchain/utils/html.py core/langchain_core/utils mv langchain/langchain/utils/strings.py core/langchain_core/utils cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py rm langchain/langchain/utils/env.py ``` See .scripts/community_split/script_integrations.sh for all changes
2025-09-16 23:13:31 +00:00 · 2023-12-11 13:53:30 -08:00
parent c0f4b95aa9
commit ed58eeb9c5
2446 changed files with 171805 additions and 137118 deletions
--- a/libs/community/langchain_community/retrievers/bm25.py
+++ b/libs/community/langchain_community/retrievers/bm25.py
@@ -0,0 +1,103 @@
+from __future__ import annotations
+
+from typing import Any, Callable, Dict, Iterable, List, Optional
+
+from langchain_core.callbacks import CallbackManagerForRetrieverRun
+from langchain_core.documents import Document
+from langchain_core.retrievers import BaseRetriever
+
+
+def default_preprocessing_func(text: str) -> List[str]:
+    return text.split()
+
+
+class BM25Retriever(BaseRetriever):
+    """`BM25` retriever without Elasticsearch."""
+
+    vectorizer: Any
+    """ BM25 vectorizer."""
+    docs: List[Document]
+    """ List of documents."""
+    k: int = 4
+    """ Number of documents to return."""
+    preprocess_func: Callable[[str], List[str]] = default_preprocessing_func
+    """ Preprocessing function to use on the text before BM25 vectorization."""
+
+    class Config:
+        """Configuration for this pydantic object."""
+
+        arbitrary_types_allowed = True
+
+    @classmethod
+    def from_texts(
+        cls,
+        texts: Iterable[str],
+        metadatas: Optional[Iterable[dict]] = None,
+        bm25_params: Optional[Dict[str, Any]] = None,
+        preprocess_func: Callable[[str], List[str]] = default_preprocessing_func,
+        **kwargs: Any,
+    ) -> BM25Retriever:
+        """
+        Create a BM25Retriever from a list of texts.
+        Args:
+            texts: A list of texts to vectorize.
+            metadatas: A list of metadata dicts to associate with each text.
+            bm25_params: Parameters to pass to the BM25 vectorizer.
+            preprocess_func: A function to preprocess each text before vectorization.
+            **kwargs: Any other arguments to pass to the retriever.
+
+        Returns:
+            A BM25Retriever instance.
+        """
+        try:
+            from rank_bm25 import BM25Okapi
+        except ImportError:
+            raise ImportError(
+                "Could not import rank_bm25, please install with `pip install "
+                "rank_bm25`."
+            )
+
+        texts_processed = [preprocess_func(t) for t in texts]
+        bm25_params = bm25_params or {}
+        vectorizer = BM25Okapi(texts_processed, **bm25_params)
+        metadatas = metadatas or ({} for _ in texts)
+        docs = [Document(page_content=t, metadata=m) for t, m in zip(texts, metadatas)]
+        return cls(
+            vectorizer=vectorizer, docs=docs, preprocess_func=preprocess_func, **kwargs
+        )
+
+    @classmethod
+    def from_documents(
+        cls,
+        documents: Iterable[Document],
+        *,
+        bm25_params: Optional[Dict[str, Any]] = None,
+        preprocess_func: Callable[[str], List[str]] = default_preprocessing_func,
+        **kwargs: Any,
+    ) -> BM25Retriever:
+        """
+        Create a BM25Retriever from a list of Documents.
+        Args:
+            documents: A list of Documents to vectorize.
+            bm25_params: Parameters to pass to the BM25 vectorizer.
+            preprocess_func: A function to preprocess each text before vectorization.
+            **kwargs: Any other arguments to pass to the retriever.
+
+        Returns:
+            A BM25Retriever instance.
+        """
+        texts, metadatas = zip(*((d.page_content, d.metadata) for d in documents))
+        return cls.from_texts(
+            texts=texts,
+            bm25_params=bm25_params,
+            metadatas=metadatas,
+            preprocess_func=preprocess_func,
+            **kwargs,
+        )
+
+    def _get_relevant_documents(
+        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+    ) -> List[Document]:
+        processed_query = self.preprocess_func(query)
+        return_docs = self.vectorizer.get_top_n(processed_query, self.docs, n=self.k)
+        return return_docs