From 46908ee3dab4b07d7f94960f75920b4a1da3ad26 Mon Sep 17 00:00:00 2001 From: Ikko Eltociear Ashimine Date: Fri, 28 Feb 2025 22:45:06 +0900 Subject: [PATCH 1/5] docs: update google_cloud_vertexai_rerank.ipynb (#30039) recieve -> receive --- .../document_transformers/google_cloud_vertexai_rerank.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/docs/integrations/document_transformers/google_cloud_vertexai_rerank.ipynb b/docs/docs/integrations/document_transformers/google_cloud_vertexai_rerank.ipynb index 12ce774a1a8..96eec3ce67d 100644 --- a/docs/docs/integrations/document_transformers/google_cloud_vertexai_rerank.ipynb +++ b/docs/docs/integrations/document_transformers/google_cloud_vertexai_rerank.ipynb @@ -546,7 +546,7 @@ "id": "ud_cnGszb1i9" }, "source": [ - "Let's inspect a couple of reranked documents. We observe that the retriever still returns the relevant Langchain type [documents](https://python.langchain.com/api_reference/core/documents/langchain_core.documents.base.Document.html) but as part of the metadata field, we also recieve the `relevance_score` from the Ranking API." + "Let's inspect a couple of reranked documents. We observe that the retriever still returns the relevant Langchain type [documents](https://python.langchain.com/api_reference/core/documents/langchain_core.documents.base.Document.html) but as part of the metadata field, we also receive the `relevance_score` from the Ranking API." ] }, { From 186cd7f1a1b771bc9999cb2bd5bcc7d72066ae64 Mon Sep 17 00:00:00 2001 From: Daniel Rauber Date: Fri, 28 Feb 2025 14:45:51 +0100 Subject: [PATCH 2/5] community: PlaywrightURLLoader should wait for page load event before attempting to extract data (#30043) ## Description The PlaywrightURLLoader should wait for a page to be loaded before attempting to extract data. 
--- .../langchain_community/document_loaders/url_playwright.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libs/community/langchain_community/document_loaders/url_playwright.py b/libs/community/langchain_community/document_loaders/url_playwright.py index 3f350615401..8e06ce4be0e 100644 --- a/libs/community/langchain_community/document_loaders/url_playwright.py +++ b/libs/community/langchain_community/document_loaders/url_playwright.py @@ -177,6 +177,8 @@ class PlaywrightURLLoader(BaseLoader): if response is None: raise ValueError(f"page.goto() returned None for url {url}") + page.wait_for_load_state("load") + text = self.evaluator.evaluate(page, browser, response) metadata = {"source": url} yield Document(page_content=text, metadata=metadata) @@ -216,6 +218,8 @@ class PlaywrightURLLoader(BaseLoader): if response is None: raise ValueError(f"page.goto() returned None for url {url}") + await page.wait_for_load_state("load") + text = await self.evaluator.evaluate_async(page, browser, response) metadata = {"source": url} yield Document(page_content=text, metadata=metadata) From f07338d2bfb71e1eba057c30b65b3ef602702436 Mon Sep 17 00:00:00 2001 From: Fakai Zhao Date: Fri, 28 Feb 2025 21:50:22 +0800 Subject: [PATCH 3/5] Implementing the MMR algorithm for OLAP vector storage (#30033) Thank you for contributing to LangChain! - **Implementing the MMR algorithm for OLAP vector storage**: - Support Apache Doris and StarRocks OLAP database. - Example: "vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 10})" - **Implementing the MMR algorithm for OLAP vector storage**: - **Apache Doris - **StarRocks - **Dependencies:** any dependencies required for this change - **Twitter handle:** if your PR gets announced, and you'd like a mention, we'll gladly shout you out! 
- **Add tests and docs**: - Example: "vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 10})" - [ ] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ Additional guidelines: - Make sure optional dependencies are imported within a function. - Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests. - Most PRs should not touch more than one package. - Changes should be backwards compatible. - If you are adding something to community, do not re-import it in langchain. If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. --------- Co-authored-by: fakzhao --- .../vectorstores/apache_doris.py | 100 ++++++++++++++++- .../vectorstores/starrocks.py | 101 +++++++++++++++++- 2 files changed, 191 insertions(+), 10 deletions(-) diff --git a/libs/community/langchain_community/vectorstores/apache_doris.py b/libs/community/langchain_community/vectorstores/apache_doris.py index 56ee6c0f64f..0e88fba5bdf 100644 --- a/libs/community/langchain_community/vectorstores/apache_doris.py +++ b/libs/community/langchain_community/vectorstores/apache_doris.py @@ -4,16 +4,30 @@ import json import logging from hashlib import sha1 from threading import Thread -from typing import Any, Dict, Iterable, List, Optional, Tuple +from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union +import numpy as np from langchain_core.documents import Document from langchain_core.embeddings import Embeddings from langchain_core.vectorstores import VectorStore from pydantic_settings import BaseSettings, SettingsConfigDict +from typing_extensions import TypedDict + +from langchain_community.vectorstores.utils import maximal_marginal_relevance logger = logging.getLogger() DEBUG = False +Metadata = 
Mapping[str, Union[str, int, float, bool]] + + +class QueryResult(TypedDict): + ids: List[List[str]] + embeddings: List[Any] + documents: List[Document] + metadatas: Optional[List[Metadata]] + distances: Optional[List[float]] + class ApacheDorisSettings(BaseSettings): """Apache Doris client configuration. @@ -310,10 +324,13 @@ CREATE TABLE IF NOT EXISTS {self.config.database}.{self.config.table}( where_str = "" q_str = f""" - SELECT {self.config.column_map["document"]}, - {self.config.column_map["metadata"]}, + SELECT + id as id, + {self.config.column_map["document"]} as document, + {self.config.column_map["metadata"]} as metadata, cosine_distance(array[{q_emb_str}], - {self.config.column_map["embedding"]}) as dist + {self.config.column_map["embedding"]}) as dist, + {self.config.column_map["embedding"]} as embedding FROM {self.config.database}.{self.config.table} {where_str} ORDER BY dist {self.dist_order} @@ -371,12 +388,13 @@ CREATE TABLE IF NOT EXISTS {self.config.database}.{self.config.table}( """ q_str = self._build_query_sql(embedding, k, where_str) try: + q_r = _get_named_result(self.connection, q_str) return [ Document( page_content=r[self.config.column_map["document"]], metadata=json.loads(r[self.config.column_map["metadata"]]), ) - for r in _get_named_result(self.connection, q_str) + for r in q_r ] except Exception as e: logger.error(f"\033[91m\033[1m{type(e)}\033[0m \033[95m{str(e)}\033[0m") @@ -430,6 +448,63 @@ CREATE TABLE IF NOT EXISTS {self.config.database}.{self.config.table}( def metadata_column(self) -> str: return self.config.column_map["metadata"] + def max_marginal_relevance_search_by_vector( + self, + embedding: list[float], + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + **kwargs: Any, + ) -> list[Document]: + q_str = self._build_query_sql(embedding, fetch_k, None) + q_r = _get_named_result(self.connection, q_str) + results = QueryResult( + ids=[r["id"] for r in q_r], + embeddings=[ + 
json.loads(r[self.config.column_map["embedding"]]) for r in q_r + ], + documents=[r[self.config.column_map["document"]] for r in q_r], + metadatas=[json.loads(r[self.config.column_map["metadata"]]) for r in q_r], + distances=[r["dist"] for r in q_r], + ) + + mmr_selected = maximal_marginal_relevance( + np.array(embedding, dtype=np.float32), + results["embeddings"], + k=k, + lambda_mult=lambda_mult, + ) + + candidates = _results_to_docs(results) + + selected_results = [r for i, r in enumerate(candidates) if i in mmr_selected] + return selected_results + + def max_marginal_relevance_search( + self, + query: str, + k: int = 5, + fetch_k: int = 20, + lambda_mult: float = 0.5, + filter: Optional[Dict[str, str]] = None, + where_document: Optional[Dict[str, str]] = None, + **kwargs: Any, + ) -> List[Document]: + if self.embeddings is None: + raise ValueError( + "For MMR search, you must specify an embedding function oncreation." + ) + + embedding = self.embeddings.embed_query(query) + return self.max_marginal_relevance_search_by_vector( + embedding, + k, + fetch_k, + lambda_mult=lambda_mult, + filter=filter, + where_document=where_document, + ) + def _has_mul_sub_str(s: str, *args: Any) -> bool: """Check if a string has multiple substrings. 
@@ -480,3 +555,18 @@ def _get_named_result(connection: Any, query: str) -> List[dict[str, Any]]: _debug_output(result) cursor.close() return result + + +def _results_to_docs(results: Any) -> List[Document]: + return [doc for doc, _ in _results_to_docs_and_scores(results)] + + +def _results_to_docs_and_scores(results: Any) -> List[Tuple[Document, float]]: + return [ + (Document(page_content=result[0], metadata=result[1] or {}), result[2]) + for result in zip( + results["documents"], + results["metadatas"], + results["distances"], + ) + ] diff --git a/libs/community/langchain_community/vectorstores/starrocks.py b/libs/community/langchain_community/vectorstores/starrocks.py index 9298f12a78f..d3ce2dcc9b5 100644 --- a/libs/community/langchain_community/vectorstores/starrocks.py +++ b/libs/community/langchain_community/vectorstores/starrocks.py @@ -4,12 +4,16 @@ import json import logging from hashlib import sha1 from threading import Thread -from typing import Any, Dict, Iterable, List, Optional, Tuple +from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union +import numpy as np from langchain_core.documents import Document from langchain_core.embeddings import Embeddings from langchain_core.vectorstores import VectorStore from pydantic_settings import BaseSettings, SettingsConfigDict +from typing_extensions import TypedDict + +from langchain_community.vectorstores.utils import maximal_marginal_relevance logger = logging.getLogger() DEBUG = False @@ -66,6 +70,17 @@ def get_named_result(connection: Any, query: str) -> List[dict[str, Any]]: return result +Metadata = Mapping[str, Union[str, int, float, bool]] + + +class QueryResult(TypedDict): + ids: List[List[str]] + embeddings: List[Any] + documents: List[Document] + metadatas: Optional[List[Metadata]] + distances: Optional[List[float]] + + class StarRocksSettings(BaseSettings): """StarRocks client configuration. 
@@ -363,10 +378,13 @@ CREATE TABLE IF NOT EXISTS {self.config.database}.{self.config.table}( where_str = "" q_str = f""" - SELECT {self.config.column_map["document"]}, - {self.config.column_map["metadata"]}, + SELECT + id as id, + {self.config.column_map["document"]} as document, + {self.config.column_map["metadata"]} as metadata, cosine_similarity_norm(array[{q_emb_str}], - {self.config.column_map["embedding"]}) as dist + {self.config.column_map["embedding"]}) as dist, + {self.config.column_map["embedding"]} as embedding FROM {self.config.database}.{self.config.table} {where_str} ORDER BY dist {self.dist_order} @@ -424,12 +442,13 @@ CREATE TABLE IF NOT EXISTS {self.config.database}.{self.config.table}( """ q_str = self._build_query_sql(embedding, k, where_str) try: + q_r = get_named_result(self.connection, q_str) return [ Document( page_content=r[self.config.column_map["document"]], metadata=json.loads(r[self.config.column_map["metadata"]]), ) - for r in get_named_result(self.connection, q_str) + for r in q_r ] except Exception as e: logger.error(f"\033[91m\033[1m{type(e)}\033[0m \033[95m{str(e)}\033[0m") @@ -484,3 +503,75 @@ CREATE TABLE IF NOT EXISTS {self.config.database}.{self.config.table}( @property def metadata_column(self) -> str: return self.config.column_map["metadata"] + + def max_marginal_relevance_search_by_vector( + self, + embedding: list[float], + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + **kwargs: Any, + ) -> list[Document]: + q_str = self._build_query_sql(embedding, fetch_k, None) + q_r = get_named_result(self.connection, q_str) + results = QueryResult( + ids=[r["id"] for r in q_r], + embeddings=[ + json.loads(r[self.config.column_map["embedding"]]) for r in q_r + ], + documents=[r[self.config.column_map["document"]] for r in q_r], + metadatas=[json.loads(r[self.config.column_map["metadata"]]) for r in q_r], + distances=[r["dist"] for r in q_r], + ) + + mmr_selected = maximal_marginal_relevance( + np.array(embedding, 
dtype=np.float32), + results["embeddings"], + k=k, + lambda_mult=lambda_mult, + ) + + candidates = _results_to_docs(results) + + selected_results = [r for i, r in enumerate(candidates) if i in mmr_selected] + return selected_results + + def max_marginal_relevance_search( + self, + query: str, + k: int = 5, + fetch_k: int = 20, + lambda_mult: float = 0.5, + filter: Optional[Dict[str, str]] = None, + where_document: Optional[Dict[str, str]] = None, + **kwargs: Any, + ) -> List[Document]: + if self.embeddings is None: + raise ValueError( + "For MMR search, you must specify an embedding function oncreation." + ) + + embedding = self.embeddings.embed_query(query) + return self.max_marginal_relevance_search_by_vector( + embedding, + k, + fetch_k, + lambda_mult=lambda_mult, + filter=filter, + where_document=where_document, + ) + + +def _results_to_docs(results: Any) -> List[Document]: + return [doc for doc, _ in _results_to_docs_and_scores(results)] + + +def _results_to_docs_and_scores(results: Any) -> List[Tuple[Document, float]]: + return [ + (Document(page_content=result[0], metadata=result[1] or {}), result[2]) + for result in zip( + results["documents"], + results["metadatas"], + results["distances"], + ) + ] From 476cd26f57ec890a66f2bdc7dd2b0daf98356ea5 Mon Sep 17 00:00:00 2001 From: Tiest van Gool Date: Fri, 28 Feb 2025 07:08:12 -0700 Subject: [PATCH 4/5] Add xAI to ChatModelTabs drop down (#30028) Thank you for contributing to LangChain! - [ ] **PR title**: "docs: add xAI to ChatModelTabs" - [ ] **PR message**: - **Description:** Added `ChatXAI` to `ChatModelTabs` dropdown to improve visibility of xAI chat models (e.g., "grok-2", "grok-3"). - **Issue:** Follow-up to #30010 - **Dependencies:** none - **Twitter handle:** @tiestvangool If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. 
--------- Co-authored-by: Chester Curme --- docs/src/theme/ChatModelTabs.js | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/src/theme/ChatModelTabs.js b/docs/src/theme/ChatModelTabs.js index bbe840d8e34..b9a3044251a 100644 --- a/docs/src/theme/ChatModelTabs.js +++ b/docs/src/theme/ChatModelTabs.js @@ -211,6 +211,13 @@ ${llmVarName} = ChatWatsonx( apiKeyName: "DATABRICKS_TOKEN", packageName: "databricks-langchain", }, + { + value: "xai", + label: "xAI", + model: "grok-2", + apiKeyName: "XAI_API_KEY", + packageName: "langchain-xai", + }, ].map((item) => ({ ...item, ...overrideParams?.[item.value], From a1897ca62142515e1a5fb20d4b2bcf0eeee8b2ad Mon Sep 17 00:00:00 2001 From: Cheney Zhang Date: Fri, 28 Feb 2025 23:22:53 +0800 Subject: [PATCH 5/5] docs: refine milvus doc with hybrid-search (#30037) Milvus Document refinement: add more detailed hybrid search description with full-text search introduction here. Signed-off-by: ChengZi --- .../integrations/vectorstores/milvus.ipynb | 129 ++++++++++++++++-- 1 file changed, 115 insertions(+), 14 deletions(-) diff --git a/docs/docs/integrations/vectorstores/milvus.ipynb b/docs/docs/integrations/vectorstores/milvus.ipynb index e147541a04a..6e115f55754 100644 --- a/docs/docs/integrations/vectorstores/milvus.ipynb +++ b/docs/docs/integrations/vectorstores/milvus.ipynb @@ -206,7 +206,7 @@ "source": [ "Note the change in the URI below. 
Once the instance is initialized, navigate to http://127.0.0.1:9091/webui to view the local web UI.\n", "\n", - "Here is an example of how you would use a dense embedding + the Milvus BM25 built-in function to assemble a hybrid retrieval vector store instance:" + "Here is an example of how you create your vector store instance with the Milvus database service:" ] }, { @@ -218,28 +218,25 @@ "source": [ "from langchain_milvus import BM25BuiltInFunction, Milvus\n", "\n", - "dense_index_param = {\n", - " \"metric_type\": \"COSINE\",\n", - " \"index_type\": \"HNSW\",\n", - "}\n", - "sparse_index_param = {\n", - " \"metric_type\": \"BM25\",\n", - " \"index_type\": \"AUTOINDEX\",\n", - "}\n", - "\n", "URI = \"http://localhost:19530\"\n", "\n", "vectorstore = Milvus(\n", " embedding_function=embeddings,\n", - " builtin_function=BM25BuiltInFunction(output_field_names=\"sparse\"),\n", - " index_params=[dense_index_param, sparse_index_param],\n", - " vector_field=[\"dense\", \"sparse\"],\n", " connection_args={\"uri\": URI, \"token\": \"root:Milvus\", \"db_name\": \"milvus_demo\"},\n", + " index_params={\"index_type\": \"FLAT\", \"metric_type\": \"L2\"},\n", " consistency_level=\"Strong\",\n", " drop_old=False, # set to True if seeking to drop the collection with that name if it exists\n", ")" ] }, + { + "cell_type": "markdown", + "id": "6d5a9670", + "metadata": {}, + "source": [ + "> If you want to use Zilliz Cloud, the fully managed cloud service for Milvus, please adjust the uri and token, which correspond to the [Public Endpoint](https://docs.zilliz.com/docs/byoc/quick-start#free-cluster-details) and [API key](https://docs.zilliz.com/docs/byoc/quick-start#free-cluster-details) in Zilliz Cloud."
+ ] + }, { "cell_type": "markdown", "id": "cae1a7d5", @@ -552,6 +549,110 @@ "retriever.invoke(\"Stealing from the bank is a crime\", filter={\"source\": \"news\"})" ] }, + { + "cell_type": "markdown", + "id": "8edb47106e1a46a883d545849b8ab81b", + "metadata": { + "collapsed": false + }, + "source": [ + "\n", + "## Hybrid Search\n", + "\n", + "The most common hybrid search scenario is the dense + sparse hybrid search, where candidates are retrieved using both semantic vector similarity and precise keyword matching. Results from these methods are merged, reranked, and passed to an LLM to generate the final answer. This approach balances precision and semantic understanding, making it highly effective for diverse query scenarios.\n", + "\n", + "\n", + "### Full-text search\n", + "Since [Milvus 2.5](https://milvus.io/blog/introduce-milvus-2-5-full-text-search-powerful-metadata-filtering-and-more.md), full-text search is natively supported through the Sparse-BM25 approach, by representing the BM25 algorithm as sparse vectors. Milvus accepts raw text as input and automatically converts it into sparse vectors stored in a specified field, eliminating the need for manual sparse embedding generation.\n", + "\n", + "For full-text search Milvus VectorStore accepts a `builtin_function` parameter. Through this parameter, you can pass in an instance of the `BM25BuiltInFunction`. 
This differs from semantic search, which usually passes dense embeddings to the `VectorStore`.\n", + "\n", + "Here is a simple example of hybrid search in Milvus with OpenAI dense embedding for semantic search and BM25 for full-text search:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "10185d26023b46108eb7d9f57d49d2b3", + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "from langchain_milvus import BM25BuiltInFunction, Milvus\n", + "from langchain_openai import OpenAIEmbeddings\n", + "\n", + "vectorstore = Milvus.from_documents(\n", + " documents=documents,\n", + " embedding=OpenAIEmbeddings(),\n", + " builtin_function=BM25BuiltInFunction(),\n", + " # `dense` is for OpenAI embeddings, `sparse` is the output field of BM25 function\n", + " vector_field=[\"dense\", \"sparse\"],\n", + " connection_args={\n", + " \"uri\": URI,\n", + " },\n", + " consistency_level=\"Strong\",\n", + " drop_old=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "8763a12b2bbd4a93a75aff182afb95dc", + "metadata": { + "collapsed": false + }, + "source": [ + "> - When you use `BM25BuiltInFunction`, please note that the full-text search is available in Milvus Standalone and Milvus Distributed, but not in Milvus Lite, although it is on the roadmap for future inclusion. It will also be available in Zilliz Cloud (fully-managed Milvus) soon. Please reach out to support@zilliz.com for more information.\n", + "\n", + "In the code above, we define an instance of `BM25BuiltInFunction` and pass it to the `Milvus` object. `BM25BuiltInFunction` is a lightweight wrapper class for [`Function`](https://milvus.io/docs/manage-collections.md#Function) in Milvus.
We can use it with `OpenAIEmbeddings` to initialize a dense + sparse hybrid search Milvus vector store instance.\n", + "\n", + "`BM25BuiltInFunction` does not require the client to pass corpus or training, all are automatically processed at the Milvus server's end, so users do not need to care about any vocabulary and corpus. In addition, users can also customize the [analyzer](https://milvus.io/docs/analyzer-overview.md#Analyzer-Overview) to implement the custom text processing in the BM25." + ] + }, + { + "cell_type": "markdown", + "id": "7623eae2785240b9bd12b16a66d81610", + "metadata": { + "collapsed": false + }, + "source": [ + "### Rerank the candidates\n", + "After the first stage of retrieval, we need to rerank the candidates to get a better result. You can refer to the [Reranking](https://milvus.io/docs/reranking.md#Reranking) for more information.\n", + "\n", + "Here is an example for weighted reranking:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7cdc8c89c7104fffa095e18ddfef8986", + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "query = \"What are the novels Lila has written and what are their contents?\"\n", + "\n", + "vectorstore.similarity_search(\n", + " query, k=1, ranker_type=\"weighted\", ranker_params={\"weights\": [0.6, 0.4]}\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "b3965036", + "metadata": {}, + "source": [ + "For more information about Full-text search and Hybrid search, please refer to the [Using Full-Text Search with LangChain and Milvus](https://milvus.io/docs/full_text_search_with_langchain.md) and [Hybrid Retrieval with LangChain and Milvus](https://milvus.io/docs/milvus_hybrid_search_retriever.md)." + ] + }, { "cell_type": "markdown", "id": "8ac953f1", @@ -726,7 +827,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.10.0" } }, "nbformat": 4,