From c36e6d4371750037b156f76549519db0692b7b6f Mon Sep 17 00:00:00 2001 From: Edmond Wang Date: Mon, 10 Feb 2025 21:35:38 +0800 Subject: [PATCH 1/5] =?UTF-8?q?docs:=20Add=20Comments=20and=20Supplementar?= =?UTF-8?q?y=20Example=20Code=20to=20Vearch=20Vector=20Dat=E2=80=A6=20(#29?= =?UTF-8?q?706)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - **Description:** Added some comments to the example code in the Vearch vector database documentation and included commonly used sample code. - **Issue:** None - **Dependencies:** None --------- Co-authored-by: wangchuxiong --- docs/docs/integrations/vectorstores/vearch.ipynb | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docs/docs/integrations/vectorstores/vearch.ipynb b/docs/docs/integrations/vectorstores/vearch.ipynb index da2efb893a7..d5e40fdc4c1 100644 --- a/docs/docs/integrations/vectorstores/vearch.ipynb +++ b/docs/docs/integrations/vectorstores/vearch.ipynb @@ -156,6 +156,15 @@ " db_name=\"vearch_cluster_langchian\",\n", " table_name=\"tobenumone\",\n", " flag=1,\n", + ")\n", + "\n", + "# The vector data is usually already initialized, so we don’t need the document parameter and can directly create the object.\n", + "vearch_cluster_b = Vearch(\n", + " embeddings,\n", + " path_or_url=\"http://test-vearch-langchain-router.vectorbase.svc.ht1.n.jd.local\",\n", + " db_name=\"vearch_cluster_langchian\",\n", + " table_name=\"tobenumone\",\n", + " flag=1,\n", ")" ] }, @@ -244,6 +253,7 @@ ], "source": [ "query = \"你知道凌波微步吗,你知道都有谁会凌波微步?\"\n", + "# The second parameter is the top-n to retrieve, and its default value is 4.\n", "vearch_standalone_res = vearch_standalone.similarity_search(query, 3)\n", "for idx, tmp in enumerate(vearch_standalone_res):\n", " print(f\"{'#'*20}第{idx+1}段相关文档{'#'*20}\\n\\n{tmp.page_content}\\n\")\n", @@ -261,6 +271,11 @@ "for idx, tmp in enumerate(cluster_res):\n", " print(f\"{'#'*20}第{idx+1}段相关文档{'#'*20}\\n\\n{tmp.page_content}\\n\")\n", 
"\n", + "# In practical applications, we usually limit the boundary value of similarity. The following method can set this value.\n", + "cluster_res_with_bound = vearch_cluster.similarity_search_with_score(\n", + " query=query_c, k=3, min_score=0.5\n", + ")\n", + "\n", "# combine your local knowleadge and query\n", "context_c = \"\".join([tmp.page_content for tmp in cluster_res])\n", "new_query_c = f\"基于以下信息,尽可能准确的来回答用户的问题。背景信息:\\n {context_c} \\n 回答用户这个问题:{query_c}\\n\\n\"\n", From 6655246504085d7d9d68505fba30d1f14262eaad Mon Sep 17 00:00:00 2001 From: Tiest van Gool Date: Mon, 10 Feb 2025 06:38:15 -0700 Subject: [PATCH 2/5] Classification Tutorial: Replaced .dict() with .model_dump() method (#29701) The .dict() method is deprecated in Pydantic V2.0; use the `model_dump` method instead. Thank you for contributing to LangChain! - [ ] **PR title**: "package: description" - Where "package" is whichever of langchain, community, core, etc. is being modified. Use "docs: ..." for purely docs changes, "infra: ..." for CI changes. - Example: "community: add foobar LLM" - [ ] **PR message**: ***Delete this entire checklist*** and replace with - **Description:** a description of the change - **Issue:** the issue # it fixes, if applicable - **Dependencies:** any dependencies required for this change - **Twitter handle:** if your PR gets announced, and you'd like a mention, we'll gladly shout you out! - [ ] **Add tests and docs**: If you're adding a new integration, please include 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/docs/integrations` directory. - [ ] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ Additional guidelines: - Make sure optional dependencies are imported within a function. 
- Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests. - Most PRs should not touch more than one package. - Changes should be backwards compatible. - If you are adding something to community, do not re-import it in langchain. If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. --- docs/docs/tutorials/classification.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/docs/tutorials/classification.ipynb b/docs/docs/tutorials/classification.ipynb index b61ab56e3c3..4efafa165c7 100644 --- a/docs/docs/tutorials/classification.ipynb +++ b/docs/docs/tutorials/classification.ipynb @@ -154,7 +154,7 @@ "id": "ff3cf30d", "metadata": {}, "source": [ - "If we want dictionary output, we can just call `.dict()`" + "If we want dictionary output, we can just call `.model_dump()`" ] }, { @@ -179,7 +179,7 @@ "prompt = tagging_prompt.invoke({\"input\": inp})\n", "response = llm.invoke(prompt)\n", "\n", - "response.dict()" + "response.model_dump()" ] }, { From 894b0cac3c84ca9a0b6fa6e8d1f5e738fa32cdcc Mon Sep 17 00:00:00 2001 From: Jun He Date: Mon, 10 Feb 2025 07:53:21 -0600 Subject: [PATCH 3/5] docs: Remove redundant line (#29698) If I understand it correctly, chain1 is never used. 
--- docs/docs/how_to/functions.ipynb | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/docs/how_to/functions.ipynb b/docs/docs/how_to/functions.ipynb index 9a37eda942f..6ffec14c30a 100644 --- a/docs/docs/how_to/functions.ipynb +++ b/docs/docs/how_to/functions.ipynb @@ -99,8 +99,6 @@ "\n", "prompt = ChatPromptTemplate.from_template(\"what is {a} + {b}\")\n", "\n", - "chain1 = prompt | model\n", - "\n", "chain = (\n", " {\n", " \"a\": itemgetter(\"foo\") | RunnableLambda(length_function),\n", From 60740c44c53441a1e8d16fbd17cb162cc03b308e Mon Sep 17 00:00:00 2001 From: Changyong Um Date: Mon, 10 Feb 2025 22:56:37 +0900 Subject: [PATCH 4/5] community: Add configurable text key for indexing and the retriever in Pinecone Hybrid Search (#29697) **issue** In Langchain, the original content is generally stored under the `text` key. However, the `PineconeHybridSearchRetriever` searches the `context` field in the metadata and cannot change this key. To address this, I have modified the code to allow changing the key to something other than context. In my opinion, following Langchain's conventions, the `text` key seems more appropriate than `context`. However, since I wasn't sure about the author's intent, I have left the default value as `context`. --- .../retrievers/pinecone_hybrid_search.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/libs/community/langchain_community/retrievers/pinecone_hybrid_search.py b/libs/community/langchain_community/retrievers/pinecone_hybrid_search.py index a6e0f68002d..cd3e3e96d08 100644 --- a/libs/community/langchain_community/retrievers/pinecone_hybrid_search.py +++ b/libs/community/langchain_community/retrievers/pinecone_hybrid_search.py @@ -31,6 +31,7 @@ def create_index( ids: Optional[List[str]] = None, metadatas: Optional[List[dict]] = None, namespace: Optional[str] = None, + text_key: str = "context", ) -> None: """Create an index from a list of contexts. 
@@ -69,7 +70,7 @@ def create_index( ) # add context passages as metadata meta = [ - {"context": context, **metadata} + {text_key: context, **metadata} for context, metadata in zip(context_batch, metadata_batch) ] @@ -114,7 +115,7 @@ class PineconeHybridSearchRetriever(BaseRetriever): """Alpha value for hybrid search.""" namespace: Optional[str] = None """Namespace value for index partition.""" - + text_key: str = "context" model_config = ConfigDict( arbitrary_types_allowed=True, extra="forbid", @@ -135,6 +136,7 @@ class PineconeHybridSearchRetriever(BaseRetriever): ids=ids, metadatas=metadatas, namespace=namespace, + text_key=self.text_key, ) @pre_init @@ -174,7 +176,7 @@ class PineconeHybridSearchRetriever(BaseRetriever): ) final_result = [] for res in result["matches"]: - context = res["metadata"].pop("context") + context = res["metadata"].pop(self.text_key) metadata = res["metadata"] if "score" not in metadata and "score" in res: metadata["score"] = res["score"] From 624216aa64cc48e5a0f1db5f65d5c2dc8581f0f1 Mon Sep 17 00:00:00 2001 From: Bhav Sardana <40908961+sardanabhav@users.noreply.github.com> Date: Mon, 10 Feb 2025 19:27:58 +0530 Subject: [PATCH 5/5] community:Fix for Pydantic model validator of GoogleApiYoutubeLoader (#29694) - **Description:** Community: bugfix for Pydantic model validator for GoogleApiYoutubeLoader - **Issue:** #29165, #27432 Fix is similar to #29346 --- .../langchain_community/document_loaders/youtube.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/youtube.py b/libs/community/langchain_community/document_loaders/youtube.py index 64871eb3caf..1b99a8d2da2 100644 --- a/libs/community/langchain_community/document_loaders/youtube.py +++ b/libs/community/langchain_community/document_loaders/youtube.py @@ -392,11 +392,11 @@ class GoogleApiYoutubeLoader(BaseLoader): @model_validator(mode="before") @classmethod - def validate_channel_or_videoIds_is_set(cls, values: 
Dict[str, Any]) -> Any: + def validate_channel_or_videoIds_is_set(cls, values: Any) -> Any: """Validate that either folder_id or document_ids is set, but not both.""" - if not values.get("channel_name") and not values.get("video_ids"): + if not values.kwargs.get("channel_name") and not values.kwargs.get("video_ids"): raise ValueError("Must specify either channel_name or video_ids") - return values + return values.kwargs def _get_transcripe_for_video_id(self, video_id: str) -> str: from youtube_transcript_api import NoTranscriptFound, YouTubeTranscriptApi