Merge branch 'master' into pprados/06-pdfplumber

2025-09-04 12:39:32 +00:00 · 2025-02-11 09:50:06 +01:00
parent 535e222068 3b3d52206f
commit 7733591803
9 changed files with 60 additions and 51 deletions
--- a/docs/docs/how_to/functions.ipynb
+++ b/docs/docs/how_to/functions.ipynb
@@ -99,8 +99,6 @@
    "\n",
    "prompt = ChatPromptTemplate.from_template(\"what is {a} + {b}\")\n",
    "\n",
-    "chain1 = prompt | model\n",
-    "\n",
    "chain = (\n",
    "    {\n",
    "        \"a\": itemgetter(\"foo\") | RunnableLambda(length_function),\n",
--- a/docs/docs/integrations/providers/nimble.ipynb
+++ b/docs/docs/integrations/providers/nimble.ipynb
@@ -27,18 +27,12 @@
    "If you'd like to learn more about Nimble, visit us at [nimbleway.com](https://www.nimbleway.com/).\n",
    "\n",
    "\n",
-    "## Currently we expose the following components\n",
-    "\n",
-    "*   **Retriever** - Allow us to query the internet and get parsed textual results utilizing several search engines.\n",
-    "\n",
-    "\n"
+    "## Retrievers:"
   ]
  },
  {
   "cell_type": "markdown",
-   "source": [
-    "## Usage"
-   ],
+   "source": "### NimbleSearchRetriever",
   "metadata": {
    "id": "AuMFgVFrKbNH"
   },
@@ -47,33 +41,42 @@
  {
   "cell_type": "markdown",
   "source": [
-    "In order to use our provider you have to provide an API key like so"
-   ],
-   "metadata": {
-    "id": "sFlPjZX9KdK6"
-   },
-   "id": "sFlPjZX9KdK6"
-  },
-  {
-   "cell_type": "code",
-   "source": [
-    "import getpass\n",
-    "import os\n",
+    "Enables developers to build RAG applications and AI Agents that can search, access, and retrieve online information from anywhere on the web.\n",
    "\n",
-    "os.environ[\"NIMBLE_API_KEY\"] = getpass.getpass()"
+    "We need to install the `langchain-nimble` python package."
   ],
   "metadata": {
-    "id": "eAqSHZ-Z8R3F"
+    "id": "sFlPjZX9KdK6"
   },
-   "id": "eAqSHZ-Z8R3F",
+   "id": "sFlPjZX9KdK6"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
   "execution_count": null,
-   "outputs": []
+   "source": "%pip install -U langchain-nimble",
+   "id": "65f237c852aa3885"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "See a [usage example](/docs/integrations/retrievers/nimble/).",
+   "id": "77bd7b9a6a8e381b"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "```python\n",
+    "from langchain_nimble import NimbleSearchRetriever\n",
+    "```"
+   ],
+   "id": "511f9d569c21a5d2"
  },
  {
   "cell_type": "markdown",
-   "source": [
-    "For more information about the Authentication process, see [Nimble APIs Authentication Documentation](https://docs.nimbleway.com/nimble-sdk/web-api/nimble-web-api-quick-start-guide/nimble-apis-authentication)."
-   ],
+   "source": "Note that authentication is required; please refer to the [Setup section in the documentation](/docs/integrations/retrievers/nimble/#setup).",
   "metadata": {
    "id": "WfwnI_RS8PO5"
   },
--- a/docs/docs/integrations/retrievers/nimble.ipynb
+++ b/docs/docs/integrations/retrievers/nimble.ipynb
--- a/docs/docs/integrations/vectorstores/vearch.ipynb
+++ b/docs/docs/integrations/vectorstores/vearch.ipynb
@@ -156,6 +156,15 @@
    "    db_name=\"vearch_cluster_langchian\",\n",
    "    table_name=\"tobenumone\",\n",
    "    flag=1,\n",
+    ")\n",
+    "\n",
+    "# The vector data is usually already initialized, so we don’t need the document parameter and can directly create the object.\n",
+    "vearch_cluster_b = Vearch(\n",
+    "    embeddings,\n",
+    "    path_or_url=\"http://test-vearch-langchain-router.vectorbase.svc.ht1.n.jd.local\",\n",
+    "    db_name=\"vearch_cluster_langchian\",\n",
+    "    table_name=\"tobenumone\",\n",
+    "    flag=1,\n",
    ")"
   ]
  },
@@ -244,6 +253,7 @@
   ],
   "source": [
    "query = \"你知道凌波微步吗，你知道都有谁会凌波微步?\"\n",
+    "# The second parameter is the top-n to retrieve, and its default value is 4.\n",
    "vearch_standalone_res = vearch_standalone.similarity_search(query, 3)\n",
    "for idx, tmp in enumerate(vearch_standalone_res):\n",
    "    print(f\"{'#'*20}第{idx+1}段相关文档{'#'*20}\\n\\n{tmp.page_content}\\n\")\n",
@@ -261,6 +271,11 @@
    "for idx, tmp in enumerate(cluster_res):\n",
    "    print(f\"{'#'*20}第{idx+1}段相关文档{'#'*20}\\n\\n{tmp.page_content}\\n\")\n",
    "\n",
+    "# In practical applications, we usually set a minimum similarity threshold. The min_score argument below sets this value.\n",
+    "cluster_res_with_bound = vearch_cluster.similarity_search_with_score(\n",
+    "    query=query_c, k=3, min_score=0.5\n",
+    ")\n",
+    "\n",
    "# combine your local knowleadge and query\n",
    "context_c = \"\".join([tmp.page_content for tmp in cluster_res])\n",
    "new_query_c = f\"基于以下信息，尽可能准确的来回答用户的问题。背景信息:\\n {context_c} \\n 回答用户这个问题:{query_c}\\n\\n\"\n",
--- a/docs/docs/tutorials/classification.ipynb
+++ b/docs/docs/tutorials/classification.ipynb
@@ -154,7 +154,7 @@
   "id": "ff3cf30d",
   "metadata": {},
   "source": [
-    "If we want dictionary output, we can just call `.dict()`"
+    "If we want dictionary output, we can just call `.model_dump()`"
   ]
  },
  {
@@ -179,7 +179,7 @@
    "prompt = tagging_prompt.invoke({\"input\": inp})\n",
    "response = llm.invoke(prompt)\n",
    "\n",
-    "response.dict()"
+    "response.model_dump()"
   ]
  },
  {
--- a/libs/community/extended_testing_deps.txt
+++ b/libs/community/extended_testing_deps.txt
@@ -64,7 +64,7 @@ pdfplumber>=0.11
 pgvector>=0.1.6,<0.2
 playwright>=1.48.0,<2
 praw>=7.7.1,<8
-premai>=0.3.25,<0.4
+premai>=0.3.25,<0.4,!=0.3.100
 psychicapi>=0.8.0,<0.9
 pydantic>=2.7.4,<3
 pytesseract>=0.3.13
--- a/libs/community/langchain_community/document_loaders/youtube.py
+++ b/libs/community/langchain_community/document_loaders/youtube.py
@@ -392,11 +392,11 @@ class GoogleApiYoutubeLoader(BaseLoader):

    @model_validator(mode="before")
    @classmethod
-    def validate_channel_or_videoIds_is_set(cls, values: Dict[str, Any]) -> Any:
+    def validate_channel_or_videoIds_is_set(cls, values: Any) -> Any:
        """Validate that either folder_id or document_ids is set, but not both."""
-        if not values.get("channel_name") and not values.get("video_ids"):
+        if not values.kwargs.get("channel_name") and not values.kwargs.get("video_ids"):
            raise ValueError("Must specify either channel_name or video_ids")
-        return values
+        return values.kwargs

    def _get_transcripe_for_video_id(self, video_id: str) -> str:
        from youtube_transcript_api import NoTranscriptFound, YouTubeTranscriptApi
--- a/libs/community/langchain_community/retrievers/pinecone_hybrid_search.py
+++ b/libs/community/langchain_community/retrievers/pinecone_hybrid_search.py
@@ -31,6 +31,7 @@ def create_index(
    ids: Optional[List[str]] = None,
    metadatas: Optional[List[dict]] = None,
    namespace: Optional[str] = None,
+    text_key: str = "context",
 ) -> None:
    """Create an index from a list of contexts.

@@ -69,7 +70,7 @@ def create_index(
        )
        # add context passages as metadata
        meta = [
-            {"context": context, **metadata}
+            {text_key: context, **metadata}
            for context, metadata in zip(context_batch, metadata_batch)
        ]

@@ -114,7 +115,7 @@ class PineconeHybridSearchRetriever(BaseRetriever):
    """Alpha value for hybrid search."""
    namespace: Optional[str] = None
    """Namespace value for index partition."""
-
+    text_key: str = "context"
    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        extra="forbid",
@@ -135,6 +136,7 @@ class PineconeHybridSearchRetriever(BaseRetriever):
            ids=ids,
            metadatas=metadatas,
            namespace=namespace,
+            text_key=self.text_key,
        )

    @pre_init
@@ -174,7 +176,7 @@ class PineconeHybridSearchRetriever(BaseRetriever):
        )
        final_result = []
        for res in result["matches"]:
-            context = res["metadata"].pop("context")
+            context = res["metadata"].pop(self.text_key)
            metadata = res["metadata"]
            if "score" not in metadata and "score" in res:
                metadata["score"] = res["score"]
--- a/libs/community/langchain_community/utilities/wikidata.py
+++ b/libs/community/langchain_community/utilities/wikidata.py
@@ -70,7 +70,7 @@ DEFAULT_PROPERTIES = [
 DEFAULT_LANG_CODE = "en"
 WIKIDATA_USER_AGENT = "langchain-wikidata"
 WIKIDATA_API_URL = "https://www.wikidata.org/w/api.php"
-WIKIDATA_REST_API_URL = "https://www.wikidata.org/w/rest.php/wikibase/v0/"
+WIKIDATA_REST_API_URL = "https://www.wikidata.org/w/rest.php/wikibase/v1/"


 class WikidataAPIWrapper(BaseModel):