Merge branch 'master' into pprados/06-pdfplumber

Author: Philippe PRADOS (committed via GitHub)
Date:   2025-02-11 09:50:06 +01:00
Commit: 7733591803
9 changed files with 60 additions and 51 deletions

========== file 1 of 9 ==========

@@ -99,8 +99,6 @@
     "\n",
     "prompt = ChatPromptTemplate.from_template(\"what is {a} + {b}\")\n",
     "\n",
-    "chain1 = prompt | model\n",
-    "\n",
     "chain = (\n",
     "    {\n",
     "        \"a\": itemgetter(\"foo\") | RunnableLambda(length_function),\n",

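For orientation, the cell this hunk trims builds a dict of branches that feeds a prompt, and the removed `chain1 = prompt | model` line appears to be dead code in that cell. Below is a minimal self-contained sketch of the pattern; the `length_function` body, the `b` branch, and the invocation values are assumptions filled in where the hunk truncates:

```python
from operator import itemgetter

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda


def length_function(text: str) -> int:
    """Toy branch logic: score an input by its length."""
    return len(text)


prompt = ChatPromptTemplate.from_template("what is {a} + {b}")

# The dict is coerced to a RunnableParallel: each key is computed from the
# chain's input dict, and the results are fed into the prompt.
chain = {
    "a": itemgetter("foo") | RunnableLambda(length_function),
    "b": itemgetter("bar") | RunnableLambda(length_function),  # assumed branch
} | prompt  # the full example pipes this into a chat model as well

print(chain.invoke({"foo": "bar", "bar": "gah"}))
```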
========== file 2 of 9 ==========

@@ -27,18 +27,12 @@
     "If you'd like to learn more about Nimble, visit us at [nimbleway.com](https://www.nimbleway.com/).\n",
     "\n",
     "\n",
-    "## Currently we expose the following components\n",
-    "\n",
-    "* **Retriever** - Allow us to query the internet and get parsed textual results utilizing several search engines.\n",
-    "\n",
-    "\n"
+    "## Retrievers:"
    ]
   },
   {
    "cell_type": "markdown",
-   "source": [
-    "## Usage"
-   ],
+   "source": "### NimbleSearchRetriever",
    "metadata": {
     "id": "AuMFgVFrKbNH"
    },
@@ -47,7 +41,9 @@
   {
    "cell_type": "markdown",
    "source": [
-    "In order to use our provider you have to provide an API key like so"
+    "Enables developers to build RAG applications and AI Agents that can search, access, and retrieve online information from anywhere on the web.\n",
+    "\n",
+    "We need to install the `langchain-nimble` python package."
    ],
    "metadata": {
     "id": "sFlPjZX9KdK6"
@@ -55,25 +51,32 @@
    "id": "sFlPjZX9KdK6"
   },
   {
+   "metadata": {},
    "cell_type": "code",
-   "source": [
-    "import getpass\n",
-    "import os\n",
-    "\n",
-    "os.environ[\"NIMBLE_API_KEY\"] = getpass.getpass()"
-   ],
-   "metadata": {
-    "id": "eAqSHZ-Z8R3F"
-   },
-   "id": "eAqSHZ-Z8R3F",
+   "outputs": [],
    "execution_count": null,
-   "outputs": []
+   "source": "%pip install -U langchain-nimble",
+   "id": "65f237c852aa3885"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "See a [usage example](/docs/integrations/retrievers/nimble/).",
+   "id": "77bd7b9a6a8e381b"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "```python\n",
+    "from langchain_nimble import NimbleSearchRetriever\n",
+    "```"
+   ],
+   "id": "511f9d569c21a5d2"
   },
   {
    "cell_type": "markdown",
-   "source": [
-    "For more information about the Authentication process, see [Nimble APIs Authentication Documentation](https://docs.nimbleway.com/nimble-sdk/web-api/nimble-web-api-quick-start-guide/nimble-apis-authentication)."
-   ],
+   "source": "Note that authentication is required; please refer to the [Setup section in the documentation](/docs/integrations/retrievers/nimble/#setup).",
    "metadata": {
     "id": "WfwnI_RS8PO5"
    },

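Read together, the new cells amount to: install the package, authenticate, import the retriever. A minimal usage sketch assuming the standard LangChain `BaseRetriever` interface; the diff shows no constructor arguments, so none are passed here:

```python
# %pip install -U langchain-nimble
import getpass
import os

from langchain_nimble import NimbleSearchRetriever

# Authentication is required; see the Setup section linked above.
if "NIMBLE_API_KEY" not in os.environ:
    os.environ["NIMBLE_API_KEY"] = getpass.getpass("Nimble API key: ")

retriever = NimbleSearchRetriever()
docs = retriever.invoke("What is LangChain?")  # standard retriever entry point
for doc in docs:
    print(doc.page_content[:200])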
========== file 3 of 9 ==========

File diff suppressed because one or more lines are too long

========== file 4 of 9 ==========

@@ -156,6 +156,15 @@
     "    db_name=\"vearch_cluster_langchian\",\n",
     "    table_name=\"tobenumone\",\n",
     "    flag=1,\n",
+    ")\n",
+    "\n",
+    "# The vector data is usually already initialized, so we don't need the document parameter and can directly create the object.\n",
+    "vearch_cluster_b = Vearch(\n",
+    "    embeddings,\n",
+    "    path_or_url=\"http://test-vearch-langchain-router.vectorbase.svc.ht1.n.jd.local\",\n",
+    "    db_name=\"vearch_cluster_langchian\",\n",
+    "    table_name=\"tobenumone\",\n",
+    "    flag=1,\n",
     ")"
    ]
   },
@@ -244,6 +253,7 @@
    ],
    "source": [
     "query = \"你知道凌波微步吗,你知道都有谁会凌波微步?\"\n",
+    "# The second parameter is the top-n to retrieve, and its default value is 4.\n",
     "vearch_standalone_res = vearch_standalone.similarity_search(query, 3)\n",
     "for idx, tmp in enumerate(vearch_standalone_res):\n",
     "    print(f\"{'#'*20}第{idx+1}段相关文档{'#'*20}\\n\\n{tmp.page_content}\\n\")\n",
@@ -261,6 +271,11 @@
     "for idx, tmp in enumerate(cluster_res):\n",
     "    print(f\"{'#'*20}第{idx+1}段相关文档{'#'*20}\\n\\n{tmp.page_content}\\n\")\n",
     "\n",
+    "# In practical applications, we usually limit the boundary value of similarity. The following method can set this value.\n",
+    "cluster_res_with_bound = vearch_cluster.similarity_search_with_score(\n",
+    "    query=query_c, k=3, min_score=0.5\n",
+    ")\n",
+    "\n",
     "# combine your local knowledge and query\n",
     "context_c = \"\".join([tmp.page_content for tmp in cluster_res])\n",
     "new_query_c = f\"基于以下信息,尽可能准确的来回答用户的问题。背景信息:\\n {context_c} \\n 回答用户这个问题:{query_c}\\n\\n\"\n",

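Since the new snippet only shows the call, a note on what it returns: by LangChain convention, `similarity_search_with_score` yields (document, score) pairs, and `min_score` filters out matches below the threshold. A small sketch of unpacking the bounded results, assuming the `cluster_res_with_bound` variable from the cell above:

```python
# Each entry is a (Document, score) tuple; only matches with
# score >= min_score survive the bound.
for doc, score in cluster_res_with_bound:
    print(f"score={score:.3f}  {doc.page_content[:80]}")
```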
========== file 5 of 9 ==========

@@ -154,7 +154,7 @@
    "id": "ff3cf30d",
    "metadata": {},
    "source": [
-    "If we want dictionary output, we can just call `.dict()`"
+    "If we want dictionary output, we can just call `.model_dump()`"
    ]
   },
   {
@@ -179,7 +179,7 @@
     "prompt = tagging_prompt.invoke({\"input\": inp})\n",
     "response = llm.invoke(prompt)\n",
     "\n",
-    "response.dict()"
+    "response.model_dump()"
    ]
   },
   {

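Background on the rename: Pydantic v2 deprecates `BaseModel.dict()` in favor of `model_dump()`, which returns the same plain-dict representation. A small sketch with an illustrative schema; the tutorial's actual `Classification` fields may differ:

```python
from pydantic import BaseModel


class Classification(BaseModel):
    # Illustrative fields for a tagging schema.
    sentiment: str
    aggressiveness: int
    language: str


response = Classification(sentiment="positive", aggressiveness=1, language="en")
print(response.model_dump())
# {'sentiment': 'positive', 'aggressiveness': 1, 'language': 'en'}
```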
========== file 6 of 9 ==========

@@ -64,7 +64,7 @@ pdfplumber>=0.11
 pgvector>=0.1.6,<0.2
 playwright>=1.48.0,<2
 praw>=7.7.1,<8
-premai>=0.3.25,<0.4
+premai>=0.3.25,<0.4,!=0.3.100
 psychicapi>=0.8.0,<0.9
 pydantic>=2.7.4,<3
 pytesseract>=0.3.13

========== file 7 of 9 ==========

@@ -392,11 +392,11 @@ class GoogleApiYoutubeLoader(BaseLoader):
     @model_validator(mode="before")
     @classmethod
-    def validate_channel_or_videoIds_is_set(cls, values: Dict[str, Any]) -> Any:
+    def validate_channel_or_videoIds_is_set(cls, values: Any) -> Any:
         """Validate that either folder_id or document_ids is set, but not both."""
-        if not values.get("channel_name") and not values.get("video_ids"):
+        if not values.kwargs.get("channel_name") and not values.kwargs.get("video_ids"):
             raise ValueError("Must specify either channel_name or video_ids")
-        return values
+        return values.kwargs
     def _get_transcripe_for_video_id(self, video_id: str) -> str:
         from youtube_transcript_api import NoTranscriptFound, YouTubeTranscriptApi

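Context for the `values.kwargs` change: when a `@model_validator(mode="before")` runs on a pydantic dataclass rather than a `BaseModel`, the raw input arrives as a pydantic-core `ArgsKwargs` object instead of a dict, so keyword arguments are read from `.kwargs`. A minimal sketch of that pattern, under the assumption that this loader is validated as a pydantic dataclass; the class name here is hypothetical:

```python
from typing import Any, List, Optional

from pydantic import model_validator
from pydantic.dataclasses import dataclass


@dataclass
class VideoSource:  # hypothetical stand-in for the loader
    channel_name: Optional[str] = None
    video_ids: Optional[List[str]] = None

    @model_validator(mode="before")
    @classmethod
    def check_source_is_set(cls, values: Any) -> Any:
        # On a pydantic dataclass, a "before" model validator receives an
        # ArgsKwargs object, not a dict: keyword args live under `values.kwargs`.
        if not values.kwargs.get("channel_name") and not values.kwargs.get("video_ids"):
            raise ValueError("Must specify either channel_name or video_ids")
        return values


VideoSource(channel_name="some-channel")  # passes validation
# VideoSource() would raise ValueError
```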
========== file 8 of 9 ==========

@@ -31,6 +31,7 @@ def create_index(
     ids: Optional[List[str]] = None,
     metadatas: Optional[List[dict]] = None,
     namespace: Optional[str] = None,
+    text_key: str = "context",
 ) -> None:
     """Create an index from a list of contexts.
@@ -69,7 +70,7 @@ def create_index(
         )
         # add context passages as metadata
         meta = [
-            {"context": context, **metadata}
+            {text_key: context, **metadata}
             for context, metadata in zip(context_batch, metadata_batch)
         ]
@@ -114,7 +115,7 @@ class PineconeHybridSearchRetriever(BaseRetriever):
     """Alpha value for hybrid search."""
     namespace: Optional[str] = None
     """Namespace value for index partition."""
+    text_key: str = "context"
     model_config = ConfigDict(
         arbitrary_types_allowed=True,
         extra="forbid",
@@ -135,6 +136,7 @@ class PineconeHybridSearchRetriever(BaseRetriever):
             ids=ids,
             metadatas=metadatas,
             namespace=namespace,
+            text_key=self.text_key,
         )
     @pre_init
@@ -174,7 +176,7 @@ class PineconeHybridSearchRetriever(BaseRetriever):
         )
         final_result = []
         for res in result["matches"]:
-            context = res["metadata"].pop("context")
+            context = res["metadata"].pop(self.text_key)
             metadata = res["metadata"]
             if "score" not in metadata and "score" in res:
                 metadata["score"] = res["score"]

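The net effect of the new `text_key` field is that the metadata key holding the passage text becomes configurable instead of being hard-coded to `"context"`. A hedged usage sketch, assuming the `langchain-openai`, `pinecone`, and `pinecone-text` packages and an existing dotproduct index; the index name and key are illustrative:

```python
from langchain_community.retrievers import PineconeHybridSearchRetriever
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone
from pinecone_text.sparse import BM25Encoder

# Assumed setup: an existing hybrid (dotproduct) Pinecone index.
index = Pinecone(api_key="...").Index("my-hybrid-index")

retriever = PineconeHybridSearchRetriever(
    embeddings=OpenAIEmbeddings(),
    sparse_encoder=BM25Encoder().default(),
    index=index,
    text_key="page_text",  # new: passages are stored and read back under this key
)
retriever.add_texts(["a passage", "another passage"])
docs = retriever.invoke("example query")
print(docs[0].page_content)
```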
========== file 9 of 9 ==========

@@ -70,7 +70,7 @@ DEFAULT_PROPERTIES = [
 DEFAULT_LANG_CODE = "en"
 WIKIDATA_USER_AGENT = "langchain-wikidata"
 WIKIDATA_API_URL = "https://www.wikidata.org/w/api.php"
-WIKIDATA_REST_API_URL = "https://www.wikidata.org/w/rest.php/wikibase/v0/"
+WIKIDATA_REST_API_URL = "https://www.wikidata.org/w/rest.php/wikibase/v1/"

 class WikidataAPIWrapper(BaseModel):
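For reference on the version bump: v1 is the stable release of the Wikibase REST API routes that v0 exposed experimentally. A hedged sketch of a direct call against the new base URL; the item id and field access are illustrative, not the wrapper's exact code:

```python
import requests

WIKIDATA_REST_API_URL = "https://www.wikidata.org/w/rest.php/wikibase/v1/"

# Fetch one item from the v1 REST API (Q42 is Douglas Adams).
resp = requests.get(
    WIKIDATA_REST_API_URL + "entities/items/Q42",
    headers={"User-Agent": "langchain-wikidata"},
)
resp.raise_for_status()
print(resp.json()["labels"]["en"])
```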