Merge branch 'master' into pprados/06-pdfplumber

This commit is contained in:
Philippe PRADOS 2025-02-11 09:50:06 +01:00 committed by GitHub
commit 7733591803
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 60 additions and 51 deletions

View File

@ -99,8 +99,6 @@
"\n",
"prompt = ChatPromptTemplate.from_template(\"what is {a} + {b}\")\n",
"\n",
"chain1 = prompt | model\n",
"\n",
"chain = (\n",
" {\n",
" \"a\": itemgetter(\"foo\") | RunnableLambda(length_function),\n",

View File

@ -27,18 +27,12 @@
"If you'd like to learn more about Nimble, visit us at [nimbleway.com](https://www.nimbleway.com/).\n",
"\n",
"\n",
"## Currently we expose the following components\n",
"\n",
"* **Retriever** - Allow us to query the internet and get parsed textual results utilizing several search engines.\n",
"\n",
"\n"
"## Retrievers:"
]
},
{
"cell_type": "markdown",
"source": [
"## Usage"
],
"source": "### NimbleSearchRetriever",
"metadata": {
"id": "AuMFgVFrKbNH"
},
@ -47,7 +41,9 @@
{
"cell_type": "markdown",
"source": [
"In order to use our provider you have to provide an API key like so"
"Enables developers to build RAG applications and AI Agents that can search, access, and retrieve online information from anywhere on the web.\n",
"\n",
"We need to install the `langchain-nimble` python package."
],
"metadata": {
"id": "sFlPjZX9KdK6"
@ -55,25 +51,32 @@
"id": "sFlPjZX9KdK6"
},
{
"metadata": {},
"cell_type": "code",
"source": [
"import getpass\n",
"import os\n",
"\n",
"os.environ[\"NIMBLE_API_KEY\"] = getpass.getpass()"
],
"metadata": {
"id": "eAqSHZ-Z8R3F"
},
"id": "eAqSHZ-Z8R3F",
"outputs": [],
"execution_count": null,
"outputs": []
"source": "%pip install -U langchain-nimble",
"id": "65f237c852aa3885"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "See a [usage example](/docs/integrations/retrievers/nimble/).",
"id": "77bd7b9a6a8e381b"
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"```python\n",
"from langchain_nimble import NimbleSearchRetriever\n",
"```"
],
"id": "511f9d569c21a5d2"
},
{
"cell_type": "markdown",
"source": [
"For more information about the Authentication process, see [Nimble APIs Authentication Documentation](https://docs.nimbleway.com/nimble-sdk/web-api/nimble-web-api-quick-start-guide/nimble-apis-authentication)."
],
"source": "Note that authentication is required, please refer to the [Setup section in the documentation](/docs/integrations/retrievers/nimble/#setup).",
"metadata": {
"id": "WfwnI_RS8PO5"
},

File diff suppressed because one or more lines are too long

View File

@ -156,6 +156,15 @@
" db_name=\"vearch_cluster_langchian\",\n",
" table_name=\"tobenumone\",\n",
" flag=1,\n",
")\n",
"\n",
"# The vector data is usually already initialized, so we don't need the document parameter and can directly create the object.\n",
"vearch_cluster_b = Vearch(\n",
" embeddings,\n",
" path_or_url=\"http://test-vearch-langchain-router.vectorbase.svc.ht1.n.jd.local\",\n",
" db_name=\"vearch_cluster_langchian\",\n",
" table_name=\"tobenumone\",\n",
" flag=1,\n",
")"
]
},
@ -244,6 +253,7 @@
],
"source": [
"query = \"你知道凌波微步吗,你知道都有谁会凌波微步?\"\n",
"# The second parameter is the top-n to retrieve, and its default value is 4.\n",
"vearch_standalone_res = vearch_standalone.similarity_search(query, 3)\n",
"for idx, tmp in enumerate(vearch_standalone_res):\n",
" print(f\"{'#'*20}第{idx+1}段相关文档{'#'*20}\\n\\n{tmp.page_content}\\n\")\n",
@ -261,6 +271,11 @@
"for idx, tmp in enumerate(cluster_res):\n",
" print(f\"{'#'*20}第{idx+1}段相关文档{'#'*20}\\n\\n{tmp.page_content}\\n\")\n",
"\n",
"# In practical applications, we usually limit the boundary value of similarity. The following method can set this value.\n",
"cluster_res_with_bound = vearch_cluster.similarity_search_with_score(\n",
" query=query_c, k=3, min_score=0.5\n",
")\n",
"\n",
"# combine your local knowledge and query\n",
"context_c = \"\".join([tmp.page_content for tmp in cluster_res])\n",
"new_query_c = f\"基于以下信息,尽可能准确的来回答用户的问题。背景信息:\\n {context_c} \\n 回答用户这个问题:{query_c}\\n\\n\"\n",

View File

@ -154,7 +154,7 @@
"id": "ff3cf30d",
"metadata": {},
"source": [
"If we want dictionary output, we can just call `.dict()`"
"If we want dictionary output, we can just call `.model_dump()`"
]
},
{
@ -179,7 +179,7 @@
"prompt = tagging_prompt.invoke({\"input\": inp})\n",
"response = llm.invoke(prompt)\n",
"\n",
"response.dict()"
"response.model_dump()"
]
},
{

View File

@ -64,7 +64,7 @@ pdfplumber>=0.11
pgvector>=0.1.6,<0.2
playwright>=1.48.0,<2
praw>=7.7.1,<8
premai>=0.3.25,<0.4
premai>=0.3.25,<0.4,!=0.3.100
psychicapi>=0.8.0,<0.9
pydantic>=2.7.4,<3
pytesseract>=0.3.13

View File

@ -392,11 +392,11 @@ class GoogleApiYoutubeLoader(BaseLoader):
@model_validator(mode="before")
@classmethod
def validate_channel_or_videoIds_is_set(cls, values: Dict[str, Any]) -> Any:
def validate_channel_or_videoIds_is_set(cls, values: Any) -> Any:
"""Validate that at least one of channel_name or video_ids is set."""
if not values.get("channel_name") and not values.get("video_ids"):
if not values.kwargs.get("channel_name") and not values.kwargs.get("video_ids"):
raise ValueError("Must specify either channel_name or video_ids")
return values
return values.kwargs
def _get_transcripe_for_video_id(self, video_id: str) -> str:
from youtube_transcript_api import NoTranscriptFound, YouTubeTranscriptApi

View File

@ -31,6 +31,7 @@ def create_index(
ids: Optional[List[str]] = None,
metadatas: Optional[List[dict]] = None,
namespace: Optional[str] = None,
text_key: str = "context",
) -> None:
"""Create an index from a list of contexts.
@ -69,7 +70,7 @@ def create_index(
)
# add context passages as metadata
meta = [
{"context": context, **metadata}
{text_key: context, **metadata}
for context, metadata in zip(context_batch, metadata_batch)
]
@ -114,7 +115,7 @@ class PineconeHybridSearchRetriever(BaseRetriever):
"""Alpha value for hybrid search."""
namespace: Optional[str] = None
"""Namespace value for index partition."""
text_key: str = "context"
model_config = ConfigDict(
arbitrary_types_allowed=True,
extra="forbid",
@ -135,6 +136,7 @@ class PineconeHybridSearchRetriever(BaseRetriever):
ids=ids,
metadatas=metadatas,
namespace=namespace,
text_key=self.text_key,
)
@pre_init
@ -174,7 +176,7 @@ class PineconeHybridSearchRetriever(BaseRetriever):
)
final_result = []
for res in result["matches"]:
context = res["metadata"].pop("context")
context = res["metadata"].pop(self.text_key)
metadata = res["metadata"]
if "score" not in metadata and "score" in res:
metadata["score"] = res["score"]

View File

@ -70,7 +70,7 @@ DEFAULT_PROPERTIES = [
DEFAULT_LANG_CODE = "en"
WIKIDATA_USER_AGENT = "langchain-wikidata"
WIKIDATA_API_URL = "https://www.wikidata.org/w/api.php"
WIKIDATA_REST_API_URL = "https://www.wikidata.org/w/rest.php/wikibase/v0/"
WIKIDATA_REST_API_URL = "https://www.wikidata.org/w/rest.php/wikibase/v1/"
class WikidataAPIWrapper(BaseModel):