Add youdotcom retriever (#11304)

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
mrbean 2023-10-05 16:48:11 -04:00 committed by GitHub
parent 1655ff2ded
commit 9903a70379
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 150 additions and 0 deletions

View File

@ -0,0 +1,62 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "47828a7a",
"metadata": {},
"source": [
"## Using the You.com Retriever\n",
"The You.com retriever is well suited to retrieving large amounts of text. For each URL it finds relevant, it returns several of the best-matching text snippets.\n",
"\n",
"First, make sure the `YDC_API_KEY` environment variable is set, then initialize the retriever:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a90d61d4",
"metadata": {},
"outputs": [],
"source": [
"from langchain.retrievers.you import YouRetriever\n",
"from langchain.chains import RetrievalQA\n",
"from langchain.llms import OpenAI\n",
"\n",
"yr = YouRetriever()\n",
"qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type=\"map_reduce\", retriever=yr)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4a223f2f",
"metadata": {},
"outputs": [],
"source": [
"query = \"what starting ohio state quarterback most recently went their entire college career without beating Michigan?\"\n",
"qa.run(query)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.17"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,46 @@
from typing import Any, Dict, List
from langchain.callbacks.manager import CallbackManagerForRetrieverRun
from langchain.pydantic_v1 import root_validator
from langchain.schema import BaseRetriever, Document
from langchain.utils import get_from_dict_or_env
class YouRetriever(BaseRetriever):
    """`You` retriever that uses You.com's search API.

    Connecting to the You.com API requires an API key, which you can get by
    emailing api@you.com. Documentation is available at
    https://documentation.you.com.

    You need to set the environment variable `YDC_API_KEY` (or pass
    `ydc_api_key` explicitly) for the retriever to operate.
    """

    # API key sent in the `X-API-Key` header of every search request.
    ydc_api_key: str

    @root_validator(pre=True)
    def validate_client(
        cls,
        values: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Resolve `ydc_api_key` before field validation runs.

        The value is looked up with `get_from_dict_or_env`, using the
        `ydc_api_key` entry of `values` and the `YDC_API_KEY` environment
        variable.

        Args:
            values: Raw keyword arguments the model is being built from.

        Returns:
            The same mapping with `ydc_api_key` filled in.
        """
        values["ydc_api_key"] = get_from_dict_or_env(
            values, "ydc_api_key", "YDC_API_KEY"
        )
        return values

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Search You.com and return one `Document` per returned snippet.

        Args:
            query: Free-text search query.
            run_manager: Callback manager for this retriever run (unused here).

        Returns:
            Documents in the order the API returned them: every snippet of the
            first hit, then every snippet of the second hit, and so on.

        Raises:
            KeyError: If the JSON response has no `hits` entry, or a hit has
                no `snippets` entry.
        """
        # Imported lazily so `requests` is only needed when a search is run.
        import requests

        # Pass the query through `params` so `requests` percent-encodes it.
        # Interpolating it into the URL would let characters such as `&`, `#`
        # or `=` truncate or corrupt the query string.
        results = requests.get(
            "https://api.ydc-index.io/search",
            params={"query": query},
            headers={"X-API-Key": self.ydc_api_key},
        ).json()
        return [
            Document(page_content=snippet)
            for hit in results["hits"]
            for snippet in hit["snippets"]
        ]

View File

@ -0,0 +1,16 @@
import os
from langchain.retrievers.you import YouRetriever
class TestYouRetriever:
    """Integration tests that run `YouRetriever` without any mocking.

    A `YDC_API_KEY` environment variable must be set for the suite to run.
    """

    @classmethod
    def setup_class(cls) -> None:
        """Abort the suite with a clear error when no API key is configured."""
        api_key = os.getenv("YDC_API_KEY")
        if api_key:
            return
        raise ValueError("YDC_API_KEY environment variable is not set")

    def test_get_relevant_documents(self) -> None:
        """A simple query should yield at least one document."""
        documents = YouRetriever().get_relevant_documents("test")
        assert len(documents) > 0

View File

@ -0,0 +1,26 @@
import json
import os
from unittest import mock
from requests import Response
from langchain.retrievers.you import YouRetriever
from langchain.schema import Document
class TestYouRetriever:
    """Unit tests for `YouRetriever` with the HTTP call mocked out."""

    def test_get_relevant_documents(self) -> None:
        """Each snippet of each hit becomes one `Document`, in API order."""
        fixture = {"hits": [{"snippets": ["yo"]}, {"snippets": ["bird up"]}]}
        response = Response()
        response.status_code = 200
        response._content = json.dumps(fixture).encode("utf-8")

        # `patch.dict` restores os.environ on exit, so the fake key cannot
        # leak into other tests that check whether YDC_API_KEY is set.
        with mock.patch.dict(
            os.environ, {"YDC_API_KEY": "MOCK KEY!"}
        ), mock.patch("requests.get", return_value=response) as mock_get:
            retriever = YouRetriever()
            actual = retriever.get_relevant_documents("test")

        mock_get.assert_called_once()
        assert actual == [
            Document(page_content="yo"),
            Document(page_content="bird up"),
        ]