diff --git a/langchain/chains/research/fetch.py b/langchain/chains/research/download.py similarity index 92% rename from langchain/chains/research/fetch.py rename to langchain/chains/research/download.py index 372dfb1f4f1..e0448829b24 100644 --- a/langchain/chains/research/fetch.py +++ b/langchain/chains/research/download.py @@ -15,6 +15,7 @@ import abc import asyncio import mimetypes from typing import Any, List, Optional, Sequence +from pydantic import ValidationError from bs4 import BeautifulSoup @@ -109,10 +110,18 @@ class RequestsDownloadHandler(DownloadHandler): def _repackage_as_blobs(urls: Sequence[str], contents: Sequence[str]) -> List[Blob]: """Repackage the contents as blobs.""" - return [ - Blob(data=content, mimetype=mimetypes.guess_type(url)[0], path=url) - for url, content in zip(urls, contents) - ] + blobs = [] + for url, content in zip(urls, contents): + mimetype = mimetypes.guess_type(url)[0] + try: + blobs.append(Blob(data=content, mimetype=mimetype, path=url)) + except ValidationError: + raise ValueError( + f"Could not create a blob for content at {url}. " + f"Content type is {type(content)}" + ) + + return blobs class AutoDownloadHandler(DownloadHandler): diff --git a/langchain/chains/research/typedefs.py b/langchain/chains/research/typedefs.py index acfbaee0190..8cb7d4ec37d 100644 --- a/langchain/chains/research/typedefs.py +++ b/langchain/chains/research/typedefs.py @@ -1,55 +1,12 @@ import abc -from typing import Any, List, Mapping, Sequence +from typing import List -from langchain.callbacks.manager import Callbacks from langchain.document_loaders.blob_loaders import Blob -# -# class AbstractQueryGenerator(abc.ABC): -# """Abstract class for generating queries.""" -# -# @abc.abstractmethod -# def generate_queries(self, question: str, callbacks: Callbacks = None) -> List[str]: -# """Generate queries for the given question.""" -# raise NotImplementedError() -# -# @abc.abstractmethod -# async def agenerate_queries( -# self, question: str, callbacks: Callbacks = None -# ) -> List[str]: -# """Generate queries for the given question.""" -# raise NotImplementedError() -# -# -# class AbstractSearcher(abc.ABC): -# """Abstract class for running searches.""" -# -# def search(self, queries: Sequence[str]) -> List[Mapping[str, Any]]: -# """Run a search for the given query. -# -# Args: -# queries: the query to run the search for. -# -# Returns: -# a list of search results. -# """ -# raise NotImplementedError() -# -# async def asearch(self, queries: Sequence[str]) -> List[Mapping[str, Any]]: -# """Run a search for the given query. -# -# Args: -# queries: the query to run the search for. -# -# Returns: -# a list of search results. -# """ -# raise NotImplementedError() -# -# -# class BlobCrawler(abc.ABC): -# """Crawl a blob and identify links to related content.""" -# -# @abc.abstractmethod -# def crawl(self, blob: Blob, query: str, callbacks: Callbacks = None) -> List[str]: -# """Explore the blob and identify links to related content that is relevant to the query.""" + +class BlobCrawler(abc.ABC): + """Crawl a blob and identify links to related content.""" + + @abc.abstractmethod + def crawl(self, blob: Blob, query: str) -> List[str]: + """Explore the blob and identify links to relevant content."""