diff --git a/libs/community/langchain_community/utilities/arxiv.py b/libs/community/langchain_community/utilities/arxiv.py
index 383df6396e1..2a0027fd5a2 100644
--- a/libs/community/langchain_community/utilities/arxiv.py
+++ b/libs/community/langchain_community/utilities/arxiv.py
@@ -27,6 +27,7 @@ class ArxivAPIWrapper(BaseModel):
     Attributes:
         top_k_results: number of the top-scored document used for the arxiv tool
         ARXIV_MAX_QUERY_LENGTH: the cut limit on the query used for the arxiv tool.
+        continue_on_failure (bool): If True, continue loading other URLs on failure.
         load_max_docs: a limit to the number of loaded documents
         load_all_available_meta:
             if True: the `metadata` of the loaded Documents contains all available
@@ -54,6 +55,7 @@ class ArxivAPIWrapper(BaseModel):
     arxiv_exceptions: Any  # :meta private:
     top_k_results: int = 3
     ARXIV_MAX_QUERY_LENGTH: int = 300
+    continue_on_failure: bool = False
     load_max_docs: int = 100
     load_all_available_meta: bool = False
     doc_content_chars_max: Optional[int] = 4000
@@ -225,6 +227,12 @@ class ArxivAPIWrapper(BaseModel):
             except (FileNotFoundError, fitz.fitz.FileDataError) as f_ex:
                 logger.debug(f_ex)
                 continue
+            except Exception as e:
+                if self.continue_on_failure:
+                    logger.error(e)
+                    continue
+                else:
+                    raise e
             if self.load_all_available_meta:
                 extra_metadata = {
                     "entry_id": result.entry_id,
diff --git a/libs/community/tests/integration_tests/document_loaders/test_arxiv.py b/libs/community/tests/integration_tests/document_loaders/test_arxiv.py
index 765290642a8..ceeb48d7507 100644
--- a/libs/community/tests/integration_tests/document_loaders/test_arxiv.py
+++ b/libs/community/tests/integration_tests/document_loaders/test_arxiv.py
@@ -1,10 +1,17 @@
-from typing import List
+import shutil
+from http.client import HTTPMessage
+from pathlib import Path
+from typing import List, Union
+from unittest.mock import patch
+from urllib.error import HTTPError
 
 import pytest
 from langchain_core.documents import Document
 
 from langchain_community.document_loaders.arxiv import ArxivLoader
 
+EXAMPLE_HELLO_PDF_PATH = Path(__file__).parents[1] / "examples" / "hello.pdf"
+
 
 def assert_docs(docs: List[Document]) -> None:
     for doc in docs:
@@ -57,6 +64,36 @@ def test_load_returns_full_set_of_metadata() -> None:
     assert len(set(doc.metadata)) > 4
 
 
+def test_skip_http_error() -> None:
+    """Test skipping unexpected Http 404 error of a single doc"""
+    tmp_hello_pdf_path = Path(__file__).parent / "hello.pdf"
+
+    def first_download_fails() -> Union[HTTPError, str]:
+        if not hasattr(first_download_fails, "firstCall"):
+            first_download_fails.__setattr__("firstCall", False)
+            raise HTTPError(
+                url="", code=404, msg="Not Found", hdrs=HTTPMessage(), fp=None
+            )
+        else:
+            # Return temporary example pdf path
+            shutil.copy(EXAMPLE_HELLO_PDF_PATH, tmp_hello_pdf_path)
+            return str(tmp_hello_pdf_path.absolute())
+
+    with patch("arxiv.Result.download_pdf") as mock_download_pdf:
+        # Set up the mock to raise HTTP 404 error
+        mock_download_pdf.side_effect = first_download_fails
+        # Load documents
+        loader = ArxivLoader(
+            query="ChatGPT",
+            load_max_docs=2,
+            load_all_available_meta=True,
+            continue_on_failure=True,
+        )
+        docs = loader.load()
+        # Only 1 of 2 documents should be loaded
+        assert len(docs) == 1
+
+
 @pytest.mark.skip(reason="test could be flaky")
 def test_load_issue_9046() -> None:
     """Test for the fixed issue 9046"""