diff --git a/libs/langchain/langchain/utilities/arxiv.py b/libs/langchain/langchain/utilities/arxiv.py index 246ed890453..bd4aa2f1528 100644 --- a/libs/langchain/langchain/utilities/arxiv.py +++ b/libs/langchain/langchain/utilities/arxiv.py @@ -128,6 +128,8 @@ class ArxivAPIWrapper(BaseModel): ) try: + # Remove the ":" and "-" from the query, as they can cause search problems + query = query.replace(":", "").replace("-", "") results = self.arxiv_search( # type: ignore query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.load_max_docs ).results() @@ -141,7 +143,7 @@ class ArxivAPIWrapper(BaseModel): doc_file_name: str = result.download_pdf() with fitz.open(doc_file_name) as doc_file: text: str = "".join(page.get_text() for page in doc_file) - except FileNotFoundError as f_ex: + except (FileNotFoundError, fitz.fitz.FileDataError) as f_ex: logger.debug(f_ex) continue if self.load_all_available_meta: diff --git a/libs/langchain/tests/integration_tests/document_loaders/test_arxiv.py b/libs/langchain/tests/integration_tests/document_loaders/test_arxiv.py index 60315e52b81..fbd5cf45fd8 100644 --- a/libs/langchain/tests/integration_tests/document_loaders/test_arxiv.py +++ b/libs/langchain/tests/integration_tests/document_loaders/test_arxiv.py @@ -1,5 +1,7 @@ from typing import List +import pytest + from langchain.document_loaders.arxiv import ArxivLoader from langchain.schema import Document @@ -53,3 +55,29 @@ def test_load_returns_full_set_of_metadata() -> None: ) print(doc.metadata) assert len(set(doc.metadata)) > 4 + + +@pytest.mark.skip(reason="test could be flaky") +def test_load_issue_9046() -> None: + """Test for the fixed issue 9046""" + expected_docs = 3 + + # ":" character could not be an issue + loader = ArxivLoader( + query="MetaGPT: Meta Programming for Multi-Agent Collaborative Framework", + load_max_docs=expected_docs, + ) + docs = loader.load() + + assert_docs(docs) + assert "MetaGPT" in docs[0].metadata["Title"] + + # "-" character could not be an issue + loader = ArxivLoader( + query="MetaGPT - Meta Programming for Multi-Agent Collaborative Framework", + load_max_docs=expected_docs, + ) + docs = loader.load() + + assert_docs(docs) + assert "MetaGPT" in docs[0].metadata["Title"] diff --git a/libs/langchain/tests/integration_tests/utilities/test_arxiv.py b/libs/langchain/tests/integration_tests/utilities/test_arxiv.py index 0fc2fd6deba..17dd1337496 100644 --- a/libs/langchain/tests/integration_tests/utilities/test_arxiv.py +++ b/libs/langchain/tests/integration_tests/utilities/test_arxiv.py @@ -5,6 +5,7 @@ import pytest from langchain.agents.load_tools import load_tools from langchain.schema import Document +from langchain.tools import ArxivQueryRun from langchain.tools.base import BaseTool from langchain.utilities import ArxivAPIWrapper @@ -81,7 +82,7 @@ def test_load_returns_unlimited_doc_content_chars() -> None: doc_content_chars_max = None api_client = ArxivAPIWrapper(doc_content_chars_max=doc_content_chars_max) docs = api_client.load("1605.08386") - assert len(docs[0].page_content) == 54337 + assert len(docs[0].page_content) == pytest.approx(54338, rel=1e-2) def test_load_returns_full_set_of_metadata() -> None: @@ -120,7 +121,7 @@ def test_load_arxiv_from_universal_entry_with_params() -> None: "load_all_available_meta": True, } arxiv_tool = _load_arxiv_from_universal_entry(**params) - assert isinstance(arxiv_tool, ArxivAPIWrapper) + assert isinstance(arxiv_tool, ArxivQueryRun) wp = arxiv_tool.api_wrapper assert wp.top_k_results == 1, "failed to assert top_k_results" assert wp.load_max_docs == 10, "failed to assert load_max_docs"