mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-23 15:19:33 +00:00
ArxivLoader fix for issue 9046 (#9061)
Fixed #9046 Added ut-s for this fix. @eyurtsev
This commit is contained in:
parent
e94a5d753f
commit
fcbbddedae
@ -128,6 +128,8 @@ class ArxivAPIWrapper(BaseModel):
|
|||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# Remove the ":" and "-" from the query, as they can cause search problems
|
||||||
|
query = query.replace(":", "").replace("-", "")
|
||||||
results = self.arxiv_search( # type: ignore
|
results = self.arxiv_search( # type: ignore
|
||||||
query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.load_max_docs
|
query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.load_max_docs
|
||||||
).results()
|
).results()
|
||||||
@ -141,7 +143,7 @@ class ArxivAPIWrapper(BaseModel):
|
|||||||
doc_file_name: str = result.download_pdf()
|
doc_file_name: str = result.download_pdf()
|
||||||
with fitz.open(doc_file_name) as doc_file:
|
with fitz.open(doc_file_name) as doc_file:
|
||||||
text: str = "".join(page.get_text() for page in doc_file)
|
text: str = "".join(page.get_text() for page in doc_file)
|
||||||
except FileNotFoundError as f_ex:
|
except (FileNotFoundError, fitz.fitz.FileDataError) as f_ex:
|
||||||
logger.debug(f_ex)
|
logger.debug(f_ex)
|
||||||
continue
|
continue
|
||||||
if self.load_all_available_meta:
|
if self.load_all_available_meta:
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
from langchain.document_loaders.arxiv import ArxivLoader
|
from langchain.document_loaders.arxiv import ArxivLoader
|
||||||
from langchain.schema import Document
|
from langchain.schema import Document
|
||||||
|
|
||||||
@ -53,3 +55,29 @@ def test_load_returns_full_set_of_metadata() -> None:
|
|||||||
)
|
)
|
||||||
print(doc.metadata)
|
print(doc.metadata)
|
||||||
assert len(set(doc.metadata)) > 4
|
assert len(set(doc.metadata)) > 4
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skip(reason="test could be flaky")
|
||||||
|
def test_load_issue_9046() -> None:
|
||||||
|
"""Test for the fixed issue 9046"""
|
||||||
|
expected_docs = 3
|
||||||
|
|
||||||
|
# ":" character could not be an issue
|
||||||
|
loader = ArxivLoader(
|
||||||
|
query="MetaGPT: Meta Programming for Multi-Agent Collaborative Framework",
|
||||||
|
load_max_docs=expected_docs,
|
||||||
|
)
|
||||||
|
docs = loader.load()
|
||||||
|
|
||||||
|
assert_docs(docs)
|
||||||
|
assert "MetaGPT" in docs[0].metadata["Title"]
|
||||||
|
|
||||||
|
# "-" character could not be an issue
|
||||||
|
loader = ArxivLoader(
|
||||||
|
query="MetaGPT - Meta Programming for Multi-Agent Collaborative Framework",
|
||||||
|
load_max_docs=expected_docs,
|
||||||
|
)
|
||||||
|
docs = loader.load()
|
||||||
|
|
||||||
|
assert_docs(docs)
|
||||||
|
assert "MetaGPT" in docs[0].metadata["Title"]
|
||||||
|
@ -5,6 +5,7 @@ import pytest
|
|||||||
|
|
||||||
from langchain.agents.load_tools import load_tools
|
from langchain.agents.load_tools import load_tools
|
||||||
from langchain.schema import Document
|
from langchain.schema import Document
|
||||||
|
from langchain.tools import ArxivQueryRun
|
||||||
from langchain.tools.base import BaseTool
|
from langchain.tools.base import BaseTool
|
||||||
from langchain.utilities import ArxivAPIWrapper
|
from langchain.utilities import ArxivAPIWrapper
|
||||||
|
|
||||||
@ -81,7 +82,7 @@ def test_load_returns_unlimited_doc_content_chars() -> None:
|
|||||||
doc_content_chars_max = None
|
doc_content_chars_max = None
|
||||||
api_client = ArxivAPIWrapper(doc_content_chars_max=doc_content_chars_max)
|
api_client = ArxivAPIWrapper(doc_content_chars_max=doc_content_chars_max)
|
||||||
docs = api_client.load("1605.08386")
|
docs = api_client.load("1605.08386")
|
||||||
assert len(docs[0].page_content) == 54337
|
assert len(docs[0].page_content) == pytest.approx(54338, rel=1e-2)
|
||||||
|
|
||||||
|
|
||||||
def test_load_returns_full_set_of_metadata() -> None:
|
def test_load_returns_full_set_of_metadata() -> None:
|
||||||
@ -120,7 +121,7 @@ def test_load_arxiv_from_universal_entry_with_params() -> None:
|
|||||||
"load_all_available_meta": True,
|
"load_all_available_meta": True,
|
||||||
}
|
}
|
||||||
arxiv_tool = _load_arxiv_from_universal_entry(**params)
|
arxiv_tool = _load_arxiv_from_universal_entry(**params)
|
||||||
assert isinstance(arxiv_tool, ArxivAPIWrapper)
|
assert isinstance(arxiv_tool, ArxivQueryRun)
|
||||||
wp = arxiv_tool.api_wrapper
|
wp = arxiv_tool.api_wrapper
|
||||||
assert wp.top_k_results == 1, "failed to assert top_k_results"
|
assert wp.top_k_results == 1, "failed to assert top_k_results"
|
||||||
assert wp.load_max_docs == 10, "failed to assert load_max_docs"
|
assert wp.load_max_docs == 10, "failed to assert load_max_docs"
|
||||||
|
Loading…
Reference in New Issue
Block a user