community[minor]: Implement lazy_load() for ArxivLoader (#18664)

Integration tests: `tests/integration_tests/utilities/test_arxiv.py` and
`tests/integration_tests/document_loaders/test_arxiv.py`
This commit is contained in:
Christophe Bornet 2024-03-06 15:16:49 +01:00 committed by GitHub
parent 2d96803ddd
commit 1100f8de7a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 22 additions and 10 deletions

View File

@ -1,4 +1,4 @@
from typing import Any, List, Optional from typing import Any, Iterator, List, Optional
from langchain_core.documents import Document from langchain_core.documents import Document
@ -23,8 +23,8 @@ class ArxivLoader(BaseLoader):
doc_content_chars_max=doc_content_chars_max, **kwargs doc_content_chars_max=doc_content_chars_max, **kwargs
) )
def load(self) -> List[Document]: def lazy_load(self) -> Iterator[Document]:
return self.client.load(self.query) yield from self.client.lazy_load(self.query)
def get_summaries_as_docs(self) -> List[Document]: def get_summaries_as_docs(self) -> List[Document]:
return self.client.get_summaries_as_docs(self.query) return self.client.get_summaries_as_docs(self.query)

View File

@ -2,7 +2,7 @@
import logging import logging
import os import os
import re import re
from typing import Any, Dict, List, Optional from typing import Any, Dict, Iterator, List, Optional
from langchain_core.documents import Document from langchain_core.documents import Document
from langchain_core.pydantic_v1 import BaseModel, root_validator from langchain_core.pydantic_v1 import BaseModel, root_validator
@ -177,7 +177,22 @@ class ArxivAPIWrapper(BaseModel):
Args: Args:
query: a plaintext search query query: a plaintext search query
""" # noqa: E501 """
return list(self.lazy_load(query))
def lazy_load(self, query: str) -> Iterator[Document]:
"""
Run Arxiv search and get the article texts plus the article meta information.
See https://lukasschwab.me/arxiv.py/index.html#Search
Yields: documents with the document.page_content in text format
Performs an arxiv search, downloads the top k results as PDFs, loads
them as Documents, and yields them one at a time.
Args:
query: a plaintext search query
"""
try: try:
import fitz import fitz
except ImportError: except ImportError:
@ -200,9 +215,8 @@ class ArxivAPIWrapper(BaseModel):
).results() ).results()
except self.arxiv_exceptions as ex: except self.arxiv_exceptions as ex:
logger.debug("Error on arxiv: %s", ex) logger.debug("Error on arxiv: %s", ex)
return [] return
docs: List[Document] = []
for result in results: for result in results:
try: try:
doc_file_name: str = result.download_pdf() doc_file_name: str = result.download_pdf()
@ -231,9 +245,7 @@ class ArxivAPIWrapper(BaseModel):
"Summary": result.summary, "Summary": result.summary,
**extra_metadata, **extra_metadata,
} }
doc = Document( yield Document(
page_content=text[: self.doc_content_chars_max], metadata=metadata page_content=text[: self.doc_content_chars_max], metadata=metadata
) )
docs.append(doc)
os.remove(doc_file_name) os.remove(doc_file_name)
return docs