From 1100f8de7aa38bc0af71e7e729fae18a95b9f891 Mon Sep 17 00:00:00 2001 From: Christophe Bornet Date: Wed, 6 Mar 2024 15:16:49 +0100 Subject: [PATCH] community[minor]: Implement lazy_load() for ArxivLoader (#18664) Integration tests: `tests/integration_tests/utilities/test_arxiv.py` and `tests/integration_tests/document_loaders/test_arxiv.py` --- .../document_loaders/arxiv.py | 6 ++--- .../langchain_community/utilities/arxiv.py | 26 ++++++++++++++----- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/arxiv.py b/libs/community/langchain_community/document_loaders/arxiv.py index 811d2b605ff..2f26d155201 100644 --- a/libs/community/langchain_community/document_loaders/arxiv.py +++ b/libs/community/langchain_community/document_loaders/arxiv.py @@ -1,4 +1,4 @@ -from typing import Any, List, Optional +from typing import Any, Iterator, List, Optional from langchain_core.documents import Document @@ -23,8 +23,8 @@ class ArxivLoader(BaseLoader): doc_content_chars_max=doc_content_chars_max, **kwargs ) - def load(self) -> List[Document]: - return self.client.load(self.query) + def lazy_load(self) -> Iterator[Document]: + yield from self.client.lazy_load(self.query) def get_summaries_as_docs(self) -> List[Document]: return self.client.get_summaries_as_docs(self.query) diff --git a/libs/community/langchain_community/utilities/arxiv.py b/libs/community/langchain_community/utilities/arxiv.py index 832332266de..383df6396e1 100644 --- a/libs/community/langchain_community/utilities/arxiv.py +++ b/libs/community/langchain_community/utilities/arxiv.py @@ -2,7 +2,7 @@ import logging import os import re -from typing import Any, Dict, List, Optional +from typing import Any, Dict, Iterator, List, Optional from langchain_core.documents import Document from langchain_core.pydantic_v1 import BaseModel, root_validator @@ -177,7 +177,22 @@ class ArxivAPIWrapper(BaseModel): Args: query: a plaintext search query - """ # noqa: E501 + """ + return list(self.lazy_load(query)) + + def lazy_load(self, query: str) -> Iterator[Document]: + """ + Run Arxiv search and get the article texts plus the article meta information. + See https://lukasschwab.me/arxiv.py/index.html#Search + + Returns: documents with the document.page_content in text format + + Performs an arxiv search, downloads the top k results as PDFs, loads + them as Documents, and returns them. + + Args: + query: a plaintext search query + """ try: import fitz except ImportError: @@ -200,9 +215,8 @@ class ArxivAPIWrapper(BaseModel): ).results() except self.arxiv_exceptions as ex: logger.debug("Error on arxiv: %s", ex) - return [] + return - docs: List[Document] = [] for result in results: try: doc_file_name: str = result.download_pdf() @@ -231,9 +245,7 @@ class ArxivAPIWrapper(BaseModel): "Summary": result.summary, **extra_metadata, } - doc = Document( + yield Document( page_content=text[: self.doc_content_chars_max], metadata=metadata ) - docs.append(doc) os.remove(doc_file_name) - return docs