community[minor]: Implement lazy_load() for ArxivLoader (#18664)
Integration tests: `tests/integration_tests/utilities/test_arxiv.py` and `tests/integration_tests/document_loaders/test_arxiv.py`
Parent: 2d96803ddd
Commit: 1100f8de7a
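For context, a minimal usage sketch of the new behavior (the query string and `load_max_docs` value below are arbitrary examples, not part of this commit):

```python
from langchain_community.document_loaders import ArxivLoader

# Arbitrary example query; any arXiv search string works the same way.
loader = ArxivLoader(query="quantum computing", load_max_docs=2)

# Eager path: every matching PDF is downloaded and parsed before the list returns.
docs = loader.load()

# Lazy path (what this commit adds): each Document is yielded as soon as its
# PDF has been downloaded and parsed, so iteration can stop early.
for doc in loader.lazy_load():
    print(doc.metadata["Title"])
    break  # remaining results are never downloaded
```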
Changes to `ArxivLoader`:

```diff
@@ -1,4 +1,4 @@
-from typing import Any, List, Optional
+from typing import Any, Iterator, List, Optional
 
 from langchain_core.documents import Document
 
@@ -23,8 +23,8 @@ class ArxivLoader(BaseLoader):
             doc_content_chars_max=doc_content_chars_max, **kwargs
         )
 
-    def load(self) -> List[Document]:
-        return self.client.load(self.query)
+    def lazy_load(self) -> Iterator[Document]:
+        yield from self.client.lazy_load(self.query)
 
     def get_summaries_as_docs(self) -> List[Document]:
         return self.client.get_summaries_as_docs(self.query)
```
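Dropping the explicit `load()` override here is safe because `langchain_core`'s `BaseLoader` already derives an eager `load()` from `lazy_load()`; roughly the pattern below (a simplified sketch, not the actual `BaseLoader` source):

```python
from typing import Iterator, List

from langchain_core.documents import Document


class BaseLoaderSketch:
    """Simplified sketch of the contract ArxivLoader now relies on."""

    def lazy_load(self) -> Iterator[Document]:
        # Subclasses such as ArxivLoader implement only this generator.
        raise NotImplementedError

    def load(self) -> List[Document]:
        # The eager variant just drains the generator into a list.
        return list(self.lazy_load())
```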
Changes to `ArxivAPIWrapper`:

```diff
@@ -2,7 +2,7 @@
 import logging
 import os
 import re
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Iterator, List, Optional
 
 from langchain_core.documents import Document
 from langchain_core.pydantic_v1 import BaseModel, root_validator
@@ -177,7 +177,22 @@ class ArxivAPIWrapper(BaseModel):
 
         Args:
             query: a plaintext search query
-        """  # noqa: E501
+        """
+        return list(self.lazy_load(query))
+
+    def lazy_load(self, query: str) -> Iterator[Document]:
+        """
+        Run Arxiv search and get the article texts plus the article meta information.
+        See https://lukasschwab.me/arxiv.py/index.html#Search
+
+        Returns: documents with the document.page_content in text format
+
+        Performs an arxiv search, downloads the top k results as PDFs, loads
+        them as Documents, and returns them.
+
+        Args:
+            query: a plaintext search query
+        """
         try:
             import fitz
         except ImportError:
@@ -200,9 +215,8 @@ class ArxivAPIWrapper(BaseModel):
             ).results()
         except self.arxiv_exceptions as ex:
             logger.debug("Error on arxiv: %s", ex)
-            return []
+            return
 
-        docs: List[Document] = []
         for result in results:
             try:
                 doc_file_name: str = result.download_pdf()
@@ -231,9 +245,7 @@ class ArxivAPIWrapper(BaseModel):
                 "Summary": result.summary,
                 **extra_metadata,
             }
-            doc = Document(
+            yield Document(
                 page_content=text[: self.doc_content_chars_max], metadata=metadata
             )
-            docs.append(doc)
             os.remove(doc_file_name)
-        return docs
```
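The wrapper change is the usual accumulate-to-generator refactor: the loop that used to append to `docs` now yields each `Document` directly, the error path becomes a bare `return`, and the eager `load()` survives as a thin wrapper. A stripped-down, self-contained sketch of that pattern (the `fetch_results` and `parse_pdf` helpers are hypothetical stand-ins for the arXiv client and PyMuPDF calls):

```python
from typing import Iterator, List


def fetch_results(query: str) -> List[str]:
    # Hypothetical stand-in for arxiv.Search(...).results().
    return [f"{query} result {i}" for i in range(3)]


def parse_pdf(result: str) -> str:
    # Hypothetical stand-in for downloading the PDF and extracting its text.
    return result.upper()


def lazy_load(query: str) -> Iterator[str]:
    """Yield parsed texts one at a time (generator version of the old loop)."""
    try:
        results = fetch_results(query)
    except Exception:
        return  # a bare return ends the generator, replacing the old `return []`

    for result in results:
        yield parse_pdf(result)  # emitted immediately instead of appended to `docs`


def load(query: str) -> List[str]:
    # The eager method just drains the generator, mirroring list(self.lazy_load(query)).
    return list(lazy_load(query))


if __name__ == "__main__":
    print(load("demo"))                    # parses all three results
    print(next(iter(lazy_load("demo"))))   # parses only the first result
```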