Mirror of https://github.com/hwchase17/langchain.git, synced 2025-07-06 21:20:33 +00:00
Added cleaning up the downloaded PDF files (#4601)
ArxivAPIWrapper searches arXiv and downloads PDFs to extract the related information, but it never deletes the downloaded files, so they pile up on the server. A single PDF can be around 28 MB, which is too much to keep around, so this change deletes each file once it has been read.

# Clean up downloaded PDF files

- Changes: added a line that deletes the downloaded PDF after its text has been extracted.
- Background: to get the information from an arXiv paper, the ArxivAPIWrapper class downloads its PDF. That is a natural approach, but the wrapper leaves every downloaded PDF on the server.
- Problem: a single PDF is about 28 MB, which is too much to retain on a small server such as an AWS instance.
- Dependency: `import os`

Thank you.

---------

Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>
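To make the behavior concrete, here is a minimal standalone sketch of the download-read-delete pattern this change introduces. It assumes the `arxiv` and `pymupdf` (fitz) packages that ArxivAPIWrapper relies on; the query string and variable names are illustrative only, and the `try/finally` is just a defensive touch in the sketch — the commit itself calls `os.remove` after the Document has been built.

```python
import os

import arxiv  # the client library ArxivAPIWrapper wraps (assumed installed)
import fitz  # PyMuPDF, used to read the downloaded PDF (assumed installed)

# Fetch one search result and download its PDF to the working directory.
result = next(arxiv.Search(query="large language models", max_results=1).results())
doc_file_name = result.download_pdf()

try:
    # Extract the text page by page, as the wrapper does.
    with fitz.open(doc_file_name) as doc_file:
        text = "".join(page.get_text() for page in doc_file)
finally:
    # The fix: remove the downloaded PDF so it does not accumulate on disk.
    os.remove(doc_file_name)

print(text[:200])
```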
This commit is contained in:
parent 6fbd5e837f
commit e90654f39b
@@ -1,5 +1,6 @@
 """Util that calls Arxiv."""
 import logging
+import os
 from typing import Any, Dict, List
 
 from pydantic import BaseModel, Extra, root_validator
@@ -71,21 +72,21 @@ class ArxivAPIWrapper(BaseModel):
         It uses only the most informative fields of article meta information.
         """
         try:
-            docs = [
-                f"Published: {result.updated.date()}\nTitle: {result.title}\n"
-                f"Authors: {', '.join(a.name for a in result.authors)}\n"
-                f"Summary: {result.summary}"
-                for result in self.arxiv_search(  # type: ignore
-                    query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.top_k_results
-                ).results()
-            ]
-            return (
-                "\n\n".join(docs)[: self.doc_content_chars_max]
-                if docs
-                else "No good Arxiv Result was found"
-            )
+            results = self.arxiv_search(  # type: ignore
+                query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.top_k_results
+            ).results()
         except self.arxiv_exceptions as ex:
             return f"Arxiv exception: {ex}"
+        docs = [
+            f"Published: {result.updated.date()}\nTitle: {result.title}\n"
+            f"Authors: {', '.join(a.name for a in result.authors)}\n"
+            f"Summary: {result.summary}"
+            for result in results
+        ]
+        if docs:
+            return "\n\n".join(docs)[: self.doc_content_chars_max]
+        else:
+            return "No good Arxiv Result was found"
 
     def load(self, query: str) -> List[Document]:
         """
@@ -98,52 +99,51 @@ class ArxivAPIWrapper(BaseModel):
         try:
             import fitz
         except ImportError:
-            raise ValueError(
+            raise ImportError(
                 "PyMuPDF package not found, please install it with "
                 "`pip install pymupdf`"
             )
 
         try:
-            docs: List[Document] = []
-            for result in self.arxiv_search(  # type: ignore
-                query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.load_max_docs
-            ).results():
-                try:
-                    doc_file_name: str = result.download_pdf()
-                    with fitz.open(doc_file_name) as doc_file:
-                        text: str = "".join(page.get_text() for page in doc_file)
-                    add_meta = (
-                        {
-                            "entry_id": result.entry_id,
-                            "published_first_time": str(result.published.date()),
-                            "comment": result.comment,
-                            "journal_ref": result.journal_ref,
-                            "doi": result.doi,
-                            "primary_category": result.primary_category,
-                            "categories": result.categories,
-                            "links": [link.href for link in result.links],
-                        }
-                        if self.load_all_available_meta
-                        else {}
-                    )
-                    doc = Document(
-                        page_content=text[: self.doc_content_chars_max],
-                        metadata=(
-                            {
-                                "Published": str(result.updated.date()),
-                                "Title": result.title,
-                                "Authors": ", ".join(
-                                    a.name for a in result.authors
-                                ),
-                                "Summary": result.summary,
-                                **add_meta,
-                            }
-                        ),
-                    )
-                    docs.append(doc)
-                except FileNotFoundError as f_ex:
-                    logger.debug(f_ex)
-            return docs
+            results = self.arxiv_search(  # type: ignore
+                query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.load_max_docs
+            ).results()
         except self.arxiv_exceptions as ex:
             logger.debug("Error on arxiv: %s", ex)
             return []
+        docs: List[Document] = []
+        for result in results:
+            try:
+                doc_file_name: str = result.download_pdf()
+                with fitz.open(doc_file_name) as doc_file:
+                    text: str = "".join(page.get_text() for page in doc_file)
+            except FileNotFoundError as f_ex:
+                logger.debug(f_ex)
+                continue
+            if self.load_all_available_meta:
+                extra_metadata = {
+                    "entry_id": result.entry_id,
+                    "published_first_time": str(result.published.date()),
+                    "comment": result.comment,
+                    "journal_ref": result.journal_ref,
+                    "doi": result.doi,
+                    "primary_category": result.primary_category,
+                    "categories": result.categories,
+                    "links": [link.href for link in result.links],
+                }
+            else:
+                extra_metadata = {}
+            metadata = {
+                "Published": str(result.updated.date()),
+                "Title": result.title,
+                "Authors": ", ".join(a.name for a in result.authors),
+                "Summary": result.summary,
+                **extra_metadata,
+            }
+            doc = Document(
+                page_content=text[: self.doc_content_chars_max], metadata=metadata
+            )
+            docs.append(doc)
+            os.remove(doc_file_name)
+        return docs
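For reference, a small usage sketch of the wrapper after this change; the import path and constructor parameters are assumed from the langchain layout at the time of this commit and may differ in later releases.

```python
from langchain.utilities import ArxivAPIWrapper  # assumed import path for this era of langchain

wrapper = ArxivAPIWrapper(top_k_results=2, load_max_docs=2)

# run() returns a formatted string with the top matching papers' metadata.
print(wrapper.run("PDF text extraction"))

# load() downloads each PDF, builds Documents from the extracted text,
# and, with this change, removes the PDFs from disk afterwards.
docs = wrapper.load("PDF text extraction")
for doc in docs:
    print(doc.metadata["Title"])
```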