community: refactor Arxiv search logic (#27084)

PR message: Description: This PR refactors the Arxiv API wrapper by extracting the Arxiv search logic into a helper function (_fetch_results) to reduce code duplication and improve maintainability. The helper function is used in methods like get_summaries_as_docs, run, and lazy_load, streamlining the code and making it easier to maintain in the future. Issue: This is a minor refactor, so no specific issue is being fixed. Dependencies: No new dependencies are introduced with this change. Add tests and docs: No new integrations were added, so no additional tests or docs are necessary for this PR. Lint and test: I have run make format, make lint, and make test to ensure all checks pass successfully. --------- Co-authored-by: Erick Friis <erick@langchain.dev>
2025-09-06 05:25:04 +00:00 · 2024-10-15 18:43:03 +03:00
parent 57fbc6bdf1
commit 443b37403d
1 changed files with 21 additions and 27 deletions
--- a/libs/community/langchain_community/utilities/arxiv.py
+++ b/libs/community/langchain_community/utilities/arxiv.py
@@ -94,6 +94,16 @@ class ArxivAPIWrapper(BaseModel):
            )
        return values
    def _fetch_results(self, query: str, max_results: "int | None" = None) -> Any:
        """Run an arxiv search for ``query`` and return its results.

        If ``self.is_arxiv_identifier(query)`` is true, the query is split on
        whitespace and passed to ``self.arxiv_search`` as ``id_list``.
        Otherwise the query is treated as free text and truncated to
        ``self.ARXIV_MAX_QUERY_LENGTH`` characters before being searched.

        Args:
            query: Whitespace-separated arxiv identifiers, or a plaintext
                search query.
            max_results: Upper bound forwarded to ``self.arxiv_search``.
                ``None`` (the default) falls back to ``self.top_k_results``,
                so existing one-argument calls behave as before. Callers that
                use a different cap (for example a document-loading limit)
                can pass it explicitly.

        Returns:
            Whatever ``self.arxiv_search(...).results()`` returns.

        Exceptions raised by the search are not handled here; they propagate
        to the caller.
        """
        # The annotation above is quoted so that no additional typing import
        # is required by this method.
        limit = self.top_k_results if max_results is None else max_results
        if self.is_arxiv_identifier(query):
            return self.arxiv_search(id_list=query.split(), max_results=limit).results()
        return self.arxiv_search(
            query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=limit
        ).results()
    def get_summaries_as_docs(self, query: str) -> List[Document]:
        """
        Performs an arxiv search and returns list of
@@ -107,16 +117,11 @@ class ArxivAPIWrapper(BaseModel):
            query: a plaintext search query
        """
        try:
-            if self.is_arxiv_identifier(query):
+            results = self._fetch_results(
-                results = self.arxiv_search(
+                query
-                    id_list=query.split(),
+            )  # Using helper function to fetch results
                    max_results=self.top_k_results,
                ).results()
            else:
                results = self.arxiv_search(  # type: ignore
                    query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.top_k_results
                ).results()
        except self.arxiv_exceptions as ex:
            logger.error(f"Arxiv exception: {ex}")  # Added error logging
            return [Document(page_content=f"Arxiv exception: {ex}")]
        docs = [
            Document(
@@ -146,16 +151,11 @@ class ArxivAPIWrapper(BaseModel):
            query: a plaintext search query
        """
        try:
-            if self.is_arxiv_identifier(query):
+            results = self._fetch_results(
-                results = self.arxiv_search(
+                query
-                    id_list=query.split(),
+            )  # Using helper function to fetch results
                    max_results=self.top_k_results,
                ).results()
            else:
                results = self.arxiv_search(  # type: ignore
                    query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.top_k_results
                ).results()
        except self.arxiv_exceptions as ex:
            logger.error(f"Arxiv exception: {ex}")  # Added error logging
            return f"Arxiv exception: {ex}"
        docs = [
            f"Published: {result.updated.date()}\n"
@@ -208,15 +208,9 @@ class ArxivAPIWrapper(BaseModel):
        try:
            # Remove the ":" and "-" from the query, as they can cause search problems
            query = query.replace(":", "").replace("-", "")
-            if self.is_arxiv_identifier(query):
+            results = self._fetch_results(
-                results = self.arxiv_search(
+                query
-                    id_list=query[: self.ARXIV_MAX_QUERY_LENGTH].split(),
+            )  # Using helper function to fetch results
                    max_results=self.load_max_docs,
                ).results()
            else:
                results = self.arxiv_search(  # type: ignore
                    query[: self.ARXIV_MAX_QUERY_LENGTH], max_results=self.load_max_docs
                ).results()
        except self.arxiv_exceptions as ex:
            logger.debug("Error on arxiv: %s", ex)
            return