diff --git a/docs/docs_skeleton/docs/integrations/tools/google_scholar.ipynb b/docs/docs_skeleton/docs/integrations/tools/google_scholar.ipynb new file mode 100644 index 00000000000..5f132b35c63 --- /dev/null +++ b/docs/docs_skeleton/docs/integrations/tools/google_scholar.ipynb @@ -0,0 +1,102 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Google Scholar\n", + "\n", + "This notebook goes through how to use Google Scholar Tool" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: google-search-results in /home/mohtashimkhan/mambaforge/envs/langchain/lib/python3.9/site-packages (2.4.2)\n", + "Requirement already satisfied: requests in /home/mohtashimkhan/mambaforge/envs/langchain/lib/python3.9/site-packages (from google-search-results) (2.31.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /home/mohtashimkhan/mambaforge/envs/langchain/lib/python3.9/site-packages (from requests->google-search-results) (3.3.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /home/mohtashimkhan/mambaforge/envs/langchain/lib/python3.9/site-packages (from requests->google-search-results) (3.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/mohtashimkhan/mambaforge/envs/langchain/lib/python3.9/site-packages (from requests->google-search-results) (1.26.17)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /home/mohtashimkhan/mambaforge/envs/langchain/lib/python3.9/site-packages (from requests->google-search-results) (2023.5.7)\n" + ] + } + ], + "source": [ + "!pip install google-search-results" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.tools.google_scholar import GoogleScholarQueryRun\n", + "from langchain.utilities.google_scholar import GoogleScholarAPIWrapper\n", + "import os" + 
] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Title: Large language models (LLM) and ChatGPT: what will the impact on nuclear medicine be?\\nAuthors: IL Alberts,K Shi\\nSummary: IL Alberts, L Mercolli, T Pyka, G Prenosil, K Shi… - European journal of …, 2023 - Springer\\nTotal-Citations: 28\\n\\nTitle: Dynamic Planning with a LLM\\nAuthors: G Dagan,F Keller,A Lascarides\\nSummary: G Dagan, F Keller, A Lascarides - arXiv preprint arXiv:2308.06391, 2023 - arxiv.org\\nTotal-Citations: 3\\n\\nTitle: Openagi: When llm meets domain experts\\nAuthors: Y Ge,W Hua,J Ji,J Tan,S Xu,Y Zhang\\nSummary: Y Ge, W Hua, J Ji, J Tan, S Xu, Y Zhang - arXiv preprint arXiv:2304.04370, 2023 - arxiv.org\\nTotal-Citations: 19\\n\\nTitle: Llm-planner: Few-shot grounded planning for embodied agents with large language models\\nAuthors: CH Song\\nSummary: CH Song, J Wu, C Washington… - Proceedings of the …, 2023 - openaccess.thecvf.com\\nTotal-Citations: 28\\n\\nTitle: The science of detecting llm-generated texts\\nAuthors: R Tang,YN Chuang,X Hu\\nSummary: R Tang, YN Chuang, X Hu - arXiv preprint arXiv:2303.07205, 2023 - arxiv.org\\nTotal-Citations: 23\\n\\nTitle: X-llm: Bootstrapping advanced large language models by treating multi-modalities as foreign languages\\nAuthors: F Chen,M Han,J Shi\\nSummary: F Chen, M Han, H Zhao, Q Zhang, J Shi, S Xu… - arXiv preprint arXiv …, 2023 - arxiv.org\\nTotal-Citations: 12\\n\\nTitle: 3d-llm: Injecting the 3d world into large language models\\nAuthors: Y Hong,H Zhen,P Chen,S Zheng,Y Du\\nSummary: Y Hong, H Zhen, P Chen, S Zheng, Y Du… - arXiv preprint arXiv …, 2023 - arxiv.org\\nTotal-Citations: 4\\n\\nTitle: The internal state of an llm knows when its lying\\nAuthors: A Azaria,T Mitchell\\nSummary: A Azaria, T Mitchell - arXiv preprint arXiv:2304.13734, 2023 - arxiv.org\\nTotal-Citations: 18\\n\\nTitle: LLM-Pruner: On the Structural Pruning of Large Language 
Models\\nAuthors: X Ma,G Fang,X Wang\\nSummary: X Ma, G Fang, X Wang - arXiv preprint arXiv:2305.11627, 2023 - arxiv.org\\nTotal-Citations: 15\\n\\nTitle: Large language models are few-shot testers: Exploring llm-based general bug reproduction\\nAuthors: S Kang,J Yoon,S Yoo\\nSummary: S Kang, J Yoon, S Yoo - 2023 IEEE/ACM 45th International …, 2023 - ieeexplore.ieee.org\\nTotal-Citations: 17'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "os.environ[\"SERP_API_KEY\"] = \"\"\n", + "tool = GoogleScholarQueryRun(api_wrapper=GoogleScholarAPIWrapper())\n", + "tool.run(\"LLM Models\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.9.16 ('langchain')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "15e58ce194949b77a891bd4339ce3d86a9bd138e905926019517993f97db9e6c" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/libs/langchain/langchain/agents/load_tools.py b/libs/langchain/langchain/agents/load_tools.py index ed1eca41c78..85e690b621a 100644 --- a/libs/langchain/langchain/agents/load_tools.py +++ b/libs/langchain/langchain/agents/load_tools.py @@ -34,6 +34,7 @@ from langchain.tools.base import BaseTool from langchain.tools.bing_search.tool import BingSearchRun from langchain.tools.ddg_search.tool import DuckDuckGoSearchRun from langchain.tools.google_search.tool import GoogleSearchResults, GoogleSearchRun +from langchain.tools.google_scholar.tool import GoogleScholarQueryRun from langchain.tools.metaphor_search.tool import 
MetaphorSearchResults from langchain.tools.google_serper.tool import GoogleSerperResults, GoogleSerperRun from langchain.tools.searchapi.tool import SearchAPIResults, SearchAPIRun @@ -64,6 +65,7 @@ from langchain.utilities.bing_search import BingSearchAPIWrapper from langchain.utilities.duckduckgo_search import DuckDuckGoSearchAPIWrapper from langchain.utilities.google_search import GoogleSearchAPIWrapper from langchain.utilities.google_serper import GoogleSerperAPIWrapper +from langchain.utilities.google_scholar import GoogleScholarAPIWrapper from langchain.utilities.metaphor_search import MetaphorSearchAPIWrapper from langchain.utilities.awslambda import LambdaWrapper from langchain.utilities.graphql import GraphQLAPIWrapper @@ -222,6 +224,10 @@ def _get_google_serper(**kwargs: Any) -> BaseTool: return GoogleSerperRun(api_wrapper=GoogleSerperAPIWrapper(**kwargs)) +def _get_google_scholar(**kwargs: Any) -> BaseTool: + return GoogleScholarQueryRun(api_wrapper=GoogleScholarAPIWrapper(**kwargs)) + + def _get_google_serper_results_json(**kwargs: Any) -> BaseTool: return GoogleSerperResults(api_wrapper=GoogleSerperAPIWrapper(**kwargs)) @@ -337,6 +343,10 @@ _EXTRA_OPTIONAL_TOOLS: Dict[str, Tuple[Callable[[KwArg(Any)], BaseTool], List[st "metaphor-search": (_get_metaphor_search, ["metaphor_api_key"]), "ddg-search": (_get_ddg_search, []), "google-serper": (_get_google_serper, ["serper_api_key", "aiosession"]), + "google-scholar": ( + _get_google_scholar, + ["top_k_results", "hl", "lr", "serp_api_key"], + ), "google-serper-results-json": ( _get_google_serper_results_json, ["serper_api_key", "aiosession"], diff --git a/libs/langchain/langchain/tools/google_scholar/__init__.py b/libs/langchain/langchain/tools/google_scholar/__init__.py new file mode 100644 index 00000000000..10f633aac6f --- /dev/null +++ b/libs/langchain/langchain/tools/google_scholar/__init__.py @@ -0,0 +1,5 @@ +"""Google Scholar API Toolkit.""" + +from langchain.tools.google_scholar.tool import 
"""Tool for the Google Scholar API."""

from typing import Optional

from langchain.callbacks.manager import CallbackManagerForToolRun
from langchain.tools.base import BaseTool
from langchain.utilities.google_scholar import GoogleScholarAPIWrapper


class GoogleScholarQueryRun(BaseTool):
    """Tool that queries Google Scholar (via the SerpApi wrapper).

    Delegates the actual search to ``GoogleScholarAPIWrapper.run`` and
    returns its formatted string result.
    """

    name: str = "google_scholar"
    # NOTE: adjacent string literals are concatenated with no separator, so
    # each fragment must end with a space or the description runs words
    # together (the original produced "aboutresearch ... ScholarInput").
    description: str = (
        "A wrapper around Google Scholar Search. "
        "Useful for when you need to get information about "
        "research papers from Google Scholar. "
        "Input should be a search query."
    )
    # Wrapper holding the SerpApi credentials and search parameters.
    api_wrapper: GoogleScholarAPIWrapper

    def _run(
        self,
        query: str,
        run_manager: Optional[CallbackManagerForToolRun] = None,
    ) -> str:
        """Run *query* through Google Scholar and return formatted results."""
        return self.api_wrapper.run(query)
"""Util that calls Google Scholar Search."""
from typing import Dict, Optional

from langchain.pydantic_v1 import BaseModel, Extra, root_validator
from langchain.utils import get_from_dict_or_env


class GoogleScholarAPIWrapper(BaseModel):
    """Wrapper for Google Scholar API.

    You can create serpapi key by signing up at: https://serpapi.com/users/sign_up.

    The wrapper uses the serpapi python package:
    https://serpapi.com/integrations/python#search-google-scholar

    To use, you should have the environment variable ``SERP_API_KEY``
    set with your API key, or pass `serp_api_key` as a named parameter
    to the constructor.

    Attributes:
        top_k_results: number of results to return from google-scholar query search.
            By default it returns top 10 results.
        hl: attribute defines the language to use for the Google Scholar search.
            It's a two-letter language code.
            (e.g., en for English, es for Spanish, or fr for French). Head to the
            Google languages page for a full list of supported Google languages:
            https://serpapi.com/google-languages

        lr: attribute defines one or multiple languages to limit the search to.
            It uses lang_{two-letter language code} to specify languages
            and | as a delimiter. (e.g., lang_fr|lang_de will only search French
            and German pages). Head to the Google lr languages for a full
            list of supported languages: https://serpapi.com/google-lr-languages

    Example:
        .. code-block:: python

            from langchain.utilities import GoogleScholarAPIWrapper
            google_scholar = GoogleScholarAPIWrapper()
            google_scholar.run('langchain')
    """

    top_k_results: int = 10
    hl: str = "en"
    lr: str = "lang_en"
    serp_api_key: Optional[str] = None

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that api key and python package exists in environment."""
        serp_api_key = get_from_dict_or_env(values, "serp_api_key", "SERP_API_KEY")
        values["SERP_API_KEY"] = serp_api_key

        try:
            from serpapi import GoogleScholarSearch

        except ImportError:
            raise ImportError(
                "google-search-results is not installed. "
                "Please install it with `pip install google-search-results"
                ">=2.4.2`"
            )
        # The serpapi client reads the key from a class attribute; stash the
        # class itself on values so `run` can instantiate searches lazily.
        GoogleScholarSearch.SERP_API_KEY = serp_api_key
        values["google_scholar_engine"] = GoogleScholarSearch

        return values

    def run(self, query: str) -> str:
        """Run query through Google Scholar and parse results.

        Pages through SerpApi results (max 20 per request) until
        ``top_k_results`` results are collected or the API returns an
        empty page, then formats each result as a Title/Authors/Summary/
        Total-Citations block.

        NOTE(fix): the original paging loop stepped ``page`` by 20 but
        guarded the remainder fetch with ``page > 20``; after a single
        iteration ``page == 20``, so for 20 < top_k_results < 40 the
        remainder (top_k_results % 20 results) was never fetched. Driving
        the loop off the number of results collected handles every
        ``top_k_results`` value uniformly.
        """
        total_results: list = []
        page = 0  # SerpApi 'start' offset: 0 = first page, 20 = second, ...
        while len(total_results) < self.top_k_results:
            # Ask only for what is still missing, capped at the API max of 20
            # per call to keep the number of API calls minimal.
            remaining = self.top_k_results - len(total_results)
            results = (
                self.google_scholar_engine(  # type: ignore
                    {
                        "q": query,
                        "start": page,
                        "num": min(remaining, 20),
                        "hl": self.hl,
                        "lr": self.lr,
                    }
                )
                .get_dict()
                .get("organic_results", [])
            )
            if not results:
                # No more results available; stop paging.
                break
            total_results.extend(results)
            page += 20
        if not total_results:
            return "No good Google Scholar Result was found"
        # Guard against the API returning more than requested on a page.
        total_results = total_results[: self.top_k_results]
        docs = [
            f"Title: {result.get('title','')}\n"
            f"Authors: {','.join([author.get('name') for author in result.get('publication_info',{}).get('authors',[])])}\n"  # noqa: E501
            f"Summary: {result.get('publication_info',{}).get('summary','')}\n"
            f"Total-Citations: {result.get('inline_links',{}).get('cited_by',{}).get('total','')}"  # noqa: E501
            for result in total_results
        ]
        return "\n\n".join(docs)