Mirror of https://github.com/hwchase17/langchain.git, synced 2026-02-04 00:00:34 +00:00

Compare commits: `langchain-...` with `eugene/web` (3 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 22cfb1ea06 | |
| | 62dc24dccc | |
| | fc56e4b678 | |
`libs/community/langchain_community/retrievers/__init__.py` (path assumed from the hunk context):

```diff
@@ -118,6 +118,9 @@ if TYPE_CHECKING:
     from langchain_community.retrievers.weaviate_hybrid_search import (
         WeaviateHybridSearchRetriever,  # noqa: F401
     )
+    from langchain_community.retrievers.web_research import (
+        WebResearchRetriever,  # noqa: F401
+    )
     from langchain_community.retrievers.wikipedia import (
         WikipediaRetriever,  # noqa: F401
     )
@@ -208,6 +211,7 @@ _module_lookup = {
     "TavilySearchAPIRetriever": "langchain_community.retrievers.tavily_search_api",
     "VespaRetriever": "langchain_community.retrievers.vespa_retriever",
     "WeaviateHybridSearchRetriever": "langchain_community.retrievers.weaviate_hybrid_search",  # noqa: E501
+    "WebResearchRetriever": "langchain_community.retrievers.web_research",
     "WikipediaRetriever": "langchain_community.retrievers.wikipedia",
     "YouRetriever": "langchain_community.retrievers.you",
     "ZepRetriever": "langchain_community.retrievers.zep",
```
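With the `_module_lookup` entry in place, the retriever should be importable from the package root without eagerly pulling in its dependencies. A minimal sketch, assuming the package's usual lazy `__getattr__` dispatch over `_module_lookup`:

```python
# Resolved lazily through _module_lookup: the web-research module and its
# dependencies are only imported when this attribute is first accessed.
from langchain_community.retrievers import WebResearchRetriever
```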
`libs/community/langchain_community/retrievers/web_research.py` (new file, 239 lines):
```python
import logging
import re
from typing import List, Optional

from langchain_core.callbacks import (
    AsyncCallbackManagerForRetrieverRun,
    CallbackManagerForRetrieverRun,
)
from langchain_core.documents import Document
from langchain_core.language_models import BaseLLM
from langchain_core.output_parsers import BaseOutputParser
from langchain_core.prompts import BasePromptTemplate, PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.retrievers import BaseRetriever
from langchain_core.vectorstores import VectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter

from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.document_transformers import Html2TextTransformer
from langchain_community.llms import LlamaCpp
from langchain_community.utilities import GoogleSearchAPIWrapper

logger = logging.getLogger(__name__)


class SearchQueries(BaseModel):
    """Search queries to research for the user's goal."""

    queries: List[str] = Field(
        ..., description="List of search queries to look up on Google"
    )


DEFAULT_LLAMA_SEARCH_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""<<SYS>> \n You are an assistant tasked with improving Google search \
results. \n <</SYS>> \n\n [INST] Generate THREE Google search queries that \
are similar to this question. The output should be a numbered list of questions \
and each should have a question mark at the end: \n\n {question} [/INST]""",
)

DEFAULT_SEARCH_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an assistant tasked with improving Google search \
results. Generate THREE Google search queries that are similar to \
this question. The output should be a numbered list of questions and each \
should have a question mark at the end: {question}""",
)


class QuestionListOutputParser(BaseOutputParser[List[str]]):
    """Output parser for a list of numbered questions."""

    def parse(self, text: str) -> List[str]:
        lines = re.findall(r"\d+\..*?(?:\n|$)", text)
        return lines
```
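For reference, the parser's regex keeps each numbered line, including its trailing newline, as one query string. An illustrative example (not taken from the PR):

```python
parser = QuestionListOutputParser()
parser.parse("1. What is LangChain?\n2. How do retrievers work?\n3. What is RAG?\n")
# -> ["1. What is LangChain?\n", "2. How do retrievers work?\n", "3. What is RAG?\n"]
```

Note that the leading "1. " prefix is kept here; `clean_search_query` below strips it before the query is sent to the search API.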
```python
try:
    from langchain.chains import LLMChain
    from langchain.chains.prompt_selector import ConditionalPromptSelector

    DEFAULT_TEXT_SPLITTER = RecursiveCharacterTextSplitter(
        chunk_size=1500, chunk_overlap=150
    )

    class WebResearchRetriever(BaseRetriever):
        """`Google Search API` retriever."""

        # Inputs
        vectorstore: VectorStore = Field(
            ..., description="Vector store for storing web pages"
        )
        llm_chain: LLMChain
        search: GoogleSearchAPIWrapper = Field(
            ..., description="Google Search API Wrapper"
        )
        num_search_results: int = Field(
            1, description="Number of pages per Google search"
        )
        text_splitter: TextSplitter = Field(
            RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=50),
            description="Text splitter for splitting web pages into chunks",
        )
        url_database: List[str] = Field(
            default_factory=list, description="List of processed URLs"
        )

        @classmethod
        def from_llm(
            cls,
            vectorstore: VectorStore,
            llm: BaseLLM,
            search: GoogleSearchAPIWrapper,
            prompt: Optional[BasePromptTemplate] = None,
            num_search_results: int = 1,
            text_splitter: RecursiveCharacterTextSplitter = DEFAULT_TEXT_SPLITTER,
        ) -> "WebResearchRetriever":
            """Initialize from llm using default template.

            Args:
                vectorstore: Vector store for storing web pages
                llm: llm for search question generation
                search: GoogleSearchAPIWrapper
                prompt: prompt for generating search questions
                num_search_results: Number of pages per Google search
                text_splitter: Text splitter for splitting web pages into chunks

            Returns:
                WebResearchRetriever
            """
            if not prompt:
                QUESTION_PROMPT_SELECTOR = ConditionalPromptSelector(
                    default_prompt=DEFAULT_SEARCH_PROMPT,
                    conditionals=[
                        (
                            lambda llm: isinstance(llm, LlamaCpp),
                            DEFAULT_LLAMA_SEARCH_PROMPT,
                        )
                    ],
                )
                prompt = QUESTION_PROMPT_SELECTOR.get_prompt(llm)

            # Use chat model prompt
            llm_chain = LLMChain(
                llm=llm,
                prompt=prompt,
                output_parser=QuestionListOutputParser(),
            )

            return cls(
                vectorstore=vectorstore,
                llm_chain=llm_chain,
                search=search,
                num_search_results=num_search_results,
                text_splitter=text_splitter,
            )

        def clean_search_query(self, query: str) -> str:
            # Some search tools (e.g., Google) will
            # fail to return results if query has a
            # leading digit: 1. "LangCh..."
            # Check if the first character is a digit
            if query[0].isdigit():
                # Find the position of the first quote
                first_quote_pos = query.find('"')
                if first_quote_pos != -1:
                    # Extract the part of the string after the quote
                    query = query[first_quote_pos + 1 :]
                    # Remove the trailing quote if present
                    if query.endswith('"'):
                        query = query[:-1]
            return query.strip()

        def search_tool(self, query: str, num_search_results: int = 1) -> List[dict]:
            """Returns num_search_results pages per Google search."""
            query_clean = self.clean_search_query(query)
            result = self.search.results(query_clean, num_search_results)
            return result

        def _get_relevant_documents(
            self,
            query: str,
            *,
            run_manager: CallbackManagerForRetrieverRun,
        ) -> List[Document]:
            """Search Google for documents related to the query input.

            Args:
                query: user query

            Returns:
                Relevant documents from all various urls.
            """
            # Get search questions
            logger.info("Generating questions for Google Search ...")
            result = self.llm_chain({"question": query})
            logger.info(f"Questions for Google Search (raw): {result}")
            questions = result["text"]
            logger.info(f"Questions for Google Search: {questions}")

            # Get urls
            logger.info("Searching for relevant urls...")
            urls_to_look = []
            for query in questions:
                # Google search
                search_results = self.search_tool(query, self.num_search_results)
                logger.info("Searching for relevant urls...")
                logger.info(f"Search results: {search_results}")
                for res in search_results:
                    if res.get("link", None):
                        urls_to_look.append(res["link"])

            # Relevant urls
            urls = set(urls_to_look)

            # Check for any new urls that we have not processed
            new_urls = list(urls.difference(self.url_database))

            logger.info(f"New URLs to load: {new_urls}")
            # Load, split, and add new urls to vectorstore
            if new_urls:
                loader = AsyncHtmlLoader(new_urls, ignore_load_errors=True)
                html2text = Html2TextTransformer()
                logger.info("Indexing new urls...")
                docs = loader.load()
                docs = list(html2text.transform_documents(docs))
                docs = self.text_splitter.split_documents(docs)
                self.vectorstore.add_documents(docs)
                self.url_database.extend(new_urls)

            # Search for relevant splits
            # TODO: make this async
            logger.info("Grabbing most relevant splits from urls...")
            docs = []
            for query in questions:
                docs.extend(self.vectorstore.similarity_search(query))

            # Get unique docs
            unique_documents_dict = {
                (doc.page_content, tuple(sorted(doc.metadata.items()))): doc
                for doc in docs
            }
            unique_documents = list(unique_documents_dict.values())
            return unique_documents

        async def _aget_relevant_documents(
            self,
            query: str,
            *,
            run_manager: AsyncCallbackManagerForRetrieverRun,
        ) -> List[Document]:
            raise NotImplementedError

except ImportError:
    # Placeholder so the module still imports when langchain is not installed.
    class WebResearchRetriever:  # type: ignore[no-redef]
        pass
```
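A minimal wiring sketch for the new retriever. Everything here beyond the class itself is an assumption for illustration: the Chroma vector store, the OpenAI LLM and embeddings, and the `./chroma_db` path are arbitrary choices, and `GoogleSearchAPIWrapper` expects `GOOGLE_API_KEY` and `GOOGLE_CSE_ID` in the environment:

```python
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.llms import OpenAI
from langchain_community.retrievers.web_research import WebResearchRetriever
from langchain_community.utilities import GoogleSearchAPIWrapper
from langchain_community.vectorstores import Chroma

# Assumed setup: Chroma as the backing store, OpenAI for query generation.
vectorstore = Chroma(
    embedding_function=OpenAIEmbeddings(), persist_directory="./chroma_db"
)
retriever = WebResearchRetriever.from_llm(
    vectorstore=vectorstore,
    llm=OpenAI(temperature=0),
    search=GoogleSearchAPIWrapper(),  # reads GOOGLE_API_KEY and GOOGLE_CSE_ID
)

# Generates search questions, scrapes and indexes result pages, then returns
# the most relevant splits from the vector store.
docs = retriever.get_relevant_documents("How do LLM-powered autonomous agents work?")
```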
`libs/langchain/langchain/retrievers/web_research.py` (path assumed; 223 lines reduced to 15): the old in-place implementation, essentially the same imports, prompts, parser, and `WebResearchRetriever` class that now live in the community module above, is deleted, and the file becomes a backwards-compatibility shim that re-exports the public names:

```python
from langchain_community.retrievers.web_research import (
    DEFAULT_LLAMA_SEARCH_PROMPT,
    DEFAULT_SEARCH_PROMPT,
    QuestionListOutputParser,
    SearchQueries,
    WebResearchRetriever,
)

__all__ = [
    "SearchQueries",
    "DEFAULT_LLAMA_SEARCH_PROMPT",
    "DEFAULT_SEARCH_PROMPT",
    "QuestionListOutputParser",
    "WebResearchRetriever",
]
```
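Because of the shim, existing import paths should keep resolving to the migrated class. A quick check (illustrative, assuming both packages are installed):

```python
# Both the legacy and the community paths resolve to the same class.
from langchain.retrievers.web_research import WebResearchRetriever as LegacyWRR
from langchain_community.retrievers.web_research import WebResearchRetriever

assert LegacyWRR is WebResearchRetriever
```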