Added new use case docs for Web Scraping, Chromium loader, BS4 transformer (#8732)

- Description: Added a new use case category called "Web Scraping", plus a
  tutorial on scraping websites with the OpenAI Functions extraction chain,
  to the docs (the pipeline is sketched below).
- Tag maintainer: @baskaryan @hwchase17
- Twitter handle: https://www.linkedin.com/in/haiphunghiem/ (I'm on
  LinkedIn mostly)
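
For reference, a minimal sketch of the pipeline the tutorial walks through: load raw HTML with the new Chromium loader, strip it down with the BS4 transformer, then extract structured data with the OpenAI Functions extraction chain. The URL and schema below are illustrative, and an OpenAI API key is assumed.

```python
from langchain.chains import create_extraction_chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import AsyncChromiumLoader
from langchain.document_transformers import BeautifulSoupTransformer

# Scrape raw HTML with headless Chromium, then reduce it to readable text.
docs = AsyncChromiumLoader(["https://example.com"]).load()  # illustrative URL
docs = BeautifulSoupTransformer().transform_documents(docs)

# Pull structured records out of the cleaned text via OpenAI function calling.
schema = {"properties": {"title": {"type": "string"}}}  # illustrative schema
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")
chain = create_extraction_chain(schema=schema, llm=llm)
print(chain.run(docs[0].page_content))
```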

---------

Co-authored-by: Lance Martin <lance@langchain.dev>
Author: Hai The Dude
Date: 2023-08-11 14:46:59 -04:00
Committed by: GitHub
Parent: 6cb763507c
Commit: e4418d1b7e
11 changed files with 1045 additions and 0 deletions

langchain/document_loaders/__init__.py

@@ -52,6 +52,7 @@ from langchain.document_loaders.blockchain import BlockchainDocumentLoader
from langchain.document_loaders.brave_search import BraveSearchLoader
from langchain.document_loaders.browserless import BrowserlessLoader
from langchain.document_loaders.chatgpt import ChatGPTLoader
from langchain.document_loaders.chromium import AsyncChromiumLoader
from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
from langchain.document_loaders.concurrent import ConcurrentLoader
from langchain.document_loaders.confluence import ConfluenceLoader
@@ -196,6 +197,9 @@ PagedPDFSplitter = PyPDFLoader
TelegramChatLoader = TelegramChatFileLoader
__all__ = [
    "AcreomLoader",
    "AsyncHtmlLoader",
    "AsyncChromiumLoader",
    "AZLyricsLoader",
    "AcreomLoader",
    "AirbyteCDKLoader",

langchain/document_loaders/chromium.py (new file)

@@ -0,0 +1,90 @@
import asyncio
import logging
from typing import Iterator, List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader

logger = logging.getLogger(__name__)


class AsyncChromiumLoader(BaseLoader):
    """Scrape HTML content from provided URLs using a
    headless instance of the Chromium browser."""

    def __init__(
        self,
        urls: List[str],
    ):
        """
        Initialize the loader with a list of URL paths.

        Args:
            urls (List[str]): A list of URLs to scrape content from.

        Raises:
            ImportError: If the required 'playwright' package is not installed.
        """
        self.urls = urls

        try:
            import playwright  # noqa: F401
        except ImportError:
            raise ImportError(
                "playwright is required for AsyncChromiumLoader. "
                "Please install it with `pip install playwright`."
            )

    async def ascrape_playwright(self, url: str) -> str:
        """
        Asynchronously scrape the content of a given URL using Playwright's async API.

        Args:
            url (str): The URL to scrape.

        Returns:
            str: The scraped HTML content or an error message if an exception occurs.
        """
        from playwright.async_api import async_playwright

        logger.info("Starting scraping...")
        results = ""
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()
                await page.goto(url)
                results = await page.content()  # Simply get the HTML content
                logger.info("Content scraped")
            except Exception as e:
                results = f"Error: {e}"
            await browser.close()
        return results

    def lazy_load(self) -> Iterator[Document]:
        """
        Lazily load text content from the provided URLs.

        This method yields Documents one at a time as they're scraped,
        instead of waiting to scrape all URLs before returning.

        Yields:
            Document: The scraped content encapsulated within a Document object.
        """
        for url in self.urls:
            html_content = asyncio.run(self.ascrape_playwright(url))
            metadata = {"source": url}
            yield Document(page_content=html_content, metadata=metadata)

    def load(self) -> List[Document]:
        """
        Load and return all Documents from the provided URLs.

        Returns:
            List[Document]: A list of Document objects
            containing the scraped content from each URL.
        """
        return list(self.lazy_load())
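
As a quick sanity check for the loader above, a run like this should print the source URL and the length of the scraped HTML (assumes `playwright` is installed and Chromium is fetched via `playwright install chromium`; the URL is illustrative):

```python
from langchain.document_loaders import AsyncChromiumLoader

# Requires: pip install playwright && playwright install chromium
loader = AsyncChromiumLoader(["https://example.com"])  # illustrative URL
docs = loader.load()
print(docs[0].metadata["source"], len(docs[0].page_content))
```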

langchain/document_transformers/__init__.py

@@ -15,6 +15,9 @@
Document
""" # noqa: E501
from langchain.document_transformers.beautiful_soup_transformer import (
    BeautifulSoupTransformer,
)
from langchain.document_transformers.doctran_text_extract import (
    DoctranPropertyExtractor,
)
@@ -31,6 +34,7 @@ from langchain.document_transformers.nuclia_text_transform import NucliaTextTran
from langchain.document_transformers.openai_functions import OpenAIMetadataTagger
__all__ = [
    "BeautifulSoupTransformer",
    "DoctranQATransformer",
    "DoctranTextTranslator",
    "DoctranPropertyExtractor",

langchain/document_transformers/beautiful_soup_transformer.py (new file)

@@ -0,0 +1,143 @@
from typing import Any, List, Sequence

from langchain.schema import BaseDocumentTransformer, Document


class BeautifulSoupTransformer(BaseDocumentTransformer):
    """Transform HTML content by extracting specific tags and removing unwanted ones.

    Example:
        .. code-block:: python

            from langchain.document_transformers import BeautifulSoupTransformer

            bs4_transformer = BeautifulSoupTransformer()
            docs_transformed = bs4_transformer.transform_documents(docs)
    """

    def __init__(self) -> None:
        """
        Initialize the transformer.

        This checks if the BeautifulSoup4 package is installed.
        If not, it raises an ImportError.
        """
        try:
            import bs4  # noqa:F401
        except ImportError:
            raise ImportError(
                "BeautifulSoup4 is required for BeautifulSoupTransformer. "
                "Please install it with `pip install beautifulsoup4`."
            )

    def transform_documents(
        self,
        documents: Sequence[Document],
        unwanted_tags: List[str] = ["script", "style"],
        tags_to_extract: List[str] = ["p", "li", "div", "a"],
        remove_lines: bool = True,
        **kwargs: Any,
    ) -> Sequence[Document]:
        """
        Transform a list of Document objects by cleaning their HTML content.

        Args:
            documents: A sequence of Document objects containing HTML content.
            unwanted_tags: A list of tags to be removed from the HTML.
            tags_to_extract: A list of tags whose content will be extracted.
            remove_lines: If set to True, unnecessary lines will be
                removed from the HTML content.

        Returns:
            A sequence of Document objects with transformed content.
        """
        for doc in documents:
            cleaned_content = doc.page_content

            cleaned_content = self.remove_unwanted_tags(cleaned_content, unwanted_tags)

            cleaned_content = self.extract_tags(cleaned_content, tags_to_extract)

            if remove_lines:
                cleaned_content = self.remove_unnecessary_lines(cleaned_content)

            doc.page_content = cleaned_content

        return documents

    @staticmethod
    def remove_unwanted_tags(html_content: str, unwanted_tags: List[str]) -> str:
        """
        Remove unwanted tags from a given HTML content.

        Args:
            html_content: The original HTML content string.
            unwanted_tags: A list of tags to be removed from the HTML.

        Returns:
            A cleaned HTML string with unwanted tags removed.
        """
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html_content, "html.parser")
        for tag in unwanted_tags:
            for element in soup.find_all(tag):
                element.decompose()
        return str(soup)

    @staticmethod
    def extract_tags(html_content: str, tags: List[str]) -> str:
        """
        Extract specific tags from a given HTML content.

        Args:
            html_content: The original HTML content string.
            tags: A list of tags to be extracted from the HTML.

        Returns:
            A string combining the content of the extracted tags.
        """
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html_content, "html.parser")
        text_parts = []
        for tag in tags:
            elements = soup.find_all(tag)
            for element in elements:
                if tag == "a":
                    href = element.get("href")
                    if href:
                        text_parts.append(f"{element.get_text()} ({href})")
                    else:
                        text_parts.append(element.get_text())
                else:
                    text_parts.append(element.get_text())
        return " ".join(text_parts)

    @staticmethod
    def remove_unnecessary_lines(content: str) -> str:
        """
        Clean up the content by removing unnecessary lines.

        Args:
            content: A string, which may contain unnecessary lines or spaces.

        Returns:
            A cleaned string with unnecessary lines removed.
        """
        lines = content.split("\n")
        stripped_lines = [line.strip() for line in lines]
        non_empty_lines = [line for line in stripped_lines if line]
        seen = set()
        deduped_lines = []
        for line in non_empty_lines:
            if line not in seen:
                seen.add(line)
                deduped_lines.append(line)
        cleaned_content = " ".join(deduped_lines)
        return cleaned_content

    async def atransform_documents(
        self,
        documents: Sequence[Document],
        **kwargs: Any,
    ) -> Sequence[Document]:
        raise NotImplementedError
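
Finally, a short sketch of the two new pieces working together, keeping only paragraph and link text from a scraped page (URL illustrative; requires `playwright` and `beautifulsoup4`):

```python
from langchain.document_loaders import AsyncChromiumLoader
from langchain.document_transformers import BeautifulSoupTransformer

# Scrape raw HTML, then keep only the text of the selected tags.
docs = AsyncChromiumLoader(["https://example.com"]).load()  # illustrative URL
bs4_transformer = BeautifulSoupTransformer()
docs_transformed = bs4_transformer.transform_documents(
    docs, tags_to_extract=["p", "a"]
)
print(docs_transformed[0].page_content[:500])
```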