Added new use case docs for Web Scraping, Chromium loader, BS4 transformer (#8732)

- Description: Added a new use case category called "Web Scraping", plus a
  tutorial on scraping websites with the OpenAI Functions extraction chain,
  to the docs (the pipeline is sketched below).
- Tag maintainer: @baskaryan @hwchase17
- Twitter handle: https://www.linkedin.com/in/haiphunghiem/ (I'm on
  LinkedIn mostly)
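
For reference, a minimal sketch of the pipeline the tutorial walks through: load raw HTML with the new Chromium loader, strip it down with the BS4 transformer, then extract structured data with the OpenAI Functions extraction chain. The URL and schema below are illustrative, and an OpenAI API key is assumed.

```python
from langchain.chains import create_extraction_chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import AsyncChromiumLoader
from langchain.document_transformers import BeautifulSoupTransformer

# Scrape raw HTML with headless Chromium, then reduce it to readable text.
docs = AsyncChromiumLoader(["https://example.com"]).load()  # illustrative URL
docs = BeautifulSoupTransformer().transform_documents(docs)

# Pull structured records out of the cleaned text via OpenAI function calling.
schema = {"properties": {"title": {"type": "string"}}}  # illustrative schema
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")
chain = create_extraction_chain(schema=schema, llm=llm)
print(chain.run(docs[0].page_content))
```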

---------

Co-authored-by: Lance Martin <lance@langchain.dev>
Author: Hai The Dude
Date: 2023-08-11 14:46:59 -04:00
Committed by: GitHub
Parent: 6cb763507c
Commit: e4418d1b7e
11 changed files with 1045 additions and 0 deletions

langchain/document_loaders/__init__.py

@@ -52,6 +52,7 @@ from langchain.document_loaders.blockchain import BlockchainDocumentLoader
from langchain.document_loaders.brave_search import BraveSearchLoader
from langchain.document_loaders.browserless import BrowserlessLoader
from langchain.document_loaders.chatgpt import ChatGPTLoader
from langchain.document_loaders.chromium import AsyncChromiumLoader
from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
from langchain.document_loaders.concurrent import ConcurrentLoader
from langchain.document_loaders.confluence import ConfluenceLoader
@@ -196,6 +197,9 @@ PagedPDFSplitter = PyPDFLoader
TelegramChatLoader = TelegramChatFileLoader
__all__ = [
    "AcreomLoader",
    "AsyncHtmlLoader",
    "AsyncChromiumLoader",
    "AZLyricsLoader",
    "AcreomLoader",
    "AirbyteCDKLoader",

langchain/document_loaders/chromium.py (new file)

@@ -0,0 +1,90 @@
import asyncio
import logging
from typing import Iterator, List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader

logger = logging.getLogger(__name__)


class AsyncChromiumLoader(BaseLoader):
    """Scrape HTML content from provided URLs using a
    headless instance of the Chromium browser."""

    def __init__(
        self,
        urls: List[str],
    ):
        """
        Initialize the loader with a list of URL paths.

        Args:
            urls (List[str]): A list of URLs to scrape content from.

        Raises:
            ImportError: If the required 'playwright' package is not installed.
        """
        self.urls = urls

        try:
            import playwright  # noqa: F401
        except ImportError:
            raise ImportError(
                "playwright is required for AsyncChromiumLoader. "
                "Please install it with `pip install playwright`."
            )

    async def ascrape_playwright(self, url: str) -> str:
        """
        Asynchronously scrape the content of a given URL using Playwright's async API.

        Args:
            url (str): The URL to scrape.

        Returns:
            str: The scraped HTML content or an error message if an exception occurs.
        """
        from playwright.async_api import async_playwright

        logger.info("Starting scraping...")
        results = ""
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()
                await page.goto(url)
                results = await page.content()  # Simply get the HTML content
                logger.info("Content scraped")
            except Exception as e:
                results = f"Error: {e}"
            await browser.close()
        return results

    def lazy_load(self) -> Iterator[Document]:
        """
        Lazily load text content from the provided URLs.

        This method yields Documents one at a time as they're scraped,
        instead of waiting to scrape all URLs before returning.

        Yields:
            Document: The scraped content encapsulated within a Document object.
        """
        for url in self.urls:
            html_content = asyncio.run(self.ascrape_playwright(url))
            metadata = {"source": url}
            yield Document(page_content=html_content, metadata=metadata)

    def load(self) -> List[Document]:
        """
        Load and return all Documents from the provided URLs.

        Returns:
            List[Document]: A list of Document objects
            containing the scraped content from each URL.
        """
        return list(self.lazy_load())
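
As a quick sanity check for the loader above, a run like this should print the source URL and the length of the scraped HTML (assumes `playwright` is installed and Chromium is fetched via `playwright install chromium`; the URL is illustrative):

```python
from langchain.document_loaders import AsyncChromiumLoader

# Requires: pip install playwright && playwright install chromium
loader = AsyncChromiumLoader(["https://example.com"])  # illustrative URL
docs = loader.load()
print(docs[0].metadata["source"], len(docs[0].page_content))
```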

langchain/document_transformers/__init__.py

@@ -15,6 +15,9 @@
Document
""" # noqa: E501
from langchain.document_transformers.beautiful_soup_transformer import (
    BeautifulSoupTransformer,
)
from langchain.document_transformers.doctran_text_extract import (
    DoctranPropertyExtractor,
)
@@ -31,6 +34,7 @@ from langchain.document_transformers.nuclia_text_transform import NucliaTextTran
from langchain.document_transformers.openai_functions import OpenAIMetadataTagger
__all__ = [
    "BeautifulSoupTransformer",
    "DoctranQATransformer",
    "DoctranTextTranslator",
    "DoctranPropertyExtractor",

langchain/document_transformers/beautiful_soup_transformer.py (new file)

@@ -0,0 +1,143 @@
from typing import Any, List, Sequence

from langchain.schema import BaseDocumentTransformer, Document


class BeautifulSoupTransformer(BaseDocumentTransformer):
    """Transform HTML content by extracting specific tags and removing unwanted ones.

    Example:
        .. code-block:: python

            from langchain.document_transformers import BeautifulSoupTransformer

            bs4_transformer = BeautifulSoupTransformer()
            docs_transformed = bs4_transformer.transform_documents(docs)
    """

    def __init__(self) -> None:
        """
        Initialize the transformer.

        This checks if the BeautifulSoup4 package is installed.
        If not, it raises an ImportError.
        """
        try:
            import bs4  # noqa:F401
        except ImportError:
            raise ImportError(
                "BeautifulSoup4 is required for BeautifulSoupTransformer. "
                "Please install it with `pip install beautifulsoup4`."
            )

    def transform_documents(
        self,
        documents: Sequence[Document],
        unwanted_tags: List[str] = ["script", "style"],
        tags_to_extract: List[str] = ["p", "li", "div", "a"],
        remove_lines: bool = True,
        **kwargs: Any,
    ) -> Sequence[Document]:
        """
        Transform a list of Document objects by cleaning their HTML content.

        Args:
            documents: A sequence of Document objects containing HTML content.
            unwanted_tags: A list of tags to be removed from the HTML.
            tags_to_extract: A list of tags whose content will be extracted.
            remove_lines: If set to True, unnecessary lines will be
                removed from the HTML content.

        Returns:
            A sequence of Document objects with transformed content.
        """
        for doc in documents:
            cleaned_content = doc.page_content

            cleaned_content = self.remove_unwanted_tags(cleaned_content, unwanted_tags)

            cleaned_content = self.extract_tags(cleaned_content, tags_to_extract)

            if remove_lines:
                cleaned_content = self.remove_unnecessary_lines(cleaned_content)

            doc.page_content = cleaned_content

        return documents

    @staticmethod
    def remove_unwanted_tags(html_content: str, unwanted_tags: List[str]) -> str:
        """
        Remove unwanted tags from a given HTML content.

        Args:
            html_content: The original HTML content string.
            unwanted_tags: A list of tags to be removed from the HTML.

        Returns:
            A cleaned HTML string with unwanted tags removed.
        """
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html_content, "html.parser")
        for tag in unwanted_tags:
            for element in soup.find_all(tag):
                element.decompose()
        return str(soup)

    @staticmethod
    def extract_tags(html_content: str, tags: List[str]) -> str:
        """
        Extract specific tags from a given HTML content.

        Args:
            html_content: The original HTML content string.
            tags: A list of tags to be extracted from the HTML.

        Returns:
            A string combining the content of the extracted tags.
        """
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html_content, "html.parser")
        text_parts = []
        for tag in tags:
            elements = soup.find_all(tag)
            for element in elements:
                if tag == "a":
                    href = element.get("href")
                    if href:
                        text_parts.append(f"{element.get_text()} ({href})")
                    else:
                        text_parts.append(element.get_text())
                else:
                    text_parts.append(element.get_text())
        return " ".join(text_parts)

    @staticmethod
    def remove_unnecessary_lines(content: str) -> str:
        """
        Clean up the content by removing unnecessary lines.

        Args:
            content: A string, which may contain unnecessary lines or spaces.

        Returns:
            A cleaned string with unnecessary lines removed.
        """
        lines = content.split("\n")
        stripped_lines = [line.strip() for line in lines]
        non_empty_lines = [line for line in stripped_lines if line]
        seen = set()
        deduped_lines = []
        for line in non_empty_lines:
            if line not in seen:
                seen.add(line)
                deduped_lines.append(line)
        cleaned_content = " ".join(deduped_lines)
        return cleaned_content

    async def atransform_documents(
        self,
        documents: Sequence[Document],
        **kwargs: Any,
    ) -> Sequence[Document]:
        raise NotImplementedError
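
Finally, a short sketch of the two new pieces working together, keeping only paragraph and link text from a scraped page (URL illustrative; requires `playwright` and `beautifulsoup4`):

```python
from langchain.document_loaders import AsyncChromiumLoader
from langchain.document_transformers import BeautifulSoupTransformer

# Scrape raw HTML, then keep only the text of the selected tags.
docs = AsyncChromiumLoader(["https://example.com"]).load()  # illustrative URL
bs4_transformer = BeautifulSoupTransformer()
docs_transformed = bs4_transformer.transform_documents(
    docs, tags_to_extract=["p", "a"]
)
print(docs_transformed[0].page_content[:500])
```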