Mirror of https://github.com/hwchase17/langchain.git
Added new use case docs for Web Scraping, Chromium loader, BS4 transformer (#8732)
- Description: Added a new use case category called "Web Scraping", with a tutorial on scraping websites using the OpenAI Functions extraction chain, to the docs.
- Tag maintainer: @baskaryan @hwchase17
- Twitter handle: https://www.linkedin.com/in/haiphunghiem/ (I'm on LinkedIn mostly)

Co-authored-by: Lance Martin <lance@langchain.dev>
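For context, a minimal sketch of the scraping-and-extraction flow the new use case docs describe, combining the two components added in this commit with LangChain's existing create_extraction_chain. The URL, schema fields, and model name below are illustrative assumptions, not part of the commit:

from langchain.chat_models import ChatOpenAI
from langchain.chains import create_extraction_chain
from langchain.document_loaders import AsyncChromiumLoader
from langchain.document_transformers import BeautifulSoupTransformer

# 1. Scrape raw HTML with a headless Chromium browser (placeholder URL).
loader = AsyncChromiumLoader(["https://example.com/news"])
docs = loader.load()

# 2. Reduce the HTML to readable text from selected tags.
bs_transformer = BeautifulSoupTransformer()
docs_transformed = bs_transformer.transform_documents(docs, tags_to_extract=["p", "span"])

# 3. Run an OpenAI Functions extraction chain over the cleaned text.
schema = {
    "properties": {
        "news_article_title": {"type": "string"},
        "news_article_summary": {"type": "string"},
    },
    "required": ["news_article_title", "news_article_summary"],
}
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")
# Naive truncation keeps the prompt within the model's context window.
extracted = create_extraction_chain(schema=schema, llm=llm).run(
    docs_transformed[0].page_content[:4000]
)
print(extracted)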
@@ -52,6 +52,7 @@ from langchain.document_loaders.blockchain import BlockchainDocumentLoader
from langchain.document_loaders.brave_search import BraveSearchLoader
from langchain.document_loaders.browserless import BrowserlessLoader
from langchain.document_loaders.chatgpt import ChatGPTLoader
from langchain.document_loaders.chromium import AsyncChromiumLoader
from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
from langchain.document_loaders.concurrent import ConcurrentLoader
from langchain.document_loaders.confluence import ConfluenceLoader
@@ -196,6 +197,9 @@ PagedPDFSplitter = PyPDFLoader
TelegramChatLoader = TelegramChatFileLoader

__all__ = [
    "AcreomLoader",
    "AsyncHtmlLoader",
    "AsyncChromiumLoader",
    "AZLyricsLoader",
    "AcreomLoader",
    "AirbyteCDKLoader",
|
libs/langchain/langchain/document_loaders/chromium.py (new file, 90 lines)
@@ -0,0 +1,90 @@
import asyncio
import logging
from typing import Iterator, List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader

logger = logging.getLogger(__name__)


class AsyncChromiumLoader(BaseLoader):
    """Scrape HTML content from provided URLs using a
    headless instance of the Chromium browser."""

    def __init__(
        self,
        urls: List[str],
    ):
        """
        Initialize the loader with a list of URL paths.

        Args:
            urls (List[str]): A list of URLs to scrape content from.

        Raises:
            ImportError: If the required 'playwright' package is not installed.
        """
        self.urls = urls

        try:
            import playwright  # noqa: F401
        except ImportError:
            raise ImportError(
                "playwright is required for AsyncChromiumLoader. "
                "Please install it with `pip install playwright`."
            )

    async def ascrape_playwright(self, url: str) -> str:
        """
        Asynchronously scrape the content of a given URL using Playwright's async API.

        Args:
            url (str): The URL to scrape.

        Returns:
            str: The scraped HTML content or an error message if an exception occurs.

        """
        from playwright.async_api import async_playwright

        logger.info("Starting scraping...")
        results = ""
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()
                await page.goto(url)
                results = await page.content()  # Simply get the HTML content
                logger.info("Content scraped")
            except Exception as e:
                results = f"Error: {e}"
            await browser.close()
        return results

    def lazy_load(self) -> Iterator[Document]:
        """
        Lazily load text content from the provided URLs.

        This method yields Documents one at a time as they're scraped,
        instead of waiting to scrape all URLs before returning.

        Yields:
            Document: The scraped content encapsulated within a Document object.

        """
        for url in self.urls:
            html_content = asyncio.run(self.ascrape_playwright(url))
            metadata = {"source": url}
            yield Document(page_content=html_content, metadata=metadata)

    def load(self) -> List[Document]:
        """
        Load and return all Documents from the provided URLs.

        Returns:
            List[Document]: A list of Document objects
            containing the scraped content from each URL.

        """
        return list(self.lazy_load())
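A minimal usage sketch for the new loader (assuming Playwright and a Chromium binary are installed, e.g. pip install playwright && playwright install chromium; the URL is an illustrative placeholder):

from langchain.document_loaders import AsyncChromiumLoader

loader = AsyncChromiumLoader(["https://example.com"])
docs = loader.load()  # one Document per URL; page_content holds the raw HTML

print(docs[0].metadata["source"])    # the originating URL
print(docs[0].page_content[:200])    # first 200 characters of scraped HTML

Note that lazy_load drives each scrape with asyncio.run, so it should be called from synchronous code rather than from inside an already-running event loop.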
@@ -15,6 +15,9 @@
    Document
"""  # noqa: E501

from langchain.document_transformers.beautiful_soup_transformer import (
    BeautifulSoupTransformer,
)
from langchain.document_transformers.doctran_text_extract import (
    DoctranPropertyExtractor,
)
@@ -31,6 +34,7 @@ from langchain.document_transformers.nuclia_text_transform import NucliaTextTran
from langchain.document_transformers.openai_functions import OpenAIMetadataTagger

__all__ = [
    "BeautifulSoupTransformer",
    "DoctranQATransformer",
    "DoctranTextTranslator",
    "DoctranPropertyExtractor",
libs/langchain/langchain/document_transformers/beautiful_soup_transformer.py (new file, 143 lines)
@@ -0,0 +1,143 @@
from typing import Any, List, Sequence

from langchain.schema import BaseDocumentTransformer, Document


class BeautifulSoupTransformer(BaseDocumentTransformer):
    """Transform HTML content by extracting specific tags and removing unwanted ones.

    Example:
        .. code-block:: python
            from langchain.document_transformers import BeautifulSoupTransformer
            bs4_transformer = BeautifulSoupTransformer()
            docs_transformed = bs4_transformer.transform_documents(docs)
    """

    def __init__(self) -> None:
        """
        Initialize the transformer.

        This checks if the BeautifulSoup4 package is installed.
        If not, it raises an ImportError.
        """
        try:
            import bs4  # noqa:F401
        except ImportError:
            raise ImportError(
                "BeautifulSoup4 is required for BeautifulSoupTransformer. "
                "Please install it with `pip install beautifulsoup4`."
            )

    def transform_documents(
        self,
        documents: Sequence[Document],
        unwanted_tags: List[str] = ["script", "style"],
        tags_to_extract: List[str] = ["p", "li", "div", "a"],
        remove_lines: bool = True,
        **kwargs: Any,
    ) -> Sequence[Document]:
        """
        Transform a list of Document objects by cleaning their HTML content.

        Args:
            documents: A sequence of Document objects containing HTML content.
            unwanted_tags: A list of tags to be removed from the HTML.
            tags_to_extract: A list of tags whose content will be extracted.
            remove_lines: If set to True, unnecessary lines will be
                removed from the HTML content.

        Returns:
            A sequence of Document objects with transformed content.
        """
        for doc in documents:
            cleaned_content = doc.page_content

            cleaned_content = self.remove_unwanted_tags(cleaned_content, unwanted_tags)

            cleaned_content = self.extract_tags(cleaned_content, tags_to_extract)

            if remove_lines:
                cleaned_content = self.remove_unnecessary_lines(cleaned_content)

            doc.page_content = cleaned_content

        return documents

    @staticmethod
    def remove_unwanted_tags(html_content: str, unwanted_tags: List[str]) -> str:
        """
        Remove unwanted tags from a given HTML content.

        Args:
            html_content: The original HTML content string.
            unwanted_tags: A list of tags to be removed from the HTML.

        Returns:
            A cleaned HTML string with unwanted tags removed.
        """
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html_content, "html.parser")
        for tag in unwanted_tags:
            for element in soup.find_all(tag):
                element.decompose()
        return str(soup)

    @staticmethod
    def extract_tags(html_content: str, tags: List[str]) -> str:
        """
        Extract specific tags from a given HTML content.

        Args:
            html_content: The original HTML content string.
            tags: A list of tags to be extracted from the HTML.

        Returns:
            A string combining the content of the extracted tags.
        """
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html_content, "html.parser")
        text_parts = []
        for tag in tags:
            elements = soup.find_all(tag)
            for element in elements:
                if tag == "a":
                    href = element.get("href")
                    if href:
                        text_parts.append(f"{element.get_text()} ({href})")
                    else:
                        text_parts.append(element.get_text())
                else:
                    text_parts.append(element.get_text())
        return " ".join(text_parts)

    @staticmethod
    def remove_unnecessary_lines(content: str) -> str:
        """
        Clean up the content by removing unnecessary lines.

        Args:
            content: A string, which may contain unnecessary lines or spaces.

        Returns:
            A cleaned string with unnecessary lines removed.
        """
        lines = content.split("\n")
        stripped_lines = [line.strip() for line in lines]
        non_empty_lines = [line for line in stripped_lines if line]
        seen = set()
        deduped_lines = []
        for line in non_empty_lines:
            if line not in seen:
                seen.add(line)
                deduped_lines.append(line)
        cleaned_content = " ".join(deduped_lines)
        return cleaned_content

    async def atransform_documents(
        self,
        documents: Sequence[Document],
        **kwargs: Any,
    ) -> Sequence[Document]:
        raise NotImplementedError
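A minimal usage sketch for the new transformer (assuming beautifulsoup4 is installed; the HTML snippet and source URL are illustrative assumptions):

from langchain.docstore.document import Document
from langchain.document_transformers import BeautifulSoupTransformer

html = "<html><body><script>alert('hi')</script><p>Hello</p><a href='/about'>About us</a></body></html>"
docs = [Document(page_content=html, metadata={"source": "https://example.com"})]

bs_transformer = BeautifulSoupTransformer()
docs_transformed = bs_transformer.transform_documents(docs, tags_to_extract=["p", "a"])

print(docs_transformed[0].page_content)  # "Hello About us (/about)"

Note that transform_documents mutates each Document's page_content in place and also returns the same sequence, and that anchor tags keep their href targets alongside the link text.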