From c3d4126eb198e54e316f84f961452aa454f89a8d Mon Sep 17 00:00:00 2001 From: Emilien Chauvet Date: Wed, 5 Jun 2024 17:20:34 +0200 Subject: [PATCH] community[minor]: add user agent for web scraping loaders (#22480) **Description:** This PR adds a `USER_AGENT` environment variable to be used for web scraping. It creates a util to get that user agent and uses it in the classes used for scraping in [this piece of doc](https://python.langchain.com/v0.1/docs/use_cases/web_scraping/). Identifying your scraper is considered good etiquette; this PR aims to make that easier. **Issue:** `None` **Dependencies:** `None` **Twitter handle:** `None` --- .../document_loaders/async_chromium.ipynb | 2 +- .../document_loaders/async_html.py | 3 ++- .../document_loaders/chromium.py | 15 ++++++++++++--- .../document_loaders/web_base.py | 4 +++- .../langchain_community/utils/user_agent.py | 16 ++++++++++++++++ 5 files changed, 34 insertions(+), 6 deletions(-) create mode 100644 libs/community/langchain_community/utils/user_agent.py diff --git a/docs/docs/integrations/document_loaders/async_chromium.ipynb b/docs/docs/integrations/document_loaders/async_chromium.ipynb index 616dea0c9d2..88cc2b84ce6 100644 --- a/docs/docs/integrations/document_loaders/async_chromium.ipynb +++ b/docs/docs/integrations/document_loaders/async_chromium.ipynb @@ -48,7 +48,7 @@ "from langchain_community.document_loaders import AsyncChromiumLoader\n", "\n", "urls = [\"https://www.wsj.com\"]\n", - "loader = AsyncChromiumLoader(urls)\n", + "loader = AsyncChromiumLoader(urls, user_agent=\"MyAppUserAgent\")\n", "docs = loader.load()\n", "docs[0].page_content[0:100]" ] diff --git a/libs/community/langchain_community/document_loaders/async_html.py b/libs/community/langchain_community/document_loaders/async_html.py index 5bb9d897512..1bd40aff5e5 100644 --- a/libs/community/langchain_community/document_loaders/async_html.py +++ b/libs/community/langchain_community/document_loaders/async_html.py @@ -19,11 +19,12 @@ 
import requests from langchain_core.documents import Document from langchain_community.document_loaders.base import BaseLoader +from langchain_community.utils.user_agent import get_user_agent logger = logging.getLogger(__name__) default_header_template = { - "User-Agent": "", + "User-Agent": get_user_agent(), "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*" ";q=0.8", "Accept-Language": "en-US,en;q=0.5", diff --git a/libs/community/langchain_community/document_loaders/chromium.py b/libs/community/langchain_community/document_loaders/chromium.py index e15e78b12ce..4779d910f34 100644 --- a/libs/community/langchain_community/document_loaders/chromium.py +++ b/libs/community/langchain_community/document_loaders/chromium.py @@ -1,10 +1,11 @@ import asyncio import logging -from typing import AsyncIterator, Iterator, List +from typing import AsyncIterator, Iterator, List, Optional from langchain_core.documents import Document from langchain_community.document_loaders.base import BaseLoader +from langchain_community.utils.user_agent import get_user_agent logger = logging.getLogger(__name__) @@ -13,18 +14,26 @@ class AsyncChromiumLoader(BaseLoader): """Scrape HTML pages from URLs using a headless instance of the Chromium.""" - def __init__(self, urls: List[str], *, headless: bool = True): + def __init__( + self, + urls: List[str], + *, + headless: bool = True, + user_agent: Optional[str] = None, + ): """Initialize the loader with a list of URL paths. Args: urls: A list of URLs to scrape content from. headless: Whether to run browser in headless mode. + user_agent: The user agent to use for the browser Raises: ImportError: If the required 'playwright' package is not installed. 
""" self.urls = urls self.headless = headless + self.user_agent = user_agent or get_user_agent() try: import playwright # noqa: F401 @@ -52,7 +61,7 @@ class AsyncChromiumLoader(BaseLoader): async with async_playwright() as p: browser = await p.chromium.launch(headless=self.headless) try: - page = await browser.new_page() + page = await browser.new_page(user_agent=self.user_agent) await page.goto(url) results = await page.content() # Simply get the HTML content logger.info("Content scraped") diff --git a/libs/community/langchain_community/document_loaders/web_base.py b/libs/community/langchain_community/document_loaders/web_base.py index b925f792e57..a086a135ab2 100644 --- a/libs/community/langchain_community/document_loaders/web_base.py +++ b/libs/community/langchain_community/document_loaders/web_base.py @@ -1,4 +1,5 @@ """Web base loader class.""" + import asyncio import logging import warnings @@ -9,11 +10,12 @@ import requests from langchain_core.documents import Document from langchain_community.document_loaders.base import BaseLoader +from langchain_community.utils.user_agent import get_user_agent logger = logging.getLogger(__name__) default_header_template = { - "User-Agent": "", + "User-Agent": get_user_agent(), "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*" ";q=0.8", "Accept-Language": "en-US,en;q=0.5", diff --git a/libs/community/langchain_community/utils/user_agent.py b/libs/community/langchain_community/utils/user_agent.py new file mode 100644 index 00000000000..140f63b04ed --- /dev/null +++ b/libs/community/langchain_community/utils/user_agent.py @@ -0,0 +1,16 @@ +import logging +import os + +log = logging.getLogger(__name__) + + +def get_user_agent() -> str: + """Get user agent from environment variable.""" + env_user_agent = os.environ.get("USER_AGENT") + if not env_user_agent: + log.warning( + "USER_AGENT environment variable not set, " + "consider setting it to identify your requests." 
+ ) + return "DefaultLangchainUserAgent" + return env_user_agent