mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-03 20:16:52 +00:00
community[minor]: add user agent for web scraping loaders (#22480)
**Description:** This PR adds a `USER_AGENT` environment variable to be used for web scraping. It creates a utility to get that user agent and uses it in the classes used for scraping in [this section of the documentation](https://python.langchain.com/v0.1/docs/use_cases/web_scraping/). Identifying your scraper is considered good etiquette; this PR aims to make that easier. **Issue:** `None` **Dependencies:** `None` **Twitter handle:** `None`
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
"""Web base loader class."""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import warnings
|
||||
@@ -9,11 +10,12 @@ import requests
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain_community.document_loaders.base import BaseLoader
|
||||
from langchain_community.utils.user_agent import get_user_agent
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
default_header_template = {
|
||||
"User-Agent": "",
|
||||
"User-Agent": get_user_agent(),
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*"
|
||||
";q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.5",
|
||||
|
Reference in New Issue
Block a user