From b2b835cb36b37c4b5abf9ada6464c9693e0c7955 Mon Sep 17 00:00:00 2001 From: Rostyslav Borovyk Date: Fri, 15 Aug 2025 17:46:26 +0300 Subject: [PATCH] docs(docs): add Oxylabs document loader (#32429) Thank you for contributing to LangChain! Follow these steps to mark your pull request as ready for review. **If any of these steps are not completed, your PR will not be considered for review.** - [x] **PR title**: Follows the format: {TYPE}({SCOPE}): {DESCRIPTION} - Examples: - feat(core): add multi-tenant support - fix(cli): resolve flag parsing error - docs(openai): update API usage examples - Allowed `{TYPE}` values: - feat, fix, docs, style, refactor, perf, test, build, ci, chore, revert, release - Allowed `{SCOPE}` values (optional): - core, cli, langchain, standard-tests, docs, anthropic, chroma, deepseek, exa, fireworks, groq, huggingface, mistralai, nomic, ollama, openai, perplexity, prompty, qdrant, xai - Note: the `{DESCRIPTION}` must not start with an uppercase letter. - Once you've written the title, please delete this checklist item; do not include it in the PR. - [x] **PR message**: ***Delete this entire checklist*** and replace with - **Description:** a description of the change. Include a [closing keyword](https://docs.github.com/en/issues/tracking-your-work-with-issues/using-issues/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword) if applicable to a relevant issue. - **Issue:** the issue # it fixes, if applicable (e.g. Fixes #123) - **Dependencies:** any dependencies required for this change - **Twitter handle:** if your PR gets announced, and you'd like a mention, we'll gladly shout you out! - [x] **Add tests and docs**: If you're adding a new integration, you must include: 1. A test for the integration, preferably unit tests that do not rely on network access, 2. An example notebook showing its use. It lives in `docs/docs/integrations` directory. 
- [x] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. **We will not consider a PR unless these three are passing in CI.** See [contribution guidelines](https://python.langchain.com/docs/contributing/) for more. Additional guidelines: - Make sure optional dependencies are imported within a function. - Please do not add dependencies to `pyproject.toml` files (even optional ones) unless they are **required** for unit tests. - Most PRs should not touch more than one package. - Changes should be backwards compatible. --------- Co-authored-by: Mason Daugherty --- .../document_loaders/oxylabs.ipynb | 334 ++++++++++++++++++ docs/src/theme/FeatureTables.js | 7 + 2 files changed, 341 insertions(+) create mode 100644 docs/docs/integrations/document_loaders/oxylabs.ipynb diff --git a/docs/docs/integrations/document_loaders/oxylabs.ipynb b/docs/docs/integrations/document_loaders/oxylabs.ipynb new file mode 100644 index 00000000000..b23102dd43b --- /dev/null +++ b/docs/docs/integrations/document_loaders/oxylabs.ipynb @@ -0,0 +1,334 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Oxylabs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[Oxylabs](https://oxylabs.io/) is a web intelligence collection platform that enables companies worldwide to unlock data-driven insights.\n", + "\n", + "## Overview\n", + "\n", + "The Oxylabs document loader allows you to load data from search engines, e-commerce sites, travel platforms, and any other website. It supports geolocation, browser rendering, data parsing, multiple user agents and many more parameters. 
Check out [Oxylabs documentation](https://developers.oxylabs.io/scraping-solutions/web-scraper-api) for more information.\n", + "\n", + "\n", + "### Integration details\n", + "\n", + "| Class | Package | Local | Serializable | Pricing |\n", + "|:--------------|:------------------------------------------------------------------|:-----:|:------------:|:-----------------------------:|\n", + "| OxylabsLoader | [langchain-oxylabs](https://github.com/oxylabs/langchain-oxylabs) | ✅ | ❌ | Free 5,000 results for 1 week |\n", + "\n", + "### Loader features\n", + "| Document Lazy Loading |\n", + "|:---------------------:|\n", + "| ✅ |\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install the required dependencies.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%pip install -U langchain-oxylabs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Credentials\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set up the proper API keys and environment variables.\n", + "Create your API user credentials: Sign up for a free trial or purchase the product\n", + "in the [Oxylabs dashboard](https://dashboard.oxylabs.io/en/registration)\n", + "to create your API user credentials (OXYLABS_USERNAME and OXYLABS_PASSWORD)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import getpass\n", + "import os\n", + "\n", + "os.environ[\"OXYLABS_USERNAME\"] = getpass.getpass(\"Enter your Oxylabs username: \")\n", + "os.environ[\"OXYLABS_PASSWORD\"] = getpass.getpass(\"Enter your Oxylabs password: \")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialization" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-06T10:57:51.630011Z", + "start_time": "2025-08-06T10:57:51.623814Z" + } + }, + "outputs": [], + "source": [ + "from langchain_oxylabs import OxylabsLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-06T10:57:53.685413Z", + "start_time": "2025-08-06T10:57:53.628859Z" + } + }, + "outputs": [], + "source": [ + "loader = OxylabsLoader(\n", + " urls=[\n", + " \"https://sandbox.oxylabs.io/products/1\",\n", + " \"https://sandbox.oxylabs.io/products/2\",\n", + " ],\n", + " params={\"markdown\": True},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "## Load" + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-06T10:59:51.487327Z", + "start_time": "2025-08-06T10:59:48.592743Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2751\n", + "[![](data:image/svg+xml...)![logo](data:image/gif;base64...)![logo](/_next/image?url=%2F_next%2Fstatic%2Fmedia%2FnavLogo.a8764883.png&w=750&q=75)](/)\n", + "\n", + "Game platforms:\n", + "\n", + "* **All**\n", + "\n", + "* [Nintendo platform](/products/category/nintendo)\n", + "\n", + "+ wii\n", + "+ wii-u\n", + "+ nintendo-64\n", + "+ switch\n", + "+ gamecube\n", + "+ game-boy-advance\n", + "+ 3ds\n", + "+ ds\n", + "\n", + "* [Xbox platform](/products/category/xbox-platform)\n", + "\n", + 
"* **Dreamcast**\n", + "\n", + "* [Playstation platform](/products/category/playstation-platform)\n", + "\n", + "* **Pc**\n", + "\n", + "* **Stadia**\n", + "\n", + "Go Back\n", + "\n", + "Note!This is a sandbox website used for web scraping. Information listed in this website does not have any real meaning and should not be associated with the actual products.\n", + "\n", + "![The Legend of Zelda: Ocarina of Time](data:image/gif;base64...)![The Legend of Zelda: Ocarina of Time](/assets/action-adventure.svg)\n", + "\n", + "## The Legend of Zelda: Ocarina of Time\n", + "\n", + "**Developer:** Nintendo**Platform:****Type:** singleplayer\n", + "\n", + "As a young boy, Link is tricked by Ganondorf, the King of the Gerudo Thieves. The evil human uses Link to g\n", + "5542\n", + "[![](data:image/svg+xml...)![logo](data:image/gif;base64...)![logo](/_next/image?url=%2F_next%2Fstatic%2Fmedia%2FnavLogo.a8764883.png&w=750&q=75)](/)\n", + "\n", + "Game platforms:\n", + "\n", + "* **All**\n", + "\n", + "* [Nintendo platform](/products/category/nintendo)\n", + "\n", + "+ wii\n", + "+ wii-u\n", + "+ nintendo-64\n", + "+ switch\n", + "+ gamecube\n", + "+ game-boy-advance\n", + "+ 3ds\n", + "+ ds\n", + "\n", + "* [Xbox platform](/products/category/xbox-platform)\n", + "\n", + "* **Dreamcast**\n", + "\n", + "* [Playstation platform](/products/category/playstation-platform)\n", + "\n", + "* **Pc**\n", + "\n", + "* **Stadia**\n", + "\n", + "Go Back\n", + "\n", + "Note!This is a sandbox website used for web scraping. Information listed in this website does not have any real meaning and should not be associated with the actual products.\n", + "\n", + "![Super Mario Galaxy](data:image/gif;base64...)![Super Mario Galaxy](/assets/action.svg)\n", + "\n", + "## Super Mario Galaxy\n", + "\n", + "**Developer:** Nintendo**Platform:****Type:** singleplayer\n", + "\n", + "[Metacritic's 2007 Wii Game of the Year] The ultimate Nintendo hero is taking the ultimate step ... out into space. 
Join Mario as he ushers in a new era of video games, de\n" + ] + } + ], + "source": [ + "for document in loader.load():\n", + " print(document.page_content[:1000])" + ] + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "## Lazy Load" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "for document in loader.lazy_load():\n", + " print(document.page_content[:1000])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Advanced examples\n", + "\n", + "The following examples show the usage of `OxylabsLoader` with geolocation, currency, pagination and user agent parameters for Amazon Search and Google Search sources." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-06T11:04:19.901122Z", + "start_time": "2025-08-06T11:04:19.838933Z" + } + }, + "outputs": [], + "source": [ + "loader = OxylabsLoader(\n", + " queries=[\"gaming headset\", \"gaming chair\", \"computer mouse\"],\n", + " params={\n", + " \"source\": \"amazon_search\",\n", + " \"parse\": True,\n", + " \"geo_location\": \"DE\",\n", + " \"currency\": \"EUR\",\n", + " \"pages\": 3,\n", + " },\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "ExecuteTime": { + "end_time": "2025-08-06T11:07:17.648142Z", + "start_time": "2025-08-06T11:07:17.595629Z" + } + }, + "outputs": [], + "source": [ + "loader = OxylabsLoader(\n", + " queries=[\"europe gdp per capita\", \"us gdp per capita\"],\n", + " params={\n", + " \"source\": \"google_search\",\n", + " \"parse\": True,\n", + " \"geo_location\": \"Paris, France\",\n", + " \"user_agent_type\": \"mobile\",\n", + " },\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## API reference\n", + "\n", + "[More information about this package.](https://github.com/oxylabs/langchain-oxylabs)" + ] + } + ], + "metadata": { + "kernelspec": { + 
"display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/src/theme/FeatureTables.js b/docs/src/theme/FeatureTables.js index a8e579f56d3..fd2ea36638a 100644 --- a/docs/src/theme/FeatureTables.js +++ b/docs/src/theme/FeatureTables.js @@ -856,6 +856,13 @@ const FEATURE_TABLES = { source: "Web interaction and structured data extraction from any web page using an AgentQL query or a Natural Language prompt", api: "API", apiLink: "https://python.langchain.com/docs/integrations/document_loaders/agentql/" + }, + { + name: "Oxylabs", + link: "oxylabs", + source: "Web intelligence platform enabling the access to various data sources.", + api: "API", + apiLink: "https://github.com/oxylabs/langchain-oxylabs" } ] },