From 8a71f1b41b83a4c8122ca53306fb09400ef5b537 Mon Sep 17 00:00:00 2001
From: Bagatur <22008038+baskaryan@users.noreply.github.com>
Date: Tue, 20 Aug 2024 10:22:14 -0700
Subject: [PATCH] core[minor]: add langsmith document loader (#25493)

needs tests
---
 .../document_loaders/langsmith.ipynb          | 294 ++++++++++++++++++
 .../document_loaders/__init__.py              |   2 +
 .../document_loaders/langsmith.py             | 128 ++++++++
 .../document_loaders/test_langsmith.py        |  58 ++++
 4 files changed, 482 insertions(+)
 create mode 100644 docs/docs/integrations/document_loaders/langsmith.ipynb
 create mode 100644 libs/core/langchain_core/document_loaders/langsmith.py
 create mode 100644 libs/core/tests/unit_tests/document_loaders/test_langsmith.py

diff --git a/docs/docs/integrations/document_loaders/langsmith.ipynb b/docs/docs/integrations/document_loaders/langsmith.ipynb
new file mode 100644
index 00000000000..98413d15620
--- /dev/null
+++ b/docs/docs/integrations/document_loaders/langsmith.ipynb
@@ -0,0 +1,294 @@
+{
+ "cells": [
+  {
+   "cell_type": "raw",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "sidebar_label: LangSmith\n",
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# LangSmithLoader\n",
+    "\n",
+    "This notebook provides a quick overview for getting started with the LangSmith [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all LangSmithLoader features and configurations, head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_core.document_loaders.langsmith.LangSmithLoader.html).\n",
+    "\n",
+    "## Overview\n",
+    "### Integration details\n",
+    "\n",
+    "| Class | Package | Local | Serializable | JS support |\n",
+    "| :--- | :--- | :---: | :---: | :---: |\n",
+    "| [LangSmithLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_core.document_loaders.langsmith.LangSmithLoader.html) | [langchain-core](https://api.python.langchain.com/en/latest/core_api_reference.html) | ❌ | ❌ | ❌ |\n",
+    "\n",
+    "### Loader features\n",
+    "| Source | Lazy loading | Native async |\n",
+    "| :---: | :---: | :---: |\n",
+    "| LangSmithLoader | ✅ | ❌ |\n",
+    "\n",
+    "## Setup\n",
+    "\n",
+    "To access the LangSmith document loader, you'll need to install `langchain-core`, create a [LangSmith](https://langsmith.com) account, and get an API key.\n",
+    "\n",
+    "### Credentials\n",
+    "\n",
+    "Sign up at https://langsmith.com and generate an API key. Once you've done this, set the LANGSMITH_API_KEY environment variable:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import getpass\n",
+    "import os\n",
+    "\n",
+    "if not os.environ.get(\"LANGSMITH_API_KEY\"):\n",
+    "    os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If you want automated, best-in-class tracing, you can also turn on LangSmith tracing:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Installation\n",
+    "\n",
+    "Install `langchain-core`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install -qU langchain-core"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Clone example dataset\n",
+    "\n",
+    "For this example, we'll clone and load a public LangSmith dataset. Cloning creates a copy of this dataset on our personal LangSmith account. You can only load datasets that you have a personal copy of."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langsmith import Client as LangSmithClient\n",
+    "\n",
+    "ls_client = LangSmithClient()\n",
+    "\n",
+    "dataset_name = \"LangSmith Few Shot Datasets Notebook\"\n",
+    "dataset_public_url = (\n",
+    "    \"https://smith.langchain.com/public/55658626-124a-4223-af45-07fb774a6212/d\"\n",
+    ")\n",
+    "\n",
+    "ls_client.clone_public_dataset(dataset_public_url)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Initialization\n",
+    "\n",
+    "Now we can instantiate our document loader and load documents:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_core.document_loaders import LangSmithLoader\n",
+    "\n",
+    "loader = LangSmithLoader(\n",
+    "    dataset_name=dataset_name,\n",
+    "    content_key=\"question\",\n",
+    "    limit=50,\n",
+    "    # format_content=...,\n",
+    "    # ...\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Show me an example using Weaviate, but customizing the vectorStoreRetriever to return the top 10 k nearest neighbors. \n"
+     ]
+    }
+   ],
+   "source": [
+    "docs = loader.load()\n",
+    "print(docs[0].page_content)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'question': 'Show me an example using Weaviate, but customizing the vectorStoreRetriever to return the top 10 k nearest neighbors. '}\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(docs[0].metadata[\"inputs\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'answer': 'To customize the Weaviate client and return the top 10 k nearest neighbors, you can utilize the `as_retriever` method with the appropriate parameters. Here\\'s how you can achieve this:\\n\\n```python\\n# Assuming you have imported the necessary modules and classes\\n\\n# Create the Weaviate client\\nclient = weaviate.Client(url=os.environ[\"WEAVIATE_URL\"], ...)\\n\\n# Initialize the Weaviate wrapper\\nweaviate = Weaviate(client, index_name, text_key)\\n\\n# Customize the client to return top 10 k nearest neighbors using as_retriever\\ncustom_retriever = weaviate.as_retriever(\\n    search_type=\"similarity\",\\n    search_kwargs={\\n        \\'k\\': 10  # Customize the value of k as needed\\n    }\\n)\\n\\n# Now you can use the custom_retriever to perform searches\\nresults = custom_retriever.search(query, ...)\\n```'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(docs[0].metadata[\"outputs\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['dataset_id',\n",
+       " 'inputs',\n",
+       " 'outputs',\n",
+       " 'metadata',\n",
+       " 'id',\n",
+       " 'created_at',\n",
+       " 'modified_at',\n",
+       " 'runs',\n",
+       " 'source_run_id']"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "list(docs[0].metadata.keys())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Lazy Load"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "10"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "page = []\n",
+    "for doc in loader.lazy_load():\n",
+    "    page.append(doc)\n",
+    "    if len(page) >= 10:\n",
+    "        # do some paged operation, e.g.\n",
+    "        # index.upsert(page)\n",
+    "        # page = []\n",
+    "        break\n",
+    "len(page)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## API reference\n",
+    "\n",
+    "For detailed documentation of all LangSmithLoader features and configurations, head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_core.document_loaders.langsmith.LangSmithLoader.html"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "poetry-venv-311",
+   "language": "python",
+   "name": "poetry-venv-311"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/libs/core/langchain_core/document_loaders/__init__.py b/libs/core/langchain_core/document_loaders/__init__.py
index 05a48a9be01..e8b6fde5be8 100644
--- a/libs/core/langchain_core/document_loaders/__init__.py
+++ b/libs/core/langchain_core/document_loaders/__init__.py
@@ -1,5 +1,6 @@
 from langchain_core.document_loaders.base import BaseBlobParser, BaseLoader
 from langchain_core.document_loaders.blob_loaders import Blob, BlobLoader, PathLike
+from langchain_core.document_loaders.langsmith import LangSmithLoader
 
 __all__ = [
     "BaseBlobParser",
@@ -7,4 +8,5 @@ __all__ = [
     "Blob",
     "BlobLoader",
     "PathLike",
+    "LangSmithLoader",
 ]
diff --git a/libs/core/langchain_core/document_loaders/langsmith.py b/libs/core/langchain_core/document_loaders/langsmith.py
new file mode 100644
index 00000000000..232da98ccf7
--- /dev/null
+++ b/libs/core/langchain_core/document_loaders/langsmith.py
@@ -0,0 +1,128 @@
+import datetime
+import json
+import uuid
+from typing import Any, Callable, Iterator, Optional, Sequence, Union
+
+from langsmith import Client as LangSmithClient
+
+from langchain_core.document_loaders.base import BaseLoader
+from langchain_core.documents import Document
+
+
+class LangSmithLoader(BaseLoader):
+    """Load LangSmith Dataset examples as Documents.
+
+    Loads the example inputs as the Document page content and places the entire example
+    into the Document metadata. This allows you to easily create few-shot example
+    retrievers from the loaded documents.
+
+    .. dropdown:: Lazy load
+
+        .. code-block:: python
+
+            from langchain_core.document_loaders import LangSmithLoader
+
+            loader = LangSmithLoader(dataset_id="...", limit=100)
+            docs = []
+            for doc in loader.lazy_load():
+                docs.append(doc)
+
+        .. code-block:: pycon
+
+            # -> [Document("...", metadata={"inputs": {...}, "outputs": {...}, ...}), ...]
+
+    .. versionadded:: 0.2.34
+    """  # noqa: E501
+
+    def __init__(
+        self,
+        *,
+        dataset_id: Optional[Union[uuid.UUID, str]] = None,
+        dataset_name: Optional[str] = None,
+        example_ids: Optional[Sequence[Union[uuid.UUID, str]]] = None,
+        as_of: Optional[Union[datetime.datetime, str]] = None,
+        splits: Optional[Sequence[str]] = None,
+        inline_s3_urls: bool = True,
+        offset: int = 0,
+        limit: Optional[int] = None,
+        metadata: Optional[dict] = None,
+        filter: Optional[str] = None,
+        content_key: str = "",
+        format_content: Optional[Callable[..., str]] = None,
+        client: Optional[LangSmithClient] = None,
+        **client_kwargs: Any,
+    ) -> None:
+        """
+        Args:
+            dataset_id: The ID of the dataset to filter by. Defaults to None.
+            dataset_name: The name of the dataset to filter by. Defaults to None.
+            content_key: The inputs key to set as Document page content. ``"."`` characters
+                are interpreted as nested keys. E.g. ``content_key="first.second"`` will
+                result in
+                ``Document(page_content=format_content(example.inputs["first"]["second"]))``
+            format_content: Function for converting the content extracted from the example
+                inputs into a string. Defaults to JSON-encoding the contents.
+            example_ids: The IDs of the examples to filter by. Defaults to None.
+            as_of: The dataset version tag OR timestamp to retrieve the examples as of.
+                Response examples will only be those that were present at the time
+                of the tagged (or timestamped) version.
+            splits: A list of dataset splits, which are divisions of your dataset
+                such as 'train', 'test', or 'validation'. Returns examples only from
+                the specified splits.
+            inline_s3_urls: Whether to inline S3 URLs. Defaults to True.
+            offset: The offset to start from. Defaults to 0.
+            limit: The maximum number of examples to return.
+            metadata: The metadata to filter examples by. Defaults to None.
+            filter: A structured filter string to apply to the examples.
+            client: LangSmith Client. If not provided, one will be initialized using ``client_kwargs``.
+            client_kwargs: Keyword args to pass to LangSmith client init. Should only be
+                specified if ``client`` isn't.
+        """  # noqa: E501
+        if client and client_kwargs:
+            raise ValueError("Must specify one of 'client' or 'client_kwargs', not both.")
+        self._client = client or LangSmithClient(**client_kwargs)
+        self.content_key = content_key.split(".") if content_key else []
+        self.format_content = format_content or _stringify
+        self.dataset_id = dataset_id
+        self.dataset_name = dataset_name
+        self.example_ids = example_ids
+        self.as_of = as_of
+        self.splits = splits
+        self.inline_s3_urls = inline_s3_urls
+        self.offset = offset
+        self.limit = limit
+        self.metadata = metadata
+        self.filter = filter
+
+    def lazy_load(self) -> Iterator[Document]:
+        for example in self._client.list_examples(
+            dataset_id=self.dataset_id,
+            dataset_name=self.dataset_name,
+            example_ids=self.example_ids,
+            as_of=self.as_of,
+            splits=self.splits,
+            inline_s3_urls=self.inline_s3_urls,
+            offset=self.offset,
+            limit=self.limit,
+            metadata=self.metadata,
+            filter=self.filter,
+        ):
+            content: Any = example.inputs
+            for key in self.content_key:
+                content = content[key]
+            content_str = self.format_content(content)
+            metadata = example.dict()
+            # Stringify datetime and UUID types.
+            for k in ("dataset_id", "created_at", "modified_at", "source_run_id", "id"):
+                metadata[k] = str(metadata[k]) if metadata[k] else metadata[k]
+            yield Document(content_str, metadata=metadata)
+
+
+def _stringify(x: Union[str, dict]) -> str:
+    if isinstance(x, str):
+        return x
+    else:
+        try:
+            return json.dumps(x, indent=2)
+        except Exception:
+            return str(x)
diff --git a/libs/core/tests/unit_tests/document_loaders/test_langsmith.py b/libs/core/tests/unit_tests/document_loaders/test_langsmith.py
new file mode 100644
index 00000000000..e754ab2d372
--- /dev/null
+++ b/libs/core/tests/unit_tests/document_loaders/test_langsmith.py
@@ -0,0 +1,58 @@
+import datetime
+import uuid
+from unittest.mock import MagicMock, patch
+
+from langsmith.schemas import Example
+
+from langchain_core.document_loaders import LangSmithLoader
+from langchain_core.documents import Document
+
+
+def test_init() -> None:
+    LangSmithLoader(api_key="secret")
+
+
+EXAMPLES = [
+    Example(
+        inputs={"first": {"second": "foo"}},
+        outputs={"res": "a"},
+        dataset_id=uuid.uuid4(),
+        id=uuid.uuid4(),
+        created_at=datetime.datetime.now(),
+    ),
+    Example(
+        inputs={"first": {"second": "bar"}},
+        outputs={"res": "b"},
+        dataset_id=uuid.uuid4(),
+        id=uuid.uuid4(),
+        created_at=datetime.datetime.now(),
+    ),
+    Example(
+        inputs={"first": {"second": "baz"}},
+        outputs={"res": "c"},
+        dataset_id=uuid.uuid4(),
+        id=uuid.uuid4(),
+        created_at=datetime.datetime.now(),
+    ),
+]
+
+
+@patch("langsmith.Client.list_examples", MagicMock(return_value=iter(EXAMPLES)))
+def test_lazy_load() -> None:
+    loader = LangSmithLoader(
+        api_key="dummy",
+        dataset_id="mock",
+        content_key="first.second",
+        format_content=(lambda x: x.upper()),
+    )
+    expected = []
+    for example in EXAMPLES:
+        metadata = {
+            k: v if not v or isinstance(v, dict) else str(v)
+            for k, v in example.dict().items()
+        }
+        expected.append(
+            Document(example.inputs["first"]["second"].upper(), metadata=metadata)
+        )
+    actual = list(loader.lazy_load())
+    assert expected == actual
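
---

A note for reviewers on the few-shot use case the class docstring calls out: because each loaded Document keeps the full example (inputs *and* outputs) in its metadata, the docs can go straight into a vector store and be retrieved as few-shot examples. A minimal sketch of that flow, not part of this patch, assuming the `langchain-openai` package for an embeddings model (any `Embeddings` implementation works) and the dataset cloned in the notebook above:

```python
from langchain_core.document_loaders import LangSmithLoader
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import OpenAIEmbeddings  # assumption: any embeddings model works

# Load example inputs as page_content; the full example rides along in metadata.
loader = LangSmithLoader(
    dataset_name="LangSmith Few Shot Datasets Notebook",  # dataset cloned above
    content_key="question",
)
docs = loader.load()

# Index the example inputs so semantically similar examples can be retrieved.
vector_store = InMemoryVectorStore.from_documents(docs, OpenAIEmbeddings())
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

# Retrieved documents carry the matching examples' inputs and outputs in
# metadata, ready to be formatted into a few-shot prompt.
for doc in retriever.invoke("How do I return the top 10 nearest neighbors?"):
    print(doc.metadata["inputs"], doc.metadata["outputs"])
```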