mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-31 02:11:09 +00:00
parent
8e3e532e7d
commit
8a71f1b41b
294
docs/docs/integrations/document_loaders/langsmith.ipynb
Normal file
294
docs/docs/integrations/document_loaders/langsmith.ipynb
Normal file
@ -0,0 +1,294 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "raw",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"---\n",
|
||||
"sidebar_label: LangSmith\n",
|
||||
"---"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# LangSmithLoader\n",
|
||||
"\n",
|
||||
"This notebook provides a quick overview for getting started with the LangSmith [document loader](https://python.langchain.com/v0.2/docs/concepts/#document-loaders). For detailed documentation of all LangSmithLoader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_core.document_loaders.langsmith.LangSmithLoader.html).\n",
|
||||
"\n",
|
||||
"## Overview\n",
|
||||
"### Integration details\n",
|
||||
"\n",
|
||||
"| Class | Package | Local | Serializable | JS support|\n",
|
||||
"| :--- | :--- | :---: | :---: | :---: |\n",
|
||||
"| [LangSmithLoader](https://api.python.langchain.com/en/latest/document_loaders/langchain_core.document_loaders.langsmith.LangSmithLoader.html) | [langchain-core](https://api.python.langchain.com/en/latest/core_api_reference.html) | ❌ | ❌ | ❌ | \n",
|
||||
"\n",
|
||||
"### Loader features\n",
|
||||
"| Source | Lazy loading | Native async\n",
|
||||
"| :---: | :---: | :---: | \n",
|
||||
"| LangSmithLoader | ✅ | ❌ | \n",
|
||||
"\n",
|
||||
"## Setup\n",
|
||||
"\n",
|
||||
"To access the LangSmith document loader you'll need to install `langchain-core`, create a [LangSmith](https://langsmith.com) account and get an API key.\n",
|
||||
"\n",
|
||||
"### Credentials\n",
|
||||
"\n",
|
||||
"Sign up at https://langsmith.com and generate an API key. Once you've done this set the LANGSMITH_API_KEY environment variable:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import getpass\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"if not os.environ.get(\"LANGSMITH_API_KEY\"):\n",
|
||||
" os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"If you want to get automated best-in-class tracing, you can also turn on LangSmith tracing:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Installation\n",
|
||||
"\n",
|
||||
"Install `langchain-core`:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%pip install -qU langchain-core"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Clone example dataset\n",
|
||||
"\n",
|
||||
"For this example, we'll clone and load a public LangSmith dataset. Cloning creates a copy of this dataset on our personal LangSmith account. You can only load datasets that you have a personal copy of."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langsmith import Client as LangSmithClient\n",
|
||||
"\n",
|
||||
"ls_client = LangSmithClient()\n",
|
||||
"\n",
|
||||
"dataset_name = \"LangSmith Few Shot Datasets Notebook\"\n",
|
||||
"dataset_public_url = (\n",
|
||||
" \"https://smith.langchain.com/public/55658626-124a-4223-af45-07fb774a6212/d\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"ls_client.clone_public_dataset(dataset_public_url)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Initialization\n",
|
||||
"\n",
|
||||
"Now we can instantiate our document loader and load documents:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain_core.document_loaders import LangSmithLoader\n",
|
||||
"\n",
|
||||
"loader = LangSmithLoader(\n",
|
||||
" dataset_name=dataset_name,\n",
|
||||
" content_key=\"question\",\n",
|
||||
" limit=50,\n",
|
||||
" # format_content=...,\n",
|
||||
" # ...\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Show me an example using Weaviate, but customizing the vectorStoreRetriever to return the top 10 k nearest neighbors. \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs = loader.load()\n",
|
||||
"print(docs[0].page_content)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'question': 'Show me an example using Weaviate, but customizing the vectorStoreRetriever to return the top 10 k nearest neighbors. '}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(docs[0].metadata[\"inputs\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'answer': 'To customize the Weaviate client and return the top 10 k nearest neighbors, you can utilize the `as_retriever` method with the appropriate parameters. Here\\'s how you can achieve this:\\n\\n```python\\n# Assuming you have imported the necessary modules and classes\\n\\n# Create the Weaviate client\\nclient = weaviate.Client(url=os.environ[\"WEAVIATE_URL\"], ...)\\n\\n# Initialize the Weaviate wrapper\\nweaviate = Weaviate(client, index_name, text_key)\\n\\n# Customize the client to return top 10 k nearest neighbors using as_retriever\\ncustom_retriever = weaviate.as_retriever(\\n search_type=\"similarity\",\\n search_kwargs={\\n \\'k\\': 10 # Customize the value of k as needed\\n }\\n)\\n\\n# Now you can use the custom_retriever to perform searches\\nresults = custom_retriever.search(query, ...)\\n```'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(docs[0].metadata[\"outputs\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['dataset_id',\n",
|
||||
" 'inputs',\n",
|
||||
" 'outputs',\n",
|
||||
" 'metadata',\n",
|
||||
" 'id',\n",
|
||||
" 'created_at',\n",
|
||||
" 'modified_at',\n",
|
||||
" 'runs',\n",
|
||||
" 'source_run_id']"
|
||||
]
|
||||
},
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"list(docs[0].metadata.keys())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Lazy Load"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"10"
|
||||
]
|
||||
},
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"page = []\n",
|
||||
"for doc in loader.lazy_load():\n",
|
||||
" page.append(doc)\n",
|
||||
" if len(page) >= 10:\n",
|
||||
" # do some paged operation, e.g.\n",
|
||||
" # index.upsert(page)\n",
|
||||
" # page = []\n",
|
||||
" break\n",
|
||||
"len(page)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## API reference\n",
|
||||
"\n",
|
||||
"For detailed documentation of all LangSmithLoader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_core.document_loaders.langsmith.LangSmithLoader.html"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "poetry-venv-311",
|
||||
"language": "python",
|
||||
"name": "poetry-venv-311"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
@ -1,5 +1,6 @@
|
||||
from langchain_core.document_loaders.base import BaseBlobParser, BaseLoader
|
||||
from langchain_core.document_loaders.blob_loaders import Blob, BlobLoader, PathLike
|
||||
from langchain_core.document_loaders.langsmith import LangSmithLoader
|
||||
|
||||
__all__ = [
|
||||
"BaseBlobParser",
|
||||
@ -7,4 +8,5 @@ __all__ = [
|
||||
"Blob",
|
||||
"BlobLoader",
|
||||
"PathLike",
|
||||
"LangSmithLoader",
|
||||
]
|
||||
|
128
libs/core/langchain_core/document_loaders/langsmith.py
Normal file
128
libs/core/langchain_core/document_loaders/langsmith.py
Normal file
@ -0,0 +1,128 @@
|
||||
import datetime
|
||||
import json
|
||||
import uuid
|
||||
from typing import Any, Callable, Iterator, Optional, Sequence, Union
|
||||
|
||||
from langsmith import Client as LangSmithClient
|
||||
|
||||
from langchain_core.document_loaders.base import BaseLoader
|
||||
from langchain_core.documents import Document
|
||||
|
||||
|
||||
class LangSmithLoader(BaseLoader):
    """Load LangSmith Dataset examples as Documents.

    Loads the example inputs as the Document page content and places the entire example
    into the Document metadata. This allows you to easily create few-shot example
    retrievers from the loaded documents.

    .. dropdown:: Lazy load

        .. code-block:: python

            from langchain_core.document_loaders import LangSmithLoader

            loader = LangSmithLoader(dataset_id="...", limit=100)
            docs = []
            for doc in loader.lazy_load():
                docs.append(doc)

        .. code-block:: pycon

            # -> [Document("...", metadata={"inputs": {...}, "outputs": {...}, ...}), ...]

    .. versionadded:: 0.2.34
    """  # noqa: E501

    def __init__(
        self,
        *,
        dataset_id: Optional[Union[uuid.UUID, str]] = None,
        dataset_name: Optional[str] = None,
        example_ids: Optional[Sequence[Union[uuid.UUID, str]]] = None,
        as_of: Optional[Union[datetime.datetime, str]] = None,
        splits: Optional[Sequence[str]] = None,
        inline_s3_urls: bool = True,
        offset: int = 0,
        limit: Optional[int] = None,
        metadata: Optional[dict] = None,
        filter: Optional[str] = None,
        content_key: str = "",
        format_content: Optional[Callable[..., str]] = None,
        client: Optional[LangSmithClient] = None,
        **client_kwargs: Any,
    ) -> None:
        """
        Args:
            dataset_id: The ID of the dataset to filter by. Defaults to None.
            dataset_name: The name of the dataset to filter by. Defaults to None.
            content_key: The inputs key to set as Document page content. ``"."`` characters
                are interpreted as nested keys. E.g. ``content_key="first.second"`` will
                result in
                ``Document(page_content=format_content(example.inputs["first"]["second"]))``
            format_content: Function for converting the content extracted from the example
                inputs into a string. Defaults to JSON-encoding the contents.
            example_ids: The IDs of the examples to filter by. Defaults to None.
            as_of: The dataset version tag OR
                timestamp to retrieve the examples as of.
                Response examples will only be those that were present at the time
                of the tagged (or timestamped) version.
            splits: A list of dataset splits, which are
                divisions of your dataset such as 'train', 'test', or 'validation'.
                Returns examples only from the specified splits.
            inline_s3_urls: Whether to inline S3 URLs. Defaults to True.
            offset: The offset to start from. Defaults to 0.
            limit: The maximum number of examples to return.
            filter: A structured filter string to apply to the examples.
            client: LangSmith Client. If not provided will be initialized from below args.
            client_kwargs: Keyword args to pass to LangSmith client init. Should only be
                specified if ``client`` isn't.

        Raises:
            ValueError: If both ``client`` and ``client_kwargs`` are specified.
        """  # noqa: E501
        if client and client_kwargs:
            raise ValueError(
                "Expected exactly one of 'client' or 'client_kwargs' to be "
                "specified, received both."
            )
        self._client = client or LangSmithClient(**client_kwargs)
        # str.split already returns a list; an empty content_key means "use all inputs".
        self.content_key = content_key.split(".") if content_key else []
        self.format_content = format_content or _stringify
        self.dataset_id = dataset_id
        self.dataset_name = dataset_name
        self.example_ids = example_ids
        self.as_of = as_of
        self.splits = splits
        self.inline_s3_urls = inline_s3_urls
        self.offset = offset
        self.limit = limit
        self.metadata = metadata
        self.filter = filter

    def lazy_load(self) -> Iterator[Document]:
        """Lazily yield one Document per example in the configured dataset."""
        for example in self._client.list_examples(
            dataset_id=self.dataset_id,
            dataset_name=self.dataset_name,
            example_ids=self.example_ids,
            as_of=self.as_of,
            splits=self.splits,
            inline_s3_urls=self.inline_s3_urls,
            offset=self.offset,
            limit=self.limit,
            metadata=self.metadata,
            filter=self.filter,
        ):
            # Drill into the nested inputs dict per content_key, e.g. ["first", "second"].
            content: Any = example.inputs
            for key in self.content_key:
                content = content[key]
            content_str = self.format_content(content)
            metadata = example.dict()
            # Stringify datetime and UUID types.
            for k in ("dataset_id", "created_at", "modified_at", "source_run_id", "id"):
                metadata[k] = str(metadata[k]) if metadata[k] else metadata[k]
            yield Document(content_str, metadata=metadata)
|
||||
|
||||
|
||||
def _stringify(x: Union[str, dict]) -> str:
|
||||
if isinstance(x, str):
|
||||
return x
|
||||
else:
|
||||
try:
|
||||
return json.dumps(x, indent=2)
|
||||
except Exception:
|
||||
return str(x)
|
@ -0,0 +1,58 @@
|
||||
import datetime
|
||||
import uuid
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from langsmith.schemas import Example
|
||||
|
||||
from langchain_core.document_loaders import LangSmithLoader
|
||||
from langchain_core.documents import Document
|
||||
|
||||
|
||||
def test_init() -> None:
    """Smoke test: the loader can be constructed from client kwargs alone."""
    LangSmithLoader(api_key="secret")
|
||||
|
||||
|
||||
# Three mock dataset examples sharing the same nested-inputs shape; only the
# leaf content and the output value differ between them.
EXAMPLES = [
    Example(
        inputs={"first": {"second": leaf}},
        outputs={"res": res},
        dataset_id=uuid.uuid4(),
        id=uuid.uuid4(),
        created_at=datetime.datetime.now(),
    )
    for leaf, res in [("foo", "a"), ("bar", "b"), ("baz", "c")]
]
|
||||
|
||||
|
||||
@patch("langsmith.Client.list_examples", MagicMock(return_value=iter(EXAMPLES)))
def test_lazy_load() -> None:
    """lazy_load resolves nested content via content_key and applies format_content."""
    loader = LangSmithLoader(
        api_key="dummy",
        dataset_id="mock",
        content_key="first.second",
        format_content=(lambda x: x.upper()),
    )
    expected = []
    for example in EXAMPLES:
        # The loader stringifies truthy non-dict metadata values (UUIDs, datetimes).
        metadata = {
            k: v if not v or isinstance(v, dict) else str(v)
            for k, v in example.dict().items()
        }
        expected.append(
            Document(example.inputs["first"]["second"].upper(), metadata=metadata)
        )
    # list(...) instead of a copy-only comprehension.
    assert expected == list(loader.lazy_load())
|
Loading…
Reference in New Issue
Block a user