Compare commits

...

10 Commits

Author SHA1 Message Date
Bagatur
e94bd97b37 fmt 2024-08-15 10:02:18 -07:00
Bagatur
9557555f1f fmt 2024-08-15 09:35:54 -07:00
Bagatur
3b80e9d59c fmt 2024-08-14 20:26:15 -07:00
Bagatur
a6d65ac891 fmt 2024-08-14 20:19:21 -07:00
Bagatur
22e57eb5de fmt 2024-08-14 18:42:12 -07:00
Bagatur
af2c129f8a fmt 2024-08-14 16:15:57 -07:00
Bagatur
0c4b09afe3 Merge branch 'master' into bagatutr/langsmith_example_selector 2024-08-14 16:00:14 -07:00
Bagatur
b2e7f82460 fmt 2024-08-14 15:32:01 -07:00
Bagatur
97ff183483 fmt 2024-08-14 14:41:29 -07:00
Bagatur
1b3c16268b wip 2024-08-14 12:15:54 -07:00
5 changed files with 363 additions and 12 deletions

View File

@@ -3,12 +3,12 @@
.. currentmodule:: {{ module }}
.. autoclass:: {{ objname }}
.. NOTE:: {{objname}} implements the standard :py:class:`Runnable Interface <langchain_core.runnables.base.Runnable>`. 🏃
The :py:class:`Runnable Interface <langchain_core.runnables.base.Runnable>` has additional methods that are available on runnables, such as :py:meth:`with_types <langchain_core.runnables.base.Runnable.with_types>`, :py:meth:`with_retry <langchain_core.runnables.base.Runnable.with_retry>`, :py:meth:`assign <langchain_core.runnables.base.Runnable.assign>`, :py:meth:`bind <langchain_core.runnables.base.Runnable.bind>`, :py:meth:`get_graph <langchain_core.runnables.base.Runnable.get_graph>`, and more.
.. autoclass:: {{ objname }}
{% block attributes %}
{% if attributes %}
.. rubric:: {{ _('Attributes') }}

View File

@@ -4,6 +4,7 @@ This allows us to select examples that are most relevant to the input.
"""
from langchain_core.example_selectors.base import BaseExampleSelector
from langchain_core.example_selectors.langsmith import LangSmithExampleSelector
from langchain_core.example_selectors.length_based import (
LengthBasedExampleSelector,
)
@@ -19,4 +20,5 @@ __all__ = [
"MaxMarginalRelevanceExampleSelector",
"SemanticSimilarityExampleSelector",
"sorted_values",
"LangSmithExampleSelector",
]

View File

@@ -10,19 +10,21 @@ class BaseExampleSelector(ABC):
"""Interface for selecting examples to include in prompts."""
@abstractmethod
def add_example(self, example: Dict[str, str]) -> Any:
def add_example(self, example: Dict[str, Any]) -> Any:
"""Add new example to store.
Args:
example: A dictionary with keys as input variables
and values as their values."""
example: A dict that maps input and output variables to their example
values.
"""
async def aadd_example(self, example: Dict[str, str]) -> Any:
async def aadd_example(self, example: Dict[str, Any]) -> Any:
"""Async add new example to store.
Args:
example: A dictionary with keys as input variables
and values as their values."""
example: A dict that maps input and output variables to their example
values.
"""
return await run_in_executor(None, self.add_example, example)
@@ -31,14 +33,14 @@ class BaseExampleSelector(ABC):
"""Select which examples to use based on the inputs.
Args:
input_variables: A dictionary with keys as input variables
and values as their values."""
input_variables: A dict that maps input variables to their values.
"""
async def aselect_examples(self, input_variables: Dict[str, str]) -> List[dict]:
"""Async select which examples to use based on the inputs.
Args:
input_variables: A dictionary with keys as input variables
and values as their values."""
input_variables: A dict that maps input variables to their values.
"""
return await run_in_executor(None, self.select_examples, input_variables)

View File

@@ -0,0 +1,346 @@
"""Select examples using a LangSmith few-shot index."""
import json
from typing import Any, Dict, List, Optional, Type, Union
from langsmith import Client
from langsmith import schemas as ls_schemas
from langsmith import utils as ls_utils
from langchain_core._api import beta
from langchain_core.example_selectors.base import BaseExampleSelector
from langchain_core.runnables import Runnable, RunnableConfig
from langchain_core.utils.function_calling import convert_to_openai_function
@beta()
class LangSmithExampleSelector(
    BaseExampleSelector, Runnable[Dict[str, Any], Dict[str, Any]]
):
    """Select examples using a LangSmith few-shot index.

    Head to the LangSmith docs for more on how dataset indexing works:
    `LangSmith docs <https://docs.smith.langchain.com/how_to_guides/datasets/index_datasets_for_dynamic_few_shot_example_selection>`_.

    Note, initializing the ``LangSmithExampleSelector`` does **not** create a dataset.
    This must be done explicitly, either outside the example selector or using
    the ``LangSmithExampleSelector(...).create_dataset(...)`` method.

    Args:
        k: How many examples to return on invocation.
        dataset_name: The name of the dataset of examples.
        dataset_id: The ID of the dataset of examples. Must specify one of
            dataset_name or dataset_id. If both are specified they must correspond
            to the same dataset.
        client: ``langsmith.Client``. If None, then ``client_kwargs`` will be used
            to initialize a new ``langsmith.Client``.
        client_kwargs: If ``client`` isn't specified these keyword args will be
            used to initialize a new ``langsmith.Client``.

    .. dropdown:: Index creation

        .. code-block:: python

            from langchain_core.example_selectors import LangSmithExampleSelector

            examples = [
                {"inputs": {"question": "..."}, "outputs": {"answer": "..."}},
                ...
            ]

            example_selector = LangSmithExampleSelector(
                k=4,
                dataset_name="foo_bar_task_few_shot_examples",
            )

            # Create the dataset.
            example_selector.create_dataset(...)

            # Populate the dataset.
            example_selector.add_examples(examples)

    .. dropdown:: Retrieving few shot examples

        .. code-block:: python

            from langchain_core.example_selectors import LangSmithExampleSelector
            from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
            from langchain_core.prompts import ChatPromptTemplate

            from langchain.chat_models import init_chat_model

            example_selector = LangSmithExampleSelector(
                k=4,
                dataset_name="foo_bar_task_few_shot_examples",
            )

            instructions = "..."

            def construct_prompt(input_: dict) -> list:
                examples = []
                for ex in input_["examples"]:
                    examples.extend([
                        HumanMessage(ex["inputs"]["question"], name="example_user"),
                        AIMessage(ex["outputs"]["answer"], name="example_assistant"),
                    ])
                return [
                    SystemMessage(instructions),
                    *examples,
                    HumanMessage(input_["question"]),
                ]

            llm = init_chat_model("gpt-4o", temperature=0)
            chain = example_selector | construct_prompt | llm
            chain.invoke({"question": "..."})
    """  # noqa: E501

    def __init__(
        self,
        *,
        k: int,
        dataset_name: Optional[str] = None,
        dataset_id: Optional[str] = None,
        client: Optional[Client] = None,
        **client_kwargs: Any,
    ) -> None:
        if client_kwargs and client:
            # A pre-built client and kwargs for building one are mutually
            # exclusive; specifying NEITHER is fine (a default Client is made).
            raise ValueError(
                "Must specify at most one of `client` and `client_kwargs`,"
                f" received both:\n{client=}\n\n{client_kwargs=}."
            )
        if not (dataset_name or dataset_id):
            # Fail fast: every operation on this selector needs a dataset, and
            # the documented contract requires one of these two identifiers.
            raise ValueError(
                "Must specify at least one of `dataset_name` and `dataset_id`."
            )
        self.k = k
        self.dataset_name = dataset_name
        self.dataset_id = dataset_id
        self._client = client or Client(**client_kwargs)

    def invoke(
        self,
        input: Dict[str, Any],
        config: Optional[RunnableConfig] = None,
        **kwargs: Any,
    ) -> Dict[str, Any]:
        """Retrieve examples from the dataset that are most relevant to the input.

        Args:
            input: A dictionary of input variables.
            config: A config to use when invoking the Runnable.
                The config supports standard keys like 'tags', 'metadata' for tracing
                purposes, 'max_concurrency' for controlling how much work to do
                in parallel, and other keys. Please refer to the
                :class:`~langchain_core.runnables.config.RunnableConfig` for more
                details.
            kwargs: Additional keyword arguments are passed through to
                ``select_examples()``.

        Returns:
            A dictionary with the inputs plus an "examples" key which contains a list
            of the retrieved examples. Each example is a dictionary with top-level
            "inputs" and "outputs" keys which contain a mapping of the input
            and output variables to their example values.
        """
        # _call_with_config wires select_examples into the tracing/callback
        # machinery of the Runnable interface.
        return {
            **input,
            "examples": self._call_with_config(
                self.select_examples,  # type: ignore[arg-type]
                input,
                config,
                **kwargs,
            ),
        }

    async def ainvoke(
        self,
        input: Dict[str, Any],
        config: Optional[RunnableConfig] = None,
        **kwargs: Any,
    ) -> Dict[str, Any]:
        """Retrieve examples from the dataset that are most relevant to the input.

        Args:
            input: A dictionary of input variables.
            config: A config to use when invoking the Runnable.
                The config supports standard keys like 'tags', 'metadata' for tracing
                purposes, 'max_concurrency' for controlling how much work to do
                in parallel, and other keys. Please refer to the
                :class:`~langchain_core.runnables.config.RunnableConfig` for more
                details.
            kwargs: Additional keyword arguments are passed through to
                ``select_examples()``.

        Returns:
            A dictionary with the inputs plus an "examples" key which contains a list
            of the retrieved examples. Each example is a dictionary with top-level
            "inputs" and "outputs" keys which contain a mapping of the input
            and output variables to their example values.
        """
        return {
            **input,
            "examples": await self._acall_with_config(
                self.aselect_examples,  # type: ignore[arg-type]
                input,
                config,
                **kwargs,
            ),
        }

    def select_examples(self, input_variables: Dict[str, str]) -> List[Dict[str, Any]]:
        """Select which examples to use based on the inputs.

        Args:
            input_variables: A dict that maps input variables to their values.

        Returns:
            A list of examples. Each example is a dictionary with a top level "inputs"
            and "outputs" key. The values of these are dictionaries which map the
            input and output variables to their example values.
        """
        search_req_json = json.dumps({"inputs": input_variables, "limit": self.k})
        dataset_id = self._get_dataset_id()
        few_shot_resp = self._client.request_with_retries(
            "POST",
            f"/datasets/{dataset_id}/search",
            headers={**self._client._headers, "Content-Type": "application/json"},
            data=search_req_json,
        )
        ls_utils.raise_for_status_with_text(few_shot_resp)
        return few_shot_resp.json()["examples"]

    def add_examples(self, examples: List[Dict[str, Dict]]) -> Any:
        """Add new examples to store.

        Args:
            examples: A list of dicts. Each dict must have a top-level "inputs" key
                that contains the inputs for the example and an "outputs" key that
                contains the outputs for the example.

        Returns:
            None

        Raises:
            ValueError: if dataset with (self.dataset_name, self.dataset_id) does not
                exist yet.
        """
        if not self.dataset_exists():
            # Only mention the identifiers that were actually configured; the
            # f-strings are built conditionally so "dataset_id=None" never
            # appears in the error message.
            identifiers = ", ".join(
                arg
                for arg in (
                    f"dataset_name={self.dataset_name}" if self.dataset_name else "",
                    f"dataset_id={self.dataset_id}" if self.dataset_id else "",
                )
                if arg
            )
            raise ValueError(
                f"Dataset with {identifiers} does not exist yet."
                f" Please run `LangSmithExampleSelector(...).create_dataset(...)` to"
                f" create this dataset and then try adding examples."
            )
        self._client.create_examples(
            dataset_name=self.dataset_name,
            dataset_id=self.dataset_id,
            inputs=[e["inputs"] for e in examples],
            outputs=[e["outputs"] for e in examples],
        )

    def add_example(self, example: Dict[str, Dict]) -> Any:
        """Add new example to store.

        Args:
            example: A dict that must have a top-level "inputs" key
                that contains the inputs for the example and an "outputs" key that
                contains the outputs for the example.
        """
        return self.add_examples([example])

    def create_dataset(
        self,
        input_schema: Union[Dict, Type],
        output_schema: Union[Dict, Type],
        *,
        description: Optional[str] = None,
    ) -> str:
        """Create a dataset to index examples into and retrieve examples from.

        Args:
            input_schema: The expected schema for all example inputs, passed in as a
                JSON Schema or a TypedDict class.
            output_schema: The expected schema for all example outputs, passed in as a
                JSON Schema or a TypedDict class.
            description: An optional description for the dataset.

        Returns:
            String dataset ID of the newly created dataset. If
            ``LangSmithExampleSelector.dataset_id`` is set, this will be the ID that's
            returned.

        Raises:
            ValueError: If ``LangSmithExampleSelector.dataset_name`` isn't set.
        """
        if not self.dataset_name:
            raise ValueError(
                "`LangSmithExampleSelector.dataset_name` must be set to be able to "
                "create a dataset. Please initialize a new example selector with "
                "init arg `dataset_name` passed in."
            )
        dataset_to_create_json = {
            "name": self.dataset_name,
            "description": description or "Dataset of indexed few-shot examples.",
            "data_type": "kv",
            "inputs_schema_definition": _convert_to_json_schema(input_schema),
            "outputs_schema_definition": _convert_to_json_schema(output_schema),
            "id": self.dataset_id,
        }
        # Create dataset.
        headers = {**self._client._headers, "Content-Type": "application/json"}
        resp = self._client.request_with_retries(
            "POST",
            "/datasets",
            headers=headers,
            data=json.dumps(dataset_to_create_json),
        )
        ls_utils.raise_for_status_with_text(resp)
        dataset = ls_schemas.Dataset(
            **resp.json(),
            _host_url=self._client._host_url,
            _tenant_id=self._client._get_optional_tenant_id(),
        )
        dataset_id = str(dataset.id)
        # Turn on dataset indexing so the search endpoint can serve it.
        resp = self._client.request_with_retries(
            "POST",
            f"/datasets/{dataset_id}/index",
            headers=headers,
            data=json.dumps({"tag": "latest"}),
        )
        ls_utils.raise_for_status_with_text(resp)
        return dataset_id

    def dataset_exists(self) -> bool:
        """Returns True if the configured dataset already exists, otherwise False."""
        return self._client.has_dataset(
            dataset_name=self.dataset_name, dataset_id=self.dataset_id
        )

    def _get_dataset_id(self) -> str:
        # Prefer the explicit ID; otherwise resolve the name via the API.
        if self.dataset_id:
            return self.dataset_id
        else:
            dataset = self._client.read_dataset(dataset_name=self.dataset_name)
            return str(dataset.id)
def _convert_to_json_schema(schema: Union[Dict, Type]) -> Dict:
    """Convert a JSON Schema dict or TypedDict class into a JSON Schema dict."""
    # TODO: Flip so that there's a generic convert_to_json_schema function that
    # convert_to_openai_function uses.
    oai_fn = convert_to_openai_function(schema)
    json_schema = {
        "title": oai_fn["name"],
        "description": oai_fn["description"],
    }
    # Parameter fields are merged last, matching the original spread order.
    json_schema.update(oai_fn["parameters"])
    return json_schema

View File

@@ -2,6 +2,7 @@ from langchain_core.example_selectors import __all__
EXPECTED_ALL = [
"BaseExampleSelector",
"LangSmithExampleSelector",
"LengthBasedExampleSelector",
"MaxMarginalRelevanceExampleSelector",
"SemanticSimilarityExampleSelector",