core: Add async methods to BaseExampleSelector and SemanticSimilarityExampleSelector (#19399)

A few-shot prompt template may use a `SemanticSimilarityExampleSelector`,
which in turn uses a `VectorStore` that performs I/O operations.
So, to work correctly on the event loop, we need:
* async methods for the `VectorStore` (OK)
* async methods for the `SemanticSimilarityExampleSelector` (this PR)
* async methods for `BasePromptTemplate` and `BaseChatPromptTemplate`
(future work)
This commit is contained in:
Christophe Bornet 2024-03-26 15:06:43 +01:00 committed by GitHub
parent 29c58528c7
commit a9457d269e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 261 additions and 25 deletions

View File

@ -2,6 +2,8 @@
from abc import ABC, abstractmethod
from typing import Any, Dict, List
from langchain_core.runnables import run_in_executor
class BaseExampleSelector(ABC):
"""Interface for selecting examples to include in prompts."""
@ -10,6 +12,14 @@ class BaseExampleSelector(ABC):
def add_example(self, example: Dict[str, str]) -> Any:
    """Add new example to store.

    Args:
        example: The example to add, given as a dict with string keys and
            string values.

    Returns:
        Implementation-defined; the annotation is ``Any``.
    """
async def aadd_example(self, example: Dict[str, str]) -> Any:
    """Asynchronously add a new example to the store.

    This default implementation passes the synchronous ``add_example`` to
    ``run_in_executor`` (with ``None`` as its first argument) and awaits
    the outcome.

    Args:
        example: The example to add.

    Returns:
        Whatever ``add_example`` returns for ``example``.
    """
    outcome = await run_in_executor(None, self.add_example, example)
    return outcome
@abstractmethod
def select_examples(self, input_variables: Dict[str, str]) -> List[dict]:
    """Select which examples to use based on the inputs.

    Args:
        input_variables: The inputs to base the selection on, given as a
            dict with string keys and string values.

    Returns:
        The selected examples, as a list of dicts.
    """
async def aselect_examples(self, input_variables: Dict[str, str]) -> List[dict]:
    """Asynchronously select which examples to use based on the inputs.

    This default implementation passes the synchronous ``select_examples``
    to ``run_in_executor`` (with ``None`` as its first argument) and awaits
    the outcome.

    Args:
        input_variables: The inputs to base the selection on.

    Returns:
        Whatever ``select_examples`` returns for ``input_variables``.
    """
    selected = await run_in_executor(None, self.select_examples, input_variables)
    return selected

View File

@ -3,6 +3,7 @@ from __future__ import annotations
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type
from langchain_core.documents import Document
from langchain_core.example_selectors.base import BaseExampleSelector
from langchain_core.pydantic_v1 import BaseModel, Extra
from langchain_core.vectorstores import VectorStore
@ -37,34 +38,59 @@ class SemanticSimilarityExampleSelector(BaseExampleSelector, BaseModel):
extra = Extra.forbid
arbitrary_types_allowed = True
@staticmethod
def _example_to_text(
    example: Dict[str, str], input_keys: Optional[List[str]]
) -> str:
    """Render an example as one space-separated string.

    Args:
        example: The example to render.
        input_keys: When non-empty, only these keys of ``example`` are used;
            otherwise the whole example is used.

    Returns:
        The result of ``sorted_values`` on the chosen mapping, joined with
        single spaces.

    Raises:
        KeyError: If a key listed in ``input_keys`` is missing from
            ``example``.
    """
    relevant = {key: example[key] for key in input_keys} if input_keys else example
    return " ".join(sorted_values(relevant))
def _documents_to_examples(self, documents: List[Document]) -> List[dict]:
# Get the examples from the metadata.
# This assumes that examples are stored in metadata.
examples = [dict(e.metadata) for e in documents]
# If example keys are provided, filter examples to those keys.
if self.example_keys:
examples = [{k: eg[k] for k in self.example_keys} for eg in examples]
return examples
def add_example(self, example: Dict[str, str]) -> str:
    """Add new example to vectorstore.

    The text stored for the example is built by ``_example_to_text`` from
    ``self.input_keys``; the full example is attached as metadata.

    Args:
        example: The example to store.

    Returns:
        The first id returned by the vector store's ``add_texts``.
    """
    # Insert exactly once: the text goes through the shared helper instead
    # of being rebuilt inline and added a second time.
    ids = self.vectorstore.add_texts(
        [self._example_to_text(example, self.input_keys)], metadatas=[example]
    )
    return ids[0]
async def aadd_example(self, example: Dict[str, str]) -> str:
    """Asynchronously add a new example to the vectorstore.

    The text stored for the example is built by ``_example_to_text`` from
    ``self.input_keys``; the full example is attached as metadata.

    Args:
        example: The example to store.

    Returns:
        The first id returned by the vector store's ``aadd_texts``.
    """
    add_texts = self.vectorstore.aadd_texts
    text = self._example_to_text(example, self.input_keys)
    new_ids = await add_texts([text], metadatas=[example])
    return new_ids[0]
def select_examples(self, input_variables: Dict[str, str]) -> List[dict]:
    """Select which examples to use based on semantic similarity.

    The query text is built by ``_example_to_text`` from ``self.input_keys``
    and searched in the vector store; the resulting documents are converted
    back to examples by ``_documents_to_examples``.

    Args:
        input_variables: The inputs to build the similarity query from.

    Returns:
        The examples for the ``self.k`` documents returned by the search.
    """
    # Extra search options are optional; fall back to none.
    vectorstore_kwargs = self.vectorstore_kwargs or {}
    # Get the docs with the highest similarity.
    example_docs = self.vectorstore.similarity_search(
        self._example_to_text(input_variables, self.input_keys),
        k=self.k,
        **vectorstore_kwargs,
    )
    return self._documents_to_examples(example_docs)
async def aselect_examples(self, input_variables: Dict[str, str]) -> List[dict]:
    """Asynchronously select examples based on semantic similarity.

    The query text is built by ``_example_to_text`` from ``self.input_keys``
    and searched with the vector store's ``asimilarity_search``; the
    resulting documents are converted by ``_documents_to_examples``.

    Args:
        input_variables: The inputs to build the similarity query from.

    Returns:
        The examples for the ``self.k`` documents returned by the search.
    """
    extra_search_args = self.vectorstore_kwargs or {}
    search = self.vectorstore.asimilarity_search
    query = self._example_to_text(input_variables, self.input_keys)
    # Get the docs with the highest similarity.
    matches = await search(query, k=self.k, **extra_search_args)
    return self._documents_to_examples(matches)
@classmethod
def from_examples(
@ -95,13 +121,7 @@ class SemanticSimilarityExampleSelector(BaseExampleSelector, BaseModel):
Returns:
The ExampleSelector instantiated, backed by a vector store.
"""
if input_keys:
string_examples = [
" ".join(sorted_values({k: eg[k] for k in input_keys}))
for eg in examples
]
else:
string_examples = [" ".join(sorted_values(eg)) for eg in examples]
string_examples = [cls._example_to_text(eg, input_keys) for eg in examples]
vectorstore = vectorstore_cls.from_texts(
string_examples, embeddings, metadatas=examples, **vectorstore_cls_kwargs
)
@ -113,6 +133,47 @@ class SemanticSimilarityExampleSelector(BaseExampleSelector, BaseModel):
vectorstore_kwargs=vectorstore_kwargs,
)
@classmethod
async def afrom_examples(
    cls,
    examples: List[dict],
    embeddings: Embeddings,
    vectorstore_cls: Type[VectorStore],
    k: int = 4,
    input_keys: Optional[List[str]] = None,
    *,
    example_keys: Optional[List[str]] = None,
    vectorstore_kwargs: Optional[dict] = None,
    **vectorstore_cls_kwargs: Any,
) -> SemanticSimilarityExampleSelector:
    """Asynchronously create a k-shot example selector from examples.

    Every example is rendered to text with ``_example_to_text`` and a vector
    store is built from those texts via ``vectorstore_cls.afrom_texts``,
    with the original examples attached as metadata.

    Args:
        examples: List of examples to use in the prompt.
        embeddings: An initialized embedding API interface, e.g. OpenAIEmbeddings().
        vectorstore_cls: A vector store DB interface class, e.g. FAISS.
        k: Number of examples to select.
        input_keys: If provided, the search is based on the input variables
            instead of all variables.
        example_keys: Passed through unchanged to the selector constructor.
        vectorstore_kwargs: Passed through unchanged to the selector
            constructor.
        vectorstore_cls_kwargs: Extra keyword arguments forwarded to
            ``vectorstore_cls.afrom_texts`` (e.g. a url for the vector store).

    Returns:
        The ExampleSelector instantiated, backed by a vector store.
    """
    texts = [cls._example_to_text(example, input_keys) for example in examples]
    store = await vectorstore_cls.afrom_texts(
        texts, embeddings, metadatas=examples, **vectorstore_cls_kwargs
    )
    selector = cls(
        vectorstore=store,
        k=k,
        input_keys=input_keys,
        example_keys=example_keys,
        vectorstore_kwargs=vectorstore_kwargs,
    )
    return selector
class MaxMarginalRelevanceExampleSelector(SemanticSimilarityExampleSelector):
"""ExampleSelector that selects examples based on Max Marginal Relevance.

View File

@ -0,0 +1,26 @@
from typing import Dict, List, Optional
from langchain_core.example_selectors import BaseExampleSelector
class DummyExampleSelector(BaseExampleSelector):
    """Minimal in-memory selector for exercising the base-class methods."""

    def __init__(self) -> None:
        # Holds the most recently added example; ``None`` until one is added.
        self.example: Optional[Dict[str, str]] = None

    def add_example(self, example: Dict[str, str]) -> None:
        """Remember ``example`` as the stored example."""
        self.example = example

    def select_examples(self, input_variables: Dict[str, str]) -> List[dict]:
        """Echo the input variables back as the only selected example."""
        selection: List[dict] = [input_variables]
        return selection
async def test_aadd_example() -> None:
    """``aadd_example`` on the dummy selector stores the given example."""
    dummy = DummyExampleSelector()
    payload = {"foo": "bar"}
    await dummy.aadd_example(payload)
    assert dummy.example == {"foo": "bar"}
async def test_aselect_examples() -> None:
    """``aselect_examples`` on the dummy selector echoes its input."""
    dummy = DummyExampleSelector()
    selected = await dummy.aselect_examples({"foo": "bar"})
    assert selected == [{"foo": "bar"}]

View File

@ -0,0 +1,139 @@
from typing import Any, Iterable, List, Optional, cast
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings, FakeEmbeddings
from langchain_core.example_selectors import SemanticSimilarityExampleSelector
from langchain_core.vectorstores import VectorStore
class DummyVectorStore(VectorStore):
    """In-memory stand-in for a vector store, used by the selector tests.

    It records the texts and metadata it is given and answers every
    similarity search with copies of the query itself.
    """

    def __init__(self, init_arg: Optional[str] = None):
        # Everything passed to ``add_texts`` is recorded here for assertions.
        self.texts: List[str] = []
        self.metadatas: List[dict] = []
        self._embeddings: Optional[Embeddings] = None
        # Arbitrary constructor argument, kept so tests can check forwarding.
        self.init_arg = init_arg

    @property
    def embeddings(self) -> Optional[Embeddings]:
        """Return the embeddings object set by ``from_texts``, if any."""
        return self._embeddings

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Record ``texts`` (and ``metadatas`` when given).

        Returns:
            Always ``["dummy_id"]``, regardless of how many texts were added.
        """
        self.texts.extend(texts)
        if metadatas:
            self.metadatas.extend(metadatas)
        return ["dummy_id"]

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return ``k`` references to one document that echoes ``query``."""
        return [
            Document(page_content=query, metadata={"metadata": query, "other": "other"})
        ] * k

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "DummyVectorStore":
        """Build a store holding ``texts`` and remember ``embedding``.

        Extra keyword arguments are forwarded to the constructor.
        """
        # Instantiate through ``cls`` so subclasses get their own type back.
        store = cls(**kwargs)
        store.add_texts(texts, metadatas)
        store._embeddings = embedding
        return store
def test_add_example() -> None:
    """Only the ``input_keys`` values form the text; metadata stays complete."""
    store = DummyVectorStore()
    example = {"foo": "bar", "foo2": "bar2", "foo3": "bar3"}
    SemanticSimilarityExampleSelector(
        vectorstore=store, input_keys=["foo", "foo3"]
    ).add_example(example)
    assert store.texts == ["bar bar3"]
    assert store.metadatas == [{"foo": "bar", "foo2": "bar2", "foo3": "bar3"}]
async def test_aadd_example() -> None:
    """Async variant: only ``input_keys`` values form the stored text."""
    store = DummyVectorStore()
    example = {"foo": "bar", "foo2": "bar2", "foo3": "bar3"}
    example_selector = SemanticSimilarityExampleSelector(
        vectorstore=store, input_keys=["foo", "foo3"]
    )
    await example_selector.aadd_example(example)
    assert store.texts == ["bar bar3"]
    assert store.metadatas == [{"foo": "bar", "foo2": "bar2", "foo3": "bar3"}]
def test_select_examples() -> None:
    """The query uses ``input_keys``; results are cut down to ``example_keys``."""
    store = DummyVectorStore()
    example_selector = SemanticSimilarityExampleSelector(
        vectorstore=store, input_keys=["foo2"], example_keys=["metadata"], k=2
    )
    selected = example_selector.select_examples({"foo": "bar", "foo2": "bar2"})
    assert selected == [{"metadata": "bar2"}] * 2
async def test_aselect_examples() -> None:
    """Async variant: query uses ``input_keys``, results use ``example_keys``."""
    store = DummyVectorStore()
    example_selector = SemanticSimilarityExampleSelector(
        vectorstore=store, input_keys=["foo2"], example_keys=["metadata"], k=2
    )
    selected = await example_selector.aselect_examples(
        {"foo": "bar", "foo2": "bar2"}
    )
    assert selected == [{"metadata": "bar2"}] * 2
def test_from_examples() -> None:
    """``from_examples`` wires every argument into the selector and store."""
    fake_embeddings = FakeEmbeddings(size=1)
    example_list = [{"foo": "bar"}]
    built = SemanticSimilarityExampleSelector.from_examples(
        examples=example_list,
        embeddings=fake_embeddings,
        vectorstore_cls=DummyVectorStore,
        k=2,
        input_keys=["foo"],
        example_keys=["some_example_key"],
        vectorstore_kwargs={"vs_foo": "vs_bar"},
        init_arg="some_init_arg",
    )
    # Selector-level settings.
    assert built.input_keys == ["foo"]
    assert built.example_keys == ["some_example_key"]
    assert built.k == 2
    assert built.vectorstore_kwargs == {"vs_foo": "vs_bar"}
    # Store-level state.
    assert isinstance(built.vectorstore, DummyVectorStore)
    store = cast(DummyVectorStore, built.vectorstore)
    assert store.embeddings is fake_embeddings
    assert store.init_arg == "some_init_arg"
    assert store.texts == ["bar"]
    assert store.metadatas == [{"foo": "bar"}]
async def test_afrom_examples() -> None:
    """``afrom_examples`` wires every argument into the selector and store."""
    fake_embeddings = FakeEmbeddings(size=1)
    example_list = [{"foo": "bar"}]
    built = await SemanticSimilarityExampleSelector.afrom_examples(
        examples=example_list,
        embeddings=fake_embeddings,
        vectorstore_cls=DummyVectorStore,
        k=2,
        input_keys=["foo"],
        example_keys=["some_example_key"],
        vectorstore_kwargs={"vs_foo": "vs_bar"},
        init_arg="some_init_arg",
    )
    # Selector-level settings.
    assert built.input_keys == ["foo"]
    assert built.example_keys == ["some_example_key"]
    assert built.k == 2
    assert built.vectorstore_kwargs == {"vs_foo": "vs_bar"}
    # Store-level state.
    assert isinstance(built.vectorstore, DummyVectorStore)
    store = cast(DummyVectorStore, built.vectorstore)
    assert store.embeddings is fake_embeddings
    assert store.init_arg == "some_init_arg"
    assert store.texts == ["bar"]
    assert store.metadatas == [{"foo": "bar"}]