diff --git a/libs/core/langchain_core/example_selectors/base.py b/libs/core/langchain_core/example_selectors/base.py
index 5061e140a75..a9b1aba0d52 100644
--- a/libs/core/langchain_core/example_selectors/base.py
+++ b/libs/core/langchain_core/example_selectors/base.py
@@ -2,6 +2,8 @@
 from abc import ABC, abstractmethod
 from typing import Any, Dict, List
 
+from langchain_core.runnables import run_in_executor
+
 
 class BaseExampleSelector(ABC):
     """Interface for selecting examples to include in prompts."""
@@ -10,6 +12,20 @@ class BaseExampleSelector(ABC):
     def add_example(self, example: Dict[str, str]) -> Any:
         """Add new example to store."""
 
+    async def aadd_example(self, example: Dict[str, str]) -> Any:
+        """Async add new example to store.
+
+        Default implementation runs ``add_example`` through ``run_in_executor``.
+        """
+        return await run_in_executor(None, self.add_example, example)
+
     @abstractmethod
     def select_examples(self, input_variables: Dict[str, str]) -> List[dict]:
         """Select which examples to use based on the inputs."""
+
+    async def aselect_examples(self, input_variables: Dict[str, str]) -> List[dict]:
+        """Async select which examples to use based on the inputs.
+
+        Default implementation runs ``select_examples`` through ``run_in_executor``.
+        """
+        return await run_in_executor(None, self.select_examples, input_variables)
diff --git a/libs/core/langchain_core/example_selectors/semantic_similarity.py b/libs/core/langchain_core/example_selectors/semantic_similarity.py
index 919ad88a3b5..c3bb86fa9c5 100644
--- a/libs/core/langchain_core/example_selectors/semantic_similarity.py
+++ b/libs/core/langchain_core/example_selectors/semantic_similarity.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type
 
+from langchain_core.documents import Document
 from langchain_core.example_selectors.base import BaseExampleSelector
 from langchain_core.pydantic_v1 import BaseModel, Extra
 from langchain_core.vectorstores import VectorStore
@@ -37,34 +38,65 @@ class SemanticSimilarityExampleSelector(BaseExampleSelector, BaseModel):
         extra = Extra.forbid
         arbitrary_types_allowed = True
 
+    @staticmethod
+    def _example_to_text(
+        example: Dict[str, str], input_keys: Optional[List[str]]
+    ) -> str:
+        """Build the text passed to the vector store for an example or query.
+
+        Space-joins ``sorted_values`` of the example, restricted to ``input_keys``
+        when that list is provided and non-empty.
+        """
+        if input_keys:
+            return " ".join(sorted_values({key: example[key] for key in input_keys}))
+        else:
+            return " ".join(sorted_values(example))
+
+    def _documents_to_examples(self, documents: List[Document]) -> List[dict]:
+        """Turn retrieved documents back into example dicts via their metadata."""
+        # Get the examples from the metadata.
+        # This assumes that examples are stored in metadata.
+        examples = [dict(e.metadata) for e in documents]
+        # If example keys are provided, filter examples to those keys.
+        if self.example_keys:
+            examples = [{k: eg[k] for k in self.example_keys} for eg in examples]
+        return examples
+
     def add_example(self, example: Dict[str, str]) -> str:
         """Add new example to vectorstore."""
-        if self.input_keys:
-            string_example = " ".join(
-                sorted_values({key: example[key] for key in self.input_keys})
-            )
-        else:
-            string_example = " ".join(sorted_values(example))
-        ids = self.vectorstore.add_texts([string_example], metadatas=[example])
+        ids = self.vectorstore.add_texts(
+            [self._example_to_text(example, self.input_keys)], metadatas=[example]
+        )
+        return ids[0]
+
+    async def aadd_example(self, example: Dict[str, str]) -> str:
+        """Async add new example to vectorstore."""
+        ids = await self.vectorstore.aadd_texts(
+            [self._example_to_text(example, self.input_keys)], metadatas=[example]
+        )
         return ids[0]
 
     def select_examples(self, input_variables: Dict[str, str]) -> List[dict]:
         """Select which examples to use based on semantic similarity."""
         # Get the docs with the highest similarity.
-        if self.input_keys:
-            input_variables = {key: input_variables[key] for key in self.input_keys}
         vectorstore_kwargs = self.vectorstore_kwargs or {}
-        query = " ".join(sorted_values(input_variables))
         example_docs = self.vectorstore.similarity_search(
-            query, k=self.k, **vectorstore_kwargs
+            self._example_to_text(input_variables, self.input_keys),
+            k=self.k,
+            **vectorstore_kwargs,
         )
-        # Get the examples from the metadata.
-        # This assumes that examples are stored in metadata.
-        examples = [dict(e.metadata) for e in example_docs]
-        # If example keys are provided, filter examples to those keys.
-        if self.example_keys:
-            examples = [{k: eg[k] for k in self.example_keys} for eg in examples]
-        return examples
+        return self._documents_to_examples(example_docs)
+
+    async def aselect_examples(self, input_variables: Dict[str, str]) -> List[dict]:
+        """Async select which examples to use based on semantic similarity."""
+        # Get the docs with the highest similarity.
+        vectorstore_kwargs = self.vectorstore_kwargs or {}
+        example_docs = await self.vectorstore.asimilarity_search(
+            self._example_to_text(input_variables, self.input_keys),
+            k=self.k,
+            **vectorstore_kwargs,
+        )
+        return self._documents_to_examples(example_docs)
 
     @classmethod
     def from_examples(
@@ -95,13 +127,7 @@ class SemanticSimilarityExampleSelector(BaseExampleSelector, BaseModel):
         Returns:
             The ExampleSelector instantiated, backed by a vector store.
         """
-        if input_keys:
-            string_examples = [
-                " ".join(sorted_values({k: eg[k] for k in input_keys}))
-                for eg in examples
-            ]
-        else:
-            string_examples = [" ".join(sorted_values(eg)) for eg in examples]
+        string_examples = [cls._example_to_text(eg, input_keys) for eg in examples]
         vectorstore = vectorstore_cls.from_texts(
             string_examples, embeddings, metadatas=examples, **vectorstore_cls_kwargs
         )
@@ -113,6 +139,51 @@ class SemanticSimilarityExampleSelector(BaseExampleSelector, BaseModel):
             vectorstore_kwargs=vectorstore_kwargs,
         )
 
+    @classmethod
+    async def afrom_examples(
+        cls,
+        examples: List[dict],
+        embeddings: Embeddings,
+        vectorstore_cls: Type[VectorStore],
+        k: int = 4,
+        input_keys: Optional[List[str]] = None,
+        *,
+        example_keys: Optional[List[str]] = None,
+        vectorstore_kwargs: Optional[dict] = None,
+        **vectorstore_cls_kwargs: Any,
+    ) -> SemanticSimilarityExampleSelector:
+        """Async create k-shot example selector using example list and embeddings.
+
+        Reshuffles examples dynamically based on query similarity.
+
+        Args:
+            examples: List of examples to use in the prompt.
+            embeddings: An initialized embedding API interface, e.g. OpenAIEmbeddings().
+            vectorstore_cls: A vector store DB interface class, e.g. FAISS.
+            k: Number of examples to select.
+            input_keys: If provided, the search is based on the input variables
+                instead of all variables.
+            example_keys: If provided, keys to filter examples to.
+            vectorstore_kwargs: Extra arguments passed to the similarity search
+                call of the vector store.
+            vectorstore_cls_kwargs: Extra keyword arguments forwarded to
+                ``vectorstore_cls.afrom_texts``.
+
+        Returns:
+            The ExampleSelector instantiated, backed by a vector store.
+        """
+        string_examples = [cls._example_to_text(eg, input_keys) for eg in examples]
+        vectorstore = await vectorstore_cls.afrom_texts(
+            string_examples, embeddings, metadatas=examples, **vectorstore_cls_kwargs
+        )
+        return cls(
+            vectorstore=vectorstore,
+            k=k,
+            input_keys=input_keys,
+            example_keys=example_keys,
+            vectorstore_kwargs=vectorstore_kwargs,
+        )
+
 
 class MaxMarginalRelevanceExampleSelector(SemanticSimilarityExampleSelector):
     """ExampleSelector that selects examples based on Max Marginal Relevance.
diff --git a/libs/core/tests/unit_tests/example_selectors/test_base.py b/libs/core/tests/unit_tests/example_selectors/test_base.py
new file mode 100644
index 00000000000..5ab9ed7c2c0
--- /dev/null
+++ b/libs/core/tests/unit_tests/example_selectors/test_base.py
@@ -0,0 +1,27 @@
+from typing import Dict, List, Optional
+
+from langchain_core.example_selectors import BaseExampleSelector
+
+
+class DummyExampleSelector(BaseExampleSelector):
+    """Selector stub that stores the added example and echoes its inputs."""
+    def __init__(self) -> None:
+        self.example: Optional[Dict[str, str]] = None
+
+    def add_example(self, example: Dict[str, str]) -> None:
+        self.example = example
+
+    def select_examples(self, input_variables: Dict[str, str]) -> List[dict]:
+        return [input_variables]
+
+
+async def test_aadd_example() -> None:
+    selector = DummyExampleSelector()
+    await selector.aadd_example({"foo": "bar"})
+    assert selector.example == {"foo": "bar"}
+
+
+async def test_aselect_examples() -> None:
+    selector = DummyExampleSelector()
+    examples = await selector.aselect_examples({"foo": "bar"})
+    assert examples == [{"foo": "bar"}]
diff --git a/libs/core/tests/unit_tests/example_selectors/test_similarity.py b/libs/core/tests/unit_tests/example_selectors/test_similarity.py
new file mode 100644
index 00000000000..3f6f0972f7a
--- /dev/null
+++ b/libs/core/tests/unit_tests/example_selectors/test_similarity.py
@@ -0,0 +1,141 @@
+from typing import Any, Iterable, List, Optional, cast
+
+from langchain_core.documents import Document
+from langchain_core.embeddings import Embeddings, FakeEmbeddings
+from langchain_core.example_selectors import SemanticSimilarityExampleSelector
+from langchain_core.vectorstores import VectorStore
+
+
+class DummyVectorStore(VectorStore):
+    """Vector store stub that records added texts/metadata and echoes queries."""
+    def __init__(self, init_arg: Optional[str] = None):
+        self.texts: List[str] = []
+        self.metadatas: List[dict] = []
+        self._embeddings: Optional[Embeddings] = None
+        self.init_arg = init_arg
+
+    @property
+    def embeddings(self) -> Optional[Embeddings]:
+        return self._embeddings
+
+    def add_texts(
+        self,
+        texts: Iterable[str],
+        metadatas: Optional[List[dict]] = None,
+        **kwargs: Any,
+    ) -> List[str]:
+        self.texts.extend(texts)
+        if metadatas:
+            self.metadatas.extend(metadatas)
+        return ["dummy_id"]
+
+    def similarity_search(
+        self, query: str, k: int = 4, **kwargs: Any
+    ) -> List[Document]:
+        # Return k copies of a document built from the query itself.
+        return [
+            Document(page_content=query, metadata={"metadata": query, "other": "other"})
+        ] * k
+
+    @classmethod
+    def from_texts(
+        cls,
+        texts: List[str],
+        embedding: Embeddings,
+        metadatas: Optional[List[dict]] = None,
+        **kwargs: Any,
+    ) -> "DummyVectorStore":
+        store = cls(**kwargs)
+        store.add_texts(texts, metadatas)
+        store._embeddings = embedding
+        return store
+
+
+def test_add_example() -> None:
+    vector_store = DummyVectorStore()
+    selector = SemanticSimilarityExampleSelector(
+        vectorstore=vector_store, input_keys=["foo", "foo3"]
+    )
+    selector.add_example({"foo": "bar", "foo2": "bar2", "foo3": "bar3"})
+    assert vector_store.texts == ["bar bar3"]
+    assert vector_store.metadatas == [{"foo": "bar", "foo2": "bar2", "foo3": "bar3"}]
+
+
+async def test_aadd_example() -> None:
+    vector_store = DummyVectorStore()
+    selector = SemanticSimilarityExampleSelector(
+        vectorstore=vector_store, input_keys=["foo", "foo3"]
+    )
+    await selector.aadd_example({"foo": "bar", "foo2": "bar2", "foo3": "bar3"})
+    assert vector_store.texts == ["bar bar3"]
+    assert vector_store.metadatas == [{"foo": "bar", "foo2": "bar2", "foo3": "bar3"}]
+
+
+def test_select_examples() -> None:
+    vector_store = DummyVectorStore()
+    selector = SemanticSimilarityExampleSelector(
+        vectorstore=vector_store, input_keys=["foo2"], example_keys=["metadata"], k=2
+    )
+    examples = selector.select_examples({"foo": "bar", "foo2": "bar2"})
+    assert examples == [{"metadata": "bar2"}] * 2
+
+
+async def test_aselect_examples() -> None:
+    vector_store = DummyVectorStore()
+    selector = SemanticSimilarityExampleSelector(
+        vectorstore=vector_store, input_keys=["foo2"], example_keys=["metadata"], k=2
+    )
+    examples = await selector.aselect_examples({"foo": "bar", "foo2": "bar2"})
+    assert examples == [{"metadata": "bar2"}] * 2
+
+
+def test_from_examples() -> None:
+    examples = [{"foo": "bar"}]
+    embeddings = FakeEmbeddings(size=1)
+    selector = SemanticSimilarityExampleSelector.from_examples(
+        examples=examples,
+        embeddings=embeddings,
+        vectorstore_cls=DummyVectorStore,
+        k=2,
+        input_keys=["foo"],
+        example_keys=["some_example_key"],
+        vectorstore_kwargs={"vs_foo": "vs_bar"},
+        init_arg="some_init_arg",
+    )
+    assert selector.input_keys == ["foo"]
+    assert selector.example_keys == ["some_example_key"]
+    assert selector.k == 2
+    assert selector.vectorstore_kwargs == {"vs_foo": "vs_bar"}
+
+    assert isinstance(selector.vectorstore, DummyVectorStore)
+    vector_store = cast(DummyVectorStore, selector.vectorstore)
+    assert vector_store.embeddings is embeddings
+    assert vector_store.init_arg == "some_init_arg"
+    assert vector_store.texts == ["bar"]
+    assert vector_store.metadatas == [{"foo": "bar"}]
+
+
+async def test_afrom_examples() -> None:
+    examples = [{"foo": "bar"}]
+    embeddings = FakeEmbeddings(size=1)
+    selector = await SemanticSimilarityExampleSelector.afrom_examples(
+        examples=examples,
+        embeddings=embeddings,
+        vectorstore_cls=DummyVectorStore,
+        k=2,
+        input_keys=["foo"],
+        example_keys=["some_example_key"],
+        vectorstore_kwargs={"vs_foo": "vs_bar"},
+        init_arg="some_init_arg",
+    )
+    assert selector.input_keys == ["foo"]
+    assert selector.example_keys == ["some_example_key"]
+    assert selector.k == 2
+    assert selector.vectorstore_kwargs == {"vs_foo": "vs_bar"}
+
+    assert isinstance(selector.vectorstore, DummyVectorStore)
+    vector_store = cast(DummyVectorStore, selector.vectorstore)
+    assert vector_store.embeddings is embeddings
+    assert vector_store.init_arg == "some_init_arg"
+    assert vector_store.texts == ["bar"]
+    assert vector_store.metadatas == [{"foo": "bar"}]