diff --git a/libs/community/langchain_community/vectorstores/usearch.py b/libs/community/langchain_community/vectorstores/usearch.py index fa94d19de00..c59446de54d 100644 --- a/libs/community/langchain_community/vectorstores/usearch.py +++ b/libs/community/langchain_community/vectorstores/usearch.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Dict, Iterable, List, Optional, Tuple +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import numpy as np from langchain_core.documents import Document @@ -42,7 +42,7 @@ class USearch(VectorStore): self, texts: Iterable[str], metadatas: Optional[List[Dict]] = None, - ids: Optional[np.ndarray] = None, + ids: Optional[Union[np.ndarray, list[str]]] = None, **kwargs: Any, ) -> List[str]: """Run more texts through the embeddings and add to the vectorstore. @@ -69,6 +69,8 @@ class USearch(VectorStore): last_id = int(self.ids[-1]) + 1 if ids is None: ids = np.array([str(last_id + id) for id, _ in enumerate(texts)]) + elif isinstance(ids, list): + ids = np.array(ids) self.index.add(np.array(ids), np.array(embeddings)) self.docstore.add(dict(zip(ids, documents))) @@ -134,7 +136,7 @@ class USearch(VectorStore): texts: List[str], embedding: Embeddings, metadatas: Optional[List[Dict]] = None, - ids: Optional[np.ndarray] = None, + ids: Optional[Union[np.ndarray, list[str]]] = None, metric: str = "cos", **kwargs: Any, ) -> USearch: @@ -159,6 +161,8 @@ class USearch(VectorStore): documents: List[Document] = [] if ids is None: ids = np.array([str(id) for id, _ in enumerate(texts)]) + elif isinstance(ids, list): + ids = np.array(ids) for i, text in enumerate(texts): metadata = metadatas[i] if metadatas else {} documents.append(Document(page_content=text, metadata=metadata)) diff --git a/libs/core/langchain_core/vectorstores/base.py b/libs/core/langchain_core/vectorstores/base.py index aacf897ffb9..ccc966d9f2a 100644 --- a/libs/core/langchain_core/vectorstores/base.py +++ 
b/libs/core/langchain_core/vectorstores/base.py @@ -25,7 +25,7 @@ import logging import math import warnings from abc import ABC, abstractmethod -from collections.abc import Collection, Iterable, Sequence +from collections.abc import Collection, Iterable, Iterator, Sequence from itertools import cycle from typing import ( TYPE_CHECKING, @@ -61,10 +61,8 @@ class VectorStore(ABC): self, texts: Iterable[str], metadatas: Optional[list[dict]] = None, - # One of the kwargs should be `ids` which is a list of ids - # associated with the texts. - # This is not yet enforced in the type signature for backwards compatibility - # with existing implementations. + *, + ids: Optional[list[str]] = None, **kwargs: Any, ) -> list[str]: """Run more texts through the embeddings and add to the vectorstore. @@ -72,6 +70,7 @@ class VectorStore(ABC): Args: texts: Iterable of strings to add to the vectorstore. metadatas: Optional list of metadatas associated with the texts. + ids: Optional list of IDs associated with the texts. **kwargs: vectorstore specific parameters. One of the kwargs should be `ids` which is a list of ids associated with the texts. @@ -99,10 +98,14 @@ class VectorStore(ABC): f"Got {len(metadatas)} metadatas and {len(texts_)} texts." 
) metadatas_ = iter(metadatas) if metadatas else cycle([{}]) + ids_: Iterator[Optional[str]] = iter(ids) if ids else cycle([None]) docs = [ - Document(page_content=text, metadata=metadata_) - for text, metadata_ in zip(texts, metadatas_) + Document(id=id_, page_content=text, metadata=metadata_) + for text, metadata_, id_ in zip(texts, metadatas_, ids_) ] + if ids is not None: + # For backward compatibility + kwargs["ids"] = ids return self.add_documents(docs, **kwargs) raise NotImplementedError( @@ -206,6 +209,8 @@ class VectorStore(ABC): self, texts: Iterable[str], metadatas: Optional[list[dict]] = None, + *, + ids: Optional[list[str]] = None, **kwargs: Any, ) -> list[str]: """Async run more texts through the embeddings and add to the vectorstore. @@ -214,6 +219,7 @@ class VectorStore(ABC): texts: Iterable of strings to add to the vectorstore. metadatas: Optional list of metadatas associated with the texts. Default is None. + ids: Optional list of IDs associated with the texts. **kwargs: vectorstore specific parameters. Returns: @@ -223,6 +229,9 @@ class VectorStore(ABC): ValueError: If the number of metadatas does not match the number of texts. ValueError: If the number of ids does not match the number of texts. """ + if ids is not None: + # For backward compatibility + kwargs["ids"] = ids if type(self).aadd_documents != VectorStore.aadd_documents: # Import document in local scope to avoid circular imports from langchain_core.documents import Document @@ -239,12 +248,12 @@ class VectorStore(ABC): f"Got {len(metadatas)} metadatas and {len(texts_)} texts." 
) metadatas_ = iter(metadatas) if metadatas else cycle([{}]) + ids_: Iterator[Optional[str]] = iter(ids) if ids else cycle([None]) docs = [ - Document(page_content=text, metadata=metadata_) - for text, metadata_ in zip(texts, metadatas_) + Document(id=id_, page_content=text, metadata=metadata_) + for text, metadata_, id_ in zip(texts, metadatas_, ids_) ] - return await self.aadd_documents(docs, **kwargs) return await run_in_executor(None, self.add_texts, texts, metadatas, **kwargs) @@ -827,6 +836,15 @@ class VectorStore(ABC): """ texts = [d.page_content for d in documents] metadatas = [d.metadata for d in documents] + + if "ids" not in kwargs: + ids = [doc.id for doc in documents] + + # If there's at least one valid ID, we'll assume that IDs + # should be used. + if any(ids): + kwargs["ids"] = ids + return cls.from_texts(texts, embedding, metadatas=metadatas, **kwargs) @classmethod @@ -848,6 +866,15 @@ class VectorStore(ABC): """ texts = [d.page_content for d in documents] metadatas = [d.metadata for d in documents] + + if "ids" not in kwargs: + ids = [doc.id for doc in documents] + + # If there's at least one valid ID, we'll assume that IDs + # should be used. + if any(ids): + kwargs["ids"] = ids + return await cls.afrom_texts(texts, embedding, metadatas=metadatas, **kwargs) @classmethod @@ -857,6 +884,8 @@ class VectorStore(ABC): texts: list[str], embedding: Embeddings, metadatas: Optional[list[dict]] = None, + *, + ids: Optional[list[str]] = None, **kwargs: Any, ) -> VST: """Return VectorStore initialized from texts and embeddings. @@ -866,6 +895,7 @@ class VectorStore(ABC): embedding: Embedding function to use. metadatas: Optional list of metadatas associated with the texts. Default is None. + ids: Optional list of IDs associated with the texts. kwargs: Additional keyword arguments. 
Returns: @@ -878,6 +908,8 @@ class VectorStore(ABC): texts: list[str], embedding: Embeddings, metadatas: Optional[list[dict]] = None, + *, + ids: Optional[list[str]] = None, **kwargs: Any, ) -> VST: """Async return VectorStore initialized from texts and embeddings. @@ -887,11 +919,14 @@ class VectorStore(ABC): embedding: Embedding function to use. metadatas: Optional list of metadatas associated with the texts. Default is None. + ids: Optional list of IDs associated with the texts. kwargs: Additional keyword arguments. Returns: VectorStore: VectorStore initialized from texts and embeddings. """ + if ids is not None: + kwargs["ids"] = ids return await run_in_executor( None, cls.from_texts, texts, embedding, metadatas, **kwargs ) diff --git a/libs/core/tests/unit_tests/vectorstores/test_vectorstore.py b/libs/core/tests/unit_tests/vectorstores/test_vectorstore.py index 29746bbe67d..aba1481c623 100644 --- a/libs/core/tests/unit_tests/vectorstores/test_vectorstore.py +++ b/libs/core/tests/unit_tests/vectorstores/test_vectorstore.py @@ -10,8 +10,10 @@ import uuid from collections.abc import Iterable, Sequence from typing import Any, Optional +import pytest + from langchain_core.documents import Document -from langchain_core.embeddings import Embeddings +from langchain_core.embeddings import Embeddings, FakeEmbeddings from langchain_core.vectorstores import VectorStore @@ -25,10 +27,6 @@ class CustomAddTextsVectorstore(VectorStore): self, texts: Iterable[str], metadatas: Optional[list[dict]] = None, - # One of the kwargs should be `ids` which is a list of ids - # associated with the texts. - # This is not yet enforced in the type signature for backwards compatibility - # with existing implementations. 
ids: Optional[list[str]] = None, **kwargs: Any, ) -> list[str]: @@ -68,12 +66,59 @@ class CustomAddTextsVectorstore(VectorStore): raise NotImplementedError() -def test_default_add_documents() -> None: +class CustomAddDocumentsVectorstore(VectorStore): + """A vectorstore that only implements add documents.""" + + def __init__(self) -> None: + self.store: dict[str, Document] = {} + + def add_documents( + self, + documents: list[Document], + *, + ids: Optional[list[str]] = None, + **kwargs: Any, + ) -> list[str]: + ids_ = [] + ids_iter = iter(ids or []) + for document in documents: + id_ = next(ids_iter) if ids else document.id or str(uuid.uuid4()) + self.store[id_] = Document( + id=id_, page_content=document.page_content, metadata=document.metadata + ) + ids_.append(id_) + return ids_ + + def get_by_ids(self, ids: Sequence[str], /) -> list[Document]: + return [self.store[id] for id in ids if id in self.store] + + @classmethod + def from_texts( # type: ignore + cls, + texts: list[str], + embedding: Embeddings, + metadatas: Optional[list[dict]] = None, + **kwargs: Any, + ) -> CustomAddDocumentsVectorstore: + vectorstore = CustomAddDocumentsVectorstore() + vectorstore.add_texts(texts, metadatas=metadatas, **kwargs) + return vectorstore + + def similarity_search( + self, query: str, k: int = 4, **kwargs: Any + ) -> list[Document]: + raise NotImplementedError() + + +@pytest.mark.parametrize( + "vs_class", [CustomAddTextsVectorstore, CustomAddDocumentsVectorstore] +) +def test_default_add_documents(vs_class: type[VectorStore]) -> None: """Test that we can implement the upsert method of the CustomVectorStore class without violating the Liskov Substitution Principle. 
""" - store = CustomAddTextsVectorstore() + store = vs_class() # Check upsert with id assert store.add_documents([Document(id="1", page_content="hello")]) == ["1"] @@ -95,8 +140,11 @@ def test_default_add_documents() -> None: assert store.get_by_ids(["6"]) == [Document(id="6", page_content="baz")] -def test_default_add_texts() -> None: - store = CustomAddTextsVectorstore() +@pytest.mark.parametrize( + "vs_class", [CustomAddTextsVectorstore, CustomAddDocumentsVectorstore] +) +def test_default_add_texts(vs_class: type[VectorStore]) -> None: + store = vs_class() # Check that default implementation of add_texts works assert store.add_texts(["hello", "world"], ids=["3", "4"]) == ["3", "4"] @@ -122,9 +170,12 @@ def test_default_add_texts() -> None: ] -async def test_default_aadd_documents() -> None: +@pytest.mark.parametrize( + "vs_class", [CustomAddTextsVectorstore, CustomAddDocumentsVectorstore] +) +async def test_default_aadd_documents(vs_class: type[VectorStore]) -> None: """Test delegation to the synchronous method.""" - store = CustomAddTextsVectorstore() + store = vs_class() # Check upsert with id assert await store.aadd_documents([Document(id="1", page_content="hello")]) == ["1"] @@ -146,10 +197,13 @@ async def test_default_aadd_documents() -> None: assert await store.aget_by_ids(["6"]) == [Document(id="6", page_content="baz")] -async def test_default_aadd_texts() -> None: +@pytest.mark.parametrize( + "vs_class", [CustomAddTextsVectorstore, CustomAddDocumentsVectorstore] +) +async def test_default_aadd_texts(vs_class: type[VectorStore]) -> None: """Test delegation to the synchronous method.""" - store = CustomAddTextsVectorstore() - # Check that default implementation of add_texts works + store = vs_class() + # Check that default implementation of aadd_texts works assert await store.aadd_texts(["hello", "world"], ids=["3", "4"]) == ["3", "4"] assert await store.aget_by_ids(["3", "4"]) == [ @@ -172,3 +226,61 @@ async def test_default_aadd_texts() -> None: 
Document(id=ids_2[0], page_content="foo", metadata={"foo": "bar"}), Document(id=ids_2[1], page_content="bar", metadata={"foo": "bar"}), ] + + +@pytest.mark.parametrize( + "vs_class", [CustomAddTextsVectorstore, CustomAddDocumentsVectorstore] +) +def test_default_from_documents(vs_class: type[VectorStore]) -> None: + embeddings = FakeEmbeddings(size=1) + store = vs_class.from_documents( + [Document(id="1", page_content="hello", metadata={"foo": "bar"})], embeddings + ) + + assert store.get_by_ids(["1"]) == [ + Document(id="1", page_content="hello", metadata={"foo": "bar"}) + ] + + # from_documents with ids in args + store = vs_class.from_documents( + [Document(page_content="hello", metadata={"foo": "bar"})], embeddings, ids=["1"] + ) + + assert store.get_by_ids(["1"]) == [ + Document(id="1", page_content="hello", metadata={"foo": "bar"}) + ] + + # Test from_documents with id specified in both document and ids + original_document = Document(id="7", page_content="baz") + store = vs_class.from_documents([original_document], embeddings, ids=["6"]) + assert original_document.id == "7" # original document should not be modified + assert store.get_by_ids(["6"]) == [Document(id="6", page_content="baz")] + + +@pytest.mark.parametrize( + "vs_class", [CustomAddTextsVectorstore, CustomAddDocumentsVectorstore] +) +async def test_default_afrom_documents(vs_class: type[VectorStore]) -> None: + embeddings = FakeEmbeddings(size=1) + store = await vs_class.afrom_documents( + [Document(id="1", page_content="hello", metadata={"foo": "bar"})], embeddings + ) + + assert await store.aget_by_ids(["1"]) == [ + Document(id="1", page_content="hello", metadata={"foo": "bar"}) + ] + + # from_documents with ids in args + store = await vs_class.afrom_documents( + [Document(page_content="hello", metadata={"foo": "bar"})], embeddings, ids=["1"] + ) + + assert await store.aget_by_ids(["1"]) == [ + Document(id="1", page_content="hello", metadata={"foo": "bar"}) + ] + + # Test afrom_documents with id 
specified in both document and ids + original_document = Document(id="7", page_content="baz") + store = await vs_class.afrom_documents([original_document], embeddings, ids=["6"]) + assert original_document.id == "7" # original document should not be modified + assert await store.aget_by_ids(["6"]) == [Document(id="6", page_content="baz")]