mirror of
https://github.com/hwchase17/langchain.git
synced 2025-04-28 11:55:21 +00:00
core[minor]: Improve support for id in VectorStore (#26660)
Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
parent
931ce8d026
commit
c4ebccfec2
@ -1,6 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
||||
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
from langchain_core.documents import Document
|
||||
@ -42,7 +42,7 @@ class USearch(VectorStore):
|
||||
self,
|
||||
texts: Iterable[str],
|
||||
metadatas: Optional[List[Dict]] = None,
|
||||
ids: Optional[np.ndarray] = None,
|
||||
ids: Optional[Union[np.ndarray, list[str]]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[str]:
|
||||
"""Run more texts through the embeddings and add to the vectorstore.
|
||||
@ -69,6 +69,8 @@ class USearch(VectorStore):
|
||||
last_id = int(self.ids[-1]) + 1
|
||||
if ids is None:
|
||||
ids = np.array([str(last_id + id) for id, _ in enumerate(texts)])
|
||||
elif isinstance(ids, list):
|
||||
ids = np.array(ids)
|
||||
|
||||
self.index.add(np.array(ids), np.array(embeddings))
|
||||
self.docstore.add(dict(zip(ids, documents)))
|
||||
@ -134,7 +136,7 @@ class USearch(VectorStore):
|
||||
texts: List[str],
|
||||
embedding: Embeddings,
|
||||
metadatas: Optional[List[Dict]] = None,
|
||||
ids: Optional[np.ndarray] = None,
|
||||
ids: Optional[Union[np.ndarray, list[str]]] = None,
|
||||
metric: str = "cos",
|
||||
**kwargs: Any,
|
||||
) -> USearch:
|
||||
@ -159,6 +161,8 @@ class USearch(VectorStore):
|
||||
documents: List[Document] = []
|
||||
if ids is None:
|
||||
ids = np.array([str(id) for id, _ in enumerate(texts)])
|
||||
elif isinstance(ids, list):
|
||||
ids = np.array(ids)
|
||||
for i, text in enumerate(texts):
|
||||
metadata = metadatas[i] if metadatas else {}
|
||||
documents.append(Document(page_content=text, metadata=metadata))
|
||||
|
@ -25,7 +25,7 @@ import logging
|
||||
import math
|
||||
import warnings
|
||||
from abc import ABC, abstractmethod
|
||||
from collections.abc import Collection, Iterable, Sequence
|
||||
from collections.abc import Collection, Iterable, Iterator, Sequence
|
||||
from itertools import cycle
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
@ -61,10 +61,8 @@ class VectorStore(ABC):
|
||||
self,
|
||||
texts: Iterable[str],
|
||||
metadatas: Optional[list[dict]] = None,
|
||||
# One of the kwargs should be `ids` which is a list of ids
|
||||
# associated with the texts.
|
||||
# This is not yet enforced in the type signature for backwards compatibility
|
||||
# with existing implementations.
|
||||
*,
|
||||
ids: Optional[list[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> list[str]:
|
||||
"""Run more texts through the embeddings and add to the vectorstore.
|
||||
@ -72,6 +70,7 @@ class VectorStore(ABC):
|
||||
Args:
|
||||
texts: Iterable of strings to add to the vectorstore.
|
||||
metadatas: Optional list of metadatas associated with the texts.
|
||||
ids: Optional list of IDs associated with the texts.
|
||||
**kwargs: vectorstore specific parameters.
|
||||
One of the kwargs should be `ids` which is a list of ids
|
||||
associated with the texts.
|
||||
@ -99,10 +98,14 @@ class VectorStore(ABC):
|
||||
f"Got {len(metadatas)} metadatas and {len(texts_)} texts."
|
||||
)
|
||||
metadatas_ = iter(metadatas) if metadatas else cycle([{}])
|
||||
ids_: Iterator[Optional[str]] = iter(ids) if ids else cycle([None])
|
||||
docs = [
|
||||
Document(page_content=text, metadata=metadata_)
|
||||
for text, metadata_ in zip(texts, metadatas_)
|
||||
Document(id=id_, page_content=text, metadata=metadata_)
|
||||
for text, metadata_, id_ in zip(texts, metadatas_, ids_)
|
||||
]
|
||||
if ids is not None:
|
||||
# For backward compatibility
|
||||
kwargs["ids"] = ids
|
||||
|
||||
return self.add_documents(docs, **kwargs)
|
||||
raise NotImplementedError(
|
||||
@ -206,6 +209,8 @@ class VectorStore(ABC):
|
||||
self,
|
||||
texts: Iterable[str],
|
||||
metadatas: Optional[list[dict]] = None,
|
||||
*,
|
||||
ids: Optional[list[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> list[str]:
|
||||
"""Async run more texts through the embeddings and add to the vectorstore.
|
||||
@ -214,6 +219,7 @@ class VectorStore(ABC):
|
||||
texts: Iterable of strings to add to the vectorstore.
|
||||
metadatas: Optional list of metadatas associated with the texts.
|
||||
Default is None.
|
||||
ids: Optional list
|
||||
**kwargs: vectorstore specific parameters.
|
||||
|
||||
Returns:
|
||||
@ -223,6 +229,9 @@ class VectorStore(ABC):
|
||||
ValueError: If the number of metadatas does not match the number of texts.
|
||||
ValueError: If the number of ids does not match the number of texts.
|
||||
"""
|
||||
if ids is not None:
|
||||
# For backward compatibility
|
||||
kwargs["ids"] = ids
|
||||
if type(self).aadd_documents != VectorStore.aadd_documents:
|
||||
# Import document in local scope to avoid circular imports
|
||||
from langchain_core.documents import Document
|
||||
@ -239,12 +248,12 @@ class VectorStore(ABC):
|
||||
f"Got {len(metadatas)} metadatas and {len(texts_)} texts."
|
||||
)
|
||||
metadatas_ = iter(metadatas) if metadatas else cycle([{}])
|
||||
ids_: Iterator[Optional[str]] = iter(ids) if ids else cycle([None])
|
||||
|
||||
docs = [
|
||||
Document(page_content=text, metadata=metadata_)
|
||||
for text, metadata_ in zip(texts, metadatas_)
|
||||
Document(id=id_, page_content=text, metadata=metadata_)
|
||||
for text, metadata_, id_ in zip(texts, metadatas_, ids_)
|
||||
]
|
||||
|
||||
return await self.aadd_documents(docs, **kwargs)
|
||||
return await run_in_executor(None, self.add_texts, texts, metadatas, **kwargs)
|
||||
|
||||
@ -827,6 +836,15 @@ class VectorStore(ABC):
|
||||
"""
|
||||
texts = [d.page_content for d in documents]
|
||||
metadatas = [d.metadata for d in documents]
|
||||
|
||||
if "ids" not in kwargs:
|
||||
ids = [doc.id for doc in documents]
|
||||
|
||||
# If there's at least one valid ID, we'll assume that IDs
|
||||
# should be used.
|
||||
if any(ids):
|
||||
kwargs["ids"] = ids
|
||||
|
||||
return cls.from_texts(texts, embedding, metadatas=metadatas, **kwargs)
|
||||
|
||||
@classmethod
|
||||
@ -848,6 +866,15 @@ class VectorStore(ABC):
|
||||
"""
|
||||
texts = [d.page_content for d in documents]
|
||||
metadatas = [d.metadata for d in documents]
|
||||
|
||||
if "ids" not in kwargs:
|
||||
ids = [doc.id for doc in documents]
|
||||
|
||||
# If there's at least one valid ID, we'll assume that IDs
|
||||
# should be used.
|
||||
if any(ids):
|
||||
kwargs["ids"] = ids
|
||||
|
||||
return await cls.afrom_texts(texts, embedding, metadatas=metadatas, **kwargs)
|
||||
|
||||
@classmethod
|
||||
@ -857,6 +884,8 @@ class VectorStore(ABC):
|
||||
texts: list[str],
|
||||
embedding: Embeddings,
|
||||
metadatas: Optional[list[dict]] = None,
|
||||
*,
|
||||
ids: Optional[list[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> VST:
|
||||
"""Return VectorStore initialized from texts and embeddings.
|
||||
@ -866,6 +895,7 @@ class VectorStore(ABC):
|
||||
embedding: Embedding function to use.
|
||||
metadatas: Optional list of metadatas associated with the texts.
|
||||
Default is None.
|
||||
ids: Optional list of IDs associated with the texts.
|
||||
kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
@ -878,6 +908,8 @@ class VectorStore(ABC):
|
||||
texts: list[str],
|
||||
embedding: Embeddings,
|
||||
metadatas: Optional[list[dict]] = None,
|
||||
*,
|
||||
ids: Optional[list[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> VST:
|
||||
"""Async return VectorStore initialized from texts and embeddings.
|
||||
@ -887,11 +919,14 @@ class VectorStore(ABC):
|
||||
embedding: Embedding function to use.
|
||||
metadatas: Optional list of metadatas associated with the texts.
|
||||
Default is None.
|
||||
ids: Optional list of IDs associated with the texts.
|
||||
kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
VectorStore: VectorStore initialized from texts and embeddings.
|
||||
"""
|
||||
if ids is not None:
|
||||
kwargs["ids"] = ids
|
||||
return await run_in_executor(
|
||||
None, cls.from_texts, texts, embedding, metadatas, **kwargs
|
||||
)
|
||||
|
@ -10,8 +10,10 @@ import uuid
|
||||
from collections.abc import Iterable, Sequence
|
||||
from typing import Any, Optional
|
||||
|
||||
import pytest
|
||||
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.embeddings import Embeddings, FakeEmbeddings
|
||||
from langchain_core.vectorstores import VectorStore
|
||||
|
||||
|
||||
@ -25,10 +27,6 @@ class CustomAddTextsVectorstore(VectorStore):
|
||||
self,
|
||||
texts: Iterable[str],
|
||||
metadatas: Optional[list[dict]] = None,
|
||||
# One of the kwargs should be `ids` which is a list of ids
|
||||
# associated with the texts.
|
||||
# This is not yet enforced in the type signature for backwards compatibility
|
||||
# with existing implementations.
|
||||
ids: Optional[list[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> list[str]:
|
||||
@ -68,12 +66,59 @@ class CustomAddTextsVectorstore(VectorStore):
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
def test_default_add_documents() -> None:
|
||||
class CustomAddDocumentsVectorstore(VectorStore):
|
||||
"""A vectorstore that only implements add documents."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.store: dict[str, Document] = {}
|
||||
|
||||
def add_documents(
|
||||
self,
|
||||
documents: list[Document],
|
||||
*,
|
||||
ids: Optional[list[str]] = None,
|
||||
**kwargs: Any,
|
||||
) -> list[str]:
|
||||
ids_ = []
|
||||
ids_iter = iter(ids or [])
|
||||
for document in documents:
|
||||
id_ = next(ids_iter) if ids else document.id or str(uuid.uuid4())
|
||||
self.store[id_] = Document(
|
||||
id=id_, page_content=document.page_content, metadata=document.metadata
|
||||
)
|
||||
ids_.append(id_)
|
||||
return ids_
|
||||
|
||||
def get_by_ids(self, ids: Sequence[str], /) -> list[Document]:
|
||||
return [self.store[id] for id in ids if id in self.store]
|
||||
|
||||
@classmethod
|
||||
def from_texts( # type: ignore
|
||||
cls,
|
||||
texts: list[str],
|
||||
embedding: Embeddings,
|
||||
metadatas: Optional[list[dict]] = None,
|
||||
**kwargs: Any,
|
||||
) -> CustomAddDocumentsVectorstore:
|
||||
vectorstore = CustomAddDocumentsVectorstore()
|
||||
vectorstore.add_texts(texts, metadatas=metadatas, **kwargs)
|
||||
return vectorstore
|
||||
|
||||
def similarity_search(
|
||||
self, query: str, k: int = 4, **kwargs: Any
|
||||
) -> list[Document]:
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"vs_class", [CustomAddTextsVectorstore, CustomAddDocumentsVectorstore]
|
||||
)
|
||||
def test_default_add_documents(vs_class: type[VectorStore]) -> None:
|
||||
"""Test that we can implement the upsert method of the CustomVectorStore
|
||||
class without violating the Liskov Substitution Principle.
|
||||
"""
|
||||
|
||||
store = CustomAddTextsVectorstore()
|
||||
store = vs_class()
|
||||
|
||||
# Check upsert with id
|
||||
assert store.add_documents([Document(id="1", page_content="hello")]) == ["1"]
|
||||
@ -95,8 +140,11 @@ def test_default_add_documents() -> None:
|
||||
assert store.get_by_ids(["6"]) == [Document(id="6", page_content="baz")]
|
||||
|
||||
|
||||
def test_default_add_texts() -> None:
|
||||
store = CustomAddTextsVectorstore()
|
||||
@pytest.mark.parametrize(
|
||||
"vs_class", [CustomAddTextsVectorstore, CustomAddDocumentsVectorstore]
|
||||
)
|
||||
def test_default_add_texts(vs_class: type[VectorStore]) -> None:
|
||||
store = vs_class()
|
||||
# Check that default implementation of add_texts works
|
||||
assert store.add_texts(["hello", "world"], ids=["3", "4"]) == ["3", "4"]
|
||||
|
||||
@ -122,9 +170,12 @@ def test_default_add_texts() -> None:
|
||||
]
|
||||
|
||||
|
||||
async def test_default_aadd_documents() -> None:
|
||||
@pytest.mark.parametrize(
|
||||
"vs_class", [CustomAddTextsVectorstore, CustomAddDocumentsVectorstore]
|
||||
)
|
||||
async def test_default_aadd_documents(vs_class: type[VectorStore]) -> None:
|
||||
"""Test delegation to the synchronous method."""
|
||||
store = CustomAddTextsVectorstore()
|
||||
store = vs_class()
|
||||
|
||||
# Check upsert with id
|
||||
assert await store.aadd_documents([Document(id="1", page_content="hello")]) == ["1"]
|
||||
@ -146,10 +197,13 @@ async def test_default_aadd_documents() -> None:
|
||||
assert await store.aget_by_ids(["6"]) == [Document(id="6", page_content="baz")]
|
||||
|
||||
|
||||
async def test_default_aadd_texts() -> None:
|
||||
@pytest.mark.parametrize(
|
||||
"vs_class", [CustomAddTextsVectorstore, CustomAddDocumentsVectorstore]
|
||||
)
|
||||
async def test_default_aadd_texts(vs_class: type[VectorStore]) -> None:
|
||||
"""Test delegation to the synchronous method."""
|
||||
store = CustomAddTextsVectorstore()
|
||||
# Check that default implementation of add_texts works
|
||||
store = vs_class()
|
||||
# Check that default implementation of aadd_texts works
|
||||
assert await store.aadd_texts(["hello", "world"], ids=["3", "4"]) == ["3", "4"]
|
||||
|
||||
assert await store.aget_by_ids(["3", "4"]) == [
|
||||
@ -172,3 +226,61 @@ async def test_default_aadd_texts() -> None:
|
||||
Document(id=ids_2[0], page_content="foo", metadata={"foo": "bar"}),
|
||||
Document(id=ids_2[1], page_content="bar", metadata={"foo": "bar"}),
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"vs_class", [CustomAddTextsVectorstore, CustomAddDocumentsVectorstore]
|
||||
)
|
||||
def test_default_from_documents(vs_class: type[VectorStore]) -> None:
|
||||
embeddings = FakeEmbeddings(size=1)
|
||||
store = vs_class.from_documents(
|
||||
[Document(id="1", page_content="hello", metadata={"foo": "bar"})], embeddings
|
||||
)
|
||||
|
||||
assert store.get_by_ids(["1"]) == [
|
||||
Document(id="1", page_content="hello", metadata={"foo": "bar"})
|
||||
]
|
||||
|
||||
# from_documents with ids in args
|
||||
store = vs_class.from_documents(
|
||||
[Document(page_content="hello", metadata={"foo": "bar"})], embeddings, ids=["1"]
|
||||
)
|
||||
|
||||
assert store.get_by_ids(["1"]) == [
|
||||
Document(id="1", page_content="hello", metadata={"foo": "bar"})
|
||||
]
|
||||
|
||||
# Test from_documents with id specified in both document and ids
|
||||
original_document = Document(id="7", page_content="baz")
|
||||
store = vs_class.from_documents([original_document], embeddings, ids=["6"])
|
||||
assert original_document.id == "7" # original document should not be modified
|
||||
assert store.get_by_ids(["6"]) == [Document(id="6", page_content="baz")]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"vs_class", [CustomAddTextsVectorstore, CustomAddDocumentsVectorstore]
|
||||
)
|
||||
async def test_default_afrom_documents(vs_class: type[VectorStore]) -> None:
|
||||
embeddings = FakeEmbeddings(size=1)
|
||||
store = await vs_class.afrom_documents(
|
||||
[Document(id="1", page_content="hello", metadata={"foo": "bar"})], embeddings
|
||||
)
|
||||
|
||||
assert await store.aget_by_ids(["1"]) == [
|
||||
Document(id="1", page_content="hello", metadata={"foo": "bar"})
|
||||
]
|
||||
|
||||
# from_documents with ids in args
|
||||
store = await vs_class.afrom_documents(
|
||||
[Document(page_content="hello", metadata={"foo": "bar"})], embeddings, ids=["1"]
|
||||
)
|
||||
|
||||
assert await store.aget_by_ids(["1"]) == [
|
||||
Document(id="1", page_content="hello", metadata={"foo": "bar"})
|
||||
]
|
||||
|
||||
# Test afrom_documents with id specified in both document and ids
|
||||
original_document = Document(id="7", page_content="baz")
|
||||
store = await vs_class.afrom_documents([original_document], embeddings, ids=["6"])
|
||||
assert original_document.id == "7" # original document should not be modified
|
||||
assert await store.aget_by_ids(["6"]) == [Document(id="6", page_content="baz")]
|
||||
|
Loading…
Reference in New Issue
Block a user