Add: USearch Vector Store (#8835)

## Description

I am excited to propose an integration with USearch, a lightweight
vector-search engine available for Python and JavaScript, among other
languages.
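
As a quick orientation for reviewers, here is a minimal usage sketch based on the `USearch` class added in this PR. `OpenAIEmbeddings` is just a stand-in for any `Embeddings` implementation, and the texts are placeholders:

```python
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import USearch

# Embed a few raw strings and index them with USearch (cosine metric by default).
texts = [
    "USearch is a lightweight vector-search engine.",
    "It is available for Python and JavaScript, among other languages.",
]
store = USearch.from_texts(texts, OpenAIEmbeddings())

# Retrieve the most similar documents, with or without distances.
docs = store.similarity_search("vector search engine", k=1)
docs_and_scores = store.similarity_search_with_score("vector search engine", k=2)
```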

## Dependencies

It introduces a new PyPI dependency, `usearch`. I am unsure whether it
should also be added to the Poetry file, as that would make the PR
clunkier. Please let me know.

## Profiles

- Maintainers: @ashvardanian @davvard
- Twitter handles: @ashvardanian @unum_cloud

---------

Co-authored-by: Davit Vardanyan <78792753+davvard@users.noreply.github.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>

Authored by Ash Vardanian on 2023-08-08 04:41:00 +01:00, committed by GitHub
parent b52a3785c9, commit 1f9124ceaa
4 changed files with 432 additions and 0 deletions

`langchain/vectorstores/__init__.py`

@@ -62,6 +62,7 @@ from langchain.vectorstores.supabase import SupabaseVectorStore
 from langchain.vectorstores.tair import Tair
 from langchain.vectorstores.tigris import Tigris
 from langchain.vectorstores.typesense import Typesense
+from langchain.vectorstores.usearch import USearch
 from langchain.vectorstores.vectara import Vectara
 from langchain.vectorstores.weaviate import Weaviate
 from langchain.vectorstores.zilliz import Zilliz
@@ -120,4 +121,5 @@ __all__ = [
"Weaviate",
"Zilliz",
"PGVector",
"USearch",
]

`langchain/vectorstores/usearch.py` (new file)

@@ -0,0 +1,176 @@
"""Wrapper around USearch vector database."""
from __future__ import annotations
from typing import Any, Dict, Iterable, List, Optional, Tuple
import numpy as np
from langchain.docstore.base import AddableMixin, Docstore
from langchain.docstore.document import Document
from langchain.docstore.in_memory import InMemoryDocstore
from langchain.embeddings.base import Embeddings
from langchain.vectorstores.base import VectorStore


def dependable_usearch_import() -> Any:
    """Import usearch if available, otherwise raise error."""
    try:
        import usearch.index
    except ImportError:
        raise ImportError(
            "Could not import usearch python package. "
            "Please install it with `pip install usearch`."
        )
    return usearch.index


class USearch(VectorStore):
    """Wrapper around USearch vector database.

    To use, you should have the ``usearch`` python package installed.
    """

    def __init__(
        self,
        embedding: Embeddings,
        index: Any,
        docstore: Docstore,
        ids: List[str],
    ):
        """Initialize with necessary components."""
        self.embedding = embedding
        self.index = index
        self.docstore = docstore
        self.ids = ids

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[Dict]] = None,
        ids: Optional[np.ndarray] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional list of unique IDs.

        Returns:
            List of ids from adding the texts into the vectorstore.
        """
        if not isinstance(self.docstore, AddableMixin):
            raise ValueError(
                "If trying to add texts, the underlying docstore should support "
                f"adding items, which {self.docstore} does not"
            )
        embeddings = self.embedding.embed_documents(list(texts))
        documents = []
        for i, text in enumerate(texts):
            metadata = metadatas[i] if metadatas else {}
            documents.append(Document(page_content=text, metadata=metadata))

        last_id = int(self.ids[-1]) + 1
        if ids is None:
            ids = np.array([str(last_id + id) for id, _ in enumerate(texts)])

        self.index.add(np.array(ids), np.array(embeddings))
        self.docstore.add(dict(zip(ids, documents)))
        self.ids.extend(ids)
        return ids.tolist()

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of documents most similar to the query with distance.
        """
        query_embedding = self.embedding.embed_query(query)
        matches = self.index.search(np.array(query_embedding), k)

        docs_with_scores: List[Tuple[Document, float]] = []
        for id, score in zip(matches.keys, matches.distances):
            doc = self.docstore.search(str(id))
            if not isinstance(doc, Document):
                raise ValueError(f"Could not find document for id {id}, got {doc}")
            docs_with_scores.append((doc, score))

        return docs_with_scores

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query.
        """
        query_embedding = self.embedding.embed_query(query)
        matches = self.index.search(np.array(query_embedding), k)

        docs: List[Document] = []
        for id in matches.keys:
            doc = self.docstore.search(str(id))
            if not isinstance(doc, Document):
                raise ValueError(f"Could not find document for id {id}, got {doc}")
            docs.append(doc)

        return docs

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[Dict]] = None,
        ids: Optional[np.ndarray] = None,
        metric: str = "cos",
        **kwargs: Any,
    ) -> USearch:
        """Construct USearch wrapper from raw documents.

        This is a user friendly interface that:
            1. Embeds documents.
            2. Creates an in memory docstore.
            3. Initializes the USearch database.

        This is intended to be a quick way to get started.

        Example:
            .. code-block:: python

                from langchain.vectorstores import USearch
                from langchain.embeddings import OpenAIEmbeddings

                embeddings = OpenAIEmbeddings()
                usearch = USearch.from_texts(texts, embeddings)
        """
        embeddings = embedding.embed_documents(texts)
        documents: List[Document] = []
        if ids is None:
            ids = np.array([str(id) for id, _ in enumerate(texts)])
        for i, text in enumerate(texts):
            metadata = metadatas[i] if metadatas else {}
            documents.append(Document(page_content=text, metadata=metadata))

        docstore = InMemoryDocstore(dict(zip(ids, documents)))
        usearch = dependable_usearch_import()
        index = usearch.Index(ndim=len(embeddings[0]), metric=metric)
        index.add(np.array(ids), np.array(embeddings))
        return cls(embedding, index, docstore, ids.tolist())
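
For reference, a short sketch of how the pieces above fit together: `from_texts` builds the index and the in-memory docstore, `add_texts` appends documents under auto-incremented string ids, and the `metric` keyword is forwarded to `usearch.Index` (the tests below exercise `"cos"`, `"ip"`, and `"l2_sq"`). `OpenAIEmbeddings` again stands in for any `Embeddings` implementation:

```python
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.usearch import USearch

embeddings = OpenAIEmbeddings()

# Build the store with an inner-product index instead of the default cosine metric.
store = USearch.from_texts(
    ["foo", "bar", "baz"],
    embeddings,
    metadatas=[{"source": "demo"}] * 3,
    metric="ip",
)

# Append more documents; ids continue from the last auto-assigned one ("3", "4", ...).
new_ids = store.add_texts(["qux", "quux"])

# Scores returned here are USearch distances, so smaller means more similar.
for doc, distance in store.similarity_search_with_score("foo", k=2):
    print(doc.page_content, distance)
```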

`tests/integration_tests/vectorstores/test_usearch.py` (new file)

@@ -0,0 +1,59 @@
"""Test USearch functionality."""
import pytest
from langchain.docstore.document import Document
from langchain.vectorstores.usearch import USearch
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
def test_usearch_from_texts() -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]
docsearch = USearch.from_texts(texts, FakeEmbeddings())
output = docsearch.similarity_search("foo", k=1)
assert output == [Document(page_content="foo")]
def test_usearch_from_documents() -> None:
"""Test from_documents constructor."""
texts = ["foo", "bar", "baz"]
docs = [Document(page_content=t, metadata={"a": "b"}) for t in texts]
docsearch = USearch.from_documents(docs, FakeEmbeddings())
output = docsearch.similarity_search("foo", k=1)
assert output == [Document(page_content="foo", metadata={"a": "b"})]
def test_usearch_add_texts() -> None:
"""Test adding a new document"""
texts = ["foo", "bar", "baz"]
docsearch = USearch.from_texts(texts, FakeEmbeddings())
docsearch.add_texts(["foo"])
output = docsearch.similarity_search("foo", k=2)
assert output == [Document(page_content="foo"), Document(page_content="foo")]
def test_ip() -> None:
"""Test inner product distance."""
texts = ["foo", "bar", "baz"]
docsearch = USearch.from_texts(texts, FakeEmbeddings(), metric="ip")
output = docsearch.similarity_search_with_score("far", k=2)
_, score = output[1]
assert score == -8.0
def test_l2() -> None:
"""Test Flat L2 distance."""
texts = ["foo", "bar", "baz"]
docsearch = USearch.from_texts(texts, FakeEmbeddings(), metric="l2_sq")
output = docsearch.similarity_search_with_score("far", k=2)
_, score = output[1]
assert score == 1.0
def test_cos() -> None:
"""Test cosine distance."""
texts = ["foo", "bar", "baz"]
docsearch = USearch.from_texts(texts, FakeEmbeddings(), metric="cos")
output = docsearch.similarity_search_with_score("far", k=2)
_, score = output[1]
assert score == pytest.approx(0.05, abs=0.002)
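
The hard-coded scores in `test_ip`, `test_l2`, and `test_cos` only make sense given how the shared `FakeEmbeddings` test helper behaves; its source is not part of this diff, but a hypothetical stand-in along the following lines reproduces the asserted numbers, assuming each indexed text `i` maps to nine 1.0s followed by `float(i)`, every query maps to nine 1.0s followed by 0.0, and USearch reports the `ip` metric as 1 minus the dot product:

```python
from typing import List

import numpy as np


class FakeEmbeddingsSketch:
    """Hypothetical stand-in for the FakeEmbeddings helper imported above."""

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        # Document number i becomes [1, 1, ..., 1, i] in 10 dimensions (assumed behaviour).
        return [[1.0] * 9 + [float(i)] for i in range(len(texts))]

    def embed_query(self, text: str) -> List[float]:
        # Every query becomes [1, 1, ..., 1, 0] (assumed behaviour).
        return [1.0] * 9 + [0.0]


fake = FakeEmbeddingsSketch()
query = np.array(fake.embed_query("far"))
docs = np.array(fake.embed_documents(["foo", "bar", "baz"]))

# Inner-product distance, assuming USearch reports it as 1 - dot(q, d):
# 1 - 9 == -8.0 for every document, matching the -8.0 asserted in test_ip.
print(1.0 - docs @ query)

# Squared L2 distance: [0.0, 1.0, 4.0]; the second-nearest document scores 1.0 (test_l2).
print(((docs - query) ** 2).sum(axis=1))

# Cosine distance: the second-nearest document scores 1 - 9 / (3 * sqrt(10)) ≈ 0.051 (test_cos).
norms = np.linalg.norm(docs, axis=1) * np.linalg.norm(query)
print(1.0 - (docs @ query) / norms)
```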