community[minor]: Added VLite as VectorStore (#20245)

Support [VLite](https://github.com/sdan/vlite) as a new VectorStore
type.

**Description**:
VLite is a simple, blazingly fast vector database (VDB) built on NumPy. It abstracts much of the work involved in using a VDB in a retrieval-augmented generation (RAG) pipeline, such as embedding generation, chunking, and file processing, while still giving developers control over how embeddings are created and stored.

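For reference, a minimal usage sketch, assuming `vlite` is installed (`pip install vlite`); the collection name is illustrative and `FakeEmbeddings` stands in for a real embedding model such as `OpenAIEmbeddings`:

```python
from langchain_community.embeddings import FakeEmbeddings
from langchain_community.vectorstores import VLite

# Any LangChain Embeddings implementation works here; FakeEmbeddings is a stand-in.
embeddings = FakeEmbeddings(size=384)

vlite = VLite.from_texts(
    texts=["foo", "bar", "baz"],
    metadatas=[{"page": "0"}, {"page": "1"}, {"page": "2"}],
    embedding=embeddings,
    collection="my_collection",
)

docs = vlite.similarity_search("foo", k=2)
docs_and_scores = vlite.similarity_search_with_score("foo", k=2)
```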
**Before submitting**:
- Added integration tests [here](c09c2ebd5c/libs/community/tests/integration_tests/vectorstores/test_vlite.py)
- Added an example IPython notebook [here](c09c2ebd5c/docs/docs/integrations/vectorstores/vlite.ipynb)
- Added provider docs on how to use VLite [here](c09c2ebd5c/docs/docs/integrations/providers/vlite.mdx)

**Profiles**

Maintainers: @sdan
Twitter handles: [@sdand](https://x.com/sdand)

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
Commit a7c5e41443 (parent 7824291252)
Authored by sdan on 2024-04-16 18:24:38 -07:00; committed via GitHub
8 changed files with 560 additions and 0 deletions

View File

@@ -265,6 +265,9 @@ if TYPE_CHECKING:
    from langchain_community.vectorstores.vespa import (
        VespaStore,  # noqa: F401
    )
    from langchain_community.vectorstores.vlite import (
        VLite,  # noqa: F401
    )
    from langchain_community.vectorstores.weaviate import (
        Weaviate,  # noqa: F401
    )

@@ -364,6 +367,7 @@ __all__ = [
    "Vectara",
    "VectorStore",
    "VespaStore",
    "VLite",
    "Weaviate",
    "Yellowbrick",
    "ZepVectorStore",

@@ -456,6 +460,7 @@ _module_lookup = {
    "Vectara": "langchain_community.vectorstores.vectara",
    "VectorStore": "langchain_core.vectorstores",
    "VespaStore": "langchain_community.vectorstores.vespa",
    "VLite": "langchain_community.vectorstores.vlite",
    "Weaviate": "langchain_community.vectorstores.weaviate",
    "Yellowbrick": "langchain_community.vectorstores.yellowbrick",
    "ZepVectorStore": "langchain_community.vectorstores.zep",

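These hunks register `VLite` in the `TYPE_CHECKING` import block, in `__all__`, and in the `_module_lookup` table that backs the package's lazy `__getattr__`. A small sketch of what that registration provides (names as above):

```python
from langchain_community import vectorstores

# VLite is advertised alongside the other vector stores...
assert "VLite" in vectorstores.__all__

# ...and the class is only loaded from langchain_community.vectorstores.vlite
# on first attribute access, via the module's __getattr__ / _module_lookup.
VLite = vectorstores.VLite
```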
View File

@@ -0,0 +1,247 @@
from __future__ import annotations

# Standard library imports
from typing import Any, Dict, Iterable, List, Optional, Tuple
from uuid import uuid4

# LangChain imports
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore


class VLite(VectorStore):
    """VLite is a simple and fast vector database for semantic search."""

    def __init__(
        self,
        embedding_function: Embeddings,
        collection: Optional[str] = None,
        **kwargs: Any,
    ):
        super().__init__()
        self.embedding_function = embedding_function
        self.collection = collection or f"vlite_{uuid4().hex}"
        # Third-party imports
        try:
            from vlite import VLite
        except ImportError:
            raise ImportError(
                "Could not import vlite python package. "
                "Please install it with `pip install vlite`."
            )
        self.vlite = VLite(collection=self.collection, **kwargs)

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            kwargs: vectorstore specific parameters

        Returns:
            List of ids from adding the texts into the vectorstore.
        """
        texts = list(texts)
        ids = kwargs.pop("ids", [str(uuid4()) for _ in texts])
        embeddings = self.embedding_function.embed_documents(texts)
        if not metadatas:
            metadatas = [{} for _ in texts]
        data_points = [
            {"text": text, "metadata": metadata, "id": id, "embedding": embedding}
            for text, metadata, id, embedding in zip(texts, metadatas, ids, embeddings)
        ]
        results = self.vlite.add(data_points)
        return [result[0] for result in results]

    def add_documents(
        self,
        documents: List[Document],
        **kwargs: Any,
    ) -> List[str]:
        """Add a list of documents to the vectorstore.

        Args:
            documents: List of documents to add to the vectorstore.
            kwargs: vectorstore specific parameters such as "file_path" for processing
                directly with vlite.

        Returns:
            List of ids from adding the documents into the vectorstore.
        """
        ids = kwargs.pop("ids", [str(uuid4()) for _ in documents])
        texts = []
        metadatas = []
        for doc, id in zip(documents, ids):
            if "file_path" in kwargs:
                # Third-party imports
                try:
                    from vlite.utils import process_file
                except ImportError:
                    raise ImportError(
                        "Could not import vlite python package. "
                        "Please install it with `pip install vlite`."
                    )
                processed_data = process_file(kwargs["file_path"])
                texts.extend(processed_data)
                metadatas.extend([doc.metadata] * len(processed_data))
                ids.extend([f"{id}_{i}" for i in range(len(processed_data))])
            else:
                texts.append(doc.page_content)
                metadatas.append(doc.metadata)
        return self.add_texts(texts, metadatas, ids=ids)

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query.
        """
        docs_and_scores = self.similarity_search_with_score(query, k=k)
        return [doc for doc, _ in docs_and_scores]

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        filter: Optional[Dict[str, str]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter: Filter by metadata. Defaults to None.

        Returns:
            List of Tuples of (doc, score), where score is the similarity score.
        """
        metadata = filter or {}
        embedding = self.embedding_function.embed_query(query)
        results = self.vlite.retrieve(
            text=query,
            top_k=k,
            metadata=metadata,
            return_scores=True,
            embedding=embedding,
        )
        documents_with_scores = [
            (Document(page_content=text, metadata=metadata), score)
            for text, score, metadata in results
        ]
        return documents_with_scores

    def update_document(self, document_id: str, document: Document) -> None:
        """Update an existing document in the vectorstore."""
        self.vlite.update(
            document_id, text=document.page_content, metadata=document.metadata
        )

    def get(self, ids: List[str]) -> List[Document]:
        """Get documents by their IDs."""
        results = self.vlite.get(ids)
        documents = [
            Document(page_content=text, metadata=metadata)
            for text, metadata in results
        ]
        return documents

    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
        """Delete by ids."""
        if ids is not None:
            self.vlite.delete(ids, **kwargs)
            return True
        return None

    @classmethod
    def from_existing_index(
        cls,
        embedding: Embeddings,
        collection: str,
        **kwargs: Any,
    ) -> VLite:
        """Load an existing VLite index.

        Args:
            embedding: Embedding function
            collection: Name of the collection to load.

        Returns:
            VLite vector store.
        """
        vlite = cls(embedding_function=embedding, collection=collection, **kwargs)
        return vlite

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        collection: Optional[str] = None,
        **kwargs: Any,
    ) -> VLite:
        """Construct VLite wrapper from raw documents.

        This is a user-friendly interface that:
            1. Embeds documents.
            2. Adds the documents to the vectorstore.

        This is intended to be a quick way to get started.

        Example:
            .. code-block:: python

                from langchain_community.vectorstores import VLite
                from langchain_community.embeddings import OpenAIEmbeddings

                embeddings = OpenAIEmbeddings()
                vlite = VLite.from_texts(texts, embeddings)
        """
        vlite = cls(embedding_function=embedding, collection=collection, **kwargs)
        vlite.add_texts(texts, metadatas, **kwargs)
        return vlite

    @classmethod
    def from_documents(
        cls,
        documents: List[Document],
        embedding: Embeddings,
        collection: Optional[str] = None,
        **kwargs: Any,
    ) -> VLite:
        """Construct VLite wrapper from a list of documents.

        This is a user-friendly interface that:
            1. Embeds documents.
            2. Adds the documents to the vectorstore.

        This is intended to be a quick way to get started.

        Example:
            .. code-block:: python

                from langchain_community.vectorstores import VLite
                from langchain_community.embeddings import OpenAIEmbeddings

                embeddings = OpenAIEmbeddings()
                vlite = VLite.from_documents(documents, embeddings)
        """
        vlite = cls(embedding_function=embedding, collection=collection, **kwargs)
        vlite.add_documents(documents, **kwargs)
        return vlite

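A short sketch exercising the wrapper's read/write surface above, assuming `vlite` is installed; the collection name, IDs, and the `FakeEmbeddings` stand-in are illustrative:

```python
from langchain_community.embeddings import FakeEmbeddings
from langchain_community.vectorstores import VLite
from langchain_core.documents import Document

store = VLite(embedding_function=FakeEmbeddings(size=384), collection="demo")

# Explicit IDs make the records addressable later for update/get/delete.
store.add_texts(
    ["alpha", "beta"],
    metadatas=[{"n": "1"}, {"n": "2"}],
    ids=["a", "b"],
)

store.update_document("a", Document(page_content="alpha v2", metadata={"n": "1"}))
fetched = store.get(ids=["a"])   # fetch Documents back by ID
store.delete(ids=["b"])          # returns True when IDs are provided
```

`add_documents` additionally accepts a `file_path` kwarg, in which case vlite's `process_file` handles chunking before the resulting chunks are embedded and stored.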
View File

@@ -0,0 +1,88 @@
"""Test VLite functionality."""
from langchain_core.documents import Document
from langchain_community.embeddings import FakeEmbeddings
from langchain_community.vectorstores import VLite
def test_vlite() -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]
docsearch = VLite.from_texts(texts=texts, embedding=FakeEmbeddings())
output = docsearch.similarity_search("foo", k=1)
assert output == [Document(page_content="foo")]
def test_vlite_with_metadatas() -> None:
"""Test end to end construction and search with metadata."""
texts = ["foo", "bar", "baz"]
metadatas = [{"page": str(i)} for i in range(len(texts))]
docsearch = VLite.from_texts(
texts=texts, embedding=FakeEmbeddings(), metadatas=metadatas
)
output = docsearch.similarity_search("foo", k=1)
assert output == [Document(page_content="foo", metadata={"page": "0"})]
def test_vlite_with_metadatas_with_scores() -> None:
"""Test end to end construction and search with metadata and scores."""
texts = ["foo", "bar", "baz"]
metadatas = [{"page": str(i)} for i in range(len(texts))]
docsearch = VLite.from_texts(
texts=texts, embedding=FakeEmbeddings(), metadatas=metadatas
)
output = docsearch.similarity_search_with_score("foo", k=1)
assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)]
def test_vlite_update_document() -> None:
"""Test updating a document."""
texts = ["foo", "bar", "baz"]
docsearch = VLite.from_texts(
texts=texts, embedding=FakeEmbeddings(), ids=["1", "2", "3"]
)
docsearch.update_document("1", Document(page_content="updated_foo"))
output = docsearch.similarity_search("updated_foo", k=1)
assert output == [Document(page_content="updated_foo")]
def test_vlite_delete_document() -> None:
"""Test deleting a document."""
texts = ["foo", "bar", "baz"]
docsearch = VLite.from_texts(
texts=texts, embedding=FakeEmbeddings(), ids=["1", "2", "3"]
)
docsearch.delete(["1"])
output = docsearch.similarity_search("foo", k=3)
assert Document(page_content="foo") not in output
def test_vlite_get_documents() -> None:
"""Test getting documents by IDs."""
texts = ["foo", "bar", "baz"]
metadatas = [{"page": str(i)} for i in range(len(texts))]
docsearch = VLite.from_texts(
texts=texts,
embedding=FakeEmbeddings(),
metadatas=metadatas,
ids=["1", "2", "3"],
)
output = docsearch.get(ids=["1", "3"])
assert output == [
Document(page_content="foo", metadata={"page": "0"}),
Document(page_content="baz", metadata={"page": "2"}),
]
def test_vlite_from_existing_index() -> None:
"""Test loading from an existing index."""
texts = ["foo", "bar", "baz"]
VLite.from_texts(
texts=texts, embedding=FakeEmbeddings(), collection="test_collection"
)
new_docsearch = VLite.from_existing_index(
collection="test_collection", embedding=FakeEmbeddings()
)
output = new_docsearch.similarity_search("foo", k=1)
assert output == [Document(page_content="foo")]

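One case the tests above do not cover is the `filter` argument of `similarity_search_with_score`. A sketch of such a test, with the embedding size passed explicitly and assuming vlite honours the metadata filter the wrapper forwards, could look like:

```python
def test_vlite_similarity_search_with_filter() -> None:
    """Sketch: restrict results to documents whose metadata matches the filter."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = VLite.from_texts(
        texts=texts, embedding=FakeEmbeddings(size=384), metadatas=metadatas
    )
    output = docsearch.similarity_search_with_score("foo", k=3, filter={"page": "0"})
    assert all(doc.metadata == {"page": "0"} for doc, _ in output)
```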
View File

@@ -89,6 +89,7 @@ EXPECTED_ALL = [
    "Vectara",
    "VectorStore",
    "VespaStore",
    "VLite",
    "Weaviate",
    "Yellowbrick",
    "ZepVectorStore",

View File

@@ -88,6 +88,7 @@ def test_compatible_vectorstore_documentation() -> None:
        "VDMS",
        "Vearch",
        "VespaStore",
        "VLite",
        "Weaviate",
        "ZepVectorStore",
        "Zilliz",

View File

@@ -82,6 +82,7 @@ _EXPECTED = [
    "Vearch",
    "Vectara",
    "VespaStore",
    "VLite",
    "Weaviate",
    "ZepVectorStore",
    "Zilliz",