community[minor]: Added VLite as VectorStore (#20245)

Support [VLite](https://github.com/sdan/vlite) as a new VectorStore
type.

**Description**:
VLite is a simple, blazingly fast vector database (VDB) built on NumPy. It abstracts much of the work involved in using a VDB in a retrieval-augmented generation (RAG) pipeline, such as embedding generation, chunking, and file processing, while still giving developers control over how embeddings are created and stored.

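For reference, a minimal usage sketch, assuming `vlite` is installed (`pip install vlite`); the collection name is illustrative and `FakeEmbeddings` stands in for a real embedding model such as `OpenAIEmbeddings`:

```python
from langchain_community.embeddings import FakeEmbeddings
from langchain_community.vectorstores import VLite

# Any LangChain Embeddings implementation works here; FakeEmbeddings is a stand-in.
embeddings = FakeEmbeddings(size=384)

vlite = VLite.from_texts(
    texts=["foo", "bar", "baz"],
    metadatas=[{"page": "0"}, {"page": "1"}, {"page": "2"}],
    embedding=embeddings,
    collection="my_collection",
)

docs = vlite.similarity_search("foo", k=2)
docs_and_scores = vlite.similarity_search_with_score("foo", k=2)
```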
**Before submitting**:
- Added integration tests [here](c09c2ebd5c/libs/community/tests/integration_tests/vectorstores/test_vlite.py)
- Added an example IPython notebook [here](c09c2ebd5c/docs/docs/integrations/vectorstores/vlite.ipynb)
- Added provider docs on how to use VLite [here](c09c2ebd5c/docs/docs/integrations/providers/vlite.mdx)

**Profiles**

Maintainers: @sdan
Twitter handles: [@sdand](https://x.com/sdand)

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
Commit a7c5e41443 (parent 7824291252)
Authored by sdan on 2024-04-16 18:24:38 -07:00; committed via GitHub
8 changed files with 560 additions and 0 deletions

View File

@@ -265,6 +265,9 @@ if TYPE_CHECKING:
    from langchain_community.vectorstores.vespa import (
        VespaStore,  # noqa: F401
    )
    from langchain_community.vectorstores.vlite import (
        VLite,  # noqa: F401
    )
    from langchain_community.vectorstores.weaviate import (
        Weaviate,  # noqa: F401
    )

@@ -364,6 +367,7 @@ __all__ = [
    "Vectara",
    "VectorStore",
    "VespaStore",
    "VLite",
    "Weaviate",
    "Yellowbrick",
    "ZepVectorStore",

@@ -456,6 +460,7 @@ _module_lookup = {
    "Vectara": "langchain_community.vectorstores.vectara",
    "VectorStore": "langchain_core.vectorstores",
    "VespaStore": "langchain_community.vectorstores.vespa",
    "VLite": "langchain_community.vectorstores.vlite",
    "Weaviate": "langchain_community.vectorstores.weaviate",
    "Yellowbrick": "langchain_community.vectorstores.yellowbrick",
    "ZepVectorStore": "langchain_community.vectorstores.zep",

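These hunks register `VLite` in the `TYPE_CHECKING` import block, in `__all__`, and in the `_module_lookup` table that backs the package's lazy `__getattr__`. A small sketch of what that registration provides (names as above):

```python
from langchain_community import vectorstores

# VLite is advertised alongside the other vector stores...
assert "VLite" in vectorstores.__all__

# ...and the class is only loaded from langchain_community.vectorstores.vlite
# on first attribute access, via the module's __getattr__ / _module_lookup.
VLite = vectorstores.VLite
```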
View File

@@ -0,0 +1,247 @@
from __future__ import annotations

# Standard library imports
from typing import Any, Dict, Iterable, List, Optional, Tuple
from uuid import uuid4

# LangChain imports
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore


class VLite(VectorStore):
    """VLite is a simple and fast vector database for semantic search."""

    def __init__(
        self,
        embedding_function: Embeddings,
        collection: Optional[str] = None,
        **kwargs: Any,
    ):
        super().__init__()
        self.embedding_function = embedding_function
        self.collection = collection or f"vlite_{uuid4().hex}"
        # Third-party imports
        try:
            from vlite import VLite
        except ImportError:
            raise ImportError(
                "Could not import vlite python package. "
                "Please install it with `pip install vlite`."
            )
        self.vlite = VLite(collection=self.collection, **kwargs)

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            kwargs: vectorstore specific parameters

        Returns:
            List of ids from adding the texts into the vectorstore.
        """
        texts = list(texts)
        ids = kwargs.pop("ids", [str(uuid4()) for _ in texts])
        embeddings = self.embedding_function.embed_documents(texts)
        if not metadatas:
            metadatas = [{} for _ in texts]
        data_points = [
            {"text": text, "metadata": metadata, "id": id, "embedding": embedding}
            for text, metadata, id, embedding in zip(texts, metadatas, ids, embeddings)
        ]
        results = self.vlite.add(data_points)
        return [result[0] for result in results]

    def add_documents(
        self,
        documents: List[Document],
        **kwargs: Any,
    ) -> List[str]:
        """Add a list of documents to the vectorstore.

        Args:
            documents: List of documents to add to the vectorstore.
            kwargs: vectorstore specific parameters such as "file_path" for processing
                directly with vlite.

        Returns:
            List of ids from adding the documents into the vectorstore.
        """
        ids = kwargs.pop("ids", [str(uuid4()) for _ in documents])
        texts = []
        metadatas = []
        for doc, id in zip(documents, ids):
            if "file_path" in kwargs:
                # Third-party imports
                try:
                    from vlite.utils import process_file
                except ImportError:
                    raise ImportError(
                        "Could not import vlite python package. "
                        "Please install it with `pip install vlite`."
                    )
                processed_data = process_file(kwargs["file_path"])
                texts.extend(processed_data)
                metadatas.extend([doc.metadata] * len(processed_data))
                ids.extend([f"{id}_{i}" for i in range(len(processed_data))])
            else:
                texts.append(doc.page_content)
                metadatas.append(doc.metadata)
        return self.add_texts(texts, metadatas, ids=ids)

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query.
        """
        docs_and_scores = self.similarity_search_with_score(query, k=k)
        return [doc for doc, _ in docs_and_scores]

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        filter: Optional[Dict[str, str]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter: Filter by metadata. Defaults to None.

        Returns:
            List of Tuples of (doc, score), where score is the similarity score.
        """
        metadata = filter or {}
        embedding = self.embedding_function.embed_query(query)
        results = self.vlite.retrieve(
            text=query,
            top_k=k,
            metadata=metadata,
            return_scores=True,
            embedding=embedding,
        )
        documents_with_scores = [
            (Document(page_content=text, metadata=metadata), score)
            for text, score, metadata in results
        ]
        return documents_with_scores

    def update_document(self, document_id: str, document: Document) -> None:
        """Update an existing document in the vectorstore."""
        self.vlite.update(
            document_id, text=document.page_content, metadata=document.metadata
        )

    def get(self, ids: List[str]) -> List[Document]:
        """Get documents by their IDs."""
        results = self.vlite.get(ids)
        documents = [
            Document(page_content=text, metadata=metadata)
            for text, metadata in results
        ]
        return documents

    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
        """Delete by ids."""
        if ids is not None:
            self.vlite.delete(ids, **kwargs)
            return True
        return None

    @classmethod
    def from_existing_index(
        cls,
        embedding: Embeddings,
        collection: str,
        **kwargs: Any,
    ) -> VLite:
        """Load an existing VLite index.

        Args:
            embedding: Embedding function
            collection: Name of the collection to load.

        Returns:
            VLite vector store.
        """
        vlite = cls(embedding_function=embedding, collection=collection, **kwargs)
        return vlite

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        collection: Optional[str] = None,
        **kwargs: Any,
    ) -> VLite:
        """Construct VLite wrapper from raw documents.

        This is a user-friendly interface that:
            1. Embeds documents.
            2. Adds the documents to the vectorstore.

        This is intended to be a quick way to get started.

        Example:
            .. code-block:: python

                from langchain_community.vectorstores import VLite
                from langchain_community.embeddings import OpenAIEmbeddings

                embeddings = OpenAIEmbeddings()
                vlite = VLite.from_texts(texts, embeddings)
        """
        vlite = cls(embedding_function=embedding, collection=collection, **kwargs)
        vlite.add_texts(texts, metadatas, **kwargs)
        return vlite

    @classmethod
    def from_documents(
        cls,
        documents: List[Document],
        embedding: Embeddings,
        collection: Optional[str] = None,
        **kwargs: Any,
    ) -> VLite:
        """Construct VLite wrapper from a list of documents.

        This is a user-friendly interface that:
            1. Embeds documents.
            2. Adds the documents to the vectorstore.

        This is intended to be a quick way to get started.

        Example:
            .. code-block:: python

                from langchain_community.vectorstores import VLite
                from langchain_community.embeddings import OpenAIEmbeddings

                embeddings = OpenAIEmbeddings()
                vlite = VLite.from_documents(documents, embeddings)
        """
        vlite = cls(embedding_function=embedding, collection=collection, **kwargs)
        vlite.add_documents(documents, **kwargs)
        return vlite

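A short sketch exercising the wrapper's read/write surface above, assuming `vlite` is installed; the collection name, IDs, and the `FakeEmbeddings` stand-in are illustrative:

```python
from langchain_community.embeddings import FakeEmbeddings
from langchain_community.vectorstores import VLite
from langchain_core.documents import Document

store = VLite(embedding_function=FakeEmbeddings(size=384), collection="demo")

# Explicit IDs make the records addressable later for update/get/delete.
store.add_texts(
    ["alpha", "beta"],
    metadatas=[{"n": "1"}, {"n": "2"}],
    ids=["a", "b"],
)

store.update_document("a", Document(page_content="alpha v2", metadata={"n": "1"}))
fetched = store.get(ids=["a"])   # fetch Documents back by ID
store.delete(ids=["b"])          # returns True when IDs are provided
```

`add_documents` additionally accepts a `file_path` kwarg, in which case vlite's `process_file` handles chunking before the resulting chunks are embedded and stored.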
View File

@@ -0,0 +1,88 @@
"""Test VLite functionality."""
from langchain_core.documents import Document
from langchain_community.embeddings import FakeEmbeddings
from langchain_community.vectorstores import VLite
def test_vlite() -> None:
"""Test end to end construction and search."""
texts = ["foo", "bar", "baz"]
docsearch = VLite.from_texts(texts=texts, embedding=FakeEmbeddings())
output = docsearch.similarity_search("foo", k=1)
assert output == [Document(page_content="foo")]
def test_vlite_with_metadatas() -> None:
"""Test end to end construction and search with metadata."""
texts = ["foo", "bar", "baz"]
metadatas = [{"page": str(i)} for i in range(len(texts))]
docsearch = VLite.from_texts(
texts=texts, embedding=FakeEmbeddings(), metadatas=metadatas
)
output = docsearch.similarity_search("foo", k=1)
assert output == [Document(page_content="foo", metadata={"page": "0"})]
def test_vlite_with_metadatas_with_scores() -> None:
"""Test end to end construction and search with metadata and scores."""
texts = ["foo", "bar", "baz"]
metadatas = [{"page": str(i)} for i in range(len(texts))]
docsearch = VLite.from_texts(
texts=texts, embedding=FakeEmbeddings(), metadatas=metadatas
)
output = docsearch.similarity_search_with_score("foo", k=1)
assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)]
def test_vlite_update_document() -> None:
"""Test updating a document."""
texts = ["foo", "bar", "baz"]
docsearch = VLite.from_texts(
texts=texts, embedding=FakeEmbeddings(), ids=["1", "2", "3"]
)
docsearch.update_document("1", Document(page_content="updated_foo"))
output = docsearch.similarity_search("updated_foo", k=1)
assert output == [Document(page_content="updated_foo")]
def test_vlite_delete_document() -> None:
"""Test deleting a document."""
texts = ["foo", "bar", "baz"]
docsearch = VLite.from_texts(
texts=texts, embedding=FakeEmbeddings(), ids=["1", "2", "3"]
)
docsearch.delete(["1"])
output = docsearch.similarity_search("foo", k=3)
assert Document(page_content="foo") not in output
def test_vlite_get_documents() -> None:
"""Test getting documents by IDs."""
texts = ["foo", "bar", "baz"]
metadatas = [{"page": str(i)} for i in range(len(texts))]
docsearch = VLite.from_texts(
texts=texts,
embedding=FakeEmbeddings(),
metadatas=metadatas,
ids=["1", "2", "3"],
)
output = docsearch.get(ids=["1", "3"])
assert output == [
Document(page_content="foo", metadata={"page": "0"}),
Document(page_content="baz", metadata={"page": "2"}),
]
def test_vlite_from_existing_index() -> None:
"""Test loading from an existing index."""
texts = ["foo", "bar", "baz"]
VLite.from_texts(
texts=texts, embedding=FakeEmbeddings(), collection="test_collection"
)
new_docsearch = VLite.from_existing_index(
collection="test_collection", embedding=FakeEmbeddings()
)
output = new_docsearch.similarity_search("foo", k=1)
assert output == [Document(page_content="foo")]

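One case the tests above do not cover is the `filter` argument of `similarity_search_with_score`. A sketch of such a test, with the embedding size passed explicitly and assuming vlite honours the metadata filter the wrapper forwards, could look like:

```python
def test_vlite_similarity_search_with_filter() -> None:
    """Sketch: restrict results to documents whose metadata matches the filter."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = VLite.from_texts(
        texts=texts, embedding=FakeEmbeddings(size=384), metadatas=metadatas
    )
    output = docsearch.similarity_search_with_score("foo", k=3, filter={"page": "0"})
    assert all(doc.metadata == {"page": "0"} for doc, _ in output)
```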
View File

@@ -89,6 +89,7 @@ EXPECTED_ALL = [
    "Vectara",
    "VectorStore",
    "VespaStore",
    "VLite",
    "Weaviate",
    "Yellowbrick",
    "ZepVectorStore",

View File

@@ -88,6 +88,7 @@ def test_compatible_vectorstore_documentation() -> None:
        "VDMS",
        "Vearch",
        "VespaStore",
        "VLite",
        "Weaviate",
        "ZepVectorStore",
        "Zilliz",

View File

@@ -82,6 +82,7 @@ _EXPECTED = [
    "Vearch",
    "Vectara",
    "VespaStore",
    "VLite",
    "Weaviate",
    "ZepVectorStore",
    "Zilliz",