community[minor]: Add indexing via locality sensitive hashing to the Yellowbrick vector store (#20856)

- **Description:** Add LSH-based indexing to the Yellowbrick vector
store module
- **Twitter handle:** @markcusack

---------

Co-authored-by: markcusack <markcusack@markcusacksmac.lan>
Co-authored-by: markcusack <markcusack@Mark-Cusack-sMac.local>
Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
This commit is contained in:
Mark Cusack 2024-05-06 16:18:02 -04:00 committed by GitHub
parent a2fdabdad2
commit 060987d755
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 1092 additions and 167 deletions

View File

@ -98,7 +98,7 @@
"import psycopg2\n",
"from IPython.display import Markdown, display\n",
"from langchain.chains import LLMChain, RetrievalQAWithSourcesChain\n",
"from langchain_community.docstore.document import Document\n",
"from langchain.schema import Document\n",
"from langchain_community.vectorstores import Yellowbrick\n",
"from langchain_openai import ChatOpenAI, OpenAIEmbeddings\n",
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
@ -209,14 +209,12 @@
"\n",
"# Define the SQL statement to create a table\n",
"create_table_query = f\"\"\"\n",
"CREATE TABLE if not exists {embedding_table} (\n",
" id uuid,\n",
" embedding_id integer,\n",
" text character varying(60000),\n",
" metadata character varying(1024),\n",
" embedding double precision\n",
"CREATE TABLE IF NOT EXISTS {embedding_table} (\n",
" doc_id uuid NOT NULL,\n",
" embedding_id smallint NOT NULL,\n",
" embedding double precision NOT NULL\n",
")\n",
"DISTRIBUTE ON (id);\n",
"DISTRIBUTE ON (doc_id);\n",
"truncate table {embedding_table};\n",
"\"\"\"\n",
"\n",
@ -257,6 +255,8 @@
" f\"postgres://{urlparse.quote(YBUSER)}:{YBPASSWORD}@{YBHOST}:5432/{YB_DOC_DATABASE}\"\n",
")\n",
"\n",
"print(yellowbrick_doc_connection_string)\n",
"\n",
"# Establish a connection to the Yellowbrick database\n",
"conn = psycopg2.connect(yellowbrick_doc_connection_string)\n",
"\n",
@ -324,7 +324,7 @@
"vector_store = Yellowbrick.from_documents(\n",
" documents=split_docs,\n",
" embedding=embeddings,\n",
" connection_string=yellowbrick_connection_string,\n",
" connection_info=yellowbrick_connection_string,\n",
" table=embedding_table,\n",
")\n",
"\n",
@ -403,6 +403,88 @@
"print_result_sources(\"Whats an easy way to add users in bulk to Yellowbrick?\")"
]
},
{
"cell_type": "markdown",
"id": "1f39fd30",
"metadata": {},
"source": [
"## Part 6: Introducing an Index to Increase Performance\n",
"\n",
"Yellowbrick also supports indexing using the Locality-Sensitive Hashing approach. This is an approximate nearest-neighbor search technique, and allows one to trade off similarity search time at the expense of accuracy. The index introduces two new tunable parameters:\n",
"\n",
"- The number of hyperplanes, which is provided as an argument to `create_lsh_index(num_hyperplanes)`. The more documents, the more hyperplanes are needed. LSH is a form of dimensionality reduction. The original embeddings are transformed into lower dimensional vectors where the number of components is the same as the number of hyperplanes.\n",
"- The Hamming distance, an integer representing the breadth of the search. Smaller Hamming distances result in faster retreival but lower accuracy.\n",
"\n",
"Here's how you can create an index on the embeddings we loaded into Yellowbrick. We'll also re-run the previous chat session, but this time the retrieval will use the index. Note that for such a small number of documents, you won't see the benefit of indexing in terms of performance."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "02ba61c4",
"metadata": {},
"outputs": [],
"source": [
"system_template = \"\"\"Use the following pieces of context to answer the users question.\n",
"Take note of the sources and include them in the answer in the format: \"SOURCES: source1 source2\", use \"SOURCES\" in capital letters regardless of the number of sources.\n",
"If you don't know the answer, just say that \"I don't know\", don't try to make up an answer.\n",
"----------------\n",
"{summaries}\"\"\"\n",
"messages = [\n",
" SystemMessagePromptTemplate.from_template(system_template),\n",
" HumanMessagePromptTemplate.from_template(\"{question}\"),\n",
"]\n",
"prompt = ChatPromptTemplate.from_messages(messages)\n",
"\n",
"vector_store = Yellowbrick(\n",
" OpenAIEmbeddings(),\n",
" yellowbrick_connection_string,\n",
" embedding_table, # Change the table name to reflect your embeddings\n",
")\n",
"\n",
"lsh_params = Yellowbrick.IndexParams(\n",
" Yellowbrick.IndexType.LSH, {\"num_hyperplanes\": 8, \"hamming_distance\": 2}\n",
")\n",
"vector_store.create_index(lsh_params)\n",
"\n",
"chain_type_kwargs = {\"prompt\": prompt}\n",
"llm = ChatOpenAI(\n",
" model_name=\"gpt-3.5-turbo\", # Modify model_name if you have access to GPT-4\n",
" temperature=0,\n",
" max_tokens=256,\n",
")\n",
"chain = RetrievalQAWithSourcesChain.from_chain_type(\n",
" llm=llm,\n",
" chain_type=\"stuff\",\n",
" retriever=vector_store.as_retriever(\n",
" k=5, search_kwargs={\"index_params\": lsh_params}\n",
" ),\n",
" return_source_documents=True,\n",
" chain_type_kwargs=chain_type_kwargs,\n",
")\n",
"\n",
"\n",
"def print_result_sources(query):\n",
" result = chain(query)\n",
" output_text = f\"\"\"### Question: \n",
" {query}\n",
" ### Answer: \n",
" {result['answer']}\n",
" ### Sources: \n",
" {result['sources']}\n",
" ### All relevant sources:\n",
" {', '.join(list(set([doc.metadata['source'] for doc in result['source_documents']])))}\n",
" \"\"\"\n",
" display(Markdown(output_text))\n",
"\n",
"\n",
"# Use the chain to query\n",
"\n",
"print_result_sources(\"How many databases can be in a Yellowbrick Instance?\")\n",
"\n",
"print_result_sources(\"Whats an easy way to add users in bulk to Yellowbrick?\")"
]
},
{
"cell_type": "markdown",
"id": "697c8a38",
@ -418,9 +500,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "langchain_venv",
"display_name": "Python 3",
"language": "python",
"name": "langchain_venv"
"name": "python3"
},
"language_info": {
"codemirror_mode": {

View File

@ -60,7 +60,7 @@
" * document addition by id (`add_documents` method with `ids` argument)\n",
" * delete by id (`delete` method with `ids` argument)\n",
"\n",
"Compatible Vectorstores: `AnalyticDB`, `AstraDB`, `AzureCosmosDBVectorSearch`, `AzureSearch`, `AwaDB`, `Bagel`, `Cassandra`, `Chroma`, `CouchbaseVectorStore`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `LanceDB`, `Milvus`, `MyScale`, `OpenSearchVectorSearch`, `PGVector`, `Pinecone`, `Qdrant`, `Redis`, `Rockset`, `ScaNN`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `UpstashVectorStore`, `Vald`, `VDMS`, `Vearch`, `VespaStore`, `Weaviate`, `ZepVectorStore`, `TencentVectorDB`, `OpenSearchVectorSearch`.\n",
"Compatible Vectorstores: `AnalyticDB`, `AstraDB`, `AzureCosmosDBVectorSearch`, `AzureSearch`, `AwaDB`, `Bagel`, `Cassandra`, `Chroma`, `CouchbaseVectorStore`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `LanceDB`, `Milvus`, `MyScale`, `OpenSearchVectorSearch`, `PGVector`, `Pinecone`, `Qdrant`, `Redis`, `Rockset`, `ScaNN`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `UpstashVectorStore`, `Vald`, `VDMS`, `Vearch`, `VespaStore`, `Weaviate`, `ZepVectorStore`, `TencentVectorDB`, `OpenSearchVectorSearch`, `Yellowbrick`.\n",
" \n",
"## Caution\n",
"\n",

View File

@ -1,3 +1,4 @@
import logging
from typing import List, Optional
import pytest
@ -5,60 +6,256 @@ import pytest
from langchain_community.docstore.document import Document
from langchain_community.vectorstores import Yellowbrick
from tests.integration_tests.vectorstores.fake_embeddings import (
FakeEmbeddings,
ConsistentFakeEmbeddings,
fake_texts,
)
YELLOWBRICK_URL = "postgres://username:password@host:port/database"
YELLOWBRICK_TABLE = "test_table"
YELLOWBRICK_CONTENT = "test_table_content"
YELLOWBRICK_SCHEMA = "test_schema"
def _yellowbrick_vector_from_texts(
metadatas: Optional[List[dict]] = None, drop: bool = True
) -> Yellowbrick:
return Yellowbrick.from_texts(
db = Yellowbrick.from_texts(
fake_texts,
FakeEmbeddings(),
ConsistentFakeEmbeddings(),
metadatas,
YELLOWBRICK_URL,
YELLOWBRICK_TABLE,
table=YELLOWBRICK_TABLE,
schema=YELLOWBRICK_SCHEMA,
drop=drop,
)
db.logger.setLevel(logging.DEBUG)
return db
def _yellowbrick_vector_from_texts_no_schema(
metadatas: Optional[List[dict]] = None, drop: bool = True
) -> Yellowbrick:
db = Yellowbrick.from_texts(
fake_texts,
ConsistentFakeEmbeddings(),
metadatas,
YELLOWBRICK_URL,
table=YELLOWBRICK_TABLE,
drop=drop,
)
db.logger.setLevel(logging.DEBUG)
return db
@pytest.mark.requires("yb-vss")
def test_yellowbrick() -> None:
"""Test end to end construction and search."""
docsearch = _yellowbrick_vector_from_texts()
output = docsearch.similarity_search("foo", k=1)
docsearch.drop(YELLOWBRICK_TABLE)
assert output == [Document(page_content="foo", metadata={})]
docsearches = [
_yellowbrick_vector_from_texts(),
_yellowbrick_vector_from_texts_no_schema(),
]
for docsearch in docsearches:
output = docsearch.similarity_search("foo", k=1)
assert output == [Document(page_content="foo", metadata={})]
docsearch.drop(table=YELLOWBRICK_TABLE, schema=docsearch._schema)
docsearch.drop(table=YELLOWBRICK_CONTENT, schema=docsearch._schema)
@pytest.mark.requires("yb-vss")
def test_yellowbrick_add_text() -> None:
"""Test end to end construction and search."""
docsearches = [
_yellowbrick_vector_from_texts(),
_yellowbrick_vector_from_texts_no_schema(),
]
for docsearch in docsearches:
output = docsearch.similarity_search("foo", k=1)
assert output == [Document(page_content="foo", metadata={})]
texts = ["oof"]
docsearch.add_texts(texts)
output = docsearch.similarity_search("oof", k=1)
assert output == [Document(page_content="oof", metadata={})]
docsearch.drop(table=YELLOWBRICK_TABLE, schema=docsearch._schema)
docsearch.drop(table=YELLOWBRICK_CONTENT, schema=docsearch._schema)
@pytest.mark.requires("yb-vss")
def test_yellowbrick_delete() -> None:
"""Test end to end construction and search."""
docsearches = [
_yellowbrick_vector_from_texts(),
_yellowbrick_vector_from_texts_no_schema(),
]
for docsearch in docsearches:
output = docsearch.similarity_search("foo", k=1)
assert output == [Document(page_content="foo", metadata={})]
texts = ["oof"]
added_docs = docsearch.add_texts(texts)
output = docsearch.similarity_search("oof", k=1)
assert output == [Document(page_content="oof", metadata={})]
docsearch.delete(added_docs)
output = docsearch.similarity_search("oof", k=1)
assert output != [Document(page_content="oof", metadata={})]
docsearch.drop(table=YELLOWBRICK_TABLE, schema=docsearch._schema)
docsearch.drop(table=YELLOWBRICK_CONTENT, schema=docsearch._schema)
@pytest.mark.requires("yb-vss")
def test_yellowbrick_delete_all() -> None:
"""Test end to end construction and search."""
docsearches = [
_yellowbrick_vector_from_texts(),
_yellowbrick_vector_from_texts_no_schema(),
]
for docsearch in docsearches:
output = docsearch.similarity_search("foo", k=1)
assert output == [Document(page_content="foo", metadata={})]
texts = ["oof"]
docsearch.add_texts(texts)
output = docsearch.similarity_search("oof", k=1)
assert output == [Document(page_content="oof", metadata={})]
docsearch.delete(delete_all=True)
output = docsearch.similarity_search("oof", k=1)
assert output != [Document(page_content="oof", metadata={})]
output = docsearch.similarity_search("foo", k=1)
assert output != [Document(page_content="foo", metadata={})]
docsearch.drop(table=YELLOWBRICK_TABLE, schema=docsearch._schema)
docsearch.drop(table=YELLOWBRICK_CONTENT, schema=docsearch._schema)
@pytest.mark.requires("yb-vss")
def test_yellowbrick_lsh_search() -> None:
"""Test end to end construction and search."""
docsearches = [
_yellowbrick_vector_from_texts(),
_yellowbrick_vector_from_texts_no_schema(),
]
for docsearch in docsearches:
index_params = Yellowbrick.IndexParams(
Yellowbrick.IndexType.LSH, {"num_hyperplanes": 10, "hamming_distance": 0}
)
docsearch.drop_index(index_params)
docsearch.create_index(index_params)
output = docsearch.similarity_search("foo", k=1, index_params=index_params)
assert output == [Document(page_content="foo", metadata={})]
docsearch.drop(table=YELLOWBRICK_TABLE, schema=docsearch._schema)
docsearch.drop(table=YELLOWBRICK_CONTENT, schema=docsearch._schema)
docsearch.drop_index(index_params=index_params)
@pytest.mark.requires("yb-vss")
def test_yellowbrick_lsh_search_update() -> None:
"""Test end to end construction and search."""
docsearches = [
_yellowbrick_vector_from_texts(),
_yellowbrick_vector_from_texts_no_schema(),
]
for docsearch in docsearches:
index_params = Yellowbrick.IndexParams(
Yellowbrick.IndexType.LSH, {"num_hyperplanes": 10, "hamming_distance": 0}
)
docsearch.drop_index(index_params)
docsearch.create_index(index_params)
output = docsearch.similarity_search("foo", k=1, index_params=index_params)
assert output == [Document(page_content="foo", metadata={})]
texts = ["oof"]
docsearch.add_texts(texts, index_params=index_params)
output = docsearch.similarity_search("oof", k=1, index_params=index_params)
assert output == [Document(page_content="oof", metadata={})]
docsearch.drop(table=YELLOWBRICK_TABLE, schema=docsearch._schema)
docsearch.drop(table=YELLOWBRICK_CONTENT, schema=docsearch._schema)
docsearch.drop_index(index_params=index_params)
@pytest.mark.requires("yb-vss")
def test_yellowbrick_lsh_delete() -> None:
"""Test end to end construction and search."""
docsearches = [
_yellowbrick_vector_from_texts(),
_yellowbrick_vector_from_texts_no_schema(),
]
for docsearch in docsearches:
index_params = Yellowbrick.IndexParams(
Yellowbrick.IndexType.LSH, {"num_hyperplanes": 10, "hamming_distance": 0}
)
docsearch.drop_index(index_params)
docsearch.create_index(index_params)
output = docsearch.similarity_search("foo", k=1, index_params=index_params)
assert output == [Document(page_content="foo", metadata={})]
texts = ["oof"]
added_docs = docsearch.add_texts(texts, index_params=index_params)
output = docsearch.similarity_search("oof", k=1, index_params=index_params)
assert output == [Document(page_content="oof", metadata={})]
docsearch.delete(added_docs)
output = docsearch.similarity_search("oof", k=1, index_params=index_params)
assert output != [Document(page_content="oof", metadata={})]
docsearch.drop(table=YELLOWBRICK_TABLE, schema=docsearch._schema)
docsearch.drop(table=YELLOWBRICK_CONTENT, schema=docsearch._schema)
docsearch.drop_index(index_params=index_params)
@pytest.mark.requires("yb-vss")
def test_yellowbrick_lsh_delete_all() -> None:
"""Test end to end construction and search."""
docsearches = [
_yellowbrick_vector_from_texts(),
_yellowbrick_vector_from_texts_no_schema(),
]
for docsearch in docsearches:
index_params = Yellowbrick.IndexParams(
Yellowbrick.IndexType.LSH, {"num_hyperplanes": 10, "hamming_distance": 0}
)
docsearch.drop_index(index_params)
docsearch.create_index(index_params)
output = docsearch.similarity_search("foo", k=1, index_params=index_params)
assert output == [Document(page_content="foo", metadata={})]
texts = ["oof"]
docsearch.add_texts(texts, index_params=index_params)
output = docsearch.similarity_search("oof", k=1, index_params=index_params)
assert output == [Document(page_content="oof", metadata={})]
docsearch.delete(delete_all=True)
output = docsearch.similarity_search("oof", k=1, index_params=index_params)
assert output != [Document(page_content="oof", metadata={})]
output = docsearch.similarity_search("foo", k=1, index_params=index_params)
assert output != [Document(page_content="foo", metadata={})]
docsearch.drop(table=YELLOWBRICK_TABLE, schema=docsearch._schema)
docsearch.drop(table=YELLOWBRICK_CONTENT, schema=docsearch._schema)
docsearch.drop_index(index_params=index_params)
@pytest.mark.requires("yb-vss")
def test_yellowbrick_with_score() -> None:
"""Test end to end construction and search with scores and IDs."""
texts = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = _yellowbrick_vector_from_texts(metadatas=metadatas)
output = docsearch.similarity_search_with_score("foo", k=3)
docs = [o[0] for o in output]
distances = [o[1] for o in output]
docsearch.drop(YELLOWBRICK_TABLE)
assert docs == [
Document(page_content="foo", metadata={"page": 0}),
Document(page_content="bar", metadata={"page": 1}),
Document(page_content="baz", metadata={"page": 2}),
docsearches = [
_yellowbrick_vector_from_texts(),
_yellowbrick_vector_from_texts_no_schema(),
]
assert distances[0] > distances[1] > distances[2]
for docsearch in docsearches:
texts = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = _yellowbrick_vector_from_texts(metadatas=metadatas)
output = docsearch.similarity_search_with_score("foo", k=3)
docs = [o[0] for o in output]
distances = [o[1] for o in output]
assert docs == [
Document(page_content="foo", metadata={"page": 0}),
Document(page_content="bar", metadata={"page": 1}),
Document(page_content="baz", metadata={"page": 2}),
]
assert distances[0] > distances[1] > distances[2]
@pytest.mark.requires("yb-vss")
def test_yellowbrick_add_extra() -> None:
"""Test end to end construction and MRR search."""
texts = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = _yellowbrick_vector_from_texts(metadatas=metadatas)
docsearch.add_texts(texts, metadatas)
output = docsearch.similarity_search("foo", k=10)
docsearch.drop(YELLOWBRICK_TABLE)
assert len(output) == 6
docsearches = [
_yellowbrick_vector_from_texts(),
_yellowbrick_vector_from_texts_no_schema(),
]
for docsearch in docsearches:
texts = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = _yellowbrick_vector_from_texts(metadatas=metadatas)
docsearch.add_texts(texts, metadatas)
output = docsearch.similarity_search("foo", k=10)
assert len(output) == 6

View File

@ -95,6 +95,7 @@ def test_compatible_vectorstore_documentation() -> None:
"VespaStore",
"VLite",
"Weaviate",
"Yellowbrick",
"ZepVectorStore",
"Zilliz",
"Lantern",