community[minor]: Add indexing via locality sensitive hashing to the Yellowbrick vector store (#20856)

- **Description:** Add LSH-based indexing to the Yellowbrick vector store module - **Twitter handle:** @markcusack --------- Co-authored-by: markcusack <markcusack@markcusacksmac.lan> Co-authored-by: markcusack <markcusack@Mark-Cusack-sMac.local> Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
2025-09-01 02:50:47 +00:00 · 2024-05-06 16:18:02 -04:00
parent a2fdabdad2
commit 060987d755
5 changed files with 1092 additions and 167 deletions
--- a/docs/docs/integrations/vectorstores/yellowbrick.ipynb
+++ b/docs/docs/integrations/vectorstores/yellowbrick.ipynb
@@ -98,7 +98,7 @@
    "import psycopg2\n",
    "from IPython.display import Markdown, display\n",
    "from langchain.chains import LLMChain, RetrievalQAWithSourcesChain\n",
-    "from langchain_community.docstore.document import Document\n",
+    "from langchain.schema import Document\n",
    "from langchain_community.vectorstores import Yellowbrick\n",
    "from langchain_openai import ChatOpenAI, OpenAIEmbeddings\n",
    "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
@@ -209,14 +209,12 @@
    "\n",
    "# Define the SQL statement to create a table\n",
    "create_table_query = f\"\"\"\n",
-    "CREATE TABLE if not exists {embedding_table} (\n",
-    "    id uuid,\n",
-    "    embedding_id integer,\n",
-    "    text character varying(60000),\n",
-    "    metadata character varying(1024),\n",
-    "    embedding double precision\n",
+    "CREATE TABLE IF NOT EXISTS {embedding_table} (\n",
+    "    doc_id uuid NOT NULL,\n",
+    "    embedding_id smallint NOT NULL,\n",
+    "    embedding double precision NOT NULL\n",
    ")\n",
-    "DISTRIBUTE ON (id);\n",
+    "DISTRIBUTE ON (doc_id);\n",
    "truncate table {embedding_table};\n",
    "\"\"\"\n",
    "\n",
@@ -257,6 +255,8 @@
    "    f\"postgres://{urlparse.quote(YBUSER)}:{YBPASSWORD}@{YBHOST}:5432/{YB_DOC_DATABASE}\"\n",
    ")\n",
    "\n",
+    "print(yellowbrick_doc_connection_string)\n",
+    "\n",
    "# Establish a connection to the Yellowbrick database\n",
    "conn = psycopg2.connect(yellowbrick_doc_connection_string)\n",
    "\n",
@@ -324,7 +324,7 @@
    "vector_store = Yellowbrick.from_documents(\n",
    "    documents=split_docs,\n",
    "    embedding=embeddings,\n",
-    "    connection_string=yellowbrick_connection_string,\n",
+    "    connection_info=yellowbrick_connection_string,\n",
    "    table=embedding_table,\n",
    ")\n",
    "\n",
@@ -403,6 +403,88 @@
    "print_result_sources(\"Whats an easy way to add users in bulk to Yellowbrick?\")"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "1f39fd30",
+   "metadata": {},
+   "source": [
+    "## Part 6: Introducing an Index to Increase Performance\n",
+    "\n",
+    "Yellowbrick also supports indexing using the Locality-Sensitive Hashing (LSH) approach. This is an approximate nearest-neighbor search technique that lets you trade some accuracy for faster similarity search. The index introduces two new tunable parameters:\n",
+    "\n",
+    "- The number of hyperplanes, which is provided as the `num_hyperplanes` entry of the `Yellowbrick.IndexParams` passed to `create_index`. The more documents, the more hyperplanes are needed. LSH is a form of dimensionality reduction. The original embeddings are transformed into lower dimensional vectors where the number of components is the same as the number of hyperplanes.\n",
+    "- The Hamming distance, an integer representing the breadth of the search. Smaller Hamming distances result in faster retrieval but lower accuracy.\n",
+    "\n",
+    "Here's how you can create an index on the embeddings we loaded into Yellowbrick. We'll also re-run the previous chat session, but this time the retrieval will use the index. Note that for such a small number of documents, you won't see the benefit of indexing in terms of performance."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "02ba61c4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "system_template = \"\"\"Use the following pieces of context to answer the users question.\n",
+    "Take note of the sources and include them in the answer in the format: \"SOURCES: source1 source2\", use \"SOURCES\" in capital letters regardless of the number of sources.\n",
+    "If you don't know the answer, just say that \"I don't know\", don't try to make up an answer.\n",
+    "----------------\n",
+    "{summaries}\"\"\"\n",
+    "messages = [\n",
+    "    SystemMessagePromptTemplate.from_template(system_template),\n",
+    "    HumanMessagePromptTemplate.from_template(\"{question}\"),\n",
+    "]\n",
+    "prompt = ChatPromptTemplate.from_messages(messages)\n",
+    "\n",
+    "vector_store = Yellowbrick(\n",
+    "    OpenAIEmbeddings(),\n",
+    "    yellowbrick_connection_string,\n",
+    "    embedding_table,  # Change the table name to reflect your embeddings\n",
+    ")\n",
+    "\n",
+    "lsh_params = Yellowbrick.IndexParams(\n",
+    "    Yellowbrick.IndexType.LSH, {\"num_hyperplanes\": 8, \"hamming_distance\": 2}\n",
+    ")\n",
+    "vector_store.create_index(lsh_params)\n",
+    "\n",
+    "chain_type_kwargs = {\"prompt\": prompt}\n",
+    "llm = ChatOpenAI(\n",
+    "    model_name=\"gpt-3.5-turbo\",  # Modify model_name if you have access to GPT-4\n",
+    "    temperature=0,\n",
+    "    max_tokens=256,\n",
+    ")\n",
+    "chain = RetrievalQAWithSourcesChain.from_chain_type(\n",
+    "    llm=llm,\n",
+    "    chain_type=\"stuff\",\n",
+    "    retriever=vector_store.as_retriever(\n",
+    "        k=5, search_kwargs={\"index_params\": lsh_params}\n",
+    "    ),\n",
+    "    return_source_documents=True,\n",
+    "    chain_type_kwargs=chain_type_kwargs,\n",
+    ")\n",
+    "\n",
+    "\n",
+    "def print_result_sources(query):\n",
+    "    result = chain(query)\n",
+    "    output_text = f\"\"\"### Question: \n",
+    "  {query}\n",
+    "  ### Answer: \n",
+    "  {result['answer']}\n",
+    "  ### Sources: \n",
+    "  {result['sources']}\n",
+    "  ### All relevant sources:\n",
+    "  {', '.join(list(set([doc.metadata['source'] for doc in result['source_documents']])))}\n",
+    "    \"\"\"\n",
+    "    display(Markdown(output_text))\n",
+    "\n",
+    "\n",
+    "# Use the chain to query\n",
+    "\n",
+    "print_result_sources(\"How many databases can be in a Yellowbrick Instance?\")\n",
+    "\n",
+    "print_result_sources(\"Whats an easy way to add users in bulk to Yellowbrick?\")"
+   ]
+  },
  {
   "cell_type": "markdown",
   "id": "697c8a38",
@@ -418,9 +500,9 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "langchain_venv",
+   "display_name": "Python 3",
   "language": "python",
-   "name": "langchain_venv"
+   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
--- a/docs/docs/modules/data_connection/indexing.ipynb
+++ b/docs/docs/modules/data_connection/indexing.ipynb
@@ -60,7 +60,7 @@
    "   * document addition by id (`add_documents` method with `ids` argument)\n",
    "   * delete by id (`delete` method with `ids` argument)\n",
    "\n",
-    "Compatible Vectorstores: `AnalyticDB`, `AstraDB`, `AzureCosmosDBVectorSearch`, `AzureSearch`, `AwaDB`, `Bagel`, `Cassandra`, `Chroma`, `CouchbaseVectorStore`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `LanceDB`, `Milvus`, `MyScale`, `OpenSearchVectorSearch`, `PGVector`, `Pinecone`, `Qdrant`, `Redis`, `Rockset`, `ScaNN`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `UpstashVectorStore`, `Vald`, `VDMS`, `Vearch`, `VespaStore`, `Weaviate`, `ZepVectorStore`, `TencentVectorDB`, `OpenSearchVectorSearch`.\n",
+    "Compatible Vectorstores: `AnalyticDB`, `AstraDB`, `AzureCosmosDBVectorSearch`, `AzureSearch`, `AwaDB`, `Bagel`, `Cassandra`, `Chroma`, `CouchbaseVectorStore`, `DashVector`, `DatabricksVectorSearch`, `DeepLake`, `Dingo`, `ElasticVectorSearch`, `ElasticsearchStore`, `FAISS`, `HanaDB`, `LanceDB`, `Milvus`, `MyScale`, `OpenSearchVectorSearch`, `PGVector`, `Pinecone`, `Qdrant`, `Redis`, `Rockset`, `ScaNN`, `SupabaseVectorStore`, `SurrealDBStore`, `TimescaleVector`, `UpstashVectorStore`, `Vald`, `VDMS`, `Vearch`, `VespaStore`, `Weaviate`, `ZepVectorStore`, `TencentVectorDB`, `OpenSearchVectorSearch`, `Yellowbrick`.\n",
    "  \n",
    "## Caution\n",
    "\n",
--- a/libs/community/langchain_community/vectorstores/yellowbrick.py
+++ b/libs/community/langchain_community/vectorstores/yellowbrick.py
--- a/libs/community/tests/integration_tests/vectorstores/test_yellowbrick.py
+++ b/libs/community/tests/integration_tests/vectorstores/test_yellowbrick.py
@@ -1,3 +1,4 @@
+import logging
 from typing import List, Optional

 import pytest
@@ -5,60 +6,256 @@ import pytest
 from langchain_community.docstore.document import Document
 from langchain_community.vectorstores import Yellowbrick
 from tests.integration_tests.vectorstores.fake_embeddings import (
-    FakeEmbeddings,
+    ConsistentFakeEmbeddings,
    fake_texts,
 )

 YELLOWBRICK_URL = "postgres://username:password@host:port/database"
 YELLOWBRICK_TABLE = "test_table"
+YELLOWBRICK_CONTENT = "test_table_content"
+YELLOWBRICK_SCHEMA = "test_schema"


 def _yellowbrick_vector_from_texts(
    metadatas: Optional[List[dict]] = None, drop: bool = True
 ) -> Yellowbrick:
-    return Yellowbrick.from_texts(
+    db = Yellowbrick.from_texts(
        fake_texts,
-        FakeEmbeddings(),
+        ConsistentFakeEmbeddings(),
        metadatas,
        YELLOWBRICK_URL,
-        YELLOWBRICK_TABLE,
+        table=YELLOWBRICK_TABLE,
+        schema=YELLOWBRICK_SCHEMA,
+        drop=drop,
    )
+    db.logger.setLevel(logging.DEBUG)
+    return db
+
+
+def _yellowbrick_vector_from_texts_no_schema(
+    metadatas: Optional[List[dict]] = None, drop: bool = True
+) -> Yellowbrick:
+    db = Yellowbrick.from_texts(
+        fake_texts,
+        ConsistentFakeEmbeddings(),
+        metadatas,
+        YELLOWBRICK_URL,
+        table=YELLOWBRICK_TABLE,
+        drop=drop,
+    )
+    db.logger.setLevel(logging.DEBUG)
+    return db


@pytest.mark.requires("yb-vss")
 def test_yellowbrick() -> None:
    """Test end to end construction and search."""
-    docsearch = _yellowbrick_vector_from_texts()
-    output = docsearch.similarity_search("foo", k=1)
-    docsearch.drop(YELLOWBRICK_TABLE)
-    assert output == [Document(page_content="foo", metadata={})]
+    docsearches = [
+        _yellowbrick_vector_from_texts(),
+        _yellowbrick_vector_from_texts_no_schema(),
+    ]
+    for docsearch in docsearches:
+        output = docsearch.similarity_search("foo", k=1)
+        assert output == [Document(page_content="foo", metadata={})]
+        docsearch.drop(table=YELLOWBRICK_TABLE, schema=docsearch._schema)
+        docsearch.drop(table=YELLOWBRICK_CONTENT, schema=docsearch._schema)
+
+
+@pytest.mark.requires("yb-vss")
+def test_yellowbrick_add_text() -> None:
+    """Test end to end construction and search."""
+    docsearches = [
+        _yellowbrick_vector_from_texts(),
+        _yellowbrick_vector_from_texts_no_schema(),
+    ]
+    for docsearch in docsearches:
+        output = docsearch.similarity_search("foo", k=1)
+        assert output == [Document(page_content="foo", metadata={})]
+        texts = ["oof"]
+        docsearch.add_texts(texts)
+        output = docsearch.similarity_search("oof", k=1)
+        assert output == [Document(page_content="oof", metadata={})]
+        docsearch.drop(table=YELLOWBRICK_TABLE, schema=docsearch._schema)
+        docsearch.drop(table=YELLOWBRICK_CONTENT, schema=docsearch._schema)
+
+
+@pytest.mark.requires("yb-vss")
+def test_yellowbrick_delete() -> None:
+    """Test end to end construction and search."""
+    docsearches = [
+        _yellowbrick_vector_from_texts(),
+        _yellowbrick_vector_from_texts_no_schema(),
+    ]
+    for docsearch in docsearches:
+        output = docsearch.similarity_search("foo", k=1)
+        assert output == [Document(page_content="foo", metadata={})]
+        texts = ["oof"]
+        added_docs = docsearch.add_texts(texts)
+        output = docsearch.similarity_search("oof", k=1)
+        assert output == [Document(page_content="oof", metadata={})]
+        docsearch.delete(added_docs)
+        output = docsearch.similarity_search("oof", k=1)
+        assert output != [Document(page_content="oof", metadata={})]
+        docsearch.drop(table=YELLOWBRICK_TABLE, schema=docsearch._schema)
+        docsearch.drop(table=YELLOWBRICK_CONTENT, schema=docsearch._schema)
+
+
+@pytest.mark.requires("yb-vss")
+def test_yellowbrick_delete_all() -> None:
+    """Test end to end construction and search."""
+    docsearches = [
+        _yellowbrick_vector_from_texts(),
+        _yellowbrick_vector_from_texts_no_schema(),
+    ]
+    for docsearch in docsearches:
+        output = docsearch.similarity_search("foo", k=1)
+        assert output == [Document(page_content="foo", metadata={})]
+        texts = ["oof"]
+        docsearch.add_texts(texts)
+        output = docsearch.similarity_search("oof", k=1)
+        assert output == [Document(page_content="oof", metadata={})]
+        docsearch.delete(delete_all=True)
+        output = docsearch.similarity_search("oof", k=1)
+        assert output != [Document(page_content="oof", metadata={})]
+        output = docsearch.similarity_search("foo", k=1)
+        assert output != [Document(page_content="foo", metadata={})]
+        docsearch.drop(table=YELLOWBRICK_TABLE, schema=docsearch._schema)
+        docsearch.drop(table=YELLOWBRICK_CONTENT, schema=docsearch._schema)
+
+
+@pytest.mark.requires("yb-vss")
+def test_yellowbrick_lsh_search() -> None:
+    """Test end to end construction and search."""
+    docsearches = [
+        _yellowbrick_vector_from_texts(),
+        _yellowbrick_vector_from_texts_no_schema(),
+    ]
+    for docsearch in docsearches:
+        index_params = Yellowbrick.IndexParams(
+            Yellowbrick.IndexType.LSH, {"num_hyperplanes": 10, "hamming_distance": 0}
+        )
+        docsearch.drop_index(index_params)
+        docsearch.create_index(index_params)
+        output = docsearch.similarity_search("foo", k=1, index_params=index_params)
+        assert output == [Document(page_content="foo", metadata={})]
+        docsearch.drop(table=YELLOWBRICK_TABLE, schema=docsearch._schema)
+        docsearch.drop(table=YELLOWBRICK_CONTENT, schema=docsearch._schema)
+        docsearch.drop_index(index_params=index_params)
+
+
+@pytest.mark.requires("yb-vss")
+def test_yellowbrick_lsh_search_update() -> None:
+    """Test end to end construction and search."""
+    docsearches = [
+        _yellowbrick_vector_from_texts(),
+        _yellowbrick_vector_from_texts_no_schema(),
+    ]
+    for docsearch in docsearches:
+        index_params = Yellowbrick.IndexParams(
+            Yellowbrick.IndexType.LSH, {"num_hyperplanes": 10, "hamming_distance": 0}
+        )
+        docsearch.drop_index(index_params)
+        docsearch.create_index(index_params)
+        output = docsearch.similarity_search("foo", k=1, index_params=index_params)
+        assert output == [Document(page_content="foo", metadata={})]
+        texts = ["oof"]
+        docsearch.add_texts(texts, index_params=index_params)
+        output = docsearch.similarity_search("oof", k=1, index_params=index_params)
+        assert output == [Document(page_content="oof", metadata={})]
+        docsearch.drop(table=YELLOWBRICK_TABLE, schema=docsearch._schema)
+        docsearch.drop(table=YELLOWBRICK_CONTENT, schema=docsearch._schema)
+        docsearch.drop_index(index_params=index_params)
+
+
+@pytest.mark.requires("yb-vss")
+def test_yellowbrick_lsh_delete() -> None:
+    """Test end to end construction and search."""
+    docsearches = [
+        _yellowbrick_vector_from_texts(),
+        _yellowbrick_vector_from_texts_no_schema(),
+    ]
+    for docsearch in docsearches:
+        index_params = Yellowbrick.IndexParams(
+            Yellowbrick.IndexType.LSH, {"num_hyperplanes": 10, "hamming_distance": 0}
+        )
+        docsearch.drop_index(index_params)
+        docsearch.create_index(index_params)
+        output = docsearch.similarity_search("foo", k=1, index_params=index_params)
+        assert output == [Document(page_content="foo", metadata={})]
+        texts = ["oof"]
+        added_docs = docsearch.add_texts(texts, index_params=index_params)
+        output = docsearch.similarity_search("oof", k=1, index_params=index_params)
+        assert output == [Document(page_content="oof", metadata={})]
+        docsearch.delete(added_docs)
+        output = docsearch.similarity_search("oof", k=1, index_params=index_params)
+        assert output != [Document(page_content="oof", metadata={})]
+        docsearch.drop(table=YELLOWBRICK_TABLE, schema=docsearch._schema)
+        docsearch.drop(table=YELLOWBRICK_CONTENT, schema=docsearch._schema)
+        docsearch.drop_index(index_params=index_params)
+
+
+@pytest.mark.requires("yb-vss")
+def test_yellowbrick_lsh_delete_all() -> None:
+    """Test end to end construction and search."""
+    docsearches = [
+        _yellowbrick_vector_from_texts(),
+        _yellowbrick_vector_from_texts_no_schema(),
+    ]
+    for docsearch in docsearches:
+        index_params = Yellowbrick.IndexParams(
+            Yellowbrick.IndexType.LSH, {"num_hyperplanes": 10, "hamming_distance": 0}
+        )
+        docsearch.drop_index(index_params)
+        docsearch.create_index(index_params)
+        output = docsearch.similarity_search("foo", k=1, index_params=index_params)
+        assert output == [Document(page_content="foo", metadata={})]
+        texts = ["oof"]
+        docsearch.add_texts(texts, index_params=index_params)
+        output = docsearch.similarity_search("oof", k=1, index_params=index_params)
+        assert output == [Document(page_content="oof", metadata={})]
+        docsearch.delete(delete_all=True)
+        output = docsearch.similarity_search("oof", k=1, index_params=index_params)
+        assert output != [Document(page_content="oof", metadata={})]
+        output = docsearch.similarity_search("foo", k=1, index_params=index_params)
+        assert output != [Document(page_content="foo", metadata={})]
+        docsearch.drop(table=YELLOWBRICK_TABLE, schema=docsearch._schema)
+        docsearch.drop(table=YELLOWBRICK_CONTENT, schema=docsearch._schema)
+        docsearch.drop_index(index_params=index_params)


@pytest.mark.requires("yb-vss")
 def test_yellowbrick_with_score() -> None:
    """Test end to end construction and search with scores and IDs."""
-    texts = ["foo", "bar", "baz"]
-    metadatas = [{"page": i} for i in range(len(texts))]
-    docsearch = _yellowbrick_vector_from_texts(metadatas=metadatas)
-    output = docsearch.similarity_search_with_score("foo", k=3)
-    docs = [o[0] for o in output]
-    distances = [o[1] for o in output]
-    docsearch.drop(YELLOWBRICK_TABLE)
-    assert docs == [
-        Document(page_content="foo", metadata={"page": 0}),
-        Document(page_content="bar", metadata={"page": 1}),
-        Document(page_content="baz", metadata={"page": 2}),
+    docsearches = [
+        _yellowbrick_vector_from_texts(),
+        _yellowbrick_vector_from_texts_no_schema(),
    ]
-    assert distances[0] > distances[1] > distances[2]
+    for docsearch in docsearches:
+        texts = ["foo", "bar", "baz"]
+        metadatas = [{"page": i} for i in range(len(texts))]
+        docsearch = _yellowbrick_vector_from_texts(metadatas=metadatas)
+        output = docsearch.similarity_search_with_score("foo", k=3)
+        docs = [o[0] for o in output]
+        distances = [o[1] for o in output]
+        assert docs == [
+            Document(page_content="foo", metadata={"page": 0}),
+            Document(page_content="bar", metadata={"page": 1}),
+            Document(page_content="baz", metadata={"page": 2}),
+        ]
+        assert distances[0] > distances[1] > distances[2]


@pytest.mark.requires("yb-vss")
 def test_yellowbrick_add_extra() -> None:
    """Test end to end construction and MRR search."""
-    texts = ["foo", "bar", "baz"]
-    metadatas = [{"page": i} for i in range(len(texts))]
-    docsearch = _yellowbrick_vector_from_texts(metadatas=metadatas)
-    docsearch.add_texts(texts, metadatas)
-    output = docsearch.similarity_search("foo", k=10)
-    docsearch.drop(YELLOWBRICK_TABLE)
-    assert len(output) == 6
+    docsearches = [
+        _yellowbrick_vector_from_texts(),
+        _yellowbrick_vector_from_texts_no_schema(),
+    ]
+    for docsearch in docsearches:
+        texts = ["foo", "bar", "baz"]
+        metadatas = [{"page": i} for i in range(len(texts))]
+        docsearch = _yellowbrick_vector_from_texts(metadatas=metadatas)
+        docsearch.add_texts(texts, metadatas)
+        output = docsearch.similarity_search("foo", k=10)
+        assert len(output) == 6
--- a/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py
+++ b/libs/community/tests/unit_tests/vectorstores/test_indexing_docs.py
@@ -95,6 +95,7 @@ def test_compatible_vectorstore_documentation() -> None:
        "VespaStore",
        "VLite",
        "Weaviate",
+        "Yellowbrick",
        "ZepVectorStore",
        "Zilliz",
        "Lantern",