diff --git a/docs/docs/integrations/vectorstores/google_vertex_ai_vector_search.ipynb b/docs/docs/integrations/vectorstores/google_vertex_ai_vector_search.ipynb index 6f29773acd4..2e6e91dc451 100644 --- a/docs/docs/integrations/vectorstores/google_vertex_ai_vector_search.ipynb +++ b/docs/docs/integrations/vectorstores/google_vertex_ai_vector_search.ipynb @@ -70,7 +70,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "dfa92a08", "metadata": {}, "outputs": [], @@ -91,12 +91,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "c795913e", "metadata": {}, "outputs": [], "source": [ - "embedding_model = VertexAIEmbeddings(model_name=\"textembedding-gecko@003\")" + "embedding_model = VertexAIEmbeddings(model_name=\"text-embedding-005\")" ] }, { @@ -722,7 +722,139 @@ "cell_type": "markdown", "id": "31222b03", "metadata": {}, - "source": [] + "source": [ + "## Hybrid Search" + ] + }, + { + "cell_type": "markdown", + "id": "b8a308f2", + "metadata": {}, + "source": [ + "Vector Search supports hybrid search, a popular architecture pattern in information retrieval (IR) that combines both semantic search and keyword search (also called token-based search). With hybrid search, developers can take advantage of the best of the two approaches, effectively providing higher search quality.\n", + "Click [here](https://cloud.google.com/vertex-ai/docs/vector-search/about-hybrid-search) to learn more.\n", + "\n", + "In order to use hybrid search, we need to fit a sparse embedding vectorizer and handle the embeddings outside of the Vector Search integration.\n", + "An example of a sparse embedding vectorizer is scikit-learn's `TfidfVectorizer`, but other techniques, such as BM25, can be used as well." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "e319402d", + "metadata": {}, + "outputs": [], + "source": [ + "# Define some sample data\n", + "texts = [\n", + " \"The cat sat on\",\n", + " \"the mat.\",\n", + " \"I like to\",\n", + " \"eat pizza for\",\n", + " \"dinner.\",\n", + " \"The sun sets\",\n", + " \"in the west.\",\n", + "]\n", + "\n", + "# optional IDs\n", + "ids = [\"i_\" + str(i + 1) for i in range(len(texts))]\n", + "\n", + "# optional metadata\n", + "metadatas = [{\"my_metadata\": i} for i in range(len(texts))]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14efefc1", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "\n", + "# Fit the TFIDF Vectorizer (This is usually done on a very large corpus of data to make sure that word statistics generalize well on new data)\n", + "vectorizer = TfidfVectorizer()\n", + "vectorizer.fit(texts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c7206c2", + "metadata": {}, + "outputs": [], + "source": [ + "# Helper that converts a single text into a TF-IDF sparse vector\n", + "def get_sparse_embedding(tfidf_vectorizer, text):\n", + "    \"\"\"Return the TF-IDF entries of `text` as a dict of float values and int dimensions.\"\"\"\n", + "    # transform() is called with a one-element list holding `text`\n", + "    sparse_row = tfidf_vectorizer.transform([text])\n", + "    return {\n", + "        \"values\": [float(weight) for weight in sparse_row.data],\n", + "        \"dimensions\": [int(index) for index in sparse_row.indices],\n", + "    }" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "0dc5b782", + "metadata": {}, + "outputs": [], + "source": [ + "# semantic (dense) embeddings\n", + "embeddings = embedding_model.embed_documents(texts)\n", + "# tfidf (sparse) embeddings\n", + "sparse_embeddings = [get_sparse_embedding(vectorizer, x) for x in texts]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a353679", + "metadata": {}, + "outputs": [], + 
"source": [ + "sparse_embeddings[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2623cad9", + "metadata": {}, + "outputs": [], + "source": [ + "# Add the texts with their dense and sparse embeddings to Vector Search\n", + "\n", + "vector_store.add_texts_with_embeddings(\n", + " texts=texts,\n", + " embeddings=embeddings,\n", + " sparse_embeddings=sparse_embeddings,\n", + " ids=ids,\n", + " metadatas=metadatas,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29885e38", + "metadata": {}, + "outputs": [], + "source": [ + "# Run hybrid search: embed the query with both the dense model and the TF-IDF vectorizer\n", + "query = \"the cat\"\n", + "embedding = embedding_model.embed_query(query)\n", + "sparse_embedding = get_sparse_embedding(vectorizer, query)\n", + "\n", + "vector_store.similarity_search_by_vector_with_score(\n", + " embedding=embedding,\n", + " sparse_embedding=sparse_embedding,\n", + " k=5,\n", + " rrf_ranking_alpha=0.7, # 0.7 weight to dense and 0.3 weight to sparse\n", + ")" + ] } ], "metadata": { @@ -733,7 +865,7 @@ "uri": "gcr.io/deeplearning-platform-release/base-cpu:m107" }, "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "langchain-google-community-3Os9yvMd-py3.10", "language": "python", "name": "python3" }, @@ -747,7 +879,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.6" + "version": "3.10.14" } }, "nbformat": 4,