mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-25 16:13:25 +00:00
Add Google Vertex AI Vector Search Hybrid Search Documentation (#29064)
Add examples in the documentation to use hybrid search in Vertex AI [Vector Search](https://github.com/langchain-ai/langchain-google/pull/628)
This commit is contained in:
parent
0d226de25c
commit
a49448a7c9
@ -70,7 +70,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 2,
|
||||
"id": "dfa92a08",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -91,12 +91,12 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 2,
|
||||
"id": "c795913e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"embedding_model = VertexAIEmbeddings(model_name=\"textembedding-gecko@003\")"
|
||||
"embedding_model = VertexAIEmbeddings(model_name=\"text-embedding-005\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -722,7 +722,139 @@
|
||||
"cell_type": "markdown",
|
||||
"id": "31222b03",
|
||||
"metadata": {},
|
||||
"source": []
|
||||
"source": [
|
||||
"## Hybrid Search"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b8a308f2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Vector Search supports hybrid search, a popular architecture pattern in information retrieval (IR) that combines both semantic search and keyword search (also called token-based search). With hybrid search, developers can take advantage of the best of the two approaches, effectively providing higher search quality.\n",
|
||||
"Click [here](https://cloud.google.com/vertex-ai/docs/vector-search/about-hybrid-search) to learn more.\n",
|
||||
"\n",
|
||||
"In order to use hybrid search, we need to fit a sparse embedding vectorizer and handle the embeddings outside of the Vector Search integration.\n",
|
||||
"An example of a sparse embedding vectorizer is scikit-learn's TfidfVectorizer, but other techniques can be used, for instance BM25."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "e319402d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Define some sample data\n",
|
||||
"texts = [\n",
|
||||
" \"The cat sat on\",\n",
|
||||
" \"the mat.\",\n",
|
||||
" \"I like to\",\n",
|
||||
" \"eat pizza for\",\n",
|
||||
" \"dinner.\",\n",
|
||||
" \"The sun sets\",\n",
|
||||
" \"in the west.\",\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"# optional IDs\n",
|
||||
"ids = [\"i_\" + str(i + 1) for i in range(len(texts))]\n",
|
||||
"\n",
|
||||
"# optional metadata\n",
|
||||
"metadatas = [{\"my_metadata\": i} for i in range(len(texts))]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "14efefc1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||
"\n",
|
||||
"# Fit the TF-IDF vectorizer (this is usually done on a very large corpus so that word statistics generalize well to new data)\n",
|
||||
"vectorizer = TfidfVectorizer()\n",
|
||||
"vectorizer.fit(texts)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2c7206c2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Utility function to transform text into a TF-IDF Sparse Vector\n",
|
||||
"def get_sparse_embedding(tfidf_vectorizer, text):\n",
|
||||
" tfidf_vector = tfidf_vectorizer.transform([text])\n",
|
||||
" values = []\n",
|
||||
" dims = []\n",
|
||||
" for i, tfidf_value in enumerate(tfidf_vector.data):\n",
|
||||
" values.append(float(tfidf_value))\n",
|
||||
" dims.append(int(tfidf_vector.indices[i]))\n",
|
||||
" return {\"values\": values, \"dimensions\": dims}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"id": "0dc5b782",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# semantic (dense) embeddings\n",
|
||||
"embeddings = embedding_model.embed_documents(texts)\n",
|
||||
"# tfidf (sparse) embeddings\n",
|
||||
"sparse_embeddings = [get_sparse_embedding(vectorizer, x) for x in texts]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3a353679",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sparse_embeddings[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2623cad9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Add the dense and sparse embeddings in Vector Search\n",
|
||||
"\n",
|
||||
"vector_store.add_texts_with_embeddings(\n",
|
||||
" texts=texts,\n",
|
||||
" embeddings=embeddings,\n",
|
||||
" sparse_embeddings=sparse_embeddings,\n",
|
||||
" ids=ids,\n",
|
||||
" metadatas=metadatas,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "29885e38",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Run hybrid search\n",
|
||||
"query = \"the cat\"\n",
|
||||
"embedding = embedding_model.embed_query(query)\n",
|
||||
"sparse_embedding = get_sparse_embedding(vectorizer, query)\n",
|
||||
"\n",
|
||||
"vector_store.similarity_search_by_vector_with_score(\n",
|
||||
" embedding=embedding,\n",
|
||||
" sparse_embedding=sparse_embedding,\n",
|
||||
" k=5,\n",
|
||||
" rrf_ranking_alpha=0.7, # 0.7 weight to dense and 0.3 weight to sparse\n",
|
||||
")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@ -733,7 +865,7 @@
|
||||
"uri": "gcr.io/deeplearning-platform-release/base-cpu:m107"
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"display_name": "langchain-google-community-3Os9yvMd-py3.10",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
@ -747,7 +879,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.6"
|
||||
"version": "3.10.14"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
Loading…
Reference in New Issue
Block a user