mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-12 06:13:36 +00:00
Add Google Vertex AI Vector Search Hybrid Search Documentation (#29064)
Add examples in the documentation to use hybrid search in Vertex AI [Vector Search](https://github.com/langchain-ai/langchain-google/pull/628)
This commit is contained in:
parent
0d226de25c
commit
a49448a7c9
@ -70,7 +70,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 2,
|
||||||
"id": "dfa92a08",
|
"id": "dfa92a08",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -91,12 +91,12 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 2,
|
||||||
"id": "c795913e",
|
"id": "c795913e",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"embedding_model = VertexAIEmbeddings(model_name=\"textembedding-gecko@003\")"
|
"embedding_model = VertexAIEmbeddings(model_name=\"text-embedding-005\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -722,7 +722,139 @@
|
|||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "31222b03",
|
"id": "31222b03",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": []
|
"source": [
|
||||||
|
"## Hybrid Search"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "b8a308f2",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Vector Search supports hybrid search, a popular architecture pattern in information retrieval (IR) that combines both semantic search and keyword search (also called token-based search). With hybrid search, developers can take advantage of the best of the two approaches, effectively providing higher search quality.\n",
|
||||||
|
"See [About hybrid search](https://cloud.google.com/vertex-ai/docs/vector-search/about-hybrid-search) to learn more.\n",
|
||||||
|
"\n",
|
||||||
|
"In order to use hybrid search, we need to fit a sparse embedding vectorizer and handle the embeddings outside of the Vector Search integration.\n",
|
||||||
|
"One example of a sparse embedding vectorizer is scikit-learn's `TfidfVectorizer`, but other techniques, such as BM25, can be used as well."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 16,
|
||||||
|
"id": "e319402d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Define some sample data\n",
|
||||||
|
"texts = [\n",
|
||||||
|
" \"The cat sat on\",\n",
|
||||||
|
" \"the mat.\",\n",
|
||||||
|
" \"I like to\",\n",
|
||||||
|
" \"eat pizza for\",\n",
|
||||||
|
" \"dinner.\",\n",
|
||||||
|
" \"The sun sets\",\n",
|
||||||
|
" \"in the west.\",\n",
|
||||||
|
"]\n",
|
||||||
|
"\n",
|
||||||
|
"# optional IDs\n",
|
||||||
|
"ids = [\"i_\" + str(i + 1) for i in range(len(texts))]\n",
|
||||||
|
"\n",
|
||||||
|
"# optional metadata\n",
|
||||||
|
"metadatas = [{\"my_metadata\": i} for i in range(len(texts))]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "14efefc1",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||||
|
"\n",
|
||||||
|
"# Fit the TF-IDF vectorizer (this is usually done on a very large corpus so that the word statistics generalize well to new data)\n",
|
||||||
|
"vectorizer = TfidfVectorizer()\n",
|
||||||
|
"vectorizer.fit(texts)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "2c7206c2",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Utility function to transform text into a TF-IDF Sparse Vector\n",
|
||||||
|
"def get_sparse_embedding(tfidf_vectorizer, text):\n",
|
||||||
|
"    tfidf_vector = tfidf_vectorizer.transform([text])\n",
|
||||||
|
"    values = []\n",
|
||||||
|
"    dims = []\n",
|
||||||
|
"    for i, tfidf_value in enumerate(tfidf_vector.data):\n",
|
||||||
|
"        values.append(float(tfidf_value))\n",
|
||||||
|
"        dims.append(int(tfidf_vector.indices[i]))\n",
|
||||||
|
"    return {\"values\": values, \"dimensions\": dims}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 19,
|
||||||
|
"id": "0dc5b782",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# semantic (dense) embeddings\n",
|
||||||
|
"embeddings = embedding_model.embed_documents(texts)\n",
|
||||||
|
"# tfidf (sparse) embeddings\n",
|
||||||
|
"sparse_embeddings = [get_sparse_embedding(vectorizer, x) for x in texts]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "3a353679",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"sparse_embeddings[0]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "2623cad9",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Add the dense and sparse embeddings in Vector Search\n",
|
||||||
|
"\n",
|
||||||
|
"vector_store.add_texts_with_embeddings(\n",
|
||||||
|
" texts=texts,\n",
|
||||||
|
" embeddings=embeddings,\n",
|
||||||
|
" sparse_embeddings=sparse_embeddings,\n",
|
||||||
|
" ids=ids,\n",
|
||||||
|
" metadatas=metadatas,\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "29885e38",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Run hybrid search\n",
|
||||||
|
"query = \"the cat\"\n",
|
||||||
|
"embedding = embedding_model.embed_query(query)\n",
|
||||||
|
"sparse_embedding = get_sparse_embedding(vectorizer, query)\n",
|
||||||
|
"\n",
|
||||||
|
"vector_store.similarity_search_by_vector_with_score(\n",
|
||||||
|
" embedding=embedding,\n",
|
||||||
|
" sparse_embedding=sparse_embedding,\n",
|
||||||
|
" k=5,\n",
|
||||||
|
" rrf_ranking_alpha=0.7, # 0.7 weight to dense and 0.3 weight to sparse\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
@ -733,7 +865,7 @@
|
|||||||
"uri": "gcr.io/deeplearning-platform-release/base-cpu:m107"
|
"uri": "gcr.io/deeplearning-platform-release/base-cpu:m107"
|
||||||
},
|
},
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3 (ipykernel)",
|
"display_name": "langchain-google-community-3Os9yvMd-py3.10",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
@ -747,7 +879,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.11.6"
|
"version": "3.10.14"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
Loading…
Reference in New Issue
Block a user