From 0b79fc17338e00b29f83ded9ab00422545e09e01 Mon Sep 17 00:00:00 2001
From: Simonas Jakubonis <20096648+simjak@users.noreply.github.com>
Date: Fri, 2 May 2025 01:08:10 +0300
Subject: [PATCH] docs: Pinecone Sparse vectorstore example (#31066)

Description: Pinecone SparseVectorStore example
cc @jamescalam

---------

Co-authored-by: James Briggs <35938317+jamescalam@users.noreply.github.com>
Co-authored-by: ccurme <chester.curme@gmail.com>
---
 docs/docs/integrations/providers/pinecone.mdx |   9 +
 .../integrations/vectorstores/pinecone.ipynb  |   2 +-
 .../vectorstores/pinecone_sparse.ipynb        | 576 ++++++++++++++++++
 3 files changed, 586 insertions(+), 1 deletion(-)
 create mode 100644 docs/docs/integrations/vectorstores/pinecone_sparse.ipynb

diff --git a/docs/docs/integrations/providers/pinecone.mdx b/docs/docs/integrations/providers/pinecone.mdx
index c9c84822034..e6cf0b87e2c 100644
--- a/docs/docs/integrations/providers/pinecone.mdx
+++ b/docs/docs/integrations/providers/pinecone.mdx
@@ -27,6 +27,15 @@ from langchain_pinecone import PineconeVectorStore
 
 For a more detailed walkthrough of the Pinecone vectorstore, see [this notebook](/docs/integrations/vectorstores/pinecone)
 
+### Sparse Vector store
+
+```python
+from langchain_pinecone import PineconeSparseVectorStore
+```
+
+For a more detailed walkthrough of the Pinecone vectorstore, see [this notebook](/docs/integrations/vectorstores/pinecone_sparse)
+
+
 ## Retrievers
 
 ### Pinecone Hybrid Search
diff --git a/docs/docs/integrations/vectorstores/pinecone.ipynb b/docs/docs/integrations/vectorstores/pinecone.ipynb
index a571ce9bf11..fb511e574ea 100644
--- a/docs/docs/integrations/vectorstores/pinecone.ipynb
+++ b/docs/docs/integrations/vectorstores/pinecone.ipynb
@@ -34,7 +34,7 @@
    "id": "1917d123",
    "metadata": {},
    "source": [
-    "Migration note: if you are migrating from the `langchain_community.vectorstores` implementation of Pinecone, you may need to remove your `pinecone-client` v2 dependency before installing `langchain-pinecone`, which relies on `pinecone-client` v3."
+    "Migration note: if you are migrating from the `langchain_community.vectorstores` implementation of Pinecone, you may need to remove your `pinecone-client` v2 dependency before installing `langchain-pinecone`, which relies on `pinecone-client` v6."
    ]
   },
   {
diff --git a/docs/docs/integrations/vectorstores/pinecone_sparse.ipynb b/docs/docs/integrations/vectorstores/pinecone_sparse.ipynb
new file mode 100644
index 00000000000..70b156d5323
--- /dev/null
+++ b/docs/docs/integrations/vectorstores/pinecone_sparse.ipynb
@@ -0,0 +1,576 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Pinecone (sparse)\n",
+    "\n",
+    ">[Pinecone](https://docs.pinecone.io/docs/overview) is a vector database with broad functionality.\n",
+    "\n",
+    "This notebook shows how to use functionality related to the `Pinecone` vector database.\n",
+    "\n",
+    "## Setup\n",
+    "\n",
+    "To use the `PineconeSparseVectorStore` you first need to install the partner package, as well as the other packages used throughout this notebook."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "5g1-ZqcEONGD",
+    "outputId": "2d49c259-683b-46c2-994f-642f35e30357"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[33mWARNING: pinecone 6.0.2 does not provide the extra 'async'\u001b[0m\u001b[33m\n",
+      "\u001b[0m"
+     ]
+    }
+   ],
+   "source": [
+    "%pip install -qU \"langchain-pinecone==0.2.5\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "-1vfBVLhONGE"
+   },
+   "source": [
+    "### Credentials\n",
+    "Create a new Pinecone account, or sign into your existing one, and create an API key to use in this notebook."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "k_Dp_DIlONGF",
+    "outputId": "01728754-8708-4f05-e53d-2e251541370e"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Enter your Pinecone API key: ··········\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "from getpass import getpass\n",
+    "\n",
+    "from pinecone import Pinecone\n",
+    "\n",
+    "# get API key at app.pinecone.io\n",
+    "os.environ[\"PINECONE_API_KEY\"] = os.getenv(\"PINECONE_API_KEY\") or getpass(\n",
+    "    \"Enter your Pinecone API key: \"\n",
+    ")\n",
+    "\n",
+    "# initialize client\n",
+    "pc = Pinecone()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "OFqeT2xHONGF"
+   },
+   "source": [
+    "## Initialization\n",
+    "Before initializing our vector store, let's connect to a Pinecone index. If one named index_name doesn't exist, it will be created."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "9xNxmZsRONGF",
+    "outputId": "b661d0af-26bd-43b2-f277-5366efd1d865"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Index `langchain-sparse-vector-search` host: https://langchain-sparse-vector-search-yrrgefy.svc.aped-4627-b74a.pinecone.io\n"
+     ]
+    }
+   ],
+   "source": [
+    "from pinecone import AwsRegion, CloudProvider, Metric, ServerlessSpec\n",
+    "\n",
+    "index_name = \"langchain-sparse-vector-search\"  # change if desired\n",
+    "model_name = \"pinecone-sparse-english-v0\"\n",
+    "\n",
+    "if not pc.has_index(index_name):\n",
+    "    pc.create_index_for_model(\n",
+    "        name=index_name,\n",
+    "        cloud=CloudProvider.AWS,\n",
+    "        region=AwsRegion.US_EAST_1,\n",
+    "        embed={\n",
+    "            \"model\": model_name,\n",
+    "            \"field_map\": {\"text\": \"chunk_text\"},\n",
+    "            \"metric\": Metric.DOTPRODUCT,\n",
+    "        },\n",
+    "    )\n",
+    "\n",
+    "index = pc.Index(index_name)\n",
+    "print(f\"Index `{index_name}` host: {index.config.host}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "E0fDNTF9ONGF"
+   },
+   "source": [
+    "For our sparse embedding model we use [`pinecone-sparse-english-v0`](https://docs.pinecone.io/models/pinecone-sparse-english-v0), we initialize it like so:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "id": "swM_SFhOONGF"
+   },
+   "outputs": [],
+   "source": [
+    "from langchain_pinecone.embeddings import PineconeSparseEmbeddings\n",
+    "\n",
+    "sparse_embeddings = PineconeSparseEmbeddings(model=model_name)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "ZDL58ZRZONGF"
+   },
+   "source": [
+    "Now that our Pinecone index and embedding model are both ready, we can initialize our sparse vector store in LangChain:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "id": "GrBUli1VONGF"
+   },
+   "outputs": [],
+   "source": [
+    "from langchain_pinecone import PineconeSparseVectorStore\n",
+    "\n",
+    "vector_store = PineconeSparseVectorStore(index=index, embedding=sparse_embeddings)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Bp8aeN4SONGG"
+   },
+   "source": [
+    "## Manage vector store\n",
+    "Once you have created your vector store, we can interact with it by adding and deleting different items.\n",
+    "\n",
+    "### Add items to vector store\n",
+    "We can add items to our vector store by using the `add_documents` function."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "Ef0s0ovdONGG",
+    "outputId": "bd61d82f-902d-4010-9701-b0e2ecb02d17"
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['95b598af-c3dc-4a8a-bdb7-5d21283e5a86',\n",
+       " '838614a5-5635-4efd-9ac3-5237a37a542b',\n",
+       " '093fd11f-c85b-4c83-83f0-117df64ff442',\n",
+       " 'fb3ba32f-f802-410a-ad79-56f7bce938fe',\n",
+       " '75cde9bf-7e91-4f06-8bae-c824dab16a08',\n",
+       " '9de8f769-d604-4e56-b677-ee333cbc8e34',\n",
+       " 'f5f4ae97-88e6-4669-bcf7-87072bb08550',\n",
+       " 'f9f82811-187c-4b25-85b5-7a42b4da3bff',\n",
+       " 'ce45957c-e8fc-41ef-819b-1bd52b6fc815',\n",
+       " '66cacc6f-b8e2-441b-9f7f-468788aad88f']"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from uuid import uuid4\n",
+    "\n",
+    "from langchain_core.documents import Document\n",
+    "\n",
+    "documents = [\n",
+    "    Document(\n",
+    "        page_content=\"I had chocolate chip pancakes and scrambled eggs for breakfast this morning.\",\n",
+    "        metadata={\"source\": \"social\"},\n",
+    "    ),\n",
+    "    Document(\n",
+    "        page_content=\"The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.\",\n",
+    "        metadata={\"source\": \"news\"},\n",
+    "    ),\n",
+    "    Document(\n",
+    "        page_content=\"Building an exciting new project with LangChain - come check it out!\",\n",
+    "        metadata={\"source\": \"social\"},\n",
+    "    ),\n",
+    "    Document(\n",
+    "        page_content=\"Robbers broke into the city bank and stole $1 million in cash.\",\n",
+    "        metadata={\"source\": \"news\"},\n",
+    "    ),\n",
+    "    Document(\n",
+    "        page_content=\"Wow! That was an amazing movie. I can't wait to see it again.\",\n",
+    "        metadata={\"source\": \"social\"},\n",
+    "    ),\n",
+    "    Document(\n",
+    "        page_content=\"Is the new iPhone worth the price? Read this review to find out.\",\n",
+    "        metadata={\"source\": \"website\"},\n",
+    "    ),\n",
+    "    Document(\n",
+    "        page_content=\"The top 10 soccer players in the world right now.\",\n",
+    "        metadata={\"source\": \"website\"},\n",
+    "    ),\n",
+    "    Document(\n",
+    "        page_content=\"LangGraph is the best framework for building stateful, agentic applications!\",\n",
+    "        metadata={\"source\": \"social\"},\n",
+    "    ),\n",
+    "    Document(\n",
+    "        page_content=\"The stock market is down 500 points today due to fears of a recession.\",\n",
+    "        metadata={\"source\": \"news\"},\n",
+    "    ),\n",
+    "    Document(\n",
+    "        page_content=\"I have a bad feeling I am going to get deleted :(\",\n",
+    "        metadata={\"source\": \"social\"},\n",
+    "    ),\n",
+    "]\n",
+    "\n",
+    "uuids = [str(uuid4()) for _ in range(len(documents))]\n",
+    "\n",
+    "vector_store.add_documents(documents=documents, ids=uuids)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "KUIIEuYxONGG"
+   },
+   "source": [
+    "### Delete items from vector store"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "pLFI8xEJONGG"
+   },
+   "source": [
+    "We can delete records from our vector store using the `delete` method, providing it with a list of document IDs to delete."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "id": "eboEnsbRONGG"
+   },
+   "outputs": [],
+   "source": [
+    "vector_store.delete(ids=[uuids[-1]])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "ggaQ5g4bONGG"
+   },
+   "source": [
+    "## Query vector store\n",
+    "\n",
+    "Once we have loaded our documents into the vector store we're most likely ready to begin querying. There are various method for doing this in LangChain.\n",
+    "\n",
+    "First, we'll see how to perform a simple vector search by querying our `vector_store` directly via the `similarity_search` method:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "QcIS_P8CONGG",
+    "outputId": "774da46e-b919-4128-bc77-6c392e77f9f3"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "* Building an exciting new project with LangChain - come check it out! [{'source': 'social'}]\n",
+      "* Building an exciting new project with LangChain - come check it out! [{'source': 'social'}]\n",
+      "* LangGraph is the best framework for building stateful, agentic applications! [{'source': 'social'}]\n"
+     ]
+    }
+   ],
+   "source": [
+    "results = vector_store.similarity_search(\"I'm building a new LangChain project!\", k=3)\n",
+    "\n",
+    "for res in results:\n",
+    "    print(f\"* {res.page_content} [{res.metadata}]\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "n-JlbidpONGG"
+   },
+   "source": [
+    "We can also add [metadata filtering](https://docs.pinecone.io/guides/data/understanding-metadata#metadata-query-language) to our query to limit our search based on various criteria. Let's try a simple filter to limit our search to include only records with `source==\"social\"`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "N19HBLNCONGG",
+    "outputId": "9c5e96c2-0b4e-4083-cd6a-dd6d8662df09"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "* Building an exciting new project with LangChain - come check it out! [{'source': 'social'}]\n",
+      "* Building an exciting new project with LangChain - come check it out! [{'source': 'social'}]\n",
+      "* LangGraph is the best framework for building stateful, agentic applications! [{'source': 'social'}]\n"
+     ]
+    }
+   ],
+   "source": [
+    "results = vector_store.similarity_search(\n",
+    "    \"I'm building a new LangChain project!\",\n",
+    "    k=3,\n",
+    "    filter={\"source\": \"social\"},\n",
+    ")\n",
+    "for res in results:\n",
+    "    print(f\"* {res.page_content} [{res.metadata}]\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "yEs4xmESONGG"
+   },
+   "source": [
+    "When comparing these results, we can see that our first query returned a different record from the `\"website\"` source. In our latter, filtered, query — this is no longer the case."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "vWGkIfzCONGH"
+   },
+   "source": [
+    "### Similarity Search and Scores\n",
+    "\n",
+    "We can also search while returning the similarity score in a list of `(document, score)` tuples. Where the `document` is a LangChain `Document` object containing our text content and metadata."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "aRiHj8ADONGH",
+    "outputId": "5649d70a-0bd0-446d-8e60-2348170da706"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[SIM=12.959961] Building an exciting new project with LangChain - come check it out! [{'source': 'social'}]\n",
+      "[SIM=12.959961] Building an exciting new project with LangChain - come check it out! [{'source': 'social'}]\n",
+      "[SIM=1.942383] LangGraph is the best framework for building stateful, agentic applications! [{'source': 'social'}]\n"
+     ]
+    }
+   ],
+   "source": [
+    "results = vector_store.similarity_search_with_score(\n",
+    "    \"I'm building a new LangChain project!\", k=3, filter={\"source\": \"social\"}\n",
+    ")\n",
+    "for doc, score in results:\n",
+    "    print(f\"[SIM={score:3f}] {doc.page_content} [{doc.metadata}]\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "aqGelo6sONGH"
+   },
+   "source": [
+    "### As a Retriever\n",
+    "\n",
+    "In our chains and agents we'll often use the vector store as a `VectorStoreRetriever`. To create that, we use the `as_retriever` method:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "Tsn9--KsONGH",
+    "outputId": "3da43258-49fb-4080-cbdd-f97101fb8099"
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "VectorStoreRetriever(tags=['PineconeSparseVectorStore', 'PineconeSparseEmbeddings'], vectorstore=<langchain_pinecone.vectorstores_sparse.PineconeSparseVectorStore object at 0x7c8087b24290>, search_type='similarity_score_threshold', search_kwargs={'k': 3, 'score_threshold': 0.5})"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "retriever = vector_store.as_retriever(\n",
+    "    search_type=\"similarity_score_threshold\",\n",
+    "    search_kwargs={\"k\": 3, \"score_threshold\": 0.5},\n",
+    ")\n",
+    "retriever"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "yDqb3q-VUY2t"
+   },
+   "source": [
+    "We can now query our retriever using the `invoke` method:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "mGV0p0TDUJyx",
+    "outputId": "89db72c3-4ff2-4900-d302-482e54549b39"
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/usr/local/lib/python3.11/dist-packages/langchain_core/vectorstores/base.py:1082: UserWarning: Relevance scores must be between 0 and 1, got [(Document(id='093fd11f-c85b-4c83-83f0-117df64ff442', metadata={'source': 'social'}, page_content='Building an exciting new project with LangChain - come check it out!'), 6.97998045), (Document(id='54f8f645-9f77-4aab-b9fa-709fd91ae3b3', metadata={'source': 'social'}, page_content='Building an exciting new project with LangChain - come check it out!'), 6.97998045), (Document(id='f9f82811-187c-4b25-85b5-7a42b4da3bff', metadata={'source': 'social'}, page_content='LangGraph is the best framework for building stateful, agentic applications!'), 1.471191405)]\n",
+      "  self.vectorstore.similarity_search_with_relevance_scores(\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "[Document(id='093fd11f-c85b-4c83-83f0-117df64ff442', metadata={'source': 'social'}, page_content='Building an exciting new project with LangChain - come check it out!'),\n",
+       " Document(id='54f8f645-9f77-4aab-b9fa-709fd91ae3b3', metadata={'source': 'social'}, page_content='Building an exciting new project with LangChain - come check it out!'),\n",
+       " Document(id='f9f82811-187c-4b25-85b5-7a42b4da3bff', metadata={'source': 'social'}, page_content='LangGraph is the best framework for building stateful, agentic applications!')]"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "retriever.invoke(\n",
+    "    input=\"I'm building a new LangChain project!\", filter={\"source\": \"social\"}\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Usage for retrieval-augmented generation\n",
+    "\n",
+    "For guides on how to use this vector store for retrieval-augmented generation (RAG), see the following sections:\n",
+    "\n",
+    "- [Tutorials](/docs/tutorials/)\n",
+    "- [How-to: Question and answer with RAG](https://python.langchain.com/docs/how_to/#qa-with-rag)\n",
+    "- [Retrieval conceptual docs](https://python.langchain.com/docs/concepts/retrieval)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## API reference\n",
+    "\n",
+    "For detailed documentation of all features and configurations head to the API reference: \n",
+    "https://python.langchain.com/api_reference/pinecone/vectorstores_sparse/langchain_pinecone.vectorstores_sparse.PineconeSparseVectorStore.html#langchain_pinecone.vectorstores_sparse.PineconeSparseVectorStore\n",
+    "\n",
+    "Sparse Embeddings:\n",
+    "https://python.langchain.com/api_reference/pinecone/embeddings/langchain_pinecone.embeddings.PineconeSparseEmbeddings.html"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}