From dce8d08e1d14a1d5d8f886c9f1a1fe596c6a4ce3 Mon Sep 17 00:00:00 2001 From: YIWEI-HE Date: Thu, 24 Apr 2025 17:51:55 -0700 Subject: [PATCH] Add docs for IBM DB2 vector store. --- docs/docs/integrations/providers/ibm.mdx | 17 + docs/docs/integrations/vectorstores/db2.ipynb | 400 ++++++++++++++++++ libs/packages.yml | 3 + 3 files changed, 420 insertions(+) create mode 100644 docs/docs/integrations/vectorstores/db2.ipynb diff --git a/docs/docs/integrations/providers/ibm.mdx b/docs/docs/integrations/providers/ibm.mdx index da59bc82edd..f1dadce3f1b 100644 --- a/docs/docs/integrations/providers/ibm.mdx +++ b/docs/docs/integrations/providers/ibm.mdx @@ -77,3 +77,20 @@ See a [usage example](/docs/integrations/tools/ibm_watsonx). ```python from langchain_ibm import WatsonxToolkit ``` + +## Vector stores + +### IBM DB2 Vector Store and Vector Search + +The IBM DB2 relational database v12.1.2+ offer the +See detailed usage example in Jupyter Notebook [/docs/integrations/vectorstores/db2.ipynb](/docs/integrations/vectorstores/db2.ipynb). + +Installation: This is a seperate package for vector store feature only and can be run without the `langchain-ibm` package. +```python +pip install langchain-db2 +``` +Usage: +``` +from langchain_db2 import db2vs +from langchain_db2.db2vs import DB2VS +``` \ No newline at end of file diff --git a/docs/docs/integrations/vectorstores/db2.ipynb b/docs/docs/integrations/vectorstores/db2.ipynb new file mode 100644 index 00000000000..e9489819de1 --- /dev/null +++ b/docs/docs/integrations/vectorstores/db2.ipynb @@ -0,0 +1,400 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "dd33e9d5-9dba-4aac-9f7f-4cf9e6686593", + "metadata": {}, + "source": [ + "# IBM DB2 Vector Store and Vector Search\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "452c4bd6", + "metadata": {}, + "source": [ + "LangChain's DB2 integration (langchain-db2) provides vector store and vector search capabilities for working with IBM relational database DB2 version v12.1.2 and above, distributed under the MIT license. Users can use the provided implementations as-is or customize them for specific needs.\n", + " Key features include:\n", + "\n", + " * Vector storage with metadata\n", + " * Vector similarity search and max marginal relevance search, with metadata filtering options\n", + " * Support for dot production, cosine, and euclidean distance metrics\n", + " * Performance optimization by index creation and Approximate nearest neighbors search. (Will be added shortly)" + ] + }, + { + "cell_type": "markdown", + "id": "7bd80054-c803-47e1-a259-c40ed073c37d", + "metadata": {}, + "source": [ + "### Prerequisites for using Langchain with DB2 Vector Store and Search\n", + "\n", + "Install package `langchain-db2` which is the integration package for the db2 LangChain Vector Store and Search.\n", + "\n", + "The installation of the package should also install its dependencies like `langchain-core` and `ibm_db`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2bbb989d-c6fb-4ab9-bafd-a95fd48538d0", + "metadata": {}, + "outputs": [], + "source": [ + "# pip install langchain-db2" + ] + }, + { + "cell_type": "markdown", + "id": "0fceaa5a-95da-4ebd-8b8d-5e73bb653172", + "metadata": {}, + "source": [ + "### Connect to DB2 Vector Store\n", + "\n", + "The following sample code will show how to connect to DB2 Database. Besides the dependencies above, you will need a DB2 database instance (with version v12.1.2+, which has the vector datatype support) running." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4421e4b7-2c7e-4bcd-82b3-9576595edd0f", + "metadata": {}, + "outputs": [], + "source": [ + "import ibm_db\n", + "import ibm_db_dbi \n", + "\n", + "database = \"\"\n", + "username = \"\"\n", + "password = \"\"\n", + "\n", + "try:\n", + " connection = ibm_db_dbi.connect(database, username, password)\n", + " print(\"Connection successful!\")\n", + "except Exception as e:\n", + " print(\"Connection failed!\")" + ] + }, + { + "cell_type": "markdown", + "id": "b11cf362-01b0-485d-8527-31b0fbb5028e", + "metadata": {}, + "source": [ + "### Import the required dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43ea59e3-2910-45a6-b195-5f06094bb7c9", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_db2 import db2vs\n", + "from langchain_db2.db2vs import DB2VS\n", + "from langchain_community.vectorstores.utils import DistanceStrategy\n", + "from langchain_core.documents import Document\n", + "from langchain_community.embeddings import HuggingFaceEmbeddings" + ] + }, + { + "cell_type": "markdown", + "id": "0aac10dc-a9cc-4fdb-901c-1b7a4bbbe5a7", + "metadata": {}, + "source": [ + "### Create Documents" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70ac6982-b13a-4e8c-9c47-57c6d136ac60", + "metadata": {}, + "outputs": [], + "source": [ + "# Define a list of documents\n", + "documents_json_list = [\n", + " {\n", + " \"id\": \"doc_1_2_P4\",\n", + " \"text\": \"Db2 handles LOB data differently than other kinds of data. As a result, you sometimes need to take additional actions when you define LOB columns and insert the LOB data.\",\n", + " \"link\": \"https://www.ibm.com/docs/en/db2-for-zos/12?topic=programs-storing-lob-data-in-tables\",\n", + " },\n", + " {\n", + " \"id\": \"doc_15.5.1_P1\",\n", + " \"text\": \"Introduced in Db2 13, SQL Data Insights brought artificial intelligence (AI) functionality to the Db2 for z/OS engine. It provided the capability to run SQL AI query to find valuable insights hidden in your Db2 data and help you make better business decisions.\",\n", + " \"link\": \"https://community.ibm.com/community/user/datamanagement/blogs/neena-cherian/2023/03/07/accelerating-db2-ai-queries-with-the-new-vector-pr\",\n", + " },\n", + " {\n", + " \"id\": \"id_22.3.4.3.1_P2\",\n", + " \"text\": \"Data structures are elements that are required to use DB2®. You can access and use these elements to organize your data. Examples of data structures include tables, table spaces, indexes, index spaces, keys, views, and databases.\",\n", + " \"link\": \"https://www.ibm.com/docs/en/zos-basic-skills?topic=concepts-db2-data-structures\",\n", + " },\n", + " {\n", + " \"id\": \"id_3.4.3.1_P3\",\n", + " \"text\": \"DB2® maintains a set of tables that contain information about the data that DB2 controls. These tables are collectively known as the catalog. The catalog tables contain information about DB2 objects such as tables, views, and indexes. When you create, alter, or drop an object, DB2 inserts, updates, or deletes rows of the catalog that describe the object.\",\n", + " \"link\": \"https://www.ibm.com/docs/en/zos-basic-skills?topic=objects-db2-catalog\",\n", + " },\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eaa942d6-5954-4898-8c32-3627b923a3a5", + "metadata": {}, + "outputs": [], + "source": [ + "# Create Langchain Documents\n", + "\n", + "documents_langchain = []\n", + "\n", + "for doc in documents_json_list:\n", + " metadata = {\"id\": doc[\"id\"], \"link\": doc[\"link\"]}\n", + " doc_langchain = Document(page_content=doc[\"text\"], metadata=metadata)\n", + " documents_langchain.append(doc_langchain)" + ] + }, + { + "cell_type": "markdown", + "id": "6823f5e6-997c-4f15-927b-bd44c61f105f", + "metadata": {}, + "source": [ + "### Create Vector Stores with different distance metrics\n", + "\n", + "First we will create three vector stores each with different distance strategies. \n", + "\n", + "(You can manually connect to the DB2 Database and will see three tables : \n", + "Documents_DOT, Documents_COSINE and Documents_EUCLIDEAN. )\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed1b253e-5f5c-4a81-983c-74645213a170", + "metadata": {}, + "outputs": [], + "source": [ + "# Create DB2 Vector Stores using different distance strategies\n", + "\n", + "# When using our API calls, start by initializing your vector store with a subset of your documents\n", + "# through from_documents(), then incrementally add more documents using add_texts().\n", + "# This approach prevents system overload and ensures efficient document processing.\n", + "\n", + "model = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")\n", + "\n", + "vector_store_dot = DB2VS.from_documents(\n", + " documents_langchain,\n", + " model,\n", + " client=connection,\n", + " table_name=\"Documents_DOT\",\n", + " distance_strategy=DistanceStrategy.DOT_PRODUCT,\n", + ")\n", + "vector_store_max = DB2VS.from_documents(\n", + " documents_langchain,\n", + " model,\n", + " client=connection,\n", + " table_name=\"Documents_COSINE\",\n", + " distance_strategy=DistanceStrategy.COSINE,\n", + ")\n", + "vector_store_euclidean = DB2VS.from_documents(\n", + " documents_langchain,\n", + " model,\n", + " client=connection,\n", + " table_name=\"Documents_EUCLIDEAN\",\n", + " distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "id": "77c29505-8688-4b87-9a99-e648fbb2d425", + "metadata": {}, + "source": [ + "### Demonstrating add and delete operations for texts, along with basic similarity search\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "306563ae-577b-4bc7-8a92-3dd6a59310f5", + "metadata": {}, + "outputs": [], + "source": [ + "def manage_texts(vector_stores):\n", + " \"\"\"\n", + " Adds texts to each vector store, demonstrates error handling for duplicate additions,\n", + " and performs deletion of texts. Showcases similarity searches and index creation for each vector store.\n", + "\n", + " Args:\n", + " - vector_stores (list): A list of DB2VS instances.\n", + " \"\"\"\n", + " texts = [\"Rohan\", \"Shailendra\"]\n", + " metadata = [\n", + " {\"id\": \"100\", \"link\": \"Document Example Test 1\"},\n", + " {\"id\": \"101\", \"link\": \"Document Example Test 2\"},\n", + " ]\n", + "\n", + " for i, vs in enumerate(vector_stores, start=1):\n", + " # Adding texts\n", + " try:\n", + " vs.add_texts(texts, metadata)\n", + " print(f\"\\n\\n\\nAdd texts complete for vector store {i}\\n\\n\\n\")\n", + " except Exception as ex:\n", + " print(f\"\\n\\n\\nExpected error on duplicate add for vector store {i}\\n\\n\\n\")\n", + "\n", + " # Deleting texts using the value of 'id'\n", + " vs.delete([metadata[0][\"id\"], metadata[1][\"id\"]])\n", + " print(f\"\\n\\n\\nDelete texts complete for vector store {i}\\n\\n\\n\")\n", + "\n", + " # Similarity search\n", + " results = vs.similarity_search(\"How are LOBS stored in DB2 Database\", 2)\n", + " print(f\"\\n\\n\\nSimilarity search results for vector store {i}: {results}\\n\\n\\n\")\n", + "\n", + "\n", + "vector_store_list = [\n", + " vector_store_dot,\n", + " vector_store_max,\n", + " vector_store_euclidean,\n", + "]\n", + "manage_texts(vector_store_list)" + ] + }, + { + "cell_type": "markdown", + "id": "0980cb33-69cf-4547-842a-afdc4d6fa7d3", + "metadata": {}, + "source": [ + "### Demonstrating index creation with specific parameters for each distance strategy\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46298a27-e309-456e-b2b8-771d9cb3be29", + "metadata": {}, + "outputs": [], + "source": [ + "# TODO: enable this part when the create index is implemented in db2:\n", + "\n", + "# def create_search_indices(connection):\n", + "# \"\"\"\n", + "# Creates search indices for the vector stores, each with specific parameters tailored to their distance strategy.\n", + "# \"\"\"\n", + "# # Index for DOT_PRODUCT strategy\n", + "# # Notice we are creating a HNSW index with default parameters\n", + "# # This will default to creating a HNSW index with 8 Parallel Workers and use the Default Accuracy used by DB2 AI Vector Search\n", + "# db2vs.create_index(\n", + "# connection,\n", + "# vector_store_dot,\n", + "# params={\"idx_name\": \"hnsw_idx1\", \"idx_type\": \"HNSW\"},\n", + "# )\n", + "\n", + "# # Index for COSINE strategy with specific parameters\n", + "# # Notice we are creating a HNSW index with parallel 16 and Target Accuracy Specification as 97 percent\n", + "# db2vs.create_index(\n", + "# connection,\n", + "# vector_store_max,\n", + "# params={\n", + "# \"idx_name\": \"hnsw_idx2\",\n", + "# \"idx_type\": \"HNSW\",\n", + "# \"accuracy\": 97,\n", + "# \"parallel\": 16,\n", + "# },\n", + "# )\n", + "\n", + "# # Index for EUCLIDEAN_DISTANCE strategy with specific parameters\n", + "# # Notice we are creating a HNSW index by specifying Power User Parameters which are neighbors = 64 and efConstruction = 100\n", + "# db2vs.create_index(\n", + "# connection,\n", + "# vector_store_euclidean,\n", + "# params={\n", + "# \"idx_name\": \"hnsw_idx3\",\n", + "# \"idx_type\": \"HNSW\",\n", + "# \"neighbors\": 64,\n", + "# \"efConstruction\": 100,\n", + "# },\n", + "# )\n", + "\n", + "\n", + "# create_search_indices(connection)" + ] + }, + { + "cell_type": "markdown", + "id": "7223d048-5c0b-4e91-a91b-a7daa9f86758", + "metadata": {}, + "source": [ + "### Demonstrate advanced searches on vector stores, with and without attribute filtering \n", + "\n", + "With filtering, we only select the document id 101 and nothing else" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37ca2e7d-9803-4260-95e7-62776d4fb820", + "metadata": {}, + "outputs": [], + "source": [ + "# Conduct advanced searches\n", + "def conduct_advanced_searches(vector_stores):\n", + " query = \"How are LOBS stored in DB2 Database\"\n", + " # Constructing a filter for direct comparison against document metadata\n", + " # This filter aims to include documents whose metadata 'id' is exactly '101'\n", + " filter_criteria = {\"id\": [\"101\"]} # Direct comparison filter\n", + "\n", + " for i, vs in enumerate(vector_stores, start=1):\n", + " print(f\"\\n--- Vector Store {i} Advanced Searches ---\")\n", + " # Similarity search without a filter\n", + " print(\"\\nSimilarity search results without filter:\")\n", + " print(vs.similarity_search(query, 2))\n", + "\n", + " # Similarity search with a filter\n", + " print(\"\\nSimilarity search results with filter:\")\n", + " print(vs.similarity_search(query, 2, filter=filter_criteria))\n", + "\n", + " # Similarity search with relevance score\n", + " print(\"\\nSimilarity search with relevance score:\")\n", + " print(vs.similarity_search_with_score(query, 2))\n", + "\n", + " # Similarity search with relevance score with filter\n", + " print(\"\\nSimilarity search with relevance score with filter:\")\n", + " print(vs.similarity_search_with_score(query, 2, filter=filter_criteria))\n", + "\n", + " # Max marginal relevance search\n", + " print(\"\\nMax marginal relevance search results:\")\n", + " print(vs.max_marginal_relevance_search(query, 2, fetch_k=20, lambda_mult=0.5))\n", + "\n", + " # Max marginal relevance search with filter\n", + " print(\"\\nMax marginal relevance search results with filter:\")\n", + " print(vs.max_marginal_relevance_search(\n", + " query, 2, fetch_k=20, lambda_mult=0.5, filter=filter_criteria))\n", + "\n", + "\n", + "conduct_advanced_searches(vector_store_list)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pythonEnv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.21" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/libs/packages.yml b/libs/packages.yml index 70db4b652d7..f29f1f638e8 100644 --- a/libs/packages.yml +++ b/libs/packages.yml @@ -643,3 +643,6 @@ packages: repo: valyu-network/langchain-valyu downloads: 120 downloads_updated_at: '2025-04-22T15:25:24.644345+00:00' +- name: langchain-db2 + path: . + repo: langchain-ai/langchain-ibm