feat: Supprt pgvecto.rs as a VectorStore (#12718)

Supprt [pgvecto.rs](https://github.com/tensorchord/pgvecto.rs) as a new VectorStore type. This introduces a new dependency [pgvecto_rs](https://pypi.org/project/pgvecto_rs/) and upgrade SQLAlchemy to ^2. Relate to https://github.com/tensorchord/pgvecto.rs/issues/11 --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
2025-09-05 21:12:48 +00:00 · 2023-11-03 08:16:04 +08:00
parent 0cbdba6a9b
commit 1b233798a0
2 changed files with 463 additions and 0 deletions
--- a/docs/docs/integrations/vectorstores/pgvecto_rs.ipynb
+++ b/docs/docs/integrations/vectorstores/pgvecto_rs.ipynb
@@ -0,0 +1,214 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# PGVecto.rs\n",
+    "\n",
+    "This notebook shows how to use functionality related to the Postgres vector database ([pgvecto.rs](https://github.com/tensorchord/pgvecto.rs)). You need to install SQLAlchemy >= 2 manually."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Loading Environment Variables\n",
+    "from dotenv import load_dotenv\n",
+    "\n",
+    "load_dotenv()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from typing import List\n",
+    "from langchain.embeddings.openai import OpenAIEmbeddings\n",
+    "from langchain.text_splitter import CharacterTextSplitter\n",
+    "from langchain.vectorstores.pgvecto_rs import PGVecto_rs\n",
+    "from langchain.document_loaders import TextLoader\n",
+    "from langchain.docstore.document import Document"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = TextLoader(\"../../../state_of_the_union.txt\")\n",
+    "documents = loader.load()\n",
+    "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
+    "docs = text_splitter.split_documents(documents)\n",
+    "\n",
+    "embeddings = OpenAIEmbeddings()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Start the database with the [official demo docker image](https://github.com/tensorchord/pgvecto.rs#installation)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "shellscript"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "docker run --name pgvecto-rs-demo -e POSTGRES_PASSWORD=mysecretpassword -p 5432:5432 -d tensorchord/pgvecto-rs:latest"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Then contruct the db URL"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## PGVecto.rs needs the connection string to the database.\n",
+    "## We will load it from the environment variables.\n",
+    "import os\n",
+    "\n",
+    "PORT = os.getenv(\"DB_PORT\", 5432)\n",
+    "HOST = os.getenv(\"DB_HOST\", \"localhost\")\n",
+    "USER = os.getenv(\"DB_USER\", \"postgres\")\n",
+    "PASS = os.getenv(\"DB_PASS\", \"mysecretpassword\")\n",
+    "DB_NAME = os.getenv(\"DB_NAME\", \"postgres\")\n",
+    "\n",
+    "# Run tests with shell:\n",
+    "URL = \"postgresql+psycopg://{username}:{password}@{host}:{port}/{db_name}\".format(\n",
+    "    port=PORT,\n",
+    "    host=HOST,\n",
+    "    username=USER,\n",
+    "    password=PASS,\n",
+    "    db_name=DB_NAME,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Finally, create the VectorStore from the documents:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "db1 = PGVecto_rs.from_documents(\n",
+    "    documents=docs,\n",
+    "    embedding=embeddings,\n",
+    "    db_url=URL,\n",
+    "    # The table name is f\"collection_{collection_name}\", so that it should be unique.\n",
+    "    collection_name=\"state_of_the_union\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can connect to the table laterly with:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create new empty vectorstore with collection_name.\n",
+    "# Or connect to an existing vectorstore in database if exists.\n",
+    "# Arguments should be the same as when the vectorstore was created.\n",
+    "db1 = PGVecto_rs.from_collection_name(\n",
+    "    embedding=embeddings,\n",
+    "    db_url=URL,\n",
+    "    collection_name=\"state_of_the_union\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Make sure that the user is permitted to create a table."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Similarity search with score"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Similarity Search with Euclidean Distance (Default)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query = \"What did the president say about Ketanji Brown Jackson\"\n",
+    "docs: List[Document] = db1.similarity_search(query, k=4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for doc in docs:\n",
+    "    print(doc.page_content)\n",
+    "    print(\"======================\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}