mirror of
				https://github.com/hwchase17/langchain.git
				synced 2025-11-03 17:54:10 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			339 lines
		
	
	
		
			9.4 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			339 lines
		
	
	
		
			9.4 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
{
 | 
						|
 "cells": [
 | 
						|
  {
 | 
						|
   "attachments": {},
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "1292f057",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "# pg_embedding\n",
 | 
						|
    "\n",
 | 
						|
    "> [pg_embedding](https://github.com/neondatabase/pg_embedding) is an open-source vector similarity search extension for `Postgres` that uses Hierarchical Navigable Small Worlds (HNSW) for approximate nearest neighbor search.\n",
 | 
						|
    "\n",
 | 
						|
    "It supports:\n",
 | 
						|
    "- exact and approximate nearest neighbor search using HNSW\n",
 | 
						|
    "- L2 distance\n",
 | 
						|
    "\n",
 | 
						|
    "This notebook shows how to use the Postgres vector database (`PGEmbedding`).\n",
 | 
						|
    "\n",
 | 
						|
    "> The PGEmbedding integration creates the pg_embedding extension for you, but you can also run the following Postgres query to add it manually:\n",
 | 
						|
    "```sql\n",
 | 
						|
    "CREATE EXTENSION embedding;\n",
 | 
						|
    "```"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "a6214221",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "# Pip install necessary package\n",
 | 
						|
    "!pip install openai\n",
 | 
						|
    "!pip install psycopg2-binary\n",
 | 
						|
    "!pip install tiktoken"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "attachments": {},
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "b2e49694",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "Add the OpenAI API Key to the environment variables to use `OpenAIEmbeddings`."
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 2,
 | 
						|
   "id": "1dcc8d99",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "name": "stdout",
 | 
						|
     "output_type": "stream",
 | 
						|
     "text": [
 | 
						|
      "OpenAI API Key:········\n"
 | 
						|
     ]
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "import os\n",
 | 
						|
    "import getpass\n",
 | 
						|
    "\n",
 | 
						|
    "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 3,
 | 
						|
   "id": "9719ea68",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "## Loading Environment Variables\n",
 | 
						|
    "from typing import List, Tuple"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "dfd1f38d",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "from langchain.embeddings.openai import OpenAIEmbeddings\n",
 | 
						|
    "from langchain.text_splitter import CharacterTextSplitter\n",
 | 
						|
    "from langchain.vectorstores import PGEmbedding\n",
 | 
						|
    "from langchain.document_loaders import TextLoader\n",
 | 
						|
    "from langchain.docstore.document import Document"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 5,
 | 
						|
   "id": "8fab8cc2",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "name": "stdout",
 | 
						|
     "output_type": "stream",
 | 
						|
     "text": [
 | 
						|
      "Database Url:········\n"
 | 
						|
     ]
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "os.environ[\"DATABASE_URL\"] = getpass.getpass(\"Database Url:\")"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 6,
 | 
						|
   "id": "bef17115",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "loader = TextLoader(\"state_of_the_union.txt\")\n",
 | 
						|
    "documents = loader.load()\n",
 | 
						|
    "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
 | 
						|
    "docs = text_splitter.split_documents(documents)\n",
 | 
						|
    "\n",
 | 
						|
    "embeddings = OpenAIEmbeddings()\n",
 | 
						|
    "connection_string = os.environ.get(\"DATABASE_URL\")\n",
 | 
						|
    "collection_name = \"state_of_the_union\""
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 13,
 | 
						|
   "id": "743abfaa",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "db = PGEmbedding.from_documents(\n",
 | 
						|
    "    embedding=embeddings,\n",
 | 
						|
    "    documents=docs,\n",
 | 
						|
    "    collection_name=collection_name,\n",
 | 
						|
    "    connection_string=connection_string,\n",
 | 
						|
    ")\n",
 | 
						|
    "\n",
 | 
						|
    "query = \"What did the president say about Ketanji Brown Jackson\"\n",
 | 
						|
    "docs_with_score: List[Tuple[Document, float]] = db.similarity_search_with_score(query)"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "41ce4c4e",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "for doc, score in docs_with_score:\n",
 | 
						|
    "    print(\"-\" * 80)\n",
 | 
						|
    "    print(\"Score: \", score)\n",
 | 
						|
    "    print(doc.page_content)\n",
 | 
						|
    "    print(\"-\" * 80)"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "attachments": {},
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "7ef7b052",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "## Working with vectorstore in Postgres"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "attachments": {},
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "939151f7",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "### Uploading a vectorstore in PG"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 32,
 | 
						|
   "id": "595ac511",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "db = PGEmbedding.from_documents(\n",
 | 
						|
    "    embedding=embeddings,\n",
 | 
						|
    "    documents=docs,\n",
 | 
						|
    "    collection_name=collection_name,\n",
 | 
						|
    "    connection_string=connection_string,\n",
 | 
						|
    "    pre_delete_collection=False,\n",
 | 
						|
    ")"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "attachments": {},
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "f9510e6b",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "### Create HNSW Index\n",
 | 
						|
    "By default, the extension performs a sequential scan search, with 100% recall. You might consider creating an HNSW index for approximate nearest neighbor (ANN) search to speed up `similarity_search_with_score` execution time. To create the HNSW index on your vector column, use the `create_hnsw_index` function:"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "2d1981fa",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "PGEmbedding.create_hnsw_index(\n",
 | 
						|
    "    max_elements=10000, dims=1536, m=8, ef_construction=16, ef_search=16\n",
 | 
						|
    ")"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "attachments": {},
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "7adacf29",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "The function above is equivalent to running the below SQL query:\n",
 | 
						|
    "```sql\n",
 | 
						|
    "CREATE INDEX ON vectors USING hnsw(vec) WITH (maxelements=10000, dims=1536, m=8, efconstruction=16, efsearch=16);\n",
 | 
						|
    "```\n",
 | 
						|
    "The HNSW index options used in the statement above include:\n",
 | 
						|
    "\n",
 | 
						|
    "- maxelements: Defines the maximum number of elements indexed. This is a required parameter. The example shown above has a value of 10000. A real-world example might have a much larger value, such as 1000000. An \"element\" refers to a data point (a vector) in the dataset, which is represented as a node in the HNSW graph. Typically, you would set this option to a value able to accommodate the number of rows in your dataset.\n",
 | 
						|
    "- dims: Defines the number of dimensions in your vector data. This is a required parameter. The example above uses a value of 1536 because the data is generated using OpenAI's text-embedding-ada-002 model, which supports 1536 dimensions. If your embeddings come from a different model, set this option to that model's number of dimensions.\n",
 | 
						|
    "- m: Defines the maximum number of bi-directional links (also referred to as \"edges\") created for each node during graph construction.\n\n",
 | 
						|
    "The following additional index options are supported:\n",
 | 
						|
    "\n",
 | 
						|
    "- efconstruction: Defines the number of nearest neighbors considered during index construction. The default value is 32.\n",
 | 
						|
    "- efsearch: Defines the number of nearest neighbors considered during index search. The default value is 32.\n\n",
 | 
						|
    "For information about how you can configure these options to influence the HNSW algorithm, refer to [Tuning the HNSW algorithm](https://neon.tech/docs/extensions/pg_embedding#tuning-the-hnsw-algorithm)."
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "attachments": {},
 | 
						|
   "cell_type": "markdown",
 | 
						|
   "id": "528893fb",
 | 
						|
   "metadata": {},
 | 
						|
   "source": [
 | 
						|
    "### Retrieving a vectorstore in PG"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 15,
 | 
						|
   "id": "b6162b1c",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "store = PGEmbedding(\n",
 | 
						|
    "    connection_string=connection_string,\n",
 | 
						|
    "    embedding_function=embeddings,\n",
 | 
						|
    "    collection_name=collection_name,\n",
 | 
						|
    ")\n",
 | 
						|
    "\n",
 | 
						|
    "retriever = store.as_retriever()"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 16,
 | 
						|
   "id": "1a5fedb1",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [
 | 
						|
    {
 | 
						|
     "data": {
 | 
						|
      "text/plain": [
 | 
						|
       "VectorStoreRetriever(vectorstore=<langchain.vectorstores.pghnsw.HNSWVectoreStore object at 0x121d3c8b0>, search_type='similarity', search_kwargs={})"
 | 
						|
      ]
 | 
						|
     },
 | 
						|
     "execution_count": 16,
 | 
						|
     "metadata": {},
 | 
						|
     "output_type": "execute_result"
 | 
						|
    }
 | 
						|
   ],
 | 
						|
   "source": [
 | 
						|
    "retriever"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": 17,
 | 
						|
   "id": "0cefc938",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "db1 = PGEmbedding.from_existing_index(\n",
 | 
						|
    "    embedding=embeddings,\n",
 | 
						|
    "    collection_name=collection_name,\n",
 | 
						|
    "    pre_delete_collection=False,\n",
 | 
						|
    "    connection_string=connection_string,\n",
 | 
						|
    ")\n",
 | 
						|
    "\n",
 | 
						|
    "query = \"What did the president say about Ketanji Brown Jackson\"\n",
 | 
						|
    "docs_with_score: List[Tuple[Document, float]] = db1.similarity_search_with_score(query)"
 | 
						|
   ]
 | 
						|
  },
 | 
						|
  {
 | 
						|
   "cell_type": "code",
 | 
						|
   "execution_count": null,
 | 
						|
   "id": "85cde495",
 | 
						|
   "metadata": {},
 | 
						|
   "outputs": [],
 | 
						|
   "source": [
 | 
						|
    "for doc, score in docs_with_score:\n",
 | 
						|
    "    print(\"-\" * 80)\n",
 | 
						|
    "    print(\"Score: \", score)\n",
 | 
						|
    "    print(doc.page_content)\n",
 | 
						|
    "    print(\"-\" * 80)"
 | 
						|
   ]
 | 
						|
  }
 | 
						|
 ],
 | 
						|
 "metadata": {
 | 
						|
  "kernelspec": {
 | 
						|
   "display_name": "Python 3 (ipykernel)",
 | 
						|
   "language": "python",
 | 
						|
   "name": "python3"
 | 
						|
  },
 | 
						|
  "language_info": {
 | 
						|
   "codemirror_mode": {
 | 
						|
    "name": "ipython",
 | 
						|
    "version": 3
 | 
						|
   },
 | 
						|
   "file_extension": ".py",
 | 
						|
   "mimetype": "text/x-python",
 | 
						|
   "name": "python",
 | 
						|
   "nbconvert_exporter": "python",
 | 
						|
   "pygments_lexer": "ipython3",
 | 
						|
   "version": "3.9.6"
 | 
						|
  }
 | 
						|
 },
 | 
						|
 "nbformat": 4,
 | 
						|
 "nbformat_minor": 5
 | 
						|
}
 |