chroma docs (#1012)

2025-09-08 06:23:20 +00:00 · 2023-02-12 23:02:01 -08:00
parent 0c553d2064
commit 7fb33fca47
18 changed files with 354 additions and 179 deletions
--- a/docs/modules/chains/combine_docs_examples/chat_vector_db.ipynb
+++ b/docs/modules/chains/combine_docs_examples/chat_vector_db.ipynb
@@ -1,7 +1,6 @@
 {
 "cells": [
  {
-   "attachments": {},
   "cell_type": "markdown",
   "id": "134a0785",
   "metadata": {},
@@ -19,11 +18,10 @@
   "outputs": [],
   "source": [
    "from langchain.embeddings.openai import OpenAIEmbeddings\n",
-    "from langchain.vectorstores.faiss import FAISS\n",
+    "from langchain.vectorstores import Chroma\n",
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "from langchain.llms import OpenAI\n",
-    "from langchain.chains import ChatVectorDBChain\n",
-    "from langchain.document_loaders import TextLoader"
+    "from langchain.chains import ChatVectorDBChain"
   ]
  },
  {
@@ -41,6 +39,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
+    "from langchain.document_loaders import TextLoader\n",
    "loader = TextLoader('../../state_of_the_union.txt')\n",
    "documents = loader.load()"
   ]
@@ -76,16 +75,25 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 3,
   "id": "a8930cf7",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running Chroma using direct local API.\n",
+      "Using DuckDB in-memory for database. Data will be transient.\n"
+     ]
+    }
+   ],
   "source": [
    "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
    "documents = text_splitter.split_documents(documents)\n",
    "\n",
    "embeddings = OpenAIEmbeddings()\n",
-    "vectorstore = FAISS.from_documents(documents, embeddings)"
+    "vectorstore = Chroma.from_documents(documents, embeddings)"
   ]
  },
  {
--- a/docs/modules/chains/combine_docs_examples/qa_with_sources.ipynb
+++ b/docs/modules/chains/combine_docs_examples/qa_with_sources.ipynb
@@ -21,7 +21,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 4,
   "id": "78f28130",
   "metadata": {},
   "outputs": [],
@@ -30,14 +30,14 @@
    "from langchain.embeddings.cohere import CohereEmbeddings\n",
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch\n",
-    "from langchain.vectorstores.faiss import FAISS\n",
+    "from langchain.vectorstores import Chroma\n",
    "from langchain.docstore.document import Document\n",
    "from langchain.prompts import PromptTemplate"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 5,
   "id": "4da195a3",
   "metadata": {},
   "outputs": [],
@@ -52,17 +52,26 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 6,
   "id": "5ec2b55b",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running Chroma using direct local API.\n",
+      "Using DuckDB in-memory for database. Data will be transient.\n"
+     ]
+    }
+   ],
   "source": [
-    "docsearch = FAISS.from_texts(texts, embeddings, metadatas=[{\"source\": i} for i in range(len(texts))])"
+    "docsearch = Chroma.from_texts(texts, embeddings, metadatas=[{\"source\": str(i)} for i in range(len(texts))])"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 7,
   "id": "5286f58f",
   "metadata": {},
   "outputs": [],
@@ -73,7 +82,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 8,
   "id": "005a47e9",
   "metadata": {},
   "outputs": [],
@@ -93,7 +102,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 9,
   "id": "3722373b",
   "metadata": {},
   "outputs": [
@@ -103,7 +112,7 @@
       "{'output_text': ' The president thanked Justice Breyer for his service.\\nSOURCES: 30-pl'}"
      ]
     },
-     "execution_count": 13,
+     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -699,7 +708,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.9.1"
  },
  "vscode": {
   "interpreter": {
--- a/docs/modules/chains/combine_docs_examples/question_answering.ipynb
+++ b/docs/modules/chains/combine_docs_examples/question_answering.ipynb
@@ -28,7 +28,7 @@
   "source": [
    "from langchain.embeddings.openai import OpenAIEmbeddings\n",
    "from langchain.text_splitter import CharacterTextSplitter\n",
-    "from langchain.vectorstores.faiss import FAISS\n",
+    "from langchain.vectorstores import Chroma\n",
    "from langchain.docstore.document import Document\n",
    "from langchain.prompts import PromptTemplate"
   ]
@@ -40,27 +40,37 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "with open('../../state_of_the_union.txt') as f:\n",
-    "    state_of_the_union = f.read()\n",
+    "from langchain.document_loaders import TextLoader\n",
+    "loader = TextLoader('../../state_of_the_union.txt')\n",
+    "documents = loader.load()\n",
    "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
-    "texts = text_splitter.split_text(state_of_the_union)\n",
+    "texts = text_splitter.split_documents(documents)\n",
    "\n",
    "embeddings = OpenAIEmbeddings()"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
   "id": "fd9666a9",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running Chroma using direct local API.\n",
+      "Using DuckDB in-memory for database. Data will be transient.\n"
+     ]
+    }
+   ],
   "source": [
-    "docsearch = FAISS.from_texts(texts, embeddings)"
+    "docsearch = Chroma.from_documents(texts, embeddings)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
   "id": "d1eaf6e6",
   "metadata": {},
   "outputs": [],
@@ -673,7 +683,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.9.1"
  },
  "vscode": {
   "interpreter": {
--- a/docs/modules/chains/combine_docs_examples/vector_db_qa.ipynb
+++ b/docs/modules/chains/combine_docs_examples/vector_db_qa.ipynb
@@ -18,7 +18,7 @@
   "outputs": [],
   "source": [
    "from langchain.embeddings.openai import OpenAIEmbeddings\n",
-    "from langchain.vectorstores.faiss import FAISS\n",
+    "from langchain.vectorstores import Chroma\n",
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "from langchain import OpenAI, VectorDBQA"
   ]
@@ -28,15 +28,25 @@
   "execution_count": 2,
   "id": "5c7049db",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running Chroma using direct local API.\n",
+      "Using DuckDB in-memory for database. Data will be transient.\n"
+     ]
+    }
+   ],
   "source": [
-    "with open('../../state_of_the_union.txt') as f:\n",
-    "    state_of_the_union = f.read()\n",
+    "from langchain.document_loaders import TextLoader\n",
+    "loader = TextLoader('../../state_of_the_union.txt')\n",
+    "documents = loader.load()\n",
    "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
-    "texts = text_splitter.split_text(state_of_the_union)\n",
+    "texts = text_splitter.split_documents(documents)\n",
    "\n",
    "embeddings = OpenAIEmbeddings()\n",
-    "docsearch = FAISS.from_texts(texts, embeddings)"
+    "docsearch = Chroma.from_documents(texts, embeddings)"
   ]
  },
  {
@@ -58,7 +68,7 @@
    {
     "data": {
      "text/plain": [
-       "\" The president said that Ketanji Brown Jackson is one of the nation's top legal minds, a former top litigator in private practice, a former federal public defender, from a family of public school educators and police officers, a consensus builder, and has received a broad range of support from the Fraternal Order of Police to former judges appointed by Democrats and Republicans.\""
+       "\" The president said that Ketanji Brown Jackson is one of the nation's top legal minds, a former top litigator in private practice, a former federal public defender, and from a family of public school educators and police officers. He also said that she is a consensus builder, and has received a broad range of support from the Fraternal Order of Police to former judges appointed by Democrats and Republicans.\""
      ]
     },
     "execution_count": 4,
@@ -256,7 +266,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.9.1"
  },
  "vscode": {
   "interpreter": {
--- a/docs/modules/chains/combine_docs_examples/vector_db_qa_with_sources.ipynb
+++ b/docs/modules/chains/combine_docs_examples/vector_db_qa_with_sources.ipynb
@@ -21,7 +21,7 @@
    "from langchain.embeddings.cohere import CohereEmbeddings\n",
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch\n",
-    "from langchain.vectorstores.faiss import FAISS"
+    "from langchain.vectorstores import Chroma"
   ]
  },
  {
@@ -41,29 +41,27 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 5,
   "id": "0e745d99",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running Chroma using direct local API.\n",
+      "Using DuckDB in-memory for database. Data will be transient.\n",
+      "Exiting: Cleaning up .chroma directory\n"
+     ]
+    }
+   ],
   "source": [
-    "docsearch = FAISS.from_texts(texts, embeddings)"
+    "docsearch = Chroma.from_texts(texts, embeddings, metadatas=[{\"source\": f\"{i}-pl\"} for i in range(len(texts))])"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
-   "id": "f42d79dc",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Add in a fake source information\n",
-    "for i, d in enumerate(docsearch.docstore._dict.values()):\n",
-    "    d.metadata = {'source': f\"{i}-pl\"}"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
   "id": "8aa571ae",
   "metadata": {},
   "outputs": [],
@@ -73,7 +71,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
   "id": "aa859d4c",
   "metadata": {},
   "outputs": [],
@@ -85,18 +83,18 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
   "id": "8ba36fa7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "{'answer': ' The president thanked Justice Breyer for his service.\\n',\n",
+       "{'answer': ' The president thanked Justice Breyer for his service and mentioned his legacy of excellence.\\n',\n",
       " 'sources': '30-pl'}"
      ]
     },
-     "execution_count": 7,
+     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -207,7 +205,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.9.1"
  },
  "vscode": {
   "interpreter": {
--- a/docs/modules/chains/combine_docs_examples/vector_db_text_generation.ipynb
+++ b/docs/modules/chains/combine_docs_examples/vector_db_text_generation.ipynb
@@ -28,7 +28,7 @@
    "from langchain.docstore.document import Document\n",
    "import requests\n",
    "from langchain.embeddings.openai import OpenAIEmbeddings\n",
-    "from langchain.vectorstores.faiss import FAISS\n",
+    "from langchain.vectorstores import Chroma\n",
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "from langchain.prompts import PromptTemplate\n",
    "import pathlib\n",
@@ -96,7 +96,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "search_index = FAISS.from_documents(source_chunks, OpenAIEmbeddings())"
+    "search_index = Chroma.from_documents(source_chunks, OpenAIEmbeddings())"
   ]
  },
  {
@@ -191,7 +191,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.9.1"
  }
 },
 "nbformat": 4,
--- a/docs/modules/chains/generic/from_hub.ipynb
+++ b/docs/modules/chains/generic/from_hub.ipynb
@@ -12,7 +12,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 5,
   "id": "8b54479e",
   "metadata": {},
   "outputs": [],
@@ -65,36 +65,46 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 1,
   "id": "aab39528",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.embeddings.openai import OpenAIEmbeddings\n",
-    "from langchain.vectorstores.faiss import FAISS\n",
+    "from langchain.vectorstores import Chroma\n",
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "from langchain import OpenAI, VectorDBQA"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 3,
   "id": "16a85d5e",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running Chroma using direct local API.\n",
+      "Using DuckDB in-memory for database. Data will be transient.\n"
+     ]
+    }
+   ],
   "source": [
-    "with open('../../state_of_the_union.txt') as f:\n",
-    "    state_of_the_union = f.read()\n",
+    "from langchain.document_loaders import TextLoader\n",
+    "loader = TextLoader('../../state_of_the_union.txt')\n",
+    "documents = loader.load()\n",
    "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
-    "texts = text_splitter.split_text(state_of_the_union)\n",
+    "texts = text_splitter.split_documents(documents)\n",
    "\n",
    "embeddings = OpenAIEmbeddings()\n",
-    "vectorstore = FAISS.from_texts(texts, embeddings)"
+    "vectorstore = Chroma.from_documents(texts, embeddings)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 6,
   "id": "6a82e91e",
   "metadata": {},
   "outputs": [],
@@ -104,17 +114,17 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 7,
   "id": "efe9b25b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "\" The president said that Jackson is one of the nation's top legal minds, a former top litigator in private practice, a former federal public defender, and from a family of public school educators and police officers, and that she has received a broad range of support from the Fraternal Order of Police to former judges appointed by Democrats and Republicans.\""
+       "\" The president said that Ketanji Brown Jackson is a Circuit Court of Appeals Judge, one of the nation's top legal minds, a former top litigator in private practice, a former federal public defender, has received a broad range of support from the Fraternal Order of Police to former judges appointed by Democrats and Republicans, and will continue Justice Breyer's legacy of excellence.\""
      ]
     },
-     "execution_count": 10,
+     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -149,7 +159,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.9.1"
  }
 },
 "nbformat": 4,