chroma docs (#1012)

This commit is contained in:
Harrison Chase
2023-02-12 23:02:01 -08:00
committed by GitHub
parent 0c553d2064
commit 7fb33fca47
18 changed files with 354 additions and 179 deletions

View File

@@ -1,7 +1,6 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "134a0785",
"metadata": {},
@@ -19,11 +18,10 @@
"outputs": [],
"source": [
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.vectorstores.faiss import FAISS\n",
"from langchain.vectorstores import Chroma\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.llms import OpenAI\n",
"from langchain.chains import ChatVectorDBChain\n",
"from langchain.document_loaders import TextLoader"
"from langchain.chains import ChatVectorDBChain"
]
},
{
@@ -41,6 +39,7 @@
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import TextLoader\n",
"loader = TextLoader('../../state_of_the_union.txt')\n",
"documents = loader.load()"
]
@@ -76,16 +75,25 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 3,
"id": "a8930cf7",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running Chroma using direct local API.\n",
"Using DuckDB in-memory for database. Data will be transient.\n"
]
}
],
"source": [
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"documents = text_splitter.split_documents(documents)\n",
"\n",
"embeddings = OpenAIEmbeddings()\n",
"vectorstore = FAISS.from_documents(documents, embeddings)"
"vectorstore = Chroma.from_documents(documents, embeddings)"
]
},
{

View File

@@ -21,7 +21,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 4,
"id": "78f28130",
"metadata": {},
"outputs": [],
@@ -30,14 +30,14 @@
"from langchain.embeddings.cohere import CohereEmbeddings\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch\n",
"from langchain.vectorstores.faiss import FAISS\n",
"from langchain.vectorstores import Chroma\n",
"from langchain.docstore.document import Document\n",
"from langchain.prompts import PromptTemplate"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 5,
"id": "4da195a3",
"metadata": {},
"outputs": [],
@@ -52,17 +52,26 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 6,
"id": "5ec2b55b",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running Chroma using direct local API.\n",
"Using DuckDB in-memory for database. Data will be transient.\n"
]
}
],
"source": [
"docsearch = FAISS.from_texts(texts, embeddings, metadatas=[{\"source\": i} for i in range(len(texts))])"
"docsearch = Chroma.from_texts(texts, embeddings, metadatas=[{\"source\": str(i)} for i in range(len(texts))])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 7,
"id": "5286f58f",
"metadata": {},
"outputs": [],
@@ -73,7 +82,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 8,
"id": "005a47e9",
"metadata": {},
"outputs": [],
@@ -93,7 +102,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 9,
"id": "3722373b",
"metadata": {},
"outputs": [
@@ -103,7 +112,7 @@
"{'output_text': ' The president thanked Justice Breyer for his service.\\nSOURCES: 30-pl'}"
]
},
"execution_count": 13,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -699,7 +708,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.9.1"
},
"vscode": {
"interpreter": {

View File

@@ -28,7 +28,7 @@
"source": [
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.vectorstores.faiss import FAISS\n",
"from langchain.vectorstores import Chroma\n",
"from langchain.docstore.document import Document\n",
"from langchain.prompts import PromptTemplate"
]
@@ -40,27 +40,37 @@
"metadata": {},
"outputs": [],
"source": [
"with open('../../state_of_the_union.txt') as f:\n",
" state_of_the_union = f.read()\n",
"from langchain.document_loaders import TextLoader\n",
"loader = TextLoader('../../state_of_the_union.txt')\n",
"documents = loader.load()\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"texts = text_splitter.split_text(state_of_the_union)\n",
"texts = text_splitter.split_documents(documents)\n",
"\n",
"embeddings = OpenAIEmbeddings()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"id": "fd9666a9",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running Chroma using direct local API.\n",
"Using DuckDB in-memory for database. Data will be transient.\n"
]
}
],
"source": [
"docsearch = FAISS.from_texts(texts, embeddings)"
"docsearch = Chroma.from_documents(texts, embeddings)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"id": "d1eaf6e6",
"metadata": {},
"outputs": [],
@@ -673,7 +683,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.9.1"
},
"vscode": {
"interpreter": {

View File

@@ -18,7 +18,7 @@
"outputs": [],
"source": [
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.vectorstores.faiss import FAISS\n",
"from langchain.vectorstores import Chroma\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain import OpenAI, VectorDBQA"
]
@@ -28,15 +28,25 @@
"execution_count": 2,
"id": "5c7049db",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running Chroma using direct local API.\n",
"Using DuckDB in-memory for database. Data will be transient.\n"
]
}
],
"source": [
"with open('../../state_of_the_union.txt') as f:\n",
" state_of_the_union = f.read()\n",
"from langchain.document_loaders import TextLoader\n",
"loader = TextLoader('../../state_of_the_union.txt')\n",
"documents = loader.load()\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"texts = text_splitter.split_text(state_of_the_union)\n",
"texts = text_splitter.split_documents(documents)\n",
"\n",
"embeddings = OpenAIEmbeddings()\n",
"docsearch = FAISS.from_texts(texts, embeddings)"
"docsearch = Chroma.from_documents(texts, embeddings)"
]
},
{
@@ -58,7 +68,7 @@
{
"data": {
"text/plain": [
"\" The president said that Ketanji Brown Jackson is one of the nation's top legal minds, a former top litigator in private practice, a former federal public defender, from a family of public school educators and police officers, a consensus builder, and has received a broad range of support from the Fraternal Order of Police to former judges appointed by Democrats and Republicans.\""
"\" The president said that Ketanji Brown Jackson is one of the nation's top legal minds, a former top litigator in private practice, a former federal public defender, and from a family of public school educators and police officers. He also said that she is a consensus builder, and has received a broad range of support from the Fraternal Order of Police to former judges appointed by Democrats and Republicans.\""
]
},
"execution_count": 4,
@@ -256,7 +266,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.9.1"
},
"vscode": {
"interpreter": {

View File

@@ -21,7 +21,7 @@
"from langchain.embeddings.cohere import CohereEmbeddings\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch\n",
"from langchain.vectorstores.faiss import FAISS"
"from langchain.vectorstores import Chroma"
]
},
{
@@ -41,29 +41,27 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 5,
"id": "0e745d99",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running Chroma using direct local API.\n",
"Using DuckDB in-memory for database. Data will be transient.\n",
"Exiting: Cleaning up .chroma directory\n"
]
}
],
"source": [
"docsearch = FAISS.from_texts(texts, embeddings)"
"docsearch = Chroma.from_texts(texts, embeddings, metadatas=[{\"source\": f\"{i}-pl\"} for i in range(len(texts))])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f42d79dc",
"metadata": {},
"outputs": [],
"source": [
"# Add in a fake source information\n",
"for i, d in enumerate(docsearch.docstore._dict.values()):\n",
" d.metadata = {'source': f\"{i}-pl\"}"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"id": "8aa571ae",
"metadata": {},
"outputs": [],
@@ -73,7 +71,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"id": "aa859d4c",
"metadata": {},
"outputs": [],
@@ -85,18 +83,18 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"id": "8ba36fa7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'answer': ' The president thanked Justice Breyer for his service.\\n',\n",
"{'answer': ' The president thanked Justice Breyer for his service and mentioned his legacy of excellence.\\n',\n",
" 'sources': '30-pl'}"
]
},
"execution_count": 7,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -207,7 +205,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.9.1"
},
"vscode": {
"interpreter": {

View File

@@ -28,7 +28,7 @@
"from langchain.docstore.document import Document\n",
"import requests\n",
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.vectorstores.faiss import FAISS\n",
"from langchain.vectorstores import Chroma\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.prompts import PromptTemplate\n",
"import pathlib\n",
@@ -96,7 +96,7 @@
"metadata": {},
"outputs": [],
"source": [
"search_index = FAISS.from_documents(source_chunks, OpenAIEmbeddings())"
"search_index = Chroma.from_documents(source_chunks, OpenAIEmbeddings())"
]
},
{
@@ -191,7 +191,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.9.1"
}
},
"nbformat": 4,

View File

@@ -12,7 +12,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 5,
"id": "8b54479e",
"metadata": {},
"outputs": [],
@@ -65,36 +65,46 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 1,
"id": "aab39528",
"metadata": {},
"outputs": [],
"source": [
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.vectorstores.faiss import FAISS\n",
"from langchain.vectorstores import Chroma\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain import OpenAI, VectorDBQA"
]
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 3,
"id": "16a85d5e",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running Chroma using direct local API.\n",
"Using DuckDB in-memory for database. Data will be transient.\n"
]
}
],
"source": [
"with open('../../state_of_the_union.txt') as f:\n",
" state_of_the_union = f.read()\n",
"from langchain.document_loaders import TextLoader\n",
"loader = TextLoader('../../state_of_the_union.txt')\n",
"documents = loader.load()\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"texts = text_splitter.split_text(state_of_the_union)\n",
"texts = text_splitter.split_documents(documents)\n",
"\n",
"embeddings = OpenAIEmbeddings()\n",
"vectorstore = FAISS.from_texts(texts, embeddings)"
"vectorstore = Chroma.from_documents(texts, embeddings)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 6,
"id": "6a82e91e",
"metadata": {},
"outputs": [],
@@ -104,17 +114,17 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 7,
"id": "efe9b25b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\" The president said that Jackson is one of the nation's top legal minds, a former top litigator in private practice, a former federal public defender, and from a family of public school educators and police officers, and that she has received a broad range of support from the Fraternal Order of Police to former judges appointed by Democrats and Republicans.\""
"\" The president said that Ketanji Brown Jackson is a Circuit Court of Appeals Judge, one of the nation's top legal minds, a former top litigator in private practice, a former federal public defender, has received a broad range of support from the Fraternal Order of Police to former judges appointed by Democrats and Republicans, and will continue Justice Breyer's legacy of excellence.\""
]
},
"execution_count": 10,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -149,7 +159,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.9.1"
}
},
"nbformat": 4,