mirror of
				https://github.com/hwchase17/langchain.git
				synced 2025-10-25 12:44:04 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			872 lines
		
	
	
		
			34 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			872 lines
		
	
	
		
			34 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| {
 | ||
|  "cells": [
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "#  Deep Lake\n",
 | ||
|     "\n",
 | ||
|     ">[Deep Lake](https://docs.activeloop.ai/) is a Multi-Modal Vector Store that stores embeddings and their metadata, including text, JSON, images, audio, video, and more. It saves the data locally, in your cloud, or on Activeloop storage. It performs hybrid search including embeddings and their attributes.\n",
 | ||
|     "\n",
 | ||
|     "This notebook showcases basic functionality related to `Deep Lake`. While `Deep Lake` can store embeddings, it is capable of storing any type of data. It is a fully fledged serverless data lake with version control, query engine and streaming dataloader to deep learning frameworks.  \n",
 | ||
|     "\n",
 | ||
|     "For more information, please see the Deep Lake [documentation](https://docs.activeloop.ai) or [API reference](https://docs.deeplake.ai)."
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": null,
 | ||
|    "metadata": {
 | ||
|     "tags": []
 | ||
|    },
 | ||
|    "outputs": [],
 | ||
|    "source": [
 | ||
|     "!pip install openai deeplake tiktoken"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 2,
 | ||
|    "metadata": {
 | ||
|     "tags": []
 | ||
|    },
 | ||
|    "outputs": [],
 | ||
|    "source": [
 | ||
|     "from langchain.embeddings.openai import OpenAIEmbeddings\n",
 | ||
|     "from langchain.text_splitter import CharacterTextSplitter\n",
 | ||
|     "from langchain.vectorstores import DeepLake"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 3,
 | ||
|    "metadata": {
 | ||
|     "tags": []
 | ||
|    },
 | ||
|    "outputs": [
 | ||
|     {
 | ||
|      "name": "stdin",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "OpenAI API Key: ········\n"
 | ||
|      ]
 | ||
|     }
 | ||
|    ],
 | ||
|    "source": [
 | ||
|     "import os\n",
 | ||
|     "import getpass\n",
 | ||
|     "\n",
 | ||
|     "os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')\n",
 | ||
|     "embeddings = OpenAIEmbeddings()"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 4,
 | ||
|    "metadata": {
 | ||
|     "tags": []
 | ||
|    },
 | ||
|    "outputs": [],
 | ||
|    "source": [
 | ||
|     "from langchain.document_loaders import TextLoader\n",
 | ||
|     "\n",
 | ||
|     "loader = TextLoader('../../../state_of_the_union.txt')\n",
 | ||
|     "documents = loader.load()\n",
 | ||
|     "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
 | ||
|     "docs = text_splitter.split_documents(documents)\n",
 | ||
|     "\n",
 | ||
|     "embeddings = OpenAIEmbeddings()"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "Create a dataset locally at `./my_deeplake/`, then run a similarity search. The Deep Lake + LangChain integration uses Deep Lake datasets under the hood, so `dataset` and `vector store` are used interchangeably. To create a dataset in your own cloud, or in the Deep Lake storage, [adjust the path accordingly](https://docs.activeloop.ai/storage-and-credentials/storage-options)."
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 6,
 | ||
|    "metadata": {
 | ||
|     "tags": []
 | ||
|    },
 | ||
|    "outputs": [
 | ||
|     {
 | ||
|      "name": "stderr",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "/home/leo/.local/lib/python3.10/site-packages/deeplake/util/check_latest_version.py:32: UserWarning: A newer version of deeplake (3.3.2) is available. It's recommended that you update to the latest version using `pip install -U deeplake`.\n",
 | ||
|       "  warnings.warn(\n"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stdout",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "./my_deeplake/ loaded successfully.\n"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stderr",
 | ||
|      "output_type": "stream",
 | ||
|      "text": []
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stderr",
 | ||
|      "output_type": "stream",
 | ||
|      "text": []
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stderr",
 | ||
|      "output_type": "stream",
 | ||
|      "text": []
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stderr",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "Evaluating ingest: 100%|██████████████████████████████████████| 1/1 [00:07<00:00\n"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stdout",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "Dataset(path='./my_deeplake/', tensors=['embedding', 'ids', 'metadata', 'text'])\n",
 | ||
|       "\n",
 | ||
|       "  tensor     htype     shape      dtype  compression\n",
 | ||
|       "  -------   -------   -------    -------  ------- \n",
 | ||
|       " embedding  generic  (42, 1536)  float32   None   \n",
 | ||
|       "    ids      text     (42, 1)      str     None   \n",
 | ||
|       " metadata    json     (42, 1)      str     None   \n",
 | ||
|       "   text      text     (42, 1)      str     None   \n"
 | ||
|      ]
 | ||
|     }
 | ||
|    ],
 | ||
|    "source": [
 | ||
|     "db = DeepLake(dataset_path=\"./my_deeplake/\", embedding_function=embeddings)\n",
 | ||
|     "db.add_documents(docs)\n",
 | ||
|     "# or shorter\n",
 | ||
|     "# db = DeepLake.from_documents(docs, dataset_path=\"./my_deeplake/\", embedding=embeddings, overwrite=True)\n",
 | ||
|     "query = \"What did the president say about Ketanji Brown Jackson\"\n",
 | ||
|     "docs = db.similarity_search(query)"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 7,
 | ||
|    "metadata": {
 | ||
|     "tags": []
 | ||
|    },
 | ||
|    "outputs": [
 | ||
|     {
 | ||
|      "name": "stdout",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n",
 | ||
|       "\n",
 | ||
|       "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n",
 | ||
|       "\n",
 | ||
|       "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n",
 | ||
|       "\n",
 | ||
|       "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n"
 | ||
|      ]
 | ||
|     }
 | ||
|    ],
 | ||
|    "source": [
 | ||
|     "print(docs[0].page_content)"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "Later, you can reload the dataset without recomputing embeddings"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 8,
 | ||
|    "metadata": {
 | ||
|     "tags": []
 | ||
|    },
 | ||
|    "outputs": [
 | ||
|     {
 | ||
|      "name": "stdout",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "./my_deeplake/ loaded successfully.\n",
 | ||
|       "\n"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stderr",
 | ||
|      "output_type": "stream",
 | ||
|      "text": []
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stderr",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "Deep Lake Dataset in ./my_deeplake/ already exists, loading from the storage\n"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stdout",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "Dataset(path='./my_deeplake/', read_only=True, tensors=['embedding', 'ids', 'metadata', 'text'])\n",
 | ||
|       "\n",
 | ||
|       "  tensor     htype     shape      dtype  compression\n",
 | ||
|       "  -------   -------   -------    -------  ------- \n",
 | ||
|       " embedding  generic  (42, 1536)  float32   None   \n",
 | ||
|       "    ids      text     (42, 1)      str     None   \n",
 | ||
|       " metadata    json     (42, 1)      str     None   \n",
 | ||
|       "   text      text     (42, 1)      str     None   \n"
 | ||
|      ]
 | ||
|     }
 | ||
|    ],
 | ||
|    "source": [
 | ||
|     "db = DeepLake(dataset_path=\"./my_deeplake/\", embedding_function=embeddings, read_only=True)\n",
 | ||
|     "docs = db.similarity_search(query)"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "Deep Lake, for now, is single writer and multiple reader. Setting `read_only=True` helps to avoid acquiring the writer lock."
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "### Retrieval Question/Answering"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 9,
 | ||
|    "metadata": {
 | ||
|     "tags": []
 | ||
|    },
 | ||
|    "outputs": [
 | ||
|     {
 | ||
|      "name": "stderr",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "/home/leo/.local/lib/python3.10/site-packages/langchain/llms/openai.py:624: UserWarning: You are trying to use a chat model. This way of initializing it is no longer supported. Instead, please use: `from langchain.chat_models import ChatOpenAI`\n",
 | ||
|       "  warnings.warn(\n"
 | ||
|      ]
 | ||
|     }
 | ||
|    ],
 | ||
|    "source": [
 | ||
|     "from langchain.chains import RetrievalQA\n",
 | ||
|     "from langchain.llms import OpenAIChat\n",
 | ||
|     "\n",
 | ||
|     "qa = RetrievalQA.from_chain_type(llm=OpenAIChat(model='gpt-3.5-turbo'), chain_type='stuff', retriever=db.as_retriever())"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 10,
 | ||
|    "metadata": {
 | ||
|     "tags": []
 | ||
|    },
 | ||
|    "outputs": [
 | ||
|     {
 | ||
|      "data": {
 | ||
|       "text/plain": [
 | ||
|        "'The president nominated Ketanji Brown Jackson to serve on the United States Supreme Court. He described her as a former top litigator in private practice, a former federal public defender, a consensus builder, and from a family of public school educators and police officers. He also mentioned that she has received broad support from various groups since being nominated.'"
 | ||
|       ]
 | ||
|      },
 | ||
|      "execution_count": 10,
 | ||
|      "metadata": {},
 | ||
|      "output_type": "execute_result"
 | ||
|     }
 | ||
|    ],
 | ||
|    "source": [
 | ||
|     "query = 'What did the president say about Ketanji Brown Jackson'\n",
 | ||
|     "qa.run(query)"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "### Attribute based filtering in metadata"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 54,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [
 | ||
|     {
 | ||
|      "name": "stdout",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "./my_deeplake/ loaded successfully.\n"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stderr",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "Evaluating ingest: 100%|██████████| 1/1 [00:04<00:00\n"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stdout",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "Dataset(path='./my_deeplake/', tensors=['embedding', 'ids', 'metadata', 'text'])\n",
 | ||
|       "\n",
 | ||
|       "  tensor     htype     shape     dtype  compression\n",
 | ||
|       "  -------   -------   -------   -------  ------- \n",
 | ||
|       " embedding  generic  (4, 1536)  float32   None   \n",
 | ||
|       "    ids      text     (4, 1)      str     None   \n",
 | ||
|       " metadata    json     (4, 1)      str     None   \n",
 | ||
|       "   text      text     (4, 1)      str     None   \n"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stderr",
 | ||
|      "output_type": "stream",
 | ||
|      "text": []
 | ||
|     }
 | ||
|    ],
 | ||
|    "source": [
 | ||
|     "import random\n",
 | ||
|     "\n",
 | ||
|     "for d in docs:\n",
 | ||
|     "    d.metadata['year'] = random.randint(2012, 2014)\n",
 | ||
|     "\n",
 | ||
|     "db = DeepLake.from_documents(docs, embeddings, dataset_path=\"./my_deeplake/\", overwrite=True)"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 55,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [
 | ||
|     {
 | ||
|      "name": "stderr",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "100%|██████████| 4/4 [00:00<00:00, 1080.24it/s]\n"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "data": {
 | ||
|       "text/plain": [
 | ||
|        "[Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2013}),\n",
 | ||
|        " Document(page_content='And for our LGBTQ+ Americans, let’s finally get the bipartisan Equality Act to my desk. The onslaught of state laws targeting transgender Americans and their families is wrong. \\n\\nAs I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. \\n\\nWhile it often appears that we never agree, that isn’t true. I signed 80 bipartisan bills into law last year. From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice. \\n\\nAnd soon, we’ll strengthen the Violence Against Women Act that I first wrote three decades ago. It is important for us to show the nation that we can come together and do big things. \\n\\nSo tonight I’m offering a Unity Agenda for the Nation. Four big things we can do together.  \\n\\nFirst, beat the opioid epidemic.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2013})]"
 | ||
|       ]
 | ||
|      },
 | ||
|      "execution_count": 55,
 | ||
|      "metadata": {},
 | ||
|      "output_type": "execute_result"
 | ||
|     }
 | ||
|    ],
 | ||
|    "source": [
 | ||
|     "db.similarity_search('What did the president say about Ketanji Brown Jackson', filter={'year': 2013})"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "### Choosing distance function\n",
 | ||
|     "Select the distance function with `distance_metric`: `L2` for Euclidean, `L1` for Manhattan, `Max` for l-infinity distance, `cos` for cosine similarity, `dot` for dot product."
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 56,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [
 | ||
|     {
 | ||
|      "data": {
 | ||
|       "text/plain": [
 | ||
|        "[Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2013}),\n",
 | ||
|        " Document(page_content='A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \\n\\nAnd if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \\n\\nWe can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling.  \\n\\nWe’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers.  \\n\\nWe’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. \\n\\nWe’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2012}),\n",
 | ||
|        " Document(page_content='And for our LGBTQ+ Americans, let’s finally get the bipartisan Equality Act to my desk. The onslaught of state laws targeting transgender Americans and their families is wrong. \\n\\nAs I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. \\n\\nWhile it often appears that we never agree, that isn’t true. I signed 80 bipartisan bills into law last year. From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice. \\n\\nAnd soon, we’ll strengthen the Violence Against Women Act that I first wrote three decades ago. It is important for us to show the nation that we can come together and do big things. \\n\\nSo tonight I’m offering a Unity Agenda for the Nation. Four big things we can do together.  \\n\\nFirst, beat the opioid epidemic.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2013}),\n",
 | ||
|        " Document(page_content='Tonight, I’m announcing a crackdown on these companies overcharging American businesses and consumers. \\n\\nAnd as Wall Street firms take over more nursing homes, quality in those homes has gone down and costs have gone up.  \\n\\nThat ends on my watch. \\n\\nMedicare is going to set higher standards for nursing homes and make sure your loved ones get the care they deserve and expect. \\n\\nWe’ll also cut costs and keep the economy going strong by giving workers a fair shot, provide more training and apprenticeships, hire them based on their skills not degrees. \\n\\nLet’s pass the Paycheck Fairness Act and paid leave.  \\n\\nRaise the minimum wage to $15 an hour and extend the Child Tax Credit, so no one has to raise a family in poverty. \\n\\nLet’s increase Pell Grants and increase our historic support of HBCUs, and invest in what Jill—our First Lady who teaches full-time—calls America’s best-kept secret: community colleges.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2012})]"
 | ||
|       ]
 | ||
|      },
 | ||
|      "execution_count": 56,
 | ||
|      "metadata": {},
 | ||
|      "output_type": "execute_result"
 | ||
|     }
 | ||
|    ],
 | ||
|    "source": [
 | ||
|     "db.similarity_search('What did the president say about Ketanji Brown Jackson?', distance_metric='cos')"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "### Maximal Marginal relevance\n",
 | ||
|     "Using maximal marginal relevance"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 57,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [
 | ||
|     {
 | ||
|      "data": {
 | ||
|       "text/plain": [
 | ||
|        "[Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2013}),\n",
 | ||
|        " Document(page_content='Tonight, I’m announcing a crackdown on these companies overcharging American businesses and consumers. \\n\\nAnd as Wall Street firms take over more nursing homes, quality in those homes has gone down and costs have gone up.  \\n\\nThat ends on my watch. \\n\\nMedicare is going to set higher standards for nursing homes and make sure your loved ones get the care they deserve and expect. \\n\\nWe’ll also cut costs and keep the economy going strong by giving workers a fair shot, provide more training and apprenticeships, hire them based on their skills not degrees. \\n\\nLet’s pass the Paycheck Fairness Act and paid leave.  \\n\\nRaise the minimum wage to $15 an hour and extend the Child Tax Credit, so no one has to raise a family in poverty. \\n\\nLet’s increase Pell Grants and increase our historic support of HBCUs, and invest in what Jill—our First Lady who teaches full-time—calls America’s best-kept secret: community colleges.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2012}),\n",
 | ||
|        " Document(page_content='A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \\n\\nAnd if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \\n\\nWe can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling.  \\n\\nWe’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers.  \\n\\nWe’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. \\n\\nWe’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2012}),\n",
 | ||
|        " Document(page_content='And for our LGBTQ+ Americans, let’s finally get the bipartisan Equality Act to my desk. The onslaught of state laws targeting transgender Americans and their families is wrong. \\n\\nAs I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. \\n\\nWhile it often appears that we never agree, that isn’t true. I signed 80 bipartisan bills into law last year. From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice. \\n\\nAnd soon, we’ll strengthen the Violence Against Women Act that I first wrote three decades ago. It is important for us to show the nation that we can come together and do big things. \\n\\nSo tonight I’m offering a Unity Agenda for the Nation. Four big things we can do together.  \\n\\nFirst, beat the opioid epidemic.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2013})]"
 | ||
|       ]
 | ||
|      },
 | ||
|      "execution_count": 57,
 | ||
|      "metadata": {},
 | ||
|      "output_type": "execute_result"
 | ||
|     }
 | ||
|    ],
 | ||
|    "source": [
 | ||
|     "db.max_marginal_relevance_search('What did the president say about Ketanji Brown Jackson?')"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "### Delete dataset"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 59,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [],
 | ||
|    "source": [
 | ||
|     "db.delete_dataset()"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "If the delete fails, you can also force-delete the dataset by its path."
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 61,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [
 | ||
|     {
 | ||
|      "name": "stderr",
 | ||
|      "output_type": "stream",
 | ||
|      "text": []
 | ||
|     }
 | ||
|    ],
 | ||
|    "source": [
 | ||
|     "DeepLake.force_delete_by_path(\"./my_deeplake\")"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "## Deep Lake datasets on cloud (Activeloop, AWS, GCS, etc.) or in memory\n",
 | ||
|     "By default, Deep Lake datasets are stored locally. If you want to store them in memory, in the Deep Lake Managed DB, or in any object storage, you can provide the [corresponding path to the dataset](https://docs.activeloop.ai/storage-and-credentials/storage-options). You can retrieve your user token from [app.activeloop.ai](https://app.activeloop.ai/)."
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 62,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [],
 | ||
|    "source": [
 | ||
|     "os.environ['ACTIVELOOP_TOKEN'] = getpass.getpass('Activeloop Token:')"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 63,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [
 | ||
|     {
 | ||
|      "name": "stdout",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "Your Deep Lake dataset has been successfully created!\n",
 | ||
|       "The dataset is private so make sure you are logged in!\n",
 | ||
|       "This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/davitbun/langchain_test\n",
 | ||
|       "hub://davitbun/langchain_test loaded successfully.\n"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stderr",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "Evaluating ingest: 100%|██████████| 1/1 [00:14<00:00\n",
 | ||
|       " \r"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stdout",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "Dataset(path='hub://davitbun/langchain_test', tensors=['embedding', 'ids', 'metadata', 'text'])\n",
 | ||
|       "\n",
 | ||
|       "  tensor     htype     shape     dtype  compression\n",
 | ||
|       "  -------   -------   -------   -------  ------- \n",
 | ||
|       " embedding  generic  (4, 1536)  float32   None   \n",
 | ||
|       "    ids      text     (4, 1)      str     None   \n",
 | ||
|       " metadata    json     (4, 1)      str     None   \n",
 | ||
|       "   text      text     (4, 1)      str     None   \n"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "data": {
 | ||
|       "text/plain": [
 | ||
|        "['d6d6ccb4-e187-11ed-b66d-41c5f7b85421',\n",
 | ||
|        " 'd6d6ccb5-e187-11ed-b66d-41c5f7b85421',\n",
 | ||
|        " 'd6d6ccb6-e187-11ed-b66d-41c5f7b85421',\n",
 | ||
|        " 'd6d6ccb7-e187-11ed-b66d-41c5f7b85421']"
 | ||
|       ]
 | ||
|      },
 | ||
|      "execution_count": 63,
 | ||
|      "metadata": {},
 | ||
|      "output_type": "execute_result"
 | ||
|     }
 | ||
|    ],
 | ||
|    "source": [
 | ||
|     "# Embed and store the texts\n",
 | ||
|     "username = \"<username>\" # your username on app.activeloop.ai   \n",
 | ||
|     "dataset_path = f\"hub://{username}/langchain_test\" # could be also ./local/path (much faster locally), s3://bucket/path/to/dataset, gcs://path/to/dataset, etc.\n",
 | ||
|     "\n",
 | ||
|     "embedding = OpenAIEmbeddings()\n",
 | ||
|     "db = DeepLake(dataset_path=dataset_path, embedding_function=embeddings, overwrite=True)\n",
 | ||
|     "db.add_documents(docs)"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 64,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [
 | ||
|     {
 | ||
|      "name": "stdout",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n",
 | ||
|       "\n",
 | ||
|       "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n",
 | ||
|       "\n",
 | ||
|       "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n",
 | ||
|       "\n",
 | ||
|       "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n"
 | ||
|      ]
 | ||
|     }
 | ||
|    ],
 | ||
|    "source": [
 | ||
|     "query = \"What did the president say about Ketanji Brown Jackson\"\n",
 | ||
|     "docs = db.similarity_search(query)\n",
 | ||
|     "print(docs[0].page_content)"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "## Creating dataset on AWS S3"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 82,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [
 | ||
|     {
 | ||
|      "name": "stdout",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "s3://hub-2.0-datasets-n/langchain_test loaded successfully.\n"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stderr",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "Evaluating ingest: 100%|██████████| 1/1 [00:10<00:00\n",
 | ||
|       "\\"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stdout",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "Dataset(path='s3://hub-2.0-datasets-n/langchain_test', tensors=['embedding', 'ids', 'metadata', 'text'])\n",
 | ||
|       "\n",
 | ||
|       "  tensor     htype     shape     dtype  compression\n",
 | ||
|       "  -------   -------   -------   -------  ------- \n",
 | ||
|       " embedding  generic  (4, 1536)  float32   None   \n",
 | ||
|       "    ids      text     (4, 1)      str     None   \n",
 | ||
|       " metadata    json     (4, 1)      str     None   \n",
 | ||
|       "   text      text     (4, 1)      str     None   \n"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stderr",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       " \r"
 | ||
|      ]
 | ||
|     }
 | ||
|    ],
 | ||
|    "source": [
 | ||
|     "dataset_path = \"s3://BUCKET/langchain_test\"  # could also be ./local/path (much faster locally), hub://<username>/dataset, gcs://path/to/dataset, etc.\n",
 | ||
|     "\n",
 | ||
|     "embeddings = OpenAIEmbeddings()\n",
 | ||
|     "# Credentials are read from environment variables; AWS_SESSION_TOKEN may be unset.\n",
 | ||
|     "db = DeepLake.from_documents(docs, dataset_path=dataset_path, embedding=embeddings, overwrite=True, creds={\n",
 | ||
|     "   'aws_access_key_id': os.environ['AWS_ACCESS_KEY_ID'],\n",
 | ||
|     "   'aws_secret_access_key': os.environ['AWS_SECRET_ACCESS_KEY'],\n",
 | ||
|     "   'aws_session_token': os.environ.get('AWS_SESSION_TOKEN'),  # Optional: None when the variable is not set\n",
 | ||
|     "})"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "## Deep Lake API\n",
 | ||
|     "You can access the Deep Lake dataset at `db.ds`."
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 66,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [
 | ||
|     {
 | ||
|      "name": "stdout",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "Dataset(path='hub://davitbun/langchain_test', tensors=['embedding', 'ids', 'metadata', 'text'])\n",
 | ||
|       "\n",
 | ||
|       "  tensor     htype     shape     dtype  compression\n",
 | ||
|       "  -------   -------   -------   -------  ------- \n",
 | ||
|       " embedding  generic  (4, 1536)  float32   None   \n",
 | ||
|       "    ids      text     (4, 1)      str     None   \n",
 | ||
|       " metadata    json     (4, 1)      str     None   \n",
 | ||
|       "   text      text     (4, 1)      str     None   \n"
 | ||
|      ]
 | ||
|     }
 | ||
|    ],
 | ||
|    "source": [
 | ||
|     "# get structure of the dataset\n",
 | ||
|     "db.ds.summary()"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 67,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [],
 | ||
|    "source": [
 | ||
|     "# get embeddings numpy array\n",
 | ||
|     "embeds = db.ds.embedding.numpy()"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "markdown",
 | ||
|    "metadata": {},
 | ||
|    "source": [
 | ||
|     "### Transfer local dataset to cloud\n",
 | ||
|     "Copy an already created dataset to the cloud. You can also transfer a dataset from the cloud to local storage."
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 73,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [
 | ||
|     {
 | ||
|      "name": "stderr",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "Copying dataset: 100%|██████████| 56/56 [00:38<00:00\n"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stdout",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/davitbun/langchain_test_copy\n",
 | ||
|       "Your Deep Lake dataset has been successfully created!\n",
 | ||
|       "The dataset is private so make sure you are logged in!\n"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "data": {
 | ||
|       "text/plain": [
 | ||
|        "Dataset(path='hub://davitbun/langchain_test_copy', tensors=['embedding', 'ids', 'metadata', 'text'])"
 | ||
|       ]
 | ||
|      },
 | ||
|      "execution_count": 73,
 | ||
|      "metadata": {},
 | ||
|      "output_type": "execute_result"
 | ||
|     }
 | ||
|    ],
 | ||
|    "source": [
 | ||
|     "import deeplake\n",
 | ||
|     "username = \"davitbun\" # your username on app.activeloop.ai   \n",
 | ||
|     "source = f\"hub://{username}/langchain_test\" # could be local, s3, gcs, etc.\n",
 | ||
|     "destination = f\"hub://{username}/langchain_test_copy\" # could be local, s3, gcs, etc.\n",
 | ||
|     "\n",
 | ||
|     "deeplake.deepcopy(src=source, dest=destination, overwrite=True)\n"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": 76,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [
 | ||
|     {
 | ||
|      "name": "stderr",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       " \r"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stdout",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/davitbun/langchain_test_copy\n",
 | ||
|       "\n"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stderr",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "/"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stdout",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "hub://davitbun/langchain_test_copy loaded successfully.\n",
 | ||
|       "\n"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stderr",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "Deep Lake Dataset in hub://davitbun/langchain_test_copy already exists, loading from the storage\n"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stdout",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "Dataset(path='hub://davitbun/langchain_test_copy', tensors=['embedding', 'ids', 'metadata', 'text'])\n",
 | ||
|       "\n",
 | ||
|       "  tensor     htype     shape     dtype  compression\n",
 | ||
|       "  -------   -------   -------   -------  ------- \n",
 | ||
|       " embedding  generic  (4, 1536)  float32   None   \n",
 | ||
|       "    ids      text     (4, 1)      str     None   \n",
 | ||
|       " metadata    json     (4, 1)      str     None   \n",
 | ||
|       "   text      text     (4, 1)      str     None   \n"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stderr",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "Evaluating ingest: 100%|██████████| 1/1 [00:31<00:00\n",
 | ||
|       "-"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stdout",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       "Dataset(path='hub://davitbun/langchain_test_copy', tensors=['embedding', 'ids', 'metadata', 'text'])\n",
 | ||
|       "\n",
 | ||
|       "  tensor     htype     shape     dtype  compression\n",
 | ||
|       "  -------   -------   -------   -------  ------- \n",
 | ||
|       " embedding  generic  (8, 1536)  float32   None   \n",
 | ||
|       "    ids      text     (8, 1)      str     None   \n",
 | ||
|       " metadata    json     (8, 1)      str     None   \n",
 | ||
|       "   text      text     (8, 1)      str     None   \n"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "name": "stderr",
 | ||
|      "output_type": "stream",
 | ||
|      "text": [
 | ||
|       " \r"
 | ||
|      ]
 | ||
|     },
 | ||
|     {
 | ||
|      "data": {
 | ||
|       "text/plain": [
 | ||
|        "['ad42f3fe-e188-11ed-b66d-41c5f7b85421',\n",
 | ||
|        " 'ad42f3ff-e188-11ed-b66d-41c5f7b85421',\n",
 | ||
|        " 'ad42f400-e188-11ed-b66d-41c5f7b85421',\n",
 | ||
|        " 'ad42f401-e188-11ed-b66d-41c5f7b85421']"
 | ||
|       ]
 | ||
|      },
 | ||
|      "execution_count": 76,
 | ||
|      "metadata": {},
 | ||
|      "output_type": "execute_result"
 | ||
|     }
 | ||
|    ],
 | ||
|    "source": [
 | ||
|     "db = DeepLake(dataset_path=destination, embedding_function=embeddings)\n",
 | ||
|     "db.add_documents(docs)"
 | ||
|    ]
 | ||
|   },
 | ||
|   {
 | ||
|    "cell_type": "code",
 | ||
|    "execution_count": null,
 | ||
|    "metadata": {},
 | ||
|    "outputs": [],
 | ||
|    "source": []
 | ||
|   }
 | ||
|  ],
 | ||
|  "metadata": {
 | ||
|   "kernelspec": {
 | ||
|    "display_name": "Python 3 (ipykernel)",
 | ||
|    "language": "python",
 | ||
|    "name": "python3"
 | ||
|   },
 | ||
|   "language_info": {
 | ||
|    "codemirror_mode": {
 | ||
|     "name": "ipython",
 | ||
|     "version": 3
 | ||
|    },
 | ||
|    "file_extension": ".py",
 | ||
|    "mimetype": "text/x-python",
 | ||
|    "name": "python",
 | ||
|    "nbconvert_exporter": "python",
 | ||
|    "pygments_lexer": "ipython3",
 | ||
|    "version": "3.10.6"
 | ||
|   },
 | ||
|   "vscode": {
 | ||
|    "interpreter": {
 | ||
|     "hash": "7b14174bb6f9d4680b62ac2a6390e1ce94fbfabf172a10844870451d539c58d6"
 | ||
|    }
 | ||
|   }
 | ||
|  },
 | ||
|  "nbformat": 4,
 | ||
|  "nbformat_minor": 4
 | ||
| }
 |