Deep Lake mini upgrades (#3375)

Improvements
* set default num_workers for ingestion to 0
* upgraded notebooks for avoiding dataset creation ambiguity
* added `force_delete_dataset_by_path`
* bumped deeplake to 3.3.0
* creds arg passing to deeplake object that would allow custom S3

Notes
* please double check if poetry is not messed up (thanks!)

Asks
* Would be great to create a shared slack channel for quick questions

---------

Co-authored-by: Davit Buniatyan <d@activeloop.ai>
2025-09-05 21:12:48 +00:00 · 2023-04-23 21:23:54 -07:00
parent 93d53e417a
commit 2c0023393b
7 changed files with 489 additions and 178 deletions
--- a/docs/use_cases/code/twitter-the-algorithm-analysis-deeplake.ipynb
+++ b/docs/use_cases/code/twitter-the-algorithm-analysis-deeplake.ipynb
@@ -40,8 +40,24 @@
    "from langchain.vectorstores import DeepLake\n",
    "\n",
    "os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')\n",
-    "os.environ['ACTIVELOOP_TOKEN'] = getpass.getpass('Activeloop Token:')\n",
-    "embeddings = OpenAIEmbeddings()"
+    "os.environ['ACTIVELOOP_TOKEN'] = getpass.getpass('Activeloop Token:')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "embeddings = OpenAIEmbeddings(disallowed_special=())"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "`disallowed_special=()` is required to avoid `Exception: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte` from tiktoken for some repositories."
   ]
  },
  {
@@ -120,7 +136,9 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "db = DeepLake.from_documents(texts, embeddings, dataset_path=\"hub://davitbun/twitter-algorithm\")"
+    "username = \"davitbun\" # replace with your username from app.activeloop.ai\n",
+    "db = DeepLake(dataset_path=f\"hub://{username}/twitter-algorithm\", embedding_function=embeddings, public=True) #dataset would be publicly available\n",
+    "db.add_documents(texts)"
   ]
  },
  {
@@ -133,61 +151,9 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "-"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/davitbun/twitter-algorithm\n",
-      "\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "-"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "hub://davitbun/twitter-algorithm loaded successfully.\n",
-      "\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Deep Lake Dataset in hub://davitbun/twitter-algorithm already exists, loading from the storage\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Dataset(path='hub://davitbun/twitter-algorithm', read_only=True, tensors=['embedding', 'ids', 'metadata', 'text'])\n",
-      "\n",
-      "  tensor     htype       shape       dtype  compression\n",
-      "  -------   -------     -------     -------  ------- \n",
-      " embedding  generic  (23152, 1536)  float32   None   \n",
-      "    ids      text     (23152, 1)      str     None   \n",
-      " metadata    json     (23152, 1)      str     None   \n",
-      "   text      text     (23152, 1)      str     None   \n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "db = DeepLake(dataset_path=\"hub://davitbun/twitter-algorithm\", read_only=True, embedding_function=embeddings)"
   ]
@@ -203,7 +169,7 @@
    "retriever.search_kwargs['distance_metric'] = 'cos'\n",
    "retriever.search_kwargs['fetch_k'] = 100\n",
    "retriever.search_kwargs['maximal_marginal_relevance'] = True\n",
-    "retriever.search_kwargs['k'] = 20"
+    "retriever.search_kwargs['k'] = 10"
   ]
  },
  {
@@ -241,7 +207,7 @@
    "from langchain.chat_models import ChatOpenAI\n",
    "from langchain.chains import ConversationalRetrievalChain\n",
    "\n",
-    "model = ChatOpenAI(model='gpt-4') # 'gpt-3.5-turbo',\n",
+    "model = ChatOpenAI(model='gpt-3.5-turbo') # switch to 'gpt-4'\n",
    "qa = ConversationalRetrievalChain.from_llm(model,retriever=retriever)"
   ]
  },
--- a/docs/use_cases/question_answering/semantic-search-over-chat.ipynb
+++ b/docs/use_cases/question_answering/semantic-search-over-chat.ipynb
@@ -108,7 +108,7 @@
    "\n",
    "dataset_path = 'hub://'+org+'/data'\n",
    "embeddings = OpenAIEmbeddings()\n",
-    "db = DeepLake.from_documents(texts, embeddings, dataset_path=dataset_path)"
+    "db = DeepLake.from_documents(texts, embeddings, dataset_path=dataset_path, overwrite=True)"
   ]
  },
  {