Deep Lake mini upgrades (#3375)

Improvements
* set default num_workers for ingestion to 0
* upgraded notebooks for avoiding dataset creation ambiguity
* added `force_delete_dataset_by_path`
* bumped deeplake to 3.3.0
* creds arg passing to deeplake object that would allow custom S3

Notes
* please double check if poetry is not messed up (thanks!)

Asks
* Would be great to create a shared slack channel for quick questions

---------

Co-authored-by: Davit Buniatyan <d@activeloop.ai>
This commit is contained in:
Davit Buniatyan
2023-04-23 21:23:54 -07:00
committed by GitHub
parent 93d53e417a
commit 2c0023393b
7 changed files with 489 additions and 178 deletions

View File

@@ -40,8 +40,24 @@
"from langchain.vectorstores import DeepLake\n",
"\n",
"os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')\n",
"os.environ['ACTIVELOOP_TOKEN'] = getpass.getpass('Activeloop Token:')\n",
"embeddings = OpenAIEmbeddings()"
"os.environ['ACTIVELOOP_TOKEN'] = getpass.getpass('Activeloop Token:')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"embeddings = OpenAIEmbeddings(disallowed_special=())"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"disallowed_special=() is required to avoid `Exception: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte` from tiktoken for some repositories"
]
},
{
@@ -120,7 +136,9 @@
"metadata": {},
"outputs": [],
"source": [
"db = DeepLake.from_documents(texts, embeddings, dataset_path=\"hub://davitbun/twitter-algorithm\")"
"username = \"davitbun\" # replace with your username from app.activeloop.ai\n",
"db = DeepLake(dataset_path=f\"hub://{username}/twitter-algorithm\", embedding_function=embeddings, public=True) #dataset would be publicly available\n",
"db.add_documents(texts)"
]
},
{
@@ -133,61 +151,9 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"-"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/davitbun/twitter-algorithm\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"-"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"hub://davitbun/twitter-algorithm loaded successfully.\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Deep Lake Dataset in hub://davitbun/twitter-algorithm already exists, loading from the storage\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataset(path='hub://davitbun/twitter-algorithm', read_only=True, tensors=['embedding', 'ids', 'metadata', 'text'])\n",
"\n",
" tensor htype shape dtype compression\n",
" ------- ------- ------- ------- ------- \n",
" embedding generic (23152, 1536) float32 None \n",
" ids text (23152, 1) str None \n",
" metadata json (23152, 1) str None \n",
" text text (23152, 1) str None \n"
]
}
],
"outputs": [],
"source": [
"db = DeepLake(dataset_path=\"hub://davitbun/twitter-algorithm\", read_only=True, embedding_function=embeddings)"
]
@@ -203,7 +169,7 @@
"retriever.search_kwargs['distance_metric'] = 'cos'\n",
"retriever.search_kwargs['fetch_k'] = 100\n",
"retriever.search_kwargs['maximal_marginal_relevance'] = True\n",
"retriever.search_kwargs['k'] = 20"
"retriever.search_kwargs['k'] = 10"
]
},
{
@@ -241,7 +207,7 @@
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.chains import ConversationalRetrievalChain\n",
"\n",
"model = ChatOpenAI(model='gpt-4') # 'gpt-3.5-turbo',\n",
"model = ChatOpenAI(model='gpt-3.5-turbo') # switch to 'gpt-4'\n",
"qa = ConversationalRetrievalChain.from_llm(model,retriever=retriever)"
]
},

View File

@@ -108,7 +108,7 @@
"\n",
"dataset_path = 'hub://'+org+'/data'\n",
"embeddings = OpenAIEmbeddings()\n",
"db = DeepLake.from_documents(texts, embeddings, dataset_path=dataset_path)"
"db = DeepLake.from_documents(texts, embeddings, dataset_path=dataset_path, overwrite=True)"
]
},
{