mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-05 21:12:48 +00:00
Deep Lake mini upgrades (#3375)
Improvements
* set default num_workers for ingestion to 0
* upgraded notebooks to avoid dataset creation ambiguity
* added `force_delete_dataset_by_path`
* bumped deeplake to 3.3.0
* pass the creds arg through to the deeplake object, which allows custom S3

Notes
* please double-check that poetry is not messed up (thanks!)

Asks
* It would be great to create a shared Slack channel for quick questions

---------

Co-authored-by: Davit Buniatyan <d@activeloop.ai>
This commit is contained in:
@@ -40,8 +40,24 @@
|
||||
"from langchain.vectorstores import DeepLake\n",
|
||||
"\n",
|
||||
"os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')\n",
|
||||
"os.environ['ACTIVELOOP_TOKEN'] = getpass.getpass('Activeloop Token:')\n",
|
||||
"embeddings = OpenAIEmbeddings()"
|
||||
"os.environ['ACTIVELOOP_TOKEN'] = getpass.getpass('Activeloop Token:')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"embeddings = OpenAIEmbeddings(disallowed_special=())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"disallowed_special=() is required to avoid `Exception: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte` from tiktoken for some repositories"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -120,7 +136,9 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"db = DeepLake.from_documents(texts, embeddings, dataset_path=\"hub://davitbun/twitter-algorithm\")"
|
||||
"username = \"davitbun\" # replace with your username from app.activeloop.ai\n",
|
||||
"db = DeepLake(dataset_path=f\"hub://{username}/twitter-algorithm\", embedding_function=embeddings, public=True) #dataset would be publicly available\n",
|
||||
"db.add_documents(texts)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -133,61 +151,9 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"-"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/davitbun/twitter-algorithm\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"-"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"hub://davitbun/twitter-algorithm loaded successfully.\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Deep Lake Dataset in hub://davitbun/twitter-algorithm already exists, loading from the storage\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Dataset(path='hub://davitbun/twitter-algorithm', read_only=True, tensors=['embedding', 'ids', 'metadata', 'text'])\n",
|
||||
"\n",
|
||||
" tensor htype shape dtype compression\n",
|
||||
" ------- ------- ------- ------- ------- \n",
|
||||
" embedding generic (23152, 1536) float32 None \n",
|
||||
" ids text (23152, 1) str None \n",
|
||||
" metadata json (23152, 1) str None \n",
|
||||
" text text (23152, 1) str None \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"db = DeepLake(dataset_path=\"hub://davitbun/twitter-algorithm\", read_only=True, embedding_function=embeddings)"
|
||||
]
|
||||
@@ -203,7 +169,7 @@
|
||||
"retriever.search_kwargs['distance_metric'] = 'cos'\n",
|
||||
"retriever.search_kwargs['fetch_k'] = 100\n",
|
||||
"retriever.search_kwargs['maximal_marginal_relevance'] = True\n",
|
||||
"retriever.search_kwargs['k'] = 20"
|
||||
"retriever.search_kwargs['k'] = 10"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -241,7 +207,7 @@
|
||||
"from langchain.chat_models import ChatOpenAI\n",
|
||||
"from langchain.chains import ConversationalRetrievalChain\n",
|
||||
"\n",
|
||||
"model = ChatOpenAI(model='gpt-4') # 'gpt-3.5-turbo',\n",
|
||||
"model = ChatOpenAI(model='gpt-3.5-turbo') # switch to 'gpt-4'\n",
|
||||
"qa = ConversationalRetrievalChain.from_llm(model,retriever=retriever)"
|
||||
]
|
||||
},
|
||||
|
@@ -108,7 +108,7 @@
|
||||
"\n",
|
||||
"dataset_path = 'hub://'+org+'/data'\n",
|
||||
"embeddings = OpenAIEmbeddings()\n",
|
||||
"db = DeepLake.from_documents(texts, embeddings, dataset_path=dataset_path)"
|
||||
"db = DeepLake.from_documents(texts, embeddings, dataset_path=dataset_path, overwrite=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
Reference in New Issue
Block a user