notebook fmt (#12498)

2025-09-23 11:30:37 +00:00 · 2023-10-29 15:50:09 -07:00
parent 56cc5b847c
commit 2424fff3f1
342 changed files with 8261 additions and 6796 deletions
--- a/cookbook/Semi_structured_multi_modal_RAG_LLaMA2.ipynb
+++ b/cookbook/Semi_structured_multi_modal_RAG_LLaMA2.ipynb
@@ -92,22 +92,24 @@
    "path = \"/Users/rlm/Desktop/Papers/LLaVA/\"\n",
    "\n",
    "# Get elements\n",
-    "raw_pdf_elements = partition_pdf(filename=path+\"LLaVA.pdf\",\n",
-    "                                 # Using pdf format to find embedded image blocks\n",
-    "                                 extract_images_in_pdf=True,\n",
-    "                                 # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles\n",
-    "                                 # Titles are any sub-section of the document \n",
-    "                                 infer_table_structure=True, \n",
-    "                                 # Post processing to aggregate text once we have the title \n",
-    "                                 chunking_strategy=\"by_title\",\n",
-    "                                 # Chunking params to aggregate text blocks\n",
-    "                                 # Attempt to create a new chunk 3800 chars\n",
-    "                                 # Attempt to keep chunks > 2000 chars \n",
-    "                                 # Hard max on chunks\n",
-    "                                 max_characters=4000, \n",
-    "                                 new_after_n_chars=3800, \n",
-    "                                 combine_text_under_n_chars=2000,\n",
-    "                                 image_output_dir_path=path)"
+    "raw_pdf_elements = partition_pdf(\n",
+    "    filename=path + \"LLaVA.pdf\",\n",
+    "    # Using pdf format to find embedded image blocks\n",
+    "    extract_images_in_pdf=True,\n",
+    "    # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles\n",
+    "    # Titles are any sub-section of the document\n",
+    "    infer_table_structure=True,\n",
+    "    # Post processing to aggregate text once we have the title\n",
+    "    chunking_strategy=\"by_title\",\n",
+    "    # Chunking params to aggregate text blocks\n",
+    "    # Attempt to start a new chunk after 3800 chars (new_after_n_chars)\n",
+    "    # Attempt to keep chunks > 2000 chars (combine_text_under_n_chars)\n",
+    "    # Hard max on chunk size (max_characters)\n",
+    "    max_characters=4000,\n",
+    "    new_after_n_chars=3800,\n",
+    "    combine_text_under_n_chars=2000,\n",
+    "    image_output_dir_path=path,\n",
+    ")"
   ]
  },
  {
@@ -165,6 +167,7 @@
    "    type: str\n",
    "    text: Any\n",
    "\n",
+    "\n",
    "# Categorize by type\n",
    "categorized_elements = []\n",
    "for element in raw_pdf_elements:\n",
@@ -219,14 +222,14 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# Prompt \n",
-    "prompt_text=\"\"\"You are an assistant tasked with summarizing tables and text. \\ \n",
+    "# Prompt\n",
+    "prompt_text = \"\"\"You are an assistant tasked with summarizing tables and text. \\ \n",
    "Give a concise summary of the table or text. Table or text chunk: {element} \"\"\"\n",
-    "prompt = ChatPromptTemplate.from_template(prompt_text) \n",
+    "prompt = ChatPromptTemplate.from_template(prompt_text)\n",
    "\n",
-    "# Summary chain \n",
+    "# Summary chain\n",
    "model = ChatOllama(model=\"llama2:13b-chat\")\n",
-    "summarize_chain = {\"element\": lambda x:x} | prompt | model | StrOutputParser()"
+    "summarize_chain = {\"element\": lambda x: x} | prompt | model | StrOutputParser()"
   ]
  },
  {
@@ -327,11 +330,14 @@
    "# Read each file and store its content in a list\n",
    "img_summaries = []\n",
    "for file_path in file_paths:\n",
-    "    with open(file_path, 'r') as file:\n",
+    "    with open(file_path, \"r\") as file:\n",
    "        img_summaries.append(file.read())\n",
    "\n",
    "# Clean up residual logging\n",
-    "cleaned_img_summary = [s.split(\"clip_model_load: total allocated memory: 201.27 MB\\n\\n\", 1)[1].strip() for s in img_summaries]"
+    "cleaned_img_summary = [\n",
+    "    s.split(\"clip_model_load: total allocated memory: 201.27 MB\\n\\n\", 1)[1].strip()\n",
+    "    for s in img_summaries\n",
+    "]"
   ]
  },
  {
@@ -377,18 +383,17 @@
    "\n",
    "# The vectorstore to use to index the child chunks\n",
    "vectorstore = Chroma(\n",
-    "    collection_name=\"summaries\",\n",
-    "    embedding_function=GPT4AllEmbeddings()\n",
+    "    collection_name=\"summaries\", embedding_function=GPT4AllEmbeddings()\n",
    ")\n",
    "\n",
    "# The storage layer for the parent documents\n",
-    "store = InMemoryStore() # <- Can we extend this to images \n",
+    "store = InMemoryStore()  # <- Can we extend this to images\n",
    "id_key = \"doc_id\"\n",
    "\n",
    "# The retriever (empty to start)\n",
    "retriever = MultiVectorRetriever(\n",
-    "    vectorstore=vectorstore, \n",
-    "    docstore=store, \n",
+    "    vectorstore=vectorstore,\n",
+    "    docstore=store,\n",
    "    id_key=id_key,\n",
    ")"
   ]
@@ -412,21 +417,32 @@
   "source": [
    "# Add texts\n",
    "doc_ids = [str(uuid.uuid4()) for _ in texts]\n",
-    "summary_texts = [Document(page_content=s,metadata={id_key: doc_ids[i]}) for i, s in enumerate(text_summaries)]\n",
+    "summary_texts = [\n",
+    "    Document(page_content=s, metadata={id_key: doc_ids[i]})\n",
+    "    for i, s in enumerate(text_summaries)\n",
+    "]\n",
    "retriever.vectorstore.add_documents(summary_texts)\n",
    "retriever.docstore.mset(list(zip(doc_ids, texts)))\n",
    "\n",
    "# Add tables\n",
    "table_ids = [str(uuid.uuid4()) for _ in tables]\n",
-    "summary_tables = [Document(page_content=s,metadata={id_key: table_ids[i]}) for i, s in enumerate(table_summaries)]\n",
+    "summary_tables = [\n",
+    "    Document(page_content=s, metadata={id_key: table_ids[i]})\n",
+    "    for i, s in enumerate(table_summaries)\n",
+    "]\n",
    "retriever.vectorstore.add_documents(summary_tables)\n",
    "retriever.docstore.mset(list(zip(table_ids, tables)))\n",
    "\n",
    "# Add images\n",
    "img_ids = [str(uuid.uuid4()) for _ in cleaned_img_summary]\n",
-    "summary_img = [Document(page_content=s,metadata={id_key: img_ids[i]}) for i, s in enumerate(cleaned_img_summary)]\n",
+    "summary_img = [\n",
+    "    Document(page_content=s, metadata={id_key: img_ids[i]})\n",
+    "    for i, s in enumerate(cleaned_img_summary)\n",
+    "]\n",
    "retriever.vectorstore.add_documents(summary_img)\n",
-    "retriever.docstore.mset(list(zip(img_ids, cleaned_img_summary))) # Store the image summary as the raw document"
+    "retriever.docstore.mset(\n",
+    "    list(zip(img_ids, cleaned_img_summary))\n",
+    ")  # Store the image summary as the raw document"
   ]
  },
  {
@@ -484,7 +500,9 @@
    }
   ],
   "source": [
-    "retriever.get_relevant_documents(\"Images / figures with playful and creative examples\")[0]"
+    "retriever.get_relevant_documents(\"Images / figures with playful and creative examples\")[\n",
+    "    0\n",
+    "]"
   ]
  },
  {
@@ -530,9 +548,9 @@
    "\n",
    "# RAG pipeline\n",
    "chain = (\n",
-    "    {\"context\": retriever, \"question\": RunnablePassthrough()} \n",
-    "    | prompt \n",
-    "    | model \n",
+    "    {\"context\": retriever, \"question\": RunnablePassthrough()}\n",
+    "    | prompt\n",
+    "    | model\n",
    "    | StrOutputParser()\n",
    ")"
   ]
@@ -555,7 +573,9 @@
    }
   ],
   "source": [
-    "chain.invoke(\"What is the performance of LLaVa across across multiple image domains / subjects?\")"
+    "chain.invoke(\n",
+    "    \"What is the performance of LLaVa across multiple image domains / subjects?\"\n",
+    ")"
   ]
  },
  {
@@ -584,7 +604,9 @@
    }
   ],
   "source": [
-    "chain.invoke(\"Explain any images / figures in the paper with playful and creative examples.\")"
+    "chain.invoke(\n",
+    "    \"Explain any images / figures in the paper with playful and creative examples.\"\n",
+    ")"
   ]
  },
  {