notebook fmt (#12498)

This commit is contained in:
Bagatur
2023-10-29 15:50:09 -07:00
committed by GitHub
parent 56cc5b847c
commit 2424fff3f1
342 changed files with 8261 additions and 6796 deletions

View File

@@ -60,7 +60,7 @@
"metadata": {},
"outputs": [],
"source": [
"! brew install tesseract \n",
"! brew install tesseract\n",
"! brew install poppler"
]
},
@@ -108,21 +108,23 @@
"from unstructured.partition.pdf import partition_pdf\n",
"\n",
"# Get elements\n",
"raw_pdf_elements = partition_pdf(filename=path+\"LLaMA2.pdf\",\n",
" # Unstructured first finds embedded image blocks\n",
" extract_images_in_pdf=False,\n",
" # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles\n",
" # Titles are any sub-section of the document \n",
" infer_table_structure=True, \n",
" # Post processing to aggregate text once we have the title \n",
" chunking_strategy=\"by_title\",\n",
" # Chunking params to aggregate text blocks\n",
" # Attempt to create a new chunk 3800 chars\n",
" # Attempt to keep chunks > 2000 chars \n",
" max_characters=4000, \n",
" new_after_n_chars=3800, \n",
" combine_text_under_n_chars=2000,\n",
" image_output_dir_path=path)"
"raw_pdf_elements = partition_pdf(\n",
" filename=path + \"LLaMA2.pdf\",\n",
" # Unstructured first finds embedded image blocks\n",
" extract_images_in_pdf=False,\n",
" # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles\n",
" # Titles are any sub-section of the document\n",
" infer_table_structure=True,\n",
" # Post processing to aggregate text once we have the title\n",
" chunking_strategy=\"by_title\",\n",
" # Chunking params to aggregate text blocks\n",
" # Attempt to create a new chunk 3800 chars\n",
" # Attempt to keep chunks > 2000 chars\n",
" max_characters=4000,\n",
" new_after_n_chars=3800,\n",
" combine_text_under_n_chars=2000,\n",
" image_output_dir_path=path,\n",
")"
]
},
{
@@ -190,6 +192,7 @@
" type: str\n",
" text: Any\n",
"\n",
"\n",
"# Categorize by type\n",
"categorized_elements = []\n",
"for element in raw_pdf_elements:\n",
@@ -259,14 +262,14 @@
"metadata": {},
"outputs": [],
"source": [
"# Prompt \n",
"prompt_text=\"\"\"You are an assistant tasked with summarizing tables and text. \\ \n",
"# Prompt\n",
"prompt_text = \"\"\"You are an assistant tasked with summarizing tables and text. \\ \n",
"Give a concise summary of the table or text. Table or text chunk: {element} \"\"\"\n",
"prompt = ChatPromptTemplate.from_template(prompt_text) \n",
"prompt = ChatPromptTemplate.from_template(prompt_text)\n",
"\n",
"# Summary chain \n",
"model = ChatOpenAI(temperature=0,model=\"gpt-4\")\n",
"summarize_chain = {\"element\": lambda x:x} | prompt | model | StrOutputParser()"
"# Summary chain\n",
"model = ChatOpenAI(temperature=0, model=\"gpt-4\")\n",
"summarize_chain = {\"element\": lambda x: x} | prompt | model | StrOutputParser()"
]
},
{
@@ -321,10 +324,7 @@
"from langchain.retrievers.multi_vector import MultiVectorRetriever\n",
"\n",
"# The vectorstore to use to index the child chunks\n",
"vectorstore = Chroma(\n",
" collection_name=\"summaries\",\n",
" embedding_function=OpenAIEmbeddings()\n",
")\n",
"vectorstore = Chroma(collection_name=\"summaries\", embedding_function=OpenAIEmbeddings())\n",
"\n",
"# The storage layer for the parent documents\n",
"store = InMemoryStore()\n",
@@ -332,20 +332,26 @@
"\n",
"# The retriever (empty to start)\n",
"retriever = MultiVectorRetriever(\n",
" vectorstore=vectorstore, \n",
" docstore=store, \n",
" vectorstore=vectorstore,\n",
" docstore=store,\n",
" id_key=id_key,\n",
")\n",
"\n",
"# Add texts\n",
"doc_ids = [str(uuid.uuid4()) for _ in texts]\n",
"summary_texts = [Document(page_content=s,metadata={id_key: doc_ids[i]}) for i, s in enumerate(text_summaries)]\n",
"summary_texts = [\n",
" Document(page_content=s, metadata={id_key: doc_ids[i]})\n",
" for i, s in enumerate(text_summaries)\n",
"]\n",
"retriever.vectorstore.add_documents(summary_texts)\n",
"retriever.docstore.mset(list(zip(doc_ids, texts)))\n",
"\n",
"# Add tables\n",
"table_ids = [str(uuid.uuid4()) for _ in tables]\n",
"summary_tables = [Document(page_content=s,metadata={id_key: table_ids[i]}) for i, s in enumerate(table_summaries)]\n",
"summary_tables = [\n",
" Document(page_content=s, metadata={id_key: table_ids[i]})\n",
" for i, s in enumerate(table_summaries)\n",
"]\n",
"retriever.vectorstore.add_documents(summary_tables)\n",
"retriever.docstore.mset(list(zip(table_ids, tables)))"
]
@@ -378,13 +384,13 @@
"prompt = ChatPromptTemplate.from_template(template)\n",
"\n",
"# LLM\n",
"model = ChatOpenAI(temperature=0,model=\"gpt-4\")\n",
"model = ChatOpenAI(temperature=0, model=\"gpt-4\")\n",
"\n",
"# RAG pipeline\n",
"chain = (\n",
" {\"context\": retriever, \"question\": RunnablePassthrough()} \n",
" | prompt \n",
" | model \n",
" {\"context\": retriever, \"question\": RunnablePassthrough()}\n",
" | prompt\n",
" | model\n",
" | StrOutputParser()\n",
")"
]