Format Templates (#12396)

2025-09-16 15:04:13 +00:00 · 2023-10-26 19:44:30 -07:00
parent 25c98dbba9
commit 4b16601d33
59 changed files with 800 additions and 441 deletions
--- a/templates/rag-semi-structured/rag_semi_structured/chain.py
+++ b/templates/rag-semi-structured/rag_semi_structured/chain.py
@@ -1,33 +1,36 @@
 # Load
 import uuid
+
 from langchain.chat_models import ChatOpenAI
-from langchain.prompts import ChatPromptTemplate
-from langchain.schema.output_parser import StrOutputParser
-from langchain.vectorstores import Chroma
-from langchain.storage import InMemoryStore
-from unstructured.partition.pdf import partition_pdf
-from langchain.schema.document import Document
 from langchain.embeddings import OpenAIEmbeddings
+from langchain.prompts import ChatPromptTemplate
 from langchain.retrievers.multi_vector import MultiVectorRetriever
+from langchain.schema.document import Document
+from langchain.schema.output_parser import StrOutputParser
 from langchain.schema.runnable import RunnablePassthrough
+from langchain.storage import InMemoryStore
+from langchain.vectorstores import Chroma
+from unstructured.partition.pdf import partition_pdf

 # Path to docs
 path = "docs"
-raw_pdf_elements = partition_pdf(filename=path+"LLaMA2.pdf",
-                                 # Unstructured first finds embedded image blocks
-                                 extract_images_in_pdf=False,
-                                 # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
-                                 # Titles are any sub-section of the document 
-                                 infer_table_structure=True, 
-                                 # Post processing to aggregate text once we have the title 
-                                 chunking_strategy="by_title",
-                                 # Chunking params to aggregate text blocks
-                                 # Attempt to create a new chunk 3800 chars
-                                 # Attempt to keep chunks > 2000 chars 
-                                 max_characters=4000, 
-                                 new_after_n_chars=3800, 
-                                 combine_text_under_n_chars=2000,
-                                 image_output_dir_path=path)
+raw_pdf_elements = partition_pdf(
+    filename=path + "/LLaMA2.pdf",
+    # Unstructured first finds embedded image blocks
+    extract_images_in_pdf=False,
+    # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
+    # Titles are any sub-section of the document
+    infer_table_structure=True,
+    # Post processing to aggregate text once we have the title
+    chunking_strategy="by_title",
+    # Chunking params to aggregate text blocks
+    # Attempt to start a new chunk once the current one reaches 3800 chars
+    # Attempt to keep chunks > 2000 chars by combining smaller text blocks
+    max_characters=4000,
+    new_after_n_chars=3800,
+    combine_text_under_n_chars=2000,
+    image_output_dir_path=path,
+)

 # Categorize by type
 tables = []
@@ -40,26 +43,23 @@ for element in raw_pdf_elements:

 # Summarize

-prompt_text="""You are an assistant tasked with summarizing tables and text. \ 
+prompt_text = """You are an assistant tasked with summarizing tables and text. \
 Give a concise summary of the table or text. Table or text chunk: {element} """
-prompt = ChatPromptTemplate.from_template(prompt_text) 
-model = ChatOpenAI(temperature=0,model="gpt-4")
-summarize_chain = {"element": lambda x:x} | prompt | model | StrOutputParser()
+prompt = ChatPromptTemplate.from_template(prompt_text)
+model = ChatOpenAI(temperature=0, model="gpt-4")
+summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

 # Apply
 table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})
 # To save time / cost, only do text summaries if chunk sizes are large
 # text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})
-# We can just assign text_summaries to the raw texts 
+# We can just assign text_summaries to the raw texts
 text_summaries = texts

 # Use multi vector retriever

 # The vectorstore to use to index the child chunks
-vectorstore = Chroma(
-    collection_name="summaries",
-    embedding_function=OpenAIEmbeddings()
-)
+vectorstore = Chroma(collection_name="summaries", embedding_function=OpenAIEmbeddings())

 # The storage layer for the parent documents
 store = InMemoryStore()
@@ -67,20 +67,26 @@ id_key = "doc_id"

 # The retriever (empty to start)
 retriever = MultiVectorRetriever(
-    vectorstore=vectorstore, 
-    docstore=store, 
+    vectorstore=vectorstore,
+    docstore=store,
    id_key=id_key,
 )

 # Add texts
 doc_ids = [str(uuid.uuid4()) for _ in texts]
-summary_texts = [Document(page_content=s,metadata={id_key: doc_ids[i]}) for i, s in enumerate(text_summaries)]
+summary_texts = [
+    Document(page_content=summary, metadata={id_key: doc_ids[idx]})
+    for idx, summary in enumerate(text_summaries)
+]
 retriever.vectorstore.add_documents(summary_texts)
 retriever.docstore.mset(list(zip(doc_ids, texts)))

 # Add tables
 table_ids = [str(uuid.uuid4()) for _ in tables]
-summary_tables = [Document(page_content=s,metadata={id_key: table_ids[i]}) for i, s in enumerate(table_summaries)]
+summary_tables = [
+    Document(page_content=summary, metadata={id_key: table_ids[idx]})
+    for idx, summary in enumerate(table_summaries)
+]
 retriever.vectorstore.add_documents(summary_tables)
 retriever.docstore.mset(list(zip(table_ids, tables)))

@@ -90,16 +96,16 @@ retriever.docstore.mset(list(zip(table_ids, tables)))
 template = """Answer the question based only on the following context, which can include text and tables:
 {context}
 Question: {question}
-"""
+"""  # noqa: E501
 prompt = ChatPromptTemplate.from_template(template)

 # LLM
-model = ChatOpenAI(temperature=0,model="gpt-4")
+model = ChatOpenAI(temperature=0, model="gpt-4")

 # RAG pipeline
 chain = (
-    {"context": retriever, "question": RunnablePassthrough()} 
-    | prompt 
-    | model 
+    {"context": retriever, "question": RunnablePassthrough()}
+    | prompt
+    | model
    | StrOutputParser()
-)
+)