Mirror of https://github.com/hwchase17/langchain.git, synced 2025-09-16 15:04:13 +00:00
Format Templates (#12396)
@@ -1,33 +1,36 @@
 # Load
 import uuid

 from langchain.chat_models import ChatOpenAI
-from langchain.prompts import ChatPromptTemplate
-from langchain.schema.output_parser import StrOutputParser
-from langchain.vectorstores import Chroma
-from langchain.storage import InMemoryStore
-from unstructured.partition.pdf import partition_pdf
-from langchain.schema.document import Document
 from langchain.embeddings import OpenAIEmbeddings
+from langchain.prompts import ChatPromptTemplate
+from langchain.retrievers.multi_vector import MultiVectorRetriever
+from langchain.schema.document import Document
+from langchain.schema.output_parser import StrOutputParser
+from langchain.schema.runnable import RunnablePassthrough
+from langchain.storage import InMemoryStore
+from langchain.vectorstores import Chroma
+from unstructured.partition.pdf import partition_pdf

 # Path to docs
 path = "docs"
-raw_pdf_elements = partition_pdf(filename=path+"LLaMA2.pdf",
-                                 # Unstructured first finds embedded image blocks
-                                 extract_images_in_pdf=False,
-                                 # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
-                                 # Titles are any sub-section of the document
-                                 infer_table_structure=True,
-                                 # Post processing to aggregate text once we have the title
-                                 chunking_strategy="by_title",
-                                 # Chunking params to aggregate text blocks
-                                 # Attempt to create a new chunk 3800 chars
-                                 # Attempt to keep chunks > 2000 chars
-                                 max_characters=4000,
-                                 new_after_n_chars=3800,
-                                 combine_text_under_n_chars=2000,
-                                 image_output_dir_path=path)
+raw_pdf_elements = partition_pdf(
+    filename=path + "LLaMA2.pdf",
+    # Unstructured first finds embedded image blocks
+    extract_images_in_pdf=False,
+    # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
+    # Titles are any sub-section of the document
+    infer_table_structure=True,
+    # Post processing to aggregate text once we have the title
+    chunking_strategy="by_title",
+    # Chunking params to aggregate text blocks
+    # Attempt to create a new chunk 3800 chars
+    # Attempt to keep chunks > 2000 chars
+    max_characters=4000,
+    new_after_n_chars=3800,
+    combine_text_under_n_chars=2000,
+    image_output_dir_path=path,
+)

 # Categorize by type
 tables = []
@@ -40,26 +43,23 @@ for element in raw_pdf_elements:

 # Summarize

-prompt_text="""You are an assistant tasked with summarizing tables and text. \
+prompt_text = """You are an assistant tasked with summarizing tables and text. \
 Give a concise summary of the table or text. Table or text chunk: {element} """
-prompt = ChatPromptTemplate.from_template(prompt_text)
-model = ChatOpenAI(temperature=0,model="gpt-4")
-summarize_chain = {"element": lambda x:x} | prompt | model | StrOutputParser()
+prompt = ChatPromptTemplate.from_template(prompt_text)
+model = ChatOpenAI(temperature=0, model="gpt-4")
+summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

 # Apply
 table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})
 # To save time / cost, only do text summaries if chunk sizes are large
 # text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})
-# We can just assign text_summaries to the raw texts
+# We can just assign text_summaries to the raw texts
 text_summaries = texts

 # Use multi vector retriever

 # The vectorstore to use to index the child chunks
-vectorstore = Chroma(
-    collection_name="summaries",
-    embedding_function=OpenAIEmbeddings()
-)
+vectorstore = Chroma(collection_name="summaries", embedding_function=OpenAIEmbeddings())

 # The storage layer for the parent documents
 store = InMemoryStore()
@@ -67,20 +67,26 @@ id_key = "doc_id"

 # The retriever (empty to start)
 retriever = MultiVectorRetriever(
-    vectorstore=vectorstore,
-    docstore=store,
+    vectorstore=vectorstore,
+    docstore=store,
     id_key=id_key,
 )

 # Add texts
 doc_ids = [str(uuid.uuid4()) for _ in texts]
-summary_texts = [Document(page_content=s,metadata={id_key: doc_ids[i]}) for i, s in enumerate(text_summaries)]
+summary_texts = [
+    Document(page_content=s, metadata={id_key: doc_ids[i]})
+    for i, s in enumerate(text_summaries)
+]
 retriever.vectorstore.add_documents(summary_texts)
 retriever.docstore.mset(list(zip(doc_ids, texts)))

 # Add tables
 table_ids = [str(uuid.uuid4()) for _ in tables]
-summary_tables = [Document(page_content=s,metadata={id_key: table_ids[i]}) for i, s in enumerate(table_summaries)]
+summary_tables = [
+    Document(page_content=s, metadata={id_key: table_ids[i]})
+    for i, s in enumerate(table_summaries)
+]
 retriever.vectorstore.add_documents(summary_tables)
 retriever.docstore.mset(list(zip(table_ids, tables)))
@@ -90,16 +96,16 @@ retriever.docstore.mset(list(zip(table_ids, tables)))
 template = """Answer the question based only on the following context, which can include text and tables:
 {context}
 Question: {question}
-"""
+"""  # noqa: E501
 prompt = ChatPromptTemplate.from_template(template)

 # LLM
-model = ChatOpenAI(temperature=0,model="gpt-4")
+model = ChatOpenAI(temperature=0, model="gpt-4")

 # RAG pipeline
 chain = (
-    {"context": retriever, "question": RunnablePassthrough()}
-    | prompt
-    | model
+    {"context": retriever, "question": RunnablePassthrough()}
+    | prompt
+    | model
     | StrOutputParser()
-)
+)
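
The template this commit reformats wires the pieces above into a multi-vector RAG chain over the LLaMA2 paper: summaries are embedded for retrieval, while the raw text and table chunks are returned as context. A minimal usage sketch of the finished pipeline, assuming the script above has run and OPENAI_API_KEY is set (the question string is only an illustration):

# Retrieve the raw parent chunks whose summaries matched the query
docs = retriever.get_relevant_documents("How was LLaMA-2 evaluated?")
# Run the full chain: retrieve context, fill the prompt, answer with GPT-4
print(chain.invoke("How was LLaMA-2 evaluated?"))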