Compare commits

...

2 Commits

Author        SHA1        Message                        Date
Lance Martin  2dac8d0f12  RAG from scratch video series  2024-01-30 11:54:43 -08:00
Lance Martin  3b9e1bfdc6  Ntbk for RAG video series      2024-01-29 16:50:52 -08:00


@@ -0,0 +1,464 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "41ce62a8-251f-4f9e-b375-e93a5861c3fe",
"metadata": {},
"source": [
"## Enviornment\n",
"\n",
"`(1) Packages`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e4bef162-17e2-4a27-9a97-1f63dcf9726a",
"metadata": {},
"outputs": [],
"source": [
"! pip[ install langchain_community tiktoken langchain-openai langchainhub chromadb"
]
},
{
"cell_type": "markdown",
"id": "75a8ab66-8477-429f-bbbe-ba439322d085",
"metadata": {},
"source": [
"`(2) LangSmith`\n",
"\n",
"https://docs.smith.langchain.com/"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b76f68a8-4745-4377-8057-6090b87377d1",
"metadata": {},
"outputs": [],
"source": [
"os.environ['LANGCHAIN_TRACING_V2'] = 'true'\n",
"os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'\n",
"os.environ['LANGCHAIN_API_KEY'] = <your-api-key>"
]
},
{
"cell_type": "markdown",
"id": "1eae0ab7-d43b-43e0-8b99-6122a636fe0c",
"metadata": {},
"source": [
"## Part 1: Overview\n",
" \n",
"[RAG quickstart](https://python.langchain.com/docs/use_cases/question_answering/quickstart)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "98070313-0c2f-4ba6-ae3e-79e2418ce4df",
"metadata": {},
"outputs": [],
"source": [
"import bs4\n",
"from langchain import hub\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"from langchain_community.document_loaders import WebBaseLoader\n",
"from langchain_community.vectorstores import Chroma\n",
"from langchain_core.output_parsers import StrOutputParser\n",
"from langchain_core.runnables import RunnablePassthrough\n",
"from langchain_openai import ChatOpenAI, OpenAIEmbeddings\n",
"\n",
"#### INDEXING ####\n",
"\n",
"# Load Documents\n",
"loader = WebBaseLoader(\n",
" web_paths=(\"https://lilianweng.github.io/posts/2023-06-23-agent/\",),\n",
" bs_kwargs=dict(\n",
" parse_only=bs4.SoupStrainer(\n",
" class_=(\"post-content\", \"post-title\", \"post-header\")\n",
" )\n",
" ),\n",
")\n",
"docs = loader.load()\n",
"\n",
"# Split\n",
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
"splits = text_splitter.split_documents(docs)\n",
"\n",
"# Embed\n",
"vectorstore = Chroma.from_documents(documents=splits, \n",
" embedding=OpenAIEmbeddings())\n",
"\n",
"retriever = vectorstore.as_retriever()\n",
"\n",
"#### RETRIEVAL and GENERATION ####\n",
"\n",
"# Prompt\n",
"prompt = hub.pull(\"rlm/rag-prompt\")\n",
"\n",
"# LLM\n",
"llm = ChatOpenAI(model_name=\"gpt-3.5-turbo\", temperature=0)\n",
"\n",
"# Post-processing\n",
"def format_docs(docs):\n",
" return \"\\n\\n\".join(doc.page_content for doc in docs)\n",
"\n",
"# Chain\n",
"rag_chain = (\n",
" {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n",
" | prompt\n",
" | llm\n",
" | StrOutputParser()\n",
")\n",
"\n",
"# Question\n",
"rag_chain.invoke(\"What is Task Decomposition?\")"
]
},
{
"cell_type": "markdown",
"id": "18e8e856-bafd-469e-b99a-11596b18aad4",
"metadata": {},
"source": [
"## Part 2: Indexing"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "edd7beeb-21fa-4f4b-b8fa-5a4f70489a16",
"metadata": {},
"outputs": [],
"source": [
"# Documents\n",
"question = \"What kinds of pets do I like?\"\n",
"document = \"My favorite pet is a cat.\""
]
},
{
"cell_type": "markdown",
"id": "e0552ea4-935d-4dfa-bd2b-56d148e96304",
"metadata": {},
"source": [
"[Count tokens](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) considering [~4 char / token](https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "df119cca-1676-4caa-bad4-11805d69e616",
"metadata": {},
"outputs": [],
"source": [
"import tiktoken\n",
"\n",
"def num_tokens_from_string(string: str, encoding_name: str) -> int:\n",
" \"\"\"Returns the number of tokens in a text string.\"\"\"\n",
" encoding = tiktoken.get_encoding(encoding_name)\n",
" num_tokens = len(encoding.encode(string))\n",
" return num_tokens\n",
"\n",
"num_tokens_from_string(question, \"cl100k_base\")"
]
},
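{
"cell_type": "markdown",
"id": "b2e7c4a1-3f5d-4e8a-9c1b-7d2f6a8e0b31",
"metadata": {},
"source": [
"A quick sanity check (added sketch, reusing `question` and `num_tokens_from_string` from above): compare the ~4 char/token rule of thumb to the exact tiktoken count."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c3f8d5b2-4a6e-4f9b-8d2c-8e3a7b9f1c42",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: rough ~4 char/token heuristic vs. the exact tiktoken count\n",
"approx_tokens = len(question) / 4\n",
"exact_tokens = num_tokens_from_string(question, \"cl100k_base\")\n",
"print(f\"~4 char/token estimate: {approx_tokens:.1f}, tiktoken count: {exact_tokens}\")"
]
},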
{
"cell_type": "markdown",
"id": "4f04fd74-829f-472c-a1bc-ec6521a0529f",
"metadata": {},
"source": [
"[Text embedding models](https://python.langchain.com/docs/integrations/text_embedding/openai)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6bd98786-755d-4d49-ba97-30c5a623b74e",
"metadata": {},
"outputs": [],
"source": [
"from langchain_openai import OpenAIEmbeddings\n",
"embd = OpenAIEmbeddings()\n",
"query_result = embd.embed_query(question)\n",
"document_result = embd.embed_query(document)\n",
"len(query_result)"
]
},
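{
"cell_type": "markdown",
"id": "d4a9e6c3-5b7f-4a0c-9e3d-9f4b8c0a2d53",
"metadata": {},
"source": [
"Side note (added sketch): `embed_documents` embeds a batch of texts in one call, which is what the vectorstore uses at index time."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e5b0f7d4-6c8a-4b1d-8f4e-0a5c9d1b3e64",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: batch-embed several texts in one call\n",
"batch = embd.embed_documents([question, document])\n",
"len(batch), len(batch[0])"
]
},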
{
"cell_type": "markdown",
"id": "f5e0e35f-6861-4c5e-9301-04fd5408f8f8",
"metadata": {},
"source": [
"[Cosine similarity](https://platform.openai.com/docs/guides/embeddings/frequently-asked-questions) is reccomended (1 indicates identical)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b8001998-b08c-4560-b124-bfa1fced8958",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"def cosine_similarity(vec1, vec2):\n",
" dot_product = np.dot(vec1, vec2)\n",
" norm_vec1 = np.linalg.norm(vec1)\n",
" norm_vec2 = np.linalg.norm(vec2)\n",
" return dot_product / (norm_vec1 * norm_vec2)\n",
"\n",
"similarity = cosine_similarity(query_result, document_result)\n",
"print(\"Cosine Similarity:\", similarity)"
]
},
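{
"cell_type": "markdown",
"id": "f6c1a8e5-7d9b-4c2e-9a5f-1b6d0e2c4f75",
"metadata": {},
"source": [
"To see the score move (added sketch; the off-topic sentence below is a made-up example): an unrelated string should score lower against the question than the pet document does."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a7d2b9f6-8e0c-4d3f-8b6a-2c7e1f3d5a86",
"metadata": {},
"outputs": [],
"source": [
"# Contrast with an unrelated sentence (hypothetical example text)\n",
"unrelated_result = embd.embed_query(\"The stock market closed higher today.\")\n",
"print(\"Related doc:  \", cosine_similarity(query_result, document_result))\n",
"print(\"Unrelated doc:\", cosine_similarity(query_result, unrelated_result))"
]
},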
{
"cell_type": "markdown",
"id": "8aea73bc-98e3-4fdc-ba72-d190736bed20",
"metadata": {},
"source": [
"[Document Loaders](https://python.langchain.com/docs/integrations/document_loaders/)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5778c31a-6138-4130-8865-31a08e82b9fb",
"metadata": {},
"outputs": [],
"source": [
"#### INDEXING ####\n",
"\n",
"# Load blog\n",
"import bs4\n",
"from langchain_community.document_loaders import WebBaseLoader\n",
"loader = WebBaseLoader(\n",
" web_paths=(\"https://lilianweng.github.io/posts/2023-06-23-agent/\",),\n",
" bs_kwargs=dict(\n",
" parse_only=bs4.SoupStrainer(\n",
" class_=(\"post-content\", \"post-title\", \"post-header\")\n",
" )\n",
" ),\n",
")\n",
"blog_docs = loader.load()"
]
},
{
"cell_type": "markdown",
"id": "798e731e-c6ff-46e3-a8bc-386832362af2",
"metadata": {},
"source": [
"[Splitter](https://python.langchain.com/docs/modules/data_connection/document_transformers/recursive_text_splitter)\n",
"\n",
"> This text splitter is the recommended one for generic text. It is parameterized by a list of characters. It tries to split on them in order until the chunks are small enough. The default list is [\"\\n\\n\", \"\\n\", \" \", \"\"]. This has the effect of trying to keep all paragraphs (and then sentences, and then words) together as long as possible, as those would generically seem to be the strongest semantically related pieces of text."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e668d339-3951-4662-8387-c3d296646906",
"metadata": {},
"outputs": [],
"source": [
"# Split\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(\n",
" chunk_size=300, \n",
" chunk_overlap=50)\n",
"\n",
"# Make splits\n",
"splits = text_splitter.split_documents(blog_docs)"
]
},
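{
"cell_type": "markdown",
"id": "b8e3c0a7-9f1d-4e4a-9c7b-3d8f2a4e6b97",
"metadata": {},
"source": [
"A toy illustration (added sketch, made-up string): with a small `chunk_size`, the splitter breaks on `\\n\\n` first, keeping whole paragraphs together before falling back to single newlines and spaces."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c9f4d1b8-0a2e-4f5b-8d8c-4e9a3b5f7c08",
"metadata": {},
"outputs": [],
"source": [
"# Toy example: paragraph breaks win before single newlines or spaces\n",
"toy_splitter = RecursiveCharacterTextSplitter(chunk_size=40, chunk_overlap=0)\n",
"toy_text = \"First paragraph here.\\n\\nSecond paragraph here.\\n\\nThird one.\"\n",
"toy_splitter.split_text(toy_text)"
]
},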
{
"cell_type": "markdown",
"id": "427303a1-3ed4-430c-bfc7-cb3e48022f1d",
"metadata": {},
"source": [
"[Vectorstores](https://python.langchain.com/docs/integrations/vectorstores/)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "baa90aaf-cc1b-46a1-9fba-cf20804dcb41",
"metadata": {},
"outputs": [],
"source": [
"# Index\n",
"from langchain_openai import OpenAIEmbeddings\n",
"from langchain_community.vectorstores import Chroma\n",
"vectorstore = Chroma.from_documents(documents=splits, \n",
" embedding=OpenAIEmbeddings())\n",
"\n",
"retriever = vectorstore.as_retriever()"
]
},
{
"cell_type": "markdown",
"id": "ba890329-1411-4922-bd27-fe0490dd1208",
"metadata": {},
"source": [
"## Part 3: Retrieval"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "57c2de7a-93e6-4072-bc5b-db6516f96dda",
"metadata": {},
"outputs": [],
"source": [
"docs = retriever.get_relevant_documents(\"What is Task Decomposition?\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0582e6bd-8e97-4cf4-80b9-a04434ea7135",
"metadata": {},
"outputs": [],
"source": [
"len(docs)"
]
},
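{
"cell_type": "markdown",
"id": "d0a5e2c9-1b3f-4a6c-9e9d-5f0b4c6a8d19",
"metadata": {},
"source": [
"The retriever returns the top-k chunks (typically 4 by default). As a sketch, `search_kwargs` tunes that."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e1b6f3d0-2c4a-4b7d-8f0e-6a1c5d7b9e20",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: ask for a single chunk via search_kwargs\n",
"retriever_k1 = vectorstore.as_retriever(search_kwargs={\"k\": 1})\n",
"docs_k1 = retriever_k1.get_relevant_documents(\"What is Task Decomposition?\")\n",
"len(docs_k1)"
]
},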
{
"cell_type": "markdown",
"id": "beda1b07-7bd2-4f5b-8d44-1fc52f5d2ce2",
"metadata": {},
"source": [
"## Part 4: Generation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8beb6c14-5e18-43e7-9d04-59e3b8a81cc9",
"metadata": {},
"outputs": [],
"source": [
"from langchain_openai import ChatOpenAI\n",
"from langchain.prompts import ChatPromptTemplate\n",
"\n",
"# Prompt\n",
"template = \"\"\"Answer the question based only on the following context:\n",
"{context}\n",
"\n",
"Question: {question}\n",
"\"\"\"\n",
"\n",
"prompt = ChatPromptTemplate.from_template(template)\n",
"prompt"
]
},
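{
"cell_type": "markdown",
"id": "f2c7a4e1-3d5b-4c8e-9a1f-7b2d6e8c0f31",
"metadata": {},
"source": [
"To inspect what the model will actually see (added sketch, toy values): invoking the prompt fills the template into chat messages."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a3d8b5f2-4e6c-4d9f-8b2a-8c3e7f9d1a42",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: fill the template with toy values to inspect the final messages\n",
"prompt.invoke({\"context\": \"(toy context)\", \"question\": \"(toy question)\"})"
]
},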
{
"cell_type": "code",
"execution_count": null,
"id": "e4461264-5cac-479a-917c-9bf589826da4",
"metadata": {},
"outputs": [],
"source": [
"# LLM\n",
"llm = ChatOpenAI(model_name=\"gpt-3.5-turbo\", temperature=0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "55d6629f-18ec-4372-a557-b254fbb1dd2d",
"metadata": {},
"outputs": [],
"source": [
"# Chain\n",
"chain = prompt | llm"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "94470770-8df4-4359-9504-ef6c8b3137ff",
"metadata": {},
"outputs": [],
"source": [
"# Run\n",
"chain.invoke({\"context\":docs,\"question\":\"What is Task Decomposition?\"})"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "65770e2d-3d5e-4371-abc9-0aeca9646885",
"metadata": {},
"outputs": [],
"source": [
"from langchain import hub\n",
"prompt_hub_rag = hub.pull(\"rlm/rag-prompt\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f53e5840-0a0f-4428-a4a4-6922800aff89",
"metadata": {},
"outputs": [],
"source": [
"prompt_hub_rag"
]
},
{
"cell_type": "markdown",
"id": "8ffe29a1-5527-419e-9f12-8a3061d12885",
"metadata": {},
"source": [
"[RAG chains](https://python.langchain.com/docs/expression_language/get_started#rag-search-example)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8208a8bc-c75f-4e8e-8601-680746cd6276",
"metadata": {},
"outputs": [],
"source": [
"from langchain_core.output_parsers import StrOutputParser\n",
"from langchain_core.runnables import RunnablePassthrough\n",
"\n",
"rag_chain = (\n",
" {\"context\": retriever, \"question\": RunnablePassthrough()}\n",
" | prompt\n",
" | llm\n",
" | StrOutputParser()\n",
")\n",
"\n",
"rag_chain.invoke(\"What is Task Decomposition?\")"
]
},
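{
"cell_type": "markdown",
"id": "b4e9c6a3-5f7d-4e0a-9c3b-9d4f8a0e2b53",
"metadata": {},
"source": [
"Worth unpacking (added sketch): the dict at the head of the chain is shorthand for `RunnableParallel`; each value receives the same input, which is how the question reaches both the retriever and the prompt."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c5f0d7b4-6a8e-4f1b-8d4c-0e5a9b1f3c64",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: a dict in LCEL coerces to RunnableParallel; every branch sees the same input\n",
"from langchain_core.runnables import RunnableParallel, RunnablePassthrough\n",
"\n",
"demo = RunnableParallel(\n",
"    original=RunnablePassthrough(),\n",
"    shouted=lambda x: x.upper(),\n",
")\n",
"demo.invoke(\"what is task decomposition?\")"
]
},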
{
"cell_type": "code",
"execution_count": null,
"id": "d21023fb-c570-4163-aa46-56dae4c67122",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}