mirror of
https://github.com/hwchase17/langchain.git
synced 2026-02-09 10:41:52 +00:00
Compare commits
1 Commits
erick/part
...
erick/impr
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cb136405ef |
@@ -115,7 +115,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Folder with pdf and extracted images \n",
|
||||
"# Folder with pdf and extracted images\n",
|
||||
"path = \"/Users/rlm/Desktop/photos/\""
|
||||
]
|
||||
},
|
||||
@@ -128,9 +128,10 @@
|
||||
"source": [
|
||||
"# Extract images, tables, and chunk text\n",
|
||||
"from unstructured.partition.pdf import partition_pdf\n",
|
||||
"\n",
|
||||
"raw_pdf_elements = partition_pdf(\n",
|
||||
" filename=path + \"photos.pdf\",\n",
|
||||
" extract_images_in_pdf=True, \n",
|
||||
" extract_images_in_pdf=True,\n",
|
||||
" infer_table_structure=True,\n",
|
||||
" chunking_strategy=\"by_title\",\n",
|
||||
" max_characters=4000,\n",
|
||||
@@ -191,14 +192,17 @@
|
||||
"\n",
|
||||
"# Create chroma\n",
|
||||
"vectorstore = Chroma(\n",
|
||||
" collection_name=\"mm_rag_clip_photos\",\n",
|
||||
" embedding_function=OpenCLIPEmbeddings()\n",
|
||||
" collection_name=\"mm_rag_clip_photos\", embedding_function=OpenCLIPEmbeddings()\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Get image URIs with .jpg extension only\n",
|
||||
"image_uris = sorted([os.path.join(path, image_name) \n",
|
||||
" for image_name in os.listdir(path) \n",
|
||||
" if image_name.endswith('.jpg')])\n",
|
||||
"image_uris = sorted(\n",
|
||||
" [\n",
|
||||
" os.path.join(path, image_name)\n",
|
||||
" for image_name in os.listdir(path)\n",
|
||||
" if image_name.endswith(\".jpg\")\n",
|
||||
" ]\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Add images\n",
|
||||
"vectorstore.add_images(uris=image_uris)\n",
|
||||
@@ -206,7 +210,7 @@
|
||||
"# Add documents\n",
|
||||
"vectorstore.add_texts(texts=texts)\n",
|
||||
"\n",
|
||||
"# Make retriever \n",
|
||||
"# Make retriever\n",
|
||||
"retriever = vectorstore.as_retriever()"
|
||||
]
|
||||
},
|
||||
@@ -235,6 +239,7 @@
|
||||
"from io import BytesIO\n",
|
||||
"from PIL import Image\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def resize_base64_image(base64_string, size=(128, 128)):\n",
|
||||
" \"\"\"\n",
|
||||
" Resize an image encoded as a Base64 string.\n",
|
||||
@@ -258,30 +263,31 @@
|
||||
" resized_img.save(buffered, format=img.format)\n",
|
||||
"\n",
|
||||
" # Encode the resized image to Base64\n",
|
||||
" return base64.b64encode(buffered.getvalue()).decode('utf-8')\n",
|
||||
" return base64.b64encode(buffered.getvalue()).decode(\"utf-8\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def is_base64(s):\n",
|
||||
" ''' Check if a string is Base64 encoded '''\n",
|
||||
" \"\"\"Check if a string is Base64 encoded\"\"\"\n",
|
||||
" try:\n",
|
||||
" return base64.b64encode(base64.b64decode(s)) == s.encode()\n",
|
||||
" except Exception:\n",
|
||||
" return False\n",
|
||||
" \n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def split_image_text_types(docs):\n",
|
||||
" ''' Split numpy array images and texts '''\n",
|
||||
" \"\"\"Split numpy array images and texts\"\"\"\n",
|
||||
" images = []\n",
|
||||
" text = []\n",
|
||||
" for doc in docs:\n",
|
||||
" doc = doc.page_content # Extract Document contents \n",
|
||||
" doc = doc.page_content # Extract Document contents\n",
|
||||
" if is_base64(doc):\n",
|
||||
" # Resize image to avoid OAI server error\n",
|
||||
" images.append(resize_base64_image(doc, size=(250, 250))) # base64 encoded str \n",
|
||||
" images.append(\n",
|
||||
" resize_base64_image(doc, size=(250, 250))\n",
|
||||
" ) # base64 encoded str\n",
|
||||
" else:\n",
|
||||
" text.append(doc) \n",
|
||||
" return {\n",
|
||||
" \"images\": images,\n",
|
||||
" \"texts\": text\n",
|
||||
" }"
|
||||
" text.append(doc)\n",
|
||||
" return {\"images\": images, \"texts\": text}"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -311,6 +317,7 @@
|
||||
"from langchain.schema.runnable import RunnablePassthrough, RunnableLambda\n",
|
||||
"from langchain.schema.messages import HumanMessage, SystemMessage\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def prompt_func(data_dict):\n",
|
||||
" # Joining the context texts into a single string\n",
|
||||
" formatted_texts = \"\\n\".join(data_dict[\"context\"][\"texts\"])\n",
|
||||
@@ -322,7 +329,7 @@
|
||||
" \"type\": \"image_url\",\n",
|
||||
" \"image_url\": {\n",
|
||||
" \"url\": f\"data:image/jpeg;base64,{data_dict['context']['images'][0]}\"\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" }\n",
|
||||
" messages.append(image_message)\n",
|
||||
"\n",
|
||||
@@ -342,17 +349,21 @@
|
||||
" f\"User-provided keywords: {data_dict['question']}\\n\\n\"\n",
|
||||
" \"Text and / or tables:\\n\"\n",
|
||||
" f\"{formatted_texts}\"\n",
|
||||
" )\n",
|
||||
" ),\n",
|
||||
" }\n",
|
||||
" messages.append(text_message)\n",
|
||||
"\n",
|
||||
" return [HumanMessage(content=messages)]\n",
|
||||
" \n",
|
||||
"\n",
|
||||
"\n",
|
||||
"model = ChatOpenAI(temperature=0, model=\"gpt-4-vision-preview\", max_tokens=1024)\n",
|
||||
"\n",
|
||||
"# RAG pipeline\n",
|
||||
"chain = (\n",
|
||||
" {\"context\": retriever | RunnableLambda(split_image_text_types), \"question\": RunnablePassthrough()}\n",
|
||||
" {\n",
|
||||
" \"context\": retriever | RunnableLambda(split_image_text_types),\n",
|
||||
" \"question\": RunnablePassthrough(),\n",
|
||||
" }\n",
|
||||
" | RunnableLambda(prompt_func)\n",
|
||||
" | model\n",
|
||||
" | StrOutputParser()\n",
|
||||
@@ -410,7 +421,7 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"docs = retriever.get_relevant_documents(\"Woman with children\",k=10)\n",
|
||||
"docs = retriever.get_relevant_documents(\"Woman with children\", k=10)\n",
|
||||
"for doc in docs:\n",
|
||||
" if is_base64(doc.page_content):\n",
|
||||
" plt_img_base64(doc.page_content)\n",
|
||||
@@ -436,9 +447,7 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"chain.invoke(\n",
|
||||
" \"Woman with children\"\n",
|
||||
")"
|
||||
"chain.invoke(\"Woman with children\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -118,7 +118,9 @@
|
||||
"source": [
|
||||
"loader = DocusaurusLoader(\n",
|
||||
" \"https://python.langchain.com\",\n",
|
||||
" filter_urls=[\"https://python.langchain.com/docs/integrations/document_loaders/sitemap\"],\n",
|
||||
" filter_urls=[\n",
|
||||
" \"https://python.langchain.com/docs/integrations/document_loaders/sitemap\"\n",
|
||||
" ],\n",
|
||||
")\n",
|
||||
"documents = loader.load()"
|
||||
]
|
||||
@@ -162,9 +164,11 @@
|
||||
"source": [
|
||||
"loader = DocusaurusLoader(\n",
|
||||
" \"https://python.langchain.com\",\n",
|
||||
" filter_urls=[\"https://python.langchain.com/docs/integrations/document_loaders/sitemap\"],\n",
|
||||
" filter_urls=[\n",
|
||||
" \"https://python.langchain.com/docs/integrations/document_loaders/sitemap\"\n",
|
||||
" ],\n",
|
||||
" # This will only include the content that matches these tags, otherwise they will be removed\n",
|
||||
" custom_html_tags=[\"#content\", \".main\"]\n",
|
||||
" custom_html_tags=[\"#content\", \".main\"],\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
@@ -213,7 +217,9 @@
|
||||
"source": [
|
||||
"loader = DocusaurusLoader(\n",
|
||||
" \"https://python.langchain.com\",\n",
|
||||
" filter_urls=[\"https://python.langchain.com/docs/integrations/document_loaders/sitemap\"],\n",
|
||||
" filter_urls=[\n",
|
||||
" \"https://python.langchain.com/docs/integrations/document_loaders/sitemap\"\n",
|
||||
" ],\n",
|
||||
" parsing_function=remove_nav_and_header_elements,\n",
|
||||
")"
|
||||
]
|
||||
|
||||
@@ -25,7 +25,7 @@
|
||||
" url=\"bolt://localhost:7687\",\n",
|
||||
" username=\"neo4j\",\n",
|
||||
" password=\"password\",\n",
|
||||
" session_id=\"session_id_1\"\n",
|
||||
" session_id=\"session_id_1\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"history.add_user_message(\"hi!\")\n",
|
||||
|
||||
@@ -110,7 +110,9 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"document_embeddings = embeddings.embed_documents([\"This is a document\", \"This is some other document\"])"
|
||||
"document_embeddings = embeddings.embed_documents(\n",
|
||||
" [\"This is a document\", \"This is some other document\"]\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -48,6 +48,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import open_clip\n",
|
||||
"\n",
|
||||
"open_clip.list_pretrained()"
|
||||
]
|
||||
},
|
||||
@@ -147,8 +148,8 @@
|
||||
" \"rocket\": \"a rocket standing on a launchpad\",\n",
|
||||
" \"motorcycle_right\": \"a red motorcycle standing in a garage\",\n",
|
||||
" \"camera\": \"a person looking at a camera on a tripod\",\n",
|
||||
" \"horse\": \"a black-and-white silhouette of a horse\", \n",
|
||||
" \"coffee\": \"a cup of coffee on a saucer\"\n",
|
||||
" \"horse\": \"a black-and-white silhouette of a horse\",\n",
|
||||
" \"coffee\": \"a cup of coffee on a saucer\",\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"original_images = []\n",
|
||||
@@ -158,14 +159,18 @@
|
||||
"plt.figure(figsize=(16, 5))\n",
|
||||
"\n",
|
||||
"# Loop to display and prepare images and assemble URIs\n",
|
||||
"for filename in [filename for filename in os.listdir(skimage.data_dir) if filename.endswith(\".png\") or filename.endswith(\".jpg\")]:\n",
|
||||
"for filename in [\n",
|
||||
" filename\n",
|
||||
" for filename in os.listdir(skimage.data_dir)\n",
|
||||
" if filename.endswith(\".png\") or filename.endswith(\".jpg\")\n",
|
||||
"]:\n",
|
||||
" name = os.path.splitext(filename)[0]\n",
|
||||
" if name not in descriptions:\n",
|
||||
" continue\n",
|
||||
"\n",
|
||||
" image_path = os.path.join(skimage.data_dir, filename)\n",
|
||||
" image = Image.open(image_path).convert(\"RGB\")\n",
|
||||
" \n",
|
||||
"\n",
|
||||
" plt.subplot(2, 4, len(images) + 1)\n",
|
||||
" plt.imshow(image)\n",
|
||||
" plt.title(f\"{filename}\\n{descriptions[name]}\")\n",
|
||||
@@ -173,7 +178,7 @@
|
||||
" plt.yticks([])\n",
|
||||
"\n",
|
||||
" original_images.append(image)\n",
|
||||
" images.append(image) # Original code does preprocessing here\n",
|
||||
" images.append(image) # Original code does preprocessing here\n",
|
||||
" texts.append(descriptions[name])\n",
|
||||
" image_uris.append(image_path) # Add the image URI to the list\n",
|
||||
"\n",
|
||||
@@ -216,7 +221,7 @@
|
||||
"# Instantiate your model\n",
|
||||
"clip_embd = OpenCLIPEmbeddings()\n",
|
||||
"\n",
|
||||
"# Embed images and text \n",
|
||||
"# Embed images and text\n",
|
||||
"img_features = clip_embd.embed_image(image_uris)\n",
|
||||
"text_features = clip_embd.embed_documents([\"This is \" + desc for desc in texts])\n",
|
||||
"\n",
|
||||
@@ -241,7 +246,7 @@
|
||||
" plt.text(x, y, f\"{similarity[y, x]:.2f}\", ha=\"center\", va=\"center\", size=12)\n",
|
||||
"\n",
|
||||
"for side in [\"left\", \"top\", \"right\", \"bottom\"]:\n",
|
||||
" plt.gca().spines[side].set_visible(False)\n",
|
||||
" plt.gca().spines[side].set_visible(False)\n",
|
||||
"\n",
|
||||
"plt.xlim([-0.5, count - 0.5])\n",
|
||||
"plt.ylim([count + 0.5, -2])\n",
|
||||
|
||||
@@ -149,12 +149,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"db = Weaviate.from_documents(\n",
|
||||
" docs, \n",
|
||||
" embeddings, \n",
|
||||
" weaviate_url=WEAVIATE_URL, \n",
|
||||
" by_text=False\n",
|
||||
")"
|
||||
"db = Weaviate.from_documents(docs, embeddings, weaviate_url=WEAVIATE_URL, by_text=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -227,8 +222,7 @@
|
||||
"import weaviate\n",
|
||||
"\n",
|
||||
"client = weaviate.Client(\n",
|
||||
" url=WEAVIATE_URL, \n",
|
||||
" auth_client_secret=weaviate.AuthApiKey(WEAVIATE_API_KEY)\n",
|
||||
" url=WEAVIATE_URL, auth_client_secret=weaviate.AuthApiKey(WEAVIATE_API_KEY)\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# client = weaviate.Client(\n",
|
||||
@@ -240,10 +234,7 @@
|
||||
"# )\n",
|
||||
"\n",
|
||||
"vectorstore = Weaviate.from_documents(\n",
|
||||
" documents, \n",
|
||||
" embeddings, \n",
|
||||
" client=client, \n",
|
||||
" by_text=False\n",
|
||||
" documents, embeddings, client=client, by_text=False\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
@@ -378,6 +369,7 @@
|
||||
],
|
||||
"source": [
|
||||
"from langchain.chat_models import ChatOpenAI\n",
|
||||
"\n",
|
||||
"llm = ChatOpenAI(model_name=\"gpt-3.5-turbo\", temperature=0)\n",
|
||||
"llm.predict(\"What did the president say about Justice Breyer\")"
|
||||
]
|
||||
@@ -575,10 +567,10 @@
|
||||
"from langchain.schema.output_parser import StrOutputParser\n",
|
||||
"\n",
|
||||
"rag_chain = (\n",
|
||||
" {\"context\": retriever, \"question\": RunnablePassthrough()} \n",
|
||||
" | prompt \n",
|
||||
" {\"context\": retriever, \"question\": RunnablePassthrough()}\n",
|
||||
" | prompt\n",
|
||||
" | llm\n",
|
||||
" | StrOutputParser() \n",
|
||||
" | StrOutputParser()\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"rag_chain.invoke(\"What did the president say about Justice Breyer\")"
|
||||
|
||||
@@ -198,6 +198,7 @@
|
||||
"source": [
|
||||
"from langchain.agents import tool\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"@tool\n",
|
||||
"def get_word_length(word: str) -> int:\n",
|
||||
" \"\"\"Returns the length of a word.\"\"\"\n",
|
||||
@@ -606,10 +607,12 @@
|
||||
"source": [
|
||||
"input1 = \"how many letters in the word educa?\"\n",
|
||||
"result = agent_executor.invoke({\"input\": input1, \"chat_history\": chat_history})\n",
|
||||
"chat_history.extend([\n",
|
||||
" HumanMessage(content=input1),\n",
|
||||
" AIMessage(content=result[\"output\"]),\n",
|
||||
"])\n",
|
||||
"chat_history.extend(\n",
|
||||
" [\n",
|
||||
" HumanMessage(content=input1),\n",
|
||||
" AIMessage(content=result[\"output\"]),\n",
|
||||
" ]\n",
|
||||
")\n",
|
||||
"agent_executor.invoke({\"input\": \"is that a real word?\", \"chat_history\": chat_history})"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -1,16 +1,15 @@
|
||||
import glob
|
||||
import os
|
||||
from pathlib import Path
|
||||
import re
|
||||
import shutil
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
TEMPLATES_DIR = Path(os.path.abspath(__file__)).parents[2] / "templates"
|
||||
DOCS_TEMPLATES_DIR = Path(os.path.abspath(__file__)).parents[1] / "docs" / "templates"
|
||||
|
||||
|
||||
readmes = list(glob.glob(str(TEMPLATES_DIR) + "/*/README.md"))
|
||||
destinations = [readme[len(str(TEMPLATES_DIR)) + 1:-10] + ".md" for readme in readmes]
|
||||
destinations = [readme[len(str(TEMPLATES_DIR)) + 1 : -10] + ".md" for readme in readmes]
|
||||
for source, destination in zip(readmes, destinations):
|
||||
full_destination = DOCS_TEMPLATES_DIR / destination
|
||||
shutil.copyfile(source, full_destination)
|
||||
@@ -33,4 +32,3 @@ with open(TEMPLATES_INDEX_DESTINATION, "r") as f:
|
||||
content = re.sub("\]\(\.\.\/", "](/docs/templates/", content)
|
||||
with open(TEMPLATES_INDEX_DESTINATION, "w") as f:
|
||||
f.write(sidebar_hidden + content)
|
||||
|
||||
|
||||
@@ -79,7 +79,6 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"answer = rag_app.invoke(\n",
|
||||
" {\n",
|
||||
" \"question\": \"What commits did the person with my name make?\",\n",
|
||||
@@ -125,7 +124,7 @@
|
||||
" \"end_date\": \"2016-01-01 00:00:00\",\n",
|
||||
" }\n",
|
||||
")\n",
|
||||
"answer\n"
|
||||
"answer"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user