mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-24 03:52:10 +00:00
Lint Python notebooks with ruff. (#12677)
The new ruff version fixed the blocking bugs, and I was able to fairly easily us to a passing state: ruff fixed some issues on its own, I fixed a handful by hand, and I added a list of narrowly-targeted exclusions for files that are currently failing ruff rules that we probably should look into eventually. I went pretty lenient on the docs / cookbooks rules, allowing dead code and such things. Perhaps in the future we may want to tighten the rules further, but this is already a good set of checks that found real issues and will prevent them going forward.
This commit is contained in:
@@ -115,7 +115,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Folder with pdf and extracted images \n",
|
||||
"# Folder with pdf and extracted images\n",
|
||||
"path = \"/Users/rlm/Desktop/photos/\""
|
||||
]
|
||||
},
|
||||
@@ -128,9 +128,10 @@
|
||||
"source": [
|
||||
"# Extract images, tables, and chunk text\n",
|
||||
"from unstructured.partition.pdf import partition_pdf\n",
|
||||
"\n",
|
||||
"raw_pdf_elements = partition_pdf(\n",
|
||||
" filename=path + \"photos.pdf\",\n",
|
||||
" extract_images_in_pdf=True, \n",
|
||||
" extract_images_in_pdf=True,\n",
|
||||
" infer_table_structure=True,\n",
|
||||
" chunking_strategy=\"by_title\",\n",
|
||||
" max_characters=4000,\n",
|
||||
@@ -191,14 +192,17 @@
|
||||
"\n",
|
||||
"# Create chroma\n",
|
||||
"vectorstore = Chroma(\n",
|
||||
" collection_name=\"mm_rag_clip_photos\",\n",
|
||||
" embedding_function=OpenCLIPEmbeddings()\n",
|
||||
" collection_name=\"mm_rag_clip_photos\", embedding_function=OpenCLIPEmbeddings()\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Get image URIs with .jpg extension only\n",
|
||||
"image_uris = sorted([os.path.join(path, image_name) \n",
|
||||
" for image_name in os.listdir(path) \n",
|
||||
" if image_name.endswith('.jpg')])\n",
|
||||
"image_uris = sorted(\n",
|
||||
" [\n",
|
||||
" os.path.join(path, image_name)\n",
|
||||
" for image_name in os.listdir(path)\n",
|
||||
" if image_name.endswith(\".jpg\")\n",
|
||||
" ]\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Add images\n",
|
||||
"vectorstore.add_images(uris=image_uris)\n",
|
||||
@@ -206,7 +210,7 @@
|
||||
"# Add documents\n",
|
||||
"vectorstore.add_texts(texts=texts)\n",
|
||||
"\n",
|
||||
"# Make retriever \n",
|
||||
"# Make retriever\n",
|
||||
"retriever = vectorstore.as_retriever()"
|
||||
]
|
||||
},
|
||||
@@ -235,6 +239,7 @@
|
||||
"from io import BytesIO\n",
|
||||
"from PIL import Image\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def resize_base64_image(base64_string, size=(128, 128)):\n",
|
||||
" \"\"\"\n",
|
||||
" Resize an image encoded as a Base64 string.\n",
|
||||
@@ -258,30 +263,31 @@
|
||||
" resized_img.save(buffered, format=img.format)\n",
|
||||
"\n",
|
||||
" # Encode the resized image to Base64\n",
|
||||
" return base64.b64encode(buffered.getvalue()).decode('utf-8')\n",
|
||||
" return base64.b64encode(buffered.getvalue()).decode(\"utf-8\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def is_base64(s):\n",
|
||||
" ''' Check if a string is Base64 encoded '''\n",
|
||||
" \"\"\"Check if a string is Base64 encoded\"\"\"\n",
|
||||
" try:\n",
|
||||
" return base64.b64encode(base64.b64decode(s)) == s.encode()\n",
|
||||
" except Exception:\n",
|
||||
" return False\n",
|
||||
" \n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def split_image_text_types(docs):\n",
|
||||
" ''' Split numpy array images and texts '''\n",
|
||||
" \"\"\"Split numpy array images and texts\"\"\"\n",
|
||||
" images = []\n",
|
||||
" text = []\n",
|
||||
" for doc in docs:\n",
|
||||
" doc = doc.page_content # Extract Document contents \n",
|
||||
" doc = doc.page_content # Extract Document contents\n",
|
||||
" if is_base64(doc):\n",
|
||||
" # Resize image to avoid OAI server error\n",
|
||||
" images.append(resize_base64_image(doc, size=(250, 250))) # base64 encoded str \n",
|
||||
" images.append(\n",
|
||||
" resize_base64_image(doc, size=(250, 250))\n",
|
||||
" ) # base64 encoded str\n",
|
||||
" else:\n",
|
||||
" text.append(doc) \n",
|
||||
" return {\n",
|
||||
" \"images\": images,\n",
|
||||
" \"texts\": text\n",
|
||||
" }"
|
||||
" text.append(doc)\n",
|
||||
" return {\"images\": images, \"texts\": text}"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -311,6 +317,7 @@
|
||||
"from langchain.schema.runnable import RunnablePassthrough, RunnableLambda\n",
|
||||
"from langchain.schema.messages import HumanMessage, SystemMessage\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def prompt_func(data_dict):\n",
|
||||
" # Joining the context texts into a single string\n",
|
||||
" formatted_texts = \"\\n\".join(data_dict[\"context\"][\"texts\"])\n",
|
||||
@@ -322,7 +329,7 @@
|
||||
" \"type\": \"image_url\",\n",
|
||||
" \"image_url\": {\n",
|
||||
" \"url\": f\"data:image/jpeg;base64,{data_dict['context']['images'][0]}\"\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" }\n",
|
||||
" messages.append(image_message)\n",
|
||||
"\n",
|
||||
@@ -342,17 +349,21 @@
|
||||
" f\"User-provided keywords: {data_dict['question']}\\n\\n\"\n",
|
||||
" \"Text and / or tables:\\n\"\n",
|
||||
" f\"{formatted_texts}\"\n",
|
||||
" )\n",
|
||||
" ),\n",
|
||||
" }\n",
|
||||
" messages.append(text_message)\n",
|
||||
"\n",
|
||||
" return [HumanMessage(content=messages)]\n",
|
||||
" \n",
|
||||
"\n",
|
||||
"\n",
|
||||
"model = ChatOpenAI(temperature=0, model=\"gpt-4-vision-preview\", max_tokens=1024)\n",
|
||||
"\n",
|
||||
"# RAG pipeline\n",
|
||||
"chain = (\n",
|
||||
" {\"context\": retriever | RunnableLambda(split_image_text_types), \"question\": RunnablePassthrough()}\n",
|
||||
" {\n",
|
||||
" \"context\": retriever | RunnableLambda(split_image_text_types),\n",
|
||||
" \"question\": RunnablePassthrough(),\n",
|
||||
" }\n",
|
||||
" | RunnableLambda(prompt_func)\n",
|
||||
" | model\n",
|
||||
" | StrOutputParser()\n",
|
||||
@@ -412,15 +423,16 @@
|
||||
"source": [
|
||||
"from IPython.display import display, HTML\n",
|
||||
"\n",
|
||||
"def plt_img_base64(img_base64):\n",
|
||||
"\n",
|
||||
"def plt_img_base64(img_base64):\n",
|
||||
" # Create an HTML img tag with the base64 string as the source\n",
|
||||
" image_html = f'<img src=\"data:image/jpeg;base64,{img_base64}\" />'\n",
|
||||
" \n",
|
||||
"\n",
|
||||
" # Display the image by rendering the HTML\n",
|
||||
" display(HTML(image_html))\n",
|
||||
"\n",
|
||||
"docs = retriever.get_relevant_documents(\"Woman with children\",k=10)\n",
|
||||
"\n",
|
||||
"docs = retriever.get_relevant_documents(\"Woman with children\", k=10)\n",
|
||||
"for doc in docs:\n",
|
||||
" if is_base64(doc.page_content):\n",
|
||||
" plt_img_base64(doc.page_content)\n",
|
||||
@@ -446,9 +458,7 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"chain.invoke(\n",
|
||||
" \"Woman with children\"\n",
|
||||
")"
|
||||
"chain.invoke(\"Woman with children\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
Reference in New Issue
Block a user