Lint Python notebooks with ruff. (#12677)

The new ruff version fixed the blocking bugs, and I was able to fairly
easily get us to a passing state: ruff fixed some issues on its own, I
fixed a handful by hand, and I added a list of narrowly-targeted
exclusions for files that currently fail ruff rules we should probably
look into eventually.

I went pretty lenient on the docs / cookbooks rules, allowing dead code
and the like. We may want to tighten the rules further in the future,
but this is already a good set of checks that found real issues and
will prevent them going forward. An illustrative sketch of what such an
exclusion can look like follows.
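For anyone curious: ruff supports narrowly-targeted exclusions both as a
per-file-ignores table in the project configuration and as file-level
directives inside the offending file. Below is a minimal sketch of the
directive form, with hypothetical rule codes rather than this PR's
actual list:

# Hypothetical file-level exemption: placed at the top of a file that
# still fails specific rules, it silences only those rules for this
# one file (E501 = line too long, F841 = unused local variable; these
# codes are illustrative, not the ones actually excluded here).
# ruff: noqa: E501, F841

# A single offending line can also be suppressed in place:
import os  # noqa: F401

Keeping the exemptions in a per-file-ignores table instead centralizes
the "look into eventually" list in one auditable place.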
Author: Predrag Gruevski
Date: 2023-11-14 15:58:22 -05:00 (committed by GitHub)
Parent: 344cab0739
Commit: 2ebd167dba
189 changed files with 2249 additions and 2362 deletions


@@ -115,7 +115,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Folder with pdf and extracted images \n",
"# Folder with pdf and extracted images\n",
"path = \"/Users/rlm/Desktop/photos/\""
]
},
@@ -128,9 +128,10 @@
"source": [
"# Extract images, tables, and chunk text\n",
"from unstructured.partition.pdf import partition_pdf\n",
"\n",
"raw_pdf_elements = partition_pdf(\n",
" filename=path + \"photos.pdf\",\n",
" extract_images_in_pdf=True, \n",
" extract_images_in_pdf=True,\n",
" infer_table_structure=True,\n",
" chunking_strategy=\"by_title\",\n",
" max_characters=4000,\n",
@@ -191,14 +192,17 @@
"\n",
"# Create chroma\n",
"vectorstore = Chroma(\n",
" collection_name=\"mm_rag_clip_photos\",\n",
" embedding_function=OpenCLIPEmbeddings()\n",
" collection_name=\"mm_rag_clip_photos\", embedding_function=OpenCLIPEmbeddings()\n",
")\n",
"\n",
"# Get image URIs with .jpg extension only\n",
"image_uris = sorted([os.path.join(path, image_name) \n",
" for image_name in os.listdir(path) \n",
" if image_name.endswith('.jpg')])\n",
"image_uris = sorted(\n",
" [\n",
" os.path.join(path, image_name)\n",
" for image_name in os.listdir(path)\n",
" if image_name.endswith(\".jpg\")\n",
" ]\n",
")\n",
"\n",
"# Add images\n",
"vectorstore.add_images(uris=image_uris)\n",
@@ -206,7 +210,7 @@
"# Add documents\n",
"vectorstore.add_texts(texts=texts)\n",
"\n",
"# Make retriever \n",
"# Make retriever\n",
"retriever = vectorstore.as_retriever()"
]
},
@@ -235,6 +239,7 @@
"from io import BytesIO\n",
"from PIL import Image\n",
"\n",
"\n",
"def resize_base64_image(base64_string, size=(128, 128)):\n",
" \"\"\"\n",
" Resize an image encoded as a Base64 string.\n",
@@ -258,30 +263,31 @@
" resized_img.save(buffered, format=img.format)\n",
"\n",
" # Encode the resized image to Base64\n",
" return base64.b64encode(buffered.getvalue()).decode('utf-8')\n",
" return base64.b64encode(buffered.getvalue()).decode(\"utf-8\")\n",
"\n",
"\n",
"def is_base64(s):\n",
" ''' Check if a string is Base64 encoded '''\n",
" \"\"\"Check if a string is Base64 encoded\"\"\"\n",
" try:\n",
" return base64.b64encode(base64.b64decode(s)) == s.encode()\n",
" except Exception:\n",
" return False\n",
" \n",
"\n",
"\n",
"def split_image_text_types(docs):\n",
" ''' Split numpy array images and texts '''\n",
" \"\"\"Split numpy array images and texts\"\"\"\n",
" images = []\n",
" text = []\n",
" for doc in docs:\n",
" doc = doc.page_content # Extract Document contents \n",
" doc = doc.page_content # Extract Document contents\n",
" if is_base64(doc):\n",
" # Resize image to avoid OAI server error\n",
" images.append(resize_base64_image(doc, size=(250, 250))) # base64 encoded str \n",
" images.append(\n",
" resize_base64_image(doc, size=(250, 250))\n",
" ) # base64 encoded str\n",
" else:\n",
" text.append(doc) \n",
" return {\n",
" \"images\": images,\n",
" \"texts\": text\n",
" }"
" text.append(doc)\n",
" return {\"images\": images, \"texts\": text}"
]
},
{
@@ -311,6 +317,7 @@
"from langchain.schema.runnable import RunnablePassthrough, RunnableLambda\n",
"from langchain.schema.messages import HumanMessage, SystemMessage\n",
"\n",
"\n",
"def prompt_func(data_dict):\n",
" # Joining the context texts into a single string\n",
" formatted_texts = \"\\n\".join(data_dict[\"context\"][\"texts\"])\n",
@@ -322,7 +329,7 @@
" \"type\": \"image_url\",\n",
" \"image_url\": {\n",
" \"url\": f\"data:image/jpeg;base64,{data_dict['context']['images'][0]}\"\n",
" }\n",
" },\n",
" }\n",
" messages.append(image_message)\n",
"\n",
@@ -342,17 +349,21 @@
" f\"User-provided keywords: {data_dict['question']}\\n\\n\"\n",
" \"Text and / or tables:\\n\"\n",
" f\"{formatted_texts}\"\n",
" )\n",
" ),\n",
" }\n",
" messages.append(text_message)\n",
"\n",
" return [HumanMessage(content=messages)]\n",
" \n",
"\n",
"\n",
"model = ChatOpenAI(temperature=0, model=\"gpt-4-vision-preview\", max_tokens=1024)\n",
"\n",
"# RAG pipeline\n",
"chain = (\n",
" {\"context\": retriever | RunnableLambda(split_image_text_types), \"question\": RunnablePassthrough()}\n",
" {\n",
" \"context\": retriever | RunnableLambda(split_image_text_types),\n",
" \"question\": RunnablePassthrough(),\n",
" }\n",
" | RunnableLambda(prompt_func)\n",
" | model\n",
" | StrOutputParser()\n",
@@ -412,15 +423,16 @@
"source": [
"from IPython.display import display, HTML\n",
"\n",
"def plt_img_base64(img_base64):\n",
"\n",
"def plt_img_base64(img_base64):\n",
" # Create an HTML img tag with the base64 string as the source\n",
" image_html = f'<img src=\"data:image/jpeg;base64,{img_base64}\" />'\n",
" \n",
"\n",
" # Display the image by rendering the HTML\n",
" display(HTML(image_html))\n",
"\n",
"docs = retriever.get_relevant_documents(\"Woman with children\",k=10)\n",
"\n",
"docs = retriever.get_relevant_documents(\"Woman with children\", k=10)\n",
"for doc in docs:\n",
" if is_base64(doc.page_content):\n",
" plt_img_base64(doc.page_content)\n",
@@ -446,9 +458,7 @@
}
],
"source": [
"chain.invoke(\n",
" \"Woman with children\"\n",
")"
"chain.invoke(\"Woman with children\")"
]
},
{