From 14f3014ccefa1344e74429b184975b318808e024 Mon Sep 17 00:00:00 2001 From: Zach Nussbaum Date: Wed, 5 Jun 2024 12:47:17 -0400 Subject: [PATCH] embeddings: nomic embed vision (#22482) Thank you for contributing to LangChain! **Description:** Adds Langchain support for Nomic Embed Vision **Twitter handle:** nomic_ai,zach_nussbaum - [x] **Add tests and docs**: If you're adding a new integration, please include 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/docs/integrations` directory. - [ ] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ Additional guidelines: - Make sure optional dependencies are imported within a function. - Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests. - Most PRs should not touch more than one package. - Changes should be backwards compatible. - If you are adding something to community, do not re-import it in langchain. If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. --------- Co-authored-by: Lance Martin <122662504+rlancemartin@users.noreply.github.com> Co-authored-by: Bagatur --- cookbook/nomic_multimodal_rag.ipynb | 497 ++++++++++++++++++ docs/scripts/arxiv_references.py | 12 +- .../nomic/langchain_nomic/__init__.py | 4 +- .../nomic/langchain_nomic/embeddings.py | 11 + libs/partners/nomic/poetry.lock | 14 +- libs/partners/nomic/pyproject.toml | 1 + templates/rag-multi-modal-local/README.md | 21 +- templates/rag-multi-modal-local/ingest.py | 6 +- .../rag_multi_modal_local/chain.py | 6 +- 9 files changed, 543 insertions(+), 29 deletions(-) create mode 100644 cookbook/nomic_multimodal_rag.ipynb diff --git a/cookbook/nomic_multimodal_rag.ipynb b/cookbook/nomic_multimodal_rag.ipynb new file mode 100644 index 00000000000..ba8a77ace29 --- /dev/null +++ b/cookbook/nomic_multimodal_rag.ipynb @@ -0,0 +1,497 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "9fc3897d-176f-4729-8fd1-cfb4add53abd", + "metadata": {}, + "source": [ + "## Nomic multi-modal RAG\n", + "\n", + "Many documents contain a mixture of content types, including text and images. \n", + "\n", + "Yet, information captured in images is lost in most RAG applications.\n", + "\n", + "With the emergence of multimodal LLMs, like [GPT-4V](https://openai.com/research/gpt-4v-system-card), it is worth considering how to utilize images in RAG:\n", + "\n", + "In this demo we\n", + "\n", + "* Use multimodal embeddings from Nomic Embed [Vision](https://huggingface.co/nomic-ai/nomic-embed-vision-v1.5) and [Text](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) to embed images and text\n", + "* Retrieve both using similarity search\n", + "* Pass raw images and text chunks to a multimodal LLM for answer synthesis \n", + "\n", + "## Signup\n", + "\n", + "Get your API token, then run:\n", + "```\n", + "! nomic login\n", + "```\n", + "\n", + "Then run with your generated API token \n", + "```\n", + "! 
nomic login < token > \n", + "```\n", + "\n", + "## Packages\n", + "\n", + "For `unstructured`, you will also need `poppler` ([installation instructions](https://pdf2image.readthedocs.io/en/latest/installation.html)) and `tesseract` ([installation instructions](https://tesseract-ocr.github.io/tessdoc/Installation.html)) in your system." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54926b9b-75c2-4cd4-8f14-b3882a0d370b", + "metadata": {}, + "outputs": [], + "source": [ + "! nomic login token" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "febbc459-ebba-4c1a-a52b-fed7731593f8", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "! pip install -U langchain-nomic langchain_community tiktoken langchain-openai chromadb langchain # (newest versions required for multi-modal)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acbdc603-39e2-4a5f-836c-2bbaecd46b0b", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# lock to 0.10.19 due to a persistent bug in more recent versions\n", + "! pip install \"unstructured[all-docs]==0.10.19\" pillow pydantic lxml pillow matplotlib tiktoken" + ] + }, + { + "cell_type": "markdown", + "id": "1e94b3fb-8e3e-4736-be0a-ad881626c7bd", + "metadata": {}, + "source": [ + "## Data Loading\n", + "\n", + "### Partition PDF text and images\n", + " \n", + "Let's look at an example pdfs containing interesting images.\n", + "\n", + "1/ Art from the J Paul Getty museum:\n", + "\n", + " * Here is a [zip file](https://drive.google.com/file/d/18kRKbq2dqAhhJ3DfZRnYcTBEUfYxe1YR/view?usp=sharing) with the PDF and the already extracted images. \n", + "* https://www.getty.edu/publications/resources/virtuallibrary/0892360224.pdf\n", + "\n", + "2/ Famous photographs from library of congress:\n", + "\n", + "* https://www.loc.gov/lcm/pdf/LCM_2020_1112.pdf\n", + "* We'll use this as an example below\n", + "\n", + "We can use `partition_pdf` below from [Unstructured](https://unstructured-io.github.io/unstructured/introduction.html#key-concepts) to extract text and images.\n", + "\n", + "To supply this to extract the images:\n", + "```\n", + "extract_images_in_pdf=True\n", + "```\n", + "\n", + "\n", + "\n", + "If using this zip file, then you can simply process the text only with:\n", + "```\n", + "extract_images_in_pdf=False\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9646b524-71a7-4b2a-bdc8-0b81f77e968f", + "metadata": {}, + "outputs": [], + "source": [ + "# Folder with pdf and extracted images\n", + "from pathlib import Path\n", + "\n", + "# replace with actual path to images\n", + "path = Path(\"../art\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77f096ab-a933-41d0-8f4e-1efc83998fc3", + "metadata": {}, + "outputs": [], + "source": [ + "path.resolve()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc4839c0-8773-4a07-ba59-5364501269b2", + "metadata": {}, + "outputs": [], + "source": [ + "# Extract images, tables, and chunk text\n", + "from unstructured.partition.pdf import partition_pdf\n", + "\n", + "raw_pdf_elements = partition_pdf(\n", + " filename=str(path.resolve()) + \"/getty.pdf\",\n", + " extract_images_in_pdf=False,\n", + " infer_table_structure=True,\n", + " chunking_strategy=\"by_title\",\n", + " max_characters=4000,\n", + " new_after_n_chars=3800,\n", + " combine_text_under_n_chars=2000,\n", + " image_output_dir_path=path,\n", + ")" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "969545ad", + "metadata": {}, + "outputs": [], + "source": [ + "# Categorize text elements by type\n", + "tables = []\n", + "texts = []\n", + "for element in raw_pdf_elements:\n", + " if \"unstructured.documents.elements.Table\" in str(type(element)):\n", + " tables.append(str(element))\n", + " elif \"unstructured.documents.elements.CompositeElement\" in str(type(element)):\n", + " texts.append(str(element))" + ] + }, + { + "cell_type": "markdown", + "id": "5d8e6349-1547-4cbf-9c6f-491d8610ec10", + "metadata": {}, + "source": [ + "## Multi-modal embeddings with our document\n", + "\n", + "We will use [nomic-embed-vision-v1.5](https://huggingface.co/nomic-ai/nomic-embed-vision-v1.5) embeddings. This model is aligned \n", + "to [nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) allowing for multimodal semantic search and Multimodal RAG!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4bc15842-cb95-4f84-9eb5-656b0282a800", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import uuid\n", + "\n", + "import chromadb\n", + "import numpy as np\n", + "from langchain_community.vectorstores import Chroma\n", + "from langchain_nomic import NomicEmbeddings\n", + "from PIL import Image as _PILImage\n", + "\n", + "# Create chroma\n", + "text_vectorstore = Chroma(\n", + " collection_name=\"mm_rag_clip_photos_text\",\n", + " embedding_function=NomicEmbeddings(\n", + " vision_model=\"nomic-embed-vision-v1.5\", model=\"nomic-embed-text-v1.5\"\n", + " ),\n", + ")\n", + "image_vectorstore = Chroma(\n", + " collection_name=\"mm_rag_clip_photos_image\",\n", + " embedding_function=NomicEmbeddings(\n", + " vision_model=\"nomic-embed-vision-v1.5\", model=\"nomic-embed-text-v1.5\"\n", + " ),\n", + ")\n", + "\n", + "# Get image URIs with .jpg extension only\n", + "image_uris = sorted(\n", + " [\n", + " os.path.join(path, image_name)\n", + " for image_name in os.listdir(path)\n", + " if image_name.endswith(\".jpg\")\n", + " ]\n", + ")\n", + "\n", + "# Add images\n", + "image_vectorstore.add_images(uris=image_uris)\n", + "\n", + "# Add documents\n", + "text_vectorstore.add_texts(texts=texts)\n", + "\n", + "# Make retriever\n", + "image_retriever = image_vectorstore.as_retriever()\n", + "text_retriever = text_vectorstore.as_retriever()" + ] + }, + { + "cell_type": "markdown", + "id": "02a186d0-27e0-4820-8092-63b5349dd25d", + "metadata": {}, + "source": [ + "## RAG\n", + "\n", + "`vectorstore.add_images` will store / retrieve images as base64 encoded strings.\n", + "\n", + "These can be passed to [GPT-4V](https://platform.openai.com/docs/guides/vision)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "344f56a8-0dc3-433e-851c-3f7600c7a72b", + "metadata": {}, + "outputs": [], + "source": [ + "import base64\n", + "import io\n", + "from io import BytesIO\n", + "\n", + "import numpy as np\n", + "from PIL import Image\n", + "\n", + "\n", + "def resize_base64_image(base64_string, size=(128, 128)):\n", + " \"\"\"\n", + " Resize an image encoded as a Base64 string.\n", + "\n", + " Args:\n", + " base64_string (str): Base64 string of the original image.\n", + " size (tuple): Desired size of the image as (width, height).\n", + "\n", + " Returns:\n", + " str: Base64 string of the resized image.\n", + " \"\"\"\n", + " # Decode the Base64 string\n", + " img_data = base64.b64decode(base64_string)\n", + " img = Image.open(io.BytesIO(img_data))\n", + "\n", + " # Resize the image\n", + " resized_img = img.resize(size, Image.LANCZOS)\n", + "\n", + " # Save the resized image to a bytes buffer\n", + " buffered = io.BytesIO()\n", + " resized_img.save(buffered, format=img.format)\n", + "\n", + " # Encode the resized image to Base64\n", + " return base64.b64encode(buffered.getvalue()).decode(\"utf-8\")\n", + "\n", + "\n", + "def is_base64(s):\n", + " \"\"\"Check if a string is Base64 encoded\"\"\"\n", + " try:\n", + " return base64.b64encode(base64.b64decode(s)) == s.encode()\n", + " except Exception:\n", + " return False\n", + "\n", + "\n", + "def split_image_text_types(docs):\n", + " \"\"\"Split numpy array images and texts\"\"\"\n", + " images = []\n", + " text = []\n", + " for doc in docs:\n", + " doc = doc.page_content # Extract Document contents\n", + " if is_base64(doc):\n", + " # Resize image to avoid OAI server error\n", + " images.append(\n", + " resize_base64_image(doc, size=(250, 250))\n", + " ) # base64 encoded str\n", + " else:\n", + " text.append(doc)\n", + " return {\"images\": images, \"texts\": text}" + ] + }, + { + "cell_type": "markdown", + "id": "23a2c1d8-fea6-4152-b184-3172dd46c735", + "metadata": {}, + "source": [ + "Currently, we format the inputs using a `RunnableLambda` while we add image support to `ChatPromptTemplates`.\n", + "\n", + "Our runnable follows the classic RAG flow - \n", + "\n", + "* We first compute the context (both \"texts\" and \"images\" in this case) and the question (just a RunnablePassthrough here) \n", + "* Then we pass this into our prompt template, which is a custom function that formats the message for the gpt-4-vision-preview model. \n", + "* And finally we parse the output as a string." 
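Before wiring these helpers into the chain below, it can be worth exercising them once in isolation. This is a minimal sketch, not part of the original notebook: it builds a synthetic JPEG in memory so no file on disk is needed.

```python
import base64
from io import BytesIO

from langchain_core.documents import Document
from PIL import Image

# Build a small in-memory JPEG and base64-encode it, mimicking what
# `add_images` stores in Chroma
buffer = BytesIO()
Image.new("RGB", (600, 400), color=(120, 160, 200)).save(buffer, format="JPEG")
original_b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")

# resize_base64_image shrinks the payload; is_base64 should still recognize it
small_b64 = resize_base64_image(original_b64, size=(128, 128))
print(is_base64(small_b64), len(original_b64), "->", len(small_b64))

# split_image_text_types routes retrieved docs into the structure the chain expects
routed = split_image_text_types(
    [Document(page_content=small_b64), Document(page_content="A plain text caption.")]
)
print(len(routed["images"]), "image(s),", len(routed["texts"]), "text chunk(s)")
```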
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d8919dc-c238-4746-86ba-45d940a7d260", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c93fab3-74c4-4f1d-958a-0bc4cdd0797e", + "metadata": {}, + "outputs": [], + "source": [ + "from operator import itemgetter\n", + "\n", + "from langchain_core.messages import HumanMessage, SystemMessage\n", + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain_core.runnables import RunnableLambda, RunnablePassthrough\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "\n", + "def prompt_func(data_dict):\n", + " # Joining the context texts into a single string\n", + " formatted_texts = \"\\n\".join(data_dict[\"text_context\"][\"texts\"])\n", + " messages = []\n", + "\n", + " # Adding image(s) to the messages if present\n", + " if data_dict[\"image_context\"][\"images\"]:\n", + " image_message = {\n", + " \"type\": \"image_url\",\n", + " \"image_url\": {\n", + " \"url\": f\"data:image/jpeg;base64,{data_dict['image_context']['images'][0]}\"\n", + " },\n", + " }\n", + " messages.append(image_message)\n", + "\n", + " # Adding the text message for analysis\n", + " text_message = {\n", + " \"type\": \"text\",\n", + " \"text\": (\n", + " \"As an expert art critic and historian, your task is to analyze and interpret images, \"\n", + " \"considering their historical and cultural significance. Alongside the images, you will be \"\n", + " \"provided with related text to offer context. Both will be retrieved from a vectorstore based \"\n", + " \"on user-input keywords. Please use your extensive knowledge and analytical skills to provide a \"\n", + " \"comprehensive summary that includes:\\n\"\n", + " \"- A detailed description of the visual elements in the image.\\n\"\n", + " \"- The historical and cultural context of the image.\\n\"\n", + " \"- An interpretation of the image's symbolism and meaning.\\n\"\n", + " \"- Connections between the image and the related text.\\n\\n\"\n", + " f\"User-provided keywords: {data_dict['question']}\\n\\n\"\n", + " \"Text and / or tables:\\n\"\n", + " f\"{formatted_texts}\"\n", + " ),\n", + " }\n", + " messages.append(text_message)\n", + "\n", + " return [HumanMessage(content=messages)]\n", + "\n", + "\n", + "model = ChatOpenAI(temperature=0, model=\"gpt-4-vision-preview\", max_tokens=1024)\n", + "\n", + "# RAG pipeline\n", + "chain = (\n", + " {\n", + " \"text_context\": text_retriever | RunnableLambda(split_image_text_types),\n", + " \"image_context\": image_retriever | RunnableLambda(split_image_text_types),\n", + " \"question\": RunnablePassthrough(),\n", + " }\n", + " | RunnableLambda(prompt_func)\n", + " | model\n", + " | StrOutputParser()\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1566096d-97c2-4ddc-ba4a-6ef88c525e4e", + "metadata": {}, + "source": [ + "## Test retrieval and run RAG" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90121e56-674b-473b-871d-6e4753fd0c45", + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import HTML, display\n", + "\n", + "\n", + "def plt_img_base64(img_base64):\n", + " # Create an HTML img tag with the base64 string as the source\n", + " image_html = f''\n", + "\n", + " # Display the image by rendering the HTML\n", + " display(HTML(image_html))\n", + "\n", + "\n", + "docs = text_retriever.invoke(\"Women with children\", k=5)\n", + "for 
doc in docs:\n", + " if is_base64(doc.page_content):\n", + " plt_img_base64(doc.page_content)\n", + " else:\n", + " print(doc.page_content)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44eaa532-f035-4c04-b578-02339d42554c", + "metadata": {}, + "outputs": [], + "source": [ + "docs = image_retriever.invoke(\"Women with children\", k=5)\n", + "for doc in docs:\n", + " if is_base64(doc.page_content):\n", + " plt_img_base64(doc.page_content)\n", + " else:\n", + " print(doc.page_content)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "69fb15fd-76fc-49b4-806d-c4db2990027d", + "metadata": {}, + "outputs": [], + "source": [ + "chain.invoke(\"Women with children\")" + ] + }, + { + "cell_type": "markdown", + "id": "227f08b8-e732-4089-b65c-6eb6f9e48f15", + "metadata": {}, + "source": [ + "We can see the images retrieved in the LangSmith trace:\n", + "\n", + "LangSmith [trace](https://smith.langchain.com/public/69c558a5-49dc-4c60-a49b-3adbb70f74c5/r/e872c2c8-528c-468f-aefd-8b5cd730a673)." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/scripts/arxiv_references.py b/docs/scripts/arxiv_references.py index 914c58e7d6e..1511c2ee3e1 100644 --- a/docs/scripts/arxiv_references.py +++ b/docs/scripts/arxiv_references.py @@ -515,7 +515,8 @@ def log_results(arxiv_id2type2key2urls): def generate_arxiv_references_page(file_name: Path, papers: list[ArxivPaper]) -> None: with open(file_name, "w") as f: # Write the table headers - f.write("""# arXiv + f.write( + """# arXiv LangChain implements the latest research in the field of Natural Language Processing. 
This page contains `arXiv` papers referenced in the LangChain Documentation, API Reference, @@ -525,7 +526,8 @@ This page contains `arXiv` papers referenced in the LangChain Documentation, API | arXiv id / Title | Authors | Published date 🔻 | LangChain Documentation| |------------------|---------|-------------------|------------------------| -""") +""" + ) for paper in papers: refs = [] if paper.referencing_doc2url: @@ -595,7 +597,8 @@ This page contains `arXiv` papers referenced in the LangChain Documentation, API if el ] ) - f.write(f""" + f.write( + f""" ## {paper.title} - **arXiv id:** {paper.arxiv_id} @@ -608,7 +611,8 @@ This page contains `arXiv` papers referenced in the LangChain Documentation, API {refs} **Abstract:** {paper.abstract} - """) + """ + ) logger.warning(f"Created the {file_name} file with {len(papers)} arXiv references.") diff --git a/libs/partners/nomic/langchain_nomic/__init__.py b/libs/partners/nomic/langchain_nomic/__init__.py index 75f3facd507..01326dd75cd 100644 --- a/libs/partners/nomic/langchain_nomic/__init__.py +++ b/libs/partners/nomic/langchain_nomic/__init__.py @@ -1,5 +1,3 @@ from langchain_nomic.embeddings import NomicEmbeddings -__all__ = [ - "NomicEmbeddings", -] +__all__ = ["NomicEmbeddings"] diff --git a/libs/partners/nomic/langchain_nomic/embeddings.py b/libs/partners/nomic/langchain_nomic/embeddings.py index 5c690276252..1dd2fa60fc2 100644 --- a/libs/partners/nomic/langchain_nomic/embeddings.py +++ b/libs/partners/nomic/langchain_nomic/embeddings.py @@ -22,6 +22,7 @@ class NomicEmbeddings(Embeddings): self, *, model: str, + nomic_api_key: Optional[str] = ..., dimensionality: Optional[int] = ..., inference_mode: Literal["remote"] = ..., ): @@ -32,6 +33,7 @@ class NomicEmbeddings(Embeddings): self, *, model: str, + nomic_api_key: Optional[str] = ..., dimensionality: Optional[int] = ..., inference_mode: Literal["local", "dynamic"], device: Optional[str] = ..., @@ -43,6 +45,7 @@ class NomicEmbeddings(Embeddings): self, *, model: str, + nomic_api_key: Optional[str] = ..., dimensionality: Optional[int] = ..., inference_mode: str, device: Optional[str] = ..., @@ -57,6 +60,7 @@ class NomicEmbeddings(Embeddings): dimensionality: Optional[int] = None, inference_mode: str = "remote", device: Optional[str] = None, + vision_model: Optional[str] = None, ): """Initialize NomicEmbeddings model. @@ -80,6 +84,7 @@ class NomicEmbeddings(Embeddings): self.dimensionality = dimensionality self.inference_mode = inference_mode self.device = device + self.vision_model = vision_model def embed(self, texts: List[str], *, task_type: str) -> List[List[float]]: """Embed texts. @@ -121,3 +126,9 @@ class NomicEmbeddings(Embeddings): texts=[text], task_type="search_query", )[0] + + def embed_image(self, uris: List[str]) -> List[List[float]]: + return embed.image( + images=uris, + model=self.vision_model, + )["embeddings"] diff --git a/libs/partners/nomic/poetry.lock b/libs/partners/nomic/poetry.lock index 2e015a30a39..2bdd02d72a7 100644 --- a/libs/partners/nomic/poetry.lock +++ b/libs/partners/nomic/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 
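The checklist at the top of this PR asks for unit tests that avoid network access. One possible shape for such a test of the new `embed_image` method is sketched below. It assumes `langchain_nomic/embeddings.py` imports the Nomic SDK as `from nomic import embed` (the diff above calls `embed.image(...)`) and that `NomicEmbeddings` can be constructed without a live API key; both assumptions should be checked against the actual module, and the image path is a placeholder that is never read because the SDK call is mocked.

```python
from unittest.mock import patch

from langchain_nomic import NomicEmbeddings


def test_embed_image_uses_vision_model() -> None:
    # Fake the Nomic SDK response so the test never hits the network
    fake_response = {"embeddings": [[0.1, 0.2, 0.3]]}
    with patch(
        "langchain_nomic.embeddings.embed.image", return_value=fake_response
    ) as mock_image:
        embedder = NomicEmbeddings(
            model="nomic-embed-text-v1.5",
            vision_model="nomic-embed-vision-v1.5",
        )
        vectors = embedder.embed_image(uris=["tests/examples/sample.jpg"])

    # embed_image should forward the URIs and the configured vision model
    mock_image.assert_called_once_with(
        images=["tests/examples/sample.jpg"], model="nomic-embed-vision-v1.5"
    )
    assert vectors == [[0.1, 0.2, 0.3]]
```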
[[package]] name = "annotated-types" @@ -276,7 +276,7 @@ files = [ [[package]] name = "langchain-core" -version = "0.2.0rc1" +version = "0.2.3" description = "Building applications with LLMs through composability" optional = false python-versions = ">=3.8.1,<4.0" @@ -285,7 +285,7 @@ develop = true [package.dependencies] jsonpatch = "^1.33" -langsmith = "^0.1.0" +langsmith = "^0.1.65" packaging = "^23.2" pydantic = ">=1,<3" PyYAML = ">=5.3" @@ -300,13 +300,13 @@ url = "../../core" [[package]] name = "langsmith" -version = "0.1.58" +version = "0.1.65" description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." optional = false python-versions = "<4.0,>=3.8.1" files = [ - {file = "langsmith-0.1.58-py3-none-any.whl", hash = "sha256:1148cc836ec99d1b2f37cd2fa3014fcac213bb6bad798a2b21bb9111c18c9768"}, - {file = "langsmith-0.1.58.tar.gz", hash = "sha256:a5060933c1fb3006b498ec849677993329d7e6138bdc2ec044068ab806e09c39"}, + {file = "langsmith-0.1.65-py3-none-any.whl", hash = "sha256:ab4487029240e69cca30da1065f1e9138e5a7ca2bbe8c697f0bd7d5839f71cf7"}, + {file = "langsmith-0.1.65.tar.gz", hash = "sha256:d3c2eb2391478bd79989f02652cf66e29a7959d677614b6993a47cef43f7f43b"}, ] [package.dependencies] @@ -1309,4 +1309,4 @@ dev = ["black (>=19.3b0)", "pytest (>=4.6.2)"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "369d2f7218d797a01a533380de9cce01037963f628dce10bc9927eac014edeeb" +content-hash = "bf51336a3b4035385ddd68946aa5bbe699f4b805dd0503ba1dd1454a69248616" diff --git a/libs/partners/nomic/pyproject.toml b/libs/partners/nomic/pyproject.toml index c3c822c830a..759e27ddfdb 100644 --- a/libs/partners/nomic/pyproject.toml +++ b/libs/partners/nomic/pyproject.toml @@ -14,6 +14,7 @@ license = "MIT" python = ">=3.8.1,<4.0" langchain-core = ">=0.1.46,<0.3" nomic = "^3.0.29" +pillow = "^10.3.0" [tool.poetry.group.test] optional = true diff --git a/templates/rag-multi-modal-local/README.md b/templates/rag-multi-modal-local/README.md index 4e34795b9dd..ed61d1a9f7d 100644 --- a/templates/rag-multi-modal-local/README.md +++ b/templates/rag-multi-modal-local/README.md @@ -7,11 +7,11 @@ With the release of open source, multi-modal LLMs it's possible to build this ki This template demonstrates how to perform private visual search and question-answering over a collection of your photos. -It uses OpenCLIP embeddings to embed all of the photos and stores them in Chroma. +It uses [`nomic-embed-vision-v1`](https://huggingface.co/nomic-ai/nomic-embed-vision-v1) multi-modal embeddings to embed the images and `Ollama` for question-answering. Given a question, relevant photos are retrieved and passed to an open source multi-modal LLM of your choice for answer synthesis. 
-![Diagram illustrating the visual search process with OpenCLIP embeddings and multi-modal LLM for question-answering, featuring example food pictures and a matcha soft serve answer trace.](https://github.com/langchain-ai/langchain/assets/122662504/da543b21-052c-4c43-939e-d4f882a45d75 "Visual Search Process Diagram")
+![Diagram illustrating the visual search process with nomic-embed-vision-v1 embeddings and multi-modal LLM for question-answering, featuring example food pictures and a matcha soft serve answer trace.](https://github.com/langchain-ai/langchain/assets/122662504/da543b21-052c-4c43-939e-d4f882a45d75 "Visual Search Process Diagram")
 
 ## Input
 
@@ -34,22 +34,23 @@ python ingest.py
 
 ## Storage
 
-This template will use [OpenCLIP](https://github.com/mlfoundations/open_clip) multi-modal embeddings to embed the images.
-
-You can select different embedding model options (see results [here](https://github.com/mlfoundations/open_clip/blob/main/docs/openclip_results.csv)).
+This template will use [nomic-embed-vision-v1](https://huggingface.co/nomic-ai/nomic-embed-vision-v1) multi-modal embeddings to embed the images.
 
 The first time you run the app, it will automatically download the multimodal embedding model.
 
-By default, LangChain will use an embedding model with moderate performance but lower memory requirments, `ViT-H-14`.
-You can choose alternative `OpenCLIPEmbeddings` models in `rag_chroma_multi_modal/ingest.py`:
+You can choose alternative models in `rag_chroma_multi_modal/ingest.py`, such as `OpenCLIPEmbeddings`.
 
 ```
+from langchain_experimental.open_clip import OpenCLIPEmbeddings
+
+embedding_function = OpenCLIPEmbeddings(
+    model_name="ViT-H-14", checkpoint="laion2b_s32b_b79k"
+)
+
 vectorstore_mmembd = Chroma(
     collection_name="multi-modal-rag",
     persist_directory=str(re_vectorstore_path),
-    embedding_function=OpenCLIPEmbeddings(
-        model_name="ViT-H-14", checkpoint="laion2b_s32b_b79k"
-    ),
+    embedding_function=embedding_function
 )
 ```
 
diff --git a/templates/rag-multi-modal-local/ingest.py b/templates/rag-multi-modal-local/ingest.py
index 9aad0cf6568..1b2faa622f7 100644
--- a/templates/rag-multi-modal-local/ingest.py
+++ b/templates/rag-multi-modal-local/ingest.py
@@ -2,7 +2,7 @@ import os
 from pathlib import Path
 
 from langchain_community.vectorstores import Chroma
-from langchain_experimental.open_clip import OpenCLIPEmbeddings
+from langchain_nomic import NomicMultimodalEmbeddings
 
 # Load images
 img_dump_path = Path(__file__).parent / "docs/"
@@ -21,7 +21,9 @@ re_vectorstore_path = vectorstore.relative_to(Path.cwd())
 
 # Load embedding function
 print("Loading embedding function")
-embedding = OpenCLIPEmbeddings(model_name="ViT-H-14", checkpoint="laion2b_s32b_b79k")
+embedding = NomicMultimodalEmbeddings(
+    vision_model="nomic-embed-vision-v1", text_model="nomic-embed-text-v1"
+)
 
 # Create chroma
 vectorstore_mmembd = Chroma(
diff --git a/templates/rag-multi-modal-local/rag_multi_modal_local/chain.py b/templates/rag-multi-modal-local/rag_multi_modal_local/chain.py
index ce09cee0749..c4df1253d93 100644
--- a/templates/rag-multi-modal-local/rag_multi_modal_local/chain.py
+++ b/templates/rag-multi-modal-local/rag_multi_modal_local/chain.py
@@ -9,7 +9,7 @@ from langchain_core.messages import HumanMessage
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.pydantic_v1 import BaseModel
 from langchain_core.runnables import RunnableLambda, RunnablePassthrough
-from langchain_experimental.open_clip import OpenCLIPEmbeddings
+from langchain_nomic import NomicMultimodalEmbeddings
from PIL import Image @@ -102,8 +102,8 @@ def multi_modal_rag_chain(retriever): vectorstore_mmembd = Chroma( collection_name="multi-modal-rag", persist_directory=str(Path(__file__).parent.parent / "chroma_db_multi_modal"), - embedding_function=OpenCLIPEmbeddings( - model_name="ViT-H-14", checkpoint="laion2b_s32b_b79k" + embedding_function=NomicMultimodalEmbeddings( + vision_model="nomic-embed-vision-v1", text_model="nomic-embed-text-v1" ), )
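
Taken together, the template changes above mean ingestion and the chain now share the same Nomic multimodal embeddings. A rough usage sketch follows; it assumes `ingest.py` has already populated the Chroma collection, that `NomicMultimodalEmbeddings` is importable from `langchain_nomic` as the template's imports suggest, and that the template's `multi_modal_rag_chain` returns a runnable — none of which is re-verified here, and the question is only an example.

```python
from pathlib import Path

from langchain_community.vectorstores import Chroma
from langchain_nomic import NomicMultimodalEmbeddings
from rag_multi_modal_local.chain import multi_modal_rag_chain

# Re-open the collection written by ingest.py (persist path assumed; adjust as needed)
vectorstore_mmembd = Chroma(
    collection_name="multi-modal-rag",
    persist_directory=str(Path("chroma_db_multi_modal")),
    embedding_function=NomicMultimodalEmbeddings(
        vision_model="nomic-embed-vision-v1", text_model="nomic-embed-text-v1"
    ),
)

# Retrieve relevant photos and hand them to the template's multimodal chain
retriever = vectorstore_mmembd.as_retriever()
chain = multi_modal_rag_chain(retriever)
print(chain.invoke("What desserts show up in my photos?"))
```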