From 14f3014ccefa1344e74429b184975b318808e024 Mon Sep 17 00:00:00 2001 From: Zach Nussbaum Date: Wed, 5 Jun 2024 12:47:17 -0400 Subject: [PATCH] embeddings: nomic embed vision (#22482) Thank you for contributing to LangChain! **Description:** Adds Langchain support for Nomic Embed Vision **Twitter handle:** nomic_ai,zach_nussbaum - [x] **Add tests and docs**: If you're adding a new integration, please include 1. a test for the integration, preferably unit tests that do not rely on network access, 2. an example notebook showing its use. It lives in `docs/docs/integrations` directory. - [ ] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/ Additional guidelines: - Make sure optional dependencies are imported within a function. - Please do not add dependencies to pyproject.toml files (even optional ones) unless they are required for unit tests. - Most PRs should not touch more than one package. - Changes should be backwards compatible. - If you are adding something to community, do not re-import it in langchain. If no one reviews your PR within a few days, please @-mention one of baskaryan, efriis, eyurtsev, ccurme, vbarda, hwchase17. --------- Co-authored-by: Lance Martin <122662504+rlancemartin@users.noreply.github.com> Co-authored-by: Bagatur --- cookbook/nomic_multimodal_rag.ipynb | 497 ++++++++++++++++++ docs/scripts/arxiv_references.py | 12 +- .../nomic/langchain_nomic/__init__.py | 4 +- .../nomic/langchain_nomic/embeddings.py | 11 + libs/partners/nomic/poetry.lock | 14 +- libs/partners/nomic/pyproject.toml | 1 + templates/rag-multi-modal-local/README.md | 21 +- templates/rag-multi-modal-local/ingest.py | 6 +- .../rag_multi_modal_local/chain.py | 6 +- 9 files changed, 543 insertions(+), 29 deletions(-) create mode 100644 cookbook/nomic_multimodal_rag.ipynb diff --git a/cookbook/nomic_multimodal_rag.ipynb b/cookbook/nomic_multimodal_rag.ipynb new file mode 100644 index 00000000000..ba8a77ace29 --- /dev/null +++ b/cookbook/nomic_multimodal_rag.ipynb @@ -0,0 +1,497 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "9fc3897d-176f-4729-8fd1-cfb4add53abd", + "metadata": {}, + "source": [ + "## Nomic multi-modal RAG\n", + "\n", + "Many documents contain a mixture of content types, including text and images. \n", + "\n", + "Yet, information captured in images is lost in most RAG applications.\n", + "\n", + "With the emergence of multimodal LLMs, like [GPT-4V](https://openai.com/research/gpt-4v-system-card), it is worth considering how to utilize images in RAG:\n", + "\n", + "In this demo we\n", + "\n", + "* Use multimodal embeddings from Nomic Embed [Vision](https://huggingface.co/nomic-ai/nomic-embed-vision-v1.5) and [Text](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) to embed images and text\n", + "* Retrieve both using similarity search\n", + "* Pass raw images and text chunks to a multimodal LLM for answer synthesis \n", + "\n", + "## Signup\n", + "\n", + "Get your API token, then run:\n", + "```\n", + "! nomic login\n", + "```\n", + "\n", + "Then run with your generated API token \n", + "```\n", + "! 
nomic login < token > \n", + "```\n", + "\n", + "## Packages\n", + "\n", + "For `unstructured`, you will also need `poppler` ([installation instructions](https://pdf2image.readthedocs.io/en/latest/installation.html)) and `tesseract` ([installation instructions](https://tesseract-ocr.github.io/tessdoc/Installation.html)) in your system." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54926b9b-75c2-4cd4-8f14-b3882a0d370b", + "metadata": {}, + "outputs": [], + "source": [ + "! nomic login token" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "febbc459-ebba-4c1a-a52b-fed7731593f8", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "! pip install -U langchain-nomic langchain_community tiktoken langchain-openai chromadb langchain # (newest versions required for multi-modal)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acbdc603-39e2-4a5f-836c-2bbaecd46b0b", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# lock to 0.10.19 due to a persistent bug in more recent versions\n", + "! pip install \"unstructured[all-docs]==0.10.19\" pillow pydantic lxml pillow matplotlib tiktoken" + ] + }, + { + "cell_type": "markdown", + "id": "1e94b3fb-8e3e-4736-be0a-ad881626c7bd", + "metadata": {}, + "source": [ + "## Data Loading\n", + "\n", + "### Partition PDF text and images\n", + " \n", + "Let's look at an example pdfs containing interesting images.\n", + "\n", + "1/ Art from the J Paul Getty museum:\n", + "\n", + " * Here is a [zip file](https://drive.google.com/file/d/18kRKbq2dqAhhJ3DfZRnYcTBEUfYxe1YR/view?usp=sharing) with the PDF and the already extracted images. \n", + "* https://www.getty.edu/publications/resources/virtuallibrary/0892360224.pdf\n", + "\n", + "2/ Famous photographs from library of congress:\n", + "\n", + "* https://www.loc.gov/lcm/pdf/LCM_2020_1112.pdf\n", + "* We'll use this as an example below\n", + "\n", + "We can use `partition_pdf` below from [Unstructured](https://unstructured-io.github.io/unstructured/introduction.html#key-concepts) to extract text and images.\n", + "\n", + "To supply this to extract the images:\n", + "```\n", + "extract_images_in_pdf=True\n", + "```\n", + "\n", + "\n", + "\n", + "If using this zip file, then you can simply process the text only with:\n", + "```\n", + "extract_images_in_pdf=False\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9646b524-71a7-4b2a-bdc8-0b81f77e968f", + "metadata": {}, + "outputs": [], + "source": [ + "# Folder with pdf and extracted images\n", + "from pathlib import Path\n", + "\n", + "# replace with actual path to images\n", + "path = Path(\"../art\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77f096ab-a933-41d0-8f4e-1efc83998fc3", + "metadata": {}, + "outputs": [], + "source": [ + "path.resolve()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc4839c0-8773-4a07-ba59-5364501269b2", + "metadata": {}, + "outputs": [], + "source": [ + "# Extract images, tables, and chunk text\n", + "from unstructured.partition.pdf import partition_pdf\n", + "\n", + "raw_pdf_elements = partition_pdf(\n", + " filename=str(path.resolve()) + \"/getty.pdf\",\n", + " extract_images_in_pdf=False,\n", + " infer_table_structure=True,\n", + " chunking_strategy=\"by_title\",\n", + " max_characters=4000,\n", + " new_after_n_chars=3800,\n", + " combine_text_under_n_chars=2000,\n", + " image_output_dir_path=path,\n", + ")" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "969545ad", + "metadata": {}, + "outputs": [], + "source": [ + "# Categorize text elements by type\n", + "tables = []\n", + "texts = []\n", + "for element in raw_pdf_elements:\n", + " if \"unstructured.documents.elements.Table\" in str(type(element)):\n", + " tables.append(str(element))\n", + " elif \"unstructured.documents.elements.CompositeElement\" in str(type(element)):\n", + " texts.append(str(element))" + ] + }, + { + "cell_type": "markdown", + "id": "5d8e6349-1547-4cbf-9c6f-491d8610ec10", + "metadata": {}, + "source": [ + "## Multi-modal embeddings with our document\n", + "\n", + "We will use [nomic-embed-vision-v1.5](https://huggingface.co/nomic-ai/nomic-embed-vision-v1.5) embeddings. This model is aligned \n", + "to [nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) allowing for multimodal semantic search and Multimodal RAG!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4bc15842-cb95-4f84-9eb5-656b0282a800", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import uuid\n", + "\n", + "import chromadb\n", + "import numpy as np\n", + "from langchain_community.vectorstores import Chroma\n", + "from langchain_nomic import NomicEmbeddings\n", + "from PIL import Image as _PILImage\n", + "\n", + "# Create chroma\n", + "text_vectorstore = Chroma(\n", + " collection_name=\"mm_rag_clip_photos_text\",\n", + " embedding_function=NomicEmbeddings(\n", + " vision_model=\"nomic-embed-vision-v1.5\", model=\"nomic-embed-text-v1.5\"\n", + " ),\n", + ")\n", + "image_vectorstore = Chroma(\n", + " collection_name=\"mm_rag_clip_photos_image\",\n", + " embedding_function=NomicEmbeddings(\n", + " vision_model=\"nomic-embed-vision-v1.5\", model=\"nomic-embed-text-v1.5\"\n", + " ),\n", + ")\n", + "\n", + "# Get image URIs with .jpg extension only\n", + "image_uris = sorted(\n", + " [\n", + " os.path.join(path, image_name)\n", + " for image_name in os.listdir(path)\n", + " if image_name.endswith(\".jpg\")\n", + " ]\n", + ")\n", + "\n", + "# Add images\n", + "image_vectorstore.add_images(uris=image_uris)\n", + "\n", + "# Add documents\n", + "text_vectorstore.add_texts(texts=texts)\n", + "\n", + "# Make retriever\n", + "image_retriever = image_vectorstore.as_retriever()\n", + "text_retriever = text_vectorstore.as_retriever()" + ] + }, + { + "cell_type": "markdown", + "id": "02a186d0-27e0-4820-8092-63b5349dd25d", + "metadata": {}, + "source": [ + "## RAG\n", + "\n", + "`vectorstore.add_images` will store / retrieve images as base64 encoded strings.\n", + "\n", + "These can be passed to [GPT-4V](https://platform.openai.com/docs/guides/vision)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "344f56a8-0dc3-433e-851c-3f7600c7a72b", + "metadata": {}, + "outputs": [], + "source": [ + "import base64\n", + "import io\n", + "from io import BytesIO\n", + "\n", + "import numpy as np\n", + "from PIL import Image\n", + "\n", + "\n", + "def resize_base64_image(base64_string, size=(128, 128)):\n", + " \"\"\"\n", + " Resize an image encoded as a Base64 string.\n", + "\n", + " Args:\n", + " base64_string (str): Base64 string of the original image.\n", + " size (tuple): Desired size of the image as (width, height).\n", + "\n", + " Returns:\n", + " str: Base64 string of the resized image.\n", + " \"\"\"\n", + " # Decode the Base64 string\n", + " img_data = base64.b64decode(base64_string)\n", + " img = Image.open(io.BytesIO(img_data))\n", + "\n", + " # Resize the image\n", + " resized_img = img.resize(size, Image.LANCZOS)\n", + "\n", + " # Save the resized image to a bytes buffer\n", + " buffered = io.BytesIO()\n", + " resized_img.save(buffered, format=img.format)\n", + "\n", + " # Encode the resized image to Base64\n", + " return base64.b64encode(buffered.getvalue()).decode(\"utf-8\")\n", + "\n", + "\n", + "def is_base64(s):\n", + " \"\"\"Check if a string is Base64 encoded\"\"\"\n", + " try:\n", + " return base64.b64encode(base64.b64decode(s)) == s.encode()\n", + " except Exception:\n", + " return False\n", + "\n", + "\n", + "def split_image_text_types(docs):\n", + " \"\"\"Split numpy array images and texts\"\"\"\n", + " images = []\n", + " text = []\n", + " for doc in docs:\n", + " doc = doc.page_content # Extract Document contents\n", + " if is_base64(doc):\n", + " # Resize image to avoid OAI server error\n", + " images.append(\n", + " resize_base64_image(doc, size=(250, 250))\n", + " ) # base64 encoded str\n", + " else:\n", + " text.append(doc)\n", + " return {\"images\": images, \"texts\": text}" + ] + }, + { + "cell_type": "markdown", + "id": "23a2c1d8-fea6-4152-b184-3172dd46c735", + "metadata": {}, + "source": [ + "Currently, we format the inputs using a `RunnableLambda` while we add image support to `ChatPromptTemplates`.\n", + "\n", + "Our runnable follows the classic RAG flow - \n", + "\n", + "* We first compute the context (both \"texts\" and \"images\" in this case) and the question (just a RunnablePassthrough here) \n", + "* Then we pass this into our prompt template, which is a custom function that formats the message for the gpt-4-vision-preview model. \n", + "* And finally we parse the output as a string." 
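Before wiring these helpers into the chain below, it can be worth exercising them once in isolation. This is a minimal sketch, not part of the original notebook: it builds a synthetic JPEG in memory so no file on disk is needed.

```python
import base64
from io import BytesIO

from langchain_core.documents import Document
from PIL import Image

# Build a small in-memory JPEG and base64-encode it, mimicking what
# `add_images` stores in Chroma
buffer = BytesIO()
Image.new("RGB", (600, 400), color=(120, 160, 200)).save(buffer, format="JPEG")
original_b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")

# resize_base64_image shrinks the payload; is_base64 should still recognize it
small_b64 = resize_base64_image(original_b64, size=(128, 128))
print(is_base64(small_b64), len(original_b64), "->", len(small_b64))

# split_image_text_types routes retrieved docs into the structure the chain expects
routed = split_image_text_types(
    [Document(page_content=small_b64), Document(page_content="A plain text caption.")]
)
print(len(routed["images"]), "image(s),", len(routed["texts"]), "text chunk(s)")
```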
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d8919dc-c238-4746-86ba-45d940a7d260", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c93fab3-74c4-4f1d-958a-0bc4cdd0797e", + "metadata": {}, + "outputs": [], + "source": [ + "from operator import itemgetter\n", + "\n", + "from langchain_core.messages import HumanMessage, SystemMessage\n", + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain_core.runnables import RunnableLambda, RunnablePassthrough\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "\n", + "def prompt_func(data_dict):\n", + " # Joining the context texts into a single string\n", + " formatted_texts = \"\\n\".join(data_dict[\"text_context\"][\"texts\"])\n", + " messages = []\n", + "\n", + " # Adding image(s) to the messages if present\n", + " if data_dict[\"image_context\"][\"images\"]:\n", + " image_message = {\n", + " \"type\": \"image_url\",\n", + " \"image_url\": {\n", + " \"url\": f\"data:image/jpeg;base64,{data_dict['image_context']['images'][0]}\"\n", + " },\n", + " }\n", + " messages.append(image_message)\n", + "\n", + " # Adding the text message for analysis\n", + " text_message = {\n", + " \"type\": \"text\",\n", + " \"text\": (\n", + " \"As an expert art critic and historian, your task is to analyze and interpret images, \"\n", + " \"considering their historical and cultural significance. Alongside the images, you will be \"\n", + " \"provided with related text to offer context. Both will be retrieved from a vectorstore based \"\n", + " \"on user-input keywords. Please use your extensive knowledge and analytical skills to provide a \"\n", + " \"comprehensive summary that includes:\\n\"\n", + " \"- A detailed description of the visual elements in the image.\\n\"\n", + " \"- The historical and cultural context of the image.\\n\"\n", + " \"- An interpretation of the image's symbolism and meaning.\\n\"\n", + " \"- Connections between the image and the related text.\\n\\n\"\n", + " f\"User-provided keywords: {data_dict['question']}\\n\\n\"\n", + " \"Text and / or tables:\\n\"\n", + " f\"{formatted_texts}\"\n", + " ),\n", + " }\n", + " messages.append(text_message)\n", + "\n", + " return [HumanMessage(content=messages)]\n", + "\n", + "\n", + "model = ChatOpenAI(temperature=0, model=\"gpt-4-vision-preview\", max_tokens=1024)\n", + "\n", + "# RAG pipeline\n", + "chain = (\n", + " {\n", + " \"text_context\": text_retriever | RunnableLambda(split_image_text_types),\n", + " \"image_context\": image_retriever | RunnableLambda(split_image_text_types),\n", + " \"question\": RunnablePassthrough(),\n", + " }\n", + " | RunnableLambda(prompt_func)\n", + " | model\n", + " | StrOutputParser()\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1566096d-97c2-4ddc-ba4a-6ef88c525e4e", + "metadata": {}, + "source": [ + "## Test retrieval and run RAG" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90121e56-674b-473b-871d-6e4753fd0c45", + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import HTML, display\n", + "\n", + "\n", + "def plt_img_base64(img_base64):\n", + " # Create an HTML img tag with the base64 string as the source\n", + " image_html = f''\n", + "\n", + " # Display the image by rendering the HTML\n", + " display(HTML(image_html))\n", + "\n", + "\n", + "docs = text_retriever.invoke(\"Women with children\", k=5)\n", + "for 
doc in docs:\n", + " if is_base64(doc.page_content):\n", + " plt_img_base64(doc.page_content)\n", + " else:\n", + " print(doc.page_content)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44eaa532-f035-4c04-b578-02339d42554c", + "metadata": {}, + "outputs": [], + "source": [ + "docs = image_retriever.invoke(\"Women with children\", k=5)\n", + "for doc in docs:\n", + " if is_base64(doc.page_content):\n", + " plt_img_base64(doc.page_content)\n", + " else:\n", + " print(doc.page_content)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "69fb15fd-76fc-49b4-806d-c4db2990027d", + "metadata": {}, + "outputs": [], + "source": [ + "chain.invoke(\"Women with children\")" + ] + }, + { + "cell_type": "markdown", + "id": "227f08b8-e732-4089-b65c-6eb6f9e48f15", + "metadata": {}, + "source": [ + "We can see the images retrieved in the LangSmith trace:\n", + "\n", + "LangSmith [trace](https://smith.langchain.com/public/69c558a5-49dc-4c60-a49b-3adbb70f74c5/r/e872c2c8-528c-468f-aefd-8b5cd730a673)." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/scripts/arxiv_references.py b/docs/scripts/arxiv_references.py index 914c58e7d6e..1511c2ee3e1 100644 --- a/docs/scripts/arxiv_references.py +++ b/docs/scripts/arxiv_references.py @@ -515,7 +515,8 @@ def log_results(arxiv_id2type2key2urls): def generate_arxiv_references_page(file_name: Path, papers: list[ArxivPaper]) -> None: with open(file_name, "w") as f: # Write the table headers - f.write("""# arXiv + f.write( + """# arXiv LangChain implements the latest research in the field of Natural Language Processing. 
This page contains `arXiv` papers referenced in the LangChain Documentation, API Reference, @@ -525,7 +526,8 @@ This page contains `arXiv` papers referenced in the LangChain Documentation, API | arXiv id / Title | Authors | Published date 🔻 | LangChain Documentation| |------------------|---------|-------------------|------------------------| -""") +""" + ) for paper in papers: refs = [] if paper.referencing_doc2url: @@ -595,7 +597,8 @@ This page contains `arXiv` papers referenced in the LangChain Documentation, API if el ] ) - f.write(f""" + f.write( + f""" ## {paper.title} - **arXiv id:** {paper.arxiv_id} @@ -608,7 +611,8 @@ This page contains `arXiv` papers referenced in the LangChain Documentation, API {refs} **Abstract:** {paper.abstract} - """) + """ + ) logger.warning(f"Created the {file_name} file with {len(papers)} arXiv references.") diff --git a/libs/partners/nomic/langchain_nomic/__init__.py b/libs/partners/nomic/langchain_nomic/__init__.py index 75f3facd507..01326dd75cd 100644 --- a/libs/partners/nomic/langchain_nomic/__init__.py +++ b/libs/partners/nomic/langchain_nomic/__init__.py @@ -1,5 +1,3 @@ from langchain_nomic.embeddings import NomicEmbeddings -__all__ = [ - "NomicEmbeddings", -] +__all__ = ["NomicEmbeddings"] diff --git a/libs/partners/nomic/langchain_nomic/embeddings.py b/libs/partners/nomic/langchain_nomic/embeddings.py index 5c690276252..1dd2fa60fc2 100644 --- a/libs/partners/nomic/langchain_nomic/embeddings.py +++ b/libs/partners/nomic/langchain_nomic/embeddings.py @@ -22,6 +22,7 @@ class NomicEmbeddings(Embeddings): self, *, model: str, + nomic_api_key: Optional[str] = ..., dimensionality: Optional[int] = ..., inference_mode: Literal["remote"] = ..., ): @@ -32,6 +33,7 @@ class NomicEmbeddings(Embeddings): self, *, model: str, + nomic_api_key: Optional[str] = ..., dimensionality: Optional[int] = ..., inference_mode: Literal["local", "dynamic"], device: Optional[str] = ..., @@ -43,6 +45,7 @@ class NomicEmbeddings(Embeddings): self, *, model: str, + nomic_api_key: Optional[str] = ..., dimensionality: Optional[int] = ..., inference_mode: str, device: Optional[str] = ..., @@ -57,6 +60,7 @@ class NomicEmbeddings(Embeddings): dimensionality: Optional[int] = None, inference_mode: str = "remote", device: Optional[str] = None, + vision_model: Optional[str] = None, ): """Initialize NomicEmbeddings model. @@ -80,6 +84,7 @@ class NomicEmbeddings(Embeddings): self.dimensionality = dimensionality self.inference_mode = inference_mode self.device = device + self.vision_model = vision_model def embed(self, texts: List[str], *, task_type: str) -> List[List[float]]: """Embed texts. @@ -121,3 +126,9 @@ class NomicEmbeddings(Embeddings): texts=[text], task_type="search_query", )[0] + + def embed_image(self, uris: List[str]) -> List[List[float]]: + return embed.image( + images=uris, + model=self.vision_model, + )["embeddings"] diff --git a/libs/partners/nomic/poetry.lock b/libs/partners/nomic/poetry.lock index 2e015a30a39..2bdd02d72a7 100644 --- a/libs/partners/nomic/poetry.lock +++ b/libs/partners/nomic/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 
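The checklist at the top of this PR asks for unit tests that avoid network access. One possible shape for such a test of the new `embed_image` method is sketched below. It assumes `langchain_nomic/embeddings.py` imports the Nomic SDK as `from nomic import embed` (the diff above calls `embed.image(...)`) and that `NomicEmbeddings` can be constructed without a live API key; both assumptions should be checked against the actual module, and the image path is a placeholder that is never read because the SDK call is mocked.

```python
from unittest.mock import patch

from langchain_nomic import NomicEmbeddings


def test_embed_image_uses_vision_model() -> None:
    # Fake the Nomic SDK response so the test never hits the network
    fake_response = {"embeddings": [[0.1, 0.2, 0.3]]}
    with patch(
        "langchain_nomic.embeddings.embed.image", return_value=fake_response
    ) as mock_image:
        embedder = NomicEmbeddings(
            model="nomic-embed-text-v1.5",
            vision_model="nomic-embed-vision-v1.5",
        )
        vectors = embedder.embed_image(uris=["tests/examples/sample.jpg"])

    # embed_image should forward the URIs and the configured vision model
    mock_image.assert_called_once_with(
        images=["tests/examples/sample.jpg"], model="nomic-embed-vision-v1.5"
    )
    assert vectors == [[0.1, 0.2, 0.3]]
```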
[[package]] name = "annotated-types" @@ -276,7 +276,7 @@ files = [ [[package]] name = "langchain-core" -version = "0.2.0rc1" +version = "0.2.3" description = "Building applications with LLMs through composability" optional = false python-versions = ">=3.8.1,<4.0" @@ -285,7 +285,7 @@ develop = true [package.dependencies] jsonpatch = "^1.33" -langsmith = "^0.1.0" +langsmith = "^0.1.65" packaging = "^23.2" pydantic = ">=1,<3" PyYAML = ">=5.3" @@ -300,13 +300,13 @@ url = "../../core" [[package]] name = "langsmith" -version = "0.1.58" +version = "0.1.65" description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." optional = false python-versions = "<4.0,>=3.8.1" files = [ - {file = "langsmith-0.1.58-py3-none-any.whl", hash = "sha256:1148cc836ec99d1b2f37cd2fa3014fcac213bb6bad798a2b21bb9111c18c9768"}, - {file = "langsmith-0.1.58.tar.gz", hash = "sha256:a5060933c1fb3006b498ec849677993329d7e6138bdc2ec044068ab806e09c39"}, + {file = "langsmith-0.1.65-py3-none-any.whl", hash = "sha256:ab4487029240e69cca30da1065f1e9138e5a7ca2bbe8c697f0bd7d5839f71cf7"}, + {file = "langsmith-0.1.65.tar.gz", hash = "sha256:d3c2eb2391478bd79989f02652cf66e29a7959d677614b6993a47cef43f7f43b"}, ] [package.dependencies] @@ -1309,4 +1309,4 @@ dev = ["black (>=19.3b0)", "pytest (>=4.6.2)"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "369d2f7218d797a01a533380de9cce01037963f628dce10bc9927eac014edeeb" +content-hash = "bf51336a3b4035385ddd68946aa5bbe699f4b805dd0503ba1dd1454a69248616" diff --git a/libs/partners/nomic/pyproject.toml b/libs/partners/nomic/pyproject.toml index c3c822c830a..759e27ddfdb 100644 --- a/libs/partners/nomic/pyproject.toml +++ b/libs/partners/nomic/pyproject.toml @@ -14,6 +14,7 @@ license = "MIT" python = ">=3.8.1,<4.0" langchain-core = ">=0.1.46,<0.3" nomic = "^3.0.29" +pillow = "^10.3.0" [tool.poetry.group.test] optional = true diff --git a/templates/rag-multi-modal-local/README.md b/templates/rag-multi-modal-local/README.md index 4e34795b9dd..ed61d1a9f7d 100644 --- a/templates/rag-multi-modal-local/README.md +++ b/templates/rag-multi-modal-local/README.md @@ -7,11 +7,11 @@ With the release of open source, multi-modal LLMs it's possible to build this ki This template demonstrates how to perform private visual search and question-answering over a collection of your photos. -It uses OpenCLIP embeddings to embed all of the photos and stores them in Chroma. +It uses [`nomic-embed-vision-v1`](https://huggingface.co/nomic-ai/nomic-embed-vision-v1) multi-modal embeddings to embed the images and `Ollama` for question-answering. Given a question, relevant photos are retrieved and passed to an open source multi-modal LLM of your choice for answer synthesis. 
-![Diagram illustrating the visual search process with OpenCLIP embeddings and multi-modal LLM for question-answering, featuring example food pictures and a matcha soft serve answer trace.](https://github.com/langchain-ai/langchain/assets/122662504/da543b21-052c-4c43-939e-d4f882a45d75 "Visual Search Process Diagram")
+![Diagram illustrating the visual search process with nomic-embed-vision-v1 embeddings and multi-modal LLM for question-answering, featuring example food pictures and a matcha soft serve answer trace.](https://github.com/langchain-ai/langchain/assets/122662504/da543b21-052c-4c43-939e-d4f882a45d75 "Visual Search Process Diagram")
 
 ## Input
 
@@ -34,22 +34,23 @@ python ingest.py
 
 ## Storage
 
-This template will use [OpenCLIP](https://github.com/mlfoundations/open_clip) multi-modal embeddings to embed the images.
-
-You can select different embedding model options (see results [here](https://github.com/mlfoundations/open_clip/blob/main/docs/openclip_results.csv)).
+This template will use [nomic-embed-vision-v1](https://huggingface.co/nomic-ai/nomic-embed-vision-v1) multi-modal embeddings to embed the images.
 
 The first time you run the app, it will automatically download the multimodal embedding model.
 
-By default, LangChain will use an embedding model with moderate performance but lower memory requirments, `ViT-H-14`.
-You can choose alternative `OpenCLIPEmbeddings` models in `rag_chroma_multi_modal/ingest.py`:
+You can choose alternative models in `rag_chroma_multi_modal/ingest.py`, such as `OpenCLIPEmbeddings`.
 
 ```
+from langchain_experimental.open_clip import OpenCLIPEmbeddings
+
+embedding_function = OpenCLIPEmbeddings(
+    model_name="ViT-H-14", checkpoint="laion2b_s32b_b79k"
+)
+
 vectorstore_mmembd = Chroma(
     collection_name="multi-modal-rag",
     persist_directory=str(re_vectorstore_path),
-    embedding_function=OpenCLIPEmbeddings(
-        model_name="ViT-H-14", checkpoint="laion2b_s32b_b79k"
-    ),
+    embedding_function=embedding_function
 )
 ```
 
diff --git a/templates/rag-multi-modal-local/ingest.py b/templates/rag-multi-modal-local/ingest.py
index 9aad0cf6568..1b2faa622f7 100644
--- a/templates/rag-multi-modal-local/ingest.py
+++ b/templates/rag-multi-modal-local/ingest.py
@@ -2,7 +2,7 @@ import os
 from pathlib import Path
 
 from langchain_community.vectorstores import Chroma
-from langchain_experimental.open_clip import OpenCLIPEmbeddings
+from langchain_nomic import NomicMultimodalEmbeddings
 
 # Load images
 img_dump_path = Path(__file__).parent / "docs/"
@@ -21,7 +21,9 @@ re_vectorstore_path = vectorstore.relative_to(Path.cwd())
 
 # Load embedding function
 print("Loading embedding function")
-embedding = OpenCLIPEmbeddings(model_name="ViT-H-14", checkpoint="laion2b_s32b_b79k")
+embedding = NomicMultimodalEmbeddings(
+    vision_model="nomic-embed-vision-v1", text_model="nomic-embed-text-v1"
+)
 
 # Create chroma
 vectorstore_mmembd = Chroma(
diff --git a/templates/rag-multi-modal-local/rag_multi_modal_local/chain.py b/templates/rag-multi-modal-local/rag_multi_modal_local/chain.py
index ce09cee0749..c4df1253d93 100644
--- a/templates/rag-multi-modal-local/rag_multi_modal_local/chain.py
+++ b/templates/rag-multi-modal-local/rag_multi_modal_local/chain.py
@@ -9,7 +9,7 @@ from langchain_core.messages import HumanMessage
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.pydantic_v1 import BaseModel
 from langchain_core.runnables import RunnableLambda, RunnablePassthrough
-from langchain_experimental.open_clip import OpenCLIPEmbeddings
+from langchain_nomic import NomicMultimodalEmbeddings
from PIL import Image @@ -102,8 +102,8 @@ def multi_modal_rag_chain(retriever): vectorstore_mmembd = Chroma( collection_name="multi-modal-rag", persist_directory=str(Path(__file__).parent.parent / "chroma_db_multi_modal"), - embedding_function=OpenCLIPEmbeddings( - model_name="ViT-H-14", checkpoint="laion2b_s32b_b79k" + embedding_function=NomicMultimodalEmbeddings( + vision_model="nomic-embed-vision-v1", text_model="nomic-embed-text-v1" ), )
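
Taken together, the template changes above mean ingestion and the chain now share the same Nomic multimodal embeddings. A rough usage sketch follows; it assumes `ingest.py` has already populated the Chroma collection, that `NomicMultimodalEmbeddings` is importable from `langchain_nomic` as the template's imports suggest, and that the template's `multi_modal_rag_chain` returns a runnable — none of which is re-verified here, and the question is only an example.

```python
from pathlib import Path

from langchain_community.vectorstores import Chroma
from langchain_nomic import NomicMultimodalEmbeddings
from rag_multi_modal_local.chain import multi_modal_rag_chain

# Re-open the collection written by ingest.py (persist path assumed; adjust as needed)
vectorstore_mmembd = Chroma(
    collection_name="multi-modal-rag",
    persist_directory=str(Path("chroma_db_multi_modal")),
    embedding_function=NomicMultimodalEmbeddings(
        vision_model="nomic-embed-vision-v1", text_model="nomic-embed-text-v1"
    ),
)

# Retrieve relevant photos and hand them to the template's multimodal chain
retriever = vectorstore_mmembd.as_retriever()
chain = multi_modal_rag_chain(retriever)
print(chain.invoke("What desserts show up in my photos?"))
```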