langchain/libs/partners/nomic/langchain_nomic/embeddings.py

from __future__ import annotations

import os
from typing import Literal, Optional, overload

import nomic  # type: ignore[import]
from langchain_core.embeddings import Embeddings
from nomic import embed


class NomicEmbeddings(Embeddings):
    """NomicEmbeddings embedding model.

    Example:

        .. code-block:: python

            from langchain_nomic import NomicEmbeddings

            model = NomicEmbeddings()

    """

    @overload
    def __init__(
        self,
        *,
        model: str,
        nomic_api_key: Optional[str] = ...,
        dimensionality: Optional[int] = ...,
        inference_mode: Literal["remote"] = ...,
    ): ...

    @overload
    def __init__(
        self,
        *,
        model: str,
        nomic_api_key: Optional[str] = ...,
        dimensionality: Optional[int] = ...,
        inference_mode: Literal["local", "dynamic"],
        device: Optional[str] = ...,
    ): ...

    @overload
    def __init__(
        self,
        *,
        model: str,
        nomic_api_key: Optional[str] = ...,
        dimensionality: Optional[int] = ...,
        inference_mode: str,
        device: Optional[str] = ...,
    ): ...

    def __init__(
        self,
        *,
        model: str,
        nomic_api_key: Optional[str] = None,
        dimensionality: Optional[int] = None,
        inference_mode: str = "remote",
        device: Optional[str] = None,
        vision_model: Optional[str] = None,
    ):
        """Initialize NomicEmbeddings model.

        Args:
            model: model name
            nomic_api_key: optionally, set the Nomic API key. Uses the ``NOMIC_API_KEY``
                environment variable by default.
            dimensionality: The embedding dimension, for use with Matryoshka-capable
                models. Defaults to full-size.
            inference_mode: How to generate embeddings. One of ``'remote'``, ``'local'``
                (Embed4All), or ``'dynamic'`` (automatic). Defaults to ``'remote'``.
            device: The device to use for local embeddings. Choices include
                ``'cpu'``, ``'gpu'``, ``'nvidia'``, ``'amd'``, or a specific device
                name. See the docstring for ``GPT4All.__init__`` for more info.
                Typically defaults to ``'cpu'``. Do not use on macOS.
            vision_model: The vision model to use for image embeddings.

        """
        _api_key = nomic_api_key or os.environ.get("NOMIC_API_KEY")
        if _api_key:
            nomic.login(_api_key)
        self.model = model
        self.dimensionality = dimensionality
        self.inference_mode = inference_mode
        self.device = device
        self.vision_model = vision_model

    def embed(self, texts: list[str], *, task_type: str) -> list[list[float]]:
        """Embed texts.

        Args:
            texts: list of texts to embed
            task_type: the task type to use when embedding. One of ``'search_query'``,
                ``'search_document'``, ``'classification'``, ``'clustering'``

        """
        output = embed.text(
            texts=texts,
            model=self.model,
            task_type=task_type,
            dimensionality=self.dimensionality,
            inference_mode=self.inference_mode,
            device=self.device,
        )
        return output["embeddings"]

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed search docs.

        Args:
            texts: list of texts to embed as documents

        """
        return self.embed(
            texts=texts,
            task_type="search_document",
        )

    def embed_query(self, text: str) -> list[float]:
        """Embed query text.

        Args:
            text: query text

        """
        return self.embed(
            texts=[text],
            task_type="search_query",
        )[0]

    def embed_image(self, uris: list[str]) -> list[list[float]]:
        return embed.image(
            images=uris,
            model=self.vision_model,
        )["embeddings"]