implement local Nomic Embed via llama.cpp (#2086)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Jared Van Bortel
2024-03-13 18:09:24 -04:00
committed by GitHub
parent 171f4e488e
commit 406e88b59a
23 changed files with 799 additions and 1198 deletions

View File

@@ -10,7 +10,7 @@ import sys
 import threading
 from enum import Enum
 from queue import Queue
-from typing import Callable, Iterable, List
+from typing import Callable, Iterable, overload

 if sys.version_info >= (3, 9):
     import importlib.resources as importlib_resources
@@ -105,13 +105,18 @@ llmodel.llmodel_prompt.argtypes = [
 llmodel.llmodel_prompt.restype = None

-llmodel.llmodel_embedding.argtypes = [
+llmodel.llmodel_embed.argtypes = [
     ctypes.c_void_p,
-    ctypes.c_char_p,
+    ctypes.POINTER(ctypes.c_char_p),
     ctypes.POINTER(ctypes.c_size_t),
+    ctypes.c_char_p,
+    ctypes.c_int,
+    ctypes.c_bool,
+    ctypes.c_bool,
+    ctypes.POINTER(ctypes.c_char_p),
 ]

-llmodel.llmodel_embedding.restype = ctypes.POINTER(ctypes.c_float)
+llmodel.llmodel_embed.restype = ctypes.POINTER(ctypes.c_float)

 llmodel.llmodel_free_embedding.argtypes = [ctypes.POINTER(ctypes.c_float)]
 llmodel.llmodel_free_embedding.restype = None
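Reading the new argtypes together with the call site in the next hunk, llmodel_embed takes the model handle, an array of C strings, an out-parameter for the total float count, an optional task prefix, the requested dimensionality, the do_mean and atlas flags, and an error out-parameter. A minimal illustrative sketch (not part of this commit) of the NULL-terminated string-array convention the binding builds below with `len(text) + 1` slots:

import ctypes

texts = ["first document", "second document"]
# One extra slot: ctypes zero-initializes the array, so the final
# entry stays NULL and terminates the list for the C side.
c_texts = (ctypes.c_char_p * (len(texts) + 1))()
for i, t in enumerate(texts):
    c_texts[i] = t.encode()
assert c_texts[len(texts)] is None  # trailing NULL terminator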
@@ -287,16 +292,50 @@ class LLModel:
         self.context.repeat_last_n = repeat_last_n
         self.context.context_erase = context_erase

-    def generate_embedding(self, text: str) -> List[float]:
-        if not text:
-            raise ValueError("Text must not be None or empty")
+    @overload
+    def generate_embeddings(
+        self, text: str, prefix: str, dimensionality: int, do_mean: bool, atlas: bool,
+    ) -> list[float]: ...
+    @overload
+    def generate_embeddings(
+        self, text: list[str], prefix: str, dimensionality: int, do_mean: bool, atlas: bool,
+    ) -> list[list[float]]: ...
+
+    def generate_embeddings(self, text, prefix, dimensionality, do_mean, atlas):
+        if not text:
+            raise ValueError("text must not be None or empty")
+
+        single_text = isinstance(text, str)
+        if single_text:
+            text = [text]
+
+        # prepare input
         embedding_size = ctypes.c_size_t()
-        c_text = ctypes.c_char_p(text.encode())
-        embedding_ptr = llmodel.llmodel_embedding(self.model, c_text, ctypes.byref(embedding_size))
-        embedding_array = [embedding_ptr[i] for i in range(embedding_size.value)]
+        error = ctypes.c_char_p()
+        c_prefix = ctypes.c_char_p() if prefix is None else prefix.encode()
+        c_texts = (ctypes.c_char_p * (len(text) + 1))()
+        for i, t in enumerate(text):
+            c_texts[i] = t.encode()
+
+        # generate the embeddings
+        embedding_ptr = llmodel.llmodel_embed(
+            self.model, c_texts, ctypes.byref(embedding_size), c_prefix, dimensionality, do_mean, atlas,
+            ctypes.byref(error),
+        )
+
+        if not embedding_ptr:
+            msg = "(unknown error)" if error.value is None else error.value.decode()
+            raise RuntimeError(f'Failed to generate embeddings: {msg}')
+
+        # extract output
+        n_embd = embedding_size.value // len(text)
+        embedding_array = [
+            embedding_ptr[i:i + n_embd]
+            for i in range(0, embedding_size.value, n_embd)
+        ]
         llmodel.llmodel_free_embedding(embedding_ptr)
-        return list(embedding_array)
+
+        return embedding_array[0] if single_text else embedding_array

     def prompt_model(
         self,
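A brief usage sketch of the reworked low-level API, assuming `model` is an already-loaded LLModel instance; the argument values are illustrative, not taken from this commit:

# Single text -> list[float]; prefix=None skips the task prefix,
# dimensionality=-1 keeps the model's full output size.
vec = model.generate_embeddings("hello world", None, -1, False, False)

# Batch -> list[list[float]]; rows are sliced out of one flat C buffer,
# each embedding_size // len(texts) floats long.
vecs = model.generate_embeddings(["doc one", "doc two"], "search_document", -1, True, False)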

View File

@@ -10,7 +10,7 @@ import time
 import warnings
 from contextlib import contextmanager
 from pathlib import Path
-from typing import Any, Dict, Iterable, List, Optional, Union
+from typing import Any, Dict, Iterable, List, Optional, Union, overload

 import requests
 from requests.exceptions import ChunkedEncodingError
@@ -36,6 +36,8 @@ class Embed4All:
     Python class that handles embeddings for GPT4All.
     """

+    MIN_DIMENSIONALITY = 64
+
     def __init__(self, model_name: Optional[str] = None, n_threads: Optional[int] = None, **kwargs):
         """
         Constructor
@@ -45,17 +47,48 @@
         """
         self.gpt4all = GPT4All(model_name or 'all-MiniLM-L6-v2-f16.gguf', n_threads=n_threads, **kwargs)

-    def embed(self, text: str) -> List[float]:
+    @overload
+    def embed(
+        self, text: str, prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ...,
+        atlas: bool = ...,
+    ) -> list[float]: ...
+    @overload
+    def embed(
+        self, text: list[str], prefix: str | None = ..., dimensionality: int | None = ..., long_text_mode: str = ...,
+        atlas: bool = ...,
+    ) -> list[list[float]]: ...
+
+    def embed(self, text, prefix=None, dimensionality=None, long_text_mode="truncate", atlas=False):
         """
-        Generate an embedding.
+        Generate one or more embeddings.

         Args:
-            text: The text document to generate an embedding for.
+            text: A text or list of texts to generate embeddings for.
+            prefix: The model-specific prefix representing the embedding task, without the trailing colon. For Nomic
+                Embed this can be `search_query`, `search_document`, `classification`, or `clustering`.
+            dimensionality: The embedding dimension, for use with Matryoshka-capable models. Defaults to full-size.
+            long_text_mode: How to handle texts longer than the model can accept. One of `mean` or `truncate`.
+            atlas: Try to be fully compatible with the Atlas API. Currently, this means texts longer than 8192 tokens
+                with long_text_mode="mean" will raise an error. Disabled by default.

         Returns:
-            An embedding of your document of text.
+            An embedding or list of embeddings of your text(s).
         """
-        return self.gpt4all.model.generate_embedding(text)
+        if dimensionality is None:
+            dimensionality = -1
+        else:
+            if dimensionality <= 0:
+                raise ValueError(f'Dimensionality must be None or a positive integer, got {dimensionality}')
+            if dimensionality < self.MIN_DIMENSIONALITY:
+                warnings.warn(
+                    f'Dimensionality {dimensionality} is less than the suggested minimum of {self.MIN_DIMENSIONALITY}.'
+                    ' Performance may be degraded.'
+                )
+        try:
+            do_mean = {"mean": True, "truncate": False}[long_text_mode]
+        except KeyError:
+            raise ValueError(f"Long text mode must be one of 'mean' or 'truncate', got {long_text_mode!r}")
+        return self.gpt4all.model.generate_embeddings(text, prefix, dimensionality, do_mean, atlas)
class GPT4All:
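Putting the two files together, a hedged end-to-end sketch of the new public API. The model filename below is an assumption for illustration (any embedding GGUF the bindings support would do), and task prefixes apply to Nomic Embed models:

from gpt4all import Embed4All

embedder = Embed4All('nomic-embed-text-v1.f16.gguf')  # filename assumed for illustration

# Single query: Nomic Embed task prefix plus Matryoshka truncation to 256 dims.
query_vec = embedder.embed('what is gpt4all?', prefix='search_query', dimensionality=256)

# Batch of documents; over-long texts are mean-pooled chunk by chunk.
doc_vecs = embedder.embed(['doc one', 'doc two'], prefix='search_document', long_text_mode='mean')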