Harrison/llama (#2314)
Co-authored-by: RJ Adriaansen <adriaansen@eshcc.eur.nl>
This commit is contained in:
parent 595ebe1796 · commit d85f57ef9c
docs/ecosystem/llamacpp.md (new file, 26 lines)
@@ -0,0 +1,26 @@
# Llama.cpp

This page covers how to use [llama.cpp](https://github.com/ggerganov/llama.cpp) within LangChain.
It is broken into two parts: installation and setup, and then references to specific Llama.cpp wrappers.

## Installation and Setup

- Install the Python package with `pip install llama-cpp-python`
- Download one of the [supported models](https://github.com/ggerganov/llama.cpp#description) and convert it to the llama.cpp format per the [instructions](https://github.com/ggerganov/llama.cpp); one way to script that step is sketched below
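
The following is a minimal sketch of that download-and-convert step, mirroring the integration tests added in this commit. The Alpaca checkpoint, tokenizer, and conversion-script URLs are examples taken from those tests and may change upstream; the helper name is illustrative.

```python
import os
from urllib.request import urlretrieve

# Example URLs copied from this commit's integration tests; substitute your own model.
MODEL_URL = "https://huggingface.co/Sosaka/Alpaca-native-4bit-ggml/resolve/main/ggml-alpaca-7b-q4.bin"
TOKENIZER_URL = "https://huggingface.co/decapoda-research/llama-7b-hf/resolve/main/tokenizer.model"
CONVERT_SCRIPT_URL = "https://github.com/ggerganov/llama.cpp/raw/master/convert-unversioned-ggml-to-ggml.py"


def fetch_and_convert_model() -> str:
    """Download the checkpoint, tokenizer, and conversion script, then convert in place."""
    model_file = MODEL_URL.split("/")[-1]
    if not os.path.exists("convert-unversioned-ggml-to-ggml.py"):
        urlretrieve(CONVERT_SCRIPT_URL, "convert-unversioned-ggml-to-ggml.py")
    if not os.path.exists("tokenizer.model"):
        urlretrieve(TOKENIZER_URL, "tokenizer.model")
    if not os.path.exists(model_file):
        urlretrieve(MODEL_URL, model_file)
        # Convert the downloaded checkpoint to the current ggml format.
        os.system("python convert-unversioned-ggml-to-ggml.py . tokenizer.model")
    return model_file
```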

## Wrappers

### LLM

There exists a LlamaCpp LLM wrapper, which you can access with

```python
from langchain.llms import LlamaCpp
```

For a more detailed walkthrough of this, see [this notebook](../modules/models/llms/integrations/llamacpp.ipynb)
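
A minimal usage sketch, mirroring the notebook linked above (the model path is a placeholder for your converted ggml file):

```python
from langchain import LLMChain, PromptTemplate
from langchain.llms import LlamaCpp

# Point this at your converted ggml model file.
llm = LlamaCpp(model_path="./ggml-model-q4_0.bin")

prompt = PromptTemplate(
    template="Question: {question}\n\nAnswer: Let's think step by step.",
    input_variables=["question"],
)
llm_chain = LLMChain(prompt=prompt, llm=llm)
llm_chain.run("What NFL team won the Super Bowl in the year Justin Bieber was born?")
```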

### Embeddings

There exists a LlamaCpp Embeddings wrapper, which you can access with

```python
from langchain.embeddings import LlamaCppEmbeddings
```

For a more detailed walkthrough of this, see [this notebook](../modules/models/text_embedding/examples/llamacpp.ipynb)
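
A minimal usage sketch, mirroring the embeddings notebook linked above (again, the model path is a placeholder):

```python
from langchain.embeddings import LlamaCppEmbeddings

# Point this at your converted ggml model file.
llama = LlamaCppEmbeddings(model_path="/path/to/model/ggml-model-q4_0.bin")

text = "This is a test document."
query_result = llama.embed_query(text)      # one embedding vector
doc_result = llama.embed_documents([text])  # a list of embedding vectors
```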
docs/modules/models/llms/integrations/llamacpp.ipynb (new file, 98 lines)
@@ -0,0 +1,98 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install llama-cpp-python"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.llms import LlamaCpp\n",
    "from langchain import PromptTemplate, LLMChain"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "template = \"\"\"Question: {question}\n",
    "\n",
    "Answer: Let's think step by step.\"\"\"\n",
    "\n",
    "prompt = PromptTemplate(template=template, input_variables=[\"question\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "llm = LlamaCpp(model_path=\"./ggml-model-q4_0.bin\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "llm_chain = LLMChain(prompt=prompt, llm=llm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\n\\nWe know that Justin Bieber is currently 25 years old and that he was born on March 1st, 1994 and that he is a singer and he has an album called Purpose, so we know that he was born when Super Bowl XXXVIII was played between Dallas and Seattle and that it took place February 1st, 2004 and that the Seattle Seahawks won 24-21, so Seattle is our answer!'"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "question = \"What NFL team won the Super Bowl in the year Justin Bieber was born?\"\n",
    "\n",
    "llm_chain.run(question)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "workspace",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
docs/modules/models/text_embedding/examples/llamacpp.ipynb (new file, 66 lines)
@@ -0,0 +1,66 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install llama-cpp-python"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.embeddings import LlamaCppEmbeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "llama = LlamaCppEmbeddings(model_path=\"/path/to/model/ggml-model-q4_0.bin\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "text = \"This is a test document.\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query_result = llama.embed_query(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "doc_result = llama.embed_documents([text])"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
@@ -52,6 +52,9 @@ The following use cases require specific installs and api keys:
  - If you want to set up OpenSearch on your local, [here](https://opensearch.org/docs/latest/)
- _DeepLake_:
  - Install requirements with `pip install deeplake`
- _LlamaCpp_:
  - Install requirements with `pip install llama-cpp-python`
  - Download model and convert following [llama.cpp instructions](https://github.com/ggerganov/llama.cpp)

If you are using the `NLTKTextSplitter` or the `SpacyTextSplitter`, you will also need to install the appropriate models. For example, if you want to use the `SpacyTextSplitter`, you will need to install the `en_core_web_sm` model with `python -m spacy download en_core_web_sm`. Similarly, if you want to use the `NLTKTextSplitter`, you will need to install the `punkt` model with `python -m nltk.downloader punkt`.
@@ -31,6 +31,7 @@ from langchain.llms import (
    ForefrontAI,
    GooseAI,
    HuggingFaceHub,
    LlamaCpp,
    Modal,
    OpenAI,
    Petals,
@@ -110,4 +111,5 @@ __all__ = [
    "PALChain",
    "set_handler",
    "set_tracing_callback_manager",
    "LlamaCpp",
]
@@ -14,6 +14,7 @@ from langchain.embeddings.huggingface import (
)
from langchain.embeddings.huggingface_hub import HuggingFaceHubEmbeddings
from langchain.embeddings.jina import JinaEmbeddings
from langchain.embeddings.llamacpp import LlamaCppEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.sagemaker_endpoint import SagemakerEndpointEmbeddings
from langchain.embeddings.self_hosted import SelfHostedEmbeddings
@@ -30,6 +31,7 @@ __all__ = [
    "HuggingFaceEmbeddings",
    "CohereEmbeddings",
    "JinaEmbeddings",
    "LlamaCppEmbeddings",
    "HuggingFaceHubEmbeddings",
    "TensorflowHubEmbeddings",
    "SagemakerEndpointEmbeddings",
langchain/embeddings/llamacpp.py (new file, 118 lines)
@@ -0,0 +1,118 @@
"""Wrapper around llama.cpp embedding models."""
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, Extra, Field, root_validator

from langchain.embeddings.base import Embeddings


class LlamaCppEmbeddings(BaseModel, Embeddings):
    """Wrapper around llama.cpp embedding models.

    To use, you should have the llama-cpp-python library installed, and provide the
    path to the Llama model as a named parameter to the constructor.
    Check out: https://github.com/abetlen/llama-cpp-python

    Example:
        .. code-block:: python

            from langchain.embeddings import LlamaCppEmbeddings
            llama = LlamaCppEmbeddings(model_path="/path/to/model.bin")
    """

    client: Any  #: :meta private:
    model_path: str

    n_ctx: int = Field(512, alias="n_ctx")
    """Token context window."""

    n_parts: int = Field(-1, alias="n_parts")
    """Number of parts to split the model into.
    If -1, the number of parts is automatically determined."""

    seed: int = Field(-1, alias="seed")
    """Seed. If -1, a random seed is used."""

    f16_kv: bool = Field(False, alias="f16_kv")
    """Use half-precision for key/value cache."""

    logits_all: bool = Field(False, alias="logits_all")
    """Return logits for all tokens, not just the last token."""

    vocab_only: bool = Field(False, alias="vocab_only")
    """Only load the vocabulary, no weights."""

    use_mlock: bool = Field(False, alias="use_mlock")
    """Force system to keep model in RAM."""

    n_threads: Optional[int] = Field(None, alias="n_threads")
    """Number of threads to use. If None, the number
    of threads is automatically determined."""

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that llama-cpp-python library is installed."""
        model_path = values["model_path"]
        n_ctx = values["n_ctx"]
        n_parts = values["n_parts"]
        seed = values["seed"]
        f16_kv = values["f16_kv"]
        logits_all = values["logits_all"]
        vocab_only = values["vocab_only"]
        use_mlock = values["use_mlock"]
        n_threads = values["n_threads"]

        try:
            from llama_cpp import Llama

            values["client"] = Llama(
                model_path=model_path,
                n_ctx=n_ctx,
                n_parts=n_parts,
                seed=seed,
                f16_kv=f16_kv,
                logits_all=logits_all,
                vocab_only=vocab_only,
                use_mlock=use_mlock,
                n_threads=n_threads,
                embedding=True,
            )
        except ImportError:
            raise ModuleNotFoundError(
                "Could not import llama-cpp-python library. "
                "Please install the llama-cpp-python library to "
                "use this embedding model: pip install llama-cpp-python"
            )
        except Exception:
            raise NameError(f"Could not load Llama model from path: {model_path}")

        return values

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of documents using the Llama model.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        embeddings = [self.client.embed(text) for text in texts]
        return [list(map(float, e)) for e in embeddings]

    def embed_query(self, text: str) -> List[float]:
        """Embed a query using the Llama model.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        embedding = self.client.embed(text)
        return list(map(float, embedding))
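
For reference, a short usage sketch of this class (the model path is a placeholder; the integration test added below expects 512 floats per vector for the Alpaca checkpoint it downloads):

```python
from langchain.embeddings.llamacpp import LlamaCppEmbeddings

# Placeholder path to a converted ggml model file.
embeddings = LlamaCppEmbeddings(model_path="./ggml-alpaca-7b-q4.bin")

doc_vectors = embeddings.embed_documents(["foo bar"])  # one vector per input text
query_vector = embeddings.embed_query("foo bar")       # a single vector
assert len(doc_vectors) == 1
assert len(doc_vectors[0]) == len(query_vector)
```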
@@ -14,6 +14,7 @@ from langchain.llms.gooseai import GooseAI
from langchain.llms.huggingface_endpoint import HuggingFaceEndpoint
from langchain.llms.huggingface_hub import HuggingFaceHub
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.llms.llamacpp import LlamaCpp
from langchain.llms.modal import Modal
from langchain.llms.nlpcloud import NLPCloud
from langchain.llms.openai import AzureOpenAI, OpenAI, OpenAIChat
@@ -35,6 +36,7 @@ __all__ = [
    "DeepInfra",
    "ForefrontAI",
    "GooseAI",
    "LlamaCpp",
    "Modal",
    "NLPCloud",
    "OpenAI",
@@ -67,6 +69,7 @@ type_to_cls_dict: Dict[str, Type[BaseLLM]] = {
    "gooseai": GooseAI,
    "huggingface_hub": HuggingFaceHub,
    "huggingface_endpoint": HuggingFaceEndpoint,
    "llamacpp": LlamaCpp,
    "modal": Modal,
    "sagemaker_endpoint": SagemakerEndpoint,
    "nlpcloud": NLPCloud,
langchain/llms/llamacpp.py (new file, 184 lines)
@@ -0,0 +1,184 @@
"""Wrapper around llama.cpp."""
import logging
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, Field, root_validator

from langchain.llms.base import LLM

logger = logging.getLogger(__name__)


class LlamaCpp(LLM, BaseModel):
    """Wrapper around the llama.cpp model.

    To use, you should have the llama-cpp-python library installed, and provide the
    path to the Llama model as a named parameter to the constructor.
    Check out: https://github.com/abetlen/llama-cpp-python

    Example:
        .. code-block:: python

            from langchain.llms import LlamaCpp
            llm = LlamaCpp(model_path="/path/to/llama/model")
    """

    client: Any  #: :meta private:
    model_path: str
    """The path to the Llama model file."""

    n_ctx: int = Field(512, alias="n_ctx")
    """Token context window."""

    n_parts: int = Field(-1, alias="n_parts")
    """Number of parts to split the model into.
    If -1, the number of parts is automatically determined."""

    seed: int = Field(-1, alias="seed")
    """Seed. If -1, a random seed is used."""

    f16_kv: bool = Field(False, alias="f16_kv")
    """Use half-precision for key/value cache."""

    logits_all: bool = Field(False, alias="logits_all")
    """Return logits for all tokens, not just the last token."""

    vocab_only: bool = Field(False, alias="vocab_only")
    """Only load the vocabulary, no weights."""

    use_mlock: bool = Field(False, alias="use_mlock")
    """Force system to keep model in RAM."""

    n_threads: Optional[int] = Field(None, alias="n_threads")
    """Number of threads to use.
    If None, the number of threads is automatically determined."""

    suffix: Optional[str] = Field(None)
    """A suffix to append to the generated text. If None, no suffix is appended."""

    max_tokens: Optional[int] = 256
    """The maximum number of tokens to generate."""

    temperature: Optional[float] = 0.8
    """The temperature to use for sampling."""

    top_p: Optional[float] = 0.95
    """The top-p value to use for sampling."""

    logprobs: Optional[int] = Field(None)
    """The number of logprobs to return. If None, no logprobs are returned."""

    echo: Optional[bool] = False
    """Whether to echo the prompt."""

    stop: Optional[List[str]] = []
    """A list of strings to stop generation when encountered."""

    repeat_penalty: Optional[float] = 1.1
    """The penalty to apply to repeated tokens."""

    top_k: Optional[int] = 40
    """The top-k value to use for sampling."""

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that llama-cpp-python library is installed."""
        model_path = values["model_path"]
        n_ctx = values["n_ctx"]
        n_parts = values["n_parts"]
        seed = values["seed"]
        f16_kv = values["f16_kv"]
        logits_all = values["logits_all"]
        vocab_only = values["vocab_only"]
        use_mlock = values["use_mlock"]
        n_threads = values["n_threads"]

        try:
            from llama_cpp import Llama

            values["client"] = Llama(
                model_path=model_path,
                n_ctx=n_ctx,
                n_parts=n_parts,
                seed=seed,
                f16_kv=f16_kv,
                logits_all=logits_all,
                vocab_only=vocab_only,
                use_mlock=use_mlock,
                n_threads=n_threads,
            )
        except ImportError:
            raise ModuleNotFoundError(
                "Could not import llama-cpp-python library. "
                "Please install the llama-cpp-python library to "
                "use this model: pip install llama-cpp-python"
            )
        except Exception:
            raise NameError(f"Could not load Llama model from path: {model_path}")

        return values

    @property
    def _default_params(self) -> Dict[str, Any]:
        """Get the default parameters for calling llama_cpp."""
        return {
            "suffix": self.suffix,
            "max_tokens": self.max_tokens,
            "temperature": self.temperature,
            "top_p": self.top_p,
            "logprobs": self.logprobs,
            "echo": self.echo,
            "stop_sequences": self.stop,
            "repeat_penalty": self.repeat_penalty,
            "top_k": self.top_k,
        }

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        """Get the identifying parameters."""
        return {**{"model_path": self.model_path}, **self._default_params}

    @property
    def _llm_type(self) -> str:
        """Return type of llm."""
        return "llama.cpp"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Call the Llama model and return the output.

        Args:
            prompt: The prompt to use for generation.
            stop: A list of strings to stop generation when encountered.

        Returns:
            The generated text.

        Example:
            .. code-block:: python

                from langchain.llms import LlamaCpp
                llm = LlamaCpp(model_path="/path/to/local/llama/model.bin")
                llm("This is a prompt.")
        """
        params = self._default_params
        if self.stop and stop is not None:
            raise ValueError("`stop` found in both the input and default params.")
        elif self.stop:
            params["stop_sequences"] = self.stop
        elif stop is not None:
            params["stop_sequences"] = stop
        else:
            params["stop_sequences"] = []

        text = self.client(
            prompt=prompt,
            max_tokens=params["max_tokens"],
            temperature=params["temperature"],
            top_p=params["top_p"],
            logprobs=params["logprobs"],
            echo=params["echo"],
            stop=params["stop_sequences"],
            repeat_penalty=params["repeat_penalty"],
            top_k=params["top_k"],
        )
        return text["choices"][0]["text"]
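
For reference, a short usage sketch of this wrapper (the model path and prompt are placeholders; the keyword arguments map to the fields defined above):

```python
from langchain.llms import LlamaCpp

# Placeholder path to a converted ggml model file.
llm = LlamaCpp(
    model_path="./ggml-model-q4_0.bin",
    max_tokens=64,
    temperature=0.2,
    top_p=0.9,
    stop=["Q:"],  # generation halts when this sequence appears
)
print(llm("Q: Name a C/C++ library for running LLaMA locally. A:"))
```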
tests/integration_tests/embeddings/test_llamacpp.py (new file, 46 lines)
@@ -0,0 +1,46 @@
# flake8: noqa
"""Test llamacpp embeddings."""
import os
from urllib.request import urlretrieve

from langchain.embeddings.llamacpp import LlamaCppEmbeddings


def get_model() -> str:
    """Download model.
    From https://huggingface.co/Sosaka/Alpaca-native-4bit-ggml/,
    convert to new ggml format and return model path.
    """
    model_url = "https://huggingface.co/Sosaka/Alpaca-native-4bit-ggml/resolve/main/ggml-alpaca-7b-q4.bin"
    tokenizer_url = "https://huggingface.co/decapoda-research/llama-7b-hf/resolve/main/tokenizer.model"
    conversion_script = "https://github.com/ggerganov/llama.cpp/raw/master/convert-unversioned-ggml-to-ggml.py"
    local_filename = model_url.split("/")[-1]

    if not os.path.exists("convert-unversioned-ggml-to-ggml.py"):
        urlretrieve(conversion_script, "convert-unversioned-ggml-to-ggml.py")
    if not os.path.exists("tokenizer.model"):
        urlretrieve(tokenizer_url, "tokenizer.model")
    if not os.path.exists(local_filename):
        urlretrieve(model_url, local_filename)
        os.system("python convert-unversioned-ggml-to-ggml.py . tokenizer.model")

    return local_filename


def test_llamacpp_embedding_documents() -> None:
    """Test llamacpp embeddings."""
    documents = ["foo bar"]
    model_path = get_model()
    embedding = LlamaCppEmbeddings(model_path=model_path)
    output = embedding.embed_documents(documents)
    assert len(output) == 1
    assert len(output[0]) == 512


def test_llamacpp_embedding_query() -> None:
    """Test llamacpp embeddings."""
    document = "foo bar"
    model_path = get_model()
    embedding = LlamaCppEmbeddings(model_path=model_path)
    output = embedding.embed_query(document)
    assert len(output) == 512
tests/integration_tests/llms/test_llamacpp.py (new file, 34 lines)
@@ -0,0 +1,34 @@
# flake8: noqa
"""Test Llama.cpp wrapper."""
import os
from urllib.request import urlretrieve

from langchain.llms import LlamaCpp


def get_model() -> str:
    """Download model.
    From https://huggingface.co/Sosaka/Alpaca-native-4bit-ggml/,
    convert to new ggml format and return model path."""
    model_url = "https://huggingface.co/Sosaka/Alpaca-native-4bit-ggml/resolve/main/ggml-alpaca-7b-q4.bin"
    tokenizer_url = "https://huggingface.co/decapoda-research/llama-7b-hf/resolve/main/tokenizer.model"
    conversion_script = "https://github.com/ggerganov/llama.cpp/raw/master/convert-unversioned-ggml-to-ggml.py"
    local_filename = model_url.split("/")[-1]

    if not os.path.exists("convert-unversioned-ggml-to-ggml.py"):
        urlretrieve(conversion_script, "convert-unversioned-ggml-to-ggml.py")
    if not os.path.exists("tokenizer.model"):
        urlretrieve(tokenizer_url, "tokenizer.model")
    if not os.path.exists(local_filename):
        urlretrieve(model_url, local_filename)
        os.system("python convert-unversioned-ggml-to-ggml.py . tokenizer.model")

    return local_filename


def test_llamacpp_inference() -> None:
    """Test valid llama.cpp inference."""
    model_path = get_model()
    llm = LlamaCpp(model_path=model_path)
    output = llm("Say foo:")
    assert isinstance(output, str)