Support for Nvidia TensorRT

imartinez
2024-02-29 19:41:58 +01:00
parent c3fe36e070
commit a7b18058b5
7 changed files with 141 additions and 8 deletions


@@ -47,6 +47,7 @@ Where `<extra>` can be any of the following:
- llms-ollama: adds support for Ollama LLM, the easiest way to get a local LLM running
- llms-llama-cpp: adds support for local LLM using LlamaCPP - expect a messy installation process on some platforms
- llms-sagemaker: adds support for Amazon Sagemaker LLM, requires Sagemaker inference endpoints
- llms-nvidia-tensorrt: adds support for Nvidia TensorRT LLM
- llms-openai: adds support for OpenAI LLM, requires OpenAI API key
- llms-openai-like: adds support for 3rd party LLM providers that are compatible with OpenAI's API
- embeddings-huggingface: adds support for local Embeddings using HuggingFace
@@ -67,7 +68,7 @@ The easiest way to run PrivateGPT fully locally is to depend on Ollama for the L
Go to [ollama.ai](https://ollama.ai/) and follow the instructions to install Ollama on your machine.
Once done, you can install PrivateGPT dependencies with the following command:
```bash
poetry install --extras "ui llms-ollama embeddings-huggingface vector-stores-qdrant"
```
@@ -96,7 +97,7 @@ You need to have access to sagemaker inference endpoints for the LLM and / or th
Edit the `settings-sagemaker.yaml` file to include the correct Sagemaker endpoints.
Then, install PrivateGPT dependencies with the following command:
```bash
poetry install --extras "ui llms-sagemaker embeddings-sagemaker vector-stores-qdrant"
```
@@ -111,9 +112,49 @@ PrivateGPT will use the already existing `settings-sagemaker.yaml` settings file
The UI will be available at http://localhost:8001
### Local, TensorRT-powered setup
To get the most out of NVIDIA GPUs, you can set up a fully local PrivateGPT using TensorRT as its LLM provider. For more information about Nvidia TensorRT, check the [official documentation](https://github.com/NVIDIA/TensorRT-LLM).
Follow these steps to set up a local TensorRT-powered PrivateGPT:
- Nvidia CUDA 12.2 or higher is currently required to run TensorRT-LLM.
- For this example we will use Llama2. The Llama2 model files need to be created via scripts following the instructions [here](https://github.com/NVIDIA/trt-llm-rag-windows/blob/release/1.0/README.md#building-trt-engine).
Following the steps in that link will create these files:
* `llama_float16_tp1_rank0.engine`: The main output of the build script, containing the executable graph of operations with the model weights embedded.
* `config.json`: Includes detailed information about the model, like its general structure and precision, as well as information about which plug-ins were incorporated into the engine.
* `model.cache`: Caches some of the timing and optimization information from model compilation, making successive builds quicker.
- Create a folder inside `models` called `tensorrt`, and move all of the files mentioned above to that directory.
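Before installing, a quick sanity check can confirm the engine artifacts are in place. A minimal sketch, assuming the folder layout described above and the engine name configured in `settings-tensorrt.yaml`:
```python
from pathlib import Path

# Folder created in the last step; the engine artifacts come from the TensorRT-LLM build scripts.
engine_dir = Path("models/tensorrt")
expected = [
    "llama_float16_tp1_rank0.engine",  # must match `engine_name` in settings-tensorrt.yaml
    "config.json",                     # model structure, precision and plug-in info
    "model.cache",                     # timing/optimization cache for faster rebuilds
]

missing = [name for name in expected if not (engine_dir / name).exists()]
if missing:
    raise SystemExit(f"Missing TensorRT engine files in {engine_dir}: {missing}")
print("All TensorRT engine files found.")
```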
Once done, you can install PrivateGPT dependencies with the following command:
```bash
poetry install --extras "ui llms-nvidia-tensorrt embeddings-huggingface vector-stores-qdrant"
```
We also install the `embeddings-huggingface` extra to support local embeddings, because TensorRT only covers the LLM.
In order for local embeddings to work, you need to download the embeddings model to the `models` folder. You can do so by running the `setup` script:
```bash
poetry run python scripts/setup
```
Once installed, you can run PrivateGPT.
```bash
PGPT_PROFILES=tensorrt make run
```
PrivateGPT will use the already existing `settings-tensorrt.yaml` settings file, which is already configured to use Nvidia TensorRT LLM, local Embeddings, and Qdrant. Review it and adapt it to your needs (different LLM model, etc.)
The UI will be available at http://localhost:8001
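To verify the server came up with the TensorRT profile, you can hit the local instance. A minimal sketch, assuming the default port shown above, the `requests` package, and PrivateGPT's health endpoint at `/health`:
```python
import requests

# Assumes PrivateGPT is running locally on the default port used above.
resp = requests.get("http://localhost:8001/health", timeout=10)
resp.raise_for_status()
print(resp.json())  # expected to report a healthy status
```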
### Local, Llama-CPP powered setup
If you want to run PrivateGPT fully locally without relying on Ollama, you can run the following command to install its dependencies:
```bash
poetry install --extras "ui llms-llama-cpp embeddings-huggingface vector-stores-qdrant"
@@ -142,7 +183,7 @@ You need an OPENAI API key to run this setup.
Edit the `settings-openai.yaml` file to include the correct API KEY. Never commit it! It's a secret! As an alternative to editing `settings-openai.yaml`, you can just set the env var OPENAI_API_KEY.
Then, install PrivateGPT dependencies with the following command:
```bash
poetry install --extras "ui llms-openai embeddings-openai vector-stores-qdrant"
```
@@ -159,7 +200,7 @@ The UI will be available at http://localhost:8001
### Local, Llama-CPP powered setup
If you want to run PrivateGPT fully locally without relying on Ollama, you can run the following command to install its dependencies:
```bash
poetry install --extras "ui llms-llama-cpp embeddings-huggingface vector-stores-qdrant"

poetry.lock generated

@@ -2126,6 +2126,22 @@ files = [
llama-cpp-python = ">=0.2.32,<0.3.0"
llama-index-core = ">=0.10.1,<0.11.0"
[[package]]
name = "llama-index-llms-nvidia-tensorrt"
version = "0.1.4"
description = "llama-index llms nvidia tensorrt integration"
optional = true
python-versions = ">=3.8.1,<4.0"
files = [
    {file = "llama_index_llms_nvidia_tensorrt-0.1.4-py3-none-any.whl", hash = "sha256:146b249de86317985d57d1acb89e5af1ef1564462899e6711f1ec97b3ba9ce7c"},
    {file = "llama_index_llms_nvidia_tensorrt-0.1.4.tar.gz", hash = "sha256:7edddbe1ad2bc8f9fc2812853b800c8ad2b610931b870d49ad7d5be920e6dbfc"},
]
[package.dependencies]
llama-index-core = ">=0.10.1,<0.11.0"
torch = ">=2.1.2,<3.0.0"
transformers = ">=4.37.0,<5.0.0"
[[package]]
name = "llama-index-llms-ollama"
version = "0.1.2"
@@ -4856,6 +4872,16 @@ files = [
[package.extras]
doc = ["reno", "sphinx", "tornado (>=4.5)"]
[[package]]
name = "tensorrt-llm"
version = "0.8.0"
description = "A fake package to warn the user they are not installing the correct package."
optional = true
python-versions = ">=3.7, <4"
files = [
    {file = "tensorrt-llm-0.8.0.tar.gz", hash = "sha256:8bd59bf59766bb16f81bd330ca38765a532a21a35d323fd33929c80a6ec53eaf"},
]
[[package]]
name = "tiktoken"
version = "0.5.2"
@@ -5899,6 +5925,7 @@ embeddings-huggingface = ["llama-index-embeddings-huggingface"]
embeddings-openai = ["llama-index-embeddings-openai"]
embeddings-sagemaker = ["boto3"]
llms-llama-cpp = ["llama-index-llms-llama-cpp"]
llms-nvidia-tensorrt = ["llama-index-llms-nvidia-tensorrt", "tensorrt_llm"]
llms-ollama = ["llama-index-llms-ollama"]
llms-openai = ["llama-index-llms-openai"]
llms-openai-like = ["llama-index-llms-openai-like"]
@@ -5911,4 +5938,4 @@ vector-stores-qdrant = ["llama-index-vector-stores-qdrant"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.11,<3.12"
content-hash = "0249c25c783180d0c483c533d9102e3885e4a4f5261dc331a41323bd79d446f3"
content-hash = "68f6ce6bd92419ce73a68c4a260ccf3016b38528c563ccad56d1bb68e7c09aae"


@@ -111,5 +111,20 @@ class LLMComponent:
                self.llm = Ollama(
                    model=ollama_settings.model, base_url=ollama_settings.api_base
                )
            case "tensorrt":
                try:
                    from llama_index.llms.nvidia_tensorrt import LocalTensorRTLLM  # type: ignore
                except ImportError as e:
                    raise ImportError(
                        "Nvidia TensorRTLLM dependencies not found, install with `poetry install --extras llms-nvidia-tensorrt`"
                    ) from e
                prompt_style = get_prompt_style(settings.tensorrt.prompt_style)
                self.llm = LocalTensorRTLLM(
                    model_path=settings.tensorrt.model_path,
                    engine_name=settings.tensorrt.engine_name,
                    tokenizer_dir=settings.llm.tokenizer,
                    completion_to_prompt=prompt_style.completion_to_prompt,
                )
            case "mock":
                self.llm = MockLLM()
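For illustration, the same LlamaIndex integration can be exercised outside the component. A minimal sketch, assuming the `llms-nvidia-tensorrt` extra is installed, the engine files sit in `models/tensorrt` as configured in `settings-tensorrt.yaml`, and `tokenizer_dir` (a placeholder value below) points at the tokenizer matching the built engine:
```python
from llama_index.llms.nvidia_tensorrt import LocalTensorRTLLM

# Mirrors the values wired in above from settings-tensorrt.yaml.
# tokenizer_dir is a placeholder: it must point at the tokenizer matching the built engine.
llm = LocalTensorRTLLM(
    model_path="models/tensorrt",
    engine_name="llama_float16_tp1_rank0.engine",
    tokenizer_dir="meta-llama/Llama-2-7b-chat-hf",
)
print(llm.complete("What is PrivateGPT?"))
```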


@@ -81,7 +81,7 @@ class DataSettings(BaseModel):
class LLMSettings(BaseModel):
    mode: Literal["llamacpp", "openai", "openailike", "sagemaker", "mock", "ollama", "tensorrt"]
    max_new_tokens: int = Field(
        256,
        description="The maximum number of tokens that the LLM is authorized to generate in one completion.",
@@ -120,6 +120,22 @@ class LlamaCPPSettings(BaseModel):
    )


class TensorRTSettings(BaseModel):
    model_path: str
    engine_name: str
    prompt_style: Literal["default", "llama2", "tag", "mistral", "chatml"] = Field(
        "llama2",
        description=(
            "The prompt style to use for the chat engine. "
            "If `default` - use the default prompt style from the llama_index. It should look like `role: message`.\n"
            "If `llama2` - use the llama2 prompt style from the llama_index. Based on `<s>`, `[INST]` and `<<SYS>>`.\n"
            "If `tag` - use the `tag` prompt style. It should look like `<|role|>: message`. \n"
            "If `mistral` - use the `mistral` prompt style. It should look like <s>[INST] {System Prompt} [/INST]</s>[INST] { UserInstructions } [/INST]"
            "`llama2` is the historic behaviour. `default` might work better with your custom models."
        ),
    )


class HuggingFaceSettings(BaseModel):
    embedding_hf_model_name: str = Field(
        description="Name of the HuggingFace model to use for embeddings"
@@ -296,6 +312,7 @@ class Settings(BaseModel):
    llm: LLMSettings
    embedding: EmbeddingSettings
    llamacpp: LlamaCPPSettings
    tensorrt: TensorRTSettings
    huggingface: HuggingFaceSettings
    sagemaker: SagemakerSettings
    openai: OpenAISettings
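For reference, the `tensorrt:` block in `settings-tensorrt.yaml` maps directly onto the new `TensorRTSettings` model above. A minimal sketch of that validation, using the profile values shipped in this commit (the import path is an assumption based on the project's settings module):
```python
from private_gpt.settings.settings import TensorRTSettings

# Values from settings-tensorrt.yaml; prompt_style must be one of the Literal options above.
tensorrt = TensorRTSettings(
    model_path="models/tensorrt",
    engine_name="llama_float16_tp1_rank0.engine",
    prompt_style="llama2",
)
print(tensorrt)
```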


@@ -26,8 +26,11 @@ llama-index-embeddings-openai = {version ="^0.1.6", optional = true}
llama-index-vector-stores-qdrant = {version ="^0.1.3", optional = true}
llama-index-vector-stores-chroma = {version ="^0.1.4", optional = true}
llama-index-vector-stores-postgres = {version ="^0.1.2", optional = true}
llama-index-llms-nvidia-tensorrt = {version ="^0.1.2", optional = true}
# Optional Sagemaker dependency
boto3 = {version ="^1.34.51", optional = true}
# Optional Nvidia TensorRT dependency
tensorrt_llm = {version ="^0.8.0", optional = true}
# Optional UI
gradio = {version ="^4.19.2", optional = true}
@@ -38,6 +41,7 @@ llms-openai = ["llama-index-llms-openai"]
llms-openai-like = ["llama-index-llms-openai-like"]
llms-ollama = ["llama-index-llms-ollama"]
llms-sagemaker = ["boto3"]
llms-nvidia-tensorrt = ["tensorrt_llm", "llama-index-llms-nvidia-tensorrt"]
embeddings-huggingface = ["llama-index-embeddings-huggingface"]
embeddings-openai = ["llama-index-embeddings-openai"]
embeddings-sagemaker = ["boto3"]
@@ -45,7 +49,6 @@ vector-stores-qdrant = ["llama-index-vector-stores-qdrant"]
vector-stores-chroma = ["llama-index-vector-stores-chroma"]
vector-stores-postgres = ["llama-index-vector-stores-postgres"]
[tool.poetry.group.dev.dependencies]
black = "^22"
mypy = "^1.2"

settings-tensorrt.yaml (new file)

@@ -0,0 +1,25 @@
server:
  env_name: ${APP_ENV:tensorrt}

llm:
  mode: tensorrt
  max_new_tokens: 512
  context_window: 3900

tensorrt:
  model_path: models/tensorrt
  engine_name: llama_float16_tp1_rank0.engine
  prompt_style: "llama2"

embedding:
  mode: huggingface

huggingface:
  embedding_hf_model_name: BAAI/bge-small-en-v1.5

vectorstore:
  database: qdrant

qdrant:
  path: local_data/private_gpt/qdrant


@@ -79,3 +79,8 @@ openai:
ollama:
  model: llama2-uncensored

tensorrt:
  model_path: models/tensorrt
  engine_name: llama_float16_tp1_rank0.engine
  prompt_style: "llama2"