From a7b18058b54f831a729f1d9eab186b497e3e42b6 Mon Sep 17 00:00:00 2001
From: imartinez
Date: Thu, 29 Feb 2024 19:41:58 +0100
Subject: [PATCH] Support for Nvidia TensorRT

---
 fern/docs/pages/installation/installation.mdx | 51 +++++++++++++++++--
 poetry.lock                                   | 29 ++++++++++-
 private_gpt/components/llm/llm_component.py   | 15 ++++++
 private_gpt/settings/settings.py              | 19 ++++++-
 pyproject.toml                                |  5 +-
 settings-tensorrt.yaml                        | 25 +++++++++
 settings.yaml                                 |  5 ++
 7 files changed, 141 insertions(+), 8 deletions(-)
 create mode 100644 settings-tensorrt.yaml

diff --git a/fern/docs/pages/installation/installation.mdx b/fern/docs/pages/installation/installation.mdx
index 4ff8b2ee..94ad82d8 100644
--- a/fern/docs/pages/installation/installation.mdx
+++ b/fern/docs/pages/installation/installation.mdx
@@ -47,6 +47,7 @@ Where `<extra>` can be any of the following:
 - llms-ollama: adds support for Ollama LLM, the easiest way to get a local LLM running
 - llms-llama-cpp: adds support for local LLM using LlamaCPP - expect a messy installation process on some platforms
 - llms-sagemaker: adds support for Amazon Sagemaker LLM, requires Sagemaker inference endpoints
+- llms-nvidia-tensorrt: adds support for Nvidia TensorRT LLM
 - llms-openai: adds support for OpenAI LLM, requires OpenAI API key
 - llms-openai-like: adds support for 3rd party LLM providers that are compatible with OpenAI's API
 - embeddings-huggingface: adds support for local Embeddings using HuggingFace
@@ -67,7 +68,7 @@ The easiest way to run PrivateGPT fully locally is to depend on Ollama for the L
 
 Go to [ollama.ai](https://ollama.ai/) and follow the instructions to install Ollama on your machine.
 
-Once done, you can install PrivateGPT with the following command:
+Once done, you can install PrivateGPT dependencies with the following command:
 ```bash
 poetry install --extras "ui llms-ollama embeddings-huggingface vector-stores-qdrant"
 ```
@@ -96,7 +97,7 @@ You need to have access to sagemaker inference endpoints for the LLM and / or th
 
 Edit the `settings-sagemaker.yaml` file to include the correct Sagemaker endpoints.
 
-Then, install PrivateGPT with the following command:
+Then, install PrivateGPT dependencies with the following command:
 ```bash
 poetry install --extras "ui llms-sagemaker embeddings-sagemaker vector-stores-qdrant"
 ```
@@ -111,9 +112,49 @@ PrivateGPT will use the already existing `settings-sagemaker.yaml` settings file
 
 The UI will be available at http://localhost:8001
 
+### Local, TensorRT-powered setup
+
+To get the most out of NVIDIA GPUs, you can set up a fully local PrivateGPT using TensorRT as its LLM provider. For more information about Nvidia TensorRT, check the [official documentation](https://github.com/NVIDIA/TensorRT-LLM).
+
+Follow these steps to set up a local TensorRT-powered PrivateGPT:
+
+- Nvidia CUDA 12.2 or higher is currently required to run TensorRT-LLM.
+
+- For this example, we will use Llama2. The Llama2 model files need to be created via scripts following the instructions [here](https://github.com/NVIDIA/trt-llm-rag-windows/blob/release/1.0/README.md#building-trt-engine).
+The following files will be created by following the steps in the link:
+
+* `Llama_float16_tp1_rank0.engine`: The main output of the build script, containing the executable graph of operations with the model weights embedded.
+
+* `config.json`: Includes detailed information about the model, like its general structure and precision, as well as information about which plug-ins were incorporated into the engine.
+
+* `model.cache`: Caches some of the timing and optimization information from model compilation, making successive builds quicker.
+
+- Create a folder inside `models` called `tensorrt`, and move all of the files mentioned above to that directory.
+
+Once done, you can install PrivateGPT dependencies with the following command:
+```bash
+poetry install --extras "ui llms-nvidia-tensorrt embeddings-huggingface vector-stores-qdrant"
+```
+
+We are installing the "embeddings-huggingface" dependency to support local embeddings, because TensorRT only covers the LLM.
+In order for local embeddings to work, you need to download the embeddings model to the `models` folder. You can do so by running the `setup` script:
+```bash
+poetry run python scripts/setup
+```
+
+Once installed, you can run PrivateGPT.
+
+```bash
+PGPT_PROFILES=tensorrt make run
+```
+
+PrivateGPT will use the already existing `settings-tensorrt.yaml` settings file, which is already configured to use Nvidia TensorRT LLM, local Embeddings, and Qdrant. Review it and adapt it to your needs (different LLM model, etc.)
+
+The UI will be available at http://localhost:8001
+
 ### Local, Llama-CPP powered setup
 
-If you want to run PrivateGPT fully locally without relying on Ollama, you can run the following command:
+If you want to run PrivateGPT fully locally without relying on Ollama, you can run the following command to install its dependencies:
 
 ```bash
 poetry install --extras "ui llms-llama-cpp embeddings-huggingface vector-stores-qdrant"
@@ -142,7 +183,7 @@ You need an OPENAI API key to run this setup.
 
 Edit the `settings-openai.yaml` file to include the correct API KEY. Never commit it! It's a secret! As an alternative to editing `settings-openai.yaml`, you can just set the env var OPENAI_API_KEY.
 
-Then, install PrivateGPT with the following command: +Then, install PrivateGPT dependencies with the following command: ```bash poetry install --extras "ui llms-openai embeddings-openai vector-stores-qdrant" ``` @@ -159,7 +200,7 @@ The UI will be available at http://localhost:8001 ### Local, Llama-CPP powered setup -If you want to run PrivateGPT fully locally without relying on Ollama, you can run the following command: +If you want to run PrivateGPT fully locally without relying on Ollama, you can run the following command to install its dependencies: ```bash poetry install --extras "ui llms-llama-cpp embeddings-huggingface vector-stores-qdrant" diff --git a/poetry.lock b/poetry.lock index 714399e2..98b86e3b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2126,6 +2126,22 @@ files = [ llama-cpp-python = ">=0.2.32,<0.3.0" llama-index-core = ">=0.10.1,<0.11.0" +[[package]] +name = "llama-index-llms-nvidia-tensorrt" +version = "0.1.4" +description = "llama-index llms nvidia tensorrt integration" +optional = true +python-versions = ">=3.8.1,<4.0" +files = [ + {file = "llama_index_llms_nvidia_tensorrt-0.1.4-py3-none-any.whl", hash = "sha256:146b249de86317985d57d1acb89e5af1ef1564462899e6711f1ec97b3ba9ce7c"}, + {file = "llama_index_llms_nvidia_tensorrt-0.1.4.tar.gz", hash = "sha256:7edddbe1ad2bc8f9fc2812853b800c8ad2b610931b870d49ad7d5be920e6dbfc"}, +] + +[package.dependencies] +llama-index-core = ">=0.10.1,<0.11.0" +torch = ">=2.1.2,<3.0.0" +transformers = ">=4.37.0,<5.0.0" + [[package]] name = "llama-index-llms-ollama" version = "0.1.2" @@ -4856,6 +4872,16 @@ files = [ [package.extras] doc = ["reno", "sphinx", "tornado (>=4.5)"] +[[package]] +name = "tensorrt-llm" +version = "0.8.0" +description = "A fake package to warn the user they are not installing the correct package." 
+optional = true
+python-versions = ">=3.7, <4"
+files = [
+    {file = "tensorrt-llm-0.8.0.tar.gz", hash = "sha256:8bd59bf59766bb16f81bd330ca38765a532a21a35d323fd33929c80a6ec53eaf"},
+]
+
 [[package]]
 name = "tiktoken"
 version = "0.5.2"
@@ -5899,6 +5925,7 @@ embeddings-huggingface = ["llama-index-embeddings-huggingface"]
 embeddings-openai = ["llama-index-embeddings-openai"]
 embeddings-sagemaker = ["boto3"]
 llms-llama-cpp = ["llama-index-llms-llama-cpp"]
+llms-nvidia-tensorrt = ["llama-index-llms-nvidia-tensorrt", "tensorrt_llm"]
 llms-ollama = ["llama-index-llms-ollama"]
 llms-openai = ["llama-index-llms-openai"]
 llms-openai-like = ["llama-index-llms-openai-like"]
@@ -5911,4 +5938,4 @@ vector-stores-qdrant = ["llama-index-vector-stores-qdrant"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.11,<3.12"
-content-hash = "0249c25c783180d0c483c533d9102e3885e4a4f5261dc331a41323bd79d446f3"
+content-hash = "68f6ce6bd92419ce73a68c4a260ccf3016b38528c563ccad56d1bb68e7c09aae"
diff --git a/private_gpt/components/llm/llm_component.py b/private_gpt/components/llm/llm_component.py
index 232d5b28..8cfee4b6 100644
--- a/private_gpt/components/llm/llm_component.py
+++ b/private_gpt/components/llm/llm_component.py
@@ -111,5 +111,20 @@ class LLMComponent:
                 self.llm = Ollama(
                     model=ollama_settings.model, base_url=ollama_settings.api_base
                 )
+            case "tensorrt":
+                try:
+                    from llama_index.llms.nvidia_tensorrt import LocalTensorRTLLM  # type: ignore
+                except ImportError as e:
+                    raise ImportError(
+                        "Nvidia TensorRTLLM dependencies not found, install with `poetry install --extras llms-nvidia-tensorrt`"
+                    ) from e
+
+                prompt_style = get_prompt_style(settings.tensorrt.prompt_style)
+                self.llm = LocalTensorRTLLM(
+                    model_path=settings.tensorrt.model_path,
+                    engine_name=settings.tensorrt.engine_name,
+                    tokenizer_dir=settings.llm.tokenizer,
+                    completion_to_prompt=prompt_style.completion_to_prompt,
+                )
             case "mock":
                 self.llm = MockLLM()
diff --git a/private_gpt/settings/settings.py b/private_gpt/settings/settings.py
index 4493f90f..db65d227 100644
--- a/private_gpt/settings/settings.py
+++ b/private_gpt/settings/settings.py
@@ -81,7 +81,7 @@ class DataSettings(BaseModel):
 
 
 class LLMSettings(BaseModel):
-    mode: Literal["llamacpp", "openai", "openailike", "sagemaker", "mock", "ollama"]
+    mode: Literal["llamacpp", "openai", "openailike", "sagemaker", "mock", "ollama", "tensorrt"]
     max_new_tokens: int = Field(
         256,
         description="The maximum number of token that the LLM is authorized to generate in one completion.",
@@ -120,6 +120,22 @@ class LlamaCPPSettings(BaseModel):
     )
 
 
+class TensorRTSettings(BaseModel):
+    model_path: str
+    engine_name: str
+    prompt_style: Literal["default", "llama2", "tag", "mistral", "chatml"] = Field(
+        "llama2",
+        description=(
+            "The prompt style to use for the chat engine. "
+            "If `default` - use the default prompt style from the llama_index. It should look like `role: message`.\n"
+            "If `llama2` - use the llama2 prompt style from the llama_index. Based on `<s>`, `[INST]` and `<<SYS>>`.\n"
+            "If `tag` - use the `tag` prompt style. It should look like `<|role|>: message`. \n"
+            "If `mistral` - use the `mistral` prompt style. It should look like <s>[INST] {System Prompt} [/INST]</s>[INST] { UserInstructions } [/INST]\n"
+            "`llama2` is the historic behaviour. `default` might work better with your custom models."
+ ), + ) + + class HuggingFaceSettings(BaseModel): embedding_hf_model_name: str = Field( description="Name of the HuggingFace model to use for embeddings" @@ -296,6 +312,7 @@ class Settings(BaseModel): llm: LLMSettings embedding: EmbeddingSettings llamacpp: LlamaCPPSettings + tensorrt: TensorRTSettings huggingface: HuggingFaceSettings sagemaker: SagemakerSettings openai: OpenAISettings diff --git a/pyproject.toml b/pyproject.toml index 67961fe7..ef6941a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,8 +26,11 @@ llama-index-embeddings-openai = {version ="^0.1.6", optional = true} llama-index-vector-stores-qdrant = {version ="^0.1.3", optional = true} llama-index-vector-stores-chroma = {version ="^0.1.4", optional = true} llama-index-vector-stores-postgres = {version ="^0.1.2", optional = true} +llama-index-llms-nvidia-tensorrt = {version ="^0.1.2", optional = true} # Optional Sagemaker dependency boto3 = {version ="^1.34.51", optional = true} +# Optional Nvidia TensorRT dependency +tensorrt_llm = {version ="^0.8.0", optional = true} # Optional UI gradio = {version ="^4.19.2", optional = true} @@ -38,6 +41,7 @@ llms-openai = ["llama-index-llms-openai"] llms-openai-like = ["llama-index-llms-openai-like"] llms-ollama = ["llama-index-llms-ollama"] llms-sagemaker = ["boto3"] +llms-nvidia-tensorrt = ["tensorrt_llm", "llama-index-llms-nvidia-tensorrt"] embeddings-huggingface = ["llama-index-embeddings-huggingface"] embeddings-openai = ["llama-index-embeddings-openai"] embeddings-sagemaker = ["boto3"] @@ -45,7 +49,6 @@ vector-stores-qdrant = ["llama-index-vector-stores-qdrant"] vector-stores-chroma = ["llama-index-vector-stores-chroma"] vector-stores-postgres = ["llama-index-vector-stores-postgres"] - [tool.poetry.group.dev.dependencies] black = "^22" mypy = "^1.2" diff --git a/settings-tensorrt.yaml b/settings-tensorrt.yaml new file mode 100644 index 00000000..f0adea9e --- /dev/null +++ b/settings-tensorrt.yaml @@ -0,0 +1,25 @@ +server: + env_name: ${APP_ENV:tensorrt} + +llm: + mode: tensorrt + max_new_tokens: 512 + context_window: 3900 + +tensorrt: + model_path: models/tensorrt + engine_name: llama_float16_tp1_rank0.engine + prompt_style: "llama2" + +embedding: + mode: huggingface + +huggingface: + embedding_hf_model_name: BAAI/bge-small-en-v1.5 + +vectorstore: + database: qdrant + +qdrant: + path: local_data/private_gpt/qdrant + diff --git a/settings.yaml b/settings.yaml index 8c581a90..f4487a57 100644 --- a/settings.yaml +++ b/settings.yaml @@ -79,3 +79,8 @@ openai: ollama: model: llama2-uncensored + +tensorrt: + model_path: models/tensorrt + engine_name: llama_float16_tp1_rank0.engine + prompt_style: "llama2" \ No newline at end of file
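
For reference, here is a minimal sketch of what the new `tensorrt` case in `llm_component.py` boils down to when driven outside of PrivateGPT's dependency injection. It assumes the file layout described in the documentation above (the engine, `config.json` and `model.cache` under `models/tensorrt`) and, purely for illustration, that the Llama2 tokenizer files sit in the same directory; PrivateGPT itself takes the tokenizer location from `settings.llm.tokenizer` and applies prompt formatting via `prompt_style`, which is omitted here.

```python
# Minimal sketch: exercise LocalTensorRTLLM directly with the same arguments the
# new "tensorrt" case passes in llm_component.py. The tokenizer directory is an
# assumption for illustration; PrivateGPT passes settings.llm.tokenizer instead.
from llama_index.llms.nvidia_tensorrt import LocalTensorRTLLM

llm = LocalTensorRTLLM(
    model_path="models/tensorrt",                  # tensorrt.model_path in settings-tensorrt.yaml
    engine_name="llama_float16_tp1_rank0.engine",  # tensorrt.engine_name
    tokenizer_dir="models/tensorrt",               # assumed location of the Llama2 tokenizer files
)

# LocalTensorRTLLM is a regular llama-index LLM, so the usual completion API applies.
response = llm.complete("What is TensorRT-LLM?")
print(response.text)
```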
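A side effect of the settings change worth noting: `tensorrt: TensorRTSettings` is a required field on `Settings`, so every profile must provide a `tensorrt:` block even when `llm.mode` is something else, which is why the patch also adds the block to the default `settings.yaml`. Below is a trimmed-down sketch of that validation behaviour; the field names match the patch, but the rest of `Settings` is left out.

```python
# Trimmed-down sketch of the models added by the patch, showing why the default
# settings.yaml must now carry a `tensorrt:` block even for non-TensorRT profiles.
from pydantic import BaseModel, ValidationError


class TensorRTSettings(BaseModel):
    model_path: str
    engine_name: str


class Settings(BaseModel):
    tensorrt: TensorRTSettings  # required, regardless of the active llm mode


try:
    Settings()  # no tensorrt block provided
except ValidationError as err:
    print(err)  # reports that the "tensorrt" field is required

# With the values from settings.yaml / settings-tensorrt.yaml, validation passes.
settings = Settings(
    tensorrt=TensorRTSettings(
        model_path="models/tensorrt",
        engine_name="llama_float16_tp1_rank0.engine",
    )
)
print(settings.tensorrt.engine_name)
```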