From a7b18058b54f831a729f1d9eab186b497e3e42b6 Mon Sep 17 00:00:00 2001
From: imartinez
Date: Thu, 29 Feb 2024 19:41:58 +0100
Subject: [PATCH] Support for Nvidia TensorRT

---
 fern/docs/pages/installation/installation.mdx | 51 +++++++++++++++++--
 poetry.lock                                   | 29 ++++++++++-
 private_gpt/components/llm/llm_component.py   | 15 ++++++
 private_gpt/settings/settings.py              | 19 ++++++-
 pyproject.toml                                |  5 +-
 settings-tensorrt.yaml                        | 25 +++++++++
 settings.yaml                                 |  5 ++
 7 files changed, 141 insertions(+), 8 deletions(-)
 create mode 100644 settings-tensorrt.yaml

diff --git a/fern/docs/pages/installation/installation.mdx b/fern/docs/pages/installation/installation.mdx
index 4ff8b2ee..94ad82d8 100644
--- a/fern/docs/pages/installation/installation.mdx
+++ b/fern/docs/pages/installation/installation.mdx
@@ -47,6 +47,7 @@ Where `<extra>` can be any of the following:
 - llms-ollama: adds support for Ollama LLM, the easiest way to get a local LLM running
 - llms-llama-cpp: adds support for local LLM using LlamaCPP - expect a messy installation process on some platforms
 - llms-sagemaker: adds support for Amazon Sagemaker LLM, requires Sagemaker inference endpoints
+- llms-nvidia-tensorrt: adds support for Nvidia TensorRT LLM
 - llms-openai: adds support for OpenAI LLM, requires OpenAI API key
 - llms-openai-like: adds support for 3rd party LLM providers that are compatible with OpenAI's API
 - embeddings-huggingface: adds support for local Embeddings using HuggingFace
@@ -67,7 +68,7 @@ The easiest way to run PrivateGPT fully locally is to depend on Ollama for the L
 
 Go to [ollama.ai](https://ollama.ai/) and follow the instructions to install Ollama on your machine.
 
-Once done, you can install PrivateGPT with the following command:
+Once done, you can install PrivateGPT dependencies with the following command:
 ```bash
 poetry install --extras "ui llms-ollama embeddings-huggingface vector-stores-qdrant"
 ```
@@ -96,7 +97,7 @@ You need to have access to sagemaker inference endpoints for the LLM and / or th
 
 Edit the `settings-sagemaker.yaml` file to include the correct Sagemaker endpoints.
 
-Then, install PrivateGPT with the following command:
+Then, install PrivateGPT dependencies with the following command:
 ```bash
 poetry install --extras "ui llms-sagemaker embeddings-sagemaker vector-stores-qdrant"
 ```
@@ -111,9 +112,49 @@ PrivateGPT will use the already existing `settings-sagemaker.yaml` settings file
 
 The UI will be available at http://localhost:8001
 
+### Local, TensorRT-powered setup
+
+To get the most out of NVIDIA GPUs, you can set up a fully local PrivateGPT using TensorRT as its LLM provider. For more information about Nvidia TensorRT, check the [official documentation](https://github.com/NVIDIA/TensorRT-LLM).
+
+Follow these steps to set up a local TensorRT-powered PrivateGPT:
+
+- Nvidia CUDA 12.2 or higher is currently required to run TensorRT-LLM.
+
+- For this example, we will use Llama2. The Llama2 model files need to be created via scripts following the instructions [here](https://github.com/NVIDIA/trt-llm-rag-windows/blob/release/1.0/README.md#building-trt-engine).
+The following files will be created by following the steps in the link:
+
+* `Llama_float16_tp1_rank0.engine`: The main output of the build script, containing the executable graph of operations with the model weights embedded.
+
+* `config.json`: Includes detailed information about the model, like its general structure and precision, as well as information about which plug-ins were incorporated into the engine.
+
+* `model.cache`: Caches some of the timing and optimization information from model compilation, making successive builds quicker.
+
+- Create a folder inside `models` called `tensorrt`, and move all of the files mentioned above to that directory.
+
+Once done, you can install PrivateGPT dependencies with the following command:
+```bash
+poetry install --extras "ui llms-nvidia-tensorrt embeddings-huggingface vector-stores-qdrant"
+```
+
+We are installing the "embeddings-huggingface" dependency to support local embeddings, because TensorRT only covers the LLM.
+In order for local embeddings to work, you need to download the embeddings model to the `models` folder. You can do so by running the `setup` script:
+```bash
+poetry run python scripts/setup
+```
+
+Once installed, you can run PrivateGPT.
+
+```bash
+PGPT_PROFILES=tensorrt make run
+```
+
+PrivateGPT will use the already existing `settings-tensorrt.yaml` settings file, which is already configured to use Nvidia TensorRT LLM, local Embeddings, and Qdrant. Review it and adapt it to your needs (different LLM model, etc.)
+
+The UI will be available at http://localhost:8001
+
 ### Local, Llama-CPP powered setup
 
-If you want to run PrivateGPT fully locally without relying on Ollama, you can run the following command:
+If you want to run PrivateGPT fully locally without relying on Ollama, you can run the following command to install its dependencies:
 
 ```bash
 poetry install --extras "ui llms-llama-cpp embeddings-huggingface vector-stores-qdrant"
@@ -142,7 +183,7 @@ You need an OPENAI API key to run this setup.
 
 Edit the `settings-openai.yaml` file to include the correct API KEY. Never commit it! It's a secret! As an alternative to editing `settings-openai.yaml`, you can just set the env var OPENAI_API_KEY.
 
-Then, install PrivateGPT with the following command: +Then, install PrivateGPT dependencies with the following command: ```bash poetry install --extras "ui llms-openai embeddings-openai vector-stores-qdrant" ``` @@ -159,7 +200,7 @@ The UI will be available at http://localhost:8001 ### Local, Llama-CPP powered setup -If you want to run PrivateGPT fully locally without relying on Ollama, you can run the following command: +If you want to run PrivateGPT fully locally without relying on Ollama, you can run the following command to install its dependencies: ```bash poetry install --extras "ui llms-llama-cpp embeddings-huggingface vector-stores-qdrant" diff --git a/poetry.lock b/poetry.lock index 714399e2..98b86e3b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2126,6 +2126,22 @@ files = [ llama-cpp-python = ">=0.2.32,<0.3.0" llama-index-core = ">=0.10.1,<0.11.0" +[[package]] +name = "llama-index-llms-nvidia-tensorrt" +version = "0.1.4" +description = "llama-index llms nvidia tensorrt integration" +optional = true +python-versions = ">=3.8.1,<4.0" +files = [ + {file = "llama_index_llms_nvidia_tensorrt-0.1.4-py3-none-any.whl", hash = "sha256:146b249de86317985d57d1acb89e5af1ef1564462899e6711f1ec97b3ba9ce7c"}, + {file = "llama_index_llms_nvidia_tensorrt-0.1.4.tar.gz", hash = "sha256:7edddbe1ad2bc8f9fc2812853b800c8ad2b610931b870d49ad7d5be920e6dbfc"}, +] + +[package.dependencies] +llama-index-core = ">=0.10.1,<0.11.0" +torch = ">=2.1.2,<3.0.0" +transformers = ">=4.37.0,<5.0.0" + [[package]] name = "llama-index-llms-ollama" version = "0.1.2" @@ -4856,6 +4872,16 @@ files = [ [package.extras] doc = ["reno", "sphinx", "tornado (>=4.5)"] +[[package]] +name = "tensorrt-llm" +version = "0.8.0" +description = "A fake package to warn the user they are not installing the correct package." 
+optional = true
+python-versions = ">=3.7, <4"
+files = [
+    {file = "tensorrt-llm-0.8.0.tar.gz", hash = "sha256:8bd59bf59766bb16f81bd330ca38765a532a21a35d323fd33929c80a6ec53eaf"},
+]
+
 [[package]]
 name = "tiktoken"
 version = "0.5.2"
@@ -5899,6 +5925,7 @@ embeddings-huggingface = ["llama-index-embeddings-huggingface"]
 embeddings-openai = ["llama-index-embeddings-openai"]
 embeddings-sagemaker = ["boto3"]
 llms-llama-cpp = ["llama-index-llms-llama-cpp"]
+llms-nvidia-tensorrt = ["llama-index-llms-nvidia-tensorrt", "tensorrt_llm"]
 llms-ollama = ["llama-index-llms-ollama"]
 llms-openai = ["llama-index-llms-openai"]
 llms-openai-like = ["llama-index-llms-openai-like"]
@@ -5911,4 +5938,4 @@ vector-stores-qdrant = ["llama-index-vector-stores-qdrant"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.11,<3.12"
-content-hash = "0249c25c783180d0c483c533d9102e3885e4a4f5261dc331a41323bd79d446f3"
+content-hash = "68f6ce6bd92419ce73a68c4a260ccf3016b38528c563ccad56d1bb68e7c09aae"
diff --git a/private_gpt/components/llm/llm_component.py b/private_gpt/components/llm/llm_component.py
index 232d5b28..8cfee4b6 100644
--- a/private_gpt/components/llm/llm_component.py
+++ b/private_gpt/components/llm/llm_component.py
@@ -111,5 +111,20 @@ class LLMComponent:
                 self.llm = Ollama(
                     model=ollama_settings.model, base_url=ollama_settings.api_base
                 )
+            case "tensorrt":
+                try:
+                    from llama_index.llms.nvidia_tensorrt import LocalTensorRTLLM  # type: ignore
+                except ImportError as e:
+                    raise ImportError(
+                        "Nvidia TensorRTLLM dependencies not found, install with `poetry install --extras llms-nvidia-tensorrt`"
+                    ) from e
+
+                prompt_style = get_prompt_style(settings.tensorrt.prompt_style)
+                self.llm = LocalTensorRTLLM(
+                    model_path=settings.tensorrt.model_path,
+                    engine_name=settings.tensorrt.engine_name,
+                    tokenizer_dir=settings.llm.tokenizer,
+                    completion_to_prompt=prompt_style.completion_to_prompt,
+                )
             case "mock":
                 self.llm = MockLLM()
diff --git a/private_gpt/settings/settings.py b/private_gpt/settings/settings.py
index 4493f90f..db65d227 100644
--- a/private_gpt/settings/settings.py
+++ b/private_gpt/settings/settings.py
@@ -81,7 +81,7 @@ class DataSettings(BaseModel):
 
 
 class LLMSettings(BaseModel):
-    mode: Literal["llamacpp", "openai", "openailike", "sagemaker", "mock", "ollama"]
+    mode: Literal["llamacpp", "openai", "openailike", "sagemaker", "mock", "ollama", "tensorrt"]
     max_new_tokens: int = Field(
         256,
         description="The maximum number of token that the LLM is authorized to generate in one completion.",
@@ -120,6 +120,22 @@ class LlamaCPPSettings(BaseModel):
     )
 
 
+class TensorRTSettings(BaseModel):
+    model_path: str
+    engine_name: str
+    prompt_style: Literal["default", "llama2", "tag", "mistral", "chatml"] = Field(
+        "llama2",
+        description=(
+            "The prompt style to use for the chat engine. "
+            "If `default` - use the default prompt style from the llama_index. It should look like `role: message`.\n"
+            "If `llama2` - use the llama2 prompt style from the llama_index. Based on `<s>`, `[INST]` and `<<SYS>>`.\n"
+            "If `tag` - use the `tag` prompt style. It should look like `<|role|>: message`. \n"
+            "If `mistral` - use the `mistral` prompt style. It should look like <s>[INST] {System Prompt} [/INST]</s>[INST] { UserInstructions } [/INST]\n"
+            "`llama2` is the historic behaviour. `default` might work better with your custom models."
+ ), + ) + + class HuggingFaceSettings(BaseModel): embedding_hf_model_name: str = Field( description="Name of the HuggingFace model to use for embeddings" @@ -296,6 +312,7 @@ class Settings(BaseModel): llm: LLMSettings embedding: EmbeddingSettings llamacpp: LlamaCPPSettings + tensorrt: TensorRTSettings huggingface: HuggingFaceSettings sagemaker: SagemakerSettings openai: OpenAISettings diff --git a/pyproject.toml b/pyproject.toml index 67961fe7..ef6941a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,8 +26,11 @@ llama-index-embeddings-openai = {version ="^0.1.6", optional = true} llama-index-vector-stores-qdrant = {version ="^0.1.3", optional = true} llama-index-vector-stores-chroma = {version ="^0.1.4", optional = true} llama-index-vector-stores-postgres = {version ="^0.1.2", optional = true} +llama-index-llms-nvidia-tensorrt = {version ="^0.1.2", optional = true} # Optional Sagemaker dependency boto3 = {version ="^1.34.51", optional = true} +# Optional Nvidia TensorRT dependency +tensorrt_llm = {version ="^0.8.0", optional = true} # Optional UI gradio = {version ="^4.19.2", optional = true} @@ -38,6 +41,7 @@ llms-openai = ["llama-index-llms-openai"] llms-openai-like = ["llama-index-llms-openai-like"] llms-ollama = ["llama-index-llms-ollama"] llms-sagemaker = ["boto3"] +llms-nvidia-tensorrt = ["tensorrt_llm", "llama-index-llms-nvidia-tensorrt"] embeddings-huggingface = ["llama-index-embeddings-huggingface"] embeddings-openai = ["llama-index-embeddings-openai"] embeddings-sagemaker = ["boto3"] @@ -45,7 +49,6 @@ vector-stores-qdrant = ["llama-index-vector-stores-qdrant"] vector-stores-chroma = ["llama-index-vector-stores-chroma"] vector-stores-postgres = ["llama-index-vector-stores-postgres"] - [tool.poetry.group.dev.dependencies] black = "^22" mypy = "^1.2" diff --git a/settings-tensorrt.yaml b/settings-tensorrt.yaml new file mode 100644 index 00000000..f0adea9e --- /dev/null +++ b/settings-tensorrt.yaml @@ -0,0 +1,25 @@ +server: + env_name: ${APP_ENV:tensorrt} + +llm: + mode: tensorrt + max_new_tokens: 512 + context_window: 3900 + +tensorrt: + model_path: models/tensorrt + engine_name: llama_float16_tp1_rank0.engine + prompt_style: "llama2" + +embedding: + mode: huggingface + +huggingface: + embedding_hf_model_name: BAAI/bge-small-en-v1.5 + +vectorstore: + database: qdrant + +qdrant: + path: local_data/private_gpt/qdrant + diff --git a/settings.yaml b/settings.yaml index 8c581a90..f4487a57 100644 --- a/settings.yaml +++ b/settings.yaml @@ -79,3 +79,8 @@ openai: ollama: model: llama2-uncensored + +tensorrt: + model_path: models/tensorrt + engine_name: llama_float16_tp1_rank0.engine + prompt_style: "llama2" \ No newline at end of file
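
For reference, here is a minimal sketch of what the new `tensorrt` case in `llm_component.py` boils down to when driven outside of PrivateGPT's dependency injection. It assumes the file layout described in the documentation above (the engine, `config.json` and `model.cache` under `models/tensorrt`) and, purely for illustration, that the Llama2 tokenizer files sit in the same directory; PrivateGPT itself takes the tokenizer location from `settings.llm.tokenizer` and applies prompt formatting via `prompt_style`, which is omitted here.

```python
# Minimal sketch: exercise LocalTensorRTLLM directly with the same arguments the
# new "tensorrt" case passes in llm_component.py. The tokenizer directory is an
# assumption for illustration; PrivateGPT passes settings.llm.tokenizer instead.
from llama_index.llms.nvidia_tensorrt import LocalTensorRTLLM

llm = LocalTensorRTLLM(
    model_path="models/tensorrt",                  # tensorrt.model_path in settings-tensorrt.yaml
    engine_name="llama_float16_tp1_rank0.engine",  # tensorrt.engine_name
    tokenizer_dir="models/tensorrt",               # assumed location of the Llama2 tokenizer files
)

# LocalTensorRTLLM is a regular llama-index LLM, so the usual completion API applies.
response = llm.complete("What is TensorRT-LLM?")
print(response.text)
```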
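A side effect of the settings change worth noting: `tensorrt: TensorRTSettings` is a required field on `Settings`, so every profile must provide a `tensorrt:` block even when `llm.mode` is something else, which is why the patch also adds the block to the default `settings.yaml`. Below is a trimmed-down sketch of that validation behaviour; the field names match the patch, but the rest of `Settings` is left out.

```python
# Trimmed-down sketch of the models added by the patch, showing why the default
# settings.yaml must now carry a `tensorrt:` block even for non-TensorRT profiles.
from pydantic import BaseModel, ValidationError


class TensorRTSettings(BaseModel):
    model_path: str
    engine_name: str


class Settings(BaseModel):
    tensorrt: TensorRTSettings  # required, regardless of the active llm mode


try:
    Settings()  # no tensorrt block provided
except ValidationError as err:
    print(err)  # reports that the "tensorrt" field is required

# With the values from settings.yaml / settings-tensorrt.yaml, validation passes.
settings = Settings(
    tensorrt=TensorRTSettings(
        model_path="models/tensorrt",
        engine_name="llama_float16_tp1_rank0.engine",
    )
)
print(settings.tensorrt.engine_name)
```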