feat(llm): adds several settings for llamacpp and ollama (#1703)

2025-09-06 09:41:31 +00:00 · 2024-03-11 17:51:05 -04:00
parent 410bf7a71f
commit 02dc83e8e9
10 changed files with 91 additions and 8 deletions
--- a/private_gpt/components/llm/llm_component.py
+++ b/private_gpt/components/llm/llm_component.py
@@ -39,16 +39,23 @@ class LLMComponent:
                    ) from e

                prompt_style = get_prompt_style(settings.llamacpp.prompt_style)
-
+                settings_kwargs = {
+                    "tfs_z": settings.llamacpp.tfs_z,  # ollama and llama-cpp
+                    "top_k": settings.llamacpp.top_k,  # ollama and llama-cpp
+                    "top_p": settings.llamacpp.top_p,  # ollama and llama-cpp
+                    "repeat_penalty": settings.llamacpp.repeat_penalty,  # ollama llama-cpp
+                    "n_gpu_layers": -1,
+                    "offload_kqv": True,
+                }
                self.llm = LlamaCPP(
                    model_path=str(models_path / settings.llamacpp.llm_hf_model_file),
-                    temperature=0.1,
+                    temperature=settings.llm.temperature,
                    max_new_tokens=settings.llm.max_new_tokens,
                    context_window=settings.llm.context_window,
                    generate_kwargs={},
                    callback_manager=LlamaIndexSettings.callback_manager,
                    # All to GPU
-                    model_kwargs={"n_gpu_layers": -1, "offload_kqv": True},
+                    model_kwargs=settings_kwargs,
                    # transform inputs into Llama2 format
                    messages_to_prompt=prompt_style.messages_to_prompt,
                    completion_to_prompt=prompt_style.completion_to_prompt,
@@ -108,8 +115,22 @@ class LLMComponent:
                    ) from e

                ollama_settings = settings.ollama
+
+                settings_kwargs = {
+                    "tfs_z": ollama_settings.tfs_z,  # ollama and llama-cpp
+                    "num_predict": ollama_settings.num_predict,  # ollama only
+                    "top_k": ollama_settings.top_k,  # ollama and llama-cpp
+                    "top_p": ollama_settings.top_p,  # ollama and llama-cpp
+                    "repeat_last_n": ollama_settings.repeat_last_n,  # ollama
+                    "repeat_penalty": ollama_settings.repeat_penalty,  # ollama llama-cpp
+                }
+
                self.llm = Ollama(
-                    model=ollama_settings.llm_model, base_url=ollama_settings.api_base
+                    model=ollama_settings.llm_model,
+                    base_url=ollama_settings.api_base,
+                    temperature=settings.llm.temperature,
+                    context_window=settings.llm.context_window,
+                    additional_kwargs=settings_kwargs,
                )
            case "mock":
                self.llm = MockLLM()