diff --git a/fern/docs/pages/api-reference/sdks.mdx b/fern/docs/pages/api-reference/sdks.mdx
index 0172d9d4..f7cf6f84 100644
--- a/fern/docs/pages/api-reference/sdks.mdx
+++ b/fern/docs/pages/api-reference/sdks.mdx
@@ -8,14 +8,14 @@ The clients are kept up to date automatically, so we encourage you to use the la
diff --git a/private_gpt/components/llm/custom/sagemaker.py b/private_gpt/components/llm/custom/sagemaker.py
index e20f5394..bd2aec18 100644
--- a/private_gpt/components/llm/custom/sagemaker.py
+++ b/private_gpt/components/llm/custom/sagemaker.py
@@ -218,7 +218,7 @@ class SagemakerLLM(CustomLLM):

         response_body = resp["Body"]
         response_str = response_body.read().decode("utf-8")
-        response_dict = eval(response_str)
+        response_dict = json.loads(response_str)

         return CompletionResponse(
             text=response_dict[0]["generated_text"][len(prompt) :], raw=resp
diff --git a/private_gpt/components/llm/llm_component.py b/private_gpt/components/llm/llm_component.py
index baffa4e4..c29638b1 100644
--- a/private_gpt/components/llm/llm_component.py
+++ b/private_gpt/components/llm/llm_component.py
@@ -51,7 +51,7 @@ class LLMComponent:
                         "Local dependencies not found, install with `poetry install --extras llms-llama-cpp`"
                     ) from e

-                prompt_style = get_prompt_style(settings.llamacpp.prompt_style)
+                prompt_style = get_prompt_style(settings.llm.prompt_style)
                 settings_kwargs = {
                     "tfs_z": settings.llamacpp.tfs_z,  # ollama and llama-cpp
                     "top_k": settings.llamacpp.top_k,  # ollama and llama-cpp
@@ -109,15 +109,23 @@ class LLMComponent:
                         "OpenAILike dependencies not found, install with `poetry install --extras llms-openai-like`"
                     ) from e
-
+                prompt_style = get_prompt_style(settings.llm.prompt_style)
                 openai_settings = settings.openai
                 self.llm = OpenAILike(
                     api_base=openai_settings.api_base,
                     api_key=openai_settings.api_key,
                     model=openai_settings.model,
                     is_chat_model=True,
-                    max_tokens=None,
+                    max_tokens=settings.llm.max_new_tokens,
                     api_version="",
+                    temperature=settings.llm.temperature,
+                    context_window=settings.llm.context_window,
+                    max_new_tokens=settings.llm.max_new_tokens,
+                    messages_to_prompt=prompt_style.messages_to_prompt,
+                    completion_to_prompt=prompt_style.completion_to_prompt,
+                    tokenizer=settings.llm.tokenizer,
+                    timeout=openai_settings.request_timeout,
+                    reuse_client=False,
                 )
             case "ollama":
                 try:
diff --git a/private_gpt/settings/settings.py b/private_gpt/settings/settings.py
index 051cfcab..bd83fb8b 100644
--- a/private_gpt/settings/settings.py
+++ b/private_gpt/settings/settings.py
@@ -104,6 +104,17 @@ class LLMSettings(BaseModel):
         0.1,
         description="The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual.",
     )
+    prompt_style: Literal["default", "llama2", "tag", "mistral", "chatml"] = Field(
+        "llama2",
+        description=(
+            "The prompt style to use for the chat engine. "
+            "If `default` - use the default prompt style from the llama_index. It should look like `role: message`.\n"
+            "If `llama2` - use the llama2 prompt style from the llama_index. Based on `<s>`, `[INST]` and `<<SYS>>`.\n"
+            "If `tag` - use the `tag` prompt style. It should look like `<|role|>: message`. \n"
+            "If `mistral` - use the `mistral` prompt style. It should look like <s>[INST] {System Prompt} [/INST]</s>[INST] { UserInstructions } [/INST] "
+            "`llama2` is the historic behaviour. `default` might work better with your custom models."
+        ),
+    )


 class VectorstoreSettings(BaseModel):
@@ -117,18 +128,6 @@ class NodeStoreSettings(BaseModel):
 class LlamaCPPSettings(BaseModel):
     llm_hf_repo_id: str
     llm_hf_model_file: str
-    prompt_style: Literal["default", "llama2", "tag", "mistral", "chatml"] = Field(
-        "llama2",
-        description=(
-            "The prompt style to use for the chat engine. "
-            "If `default` - use the default prompt style from the llama_index. It should look like `role: message`.\n"
-            "If `llama2` - use the llama2 prompt style from the llama_index. Based on `<s>`, `[INST]` and `<<SYS>>`.\n"
-            "If `tag` - use the `tag` prompt style. It should look like `<|role|>: message`. \n"
-            "If `mistral` - use the `mistral` prompt style. It should look like <s>[INST] {System Prompt} [/INST]</s>[INST] { UserInstructions } [/INST] "
-            "`llama2` is the historic behaviour. `default` might work better with your custom models."
-        ),
-    )
-
     tfs_z: float = Field(
         1.0,
         description="Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.",
@@ -206,6 +205,10 @@ class OpenAISettings(BaseModel):
         "gpt-3.5-turbo",
         description="OpenAI Model to use. Example: 'gpt-4'.",
     )
+    request_timeout: float = Field(
+        120.0,
+        description="Time elapsed until the openailike server times out the request. Default is 120s. Format is float.",
+    )


 class OllamaSettings(BaseModel):
diff --git a/settings-local.yaml b/settings-local.yaml
index c9d02742..48eeb0ea 100644
--- a/settings-local.yaml
+++ b/settings-local.yaml
@@ -8,9 +8,9 @@ llm:
   max_new_tokens: 512
   context_window: 3900
   tokenizer: mistralai/Mistral-7B-Instruct-v0.2
+  prompt_style: "mistral"

 llamacpp:
-  prompt_style: "mistral"
   llm_hf_repo_id: TheBloke/Mistral-7B-Instruct-v0.2-GGUF
   llm_hf_model_file: mistral-7b-instruct-v0.2.Q4_K_M.gguf

@@ -24,4 +24,4 @@ vectorstore:
   database: qdrant

 qdrant:
-  path: local_data/private_gpt/qdrant
\ No newline at end of file
+  path: local_data/private_gpt/qdrant
diff --git a/settings-vllm.yaml b/settings-vllm.yaml
index 5a0a68c6..1bfab6b2 100644
--- a/settings-vllm.yaml
+++ b/settings-vllm.yaml
@@ -3,6 +3,9 @@ server:

 llm:
   mode: openailike
+  max_new_tokens: 512
+  tokenizer: mistralai/Mistral-7B-Instruct-v0.2
+  temperature: 0.1

 embedding:
   mode: huggingface
@@ -15,3 +18,4 @@ openai:
   api_base: http://localhost:8000/v1
   api_key: EMPTY
   model: facebook/opt-125m
+  request_timeout: 600.0
\ No newline at end of file
diff --git a/settings.yaml b/settings.yaml
index e881a555..06fcd63d 100644
--- a/settings.yaml
+++ b/settings.yaml
@@ -5,7 +5,7 @@ server:
   env_name: ${APP_ENV:prod}
   port: ${PORT:8001}
   cors:
-    enabled: false
+    enabled: true
     allow_origins: ["*"]
     allow_methods: ["*"]
     allow_headers: ["*"]
@@ -36,6 +36,7 @@ ui:

 llm:
   mode: llamacpp
+  prompt_style: "mistral"
   # Should be matching the selected model
   max_new_tokens: 512
   context_window: 3900
@@ -53,7 +54,6 @@ rag:
     top_n: 1

 llamacpp:
-  prompt_style: "mistral"
   llm_hf_repo_id: TheBloke/Mistral-7B-Instruct-v0.2-GGUF
   llm_hf_model_file: mistral-7b-instruct-v0.2.Q4_K_M.gguf
   tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting
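For reference, a minimal sketch of a user-side settings override using the layout this patch introduces: `prompt_style` is set under `llm:` (so it applies to every LLM mode) instead of under `llamacpp:`, and `request_timeout` is a new optional field under `openai:`. The mode, model names, and values below are illustrative only, copied from the example settings files in the diff above.

```yaml
# Illustrative override only; key placement follows this patch,
# values are taken from the diff's example files, not prescribed defaults.
llm:
  mode: openailike
  prompt_style: "mistral"   # moved here from the llamacpp: section
  max_new_tokens: 512
  context_window: 3900
  tokenizer: mistralai/Mistral-7B-Instruct-v0.2
  temperature: 0.1

openai:
  api_base: http://localhost:8000/v1
  api_key: EMPTY
  model: facebook/opt-125m
  request_timeout: 600.0    # new field; defaults to 120.0
```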