llama-cpp: add gpu layers parameter (#4739)

Adds an n_gpu_layers parameter to the llama.cpp wrappers (LLM and embeddings)

Co-authored-by: andrew.khvalenski <andrew.khvalenski@behavox.com>
Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>
Author: hilarious-viking
Date: 2023-05-16 00:01:48 +01:00 (committed by GitHub)
Parent: 36c9fd1af7
Commit: 7d15669b41
2 changed files with 52 additions and 55 deletions
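
For orientation, n_gpu_layers tells llama.cpp how many model layers to offload to GPU memory; it only has an effect when llama-cpp-python was built with GPU support (for example cuBLAS or Metal). Below is a minimal sketch of the embeddings wrapper with the new field; the import location reflects the usual langchain layout at the time, and the model path and layer count are placeholders, not values from this commit.

from langchain.embeddings import LlamaCppEmbeddings

# n_gpu_layers is optional; leaving it unset (None) keeps the previous CPU-only behaviour.
embeddings = LlamaCppEmbeddings(
    model_path="./models/ggml-model-q4_0.bin",  # placeholder path
    n_gpu_layers=20,  # illustrative layer count
)
vector = embeddings.embed_query("hello world")  # standard Embeddings interface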

Changes to the embeddings wrapper (LlamaCppEmbeddings):

@@ -53,6 +53,9 @@ class LlamaCppEmbeddings(BaseModel, Embeddings):
     """Number of tokens to process in parallel.
     Should be a number between 1 and n_ctx."""
 
+    n_gpu_layers: Optional[int] = Field(None, alias="n_gpu_layers")
+    """Number of layers to be loaded into gpu memory. Default None."""
+
     class Config:
         """Configuration for this pydantic object."""
@@ -62,40 +65,37 @@ class LlamaCppEmbeddings(BaseModel, Embeddings):
     def validate_environment(cls, values: Dict) -> Dict:
         """Validate that llama-cpp-python library is installed."""
         model_path = values["model_path"]
-        n_ctx = values["n_ctx"]
-        n_parts = values["n_parts"]
-        seed = values["seed"]
-        f16_kv = values["f16_kv"]
-        logits_all = values["logits_all"]
-        vocab_only = values["vocab_only"]
-        use_mlock = values["use_mlock"]
-        n_threads = values["n_threads"]
-        n_batch = values["n_batch"]
+        model_param_names = [
+            "n_ctx",
+            "n_parts",
+            "seed",
+            "f16_kv",
+            "logits_all",
+            "vocab_only",
+            "use_mlock",
+            "n_threads",
+            "n_batch",
+        ]
+        model_params = {k: values[k] for k in model_param_names}
+        # For backwards compatibility, only include if non-null.
+        if values["n_gpu_layers"] is not None:
+            model_params["n_gpu_layers"] = values["n_gpu_layers"]
 
         try:
             from llama_cpp import Llama
 
-            values["client"] = Llama(
-                model_path=model_path,
-                n_ctx=n_ctx,
-                n_parts=n_parts,
-                seed=seed,
-                f16_kv=f16_kv,
-                logits_all=logits_all,
-                vocab_only=vocab_only,
-                use_mlock=use_mlock,
-                n_threads=n_threads,
-                n_batch=n_batch,
-                embedding=True,
-            )
+            values["client"] = Llama(model_path, embedding=True, **model_params)
         except ImportError:
             raise ModuleNotFoundError(
                 "Could not import llama-cpp-python library. "
                 "Please install the llama-cpp-python library to "
                 "use this embedding model: pip install llama-cpp-python"
             )
-        except Exception:
-            raise NameError(f"Could not load Llama model from path: {model_path}")
+        except Exception as e:
+            raise ValueError(
+                f"Could not load Llama model from path: {model_path}. "
+                f"Received error {e}"
+            )
 
         return values
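
The refactor above collects the constructor arguments into a dict and only forwards n_gpu_layers when the caller actually set it, so installations of llama-cpp-python that predate the keyword keep working unchanged. A standalone sketch of that pattern, with made-up values standing in for the pydantic field values:

# Illustrative values only; mirrors the dict-comprehension pattern in the diff above.
values = {"n_ctx": 512, "n_batch": 8, "n_gpu_layers": None}

model_param_names = ["n_ctx", "n_batch"]
model_params = {k: values[k] for k in model_param_names}

# Only include n_gpu_layers if it was explicitly set, preserving backwards compatibility.
if values["n_gpu_layers"] is not None:
    model_params["n_gpu_layers"] = values["n_gpu_layers"]

print(model_params)  # {'n_ctx': 512, 'n_batch': 8} -> the keyword is omitted when unset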

The same change is applied to the LLM wrapper (LlamaCpp):

@@ -64,6 +64,9 @@ class LlamaCpp(LLM):
     """Number of tokens to process in parallel.
     Should be a number between 1 and n_ctx."""
 
+    n_gpu_layers: Optional[int] = Field(None, alias="n_gpu_layers")
+    """Number of layers to be loaded into gpu memory. Default None."""
+
     suffix: Optional[str] = Field(None)
     """A suffix to append to the generated text. If None, no suffix is appended."""
@@ -104,47 +107,41 @@ class LlamaCpp(LLM):
     def validate_environment(cls, values: Dict) -> Dict:
         """Validate that llama-cpp-python library is installed."""
         model_path = values["model_path"]
-        lora_path = values["lora_path"]
-        lora_base = values["lora_base"]
-        n_ctx = values["n_ctx"]
-        n_parts = values["n_parts"]
-        seed = values["seed"]
-        f16_kv = values["f16_kv"]
-        logits_all = values["logits_all"]
-        vocab_only = values["vocab_only"]
-        use_mlock = values["use_mlock"]
-        n_threads = values["n_threads"]
-        n_batch = values["n_batch"]
-        use_mmap = values["use_mmap"]
-        last_n_tokens_size = values["last_n_tokens_size"]
+        model_param_names = [
+            "lora_path",
+            "lora_base",
+            "n_ctx",
+            "n_parts",
+            "seed",
+            "f16_kv",
+            "logits_all",
+            "vocab_only",
+            "use_mlock",
+            "n_threads",
+            "n_batch",
+            "use_mmap",
+            "last_n_tokens_size",
+        ]
+        model_params = {k: values[k] for k in model_param_names}
+        # For backwards compatibility, only include if non-null.
+        if values["n_gpu_layers"] is not None:
+            model_params["n_gpu_layers"] = values["n_gpu_layers"]
 
         try:
             from llama_cpp import Llama
 
-            values["client"] = Llama(
-                model_path=model_path,
-                lora_base=lora_base,
-                lora_path=lora_path,
-                n_ctx=n_ctx,
-                n_parts=n_parts,
-                seed=seed,
-                f16_kv=f16_kv,
-                logits_all=logits_all,
-                vocab_only=vocab_only,
-                use_mlock=use_mlock,
-                n_threads=n_threads,
-                n_batch=n_batch,
-                use_mmap=use_mmap,
-                last_n_tokens_size=last_n_tokens_size,
-            )
+            values["client"] = Llama(model_path, **model_params)
         except ImportError:
             raise ModuleNotFoundError(
                 "Could not import llama-cpp-python library. "
                 "Please install the llama-cpp-python library to "
                 "use this embedding model: pip install llama-cpp-python"
             )
-        except Exception:
-            raise NameError(f"Could not load Llama model from path: {model_path}")
+        except Exception as e:
+            raise ValueError(
+                f"Could not load Llama model from path: {model_path}. "
+                f"Received error {e}"
            )
 
         return values
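
A short usage sketch of the LLM wrapper with the new parameter. The model path, layer count, and prompt are placeholders, the import location assumes the usual langchain layout of the time, calling the model directly relies on the standard langchain LLM call interface, and a GPU-enabled llama-cpp-python build is required for the offload to take effect.

from langchain.llms import LlamaCpp

llm = LlamaCpp(
    model_path="./models/ggml-model-q4_0.bin",  # placeholder path
    n_gpu_layers=32,  # offload 32 layers to the GPU; None keeps the previous CPU-only default
    n_ctx=512,
)
print(llm("Q: Name the planets in the solar system. A:"))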