expose n_gpu_layers parameter of llama.cpp (#1890)

Also dynamically limit the GPU layers and context length fields to the maximum supported by the model.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Jared Van Bortel <jared@nomic.ai>
2024-01-31 14:17:44 -05:00, committed by GitHub
parent f549d5a70a
commit 061d1969f8
31 changed files with 381 additions and 157 deletions

@@ -42,6 +42,8 @@ public:
     static const Implementation *implementation(const char *fname, const std::string& buildVariant);
     static LLModel *construct(const std::string &modelPath, std::string buildVariant = "auto", int n_ctx = 2048);
     static std::vector<GPUDevice> availableGPUDevices();
+    static int32_t maxContextLength(const std::string &modelPath);
+    static int32_t layerCount(const std::string &modelPath);
     static void setImplementationsSearchPath(const std::string& path);
     static const std::string& implementationsSearchPath();
@@ -77,9 +79,9 @@ public:
     virtual bool supportsEmbedding() const = 0;
     virtual bool supportsCompletion() const = 0;
-    virtual bool loadModel(const std::string &modelPath, int n_ctx) = 0;
+    virtual bool loadModel(const std::string &modelPath, int n_ctx, int ngl) = 0;
     virtual bool isModelLoaded() const = 0;
-    virtual size_t requiredMem(const std::string &modelPath, int n_ctx) = 0;
+    virtual size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0;
     virtual size_t stateSize() const { return 0; }
     virtual size_t saveState(uint8_t */*dest*/) const { return 0; }
     virtual size_t restoreState(const uint8_t */*src*/) { return 0; }
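Both pure-virtual hooks now take `ngl`, so a memory check can account for GPU offload before committing to a load. A hedged sketch of a caller — `tryLoad` and the `freeMem` figure are illustrative, not part of this diff:

```cpp
#include <cstddef>
#include <string>
#include "llmodel.h"

// Illustrative only: refuse to load when the estimate exceeds available memory.
bool tryLoad(LLModel &model, const std::string &path, int nCtx, int ngl, size_t freeMem)
{
    if (model.requiredMem(path, nCtx, ngl) > freeMem)
        return false;                        // estimate now reflects the ngl setting
    return model.loadModel(path, nCtx, ngl); // same n_ctx/ngl pair as the estimate
}
```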
@@ -101,18 +103,18 @@ public:
         return *m_implementation;
     }

-    virtual std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) {
+    virtual std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) const {
         (void)memoryRequired;
         return {};
     }

-    virtual bool initializeGPUDevice(size_t memoryRequired, const std::string& name) {
+    virtual bool initializeGPUDevice(size_t memoryRequired, const std::string& name) const {
         (void)memoryRequired;
         (void)name;
         return false;
     }

-    virtual bool initializeGPUDevice(const GPUDevice & device, std::string *unavail_reason = nullptr) {
+    virtual bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const {
         (void)device;
         if (unavail_reason) {
             *unavail_reason = "model has no GPU support";
@@ -120,7 +122,6 @@ public:
         return false;
     }

-    virtual bool initializeGPUDevice(int /*device*/) { return false; }
     virtual bool hasGPUDevice() { return false; }
     virtual bool usingGPUDevice() { return false; }
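With the `GPUDevice` overload replaced by an index-based one, device selection might look like the sketch below. The `index` member of `GPUDevice` is an assumption here; the struct itself is declared elsewhere in this header.

```cpp
#include <cstddef>
#include <string>
#include "llmodel.h"

// Illustrative: try each device that can hold the model, fall back to CPU.
bool pickFirstUsableGPU(LLModel &model, size_t memoryRequired)
{
    for (const LLModel::GPUDevice &dev : model.availableGPUDevices(memoryRequired)) {
        std::string unavailReason;
        if (model.initializeGPUDevice(dev.index, &unavailReason)) // 'index' is assumed
            return true;
    }
    return false; // no usable GPU; caller stays on CPU
}
```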
@@ -134,6 +135,18 @@ protected:
     virtual int32_t contextLength() const = 0;
     virtual const std::vector<Token>& endTokens() const = 0;

+    virtual int32_t maxContextLength(std::string const &modelPath) const
+    {
+        (void)modelPath;
+        return -1;
+    }
+
+    virtual int32_t layerCount(std::string const &modelPath) const
+    {
+        (void)modelPath;
+        return -1;
+    }
+
     // This is a helper function called from the default implementation of 'prompt' but it can be
     // shared by all base classes so it isn't virtual
     void recalculateContext(PromptContext &promptCtx, std::function<bool(bool)> recalculate);
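For comparison with the -1 defaults above, a sketch of how a concrete backend might override the two new protected hooks. `readGGUFUint` is a hypothetical metadata reader, and the key names follow llama.cpp's `<arch>`-prefixed GGUF convention; the class is only a fragment, not a complete backend.

```cpp
#include <cstdint>
#include <string>
#include "llmodel.h"

// Hypothetical helper: read an unsigned metadata value from a GGUF file,
// returning -1 when the key is absent or the file cannot be parsed.
int32_t readGGUFUint(const std::string &modelPath, const std::string &key);

class LlamaSketch : public LLModel {
protected:
    int32_t maxContextLength(std::string const &modelPath) const override
    {
        return readGGUFUint(modelPath, "llama.context_length");
    }

    int32_t layerCount(std::string const &modelPath) const override
    {
        return readGGUFUint(modelPath, "llama.block_count");
    }

    // ... the remaining pure-virtual members are omitted from this sketch ...
};
```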