Use the token cache to infer greater n_past and reuse results (#3073)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Author: Jared Van Bortel
Date:   2024-10-31 11:19:12 -04:00 (committed via GitHub)
Parent: 62cab695eb
Commit: f07e2e63df

15 changed files with 320 additions and 169 deletions
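For context on the commit title: the idea is to keep the tokens that were last fed to the model, compare them against the freshly tokenized prompt, and treat the length of the common prefix as `n_past`, so only the unmatched tail needs another decode pass. The following is a minimal sketch of that prefix-matching step, not the actual chatllm implementation; `commonPrefixLength` and the surrounding names are illustrative.

```cpp
#include <cstdint>
#include <span>

using Token = int32_t;

// Length of the shared prefix between the cached token sequence and the
// newly tokenized prompt. Everything before this index can be reused from
// the KV cache; everything from it onward must be evaluated again.
size_t commonPrefixLength(std::span<const Token> cached, std::span<const Token> fresh)
{
    size_t n = 0;
    while (n < cached.size() && n < fresh.size() && cached[n] == fresh[n])
        ++n;
    return n;
}

// Usage sketch (decode() is a stand-in for the model's eval call):
//   size_t n_past = commonPrefixLength(tokenCache, promptTokens);
//   decode(std::span(promptTokens).subspan(n_past));
```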


@@ -51,7 +51,6 @@ bool ChatAPI::loadModel(const std::string &modelPath, int n_ctx, int ngl)
 void ChatAPI::setThreadCount(int32_t n_threads)
 {
     Q_UNUSED(n_threads);
-    qt_noop();
 }
 
 int32_t ChatAPI::threadCount() const
@@ -68,24 +67,6 @@ bool ChatAPI::isModelLoaded() const
     return true;
 }
 
-// All three of the state virtual functions are handled custom inside of chatllm save/restore
-size_t ChatAPI::stateSize() const
-{
-    throw std::logic_error("not implemented");
-}
-
-size_t ChatAPI::saveState(std::span<uint8_t> dest) const
-{
-    Q_UNUSED(dest);
-    throw std::logic_error("not implemented");
-}
-
-size_t ChatAPI::restoreState(std::span<const uint8_t> src)
-{
-    Q_UNUSED(src);
-    throw std::logic_error("not implemented");
-}
-
 void ChatAPI::prompt(const std::string &prompt,
                      const std::string &promptTemplate,
                      std::function<bool(int32_t)> promptCallback,
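The deleted overrides follow the usual two-step size/serialize pattern of a stateSize/saveState/restoreState interface (ChatAPI only stubbed them out, since state for remote models is handled inside chatllm's save/restore). A hedged sketch of how a caller might drive such an interface; the buffer handling and `Model` template are illustrative, not code from this repository:

```cpp
#include <cstdint>
#include <span>
#include <vector>

// Snapshot: ask the model how large its state is, allocate, then serialize.
// `Model` stands in for any type exposing the three members seen in the diff.
template <typename Model>
std::vector<uint8_t> snapshotState(const Model &model)
{
    std::vector<uint8_t> buf(model.stateSize());
    size_t written = model.saveState(std::span<uint8_t>(buf));
    buf.resize(written); // saveState reports how many bytes it actually used
    return buf;
}

// Restore: hand the serialized bytes back to the model.
template <typename Model>
void loadState(Model &model, std::span<const uint8_t> saved)
{
    model.restoreState(saved);
}
```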