backend: fix extra spaces in tokenization and a CUDA crash (#2778)

Also potentially improves accuracy of BOS insertion, token cache, and logit indexing.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
This commit is contained in:
Jared Van Bortel
2024-08-01 10:46:36 -04:00
committed by GitHub
parent da59c9f5ea
commit 51bd01ae05
10 changed files with 46 additions and 36 deletions

View File

@@ -117,9 +117,6 @@ void llmodel_prompt(llmodel_model model, const char *prompt,
return response_callback(token_id, response.c_str());
};
if (size_t(ctx->n_past) < wrapper->promptContext.tokens.size())
wrapper->promptContext.tokens.resize(ctx->n_past);
// Copy the C prompt context
wrapper->promptContext.n_past = ctx->n_past;
wrapper->promptContext.n_ctx = ctx->n_ctx;