backend: dedupe tokenizing code in gptj/mpt

Aaron Miller
2023-05-15 17:30:45 -07:00
committed by AT
parent 4cd8bdf9a1
commit 6182026c70
3 changed files with 57 additions and 121 deletions


@@ -44,6 +44,11 @@ struct gpt_vocab {
     std::map<token, id> token_to_id;
     std::map<id, token> id_to_token;
+    std::vector<std::string> special_tokens;
+
+    void add_special_token(const std::string &token) {
+        special_tokens.push_back(token);
+    }
 };
 void replace(std::string & str, const std::string & needle, const std::string & replacement);
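
The new special_tokens field and add_special_token() helper let the now-shared vocabulary carry model-specific markers (MPT's chat variants, for example, use ChatML-style <|im_start|> / <|im_end|> tokens), so GPT-J and MPT no longer need separate copies of the tokenizing code. Below is a minimal sketch of how a shared tokenizer could use this field: emit ids for registered special tokens directly and hand only the text between them to the ordinary tokenizer. The tokenize_with_specials helper and the toy whitespace tokenizer are assumptions for illustration, not the implementation from this commit.

    // Sketch only: split text around registered special tokens, emitting their
    // ids directly and tokenizing the remaining spans with a plain tokenizer.
    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    struct gpt_vocab {
        using id    = int32_t;
        using token = std::string;

        std::map<token, id> token_to_id;
        std::map<id, token> id_to_token;
        std::vector<std::string> special_tokens;

        void add_special_token(const std::string &token) {
            special_tokens.push_back(token);
        }
    };

    // Hypothetical helper (not from the commit): find the earliest special
    // token, emit its id, and pass the text before it to `tokenize_plain`.
    template <typename PlainTokenizer>
    std::vector<gpt_vocab::id> tokenize_with_specials(const gpt_vocab &vocab,
                                                      const std::string &text,
                                                      PlainTokenizer tokenize_plain) {
        std::vector<gpt_vocab::id> out;
        std::size_t pos = 0;
        while (pos < text.size()) {
            std::size_t best_pos = std::string::npos;
            const std::string *best_tok = nullptr;
            for (const auto &st : vocab.special_tokens) {
                std::size_t p = text.find(st, pos);
                if (p != std::string::npos && (best_pos == std::string::npos || p < best_pos)) {
                    best_pos = p;
                    best_tok = &st;
                }
            }
            if (best_tok == nullptr) {
                tokenize_plain(text.substr(pos), out);   // no more specials: tokenize the rest
                break;
            }
            if (best_pos > pos) {
                tokenize_plain(text.substr(pos, best_pos - pos), out); // text before the special token
            }
            out.push_back(vocab.token_to_id.at(*best_tok));            // the special token itself
            pos = best_pos + best_tok->size();
        }
        return out;
    }

    int main() {
        gpt_vocab vocab;
        vocab.token_to_id = {{"<|im_start|>", 1}, {"hello", 2}, {"world", 3}};
        vocab.add_special_token("<|im_start|>");

        // Toy "plain" tokenizer: whitespace split plus exact lookup, for the demo only.
        auto plain = [&](const std::string &s, std::vector<gpt_vocab::id> &out) {
            std::string word;
            for (char c : s) {
                if (c == ' ') {
                    if (!word.empty()) { out.push_back(vocab.token_to_id.at(word)); word.clear(); }
                } else {
                    word += c;
                }
            }
            if (!word.empty()) out.push_back(vocab.token_to_id.at(word));
        };

        for (auto id : tokenize_with_specials(vocab, "<|im_start|>hello world", plain))
            std::cout << id << ' ';
        std::cout << '\n';   // prints: 1 2 3
    }

Keeping the special-token list on gpt_vocab itself means each model loader only registers its own markers, while the scan-and-split logic lives in one place instead of being duplicated per backend.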