From d04e7d34cb86041264656da3a970fb67037f5aab Mon Sep 17 00:00:00 2001
From: Zach Nussbaum
Date: Thu, 29 Jun 2023 03:18:59 +0000
Subject: [PATCH] fix: current status

---
 gpt4all-api/triton/README.md            | 10 ++-
 gpt4all-api/triton/client.py            | 22 ++----
 gpt4all-api/triton/convert_to_triton.py | 91 +++++++++++--------------
 gpt4all-api/triton/requirements.txt     |  1 -
 gpt4all-api/triton/test_data.json       | 34 +++++++++
 gpt4all-api/triton/triton_config.pbtxt  | 13 +---
 6 files changed, 92 insertions(+), 79 deletions(-)
 create mode 100644 gpt4all-api/triton/test_data.json

diff --git a/gpt4all-api/triton/README.md b/gpt4all-api/triton/README.md
index 21ba3c94..9358d147 100644
--- a/gpt4all-api/triton/README.md
+++ b/gpt4all-api/triton/README.md
@@ -2,4 +2,12 @@
 docker run --gpus=1 --rm --net=host -v ${PWD}/model_store:/model_store nvcr.io/nvidia/tritonserver:23.01-py3 tritonserver --model-repository=/model_store
 
 
-python client.py --model=
\ No newline at end of file
+python client.py --model=
+
+
+## Dynamic Batching
+We still need to figure out how to batch requests so that dynamic batching actually kicks in.
+We're currently getting ~1.3 infer/sec, which seems slow.
+
+To test:
+perf_analyzer -m nomic-ai--gpt4all-j --input-data test_data.json --measurement-interval 25000 --request-rate-range=10 -b 8
\ No newline at end of file
diff --git a/gpt4all-api/triton/client.py b/gpt4all-api/triton/client.py
index 21e24b6b..7086392c 100644
--- a/gpt4all-api/triton/client.py
+++ b/gpt4all-api/triton/client.py
@@ -3,7 +3,7 @@
 import tritonclient.grpc.aio as grpcclient
 
 
 def prepare_inference_inputs(
-    inputs_ids: torch.IntTensor, new_tokens: int = 1, temperature: float = 1.0, top_k: int = 0, top_p: float = 1.0
+    inputs_ids: torch.IntTensor, new_tokens: int = 1, temperature: float = 1.0
 ):
     batch_size = inputs_ids.shape[0]
@@ -22,17 +22,7 @@ def prepare_inference_inputs(
         torch.full([batch_size, 1], temperature, dtype=torch.float32).cpu().numpy()
     )
 
-    top_k_input = grpcclient.InferInput("top_k", [batch_size, 1], "INT32")
-    top_k_input.set_data_from_numpy(
-        torch.full([batch_size, 1], top_k, dtype=torch.int32).cpu().numpy()
-    )
-
-    top_p_input = grpcclient.InferInput("top_p", [batch_size, 1], "FP32")
-    top_p_input.set_data_from_numpy(
-        torch.full([batch_size, 1], top_p, dtype=torch.float32).cpu().numpy()
-    )
-
-    inputs = [input_ids_input, new_tokens_input, temperature_input, top_k_input, top_p_input]
+    inputs = [input_ids_input, new_tokens_input, temperature_input]
     outputs = [
         grpcclient.InferRequestedOutput("logits"),
         grpcclient.InferRequestedOutput("output_ids"),
@@ -41,9 +31,9 @@
 
 
 async def infer(
-    triton_client, model_name, input_ids, new_tokens: int = 1, temperature: float = 1.0, top_k: int = 0, top_p: float = 1.0
+    triton_client, model_name, input_ids, new_tokens: int = 1, temperature: float = 1.0
 ):
-    inputs, outputs = prepare_inference_inputs(input_ids, new_tokens, temperature, top_k, top_p)
+    inputs, outputs = prepare_inference_inputs(input_ids, new_tokens, temperature)
 
     triton_model_name = model_name.replace("/", "--")
 
@@ -69,7 +59,7 @@ if __name__ == "__main__":
 
     args = parser.parse_args()
 
-    tokenizer = AutoTokenizer.from_pretrained(args.model)
+    tokenizer = AutoTokenizer.from_pretrained(args.model, use_fast=False)
 
     async def main():
         async with Client(args.url) as triton_client:
@@ -77,7 +67,7 @@
                 prompt = input("Prompt: ")
                 input_ids = tokenizer.encode(prompt, return_tensors="pt")
                 last_logits, output_ids = await infer(
-                    triton_client, args.model, input_ids, new_tokens=128, temperature=1.0, top_k=0, top_p=0.9,
+                    triton_client, args.model, input_ids, new_tokens=256, temperature=1.0,
                 )
 
                 print(tokenizer.decode(output_ids[0]))
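For reference, here is a minimal sketch (not part of the patch) of driving the simplified client programmatically instead of through the interactive prompt loop. It assumes `Client` and `infer` are importable from `client.py`, that the Triton server from the README is reachable on its default gRPC port (`localhost:8001`), and that the converted model is `nomic-ai/gpt4all-j` (the Triton model name is the Hugging Face name with `/` replaced by `--`, as in the `perf_analyzer` command above).

```python
import asyncio

from transformers import AutoTokenizer

# Assumed imports from gpt4all-api/triton/client.py: `infer` is the helper shown in
# the diff above, and `Client` is the async gRPC client class that client.py itself
# uses in `async with Client(args.url) as triton_client`.
from client import Client, infer


async def run_once(prompt: str,
                   model: str = "nomic-ai/gpt4all-j",   # illustrative model name
                   url: str = "localhost:8001"):        # assumed default gRPC port
    tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    async with Client(url) as triton_client:
        # only input_ids, the number of new tokens, and temperature are sent now;
        # top_k/top_p were dropped from the model's inputs in this patch
        last_logits, output_ids = await infer(
            triton_client, model, input_ids, new_tokens=64, temperature=1.0
        )
    return tokenizer.decode(output_ids[0])


if __name__ == "__main__":
    print(asyncio.run(run_once("Hi, how are you?")))
```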
diff --git a/gpt4all-api/triton/convert_to_triton.py b/gpt4all-api/triton/convert_to_triton.py
index c1e6560c..dd996843 100644
--- a/gpt4all-api/triton/convert_to_triton.py
+++ b/gpt4all-api/triton/convert_to_triton.py
@@ -5,7 +5,6 @@ from string import Template
 import torch
 from torch import nn
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from gpt4all.falcon.modelling_RW import RWForCausalLM
 
 
 parser = argparse.ArgumentParser()
@@ -14,7 +13,7 @@ parser.add_argument(
 )
 
 parser.add_argument(
-    "--max-batch-size", type=int, default=4, help="Maximum batch size for inference"
+    "--max-batch-size", type=int, default=64, help="Maximum batch size for inference"
 )
 
 parser.add_argument(
@@ -51,91 +50,83 @@ class InferModel(nn.Module):
         input_ids: torch.Tensor,
         tensor_of_seq_len: torch.Tensor,
         temperature: torch.Tensor,
-        top_k: torch.Tensor,
-        top_p: torch.Tensor,
     ):
+        # adapted mostly from Hugging Face's `generate` sampling loop
+        unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device)
+        eos_token_id_tensor = torch.tensor([self.eos_token_id]).to(input_ids.device)
+
         with torch.no_grad():
             for _ in range(tensor_of_seq_len.shape[1] - 1):
                 logits = self.traced_model(input_ids).float()
                 next_token_logits = logits[:, -1, :]
                 next_token_logits = next_token_logits / temperature
-
-                next_token_logits = self.top_k(next_token_logits, top_k)
-                next_token_logits = self.top_p(next_token_logits, top_p)
-                next_token = torch.multinomial(
-                    torch.softmax(next_token_logits, dim=-1), 1
-                ).squeeze(1)
-                # early break
-                if next_token.item() == self.eos_token_id:
+                next_tokens = torch.multinomial(
+                    torch.softmax(next_token_logits, dim=-1), num_samples=1
+                ).squeeze(1)
+
+                next_tokens = next_tokens * unfinished_sequences + self.eos_token_id * (1 - unfinished_sequences)
+
+                unfinished_sequences = unfinished_sequences.mul(
+                    next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
+                )
+
+                # stop when each sentence is finished
+                if unfinished_sequences.max() == 0:
                     return input_ids.int(), logits
 
-                input_ids = torch.cat([input_ids, next_token.unsqueeze(1)], dim=1)
+                input_ids = torch.cat([input_ids, next_tokens.unsqueeze(1)], dim=-1)
+
+                unfinished_sequences = unfinished_sequences.mul(
+                    next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
+                )
 
             # in TorchScript, the above logits var lifetime doesn't escape the loop's scope
             logits = self.traced_model(input_ids).float()
             next_token_logits = logits[:, -1, :]
             next_token_logits = next_token_logits / temperature
 
-            next_token_logits = self.top_k(next_token_logits, top_k)
-            next_token_logits = self.top_p(next_token_logits, top_p)
+            next_tokens = torch.multinomial(
+                torch.softmax(next_token_logits, dim=-1), num_samples=1
+            ).squeeze(1)
 
-            next_token = torch.multinomial(
-                torch.softmax(next_token_logits, dim=-1), 1
-            ).squeeze(1)
+            next_tokens = next_tokens * unfinished_sequences + self.eos_token_id * (1 - unfinished_sequences)
 
-            input_ids = torch.cat([input_ids, next_token.unsqueeze(1)], dim=1)
+            input_ids = torch.cat([input_ids, next_tokens.unsqueeze(1)], dim=-1)
 
         return input_ids.int(), logits
 
-    def top_p(self, scores: torch.Tensor, top_p: torch.Tensor):
-        if top_p.squeeze().item() >= 1.0:
-            return scores
-        sorted_logits, sorted_indices = torch.sort(scores, descending=False)
-        cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
-
-        # Remove tokens with cumulative top_p above the threshold (token with 0 are kept)
-        sorted_indices_to_remove = cumulative_probs <= (1 - top_p)
-
-        # scatter sorted tensors to original indexing
-        indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
-        scores[indices_to_remove] = float("-inf")
-        return scores
-
-    def top_k(self, scores: torch.Tensor, top_k: torch.Tensor):
-        if top_k.squeeze().item() <= 0:
-            return scores
-        # Remove all tokens with a probability less than the last token of the top-k
-        indices_to_remove = scores < torch.topk(scores, top_k.squeeze().item())[0][..., -1, None]
-        scores[indices_to_remove] = float("-inf")
-        return scores
-
 
 print(f"Converting {args.model} to TorchScript...")
-tokenizer = AutoTokenizer.from_pretrained(args.model)
-model = ModelLogits(AutoModelForCausalLM.from_pretrained(args.model, trust_remote_code=True, revision=args.revision))
+tokenizer = AutoTokenizer.from_pretrained(args.model, use_fast=False)
+model = ModelLogits(AutoModelForCausalLM.from_pretrained(args.model,
+                                                         trust_remote_code=True,
+                                                         revision=args.revision,
+                                                         torch_dtype=torch.float16,
+                                                         use_cache=False))
+
 model.eval()
 model.requires_grad_(False)
-model = model.half().to(device)
+model = model.to(device)
+
 
 input = tokenizer("annotator model's hash is 0x", return_tensors="pt").to(device)
 print(f"{model(input.input_ids)=}")
 
 traced_script_module = torch.jit.trace(model, input.input_ids)
-
+print("Tracing...")
 print(f"{traced_script_module(input.input_ids)=}")
 
 print("Scripting generation wrapper...")
-
 # need to script this as we have data conditional flow
 scripted_generator_model = torch.jit.script(InferModel(traced_script_module, tokenizer.eos_token_id))
 print(scripted_generator_model.code)
 
 print(f"{input.input_ids=}")
-x = input.input_ids, torch.empty(1, 5), torch.full([1, 1], 1.0).cuda(), torch.full([1, 1], len(tokenizer) // 2).cuda(), torch.full([1, 1], 0.9).cuda()
-# x = input.input_ids, torch.empty(1, 5), torch.full([1, 1], 1.0), torch.full([1, 1], len(tokenizer) // 2), torch.full([1, 1], 0.9)
-# print(f"{(scripted_generator_model(*x))=}")
+# x = input.input_ids, torch.empty(1, 5), torch.full([1, 1], 1.0).cuda(), torch.full([1, 1], len(tokenizer) // 2).cuda(), torch.full([1, 1], 0.9).cuda()
+x = input.input_ids, torch.empty(1, 5), torch.full([1, 1], 0.9).cuda()
+print(x[0].shape)
+
 print(f"{tokenizer.decode(scripted_generator_model(*x)[0][0])=}")
 
 sanitized_name = args.model.replace("/", "--")
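The rewritten `InferModel.forward` above follows the EOS-masking pattern from Hugging Face's sampling loop so that batched sequences can finish at different times. As a plain eager-mode sketch of that pattern (illustrative only; `model` stands for any callable that maps `[batch, seq]` token ids to `[batch, seq, vocab]` logits, such as the traced `ModelLogits` wrapper):

```python
import torch


def sample_batch(model, input_ids: torch.Tensor, eos_token_id: int,
                 new_tokens: int, temperature: float = 1.0) -> torch.Tensor:
    # 1 while a sequence is still generating, 0 once it has produced EOS
    unfinished = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device)
    for _ in range(new_tokens):
        logits = model(input_ids).float()
        next_token_logits = logits[:, -1, :] / temperature
        probs = torch.softmax(next_token_logits, dim=-1)
        # one sampled token per row of the batch
        next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
        # rows that already finished keep emitting EOS instead of sampled tokens
        next_tokens = next_tokens * unfinished + eos_token_id * (1 - unfinished)
        input_ids = torch.cat([input_ids, next_tokens.unsqueeze(1)], dim=-1)
        # a row becomes finished once it emits EOS
        unfinished = unfinished * next_tokens.ne(eos_token_id).long()
        if unfinished.max() == 0:
            break
    return input_ids
```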
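The `test_data.json` added below follows `perf_analyzer`'s `--input-data` JSON format: a `data` list with one object per request, keyed by input name with `content` and `shape`. Here is a sketch for producing the same file from an arbitrary prompt (the model name is illustrative, and only the length of `tensor_of_seq_len` matters, since the scripted loop uses its shape to decide how many tokens to generate):

```python
import json

from transformers import AutoTokenizer

# illustrative model name; use the same tokenizer as the converted model
tokenizer = AutoTokenizer.from_pretrained("nomic-ai/gpt4all-j", use_fast=False)

prompt = "Hi, how are you?"
new_tokens = 17  # length of tensor_of_seq_len bounds how many tokens get generated
ids = tokenizer.encode(prompt)

entry = {
    "input_ids": {"content": ids, "shape": [len(ids)]},
    "tensor_of_seq_len": {"content": [1] * new_tokens, "shape": [new_tokens]},  # values are placeholders
    "temperature": {"content": [1.0], "shape": [1]},
}

with open("test_data.json", "w") as f:
    json.dump({"data": [entry, entry]}, f, indent=2)
```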
diff --git a/gpt4all-api/triton/triton_config.pbtxt b/gpt4all-api/triton/triton_config.pbtxt
index 35417c61..8666fe08 100644
--- a/gpt4all-api/triton/triton_config.pbtxt
+++ b/gpt4all-api/triton/triton_config.pbtxt
@@ -3,7 +3,8 @@ backend: "pytorch"
 default_model_filename: "traced-model.pt"
 max_batch_size: ${max_batch_size}
 
-dynamic_batching { }
+dynamic_batching {
+}
 
 parameters {
   key: "model_name"
@@ -35,16 +36,6 @@
   {
     name: "temperature"
     data_type: TYPE_FP32
    dims: [-1]
-  },
-  {
-    name: "top_k"
-    data_type: TYPE_INT32
-    dims: [-1]
-  },
-  {
-    name: "top_p"
-    data_type: TYPE_FP32
-    dims: [-1]
   }
 ]
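On the README's open dynamic-batching question: the empty `dynamic_batching {}` block only enables Triton's batcher with its default settings. A possible next step (illustrative values, not part of the patch) is to give the scheduler preferred batch sizes and a small queue delay so that overlapping requests actually get grouped:

```
dynamic_batching {
  preferred_batch_size: [ 8, 16, 32 ]
  max_queue_delay_microseconds: 5000
}
```

Whether this moves the ~1.3 infer/sec number also depends on how much concurrent load `perf_analyzer` generates (the `--request-rate-range` flag above), since the dynamic batcher can only group requests that overlap in its queue.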