From d04e7d34cb86041264656da3a970fb67037f5aab Mon Sep 17 00:00:00 2001
From: Zach Nussbaum
Date: Thu, 29 Jun 2023 03:18:59 +0000
Subject: [PATCH] fix: current status

---
 gpt4all-api/triton/README.md            | 10 ++-
 gpt4all-api/triton/client.py            | 22 ++----
 gpt4all-api/triton/convert_to_triton.py | 91 +++++++++++--------------
 gpt4all-api/triton/requirements.txt     |  1 -
 gpt4all-api/triton/test_data.json       | 34 +++++++++
 gpt4all-api/triton/triton_config.pbtxt  | 13 +---
 6 files changed, 92 insertions(+), 79 deletions(-)
 create mode 100644 gpt4all-api/triton/test_data.json

diff --git a/gpt4all-api/triton/README.md b/gpt4all-api/triton/README.md
index 21ba3c94..9358d147 100644
--- a/gpt4all-api/triton/README.md
+++ b/gpt4all-api/triton/README.md
@@ -2,4 +2,12 @@
 docker run --gpus=1 --rm --net=host -v ${PWD}/model_store:/model_store nvcr.io/nvidia/tritonserver:23.01-py3 tritonserver --model-repository=/model_store
 
 
-python client.py --model=
\ No newline at end of file
+python client.py --model=
+
+
+## Dynamic Batching
+We still need to figure out how to batch requests so that dynamic batching actually kicks in.
+We're currently getting ~1.3 infer/sec, which seems slow.
+
+To test:
+perf_analyzer -m nomic-ai--gpt4all-j --input-data test_data.json --measurement-interval 25000 --request-rate-range=10 -b 8
\ No newline at end of file
diff --git a/gpt4all-api/triton/client.py b/gpt4all-api/triton/client.py
index 21e24b6b..7086392c 100644
--- a/gpt4all-api/triton/client.py
+++ b/gpt4all-api/triton/client.py
@@ -3,7 +3,7 @@
 import tritonclient.grpc.aio as grpcclient
 
 
 def prepare_inference_inputs(
-    inputs_ids: torch.IntTensor, new_tokens: int = 1, temperature: float = 1.0, top_k: int = 0, top_p: float = 1.0
+    inputs_ids: torch.IntTensor, new_tokens: int = 1, temperature: float = 1.0
 ):
     batch_size = inputs_ids.shape[0]
@@ -22,17 +22,7 @@ def prepare_inference_inputs(
         torch.full([batch_size, 1], temperature, dtype=torch.float32).cpu().numpy()
     )
 
-    top_k_input = grpcclient.InferInput("top_k", [batch_size, 1], "INT32")
-    top_k_input.set_data_from_numpy(
-        torch.full([batch_size, 1], top_k, dtype=torch.int32).cpu().numpy()
-    )
-
-    top_p_input = grpcclient.InferInput("top_p", [batch_size, 1], "FP32")
-    top_p_input.set_data_from_numpy(
-        torch.full([batch_size, 1], top_p, dtype=torch.float32).cpu().numpy()
-    )
-
-    inputs = [input_ids_input, new_tokens_input, temperature_input, top_k_input, top_p_input]
+    inputs = [input_ids_input, new_tokens_input, temperature_input]
     outputs = [
         grpcclient.InferRequestedOutput("logits"),
         grpcclient.InferRequestedOutput("output_ids"),
@@ -41,9 +31,9 @@
 
 
 async def infer(
-    triton_client, model_name, input_ids, new_tokens: int = 1, temperature: float = 1.0, top_k: int = 0, top_p: float = 1.0
+    triton_client, model_name, input_ids, new_tokens: int = 1, temperature: float = 1.0
 ):
-    inputs, outputs = prepare_inference_inputs(input_ids, new_tokens, temperature, top_k, top_p)
+    inputs, outputs = prepare_inference_inputs(input_ids, new_tokens, temperature)
 
     triton_model_name = model_name.replace("/", "--")
 
@@ -69,7 +59,7 @@ if __name__ == "__main__":
 
     args = parser.parse_args()
 
-    tokenizer = AutoTokenizer.from_pretrained(args.model)
+    tokenizer = AutoTokenizer.from_pretrained(args.model, use_fast=False)
 
     async def main():
         async with Client(args.url) as triton_client:
@@ -77,7 +67,7 @@
                 prompt = input("Prompt: ")
                 input_ids = tokenizer.encode(prompt, return_tensors="pt")
                 last_logits, output_ids = await infer(
-                    triton_client, args.model, input_ids, new_tokens=128, temperature=1.0, top_k=0, top_p=0.9,
+                    triton_client, args.model, input_ids, new_tokens=256, temperature=1.0,
                 )
 
                 print(tokenizer.decode(output_ids[0]))
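For reference, here is a minimal sketch (not part of the patch) of driving the simplified client programmatically instead of through the interactive prompt loop. It assumes `Client` and `infer` are importable from `client.py`, that the Triton server from the README is reachable on its default gRPC port (`localhost:8001`), and that the converted model is `nomic-ai/gpt4all-j` (the Triton model name is the Hugging Face name with `/` replaced by `--`, as in the `perf_analyzer` command above).

```python
import asyncio

from transformers import AutoTokenizer

# Assumed imports from gpt4all-api/triton/client.py: `infer` is the helper shown in
# the diff above, and `Client` is the async gRPC client class that client.py itself
# uses in `async with Client(args.url) as triton_client`.
from client import Client, infer


async def run_once(prompt: str,
                   model: str = "nomic-ai/gpt4all-j",   # illustrative model name
                   url: str = "localhost:8001"):        # assumed default gRPC port
    tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    async with Client(url) as triton_client:
        # only input_ids, the number of new tokens, and temperature are sent now;
        # top_k/top_p were dropped from the model's inputs in this patch
        last_logits, output_ids = await infer(
            triton_client, model, input_ids, new_tokens=64, temperature=1.0
        )
    return tokenizer.decode(output_ids[0])


if __name__ == "__main__":
    print(asyncio.run(run_once("Hi, how are you?")))
```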
diff --git a/gpt4all-api/triton/convert_to_triton.py b/gpt4all-api/triton/convert_to_triton.py
index c1e6560c..dd996843 100644
--- a/gpt4all-api/triton/convert_to_triton.py
+++ b/gpt4all-api/triton/convert_to_triton.py
@@ -5,7 +5,6 @@ from string import Template
 import torch
 from torch import nn
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from gpt4all.falcon.modelling_RW import RWForCausalLM
 
 
 parser = argparse.ArgumentParser()
@@ -14,7 +13,7 @@ parser.add_argument(
 )
 
 parser.add_argument(
-    "--max-batch-size", type=int, default=4, help="Maximum batch size for inference"
+    "--max-batch-size", type=int, default=64, help="Maximum batch size for inference"
 )
 
 parser.add_argument(
@@ -51,91 +50,83 @@ class InferModel(nn.Module):
         input_ids: torch.Tensor,
         tensor_of_seq_len: torch.Tensor,
         temperature: torch.Tensor,
-        top_k: torch.Tensor,
-        top_p: torch.Tensor,
     ):
+        # adapted mostly from Hugging Face's `generate` sampling loop
+        unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device)
+        eos_token_id_tensor = torch.tensor([self.eos_token_id]).to(input_ids.device)
+
         with torch.no_grad():
             for _ in range(tensor_of_seq_len.shape[1] - 1):
                 logits = self.traced_model(input_ids).float()
                 next_token_logits = logits[:, -1, :]
                 next_token_logits = next_token_logits / temperature
-
-                next_token_logits = self.top_k(next_token_logits, top_k)
-                next_token_logits = self.top_p(next_token_logits, top_p)
-                next_token = torch.multinomial(
-                    torch.softmax(next_token_logits, dim=-1), 1
-                ).squeeze(1)
-                # early break
-                if next_token.item() == self.eos_token_id:
+                next_tokens = torch.multinomial(
+                    torch.softmax(next_token_logits, dim=-1), num_samples=1
+                ).squeeze(1)
+
+                next_tokens = next_tokens * unfinished_sequences + self.eos_token_id * (1 - unfinished_sequences)
+
+                unfinished_sequences = unfinished_sequences.mul(
+                    next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
+                )
+
+                # stop when each sentence is finished
+                if unfinished_sequences.max() == 0:
                     return input_ids.int(), logits
 
-                input_ids = torch.cat([input_ids, next_token.unsqueeze(1)], dim=1)
+                input_ids = torch.cat([input_ids, next_tokens.unsqueeze(1)], dim=-1)
+
+                unfinished_sequences = unfinished_sequences.mul(
+                    next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
+                )
 
             # in TorchScript, the above logits var lifetime doesn't escape the loop's scope
             logits = self.traced_model(input_ids).float()
             next_token_logits = logits[:, -1, :]
             next_token_logits = next_token_logits / temperature
 
-            next_token_logits = self.top_k(next_token_logits, top_k)
-            next_token_logits = self.top_p(next_token_logits, top_p)
+            next_tokens = torch.multinomial(
+                torch.softmax(next_token_logits, dim=-1), num_samples=1
+            ).squeeze(1)
 
-            next_token = torch.multinomial(
-                torch.softmax(next_token_logits, dim=-1), 1
-            ).squeeze(1)
+            next_tokens = next_tokens * unfinished_sequences + self.eos_token_id * (1 - unfinished_sequences)
 
-            input_ids = torch.cat([input_ids, next_token.unsqueeze(1)], dim=1)
+            input_ids = torch.cat([input_ids, next_tokens.unsqueeze(1)], dim=-1)
 
         return input_ids.int(), logits
 
-    def top_p(self, scores: torch.Tensor, top_p: torch.Tensor):
-        if top_p.squeeze().item() >= 1.0:
-            return scores
-        sorted_logits, sorted_indices = torch.sort(scores, descending=False)
-        cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
-
-        # Remove tokens with cumulative top_p above the threshold (token with 0 are kept)
-        sorted_indices_to_remove = cumulative_probs <= (1 - top_p)
-
-        # scatter sorted tensors to original indexing
-        indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
-        scores[indices_to_remove] = float("-inf")
-        return scores
-
-    def top_k(self, scores: torch.Tensor, top_k: torch.Tensor):
-        if top_k.squeeze().item() <= 0:
-            return scores
-        # Remove all tokens with a probability less than the last token of the top-k
-        indices_to_remove = scores < torch.topk(scores, top_k.squeeze().item())[0][..., -1, None]
-        scores[indices_to_remove] = float("-inf")
-        return scores
-
 
 print(f"Converting {args.model} to TorchScript...")
-tokenizer = AutoTokenizer.from_pretrained(args.model)
-model = ModelLogits(AutoModelForCausalLM.from_pretrained(args.model, trust_remote_code=True, revision=args.revision))
+tokenizer = AutoTokenizer.from_pretrained(args.model, use_fast=False)
+model = ModelLogits(AutoModelForCausalLM.from_pretrained(args.model,
+                                                         trust_remote_code=True,
+                                                         revision=args.revision,
+                                                         torch_dtype=torch.float16,
+                                                         use_cache=False))
+
 model.eval()
 model.requires_grad_(False)
-model = model.half().to(device)
+model = model.to(device)
+
 
 input = tokenizer("annotator model's hash is 0x", return_tensors="pt").to(device)
 print(f"{model(input.input_ids)=}")
 
 traced_script_module = torch.jit.trace(model, input.input_ids)
-
+print("Tracing...")
 print(f"{traced_script_module(input.input_ids)=}")
 
 print("Scripting generation wrapper...")
-
 # need to script this as we have data conditional flow
 scripted_generator_model = torch.jit.script(InferModel(traced_script_module, tokenizer.eos_token_id))
 print(scripted_generator_model.code)
 
 print(f"{input.input_ids=}")
-x = input.input_ids, torch.empty(1, 5), torch.full([1, 1], 1.0).cuda(), torch.full([1, 1], len(tokenizer) // 2).cuda(), torch.full([1, 1], 0.9).cuda()
-# x = input.input_ids, torch.empty(1, 5), torch.full([1, 1], 1.0), torch.full([1, 1], len(tokenizer) // 2), torch.full([1, 1], 0.9)
-# print(f"{(scripted_generator_model(*x))=}")
+# x = input.input_ids, torch.empty(1, 5), torch.full([1, 1], 1.0).cuda(), torch.full([1, 1], len(tokenizer) // 2).cuda(), torch.full([1, 1], 0.9).cuda()
+x = input.input_ids, torch.empty(1, 5), torch.full([1, 1], 0.9).cuda()
+print(x[0].shape)
+
 print(f"{tokenizer.decode(scripted_generator_model(*x)[0][0])=}")
 
 sanitized_name = args.model.replace("/", "--")
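The rewritten `InferModel.forward` above follows the EOS-masking pattern from Hugging Face's sampling loop so that batched sequences can finish at different times. As a plain eager-mode sketch of that pattern (illustrative only; `model` stands for any callable that maps `[batch, seq]` token ids to `[batch, seq, vocab]` logits, such as the traced `ModelLogits` wrapper):

```python
import torch


def sample_batch(model, input_ids: torch.Tensor, eos_token_id: int,
                 new_tokens: int, temperature: float = 1.0) -> torch.Tensor:
    # 1 while a sequence is still generating, 0 once it has produced EOS
    unfinished = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device)
    for _ in range(new_tokens):
        logits = model(input_ids).float()
        next_token_logits = logits[:, -1, :] / temperature
        probs = torch.softmax(next_token_logits, dim=-1)
        # one sampled token per row of the batch
        next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
        # rows that already finished keep emitting EOS instead of sampled tokens
        next_tokens = next_tokens * unfinished + eos_token_id * (1 - unfinished)
        input_ids = torch.cat([input_ids, next_tokens.unsqueeze(1)], dim=-1)
        # a row becomes finished once it emits EOS
        unfinished = unfinished * next_tokens.ne(eos_token_id).long()
        if unfinished.max() == 0:
            break
    return input_ids
```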
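The `test_data.json` added below follows `perf_analyzer`'s `--input-data` JSON format: a `data` list with one object per request, keyed by input name with `content` and `shape`. Here is a sketch for producing the same file from an arbitrary prompt (the model name is illustrative, and only the length of `tensor_of_seq_len` matters, since the scripted loop uses its shape to decide how many tokens to generate):

```python
import json

from transformers import AutoTokenizer

# illustrative model name; use the same tokenizer as the converted model
tokenizer = AutoTokenizer.from_pretrained("nomic-ai/gpt4all-j", use_fast=False)

prompt = "Hi, how are you?"
new_tokens = 17  # length of tensor_of_seq_len bounds how many tokens get generated
ids = tokenizer.encode(prompt)

entry = {
    "input_ids": {"content": ids, "shape": [len(ids)]},
    "tensor_of_seq_len": {"content": [1] * new_tokens, "shape": [new_tokens]},  # values are placeholders
    "temperature": {"content": [1.0], "shape": [1]},
}

with open("test_data.json", "w") as f:
    json.dump({"data": [entry, entry]}, f, indent=2)
```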
diff --git a/gpt4all-api/triton/triton_config.pbtxt b/gpt4all-api/triton/triton_config.pbtxt
index 35417c61..8666fe08 100644
--- a/gpt4all-api/triton/triton_config.pbtxt
+++ b/gpt4all-api/triton/triton_config.pbtxt
@@ -3,7 +3,8 @@ backend: "pytorch"
 default_model_filename: "traced-model.pt"
 max_batch_size: ${max_batch_size}
 
-dynamic_batching { }
+dynamic_batching {
+}
 
 parameters {
   key: "model_name"
@@ -35,16 +36,6 @@
   {
     name: "temperature"
     data_type: TYPE_FP32
    dims: [-1]
-  },
-  {
-    name: "top_k"
-    data_type: TYPE_INT32
-    dims: [-1]
-  },
-  {
-    name: "top_p"
-    data_type: TYPE_FP32
-    dims: [-1]
   }
 ]
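On the README's open dynamic-batching question: the empty `dynamic_batching {}` block only enables Triton's batcher with its default settings. A possible next step (illustrative values, not part of the patch) is to give the scheduler preferred batch sizes and a small queue delay so that overlapping requests actually get grouped:

```
dynamic_batching {
  preferred_batch_size: [ 8, 16, 32 ]
  max_queue_delay_microseconds: 5000
}
```

Whether this moves the ~1.3 infer/sec number also depends on how much concurrent load `perf_analyzer` generates (the `--request-rate-range` flag above), since the dynamic batcher can only group requests that overlap in its queue.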