mirror of https://github.com/nomic-ai/gpt4all.git (synced 2025-08-09 11:57:23 +00:00)

fix: current status

This commit is contained in:
parent dedc494a7f
commit d04e7d34cb
@@ -3,3 +3,11 @@
 docker run --gpus=1 --rm --net=host -v ${PWD}/model_store:/model_store nvcr.io/nvidia/tritonserver:23.01-py3 tritonserver --model-repository=/model_store

 python client.py --model=<model_name>
+
+## Dynamic Batching
+
+Need to figure out how to do batching so that we can use dynamic batching.
+
+We're getting 1.3 infer/sec, which seems slow.
+
+To test:
+
+perf_analyzer -m nomic-ai--gpt4all-j --input-data test_data.json --measurement-interval 25000 --request-rate-range=10 -b 8
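Triton's dynamic batcher only has something to batch when several requests are in flight at once, so a single sequential prompt loop like the one in client.py will rarely exceed one request per model execution. Below is a minimal sketch of driving concurrent requests with asyncio.gather, assuming the `Client` context manager and `infer` coroutine from client.py (the import path, model name, and endpoint are illustrative assumptions):

```python
import asyncio

from transformers import AutoTokenizer

# hypothetical import path: Client and infer as defined in client.py
from client import Client, infer

MODEL = "nomic-ai/gpt4all-j"   # assumed model name (maps to nomic-ai--gpt4all-j on the server)
URL = "localhost:8001"         # Triton's default gRPC port


async def one_request(triton_client, tokenizer, prompt):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    # each call is an independent gRPC request; the dynamic batcher can group
    # whatever arrives within its queue window into one model execution
    return await infer(triton_client, MODEL, input_ids, new_tokens=64, temperature=1.0)


async def main():
    tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=False)
    prompts = ["Hi, how are you?"] * 16
    async with Client(URL) as triton_client:
        results = await asyncio.gather(*(one_request(triton_client, tokenizer, p) for p in prompts))
    print(f"finished {len(results)} concurrent requests")


asyncio.run(main())
```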
@@ -3,7 +3,7 @@ import tritonclient.grpc.aio as grpcclient


 def prepare_inference_inputs(
-    inputs_ids: torch.IntTensor, new_tokens: int = 1, temperature: float = 1.0, top_k: int = 0, top_p: float = 1.0
+    inputs_ids: torch.IntTensor, new_tokens: int = 1, temperature: float = 1.0
 ):
     batch_size = inputs_ids.shape[0]

@@ -22,17 +22,7 @@ def prepare_inference_inputs(
         torch.full([batch_size, 1], temperature, dtype=torch.float32).cpu().numpy()
     )

-    top_k_input = grpcclient.InferInput("top_k", [batch_size, 1], "INT32")
-    top_k_input.set_data_from_numpy(
-        torch.full([batch_size, 1], top_k, dtype=torch.int32).cpu().numpy()
-    )
-
-    top_p_input = grpcclient.InferInput("top_p", [batch_size, 1], "FP32")
-    top_p_input.set_data_from_numpy(
-        torch.full([batch_size, 1], top_p, dtype=torch.float32).cpu().numpy()
-    )
-
-    inputs = [input_ids_input, new_tokens_input, temperature_input, top_k_input, top_p_input]
+    inputs = [input_ids_input, new_tokens_input, temperature_input]
     outputs = [
         grpcclient.InferRequestedOutput("logits"),
         grpcclient.InferRequestedOutput("output_ids"),
@@ -41,9 +31,9 @@ def prepare_inference_inputs(


 async def infer(
-    triton_client, model_name, input_ids, new_tokens: int = 1, temperature: float = 1.0, top_k: int = 0, top_p: float = 1.0
+    triton_client, model_name, input_ids, new_tokens: int = 1, temperature: float = 1.0
 ):
-    inputs, outputs = prepare_inference_inputs(input_ids, new_tokens, temperature, top_k, top_p)
+    inputs, outputs = prepare_inference_inputs(input_ids, new_tokens, temperature)

     triton_model_name = model_name.replace("/", "--")

@@ -69,7 +59,7 @@ if __name__ == "__main__":

     args = parser.parse_args()

-    tokenizer = AutoTokenizer.from_pretrained(args.model)
+    tokenizer = AutoTokenizer.from_pretrained(args.model, use_fast=False)

     async def main():
         async with Client(args.url) as triton_client:
@@ -77,7 +67,7 @@ if __name__ == "__main__":
             prompt = input("Prompt: ")
             input_ids = tokenizer.encode(prompt, return_tensors="pt")
             last_logits, output_ids = await infer(
-                triton_client, args.model, input_ids, new_tokens=128, temperature=1.0, top_k=0, top_p=0.9,
+                triton_client, args.model, input_ids, new_tokens=256, temperature=1.0,
             )
             print(tokenizer.decode(output_ids[0]))

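Independently of server-side batching, `prepare_inference_inputs` already derives `batch_size` from the leading dimension of `input_ids` and builds the temperature tensor with shape `[batch_size, 1]`, so one request can carry a client-side batch. A rough sketch, assuming prompts that tokenize to the same length (the traced model takes only `input_ids` and no attention mask, so ragged batches would need padding it does not handle):

```python
import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("nomic-ai/gpt4all-j", use_fast=False)  # assumed model

# identical prompts guarantee identical token lengths for this illustration
prompts = ["Hi, how are you?", "Hi, how are you?"]
input_ids = torch.cat(
    [tokenizer.encode(p, return_tensors="pt") for p in prompts], dim=0
)  # shape [batch_size, seq_len]

# passing this tensor to infer(...) sends a single request whose batch dimension is 2, e.g.
#   await infer(triton_client, "nomic-ai/gpt4all-j", input_ids, new_tokens=64, temperature=1.0)
print(input_ids.shape)
```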
@@ -5,7 +5,6 @@ from string import Template
 import torch
 from torch import nn
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from gpt4all.falcon.modelling_RW import RWForCausalLM

 parser = argparse.ArgumentParser()

|
|||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--max-batch-size", type=int, default=4, help="Maximum batch size for inference"
|
"--max-batch-size", type=int, default=64, help="Maximum batch size for inference"
|
||||||
)
|
)
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
@@ -51,91 +50,83 @@ class InferModel(nn.Module):
         input_ids: torch.Tensor,
         tensor_of_seq_len: torch.Tensor,
         temperature: torch.Tensor,
-        top_k: torch.Tensor,
-        top_p: torch.Tensor,
     ):
+        # this has mostly been adapted from huggingface generate
+        unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device)
+        eos_token_id_tensor = torch.tensor([self.eos_token_id]).to(input_ids.device)
+
         with torch.no_grad():
             for _ in range(tensor_of_seq_len.shape[1] - 1):
                 logits = self.traced_model(input_ids).float()
                 next_token_logits = logits[:, -1, :]
                 next_token_logits = next_token_logits / temperature

-                next_token_logits = self.top_k(next_token_logits, top_k)
-                next_token_logits = self.top_p(next_token_logits, top_p)
-
-                next_token = torch.multinomial(
-                    torch.softmax(next_token_logits, dim=-1), 1
-                ).squeeze(1)
-                # early break
-                if next_token.item() == self.eos_token_id:
+                next_tokens = torch.multinomial(
+                    torch.softmax(next_token_logits, dim=-1), input_ids.shape[0]
+                )
+
+                next_tokens = next_tokens * unfinished_sequences + self.eos_token_id * (1 - unfinished_sequences)
+
+                unfinished_sequences = unfinished_sequences.mul(
+                    next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
+                )
+
+                # stop when each sentence is finished
+                if unfinished_sequences.max() == 0:
                     return input_ids.int(), logits

-                input_ids = torch.cat([input_ids, next_token.unsqueeze(1)], dim=1)
+                input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+
+                unfinished_sequences = unfinished_sequences.mul(
+                    next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
+                )

             # in TorchScript, the above logits var lifetime doesn't escape the loop's scope
             logits = self.traced_model(input_ids).float()
             next_token_logits = logits[:, -1, :]
             next_token_logits = next_token_logits / temperature

-            next_token_logits = self.top_k(next_token_logits, top_k)
-            next_token_logits = self.top_p(next_token_logits, top_p)
-
-            next_token = torch.multinomial(
-                torch.softmax(next_token_logits, dim=-1), 1
-            ).squeeze(1)
+            next_tokens = torch.multinomial(
+                torch.softmax(next_token_logits, dim=-1), input_ids.shape[0]
+            )
+
+            next_tokens = next_tokens * unfinished_sequences + self.eos_token_id * (1 - unfinished_sequences)

-            input_ids = torch.cat([input_ids, next_token.unsqueeze(1)], dim=1)
+            input_ids = torch.cat([input_ids, next_tokens], dim=-1)

             return input_ids.int(), logits

-    def top_p(self, scores: torch.Tensor, top_p: torch.Tensor):
-        if top_p.squeeze().item() >= 1.0:
-            return scores
-        sorted_logits, sorted_indices = torch.sort(scores, descending=False)
-        cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
-
-        # Remove tokens with cumulative top_p above the threshold (token with 0 are kept)
-        sorted_indices_to_remove = cumulative_probs <= (1 - top_p)
-
-        # scatter sorted tensors to original indexing
-        indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
-        scores[indices_to_remove] = float("-inf")
-        return scores
-
-    def top_k(self, scores: torch.Tensor, top_k: torch.Tensor):
-        if top_k.squeeze().item() <= 0:
-            return scores
-        # Remove all tokens with a probability less than the last token of the top-k
-        indices_to_remove = scores < torch.topk(scores, top_k.squeeze().item())[0][..., -1, None]
-        scores[indices_to_remove] = float("-inf")
-        return scores


 print(f"Converting {args.model} to TorchScript...")
-tokenizer = AutoTokenizer.from_pretrained(args.model)
-model = ModelLogits(AutoModelForCausalLM.from_pretrained(args.model, trust_remote_code=True, revision=args.revision))
+tokenizer = AutoTokenizer.from_pretrained(args.model, use_fast=False)
+model = ModelLogits(AutoModelForCausalLM.from_pretrained(args.model,
+                                                         trust_remote_code=True,
+                                                         revision=args.revision,
+                                                         torch_dtype=torch.float16,
+                                                         use_cache=False))

 model.eval()
 model.requires_grad_(False)
-model = model.half().to(device)
+model = model.to(device)


 input = tokenizer("annotator model's hash is 0x", return_tensors="pt").to(device)
 print(f"{model(input.input_ids)=}")

 traced_script_module = torch.jit.trace(model, input.input_ids)
+print("Tracing...")
 print(f"{traced_script_module(input.input_ids)=}")

 print("Scripting generation wrapper...")

 # need to script this as we have data conditional flow
 scripted_generator_model = torch.jit.script(InferModel(traced_script_module, tokenizer.eos_token_id))
 print(scripted_generator_model.code)

 print(f"{input.input_ids=}")
-x = input.input_ids, torch.empty(1, 5), torch.full([1, 1], 1.0).cuda(), torch.full([1, 1], len(tokenizer) // 2).cuda(), torch.full([1, 1], 0.9).cuda()
-# x = input.input_ids, torch.empty(1, 5), torch.full([1, 1], 1.0), torch.full([1, 1], len(tokenizer) // 2), torch.full([1, 1], 0.9)
-# print(f"{(scripted_generator_model(*x))=}")
+# x = input.input_ids, torch.empty(1, 5), torch.full([1, 1], 1.0).cuda(), torch.full([1, 1], len(tokenizer) // 2).cuda(), torch.full([1, 1], 0.9).cuda()
+x = input.input_ids, torch.empty(1, 5), torch.full([1, 1], 0.9).cuda()
+print(x[0].shape)

 print(f"{tokenizer.decode(scripted_generator_model(*x)[0][0])=}")

 sanitized_name = args.model.replace("/", "--")
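The rewritten `InferModel.forward` drops the per-request top-k/top-p arguments and, as its new comment says, mostly follows Hugging Face `generate`: a 0/1 `unfinished_sequences` mask tracks which rows are done, finished rows keep emitting EOS, and the loop can stop once every row is finished. A stripped-down, eager-mode sketch of that masking idea (sampling one token per row; the EOS id and tensor sizes are made up for illustration, not taken from the commit):

```python
import torch

eos_token_id = 0                       # made-up EOS id for this illustration
batch_size, vocab_size = 4, 50400      # arbitrary sizes
next_token_logits = torch.randn(batch_size, vocab_size)

# 1 while a row is still generating, 0 once it has produced EOS
unfinished_sequences = torch.ones(batch_size, dtype=torch.long)

# sample one token per row from the softmaxed logits
next_tokens = torch.multinomial(torch.softmax(next_token_logits, dim=-1), 1).squeeze(1)

# rows that already finished keep emitting EOS instead of fresh samples
next_tokens = next_tokens * unfinished_sequences + eos_token_id * (1 - unfinished_sequences)

# flip a row to "finished" the first time it samples EOS
unfinished_sequences = unfinished_sequences.mul(next_tokens.ne(eos_token_id).long())

# once every row is finished, the generation loop can return early
if unfinished_sequences.max() == 0:
    print("all sequences hit EOS")
```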
@@ -1,5 +1,4 @@
 transformers
 triton
-triton-client
 einops
 pandas
gpt4all-api/triton/test_data.json (new file, 34 lines)
@@ -0,0 +1,34 @@
+{
+    "data":
+    [
+        {
+            "input_ids": {
+                "content": [17250, 11, 703, 389, 345, 30],
+                "shape": [6]
+            },
+            "tensor_of_seq_len": {
+                "content": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                "shape": [17]
+            },
+            "temperature": {
+                "content": [1.0],
+                "shape": [1]
+            }
+        },
+        {
+            "input_ids": {
+                "content": [17250, 11, 703, 389, 345, 30],
+                "shape": [6]
+            },
+            "tensor_of_seq_len": {
+                "content": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                "shape": [17]
+            },
+            "temperature": {
+                "content": [1.0],
+                "shape": [1]
+            }
+        }
+
+    ]
+}
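test_data.json is the --input-data file consumed by the perf_analyzer command in the README hunk above: one JSON object per request, with each input given as flat content plus a shape. A small sketch for regenerating such a file from a tokenizer (the model name and prompt are illustrative assumptions):

```python
import json

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("nomic-ai/gpt4all-j", use_fast=False)  # assumed model


def make_entry(prompt, new_tokens, temperature):
    ids = tokenizer.encode(prompt)
    return {
        "input_ids": {"content": ids, "shape": [len(ids)]},
        # the converted model only uses this tensor's length to decide
        # how many generation steps to run
        "tensor_of_seq_len": {"content": [1] * new_tokens, "shape": [new_tokens]},
        "temperature": {"content": [temperature], "shape": [1]},
    }


entries = [make_entry("Hi, how are you?", 16, 1.0) for _ in range(2)]
with open("test_data.json", "w") as f:
    json.dump({"data": entries}, f, indent=2)
```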
@@ -3,7 +3,8 @@ backend: "pytorch"
 default_model_filename: "traced-model.pt"
 max_batch_size: ${max_batch_size}

-dynamic_batching { }
+dynamic_batching {
+}

 parameters {
   key: "model_name"
@@ -35,16 +36,6 @@ input [
     name: "temperature"
     data_type: TYPE_FP32
     dims: [-1]
-  },
-  {
-    name: "top_k"
-    data_type: TYPE_INT32
-    dims: [-1]
-  },
-  {
-    name: "top_p"
-    data_type: TYPE_FP32
-    dims: [-1]
   }
 ]

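Once the model is reloaded with the updated config.pbtxt, it is worth confirming that dynamic batching is actually on and that top_k/top_p are gone from the declared inputs. A quick check using the tritonclient gRPC client (the endpoint and model name are assumptions matching the perf_analyzer example above):

```python
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient("localhost:8001")          # assumed endpoint
config = client.get_model_config("nomic-ai--gpt4all-j").config       # assumed model name

print("max_batch_size:", config.max_batch_size)
print("dynamic batching:", config.HasField("dynamic_batching"))
print("inputs:", [inp.name for inp in config.input])
```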