[pipeline] rewrite t5 tests & support multi-tensor transmission in pipeline (#4388)

* fix remaining t5 bugs/rewrite t5 tests

* fix multi-tensor communication in pipeline

* rearrange test_config

* fix KeyError in sync_shared_params

* fix get_held_layers & Randomizer, complete t5 tests

* erase printing

* fix get_held_layers through modifying _release_unheld_layers

* fix _get_recursive_held_layers bug
This commit is contained in:
Baizhou Zhang
2023-08-08 17:46:44 +08:00
committed by Hongxin Liu
parent 906426cb44
commit ed4c448488
11 changed files with 196 additions and 246 deletions

View File

@@ -3,6 +3,7 @@
import io
import pickle
import re
from typing import Any, List, Optional, Union
import torch
@@ -31,7 +32,10 @@ def _cuda_safe_tensor_to_object(tensor: torch.Tensor, tensor_size: torch.Size) -
if b'cuda' in buf:
buf_array = bytearray(buf)
device_index = torch.cuda.current_device()
buf_array[buf_array.find(b'cuda') + 5] = 48 + device_index
# There might be more than one output tensor during forward
for cuda_str in re.finditer(b'cuda', buf_array):
pos = cuda_str.start()
buf_array[pos + 5] = 48 + device_index
buf = bytes(buf_array)
io_bytes = io.BytesIO(buf)