mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-06-02 20:35:29 +00:00
[bug] workaround for idx fix
This commit is contained in:
parent
e0dde8fda5
commit
936dd96dbb
@ -57,6 +57,7 @@ class GeminiManager:
|
|||||||
self._comp_cuda_demand_time = 0
|
self._comp_cuda_demand_time = 0
|
||||||
|
|
||||||
def reset_attributes(self):
|
def reset_attributes(self):
|
||||||
|
assert self._compute_idx + 1 == len(self._compute_list)
|
||||||
self._compute_idx = -1
|
self._compute_idx = -1
|
||||||
self._h2d_volume = 0
|
self._h2d_volume = 0
|
||||||
self._d2h_volume = 0
|
self._d2h_volume = 0
|
||||||
|
@ -145,6 +145,8 @@ class AutoPlacementPolicy(PlacementPolicy):
|
|||||||
self._warmup_non_model_data_ratio = warmup_non_model_data_ratio
|
self._warmup_non_model_data_ratio = warmup_non_model_data_ratio
|
||||||
self._steady_cuda_cap_ratio = steady_cuda_cap_ratio
|
self._steady_cuda_cap_ratio = steady_cuda_cap_ratio
|
||||||
|
|
||||||
|
self.__avail_cuda_model_data_for_prefetch = None
|
||||||
|
|
||||||
def evict_tensors(
|
def evict_tensors(
|
||||||
self,
|
self,
|
||||||
can_evict_chunks: List[Chunk],
|
can_evict_chunks: List[Chunk],
|
||||||
@ -204,6 +206,7 @@ class AutoPlacementPolicy(PlacementPolicy):
|
|||||||
f"Adjust layout failed! No enough CUDA memory! "
|
f"Adjust layout failed! No enough CUDA memory! "
|
||||||
f"Need {to_free_cuda_model_data}, freed {freed_cuda_model_data}"
|
f"Need {to_free_cuda_model_data}, freed {freed_cuda_model_data}"
|
||||||
)
|
)
|
||||||
|
self.__avail_cuda_model_data_for_prefetch = avail_cuda_model_data - freed_cuda_model_data
|
||||||
return freed_cuda_model_data, time() - start
|
return freed_cuda_model_data, time() - start
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -234,14 +237,9 @@ class AutoPlacementPolicy(PlacementPolicy):
|
|||||||
) -> List[Chunk]:
|
) -> List[Chunk]:
|
||||||
if is_warmup: # no prefetch during warmup since we need compute_list
|
if is_warmup: # no prefetch during warmup since we need compute_list
|
||||||
return []
|
return []
|
||||||
# modified from self.evict_tensors
|
|
||||||
cuda_capacity = self._steady_cuda_cap_ratio * colo_device_memory_capacity(
|
avail_cuda_model_data = self.__avail_cuda_model_data_for_prefetch
|
||||||
get_accelerator().get_current_device()
|
self.__avail_cuda_model_data_for_prefetch = None # in case of double use
|
||||||
)
|
|
||||||
max_cuda_non_model_data_per_period = self.mem_stats_collector.next_period_non_model_data_usage("cuda")
|
|
||||||
used_cuda_model_data = self.chunk_manager.total_mem["cuda"]
|
|
||||||
total_cuda_model_data = cuda_capacity - max_cuda_non_model_data_per_period
|
|
||||||
avail_cuda_model_data = total_cuda_model_data - used_cuda_model_data
|
|
||||||
|
|
||||||
prefetch_chunk_memory = 0
|
prefetch_chunk_memory = 0
|
||||||
can_prefetch = self.max_prefetch - len(async_works)
|
can_prefetch = self.max_prefetch - len(async_works)
|
||||||
|
Loading…
Reference in New Issue
Block a user