[hotfix] fix memory leak in zero (#781)

2025-09-03 10:06:44 +00:00 · 2022-04-18 13:57:03 +08:00
parent 4b01da24cd
commit 4c4388c46e
6 changed files with 32 additions and 36 deletions
--- a/tests/test_zero/test_stateful_tensor_mgr.py
+++ b/tests/test_zero/test_stateful_tensor_mgr.py
@@ -72,23 +72,13 @@ def run_stm():

    # warmup done
    # only 2 params can be on CUDA
-    limit_cuda_memory(0.26)
+    limit_cuda_memory(0.26 / tensor_placement_policy._steady_cuda_cap_ratio)
    # use OPT-like eviction strategy
    apply_adjust(model, model.p0, [model.p0, model.p1], stateful_tensor_mgr)
-    mem_collector.sample_model_data()
-    mem_collector.sample_overall_data()
    apply_adjust(model, model.p1, [model.p0, model.p1], stateful_tensor_mgr)
-    mem_collector.sample_model_data()
-    mem_collector.sample_overall_data()
    apply_adjust(model, model.p2, [model.p0, model.p2], stateful_tensor_mgr)
-    mem_collector.sample_model_data()
-    mem_collector.sample_overall_data()
    apply_adjust(model, model.p0, [model.p0, model.p2], stateful_tensor_mgr)
-    mem_collector.sample_model_data()
-    mem_collector.sample_overall_data()
    apply_adjust(model, model.p1, [model.p1, model.p2], stateful_tensor_mgr)
-    mem_collector.sample_model_data()
-    mem_collector.finish_collection()


 def apply_adjust(model: torch.nn.Module, compute_param: Parameter, cuda_param_after_adjust: List[Parameter],