From 7b855929e401a18ddbd8d1133e333ae052dc0761 Mon Sep 17 00:00:00 2001
From: MekkCyber
Date: Mon, 18 Nov 2024 12:39:33 +0000
Subject: [PATCH 1/3] first fix

---
 src/accelerate/utils/modeling.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/accelerate/utils/modeling.py b/src/accelerate/utils/modeling.py
index f4230c55994..6e75c6e7256 100644
--- a/src/accelerate/utils/modeling.py
+++ b/src/accelerate/utils/modeling.py
@@ -1066,7 +1066,12 @@ def get_balanced_memory(
     )
     # The last device is left with max_memory just in case the buffer is not enough.
     for idx in gpus_idx_list[:-1]:
-        max_memory[idx] = min(max_memory[0] if low_zero and idx == 0 else per_gpu, max_memory[idx])
+        if idx == 0 and not low_zero and module_sizes['model.embed_tokens'] > per_gpu * 0.9:
+            max_memory[idx] = min(module_sizes['model.embed_tokens'] * 1.3, max_memory[idx])
+        elif idx == 1 and low_zero and module_sizes['model.embed_tokens'] > per_gpu * 0.9 :
+            max_memory[idx] = min(module_sizes['model.embed_tokens'] * 1.3, max_memory[idx])
+        else :
+            max_memory[idx] = min(max_memory[0] if low_zero and idx == 0 else per_gpu, max_memory[idx])

     if low_zero:
         min_zero = max(0, module_sizes[""] - sum([max_memory[i] for i in range(1, num_devices)]))

From c5eb33c209e6c618d01626967af41826804369da Mon Sep 17 00:00:00 2001
From: MekkCyber
Date: Mon, 18 Nov 2024 13:03:13 +0000
Subject: [PATCH 2/3] fix style

---
 src/accelerate/utils/modeling.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/accelerate/utils/modeling.py b/src/accelerate/utils/modeling.py
index 6e75c6e7256..62b209157b7 100644
--- a/src/accelerate/utils/modeling.py
+++ b/src/accelerate/utils/modeling.py
@@ -1066,11 +1066,11 @@ def get_balanced_memory(
     )
     # The last device is left with max_memory just in case the buffer is not enough.
     for idx in gpus_idx_list[:-1]:
-        if idx == 0 and not low_zero and module_sizes['model.embed_tokens'] > per_gpu * 0.9:
-            max_memory[idx] = min(module_sizes['model.embed_tokens'] * 1.3, max_memory[idx])
-        elif idx == 1 and low_zero and module_sizes['model.embed_tokens'] > per_gpu * 0.9 :
-            max_memory[idx] = min(module_sizes['model.embed_tokens'] * 1.3, max_memory[idx])
-        else :
+        if idx == 0 and not low_zero and module_sizes["model.embed_tokens"] > per_gpu * 0.9:
+            max_memory[idx] = min(module_sizes["model.embed_tokens"] * 1.3, max_memory[idx])
+        elif idx == 1 and low_zero and module_sizes["model.embed_tokens"] > per_gpu * 0.9:
+            max_memory[idx] = min(module_sizes["model.embed_tokens"] * 1.3, max_memory[idx])
+        else:
             max_memory[idx] = min(max_memory[0] if low_zero and idx == 0 else per_gpu, max_memory[idx])

     if low_zero:

From a7ea07c0bec1a59060ad63abcd62f3be7b50378c Mon Sep 17 00:00:00 2001
From: MekkCyber
Date: Tue, 19 Nov 2024 15:09:38 +0000
Subject: [PATCH 3/3] max leave size instead of embed layer

---
 src/accelerate/utils/modeling.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/accelerate/utils/modeling.py b/src/accelerate/utils/modeling.py
index 62b209157b7..2b1be34eb7f 100644
--- a/src/accelerate/utils/modeling.py
+++ b/src/accelerate/utils/modeling.py
@@ -1065,11 +1065,12 @@ def get_balanced_memory(
         )
     )
     # The last device is left with max_memory just in case the buffer is not enough.
+    max_leave_size = max([module_sizes[leave] for leave in leaves])
     for idx in gpus_idx_list[:-1]:
-        if idx == 0 and not low_zero and module_sizes["model.embed_tokens"] > per_gpu * 0.9:
-            max_memory[idx] = min(module_sizes["model.embed_tokens"] * 1.3, max_memory[idx])
-        elif idx == 1 and low_zero and module_sizes["model.embed_tokens"] > per_gpu * 0.9:
-            max_memory[idx] = min(module_sizes["model.embed_tokens"] * 1.3, max_memory[idx])
+        if idx == 0 and not low_zero and max_leave_size > per_gpu * 0.9:
+            max_memory[idx] = min(max_leave_size * 1.3, max_memory[idx])
+        elif idx == 1 and low_zero and max_leave_size > per_gpu * 0.9:
+            max_memory[idx] = min(max_leave_size * 1.3, max_memory[idx])
         else:
             max_memory[idx] = min(max_memory[0] if low_zero and idx == 0 else per_gpu, max_memory[idx])

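
The final revision keys the extra headroom on the largest leaf module instead of a hard-coded "model.embed_tokens" entry, so the first weight-bearing GPU is given room whenever any single leaf is close to the balanced per-GPU share. The standalone sketch below reproduces that capping logic outside of accelerate: the module_sizes, max_memory, per_gpu, and low_zero values are invented for illustration, and the two patched branches are collapsed into a single first_weight_device flag for brevity; inside get_balanced_memory these values are computed, not hard-coded.

# Standalone sketch of the capping logic from PATCH 3/3 (illustrative values only).
module_sizes = {"model.embed_tokens": 2000, "model.layers.0": 500, "model.layers.1": 500}
leaves = list(module_sizes)               # stand-in for get_module_leaves(module_sizes)
max_memory = {0: 3000, 1: 3000, 2: 3000}  # per-device budgets, arbitrary units
gpus_idx_list = sorted(max_memory)
per_gpu = 1500                            # balanced share computed earlier in the function
low_zero = False                          # True when device 0 is kept nearly empty

# Largest single leaf; if it barely fits the balanced share, allow 1.3x its size
# on the first device that holds weights instead of capping that device at per_gpu.
max_leave_size = max(module_sizes[leave] for leave in leaves)

for idx in gpus_idx_list[:-1]:            # the last device keeps its full max_memory as a buffer
    first_weight_device = (idx == 0 and not low_zero) or (idx == 1 and low_zero)
    if first_weight_device and max_leave_size > per_gpu * 0.9:
        max_memory[idx] = min(max_leave_size * 1.3, max_memory[idx])
    else:
        max_memory[idx] = min(max_memory[0] if low_zero and idx == 0 else per_gpu, max_memory[idx])

print(max_memory)  # {0: 2600.0, 1: 1500, 2: 3000}

With these toy numbers the 2000-unit embedding leaf exceeds 90% of the 1500-unit balanced share, so device 0 is allowed up to 2600 units (1.3 times the leaf) rather than being capped at per_gpu, device 1 stays at the balanced share, and the last device keeps its full budget as a buffer.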