From 02bb6442b374e0d2557066b62da744463d9e5d59 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Fri, 8 Sep 2023 01:58:15 -0400
Subject: [PATCH] wip tp

---
 src/axolotl/utils/bench.py  | 10 ++++++++-
 src/axolotl/utils/models.py | 44 +++++++++++++++++++++++--------------
 2 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/src/axolotl/utils/bench.py b/src/axolotl/utils/bench.py
index d19e81ecdc..8f192c2cd5 100644
--- a/src/axolotl/utils/bench.py
+++ b/src/axolotl/utils/bench.py
@@ -1,8 +1,12 @@
 """Benchmarking and measurement utilities"""
+import logging
+
 import pynvml
 import torch
 
+LOG = logging.getLogger("axolotl.utils.bench")
+
 
 def gpu_memory_usage(device=0):
     return torch.cuda.memory_allocated(device) / 1024.0**3
 
@@ -31,7 +35,11 @@ def log_gpu_memory_usage(log, msg, device):
     if not torch.cuda.is_available():
         return (0, 0, 0)
 
-    usage, cache, misc = gpu_memory_usage_all(device)
+    try:
+        usage, cache, misc = gpu_memory_usage_all(device)
+    except ValueError as exc:
+        LOG.exception(exc)
+        return (0, 0, 0)
     extras = []
     if cache > 0:
         extras.append(f"+{cache:.03f}GB cache")
diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index 428985e12e..b9ab121497 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -6,7 +6,6 @@
 import os
 from typing import Optional, Tuple  # noqa: F401
 
-import accelerate
 import bitsandbytes as bnb
 import tensor_parallel as tp
 import torch
@@ -252,14 +251,21 @@ def load_model(
             base_model,
             trust_remote_code=cfg.trust_remote_code or False,
         )
-        with accelerate.init_empty_weights():
-            model = AutoModelForCausalLM.from_config(
-                config=config,
-                trust_remote_code=cfg.trust_remote_code or False,
-            ).half()
-        model = tp.TensorParallelPreTrainedModel(
-            model,
-        )
+        # with accelerate.init_empty_weights():
+        #     model = AutoModelForCausalLM.from_config(
+        #         config=config,
+        #         trust_remote_code=cfg.trust_remote_code or False,
+        #     ).half()
+        # model = tp.TensorParallelPreTrainedModel(
+        #     model,
+        #     sharded=False,
+        # )
+        model = AutoModelForCausalLM.from_pretrained(
+            base_model,
+            config=config,
+            trust_remote_code=cfg.trust_remote_code or False,
+        ).half()
+        model = tp.tensor_parallel(model, sharded=False)
     else:
         config = AutoConfig.from_pretrained(
             base_model,
@@ -306,12 +312,15 @@ def load_model(
             **model_kwargs,
         )
 
-    embeddings_len = (
-        math.ceil(len(tokenizer) / 32) * 32
-        if cfg.resize_token_embeddings_to_32x
-        else len(tokenizer)
-    )
-    model.resize_token_embeddings(embeddings_len)
+    try:
+        embeddings_len = (
+            math.ceil(len(tokenizer) / 32) * 32
+            if cfg.resize_token_embeddings_to_32x
+            else len(tokenizer)
+        )
+        model.resize_token_embeddings(embeddings_len)
+    except NotImplementedError:
+        LOG.warning("`resize_token_embeddings` not implemented on model")
 
     if (
         hasattr(model.config, "max_position_embeddings")
@@ -397,7 +406,10 @@ def load_adapter(model, cfg, adapter, inference=False):
     if adapter is None:
         return model, None
     if hasattr(model, "enable_input_require_grads"):
-        model.enable_input_require_grads()
+        try:
+            model.enable_input_require_grads()
+        except NotImplementedError:
+            LOG.warning("enable_input_require_grads not implemented on model")
     if adapter == "qlora" and cfg.tensor_parallel:
        return load_tp_qlora(model)
    if adapter in ["lora", "qlora"]:
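
For context, a minimal end-to-end sketch of the loading path this patch
switches to: real weights are loaded directly via from_pretrained (the
meta-device init through accelerate is dropped), then tensor_parallel splits
the computation across GPUs. This assumes the `tensor_parallel` package and
at least two visible CUDA devices (the library defaults to all available
devices); `base_model` here is a hypothetical checkpoint name used only for
illustration, not taken from the axolotl config.

    import tensor_parallel as tp
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    base_model = "huggyllama/llama-7b"  # hypothetical example checkpoint

    # Load real (non-meta) weights, cast to fp16 as the patch does.
    model = AutoModelForCausalLM.from_pretrained(base_model).half()

    # sharded=False keeps full replicas of any weights that are not
    # tensor-split, rather than deduplicating them ZeRO-3 style.
    model = tp.tensor_parallel(model, sharded=False)

    tokenizer = AutoTokenizer.from_pretrained(base_model)
    inputs = tokenizer("Hello", return_tensors="pt").to("cuda:0")
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=8)
    print(tokenizer.decode(out[0]))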