From d35fe802452135b7ca9745041dc3743bc90a9a06 Mon Sep 17 00:00:00 2001
From: Isi
Date: Tue, 1 Oct 2024 20:40:08 +0100
Subject: [PATCH] Modified inference_unianimate_entrance.py to reduce VRAM
 requirement

---
 .../inferences/inference_unianimate_entrance.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/tools/inferences/inference_unianimate_entrance.py b/tools/inferences/inference_unianimate_entrance.py
index 4afd6e5..f3be0ef 100644
--- a/tools/inferences/inference_unianimate_entrance.py
+++ b/tools/inferences/inference_unianimate_entrance.py
@@ -238,13 +238,13 @@ def worker(gpu, seed, steps, useFirstFrame, reference_image, ref_pose, pose_sequ
 
     # [Data] Data Transform
     train_trans = data.Compose([
-        data.Resize(cfg.resolution),
+        data.Resize(resolution),
         data.ToTensor(),
         data.Normalize(mean=cfg.mean, std=cfg.std)
         ])
 
     train_trans_pose = data.Compose([
-        data.Resize(cfg.resolution),
+        data.Resize(resolution),
         data.ToTensor(),
         ]
         )
@@ -403,7 +403,7 @@ def worker(gpu, seed, steps, useFirstFrame, reference_image, ref_pose, pose_sequ
 
         # logging.info(f"Current seed {cur_seed} ...")
         print(f"Number of frames to denoise: {frames_num}")
-        noise = torch.randn([1, 4, frames_num, int(cfg.resolution[1]/cfg.scale), int(cfg.resolution[0]/cfg.scale)])
+        noise = torch.randn([1, 4, frames_num, int(resolution[1]/cfg.scale), int(resolution[0]/cfg.scale)])
         noise = noise.to(gpu)
         # print(f"noise: {noise.shape}")
 
@@ -477,11 +477,14 @@ def worker(gpu, seed, steps, useFirstFrame, reference_image, ref_pose, pose_sequ
 
         if hasattr(cfg, "CPU_CLIP_VAE") and cfg.CPU_CLIP_VAE:
             clip_encoder.cpu() # add this line
+            del clip_encoder # Delete this object to free memory
             autoencoder.cpu() # add this line
             torch.cuda.empty_cache() # add this line
+            import gc
+            gc.collect()
 
 
         # print(f' noise_one is ({noise_one})')
-
+        print(f"noise: {noise.shape}")
         video_data = diffusion.ddim_sample_loop(
             noise=noise_one,
@@ -495,8 +498,12 @@ def worker(gpu, seed, steps, useFirstFrame, reference_image, ref_pose, pose_sequ
 
         if hasattr(cfg, "CPU_CLIP_VAE") and cfg.CPU_CLIP_VAE:
            # if run forward of autoencoder or clip_encoder second times, load them again
-            clip_encoder.cuda()
+            # clip_encoder.cuda()
+            del diffusion
+            torch.cuda.empty_cache()
+            gc.collect()
             autoencoder.cuda()
+
         video_data = 1. / cfg.scale_factor * video_data
         video_data = rearrange(video_data, 'b c f h w -> (b f) c h w')
         chunk_size = min(cfg.decoder_bs, video_data.shape[0])
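
Note on the pattern: the patch frees VRAM around the sampling loop by
moving the CLIP encoder and VAE to the CPU, deleting Python references
that are no longer needed, and then asking the allocator to release its
cached blocks. Below is a minimal standalone sketch of that sequence;
the helper offload_and_release and the nn.Linear stand-in are
illustrative assumptions, not names from the UniAnimate code.

    import gc
    import torch
    import torch.nn as nn

    def offload_and_release(*modules):
        """Move modules to host memory and reclaim the VRAM they held."""
        for m in modules:
            m.cpu()                   # parameters are moved off the GPU in place
        gc.collect()                  # free unreachable objects (incl. reference cycles)
        torch.cuda.empty_cache()      # return now-unused cached blocks to the driver

    if torch.cuda.is_available():
        model = nn.Linear(4096, 4096).cuda()  # stand-in for clip_encoder / autoencoder
        offload_and_release(model)
        # ... run the VRAM-heavy stage (the DDIM sampling loop above) ...
        model.cuda()                  # move back only if a second forward pass is needed

The ordering matters: torch.cuda.empty_cache() can only release memory
whose tensors have already been freed, which is why the patch pairs it
with del (dropping the last live reference) and gc.collect().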