Merge pull request #928 from StanfordVL/fix-profiler
Fix pynvml for profiling VRAM
cgokmen authored Oct 2, 2024
2 parents d3930c4 + 64be4b1 commit 12b4173
Showing 3 changed files with 4,600 additions and 9 deletions.
45 changes: 37 additions & 8 deletions omnigibson/utils/profiling_utils.py
@@ -1,11 +1,37 @@
 import os
 from time import time

-import gym
+import gymnasium as gym
 import psutil
-from pynvml.smi import nvidia_smi

 import omnigibson as og
+import omnigibson.utils.pynvml_utils as pynvml
+
+
+# Method copied from: https://github.com/wandb/wandb/blob/main/wandb/sdk/internal/system/assets/gpu.py
+def gpu_in_use_by_this_process(gpu_handle: "GPUHandle", pid: int) -> bool:
+    if psutil is None:
+        return False
+
+    try:
+        base_process = psutil.Process(pid=pid)
+    except psutil.NoSuchProcess:
+        # do not report any gpu metrics if the base process cant be found
+        return False
+
+    our_processes = base_process.children(recursive=True)
+    our_processes.append(base_process)
+
+    our_pids = {process.pid for process in our_processes}
+
+    compute_pids = {process.pid for process in pynvml.nvmlDeviceGetComputeRunningProcesses(gpu_handle)}  # type: ignore
+    graphics_pids = {
+        process.pid for process in pynvml.nvmlDeviceGetGraphicsRunningProcesses(gpu_handle)  # type: ignore
+    }
+
+    pids_using_device = compute_pids | graphics_pids
+
+    return len(pids_using_device & our_pids) > 0


 class ProfilingEnv(og.Environment):
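
The added helper mirrors wandb's GPU-ownership check: a GPU counts as "in use" if any compute or graphics context on that device belongs to the profiled process or one of its children. Below is a minimal standalone sketch of the same idea, assuming the standard pynvml package (the commit instead uses the vendored omnigibson.utils.pynvml_utils) and a hypothetical process_uses_gpu helper; it reports the same device-wide VRAM figure that the patched step() records.

    # Minimal sketch, not part of the commit: same GPU-ownership idea using the
    # standard pynvml package instead of omnigibson.utils.pynvml_utils.
    import os

    import psutil
    import pynvml


    def process_uses_gpu(handle, pid):
        # Gather the PID of the target process and all of its children.
        base = psutil.Process(pid=pid)
        our_pids = {p.pid for p in base.children(recursive=True)} | {base.pid}
        # Ask NVML which PIDs hold compute or graphics contexts on this device.
        device_pids = {p.pid for p in pynvml.nvmlDeviceGetComputeRunningProcesses(handle)}
        device_pids |= {p.pid for p in pynvml.nvmlDeviceGetGraphicsRunningProcesses(handle)}
        return bool(device_pids & our_pids)


    if __name__ == "__main__":
        pynvml.nvmlInit()
        try:
            for i in range(pynvml.nvmlDeviceGetCount()):
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                if process_uses_gpu(handle, os.getpid()):
                    # Device-wide used VRAM in GB, as measured in the patched step().
                    used_gb = pynvml.nvmlDeviceGetMemoryInfo(handle).used / 1024**3
                    print(f"GPU {i}: {used_gb:.2f} GB in use")
        finally:
            pynvml.nvmlShutdown()
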
@@ -64,15 +90,18 @@ def step(self, action):
             # memory usage in GB
             memory_usage = psutil.Process(os.getpid()).memory_info().rss / 1024**3
             # VRAM usage in GB
-            for gpu in nvidia_smi.getInstance().DeviceQuery()["gpu"]:
+            pynvml.nvmlInit()
+            device_count = pynvml.nvmlDeviceGetCount()
+            for i in range(device_count):
                 found = False
-                for process in gpu["processes"]:
-                    if process["pid"] == os.getpid():
-                        vram_usage = process["used_memory"] / 1024
-                        found = True
-                        break
+                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+                if gpu_in_use_by_this_process(handle, os.getpid()):
+                    vram_usage = pynvml.nvmlDeviceGetMemoryInfo(handle).used / 1024**3
+                    found = True
+                    break
                 if found:
                     break
+            pynvml.nvmlShutdown()

             ret = [total_frame_time, omni_time, og_time, memory_usage, vram_usage]
             if self._current_step % 100 == 0:
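
One trade-off worth noting: the removed nvidia_smi query reported a per-process used_memory figure, whereas nvmlDeviceGetMemoryInfo(handle).used is device-wide, so the profiler now records total VRAM in use on the matched GPU. If a per-process reading were wanted, NVML exposes that too; a hedged sketch, again assuming the standard pynvml package (usedGpuMemory is in bytes and can be None on some drivers):

    # Sketch only, not part of the commit: per-process VRAM via the standard pynvml package.
    import os

    import pynvml

    pynvml.nvmlInit()
    try:
        pid = os.getpid()
        for i in range(pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            for proc in pynvml.nvmlDeviceGetComputeRunningProcesses(handle):
                # usedGpuMemory is reported in bytes and may be None on some platforms.
                if proc.pid == pid and proc.usedGpuMemory is not None:
                    print(f"GPU {i}: {proc.usedGpuMemory / 1024**3:.2f} GB used by this process")
    finally:
        pynvml.nvmlShutdown()
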
