Skip to content

Commit

Permalink
feat(exporter): remove metrics if process is gone (#107)
Browse files (browse the repository at this point in the history)
  • Loading branch information
XuehaiPan authored Nov 23, 2023
1 parent 83f90f3 commit 8c8bc18
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 9 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ repos:
- id: debug-statements
- id: double-quote-string-fixer
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.5
rev: v0.1.6
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
Expand Down
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added

-
- Remove metrics in `nvitop-exporter` for processes that are gone by [@XuehaiPan](https://github.com/XuehaiPan) in [#107](https://github.com/XuehaiPan/nvitop/pull/107).

### Changed

Expand Down
37 changes: 35 additions & 2 deletions nvitop-exporter/nvitop_exporter/exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,13 @@ def __init__( # pylint: disable=too-many-statements
self.hostname = hostname or get_ip_address()
self.registry = registry
self.interval = interval
self.alive_pids: dict[Device, set[tuple[int, str]]] = {
device: set() for device in self.devices
}

self.info = Info(
'nvitop',
documentation='NVITOP.',
documentation='NVITOP Prometheus Exporter.',
labelnames=['hostname'],
registry=self.registry,
)
Expand Down Expand Up @@ -503,6 +506,7 @@ def update_host(self) -> None:
(self.host_disk_io_write_data, disk_io_counter.write_bytes / MiB),
):
gauge.labels(hostname=self.hostname, partition=partition).set(value)

for partition in host.disk_partitions(): # type: ignore[attr-defined]
try:
partition_usage = host.disk_usage(partition.mountpoint) # type: ignore[attr-defined]
Expand All @@ -516,7 +520,7 @@ def update_host(self) -> None:
):
gauge.labels(hostname=self.hostname, mountpoint=partition.mountpoint).set(value)

def update_device(self, device: Device) -> None:
def update_device(self, device: Device) -> None: # pylint: disable=too-many-locals
"""Update metrics for a single device."""
index = (
str(device.index) if isinstance(device.index, int) else ':'.join(map(str, device.index))
Expand Down Expand Up @@ -567,11 +571,16 @@ def update_device(self, device: Device) -> None:
link=link,
).set(throughput / 1024.0)

alive_pids = self.alive_pids[device]
previous_alive_pids = alive_pids.copy()
alive_pids.clear()

with GpuProcess.failsafe():
for pid, process in device.processes().items():
with process.oneshot():
username = process.username()
running_time = process.running_time()
alive_pids.add((pid, username))
for gauge, value in (
(
self.process_running_time,
Expand Down Expand Up @@ -606,3 +615,27 @@ def update_device(self, device: Device) -> None:
pid=pid,
username=username,
).set(value)

for pid, username in previous_alive_pids.difference(alive_pids):
for gauge in (
self.process_running_time,
self.process_cpu_percent,
self.process_rss_memory,
self.process_memory_percent,
self.process_gpu_memory,
self.process_gpu_sm_utilization,
self.process_gpu_memory_utilization,
self.process_gpu_encoder_utilization,
self.process_gpu_decoder_utilization,
):
try:
gauge.remove(
self.hostname,
index,
name,
uuid,
pid,
username,
)
except KeyError:
pass
4 changes: 2 additions & 2 deletions nvitop/api/device.py
Original file line number Diff line number Diff line change
Expand Up @@ -2991,7 +2991,7 @@ def _parse_cuda_visible_devices( # pylint: disable=too-many-branches,too-many-s
try:
physical_device_attrs = _get_all_physical_device_attrs()
except libnvml.NVMLError:
return [] # type: ignore[return-value]
return []
gpu_uuids = set(physical_device_attrs)

try:
Expand Down Expand Up @@ -3072,7 +3072,7 @@ def strip_identifier(identifier: str) -> str:

for identifier in map(strip_identifier, cuda_visible_devices.split(',')):
if identifier in presented:
return [] # type: ignore[return-value] # duplicate identifiers found
return [] # duplicate identifiers found

try:
device = from_index_or_uuid(identifier)
Expand Down
4 changes: 2 additions & 2 deletions nvitop/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,9 +309,9 @@ def main() -> int:
return 1

if args.gpu_util_thresh is not None:
Device.GPU_UTILIZATION_THRESHOLDS = tuple(sorted(args.gpu_util_thresh)) # type: ignore[assignment]
Device.GPU_UTILIZATION_THRESHOLDS = tuple(sorted(args.gpu_util_thresh))
if args.mem_util_thresh is not None:
Device.MEMORY_UTILIZATION_THRESHOLDS = tuple(sorted(args.mem_util_thresh)) # type: ignore[assignment]
Device.MEMORY_UTILIZATION_THRESHOLDS = tuple(sorted(args.mem_util_thresh))

if args.only is not None:
indices = set(args.only)
Expand Down
2 changes: 1 addition & 1 deletion nvitop/select.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ def select_devices( # pylint: disable=too-many-branches,too-many-statements,too

if max_count is not None:
if max_count == 0:
return [] # type: ignore[return-value]
return []
assert max_count >= min_count >= 0

free_accounts = set(free_accounts or [])
Expand Down

0 comments on commit 8c8bc18

Please sign in to comment.