Skip to content

Commit

Permalink
Add approx_instant_energy to ZeusMonitor
Browse files Browse the repository at this point in the history
  • Loading branch information
jaywonchung committed Jul 31, 2023
1 parent b2469c9 commit 17c33a2
Showing 1 changed file with 39 additions and 0 deletions.
39 changes: 39 additions & 0 deletions zeus/monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ class ZeusMonitor:
def __init__(
self,
gpu_indices: list[int] | None = None,
approx_instant_energy: bool = False,
monitor_exec: str = "zeus_monitor",
log_file: str | Path | None = None,
) -> None:
Expand All @@ -118,9 +119,19 @@ def __init__(
`CUDA_VISIBLE_DEVICES=2,3` will be interpreted as CUDA device `3`.
`CUDA_VISIBLE_DEVICES`s formatted with comma-separated indices are supported.
(Default: `None`)
approx_instant_energy: When the execution time of a measurement window is
shorter than the NVML energy counter's update period, energy consumption will
be shown as zero. In this case, if `approx_instant_energy` is True, the
window's energy consumption will be approximated by multiplying the current
instantaneous power consumption with the window's execution time. This should
be a better estimate than zero, but it's still an approximation.
(Default: `False`)
monitor_exec: Zeus monitor executable. (Default: `"zeus_monitor"`)
log_file: Path to the log CSV file. If `None`, logging will be disabled.
"""
# Save arguments.
self.approx_instant_energy = approx_instant_energy

# Initialize NVML.
pynvml.nvmlInit()

Expand Down Expand Up @@ -254,6 +265,16 @@ def _is_new_arch(self, gpu: int) -> bool:
>= pynvml.NVML_DEVICE_ARCH_VOLTA
)

def _get_instant_power(self) -> tuple[dict[int, float], float]:
    """Sample the instantaneous power draw of every monitored GPU.

    Each handle in `self.gpu_handles` is queried once through NVML. The raw
    reading is divided by 1000 (NVML documents the value as milliwatts, so
    the result is in watts).

    Returns:
        A pair `(power, elapsed)`: `power` maps each key of
        `self.gpu_handles` to its scaled power reading, and `elapsed` is the
        time in seconds (per `time.monotonic`) spent collecting all readings.
    """
    started_at: float = time.monotonic()

    readings: dict[int, float] = {}
    for gpu_index, handle in self.gpu_handles.items():
        readings[gpu_index] = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0

    # How long the NVML queries themselves took, so callers can account
    # for this overhead.
    elapsed = time.monotonic() - started_at
    return readings, elapsed

def begin_window(self, key: str, sync_cuda: bool = True) -> None:
"""Begin a new measurement window.
Expand Down Expand Up @@ -307,6 +328,16 @@ def end_window(
except KeyError:
raise ValueError(f"Measurement window '{key}' does not exist") from None

# Take instant power consumption measurements.
# This, in theory, introduces extra NVML calls in the critical path
# even if computation time is not so short. However, it is reasonable to
# expect that computation time would be short if the user explicitly
# turned on the `approx_instant_energy` option. Calling this function
# as early as possible will lead to more accurate energy approximation.
power, power_measurement_time = (
self._get_instant_power() if self.approx_instant_energy else ({}, 0.0)
)

# Call cudaSynchronize to make sure we freeze at the right time.
if sync_cuda:
for gpu_index in self.gpu_handles:
Expand Down Expand Up @@ -335,6 +366,14 @@ def end_window(
end_time - self.monitor_start_time,
)

# Approximate zero energy consumption.
if self.approx_instant_energy:
for gpu_index in self.gpu_indices:
if energy_consumption[gpu_index] == 0.0:
energy_consumption[gpu_index] = power[gpu_index] * (
time_consumption - power_measurement_time
)

self._log(f"Measurement window '{key}' ended.")

# Add to log file.
Expand Down

0 comments on commit 17c33a2

Please sign in to comment.