Skip to content

Commit

Permalink
added average power usage
Browse files Browse the repository at this point in the history
  • Loading branch information
parthraut committed Oct 30, 2024
1 parent 6e632d2 commit 8f835b1
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 6 deletions.
35 changes: 29 additions & 6 deletions zeus/device/gpu/amd.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,9 @@ def __init__(self, gpu_index: int) -> None:
super().__init__(gpu_index)
self._get_handle()

# This value is updated in AMDGPUs constructor
self._supportsGetTotalEnergyConsumption = False
# These values are updated in AMDGPUs constructor
self._supportsGetTotalEnergyConsumption = True
self._supportsInstantPowerUsage = True

_exception_map = {
1: gpu_common.ZeusGPUInvalidArgError, # amdsmi.amdsmi_wrapper.AMDSMI_STATUS_INVAL
Expand Down Expand Up @@ -239,12 +240,25 @@ def resetGpuLockedClocks(self, _block: bool = True) -> None:
clk_type=amdsmi.AmdSmiClkType.GFX,
) # expects MHz

@_handle_amdsmi_errors
def getAveragePowerUsage(self) -> int:
    """Return the average power draw of the GPU. Units: mW.

    Raises:
        ZeusGPUNotSupportedError: If amdsmi reports the average socket
            power as the placeholder string "N/A" instead of a number.
    """
    watts = amdsmi.amdsmi_get_power_info(self.handle)["average_socket_power"]
    # amdsmi may report "N/A" for a field the device does not expose; surface
    # that as a not-supported error rather than an opaque ValueError from int().
    if watts == "N/A":
        raise gpu_common.ZeusGPUNotSupportedError(
            "Average power usage is not supported on this AMD GPU."
        )
    # amdsmi returns W; convert to mW.
    return int(watts) * 1000

@_handle_amdsmi_errors
def getInstantPowerUsage(self) -> int:
"""Return the current power draw of the GPU. Units: mW."""
# Refuse early with the project's not-supported error when
# `_supportsInstantPowerUsage` has been set to False for this GPU.
if self._supportsInstantPowerUsage is False:
raise gpu_common.ZeusGPUNotSupportedError(
"Instant power usage is not supported on this AMD GPU."
)
# returns in W, convert to mW
# NOTE(review): the two adjacent `amdsmi_get_power_info` lines below look like
# the removed ("average_socket_power") and added ("current_socket_power") sides
# of one diff hunk; presumably only the second belongs in the method — confirm
# against the repository.
return int(
int(amdsmi.amdsmi_get_power_info(self.handle)["average_socket_power"])
int(amdsmi.amdsmi_get_power_info(self.handle)["current_socket_power"])
* 1000
)

Expand Down Expand Up @@ -327,13 +341,22 @@ def _init_gpus(self) -> None:
else:
visible_indices = list(range(len(amdsmi.amdsmi_get_processor_handles())))

# create a threadpool with the number of visible GPUs
# create one AMDGPU object per visible GPU
self._gpus = [AMDGPU(gpu_num) for gpu_num in visible_indices]

# set _supportsInstantPowerUsage for all GPUs
for gpu in self._gpus:
if gpu.getInstantPowerUsage() == "N/A":
gpu._supportsInstantPowerUsage = False

# set _supportsGetTotalEnergyConsumption for all GPUs
wait_time = 0.5 # seconds

powers = [gpu.getInstantPowerUsage() for gpu in self._gpus]
powers = [
gpu.getInstantPowerUsage()
if gpu._supportsInstantPowerUsage
else gpu.getAveragePowerUsage()
for gpu in self._gpus
]
initial_energies = [gpu.getTotalEnergyConsumption() for gpu in self._gpus]
time.sleep(wait_time)
final_energies = [gpu.getTotalEnergyConsumption() for gpu in self._gpus]
Expand Down
5 changes: 5 additions & 0 deletions zeus/device/gpu/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,11 @@ def resetGpuLockedClocks(self, _block: bool = True) -> None:
"""Reset the locked GPU clocks to the default."""
pass

@abc.abstractmethod
def getAveragePowerUsage(self) -> int:
    """Return the GPU's average power usage, in mW.

    Abstract hook: the body here is intentionally empty.
    """

@abc.abstractmethod
def getInstantPowerUsage(self) -> int:
"""Return the current power draw of the GPU. Units: mW."""
Expand Down
10 changes: 10 additions & 0 deletions zeus/device/gpu/nvidia.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,16 @@ def resetGpuLockedClocks(self, _block: bool = True) -> None:
"""Reset the locked GPU clocks to the default."""
pynvml.nvmlDeviceResetGpuLockedClocks(self.handle)

@_handle_nvml_errors
def getAveragePowerUsage(self) -> int:
    """Return the average power draw of the GPU. Units: mW.

    The value is read from NVML's `NVML_FI_DEV_POWER_AVERAGE` field.

    Raises:
        pynvml.NVMLError: If the field query reports a status other than
            `NVML_SUCCESS`.
    """
    # Only one field ID is requested, so the first entry of the result is
    # the one we asked for.
    metric = pynvml.nvmlDeviceGetFieldValues(
        self.handle, [pynvml.NVML_FI_DEV_POWER_AVERAGE]
    )[0]
    # The field value carries its own status code; raise if it is not success.
    if (ret := metric.nvmlReturn) != pynvml.NVML_SUCCESS:
        raise pynvml.NVMLError(ret)
    return metric.value.uiVal

@_handle_nvml_errors
def getInstantPowerUsage(self) -> int:
"""Return the current power draw of the GPU. Units: mW."""
Expand Down

0 comments on commit 8f835b1

Please sign in to comment.