Skip to content

nvidia

zeus.device.gpu.nvidia

NVIDIA GPUs.

NVIDIAGPU

Bases: GPU

Implementation of GPU for NVIDIA GPUs.

Source code in zeus/device/gpu/nvidia.py
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
class NVIDIAGPU(gpu_common.GPU):
    """Implementation of `GPU` for NVIDIA GPUs.

    All queries and setters are issued through NVML (`pynvml`) using the
    device handle obtained at construction time. Setters accept a `block`
    argument for interface compatibility, but this class never reads it:
    every change is applied synchronously
    (see `supports_nonblocking_setters`).
    """

    def __init__(self, gpu_index: int) -> None:
        """Initialize the GPU object.

        Fetches the NVML device handle and probes whether the device is a
        Grace Hopper chip, which changes the power scope used by the power
        usage getters.

        Args:
            gpu_index: Index of the GPU, passed to the base class and used
                to look up the NVML device handle.
        """
        super().__init__(gpu_index)
        self._get_handle()
        # Lazily populated by `supports_get_total_energy_consumption`.
        self._supportsGetTotalEnergyConsumption = None

        # Check if it's a Grace Hopper chip
        try:
            c2c_mode_info = pynvml.nvmlDeviceGetC2cModeInfoV(self.handle)
            self._is_grace_hopper = c2c_mode_info.isC2cEnabled
        except pynvml.NVMLError as e:
            e_value = e.value  # ty: ignore[unresolved-attribute]
            # `NVML_ERROR_NOT_SUPPORTED` is the expected answer on chips
            # without C2C; anything else is worth surfacing to the user.
            if e_value != pynvml.NVML_ERROR_NOT_SUPPORTED:
                logger.warning(
                    "Attempted to check whether the current chip is a Grace Hopper chip "
                    "by calling `nvmlDeviceGetC2cModeInfoV`, which we expected to either "
                    "return a valid response or raise `NVML_ERROR_NOT_SUPPORTED`. "
                    "Instead, it raised an unexpected error: '%s'. Treating this as "
                    "not a Grace Hopper chip.",
                    e,
                )
            self._is_grace_hopper = False

    # Translation table from NVML error codes to Zeus GPU exception classes.
    _exception_map = {
        pynvml.NVML_ERROR_UNINITIALIZED: gpu_common.ZeusGPUInitError,
        pynvml.NVML_ERROR_INVALID_ARGUMENT: gpu_common.ZeusGPUInvalidArgError,
        pynvml.NVML_ERROR_NOT_SUPPORTED: gpu_common.ZeusGPUNotSupportedError,
        pynvml.NVML_ERROR_NO_PERMISSION: gpu_common.ZeusGPUNoPermissionError,
        pynvml.NVML_ERROR_ALREADY_INITIALIZED: gpu_common.ZeusGPUAlreadyInitializedError,
        pynvml.NVML_ERROR_NOT_FOUND: gpu_common.ZeusGPUNotFoundError,
        pynvml.NVML_ERROR_INSUFFICIENT_SIZE: gpu_common.ZeusGPUInsufficientSizeError,
        pynvml.NVML_ERROR_INSUFFICIENT_POWER: gpu_common.ZeusGPUInsufficientPowerError,
        pynvml.NVML_ERROR_DRIVER_NOT_LOADED: gpu_common.ZeusGPUDriverNotLoadedError,
        pynvml.NVML_ERROR_TIMEOUT: gpu_common.ZeusGPUTimeoutError,
        pynvml.NVML_ERROR_IRQ_ISSUE: gpu_common.ZeusGPUIRQError,
        pynvml.NVML_ERROR_LIBRARY_NOT_FOUND: gpu_common.ZeusGPULibraryNotFoundError,
        pynvml.NVML_ERROR_FUNCTION_NOT_FOUND: gpu_common.ZeusGPUFunctionNotFoundError,
        pynvml.NVML_ERROR_CORRUPTED_INFOROM: gpu_common.ZeusGPUCorruptedInfoROMError,
        pynvml.NVML_ERROR_GPU_IS_LOST: gpu_common.ZeusGPULostError,
        pynvml.NVML_ERROR_RESET_REQUIRED: gpu_common.ZeusGPUResetRequiredError,
        pynvml.NVML_ERROR_OPERATING_SYSTEM: gpu_common.ZeusGPUOperatingSystemError,
        pynvml.NVML_ERROR_LIB_RM_VERSION_MISMATCH: gpu_common.ZeusGPULibRMVersionMismatchError,
        pynvml.NVML_ERROR_MEMORY: gpu_common.ZeusGPUMemoryError,
        pynvml.NVML_ERROR_UNKNOWN: gpu_common.ZeusGPUUnknownError,
    }

    @_handle_nvml_errors
    def _get_handle(self):
        """Look up the NVML device handle for `self.gpu_index` and store it in `self.handle`."""
        self.handle = pynvml.nvmlDeviceGetHandleByIndex(self.gpu_index)

    @_handle_nvml_errors
    def get_name(self) -> str:
        """Return the name of the GPU model."""
        return pynvml.nvmlDeviceGetName(self.handle)

    @property
    def supports_nonblocking_setters(self) -> bool:
        """Return True if the GPU object supports non-blocking configuration setters.

        Always False for this class: setters call NVML directly.
        """
        return False

    @_handle_nvml_errors
    def get_power_management_limit_constraints(self) -> tuple[int, int]:
        """Return the minimum and maximum power management limits. Units: mW."""
        min_, max_ = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(self.handle)
        return (min_, max_)

    @_handle_nvml_errors
    def get_power_management_limit(self) -> int:
        """Return the current power management limit. Units: mW."""
        return pynvml.nvmlDeviceGetPowerManagementLimit(self.handle)

    @_handle_nvml_errors
    def set_power_management_limit(self, power_limit_mw: int, block: bool = True) -> None:
        """Set the GPU's power management limit. Unit: mW.

        The NVML setter is skipped when the current limit already equals
        `power_limit_mw`.

        Args:
            power_limit_mw: Desired power limit.
            block: Unused by this implementation.
        """
        current_limit = self.get_power_management_limit()
        if current_limit != power_limit_mw:
            self._warn_sys_admin()
            pynvml.nvmlDeviceSetPowerManagementLimit(self.handle, power_limit_mw)

    @_handle_nvml_errors
    def reset_power_management_limit(self, block: bool = True) -> None:
        """Reset the GPU's power management limit to the default value.

        The NVML setter is skipped when the current limit already equals
        the default limit reported by NVML.

        Args:
            block: Unused by this implementation.
        """
        default_limit = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(self.handle)
        current_limit = self.get_power_management_limit()
        if current_limit != default_limit:
            self._warn_sys_admin()
            pynvml.nvmlDeviceSetPowerManagementLimit(self.handle, default_limit)

    @_handle_nvml_errors
    def set_persistence_mode(self, enabled: bool, block: bool = True) -> None:
        """Set persistence mode.

        The NVML setter is skipped when the GPU is already in the
        requested mode.

        Args:
            enabled: Whether persistence mode should be enabled.
            block: Unused by this implementation.
        """
        current_mode = pynvml.nvmlDeviceGetPersistenceMode(self.handle)
        desired_mode = pynvml.NVML_FEATURE_ENABLED if enabled else pynvml.NVML_FEATURE_DISABLED
        if current_mode != desired_mode:
            self._warn_sys_admin()
            pynvml.nvmlDeviceSetPersistenceMode(self.handle, desired_mode)

    @_handle_nvml_errors
    def get_persistence_mode(self) -> bool:
        """Return whether persistence mode is currently enabled."""
        return pynvml.nvmlDeviceGetPersistenceMode(self.handle) == pynvml.NVML_FEATURE_ENABLED

    @_handle_nvml_errors
    def get_supported_memory_clocks(self) -> list[int]:
        """Return a list of supported memory clock frequencies. Units: MHz."""
        return pynvml.nvmlDeviceGetSupportedMemoryClocks(self.handle)

    @_handle_nvml_errors
    def set_memory_locked_clocks(self, min_clock_mhz: int, max_clock_mhz: int, block: bool = True) -> None:
        """Lock the memory clock to a specified range. Units: MHz.

        Args:
            min_clock_mhz: Lower bound of the locked range.
            max_clock_mhz: Upper bound of the locked range.
            block: Unused by this implementation.
        """
        self._warn_sys_admin()
        pynvml.nvmlDeviceSetMemoryLockedClocks(self.handle, min_clock_mhz, max_clock_mhz)

    @_handle_nvml_errors
    def reset_memory_locked_clocks(self, block: bool = True) -> None:
        """Reset the locked memory clocks to the default.

        Args:
            block: Unused by this implementation.
        """
        self._warn_sys_admin()
        pynvml.nvmlDeviceResetMemoryLockedClocks(self.handle)

    @_handle_nvml_errors
    def get_supported_graphics_clocks(self, memory_clock_mhz: int | None = None) -> list[int]:
        """Return a list of supported graphics clock frequencies. Units: MHz.

        Args:
            memory_clock_mhz: Memory clock frequency to use. Some GPUs have
                different supported graphics clocks depending on the memory clock.
                When `None`, the highest supported memory clock is used,
                because the NVML query needs a concrete memory clock.
        """
        if memory_clock_mhz is None:
            # Forwarding `None` to NVML is not a valid query; pick a memory
            # clock that NVML itself reports as supported.
            memory_clock_mhz = max(pynvml.nvmlDeviceGetSupportedMemoryClocks(self.handle))
        return pynvml.nvmlDeviceGetSupportedGraphicsClocks(self.handle, memory_clock_mhz)

    @_handle_nvml_errors
    def set_gpu_locked_clocks(self, min_clock_mhz: int, max_clock_mhz: int, block: bool = True) -> None:
        """Lock the GPU clock to a specified range. Units: MHz.

        Args:
            min_clock_mhz: Lower bound of the locked range.
            max_clock_mhz: Upper bound of the locked range.
            block: Unused by this implementation.
        """
        self._warn_sys_admin()
        pynvml.nvmlDeviceSetGpuLockedClocks(self.handle, min_clock_mhz, max_clock_mhz)

    @_handle_nvml_errors
    def reset_gpu_locked_clocks(self, block: bool = True) -> None:
        """Reset the locked GPU clocks to the default.

        Args:
            block: Unused by this implementation.
        """
        self._warn_sys_admin()
        pynvml.nvmlDeviceResetGpuLockedClocks(self.handle)

    @_handle_nvml_errors
    def get_average_power_usage(self) -> int:
        """Return the average power draw of the GPU. Units: mW.

        On Grace Hopper chips the module power scope is queried instead of
        the GPU power scope.

        Raises:
            pynvml.NVMLError: If the returned field value carries a
                non-success status (raised inside the `_handle_nvml_errors`
                wrapper).
        """
        if self._is_grace_hopper:
            fields = [(pynvml.NVML_FI_DEV_POWER_AVERAGE, pynvml.NVML_POWER_SCOPE_MODULE)]
        else:
            fields = [(pynvml.NVML_FI_DEV_POWER_AVERAGE, pynvml.NVML_POWER_SCOPE_GPU)]

        metric = pynvml.nvmlDeviceGetFieldValues(self.handle, fields)[0]
        # Each field value carries its own status; check it explicitly.
        if (ret := metric.nvmlReturn) != pynvml.NVML_SUCCESS:
            raise pynvml.NVMLError(ret)
        return metric.value.uiVal

    @_handle_nvml_errors
    def get_instant_power_usage(self) -> int:
        """Return the current power draw of the GPU. Units: mW.

        On Grace Hopper chips the module power scope is queried instead of
        the GPU power scope.

        Raises:
            pynvml.NVMLError: If the returned field value carries a
                non-success status (raised inside the `_handle_nvml_errors`
                wrapper).
        """
        if self._is_grace_hopper:
            fields = [(pynvml.NVML_FI_DEV_POWER_INSTANT, pynvml.NVML_POWER_SCOPE_MODULE)]
        else:
            fields = [(pynvml.NVML_FI_DEV_POWER_INSTANT, pynvml.NVML_POWER_SCOPE_GPU)]

        metric = pynvml.nvmlDeviceGetFieldValues(self.handle, fields)[0]
        # Each field value carries its own status; check it explicitly.
        if (ret := metric.nvmlReturn) != pynvml.NVML_SUCCESS:
            raise pynvml.NVMLError(ret)
        return metric.value.uiVal

    @_handle_nvml_errors
    def get_average_memory_power_usage(self) -> int:
        """Return the average power draw of the GPU's memory. Units: mW.

        A reading of 0 triggers a Python warning and is still returned.

        !!! Warning
            This isn't exactly documented in NVML at the time of writing, but `nvidia-smi`
            makes use of this API.

            Confirmed working on H100 80GB HBM3. Confirmed not working on A40.
        """
        metric = pynvml.nvmlDeviceGetFieldValues(
            self.handle,
            [(pynvml.NVML_FI_DEV_POWER_AVERAGE, pynvml.NVML_POWER_SCOPE_MEMORY)],
        )[0]
        if (ret := metric.nvmlReturn) != pynvml.NVML_SUCCESS:
            raise pynvml.NVMLError(ret)
        power = metric.value.uiVal
        if power == 0:
            warnings.warn(
                "Average memory power returned 0. The current GPU may not be supported.",
                stacklevel=1,
            )
        return power

    @_handle_nvml_errors
    def supports_get_total_energy_consumption(self) -> bool:
        """Check if the GPU supports retrieving total energy consumption.

        The answer is computed on the first call and cached on the instance.
        """
        # Supported on Volta or newer microarchitectures
        if self._supportsGetTotalEnergyConsumption is None:
            self._supportsGetTotalEnergyConsumption = (
                pynvml.nvmlDeviceGetArchitecture(self.handle) >= pynvml.NVML_DEVICE_ARCH_VOLTA
            )

        return self._supportsGetTotalEnergyConsumption

    @_handle_nvml_errors
    def get_total_energy_consumption(self) -> int:
        """Return the total energy consumption of the specified GPU. Units: mJ."""
        return pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle)

    @_handle_nvml_errors
    def get_gpu_temperature(self) -> int:
        """Return the current GPU temperature. Units: Celsius."""
        temperature = pynvml.nvmlDeviceGetTemperatureV(self.handle, pynvml.NVML_TEMPERATURE_GPU)
        return temperature

supports_nonblocking_setters property

supports_nonblocking_setters

Return True if the GPU object supports non-blocking configuration setters.

__init__

__init__(gpu_index)
Source code in zeus/device/gpu/nvidia.py
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def __init__(self, gpu_index: int) -> None:
    """Set up the NVML handle and detect whether this is a Grace Hopper chip.

    Args:
        gpu_index: Index of the GPU, forwarded to the base class.
    """
    super().__init__(gpu_index)
    self._get_handle()
    self._supportsGetTotalEnergyConsumption = None

    # Grace Hopper detection: ask NVML for the C2C mode of this device.
    try:
        c2c_info = pynvml.nvmlDeviceGetC2cModeInfoV(self.handle)
    except pynvml.NVMLError as err:
        unexpected = err.value != pynvml.NVML_ERROR_NOT_SUPPORTED  # ty: ignore[unresolved-attribute]
        if unexpected:
            logger.warning(
                "Attempted to check whether the current chip is a Grace Hopper chip "
                "by calling `nvmlDeviceGetC2cModeInfoV`, which we expected to either "
                "return a valid response or raise `NVML_ERROR_NOT_SUPPORTED`. "
                "Instead, it raised an unexpected error: '%s'. Treating this as "
                "not a Grace Hopper chip.",
                err,
            )
        self._is_grace_hopper = False
    else:
        self._is_grace_hopper = c2c_info.isC2cEnabled

get_name

get_name()

Return the name of the GPU model.

Source code in zeus/device/gpu/nvidia.py
119
120
121
122
@_handle_nvml_errors
def get_name(self) -> str:
    """Query NVML for the model name of this GPU and return it."""
    model_name = pynvml.nvmlDeviceGetName(self.handle)
    return model_name

get_power_management_limit_constraints

get_power_management_limit_constraints()

Return the minimum and maximum power management limits. Units: mW.

Source code in zeus/device/gpu/nvidia.py
129
130
131
132
133
@_handle_nvml_errors
def get_power_management_limit_constraints(self) -> tuple[int, int]:
    """Return the `(minimum, maximum)` power management limits. Units: mW."""
    constraints = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(self.handle)
    lower, upper = constraints
    return (lower, upper)

get_power_management_limit

get_power_management_limit()

Return the current power management limit. Units: mW.

Source code in zeus/device/gpu/nvidia.py
135
136
137
138
@_handle_nvml_errors
def get_power_management_limit(self) -> int:
    """Query NVML for the power management limit currently in effect. Units: mW."""
    limit_mw = pynvml.nvmlDeviceGetPowerManagementLimit(self.handle)
    return limit_mw

set_power_management_limit

set_power_management_limit(power_limit_mw, block=True)

Set the GPU's power management limit. Unit: mW.

Source code in zeus/device/gpu/nvidia.py
140
141
142
143
144
145
146
@_handle_nvml_errors
def set_power_management_limit(self, power_limit_mw: int, block: bool = True) -> None:
    """Apply a new power management limit through NVML. Unit: mW.

    Nothing is done when the current limit already equals `power_limit_mw`.
    The `block` argument is not read by this implementation.
    """
    if self.get_power_management_limit() == power_limit_mw:
        return
    self._warn_sys_admin()
    pynvml.nvmlDeviceSetPowerManagementLimit(self.handle, power_limit_mw)

reset_power_management_limit

reset_power_management_limit(block=True)

Reset the GPU's power management limit to the default value.

Source code in zeus/device/gpu/nvidia.py
148
149
150
151
152
153
154
155
@_handle_nvml_errors
def reset_power_management_limit(self, block: bool = True) -> None:
    """Restore the power management limit to the default reported by NVML.

    Nothing is done when the current limit already equals the default.
    The `block` argument is not read by this implementation.
    """
    factory_limit = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(self.handle)
    if self.get_power_management_limit() == factory_limit:
        return
    self._warn_sys_admin()
    pynvml.nvmlDeviceSetPowerManagementLimit(self.handle, factory_limit)

set_persistence_mode

set_persistence_mode(enabled, block=True)

Set persistence mode.

Source code in zeus/device/gpu/nvidia.py
157
158
159
160
161
162
163
164
@_handle_nvml_errors
def set_persistence_mode(self, enabled: bool, block: bool = True) -> None:
    """Switch persistence mode on or off through NVML.

    Nothing is done when the GPU is already in the requested mode.
    The `block` argument is not read by this implementation.
    """
    active = pynvml.nvmlDeviceGetPersistenceMode(self.handle)
    if enabled:
        target = pynvml.NVML_FEATURE_ENABLED
    else:
        target = pynvml.NVML_FEATURE_DISABLED
    if active == target:
        return
    self._warn_sys_admin()
    pynvml.nvmlDeviceSetPersistenceMode(self.handle, target)

get_persistence_mode

get_persistence_mode()

Return whether persistence mode is currently enabled.

Source code in zeus/device/gpu/nvidia.py
166
167
168
169
@_handle_nvml_errors
def get_persistence_mode(self) -> bool:
    """Report whether NVML says persistence mode is enabled for this GPU."""
    mode = pynvml.nvmlDeviceGetPersistenceMode(self.handle)
    return mode == pynvml.NVML_FEATURE_ENABLED

get_supported_memory_clocks

get_supported_memory_clocks()

Return a list of supported memory clock frequencies. Units: MHz.

Source code in zeus/device/gpu/nvidia.py
171
172
173
174
@_handle_nvml_errors
def get_supported_memory_clocks(self) -> list[int]:
    """Query NVML for the memory clock frequencies this GPU supports. Units: MHz."""
    memory_clocks = pynvml.nvmlDeviceGetSupportedMemoryClocks(self.handle)
    return memory_clocks

set_memory_locked_clocks

set_memory_locked_clocks(min_clock_mhz, max_clock_mhz, block=True)

Lock the memory clock to a specified range. Units: MHz.

Source code in zeus/device/gpu/nvidia.py
176
177
178
179
180
@_handle_nvml_errors
def set_memory_locked_clocks(self, min_clock_mhz: int, max_clock_mhz: int, block: bool = True) -> None:
    """Pin the memory clock to `[min_clock_mhz, max_clock_mhz]`. Units: MHz.

    The `block` argument is not read by this implementation.
    """
    self._warn_sys_admin()
    clock_range = (min_clock_mhz, max_clock_mhz)
    pynvml.nvmlDeviceSetMemoryLockedClocks(self.handle, *clock_range)

reset_memory_locked_clocks

reset_memory_locked_clocks(block=True)

Reset the locked memory clocks to the default.

Source code in zeus/device/gpu/nvidia.py
182
183
184
185
186
@_handle_nvml_errors
def reset_memory_locked_clocks(self, block: bool = True) -> None:
    """Undo any memory clock lock so the defaults apply again.

    The `block` argument is not read by this implementation.
    """
    device = self.handle
    self._warn_sys_admin()
    pynvml.nvmlDeviceResetMemoryLockedClocks(device)

get_supported_graphics_clocks

get_supported_graphics_clocks(memory_clock_mhz=None)

Return a list of supported graphics clock frequencies. Units: MHz.

Parameters:

Name Type Description Default
memory_clock_mhz int | None

Memory clock frequency to use. Some GPUs have different supported graphics clocks depending on the memory clock.

None
Source code in zeus/device/gpu/nvidia.py
188
189
190
191
192
193
194
195
196
197
@_handle_nvml_errors
def get_supported_graphics_clocks(self, memory_clock_mhz: int | None = None) -> list[int]:
    """Return a list of supported graphics clock frequencies. Units: MHz.

    Args:
        memory_clock_mhz: Memory clock frequency to use. Some GPUs have
            different supported graphics clocks depending on the memory clock.
            When `None`, the highest supported memory clock is used,
            because the NVML query needs a concrete memory clock.
    """
    if memory_clock_mhz is None:
        # Forwarding `None` to NVML is not a valid query; pick a memory
        # clock that NVML itself reports as supported.
        memory_clock_mhz = max(pynvml.nvmlDeviceGetSupportedMemoryClocks(self.handle))
    return pynvml.nvmlDeviceGetSupportedGraphicsClocks(self.handle, memory_clock_mhz)

set_gpu_locked_clocks

set_gpu_locked_clocks(min_clock_mhz, max_clock_mhz, block=True)

Lock the GPU clock to a specified range. Units: MHz.

Source code in zeus/device/gpu/nvidia.py
199
200
201
202
203
@_handle_nvml_errors
def set_gpu_locked_clocks(self, min_clock_mhz: int, max_clock_mhz: int, block: bool = True) -> None:
    """Pin the GPU clock to `[min_clock_mhz, max_clock_mhz]`. Units: MHz.

    The `block` argument is not read by this implementation.
    """
    self._warn_sys_admin()
    clock_range = (min_clock_mhz, max_clock_mhz)
    pynvml.nvmlDeviceSetGpuLockedClocks(self.handle, *clock_range)

reset_gpu_locked_clocks

reset_gpu_locked_clocks(block=True)

Reset the locked GPU clocks to the default.

Source code in zeus/device/gpu/nvidia.py
205
206
207
208
209
@_handle_nvml_errors
def reset_gpu_locked_clocks(self, block: bool = True) -> None:
    """Undo any GPU clock lock so the defaults apply again.

    The `block` argument is not read by this implementation.
    """
    device = self.handle
    self._warn_sys_admin()
    pynvml.nvmlDeviceResetGpuLockedClocks(device)

get_average_power_usage

get_average_power_usage()

Return the average power draw of the GPU. Units: mW.

Source code in zeus/device/gpu/nvidia.py
211
212
213
214
215
216
217
218
219
220
221
222
@_handle_nvml_errors
def get_average_power_usage(self) -> int:
    """Read the average power draw from NVML field values. Units: mW.

    Grace Hopper chips are queried with the module power scope; all other
    chips with the GPU power scope. A non-success field status is raised
    as `pynvml.NVMLError`.
    """
    scope = pynvml.NVML_POWER_SCOPE_MODULE if self._is_grace_hopper else pynvml.NVML_POWER_SCOPE_GPU
    query = [(pynvml.NVML_FI_DEV_POWER_AVERAGE, scope)]

    sample = pynvml.nvmlDeviceGetFieldValues(self.handle, query)[0]
    status = sample.nvmlReturn
    if status != pynvml.NVML_SUCCESS:
        raise pynvml.NVMLError(status)
    return sample.value.uiVal

get_instant_power_usage

get_instant_power_usage()

Return the current power draw of the GPU. Units: mW.

Source code in zeus/device/gpu/nvidia.py
224
225
226
227
228
229
230
231
232
233
234
235
@_handle_nvml_errors
def get_instant_power_usage(self) -> int:
    """Read the instantaneous power draw from NVML field values. Units: mW.

    Grace Hopper chips are queried with the module power scope; all other
    chips with the GPU power scope. A non-success field status is raised
    as `pynvml.NVMLError`.
    """
    scope = pynvml.NVML_POWER_SCOPE_MODULE if self._is_grace_hopper else pynvml.NVML_POWER_SCOPE_GPU
    query = [(pynvml.NVML_FI_DEV_POWER_INSTANT, scope)]

    sample = pynvml.nvmlDeviceGetFieldValues(self.handle, query)[0]
    status = sample.nvmlReturn
    if status != pynvml.NVML_SUCCESS:
        raise pynvml.NVMLError(status)
    return sample.value.uiVal

get_average_memory_power_usage

get_average_memory_power_usage()

Return the average power draw of the GPU's memory. Units: mW.

Warning

This isn't exactly documented in NVML at the time of writing, but nvidia-smi makes use of this API.

Confirmed working on H100 80GB HBM3. Confirmed not working on A40.

Source code in zeus/device/gpu/nvidia.py
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
@_handle_nvml_errors
def get_average_memory_power_usage(self) -> int:
    """Read the average power draw of the GPU's memory. Units: mW.

    A reading of 0 triggers a Python warning and is still returned.

    !!! Warning
        NVML does not clearly document this query at the time of writing,
        although `nvidia-smi` relies on it.

        Confirmed working on H100 80GB HBM3. Confirmed not working on A40.
    """
    memory_field = (pynvml.NVML_FI_DEV_POWER_AVERAGE, pynvml.NVML_POWER_SCOPE_MEMORY)
    sample = pynvml.nvmlDeviceGetFieldValues(self.handle, [memory_field])[0]
    status = sample.nvmlReturn
    if status != pynvml.NVML_SUCCESS:
        raise pynvml.NVMLError(status)

    reading = sample.value.uiVal
    if reading == 0:
        warnings.warn(
            "Average memory power returned 0. The current GPU may not be supported.",
            stacklevel=1,
        )
    return reading

supports_get_total_energy_consumption

supports_get_total_energy_consumption()

Check if the GPU supports retrieving total energy consumption.

Source code in zeus/device/gpu/nvidia.py
261
262
263
264
265
266
267
268
269
270
@_handle_nvml_errors
def supports_get_total_energy_consumption(self) -> bool:
    """Tell whether total energy consumption can be queried on this GPU.

    The architecture is compared against Volta on the first call and the
    result is cached on the instance for later calls.
    """
    supported = self._supportsGetTotalEnergyConsumption
    if supported is None:
        # Volta and newer microarchitectures support the energy counter.
        architecture = pynvml.nvmlDeviceGetArchitecture(self.handle)
        supported = architecture >= pynvml.NVML_DEVICE_ARCH_VOLTA
        self._supportsGetTotalEnergyConsumption = supported
    return supported

get_total_energy_consumption

get_total_energy_consumption()

Return the total energy consumption of the specified GPU. Units: mJ.

Source code in zeus/device/gpu/nvidia.py
272
273
274
275
@_handle_nvml_errors
def get_total_energy_consumption(self) -> int:
    """Query NVML for the total energy consumption of this GPU. Units: mJ."""
    energy_mj = pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle)
    return energy_mj

get_gpu_temperature

get_gpu_temperature()

Return the current GPU temperature. Units: Celsius.

Source code in zeus/device/gpu/nvidia.py
277
278
279
280
281
@_handle_nvml_errors
def get_gpu_temperature(self) -> int:
    """Query NVML for the GPU temperature sensor reading. Units: Celsius."""
    return pynvml.nvmlDeviceGetTemperatureV(self.handle, pynvml.NVML_TEMPERATURE_GPU)

ZeusdNVIDIAGPU

Bases: NVIDIAGPU

An NVIDIAGPU that sets GPU knobs that require SYS_ADMIN via zeusd.

Some NVML APIs (e.g., setting persistence mode, power limit, frequency) require the Linux security capability SYS_ADMIN, which is virtually sudo. This class overrides those methods so that they send a request to the Zeus daemon.

See here for details on system privileges required.

Source code in zeus/device/gpu/nvidia.py
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
class ZeusdNVIDIAGPU(NVIDIAGPU):
    """An `NVIDIAGPU` whose privileged setters are delegated to zeusd.

    Some NVML APIs (e.g., setting persistence mode, power limit, frequency)
    require the Linux security capability `SYS_ADMIN`, which is virtually
    `sudo`. The overrides in this class forward those operations to the
    Zeus daemon through a `ZeusdClient` instead of calling NVML directly.

    See [here](https://ml.energy/zeus/getting_started/#system-privileges)
    for details on system privileges required.
    """

    def __init__(self, gpu_index: int, client: ZeusdClient) -> None:
        """Create a GPU object whose setters go through a Zeus daemon.

        Args:
            gpu_index: Index of the GPU.
            client: ZeusdClient connected to the daemon.
        """
        super().__init__(gpu_index)
        self._gpu_index = gpu_index
        self._client = client
        # The daemon must expose both GPU reads and GPU control.
        require_capabilities(client, read_gpu=True, control_gpu=True)

    @property
    def supports_nonblocking_setters(self) -> bool:
        """Return True: setters forward their `block` flag to the daemon client."""
        return True

    def set_power_management_limit(self, power_limit_mw: int, block: bool = True) -> None:
        """Ask the daemon to set the power management limit. Unit: mW.

        No request is sent when the current limit already equals
        `power_limit_mw`.
        """
        if self.get_power_management_limit() != power_limit_mw:
            self._client.set_power_limit([self._gpu_index], power_limit_mw, block)

    @_handle_nvml_errors
    def reset_power_management_limit(self, block: bool = True) -> None:
        """Set the power management limit back to the default reported by NVML."""
        default_limit_mw = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(self.handle)
        self.set_power_management_limit(default_limit_mw, block)

    def set_persistence_mode(self, enabled: bool, block: bool = True) -> None:
        """Ask the daemon to enable or disable persistence mode."""
        targets = [self._gpu_index]
        self._client.set_persistence_mode(targets, enabled, block)

    def get_persistence_mode(self) -> bool:
        """Ask the daemon whether persistence mode is enabled for this GPU."""
        index = self._gpu_index
        modes = self._client.get_persistence_mode([index])
        return modes[index]

    def set_memory_locked_clocks(self, min_clock_mhz: int, max_clock_mhz: int, block: bool = True) -> None:
        """Ask the daemon to lock the memory clock to a range. Units: MHz."""
        targets = [self._gpu_index]
        self._client.set_mem_locked_clocks(targets, min_clock_mhz, max_clock_mhz, block)

    def reset_memory_locked_clocks(self, block: bool = True) -> None:
        """Ask the daemon to reset the locked memory clocks to the default."""
        targets = [self._gpu_index]
        self._client.reset_mem_locked_clocks(targets, block)

    def set_gpu_locked_clocks(self, min_clock_mhz: int, max_clock_mhz: int, block: bool = True) -> None:
        """Ask the daemon to lock the GPU clock to a range. Units: MHz."""
        targets = [self._gpu_index]
        self._client.set_gpu_locked_clocks(targets, min_clock_mhz, max_clock_mhz, block)

    def reset_gpu_locked_clocks(self, block: bool = True) -> None:
        """Ask the daemon to reset the locked GPU clocks to the default."""
        targets = [self._gpu_index]
        self._client.reset_gpu_locked_clocks(targets, block)

supports_nonblocking_setters property

supports_nonblocking_setters

Return True if the GPU object supports non-blocking configuration setters.

__init__

__init__(gpu_index, client)

Parameters:

Name Type Description Default
gpu_index int

Index of the GPU.

required
client ZeusdClient

ZeusdClient connected to the daemon.

required
Source code in zeus/device/gpu/nvidia.py
296
297
298
299
300
301
302
303
304
305
306
def __init__(self, gpu_index: int, client: ZeusdClient) -> None:
    """Create a GPU object whose setters go through a Zeus daemon.

    Args:
        gpu_index: Index of the GPU.
        client: ZeusdClient connected to the daemon.
    """
    super().__init__(gpu_index)
    self._gpu_index = gpu_index
    self._client = client
    # The daemon must expose both GPU reads and GPU control.
    require_capabilities(client, read_gpu=True, control_gpu=True)

set_power_management_limit

set_power_management_limit(power_limit_mw, block=True)

Set the GPU's power management limit. Unit: mW.

Source code in zeus/device/gpu/nvidia.py
313
314
315
316
317
318
def set_power_management_limit(self, power_limit_mw: int, block: bool = True) -> None:
    """Ask the daemon to set the power management limit. Unit: mW.

    No request is sent when the current limit already equals
    `power_limit_mw`.
    """
    if self.get_power_management_limit() != power_limit_mw:
        self._client.set_power_limit([self._gpu_index], power_limit_mw, block)

reset_power_management_limit

reset_power_management_limit(block=True)

Reset the GPU's power management limit to the default value.

Source code in zeus/device/gpu/nvidia.py
320
321
322
323
324
325
326
@_handle_nvml_errors
def reset_power_management_limit(self, block: bool = True) -> None:
    """Set the power management limit back to the default reported by NVML."""
    default_limit_mw = pynvml.nvmlDeviceGetPowerManagementDefaultLimit(self.handle)
    self.set_power_management_limit(default_limit_mw, block)

set_persistence_mode

set_persistence_mode(enabled, block=True)

Set persistence mode.

Source code in zeus/device/gpu/nvidia.py
328
329
330
def set_persistence_mode(self, enabled: bool, block: bool = True) -> None:
    """Ask the daemon to enable or disable persistence mode."""
    targets = [self._gpu_index]
    self._client.set_persistence_mode(targets, enabled, block)

get_persistence_mode

get_persistence_mode()

Return whether persistence mode is currently enabled.

Source code in zeus/device/gpu/nvidia.py
332
333
334
def get_persistence_mode(self) -> bool:
    """Ask the daemon whether persistence mode is enabled for this GPU."""
    index = self._gpu_index
    modes = self._client.get_persistence_mode([index])
    return modes[index]

set_memory_locked_clocks

set_memory_locked_clocks(min_clock_mhz, max_clock_mhz, block=True)

Lock the memory clock to a specified range. Units: MHz.

Source code in zeus/device/gpu/nvidia.py
336
337
338
def set_memory_locked_clocks(self, min_clock_mhz: int, max_clock_mhz: int, block: bool = True) -> None:
    """Ask the daemon to lock the memory clock to a range. Units: MHz."""
    targets = [self._gpu_index]
    self._client.set_mem_locked_clocks(targets, min_clock_mhz, max_clock_mhz, block)

reset_memory_locked_clocks

reset_memory_locked_clocks(block=True)

Reset the locked memory clocks to the default.

Source code in zeus/device/gpu/nvidia.py
340
341
342
def reset_memory_locked_clocks(self, block: bool = True) -> None:
    """Ask the daemon to reset the locked memory clocks to the default."""
    targets = [self._gpu_index]
    self._client.reset_mem_locked_clocks(targets, block)

set_gpu_locked_clocks

set_gpu_locked_clocks(min_clock_mhz, max_clock_mhz, block=True)

Lock the GPU clock to a specified range. Units: MHz.

Source code in zeus/device/gpu/nvidia.py
344
345
346
def set_gpu_locked_clocks(self, min_clock_mhz: int, max_clock_mhz: int, block: bool = True) -> None:
    """Ask the daemon to lock the GPU clock to a range. Units: MHz."""
    targets = [self._gpu_index]
    self._client.set_gpu_locked_clocks(targets, min_clock_mhz, max_clock_mhz, block)

reset_gpu_locked_clocks

reset_gpu_locked_clocks(block=True)

Reset the locked GPU clocks to the default.

Source code in zeus/device/gpu/nvidia.py
348
349
350
def reset_gpu_locked_clocks(self, block: bool = True) -> None:
    """Ask the daemon to reset the locked GPU clocks to the default."""
    targets = [self._gpu_index]
    self._client.reset_gpu_locked_clocks(targets, block)

NVIDIAGPUs

Bases: GPUs

Implementation of GPUs for NVIDIA GPUs.

CUDA_VISIBLE_DEVICES environment variable is respected if set. For example, if there are 4 GPUs on the node and CUDA_VISIBLE_DEVICES=0,2, only GPUs 0 and 2 are instantiated. In this case, to access GPU of CUDA index 0, use the index 0, and for CUDA index 2, use the index 1.

If you have the Zeus daemon deployed, make sure you have set the ZEUSD_SOCK_PATH environment variable to the path of the Zeus daemon socket. This class will automatically use ZeusdNVIDIAGPU if ZEUSD_SOCK_PATH is set.

Note

For Grace Hopper, the power and energy values are for the entire superchip/module.

Source code in zeus/device/gpu/nvidia.py
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
class NVIDIAGPUs(gpu_common.GPUs):
    """Implementation of `GPUs` for NVIDIA GPUs.

    `CUDA_VISIBLE_DEVICES` environment variable is respected if set.
    For example, if there are 4 GPUs on the node and `CUDA_VISIBLE_DEVICES=0,2`,
    only GPUs 0 and 2 are instantiated. In this case, to access
    GPU of CUDA index 0, use the index 0, and for CUDA index 2, use the index 1.

    If you have the Zeus daemon deployed, make sure you have set the `ZEUSD_SOCK_PATH`
    environment variable to the path of the Zeus daemon socket. This class will
    automatically use [`ZeusdNVIDIAGPU`][zeus.device.gpu.nvidia.ZeusdNVIDIAGPU]
    if `ZEUSD_SOCK_PATH` is set.

    !!! Note
        For Grace Hopper, the power and energy values are for the entire superchip/module.
    """

    def __init__(self, ensure_homogeneous: bool = False) -> None:
        """Initialize NVML and set up the GPUs.

        Args:
            ensure_homogeneous (bool): If True, ensures that all tracked GPUs have the same name.

        Raises:
            ZeusBaseGPUError: If NVML raises during initialization. The concrete class is
                looked up in `NVIDIAGPU._exception_map` by NVML error code, falling back
                to `ZeusBaseGPUError` for unmapped codes.
            ZeusGPUInitError: If `CUDA_VISIBLE_DEVICES` is set to an unsupported value or
                the Zeus daemon client cannot be set up.
        """
        try:
            pynvml.nvmlInit()
            self._init_gpus()
            if ensure_homogeneous:
                self._ensure_homogeneous()
        except pynvml.NVMLError as e:
            exception_class = NVIDIAGPU._exception_map.get(
                e.value,  # type: ignore
                gpu_common.ZeusBaseGPUError,
            )
            raise exception_class(str(e)) from e

    @property
    def gpus(self) -> Sequence[NVIDIAGPU]:
        """Return a list of NVIDIAGPU objects being tracked."""
        return self._gpus

    def _init_gpus(self) -> None:
        """Populate `self._gpus` with one GPU object per visible device index.

        Raises:
            ZeusGPUInitError: If `CUDA_VISIBLE_DEVICES` is empty, names a MIG device,
                or contains an entry that is not an integer index; also if creating
                the Zeus daemon client or its GPU objects raises `ZeusBaseError`.
        """
        # Must respect `CUDA_VISIBLE_DEVICES` if set
        if (visible_device := os.environ.get("CUDA_VISIBLE_DEVICES")) is not None:
            if not visible_device:
                raise gpu_common.ZeusGPUInitError(
                    "CUDA_VISIBLE_DEVICES is set to an empty string. "
                    "It should either be unset or a comma-separated list of GPU indices."
                )
            entries = [entry.strip() for entry in visible_device.split(",")]
            # Inspect every entry, not just the first one: a MIG device listed after
            # an integer index (e.g. "0,MIG-...") must be rejected with the same
            # explanation instead of failing later inside `int()`.
            if any(entry.startswith("MIG") for entry in entries):
                raise gpu_common.ZeusGPUInitError(
                    "CUDA_VISIBLE_DEVICES contains MIG devices. NVML (the library used by Zeus) "
                    "currently does not support measuring the power or energy consumption of MIG "
                    "slices. You can still measure the whole GPU by temporarily setting "
                    "CUDA_VISIBLE_DEVICES to integer GPU indices and restoring it afterwards."
                )
            try:
                visible_indices = [int(entry) for entry in entries]
            except ValueError as e:
                # Surface malformed values (e.g. GPU UUIDs) as an initialization error
                # like the other invalid-configuration cases above.
                raise gpu_common.ZeusGPUInitError(
                    f"CUDA_VISIBLE_DEVICES is set to '{visible_device}', but Zeus expects "
                    "a comma-separated list of integer GPU indices."
                ) from e
        else:
            visible_indices = list(range(pynvml.nvmlDeviceGetCount()))

        # If Zeusd env vars are set, use ZeusdNVIDIAGPU backed by a shared client.
        config = ZeusdConfig.from_env()
        if config is not None:
            try:
                client = ZeusdClient(config)
                self._gpus = [ZeusdNVIDIAGPU(gpu_num, client) for gpu_num in visible_indices]
            except ZeusBaseError as e:
                raise gpu_common.ZeusGPUInitError(str(e)) from e
            for gpu in self._gpus:
                gpu._disable_sys_admin_warning = True
        else:
            self._gpus = [NVIDIAGPU(gpu_num) for gpu_num in visible_indices]

    def __del__(self) -> None:
        """Shut down NVML."""
        # Best effort: NVML errors raised during shutdown are deliberately ignored.
        with contextlib.suppress(pynvml.NVMLError):
            pynvml.nvmlShutdown()

gpus property

gpus

Return a list of NVIDIAGPU objects being tracked.

__init__

__init__(ensure_homogeneous=False)

Parameters:

Name Type Description Default
ensure_homogeneous bool

If True, ensures that all tracked GPUs have the same name.

False
Source code in zeus/device/gpu/nvidia.py
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
def __init__(self, ensure_homogeneous: bool = False) -> None:
    """Bring up NVML and instantiate the tracked GPUs.

    Args:
        ensure_homogeneous (bool): When True, ensures that all tracked GPUs have the same name.
    """
    try:
        pynvml.nvmlInit()
        self._init_gpus()
        if ensure_homogeneous:
            self._ensure_homogeneous()
    except pynvml.NVMLError as nvml_error:
        # Translate the NVML error code into the matching Zeus exception class,
        # falling back to the generic GPU error for codes without a mapping.
        code_to_exception = NVIDIAGPU._exception_map
        error_code = nvml_error.value  # type: ignore
        zeus_exception = code_to_exception.get(error_code, gpu_common.ZeusBaseGPUError)
        raise zeus_exception(str(nvml_error)) from nvml_error

__del__

__del__()

Shut down NVML.

Source code in zeus/device/gpu/nvidia.py
425
426
427
428
def __del__(self) -> None:
    """Release NVML when this object is finalized."""
    # Best effort: NVML errors raised during shutdown are deliberately ignored.
    ignore_nvml_errors = contextlib.suppress(pynvml.NVMLError)
    with ignore_nvml_errors:
        pynvml.nvmlShutdown()

nvml_is_available cached

nvml_is_available()

Check if NVML is available.

Source code in zeus/device/gpu/nvidia.py
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
@lru_cache(maxsize=1)
def nvml_is_available() -> bool:
    """Check if NVML is available.

    The return value is memoized by `lru_cache`, so after the first call that
    returns, later calls reuse the result without re-importing or re-initializing.

    Returns:
        True if `pynvml` was imported and `pynvml.nvmlInit()` succeeded. False if
        the import failed or `nvmlInit` raised `NVMLError`.

    Raises:
        ImportError: If the imported `pynvml` module has no `_nvmlGetFunctionPointer`
            attribute, which this function treats as the unofficial package.
    """
    try:
        import pynvml
    except ImportError:
        logger.info("Failed to import `pynvml`. Make sure you have `nvidia-ml-py` installed.")
        return False

    # Detect unofficial pynvml packages.
    # If detected, this should be a critical error.
    if not hasattr(pynvml, "_nvmlGetFunctionPointer"):
        logger.error("Unofficial pynvml package detected!")
        raise ImportError(
            "Unofficial pynvml package detected! "
            "This causes conflicts with the official NVIDIA bindings. "
            "Please remove with `pip uninstall pynvml` and instead use the official "
            "bindings from NVIDIA: `nvidia-ml-py`. "
        )

    # Keep the `try` body down to the one call that can raise `NVMLError`.
    try:
        pynvml.nvmlInit()
    except pynvml.NVMLError as e:
        logger.info("pynvml is available but could not initialize NVML: %s.", e)
        return False
    else:
        logger.info("pynvml is available and initialized.")
        return True