zeus.device.gpu.nvidia

NVIDIA GPUs.

NVIDIAGPU

Bases: GPU

Implementation of GPU for NVIDIA GPUs.
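
A minimal usage sketch, assuming nvidia-ml-py is installed and at least one NVIDIA GPU is visible; NVIDIAGPUs (documented further down this page) initializes NVML and constructs one NVIDIAGPU per visible device. Later sketches in the power and clock sections reuse this gpu object.

from zeus.device.gpu.nvidia import NVIDIAGPUs

gpus = NVIDIAGPUs()       # initializes NVML and wraps every visible GPU
gpu = gpus.gpus[0]        # first visible device, an NVIDIAGPU instance

print(gpu.getName())                # GPU model name
print(gpu.getInstantPowerUsage())   # current power draw in mW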

Source code in zeus/device/gpu/nvidia.py
class NVIDIAGPU(gpu_common.GPU):
    """Implementation of `GPU` for NVIDIA GPUs."""

    def __init__(self, gpu_index: int) -> None:
        """Initialize the GPU object."""
        super().__init__(gpu_index)
        self._get_handle()
        self._supportsGetTotalEnergyConsumption = None

    _exception_map = {
        pynvml.NVML_ERROR_UNINITIALIZED: gpu_common.ZeusGPUInitError,
        pynvml.NVML_ERROR_INVALID_ARGUMENT: gpu_common.ZeusGPUInvalidArgError,
        pynvml.NVML_ERROR_NOT_SUPPORTED: gpu_common.ZeusGPUNotSupportedError,
        pynvml.NVML_ERROR_NO_PERMISSION: gpu_common.ZeusGPUNoPermissionError,
        pynvml.NVML_ERROR_ALREADY_INITIALIZED: gpu_common.ZeusGPUAlreadyInitializedError,
        pynvml.NVML_ERROR_NOT_FOUND: gpu_common.ZeusGPUNotFoundError,
        pynvml.NVML_ERROR_INSUFFICIENT_SIZE: gpu_common.ZeusGPUInsufficientSizeError,
        pynvml.NVML_ERROR_INSUFFICIENT_POWER: gpu_common.ZeusGPUInsufficientPowerError,
        pynvml.NVML_ERROR_DRIVER_NOT_LOADED: gpu_common.ZeusGPUDriverNotLoadedError,
        pynvml.NVML_ERROR_TIMEOUT: gpu_common.ZeusGPUTimeoutError,
        pynvml.NVML_ERROR_IRQ_ISSUE: gpu_common.ZeusGPUIRQError,
        pynvml.NVML_ERROR_LIBRARY_NOT_FOUND: gpu_common.ZeusGPULibraryNotFoundError,
        pynvml.NVML_ERROR_FUNCTION_NOT_FOUND: gpu_common.ZeusGPUFunctionNotFoundError,
        pynvml.NVML_ERROR_CORRUPTED_INFOROM: gpu_common.ZeusGPUCorruptedInfoROMError,
        pynvml.NVML_ERROR_GPU_IS_LOST: gpu_common.ZeusGPULostError,
        pynvml.NVML_ERROR_RESET_REQUIRED: gpu_common.ZeusGPUResetRequiredError,
        pynvml.NVML_ERROR_OPERATING_SYSTEM: gpu_common.ZeusGPUOperatingSystemError,
        pynvml.NVML_ERROR_LIB_RM_VERSION_MISMATCH: gpu_common.ZeusGPULibRMVersionMismatchError,
        pynvml.NVML_ERROR_MEMORY: gpu_common.ZeusGPUMemoryError,
        pynvml.NVML_ERROR_UNKNOWN: gpu_common.ZeusGPUUnknownError,
    }

    @_handle_nvml_errors
    def _get_handle(self):
        self.handle = pynvml.nvmlDeviceGetHandleByIndex(self.gpu_index)

    @_handle_nvml_errors
    def getName(self) -> str:
        """Return the name of the GPU model."""
        return pynvml.nvmlDeviceGetName(self.handle)

    @property
    def supports_nonblocking_setters(self) -> bool:
        """Return True if the GPU object supports non-blocking configuration setters."""
        return False

    @_handle_nvml_errors
    def getPowerManagementLimitConstraints(self) -> tuple[int, int]:
        """Return the minimum and maximum power management limits. Units: mW."""
        min_, max_ = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(self.handle)
        return (min_, max_)

    @_handle_nvml_errors
    def setPowerManagementLimit(self, power_limit_mw: int, _block: bool = True) -> None:
        """Set the GPU's power management limit. Unit: mW."""
        pynvml.nvmlDeviceSetPowerManagementLimit(self.handle, power_limit_mw)

    @_handle_nvml_errors
    def resetPowerManagementLimit(self, _block: bool = True) -> None:
        """Reset the GPU's power management limit to the default value."""
        pynvml.nvmlDeviceSetPowerManagementLimit(
            self.handle,
            pynvml.nvmlDeviceGetPowerManagementDefaultLimit(self.handle),
        )

    @_handle_nvml_errors
    def setPersistenceMode(self, enabled: bool, _block: bool = True) -> None:
        """Set persistence mode."""
        if enabled:
            pynvml.nvmlDeviceSetPersistenceMode(
                self.handle, pynvml.NVML_FEATURE_ENABLED
            )
        else:
            pynvml.nvmlDeviceSetPersistenceMode(
                self.handle, pynvml.NVML_FEATURE_DISABLED
            )

    @_handle_nvml_errors
    def getSupportedMemoryClocks(self) -> list[int]:
        """Return a list of supported memory clock frequencies. Units: MHz."""
        return pynvml.nvmlDeviceGetSupportedMemoryClocks(self.handle)

    @_handle_nvml_errors
    def setMemoryLockedClocks(
        self, min_clock_mhz: int, max_clock_mhz: int, _block: bool = True
    ) -> None:
        """Lock the memory clock to a specified range. Units: MHz."""
        pynvml.nvmlDeviceSetMemoryLockedClocks(
            self.handle, min_clock_mhz, max_clock_mhz
        )

    @_handle_nvml_errors
    def resetMemoryLockedClocks(self, _block: bool = True) -> None:
        """Reset the locked memory clocks to the default."""
        pynvml.nvmlDeviceResetMemoryLockedClocks(self.handle)

    @_handle_nvml_errors
    def getSupportedGraphicsClocks(
        self, memory_clock_mhz: int | None = None
    ) -> list[int]:
        """Return a list of supported graphics clock frequencies. Units: MHz.

        Args:
            memory_clock_mhz: Memory clock frequency to use. Some GPUs have
                different supported graphics clocks depending on the memory clock.
        """
        return pynvml.nvmlDeviceGetSupportedGraphicsClocks(
            self.handle, memory_clock_mhz
        )

    @_handle_nvml_errors
    def setGpuLockedClocks(
        self, min_clock_mhz: int, max_clock_mhz: int, _block: bool = True
    ) -> None:
        """Lock the GPU clock to a specified range. Units: MHz."""
        pynvml.nvmlDeviceSetGpuLockedClocks(self.handle, min_clock_mhz, max_clock_mhz)

    @_handle_nvml_errors
    def resetGpuLockedClocks(self, _block: bool = True) -> None:
        """Reset the locked GPU clocks to the default."""
        pynvml.nvmlDeviceResetGpuLockedClocks(self.handle)

    @_handle_nvml_errors
    def getAveragePowerUsage(self) -> int:
        """Return the average power draw of the GPU. Units: mW."""
        metric = pynvml.nvmlDeviceGetFieldValues(
            self.handle, [pynvml.NVML_FI_DEV_POWER_AVERAGE]
        )[0]
        if (ret := metric.nvmlReturn) != pynvml.NVML_SUCCESS:
            raise pynvml.NVMLError(ret)
        return metric.value.uiVal

    @_handle_nvml_errors
    def getInstantPowerUsage(self) -> int:
        """Return the current power draw of the GPU. Units: mW."""
        metric = pynvml.nvmlDeviceGetFieldValues(
            self.handle, [pynvml.NVML_FI_DEV_POWER_INSTANT]
        )[0]
        if (ret := metric.nvmlReturn) != pynvml.NVML_SUCCESS:
            raise pynvml.NVMLError(ret)
        return metric.value.uiVal

    @_handle_nvml_errors
    def getAverageMemoryPowerUsage(self) -> int:
        """Return the average power draw of the GPU's memory. Units: mW.

        !!! Warning
            This isn't exactly documented in NVML at the time of writing, but `nvidia-smi`
            makes use of this API.

            Confirmed working on H100 80GB HBM3. Confirmed not working on A40.
        """
        metric = pynvml.nvmlDeviceGetFieldValues(
            self.handle,
            [(pynvml.NVML_FI_DEV_POWER_AVERAGE, pynvml.NVML_POWER_SCOPE_MEMORY)],
        )[0]
        if (ret := metric.nvmlReturn) != pynvml.NVML_SUCCESS:
            raise pynvml.NVMLError(ret)
        power = metric.value.uiVal
        if power == 0:
            warnings.warn(
                "Average memory power returned 0. The current GPU may not be supported.",
                stacklevel=1,
            )
        return power

    @_handle_nvml_errors
    def supportsGetTotalEnergyConsumption(self) -> bool:
        """Check if the GPU supports retrieving total energy consumption."""
        # Supported on Volta or newer microarchitectures
        if self._supportsGetTotalEnergyConsumption is None:
            self._supportsGetTotalEnergyConsumption = (
                pynvml.nvmlDeviceGetArchitecture(self.handle)
                >= pynvml.NVML_DEVICE_ARCH_VOLTA
            )

        return self._supportsGetTotalEnergyConsumption

    @_handle_nvml_errors
    def getTotalEnergyConsumption(self) -> int:
        """Return the total energy consumption of the specified GPU. Units: mJ."""
        return pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle)

supports_nonblocking_setters property

supports_nonblocking_setters

Return True if the GPU object supports non-blocking configuration setters.

__init__

__init__(gpu_index)
Source code in zeus/device/gpu/nvidia.py
def __init__(self, gpu_index: int) -> None:
    """Initialize the GPU object."""
    super().__init__(gpu_index)
    self._get_handle()
    self._supportsGetTotalEnergyConsumption = None

getName

getName()

Return the name of the GPU model.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def getName(self) -> str:
    """Return the name of the GPU model."""
    return pynvml.nvmlDeviceGetName(self.handle)

getPowerManagementLimitConstraints

getPowerManagementLimitConstraints()

Return the minimum and maximum power management limits. Units: mW.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def getPowerManagementLimitConstraints(self) -> tuple[int, int]:
    """Return the minimum and maximum power management limits. Units: mW."""
    min_, max_ = pynvml.nvmlDeviceGetPowerManagementLimitConstraints(self.handle)
    return (min_, max_)

setPowerManagementLimit

setPowerManagementLimit(power_limit_mw, _block=True)

Set the GPU's power management limit. Unit: mW.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def setPowerManagementLimit(self, power_limit_mw: int, _block: bool = True) -> None:
    """Set the GPU's power management limit. Unit: mW."""
    pynvml.nvmlDeviceSetPowerManagementLimit(self.handle, power_limit_mw)
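
Reading the constraints first makes it easy to keep a requested limit valid. A minimal sketch, reusing the gpu object from the earlier example and assuming sufficient privileges (setting the power limit via NVML normally requires SYS_ADMIN); the 250 W target is illustrative only.

target_mw = 250_000  # hypothetical target; pick what your experiment needs

min_mw, max_mw = gpu.getPowerManagementLimitConstraints()
# Clamp the request into the supported range before applying it.
gpu.setPowerManagementLimit(max(min_mw, min(target_mw, max_mw)))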

resetPowerManagementLimit

resetPowerManagementLimit(_block=True)

Reset the GPU's power management limit to the default value.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def resetPowerManagementLimit(self, _block: bool = True) -> None:
    """Reset the GPU's power management limit to the default value."""
    pynvml.nvmlDeviceSetPowerManagementLimit(
        self.handle,
        pynvml.nvmlDeviceGetPowerManagementDefaultLimit(self.handle),
    )

setPersistenceMode

setPersistenceMode(enabled, _block=True)

Set persistence mode.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def setPersistenceMode(self, enabled: bool, _block: bool = True) -> None:
    """Set persistence mode."""
    if enabled:
        pynvml.nvmlDeviceSetPersistenceMode(
            self.handle, pynvml.NVML_FEATURE_ENABLED
        )
    else:
        pynvml.nvmlDeviceSetPersistenceMode(
            self.handle, pynvml.NVML_FEATURE_DISABLED
        )
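
Persistence mode keeps the NVIDIA driver resident even when no process holds the GPU, avoiding reinitialization latency between runs. A short sketch, again assuming the gpu object from above and SYS_ADMIN privileges:

gpu.setPersistenceMode(True)   # keep the driver resident between processes
# ... run experiments ...
gpu.setPersistenceMode(False)  # restore the default afterwards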

getSupportedMemoryClocks

getSupportedMemoryClocks()

Return a list of supported memory clock frequencies. Units: MHz.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def getSupportedMemoryClocks(self) -> list[int]:
    """Return a list of supported memory clock frequencies. Units: MHz."""
    return pynvml.nvmlDeviceGetSupportedMemoryClocks(self.handle)

setMemoryLockedClocks

setMemoryLockedClocks(
    min_clock_mhz, max_clock_mhz, _block=True
)

Lock the memory clock to a specified range. Units: MHz.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def setMemoryLockedClocks(
    self, min_clock_mhz: int, max_clock_mhz: int, _block: bool = True
) -> None:
    """Lock the memory clock to a specified range. Units: MHz."""
    pynvml.nvmlDeviceSetMemoryLockedClocks(
        self.handle, min_clock_mhz, max_clock_mhz
    )
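
A common pattern is to pin the memory clock to its highest supported frequency so that later graphics-clock sweeps are comparable. A sketch under that assumption, reusing the gpu object from above (requires SYS_ADMIN):

mem_clocks = gpu.getSupportedMemoryClocks()   # MHz
highest = max(mem_clocks)
gpu.setMemoryLockedClocks(highest, highest)   # pin to a single frequency
# ... run the workload ...
gpu.resetMemoryLockedClocks()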

resetMemoryLockedClocks

resetMemoryLockedClocks(_block=True)

Reset the locked memory clocks to the default.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def resetMemoryLockedClocks(self, _block: bool = True) -> None:
    """Reset the locked memory clocks to the default."""
    pynvml.nvmlDeviceResetMemoryLockedClocks(self.handle)

getSupportedGraphicsClocks

getSupportedGraphicsClocks(memory_clock_mhz=None)

Return a list of supported graphics clock frequencies. Units: MHz.

Parameters:

    memory_clock_mhz (int | None, default None): Memory clock frequency to use. Some GPUs have different supported graphics clocks depending on the memory clock.
Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def getSupportedGraphicsClocks(
    self, memory_clock_mhz: int | None = None
) -> list[int]:
    """Return a list of supported graphics clock frequencies. Units: MHz.

    Args:
        memory_clock_mhz: Memory clock frequency to use. Some GPUs have
            different supported graphics clocks depending on the memory clock.
    """
    return pynvml.nvmlDeviceGetSupportedGraphicsClocks(
        self.handle, memory_clock_mhz
    )

setGpuLockedClocks

setGpuLockedClocks(
    min_clock_mhz, max_clock_mhz, _block=True
)

Lock the GPU clock to a specified range. Units: MHz.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def setGpuLockedClocks(
    self, min_clock_mhz: int, max_clock_mhz: int, _block: bool = True
) -> None:
    """Lock the GPU clock to a specified range. Units: MHz."""
    pynvml.nvmlDeviceSetGpuLockedClocks(self.handle, min_clock_mhz, max_clock_mhz)
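
Graphics clock ranges are usually chosen from the frequencies NVML reports as supported. A sketch that caps the GPU clock at a mid-range supported frequency and restores the default afterwards; the choice of cap is illustrative, and the gpu object is the one from the first example.

gfx_clocks = sorted(
    gpu.getSupportedGraphicsClocks(max(gpu.getSupportedMemoryClocks()))
)
cap = gfx_clocks[len(gfx_clocks) // 2]      # a mid-range supported frequency
gpu.setGpuLockedClocks(gfx_clocks[0], cap)
# ... run the workload ...
gpu.resetGpuLockedClocks()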

resetGpuLockedClocks

resetGpuLockedClocks(_block=True)

Reset the locked GPU clocks to the default.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def resetGpuLockedClocks(self, _block: bool = True) -> None:
    """Reset the locked GPU clocks to the default."""
    pynvml.nvmlDeviceResetGpuLockedClocks(self.handle)

getAveragePowerUsage

getAveragePowerUsage()

Return the average power draw of the GPU. Units: mW.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def getAveragePowerUsage(self) -> int:
    """Return the average power draw of the GPU. Units: mW."""
    metric = pynvml.nvmlDeviceGetFieldValues(
        self.handle, [pynvml.NVML_FI_DEV_POWER_AVERAGE]
    )[0]
    if (ret := metric.nvmlReturn) != pynvml.NVML_SUCCESS:
        raise pynvml.NVMLError(ret)
    return metric.value.uiVal

getInstantPowerUsage

getInstantPowerUsage()

Return the current power draw of the GPU. Units: mW.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def getInstantPowerUsage(self) -> int:
    """Return the current power draw of the GPU. Units: mW."""
    metric = pynvml.nvmlDeviceGetFieldValues(
        self.handle, [pynvml.NVML_FI_DEV_POWER_INSTANT]
    )[0]
    if (ret := metric.nvmlReturn) != pynvml.NVML_SUCCESS:
        raise pynvml.NVMLError(ret)
    return metric.value.uiVal
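
getInstantPowerUsage returns a single sample, so averaging power over a window means polling it. A minimal sketch (the 100 ms period and 1 s window are arbitrary), reusing the gpu object from above:

import time

samples = []
for _ in range(10):                              # ~1 second window
    samples.append(gpu.getInstantPowerUsage())   # mW
    time.sleep(0.1)
print(f"average draw: {sum(samples) / len(samples) / 1000:.1f} W")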

getAverageMemoryPowerUsage

getAverageMemoryPowerUsage()

Return the average power draw of the GPU's memory. Units: mW.

Warning

This isn't exactly documented in NVML at the time of writing, but nvidia-smi makes use of this API.

Confirmed working on H100 80GB HBM3. Confirmed not working on A40.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def getAverageMemoryPowerUsage(self) -> int:
    """Return the average power draw of the GPU's memory. Units: mW.

    !!! Warning
        This isn't exactly documented in NVML at the time of writing, but `nvidia-smi`
        makes use of this API.

        Confirmed working on H100 80GB HBM3. Confirmed not working on A40.
    """
    metric = pynvml.nvmlDeviceGetFieldValues(
        self.handle,
        [(pynvml.NVML_FI_DEV_POWER_AVERAGE, pynvml.NVML_POWER_SCOPE_MEMORY)],
    )[0]
    if (ret := metric.nvmlReturn) != pynvml.NVML_SUCCESS:
        raise pynvml.NVMLError(ret)
    power = metric.value.uiVal
    if power == 0:
        warnings.warn(
            "Average memory power returned 0. The current GPU may not be supported.",
            stacklevel=1,
        )
    return power

supportsGetTotalEnergyConsumption

supportsGetTotalEnergyConsumption()

Check if the GPU supports retrieving total energy consumption.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def supportsGetTotalEnergyConsumption(self) -> bool:
    """Check if the GPU supports retrieving total energy consumption."""
    # Supported on Volta or newer microarchitectures
    if self._supportsGetTotalEnergyConsumption is None:
        self._supportsGetTotalEnergyConsumption = (
            pynvml.nvmlDeviceGetArchitecture(self.handle)
            >= pynvml.NVML_DEVICE_ARCH_VOLTA
        )

    return self._supportsGetTotalEnergyConsumption

getTotalEnergyConsumption

getTotalEnergyConsumption()

Return the total energy consumption of the specified GPU. Units: mJ.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def getTotalEnergyConsumption(self) -> int:
    """Return the total energy consumption of the specified GPU. Units: mJ."""
    return pynvml.nvmlDeviceGetTotalEnergyConsumption(self.handle)
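
Because the counter is cumulative, the energy of a code region is the difference between two readings. A sketch with a hypothetical helper measure_energy_mj, guarded by the support check above:

def measure_energy_mj(gpu, workload) -> int:
    """Return the energy (mJ) the GPU consumed while workload() ran."""
    if not gpu.supportsGetTotalEnergyConsumption():
        raise RuntimeError("The energy counter needs Volta or newer.")
    start = gpu.getTotalEnergyConsumption()
    workload()
    return gpu.getTotalEnergyConsumption() - start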

ZeusdNVIDIAGPU

Bases: NVIDIAGPU

An NVIDIAGPU that sets GPU knobs that require SYS_ADMIN via zeusd.

Some NVML APIs (e.g., setting persistence mode, power limit, frequency) require the Linux security capability SYS_ADMIN, which is effectively sudo. This class overrides those methods so that they send a request to the Zeus daemon instead.

See https://ml.energy/zeus/getting_started/#system-privileges for details on the system privileges required.
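
A minimal sketch of using this class directly, assuming zeusd is running, listening on its default socket path, and writable by this process; NVML still needs to be initialized for the read-only calls inherited from NVIDIAGPU. In practice you rarely construct it yourself: NVIDIAGPUs below does so automatically when the ZEUSD_SOCK_PATH environment variable is set.

import pynvml
from zeus.device.gpu.nvidia import ZeusdNVIDIAGPU

pynvml.nvmlInit()  # the parent class still grabs an NVML handle for reads
gpu = ZeusdNVIDIAGPU(gpu_index=0, zeusd_sock_path="/var/run/zeusd.sock")

# The request goes to zeusd over the Unix domain socket, so this process
# does not need SYS_ADMIN. block=False returns without waiting for the
# daemon to finish applying the new limit.
gpu.setPowerManagementLimit(300_000, block=False)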

Source code in zeus/device/gpu/nvidia.py
class ZeusdNVIDIAGPU(NVIDIAGPU):
    """An NVIDIAGPU that sets GPU knobs that require `SYS_ADMIN` via zeusd.

    Some NVML APIs (e.g., setting persistence mode, power limit, frequency)
    require the Linux security capability `SYS_ADMIN`, which is effectively `sudo`.
    This class overrides those methods so that they send a request to the
    Zeus daemon.

    See [here](https://ml.energy/zeus/getting_started/#system-privileges)
    for details on system privileges required.
    """

    def __init__(
        self,
        gpu_index: int,
        zeusd_sock_path: str = "/var/run/zeusd.sock",
    ) -> None:
        """Initialize NVML and sets up the GPUs.

        Args:
            gpu_index (int): Index of the GPU.
            zeusd_sock_path (str): Path to the Zeus daemon socket.
        """
        super().__init__(gpu_index)
        self.zeusd_sock_path = zeusd_sock_path

        self._client = httpx.Client(transport=httpx.HTTPTransport(uds=zeusd_sock_path))
        self._url_prefix = f"http://zeusd/gpu/{gpu_index}"

    @property
    def supports_nonblocking_setters(self) -> bool:
        """Return True if the GPU object supports non-blocking configuration setters."""
        return True

    def setPowerManagementLimit(self, power_limit_mw: int, block: bool = True) -> None:
        """Set the GPU's power management limit. Unit: mW."""
        resp = self._client.post(
            self._url_prefix + "/set_power_limit",
            json=dict(power_limit_mw=power_limit_mw, block=block),
        )
        if resp.status_code != 200:
            raise ZeusdError(f"Failed to set power management limit: {resp.text}")
        logger.debug("Took %s ms to set power limit", resp.elapsed.microseconds / 1000)

    @_handle_nvml_errors
    def resetPowerManagementLimit(self, block: bool = True) -> None:
        """Reset the GPU's power management limit to the default value."""
        self.setPowerManagementLimit(
            pynvml.nvmlDeviceGetPowerManagementDefaultLimit(self.handle),
            block,
        )

    def setPersistenceMode(self, enabled: bool, block: bool = False) -> None:
        """Set persistence mode."""
        resp = self._client.post(
            self._url_prefix + "/set_persistence_mode",
            json=dict(enabled=enabled, block=block),
        )
        if resp.status_code != 200:
            raise ZeusdError(f"Failed to set persistence mode: {resp.text}")
        logger.debug(
            "Took %s ms to set persistence mode", resp.elapsed.microseconds / 1000
        )

    def setMemoryLockedClocks(
        self, min_clock_mhz: int, max_clock_mhz: int, block: bool = True
    ) -> None:
        """Lock the memory clock to a specified range. Units: MHz."""
        resp = self._client.post(
            self._url_prefix + "/set_mem_locked_clocks",
            json=dict(
                min_clock_mhz=min_clock_mhz, max_clock_mhz=max_clock_mhz, block=block
            ),
        )
        if resp.status_code != 200:
            raise ZeusdError(f"Failed to set memory locked clocks: {resp.text}")
        logger.debug(
            "Took %s ms to set memory locked clocks", resp.elapsed.microseconds / 1000
        )

    def resetMemoryLockedClocks(self, block: bool = True) -> None:
        """Reset the locked memory clocks to the default."""
        resp = self._client.post(
            self._url_prefix + "/reset_mem_locked_clocks", json=dict(block=block)
        )
        if resp.status_code != 200:
            raise ZeusdError(f"Failed to reset memory locked clocks: {resp.text}")

    def setGpuLockedClocks(
        self, min_clock_mhz: int, max_clock_mhz: int, block: bool = True
    ) -> None:
        """Lock the GPU clock to a specified range. Units: MHz."""
        resp = self._client.post(
            self._url_prefix + "/set_gpu_locked_clocks",
            json=dict(
                min_clock_mhz=min_clock_mhz, max_clock_mhz=max_clock_mhz, block=block
            ),
        )
        if resp.status_code != 200:
            raise ZeusdError(f"Failed to set GPU locked clocks: {resp.text}")

    def resetGpuLockedClocks(self, block: bool = True) -> None:
        """Reset the locked GPU clocks to the default."""
        resp = self._client.post(
            self._url_prefix + "/reset_gpu_locked_clocks", json=dict(block=block)
        )
        if resp.status_code != 200:
            raise ZeusdError(f"Failed to reset GPU locked clocks: {resp.text}")

supports_nonblocking_setters property

supports_nonblocking_setters

Return True if the GPU object supports non-blocking configuration setters.

__init__

__init__(gpu_index, zeusd_sock_path='/var/run/zeusd.sock')

Parameters:

    gpu_index (int, required): Index of the GPU.
    zeusd_sock_path (str, default '/var/run/zeusd.sock'): Path to the Zeus daemon socket.
Source code in zeus/device/gpu/nvidia.py
def __init__(
    self,
    gpu_index: int,
    zeusd_sock_path: str = "/var/run/zeusd.sock",
) -> None:
    """Initialize NVML and sets up the GPUs.

    Args:
        gpu_index (int): Index of the GPU.
        zeusd_sock_path (str): Path to the Zeus daemon socket.
    """
    super().__init__(gpu_index)
    self.zeusd_sock_path = zeusd_sock_path

    self._client = httpx.Client(transport=httpx.HTTPTransport(uds=zeusd_sock_path))
    self._url_prefix = f"http://zeusd/gpu/{gpu_index}"

setPowerManagementLimit

setPowerManagementLimit(power_limit_mw, block=True)

Set the GPU's power management limit. Unit: mW.

Source code in zeus/device/gpu/nvidia.py
def setPowerManagementLimit(self, power_limit_mw: int, block: bool = True) -> None:
    """Set the GPU's power management limit. Unit: mW."""
    resp = self._client.post(
        self._url_prefix + "/set_power_limit",
        json=dict(power_limit_mw=power_limit_mw, block=block),
    )
    if resp.status_code != 200:
        raise ZeusdError(f"Failed to set power management limit: {resp.text}")
    logger.debug("Took %s ms to set power limit", resp.elapsed.microseconds / 1000)

resetPowerManagementLimit

resetPowerManagementLimit(block=True)

Reset the GPU's power management limit to the default value.

Source code in zeus/device/gpu/nvidia.py
@_handle_nvml_errors
def resetPowerManagementLimit(self, block: bool = True) -> None:
    """Reset the GPU's power management limit to the default value."""
    self.setPowerManagementLimit(
        pynvml.nvmlDeviceGetPowerManagementDefaultLimit(self.handle),
        block,
    )

setPersistenceMode

setPersistenceMode(enabled, block=False)

Set persistence mode.

Source code in zeus/device/gpu/nvidia.py
def setPersistenceMode(self, enabled: bool, block: bool = False) -> None:
    """Set persistence mode."""
    resp = self._client.post(
        self._url_prefix + "/set_persistence_mode",
        json=dict(enabled=enabled, block=block),
    )
    if resp.status_code != 200:
        raise ZeusdError(f"Failed to set persistence mode: {resp.text}")
    logger.debug(
        "Took %s ms to set persistence mode", resp.elapsed.microseconds / 1000
    )

setMemoryLockedClocks

setMemoryLockedClocks(
    min_clock_mhz, max_clock_mhz, block=True
)

Lock the memory clock to a specified range. Units: MHz.

Source code in zeus/device/gpu/nvidia.py
def setMemoryLockedClocks(
    self, min_clock_mhz: int, max_clock_mhz: int, block: bool = True
) -> None:
    """Lock the memory clock to a specified range. Units: MHz."""
    resp = self._client.post(
        self._url_prefix + "/set_mem_locked_clocks",
        json=dict(
            min_clock_mhz=min_clock_mhz, max_clock_mhz=max_clock_mhz, block=block
        ),
    )
    if resp.status_code != 200:
        raise ZeusdError(f"Failed to set memory locked clocks: {resp.text}")
    logger.debug(
        "Took %s ms to set memory locked clocks", resp.elapsed.microseconds / 1000
    )

resetMemoryLockedClocks

resetMemoryLockedClocks(block=True)

Reset the locked memory clocks to the default.

Source code in zeus/device/gpu/nvidia.py
def resetMemoryLockedClocks(self, block: bool = True) -> None:
    """Reset the locked memory clocks to the default."""
    resp = self._client.post(
        self._url_prefix + "/reset_mem_locked_clocks", json=dict(block=block)
    )
    if resp.status_code != 200:
        raise ZeusdError(f"Failed to reset memory locked clocks: {resp.text}")

setGpuLockedClocks

setGpuLockedClocks(
    min_clock_mhz, max_clock_mhz, block=True
)

Lock the GPU clock to a specified range. Units: MHz.

Source code in zeus/device/gpu/nvidia.py
def setGpuLockedClocks(
    self, min_clock_mhz: int, max_clock_mhz: int, block: bool = True
) -> None:
    """Lock the GPU clock to a specified range. Units: MHz."""
    resp = self._client.post(
        self._url_prefix + "/set_gpu_locked_clocks",
        json=dict(
            min_clock_mhz=min_clock_mhz, max_clock_mhz=max_clock_mhz, block=block
        ),
    )
    if resp.status_code != 200:
        raise ZeusdError(f"Failed to set GPU locked clocks: {resp.text}")

resetGpuLockedClocks

resetGpuLockedClocks(block=True)

Reset the locked GPU clocks to the default.

Source code in zeus/device/gpu/nvidia.py
def resetGpuLockedClocks(self, block: bool = True) -> None:
    """Reset the locked GPU clocks to the default."""
    resp = self._client.post(
        self._url_prefix + "/reset_gpu_locked_clocks", json=dict(block=block)
    )
    if resp.status_code != 200:
        raise ZeusdError(f"Failed to reset GPU locked clocks: {resp.text}")

NVIDIAGPUs

Bases: GPUs

Implementation of GPUs for NVIDIA GPUs.

CUDA_VISIBLE_DEVICES environment variable is respected if set. For example, if there are 4 GPUs on the node and CUDA_VISIBLE_DEVICES=0,2, only GPUs 0 and 2 are instantiated. In this case, the GPU at CUDA index 0 is accessed with index 0, and the GPU at CUDA index 2 with index 1.

If you have the Zeus daemon deployed, make sure you have set the ZEUSD_SOCK_PATH environment variable to the path of the Zeus daemon socket. This class will automatically use ZeusdNVIDIAGPU if ZEUSD_SOCK_PATH is set.
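
A sketch of the index mapping described above, assuming a node with four GPUs where only two are exposed via CUDA_VISIBLE_DEVICES:

import os
from zeus.device.gpu.nvidia import NVIDIAGPUs

os.environ["CUDA_VISIBLE_DEVICES"] = "0,2"   # must be set before construction

gpus = NVIDIAGPUs(ensure_homogeneous=True)
first = gpus.gpus[0]    # physical GPU 0 (CUDA index 0)
second = gpus.gpus[1]   # physical GPU 2 (CUDA index 2)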

Source code in zeus/device/gpu/nvidia.py
class NVIDIAGPUs(gpu_common.GPUs):
    """Implementation of `GPUs` for NVIDIA GPUs.

    `CUDA_VISIBLE_DEVICES` environment variable is respected if set.
    For example, if there are 4 GPUs on the node and `CUDA_VISIBLE_DEVICES=0,2`,
    only GPUs 0 and 2 are instantiated. In this case, to access
    GPU of CUDA index 0, use the index 0, and for CUDA index 2, use the index 1.

    If you have the Zeus daemon deployed, make sure you have set the `ZEUSD_SOCK_PATH`
    environment variable to the path of the Zeus daemon socket. This class will
    automatically use [`ZeusdNVIDIAGPU`][zeus.device.gpu.nvidia.ZeusdNVIDIAGPU]
    if `ZEUSD_SOCK_PATH` is set.
    """

    def __init__(self, ensure_homogeneous: bool = False) -> None:
        """Initialize NVML and sets up the GPUs.

        Args:
            ensure_homogeneous (bool): If True, ensures that all tracked GPUs have the same name.
        """
        try:
            pynvml.nvmlInit()
            self._init_gpus()
            if ensure_homogeneous:
                self._ensure_homogeneous()
        except pynvml.NVMLError as e:
            exception_class = NVIDIAGPU._exception_map.get(
                e.value,  # pyright: ignore[reportAttributeAccessIssue]
                gpu_common.ZeusBaseGPUError,
            )
            raise exception_class(
                e.msg  # pyright: ignore[reportAttributeAccessIssue]
            ) from e

    @property
    def gpus(self) -> Sequence[NVIDIAGPU]:
        """Return a list of NVIDIAGPU objects being tracked."""
        return self._gpus

    def _init_gpus(self) -> None:
        # Must respect `CUDA_VISIBLE_DEVICES` if set
        if (visible_device := os.environ.get("CUDA_VISIBLE_DEVICES")) is not None:
            if not visible_device:
                raise gpu_common.ZeusGPUInitError(
                    "CUDA_VISIBLE_DEVICES is set to an empty string. "
                    "It should either be unset or a comma-separated list of GPU indices."
                )
            if visible_device.startswith("MIG"):
                raise gpu_common.ZeusGPUInitError(
                    "CUDA_VISIBLE_DEVICES contains MIG devices. NVML (the library used by Zeus) "
                    "currently does not support measuring the power or energy consumption of MIG "
                    "slices. You can still measure the whole GPU by temporarily setting "
                    "CUDA_VISIBLE_DEVICES to integer GPU indices and restoring it afterwards."
                )
            visible_indices = [int(idx) for idx in visible_device.split(",")]
        else:
            visible_indices = list(range(pynvml.nvmlDeviceGetCount()))

        # If `ZEUSD_SOCK_PATH` is set, always use ZeusdNVIDIAGPU
        if (sock_path := os.environ.get("ZEUSD_SOCK_PATH")) is not None:
            if not Path(sock_path).exists():
                raise ZeusdError(
                    f"ZEUSD_SOCK_PATH points to non-existent file: {sock_path}"
                )
            if not Path(sock_path).is_socket():
                raise ZeusdError(f"ZEUSD_SOCK_PATH is not a socket: {sock_path}")
            if not os.access(sock_path, os.W_OK):
                raise ZeusdError(f"ZEUSD_SOCK_PATH is not writable: {sock_path}")
            self._gpus = [
                ZeusdNVIDIAGPU(gpu_num, sock_path) for gpu_num in visible_indices
            ]
            # Disable the warning about SYS_ADMIN capabilities
            self._disable_sys_admin_warning = True

        # Otherwise just use NVIDIAGPU
        else:
            self._gpus = [NVIDIAGPU(gpu_num) for gpu_num in visible_indices]

    def __del__(self) -> None:
        """Shut down NVML."""
        with contextlib.suppress(pynvml.NVMLError):
            pynvml.nvmlShutdown()

gpus property

gpus

Return a list of NVIDIAGPU objects being tracked.

__init__

__init__(ensure_homogeneous=False)

Parameters:

    ensure_homogeneous (bool, default False): If True, ensures that all tracked GPUs have the same name.
Source code in zeus/device/gpu/nvidia.py
def __init__(self, ensure_homogeneous: bool = False) -> None:
    """Initialize NVML and sets up the GPUs.

    Args:
        ensure_homogeneous (bool): If True, ensures that all tracked GPUs have the same name.
    """
    try:
        pynvml.nvmlInit()
        self._init_gpus()
        if ensure_homogeneous:
            self._ensure_homogeneous()
    except pynvml.NVMLError as e:
        exception_class = NVIDIAGPU._exception_map.get(
            e.value,  # pyright: ignore[reportAttributeAccessIssue]
            gpu_common.ZeusBaseGPUError,
        )
        raise exception_class(
            e.msg  # pyright: ignore[reportAttributeAccessIssue]
        ) from e

__del__

__del__()

Shut down NVML.

Source code in zeus/device/gpu/nvidia.py
def __del__(self) -> None:
    """Shut down NVML."""
    with contextlib.suppress(pynvml.NVMLError):
        pynvml.nvmlShutdown()

nvml_is_available cached

nvml_is_available()

Check if NVML is available.

Source code in zeus/device/gpu/nvidia.py
@lru_cache(maxsize=1)
def nvml_is_available() -> bool:
    """Check if NVML is available."""
    try:
        import pynvml
    except ImportError:
        logger.info(
            "Failed to import `pynvml`. Make sure you have `nvidia-ml-py` installed."
        )
        return False

    # Detect unofficial pynvml packages.
    # If detected, this should be a critical error.
    if not hasattr(pynvml, "_nvmlGetFunctionPointer"):
        logger.error("Unoffical pynvml package detected!")
        raise ImportError(
            "Unofficial pynvml package detected! "
            "This causes conflicts with the official NVIDIA bindings. "
            "Please remove with `pip uninstall pynvml` and instead use the official "
            "bindings from NVIDIA: `nvidia-ml-py`. "
        )

    try:
        pynvml.nvmlInit()
        logger.info("pynvml is available and initialized.")
        return True
    except pynvml.NVMLError as e:
        logger.info("pynvml is available but could not initialize NVML: %s.", e)
        return False
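
This check is a convenient guard before constructing the NVIDIA backend. A minimal sketch:

from zeus.device.gpu.nvidia import NVIDIAGPUs, nvml_is_available

if nvml_is_available():
    gpus = NVIDIAGPUs()
else:
    gpus = None   # fall back to another vendor backend or skip GPU monitoring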