Source code for zero.hardware

"""Tools related to devices, memory, etc."""

import gc
from typing import Any, Dict

import pynvml
import torch
from pynvml import NVMLError_LibraryNotFound


def free_memory() -> None:
    """Free GPU memory occupied by `torch` and run the garbage collector.

    Warning:
        There is a small chunk of GPU memory (occupied by drivers) that is
        impossible to free. It is a `torch` "limitation", so the function
        inherits this property.

    Inspired by:
    https://github.com/xtinkt/editable/blob/1c80efb80c196cdb925fc994fc9ed576a246c7a1/lib/utils/basic.py#L124
    """
    gc.collect()
    if torch.cuda.is_available():
        # torch has wrong .pyi
        torch.cuda.synchronize()  # type: ignore
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
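
# A minimal usage sketch (not part of the original module; the tensor size and
# the `_demo_free_memory` helper name are assumptions for illustration):
# allocate a large tensor, drop the reference, and return the cached blocks.
def _demo_free_memory() -> None:
    if not torch.cuda.is_available():  # the demo itself needs a GPU
        return
    x = torch.randn(4096, 4096, device='cuda')  # hypothetical workload
    del x
    free_memory()
    # torch.cuda.memory_reserved() reports how much memory the caching
    # allocator still holds; after the call it should drop (close) to zero.
    print(torch.cuda.memory_reserved())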
def get_gpus_info() -> Dict[str, Any]:
    """Get information about GPU devices: driver version, memory, utilization, etc.

    The example below shows what kind of information is returned. All memory
    figures are given in bytes.

    Returns:
        Information about GPU devices.

    Raises:
        RuntimeError: if the necessary CUDA-related libraries are not found.
            Usually, this means that the function is being run on a machine
            without a GPU.

    Examples:
        .. code-block::

            print(get_gpus_info())

        Output example (formatted for convenience):

        .. code-block:: none

            {
                'driver': '440.33.01',
                'devices': [
                    {
                        'name': 'GeForce RTX 2080 Ti',
                        'memory_total': 11554717696,
                        'memory_free': 11554652160,
                        'memory_used': 65536,
                        'utilization': 0,
                    },
                    {
                        'name': 'GeForce RTX 2080 Ti',
                        'memory_total': 11552096256,
                        'memory_free': 11552030720,
                        'memory_used': 65536,
                        'utilization': 0,
                    },
                ],
            }

    Warning:
        The 'devices' value contains information about *all* GPUs regardless
        of the value of :code:`CUDA_VISIBLE_DEVICES`.
    """
    try:
        pynvml.nvmlInit()
    except NVMLError_LibraryNotFound as err:
        raise RuntimeError(
            'Failed to get information about GPU memory. '
            'Make sure that you actually have a GPU and all the relevant'
            ' software installed.'
        ) from err
    n_devices = pynvml.nvmlDeviceGetCount()
    devices = []
    for device_id in range(n_devices):
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
        memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        devices.append(
            {
                # NVML returns the device name (and, below, the driver
                # version) as bytes, hence the explicit decoding.
                'name': str(pynvml.nvmlDeviceGetName(handle), 'utf-8'),
                'memory_total': memory_info.total,
                'memory_free': memory_info.free,
                'memory_used': memory_info.used,
                'utilization': pynvml.nvmlDeviceGetUtilizationRates(handle).gpu,
            }
        )
    return {
        'driver': str(pynvml.nvmlSystemGetDriverVersion(), 'utf-8'),
        'devices': devices,
    }
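
# A minimal sketch (not part of the original module; the `_demo_print_gpus_info`
# helper name is an assumption) showing how the returned dictionary can be
# rendered as a human-readable per-device summary.
def _demo_print_gpus_info() -> None:
    info = get_gpus_info()
    print(f"Driver: {info['driver']}")
    for i, device in enumerate(info['devices']):
        # Memory figures are in bytes; convert to GiB for readability.
        free_gib = device['memory_free'] / 1024 ** 3
        total_gib = device['memory_total'] / 1024 ** 3
        print(
            f"[{i}] {device['name']}: "
            f"{free_gib:.1f}/{total_gib:.1f} GiB free, "
            f"{device['utilization']}% utilization"
        )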