# Source code for zero.hardware

"""Tools related to devices, memory, etc."""

__all__ = ['free_memory', 'get_gpu_info']

import gc
import math
from typing import Any, Dict, List

import torch
from pynvml import NVMLError_LibraryNotFound
from pynvml.smi import nvidia_smi

_GPU_INFO_QUERY = 'memory.total, memory.used, memory.free, utilization.gpu'


def free_memory() -> None:
    """Free GPU-memory occupied by `torch` and run the garbage collector.

    Collects garbage first (so tensors with no remaining references actually
    release their CUDA blocks), then asks `torch` to return cached memory to
    the driver.

    Warning:
        There is a small chunk of GPU-memory (occupied by drivers) that is
        impossible to free. It is a `torch` "limitation", so the function
        inherits this property.
    """
    gc.collect()
    if torch.cuda.is_available():
        # Fix: the original checked `torch.cuda.is_available()` twice; the
        # result cannot change mid-call, so one guard covers all CUDA work.
        # torch has wrong .pyi
        torch.cuda.synchronize()  # type: ignore
        # Collect again after synchronize so freshly-completed kernels'
        # tensors are released before the cache is emptied.
        gc.collect()
        torch.cuda.empty_cache()
def get_gpu_info(precise: bool = False) -> List[Dict[str, Any]]:
    """Get statistics about GPU devices.

    Includes information about memory (total, free and used) and utilization.
    Some figures are represented in two ways: with raw units and with
    percentage.

    Args:
        precise: if False, all data is rounded (to Mb for memory, to % for
            percentages)
    Returns:
        Information about GPU devices.
    Raises:
        RuntimeError: if necessary cuda-related libraries are not found.
            Usually, it means that the function is run on a machine without
            GPU.

    Examples:
        .. code-block::

            print(get_gpu_info())

        Output example (formatted for convenience):

        .. code-block:: none

            [
                {
                    'util%': 0,
                    'total': 11019,
                    'used': 0,
                    'free': 11019,
                    'used%': 0,
                    'free%': 100,
                },
                {
                    'util%': 0,
                    'total': 11016,
                    'used': 0,
                    'free': 11016,
                    'used%': 0,
                    'free%': 100,
                },
            ]

    Warning:
        The function directly collects information using the :code:`pynvml`
        library, hence, settings like :code:`CUDA_VISIBLE_DEVICES` don't
        affect the result.
    """
    try:
        smi = nvidia_smi.getInstance()
    except NVMLError_LibraryNotFound as err:
        raise RuntimeError(
            'Failed to get information about GPU memory. '
            'Make sure that you actually have GPU and all relevant software installed.'
        ) from err

    # Either keep exact floats or floor everything to whole Mb / whole %.
    round_ = float if precise else math.floor

    def unpack(raw_device: Dict[str, Any]) -> Dict[str, Any]:
        # Per-device record: utilization is reported as-is, memory figures
        # are rounded per `precise`, and used/free are also derived as
        # percentages of the total.
        memory = raw_device['fb_memory_usage']
        info: Dict[str, Any] = {'util%': raw_device['utilization']['gpu_util']}
        for key in ('total', 'used', 'free'):
            info[key] = round_(memory[key])
        for key in ('used', 'free'):
            info[key + '%'] = round_(info[key] / info['total'] * 100)
        return info

    raw_info = smi.DeviceQuery(_GPU_INFO_QUERY)
    return [unpack(device) for device in raw_info['gpu']]