Source code for zero.hardware
"""Tools related to devices, memory, etc."""
__all__ = ['free_memory', 'get_gpu_info']
import gc
import math
from typing import Any, Dict, List
import torch
from pynvml import NVMLError_LibraryNotFound
from pynvml.smi import nvidia_smi
_GPU_INFO_QUERY = 'memory.total, memory.used, memory.free, utilization.gpu'
def free_memory() -> None:
    """Free GPU-memory occupied by `torch` and run the garbage collector.

    Warning:
        There is a small chunk of GPU-memory (occupied by drivers) that is impossible
        to free. It is a `torch` "limitation", so the function inherits this property.
    """
    gc.collect()
    if torch.cuda.is_available():
        # torch has wrong .pyi
        torch.cuda.synchronize()  # type: ignore
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
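

# The following helper is an illustrative sketch added for this page, not part of
# the original zero.hardware module. It assumes a CUDA-enabled build of torch
# (torch>=1.4 for `memory_reserved`) and simply makes the effect of `free_memory`
# observable: the caching allocator's reserved memory before and after the call.
def _free_memory_verbose() -> None:
    before = torch.cuda.memory_reserved() if torch.cuda.is_available() else 0
    free_memory()
    after = torch.cuda.memory_reserved() if torch.cuda.is_available() else 0
    print(f'Reserved CUDA memory: {before} -> {after} bytes')
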
def get_gpu_info(precise: bool = False) -> List[Dict[str, Any]]:
    """Get statistics about GPU devices.

    Includes information about memory (total, free and used) and utilization. Some
    figures are reported in two ways: in raw units and as a percentage.

    Args:
        precise: if False, all figures are rounded down (to whole MiB for memory,
            to whole percent for percentages)

    Returns:
        Information about GPU devices.

    Raises:
        RuntimeError: if the necessary CUDA-related libraries are not found. Usually,
            this means that the function is run on a machine without a GPU.

    Examples:
        .. code-block::

            print(get_gpu_info())

        Output example (formatted for convenience):

        .. code-block:: none

            [
                {
                    'util%': 0,
                    'total': 11019,
                    'used': 0,
                    'free': 11019,
                    'used%': 0,
                    'free%': 100,
                },
                {
                    'util%': 0,
                    'total': 11016,
                    'used': 0,
                    'free': 11016,
                    'used%': 0,
                    'free%': 100,
                },
            ]

    Warning:
        The function collects information directly via the :code:`pynvml` library,
        hence settings like :code:`CUDA_VISIBLE_DEVICES` do not affect the result.
    """
    try:
        smi = nvidia_smi.getInstance()
    except NVMLError_LibraryNotFound as err:
        raise RuntimeError(
            'Failed to get information about GPU memory. '
            'Make sure that you actually have a GPU and all relevant software installed.'
        ) from err
    raw_info = smi.DeviceQuery(_GPU_INFO_QUERY)
    # When precise=False, figures are floored to whole units instead of kept as floats.
    process_float = (lambda x: float(x)) if precise else math.floor  # noqa

    def unpack_raw_gpu_info(raw_gpu_info):
        # 'gpu_util' is already a percentage; 'fb_memory_usage' holds the raw
        # memory readings (total/used/free).
        gpu_info = {'util%': raw_gpu_info['utilization']['gpu_util']}
        gpu_info.update(
            (x, process_float(raw_gpu_info['fb_memory_usage'][x]))
            for x in ['total', 'used', 'free']
        )
        # Derive used/free as percentages of the total memory.
        for x in 'used', 'free':
            gpu_info[x + '%'] = process_float(gpu_info[x] / gpu_info['total'] * 100)
        return gpu_info

    return list(map(unpack_raw_gpu_info, raw_info['gpu']))
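

# Illustrative usage sketch, also not part of the original module: use
# `get_gpu_info` to pick the device with the most free memory. It assumes at
# least one NVIDIA GPU and a working pynvml installation. Note that, as the
# docstring warns, the indices follow the pynvml/nvidia-smi ordering, which may
# differ from torch's ordering when CUDA_VISIBLE_DEVICES is set.
if __name__ == '__main__':
    info = get_gpu_info()
    best = max(range(len(info)), key=lambda i: info[i]['free'])
    print(f'GPU #{best} has the most free memory: '
          f'{info[best]["free"]} MiB free of {info[best]["total"]} MiB')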