Model weights memory corruption on CUDA

Hi,
After initializing a simple model and moving it to the CUDA device, I see random changes in its weights. This does not happen when using the CPU device.

Running on Ubuntu 20.04 LTS and 22.04 LTS, 64-bit.
NVIDIA GeForce RTX 2070
Python 3.8 & 3.10.6
torch version '2.0.0+cu117' (from pip)

Any ideas on how to solve this issue?

Code to reproduce:

from typing import Tuple

import torch
from torch import nn


class UNetTiny(nn.Module):

    def __init__(self, in_channels: int, out_channels: int, num_block_filters: Tuple[int, ...]):

        super().__init__()

        padding_mode = 'replicate'

        # Blocks of down portion
        down_modules = [
            nn.Conv2d(in_channels=in_channels, out_channels=num_block_filters[0], kernel_size=3, stride=2, padding=1, padding_mode=padding_mode)
        ]

        for i in range(1, len(num_block_filters)):
            down_modules.append(
                nn.Conv2d(in_channels=num_block_filters[i-1], out_channels=num_block_filters[i], kernel_size=3, stride=2, padding=1, padding_mode=padding_mode)
            )

        self._down_modules = nn.ModuleList(down_modules)

        up_deconv_modules = list()

        for i in range(len(num_block_filters) - 1, 0, -1):
            up_deconv_modules.append(
                torch.nn.ConvTranspose2d(in_channels=num_block_filters[i], out_channels=num_block_filters[i - 1], output_padding=1, padding=1, kernel_size=3, stride=2),
            )

        self._up_deconv_modules = nn.ModuleList(up_deconv_modules)

        self._out_deconv = torch.nn.ConvTranspose2d(in_channels=num_block_filters[0], out_channels=out_channels, output_padding=1, padding=1, kernel_size=3, stride=2)

    def forward(self, x) -> torch.Tensor:

        for l in self._down_modules:
            x = l(x)
            x = torch.nn.functional.relu(x)

        for l_deconv in self._up_deconv_modules:
            x = l_deconv(x)
            x = torch.nn.functional.relu(x)

        res = self._out_deconv(x)
        return res


def _analyze_weights(unet_model):

    # With PyTorch's default initialization, conv weights and biases are drawn
    # from uniform distributions bounded well below 1.0 in absolute value, so a
    # value above 1.0 means the parameter changed after initialization.
    for name, tmp_module in unet_model.named_modules():
        if type(tmp_module) not in (nn.Conv2d, nn.ConvTranspose2d):
            continue

        max_weight = tmp_module.weight.detach().abs().max().item()
        if max_weight > 1.0:
            print(f'{name} weight::\t{max_weight}')

        max_bias = tmp_module.bias.detach().abs().max().item()
        if max_bias > 1.0:
            print(f'{name} bias::\t{max_bias}')


def _check_memory_corruption():

    dev = "cuda:0" if torch.cuda.is_available() else "cpu"
    # dev = 'cpu'
    device = torch.device(dev)

    print('Using device: ', device)

    N = 100
    num_block_filters = (64, 128, 256)

    unet = UNetTiny(
        in_channels=1,
        out_channels=3,
        num_block_filters=num_block_filters
    )

    unet.to(device=device)
    unet.train()

    # The weights are only read in this loop; there is no forward pass,
    # backward pass or optimizer step, so they should never change.
    for i in range(N):
        print(f'weights on iteration {i}')
        _analyze_weights(unet)


if __name__ == '__main__':

    _check_memory_corruption()

The output I’m getting:

Using device: cuda:0
weights on iteration 0
weights on iteration 1
weights on iteration 2
weights on iteration 3
weights on iteration 4
weights on iteration 5
weights on iteration 6
weights on iteration 7
weights on iteration 8
weights on iteration 9
weights on iteration 10
weights on iteration 11
weights on iteration 12
weights on iteration 13
weights on iteration 14
weights on iteration 15
weights on iteration 16
weights on iteration 17
_up_deconv_modules.0 weight:: 254.71319580078125
weights on iteration 18
weights on iteration 19
weights on iteration 20
weights on iteration 21
weights on iteration 22
weights on iteration 23
_down_modules.2 weight:: 1815.224853515625
weights on iteration 24
weights on iteration 25
weights on iteration 26
weights on iteration 27
_down_modules.2 weight:: 1815.224853515625
weights on iteration 28
_down_modules.2 weight:: 1166.347900390625
weights on iteration 29
weights on iteration 30
weights on iteration 31
weights on iteration 32
weights on iteration 33
weights on iteration 34
weights on iteration 35
weights on iteration 36
weights on iteration 37
weights on iteration 38
weights on iteration 39
weights on iteration 40
weights on iteration 41
weights on iteration 42
weights on iteration 43
_up_deconv_modules.0 weight:: 249.4268798828125
weights on iteration 44
weights on iteration 45
weights on iteration 46
weights on iteration 47
weights on iteration 48
weights on iteration 49
weights on iteration 50
weights on iteration 51
_up_deconv_modules.0 weight:: 457.2268981933594
weights on iteration 52
weights on iteration 53
weights on iteration 54
weights on iteration 55
_down_modules.2 weight:: 427.8851318359375
weights on iteration 56
weights on iteration 57
weights on iteration 58
weights on iteration 59
weights on iteration 60
weights on iteration 61
weights on iteration 62
_down_modules.2 weight:: 1423.19970703125
weights on iteration 63
weights on iteration 64
weights on iteration 65
weights on iteration 66
weights on iteration 67
weights on iteration 68
weights on iteration 69
_up_deconv_modules.0 weight:: 254.71319580078125
weights on iteration 70
weights on iteration 71
weights on iteration 72
weights on iteration 73
weights on iteration 74
weights on iteration 75
weights on iteration 76
weights on iteration 77
weights on iteration 78
weights on iteration 79
weights on iteration 80
weights on iteration 81
weights on iteration 82
weights on iteration 83
weights on iteration 84
weights on iteration 85
weights on iteration 86
weights on iteration 87
weights on iteration 88
weights on iteration 89
weights on iteration 90
weights on iteration 91
_down_modules.2 weight:: 1815.224853515625
weights on iteration 92
weights on iteration 93
weights on iteration 94
weights on iteration 95
weights on iteration 96
weights on iteration 97
weights on iteration 98
weights on iteration 99

Process finished with exit code 0

Did you compare the parameters between both devices, or are you assuming the max values are guaranteed to never exceed your specified threshold? In any case, you might want to check whether disabling IOMMU helps.
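For example, something along these lines would compare the CUDA copy of the model directly against a CPU reference instead of relying on a fixed threshold (just a sketch; model_factory and the 1e-6 tolerance are arbitrary placeholders):

import torch


def _compare_against_cpu_reference(model_factory, n_checks: int = 100) -> None:
    # Build two models and make them bit-identical by copying the CPU weights
    # onto the CUDA model, then repeatedly compare all parameters.
    cpu_model = model_factory()
    gpu_model = model_factory().to('cuda:0')
    gpu_model.load_state_dict(cpu_model.state_dict())

    for i in range(n_checks):
        for (name, p_cpu), (_, p_gpu) in zip(cpu_model.named_parameters(),
                                             gpu_model.named_parameters()):
            diff = (p_cpu.detach() - p_gpu.detach().cpu()).abs().max().item()
            if diff > 1e-6:
                print(f'check {i}: {name} differs by {diff}')


# e.g. _compare_against_cpu_reference(lambda: UNetTiny(1, 3, (64, 128, 256)))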

I did. The attached code is a simple program that shows the issue. I also trained identical models on the CPU and GPU simultaneously (loading the CPU model's state into the GPU one), and the maximal difference across all parameters was on the order of 1e-7. After a couple of iterations it suddenly jumped to ~1k, even though the gradients of all the layers were almost identical.
I will try the IOMMU option.
One more question: is there a tool I can use to catch GPU memory change events at the hardware level, so I can inspect when and why the memory is changing?
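For reference, the parallel training I mentioned looked roughly like this (a simplified sketch using the UNetTiny from my first post; the random data, MSE loss, optimizer and shapes are placeholders, not my real training setup):

import torch


def _train_cpu_and_gpu_in_parallel(num_steps: int = 100) -> None:
    # Start both models from identical weights by loading the CPU state dict
    # into the CUDA model.
    cpu_model = UNetTiny(in_channels=1, out_channels=3, num_block_filters=(64, 128, 256))
    gpu_model = UNetTiny(in_channels=1, out_channels=3, num_block_filters=(64, 128, 256)).cuda()
    gpu_model.load_state_dict(cpu_model.state_dict())

    opt_cpu = torch.optim.SGD(cpu_model.parameters(), lr=1e-3)
    opt_gpu = torch.optim.SGD(gpu_model.parameters(), lr=1e-3)

    for step in range(num_steps):
        x = torch.randn(2, 1, 64, 64)        # placeholder batch
        target = torch.randn(2, 3, 64, 64)   # placeholder target

        # Run the identical training step on both devices.
        for model, opt, inp, tgt in ((cpu_model, opt_cpu, x, target),
                                     (gpu_model, opt_gpu, x.cuda(), target.cuda())):
            opt.zero_grad()
            loss = torch.nn.functional.mse_loss(model(inp), tgt)
            loss.backward()
            opt.step()

        # Report how far the parameters have drifted apart.
        max_diff = max((p_cpu.detach() - p_gpu.detach().cpu()).abs().max().item()
                       for p_cpu, p_gpu in zip(cpu_model.parameters(),
                                               gpu_model.parameters()))
        print(f'step {step}: max parameter difference {max_diff:.3e}')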