I’m receiving an error training on CUDA that doesn’t occur when I use a CPU.
First things first, I’m pretty sure it is due to memory. I am running tensors of length 5020 into a linear layer that outputs tensors of length 5000 (I am not sure whether this is too intensive or not).
I’m running on NVIDIA-SMI 450.51.05 Driver Version: 450.51.05 CUDA Version: 11.0
PyTorch version: 1.6.0
I get the following error:
Traceback (most recent call last):
File "main.py", line 457, in <module>
main()
File "main.py", line 373, in main
train_worker(args.gpu, ngpus_per_node, args, timers=Timers)
File "/oak/stanford/groups/satpathy/users/hkitano/AtacWorks/scripts/worker.py", line 222, in train_worker
transform=args.transform)
File "/oak/stanford/groups/satpathy/users/hkitano/AtacWorks/claragenomics/dl4atac/train.py", line 96, in train
pred = model(x, l)
File "/home/users/hkitano/miniconda3/envs/atacworks/lib/python3.6/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/oak/stanford/groups/satpathy/users/hkitano/AtacWorks/claragenomics/dl4atac/models/models.py", line 378, in forward
x = linear_layer(x) #[64, 6000]
File "/home/users/hkitano/miniconda3/envs/atacworks/lib/python3.6/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/oak/stanford/groups/satpathy/users/hkitano/AtacWorks/claragenomics/dl4atac/layers.py", line 154, in forward
x = self.layer(x)
File "/home/users/hkitano/miniconda3/envs/atacworks/lib/python3.6/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/users/hkitano/miniconda3/envs/atacworks/lib/python3.6/site-packages/torch/nn/modules/linear.py", line 91, in forward
return F.linear(input, self.weight, self.bias)
File "/home/users/hkitano/miniconda3/envs/atacworks/lib/python3.6/site-packages/torch/nn/functional.py", line 1674, in linear
ret = torch.addmm(bias, input, weight.t())
RuntimeError: CUDA error: CUBLAS_STATUS_INTERNAL_ERROR when calling `cublasSgemm( handle, opa, opb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)`
If I run it in a separate screen and monitor GPU usage with nvidia-smi -l 1,
I get the following:
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.05 Driver Version: 450.51.05 CUDA Version: 11.0 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 Tesla V100-PCIE... On | 00000000:3F:00.0 Off | 0 |
| N/A 29C P0 25W / 250W | 4MiB / 32510MiB | 0% E. Process |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
Sun Sep 13 17:32:30 2020
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.05 Driver Version: 450.51.05 CUDA Version: 11.0 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 Tesla V100-PCIE... On | 00000000:3F:00.0 Off | 0 |
| N/A 30C P0 39W / 250W | 549MiB / 32510MiB | 4% E. Process |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 379509 C python 545MiB |
+-----------------------------------------------------------------------------+
Sun Sep 13 17:32:31 2020
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.05 Driver Version: 450.51.05 CUDA Version: 11.0 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 Tesla V100-PCIE... On | 00000000:3F:00.0 Off | 0 |
| N/A 30C P0 39W / 250W | 839MiB / 32510MiB | 4% E. Process |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 379509 C python 837MiB |
+-----------------------------------------------------------------------------+
Sun Sep 13 17:32:32 2020
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.05 Driver Version: 450.51.05 CUDA Version: 11.0 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 Tesla V100-PCIE... On | 00000000:3F:00.0 Off | 0 |
| N/A 30C P0 39W / 250W | 1351MiB / 32510MiB | 1% E. Process |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 379509 C python 1347MiB |
+-----------------------------------------------------------------------------+
**GPU 00000000:3F:00.0: Detected Critical Xid Error**
Sun Sep 13 17:32:33 2020
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.05 Driver Version: 450.51.05 CUDA Version: 11.0 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 Tesla V100-PCIE... On | 00000000:3F:00.0 Off | 0 |
| N/A 30C P0 39W / 250W | 0MiB / 32510MiB | 0% E. Process |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
Sun Sep 13 17:32:34 2020
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.05 Driver Version: 450.51.05 CUDA Version: 11.0 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 Tesla V100-PCIE... On | 00000000:3F:00.0 Off | 0 |
| N/A 30C P0 39W / 250W | 0MiB / 32510MiB | 0% E. Process |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
It seems to me like the amount of memory used is not near the limit.
If it helps, here is my Model:
class DenoisingResNetWithLocation(nn.Module):
    """Resnet model with an additional location input.

    The model has four stages, applied in this order by ``forward``:

    1. a stack of residual blocks plus a 1x1 ``ConvAct1d`` regressor
       applied to the signal ``x``;
    2. a stack of ``Linear`` layers applied to the location input ``l``;
    3. a stack of "joint" ``Linear`` layers applied to the concatenation
       of the two results, producing the regression output;
    4. a stack of residual blocks plus a 1x1 ``ConvAct1d`` classifier
       applied to the regression output, producing the classification
       output.

    ``ResBlock``, ``ConvAct1d`` and ``Linear`` are project layers defined
    outside this class.
    """

    def __init__(self, interval_size, in_channels=1, out_channels=15,
                 num_blocks=5,
                 kernel_size=50, dilation=8, bn=False, afunc='relu',
                 num_blocks_class=2,
                 kernel_size_class=50, dilation_class=8,
                 out_channels_class=15,
                 num_layers_location=3, num_layers_joint=3,
                 in_channels_location=20, out_channels_location=20):
        """Initialize the class.

        Args:
            interval_size : length (in bp) of the genomic intervals supplied
                to the model.
            in_channels : Number of input channels.
            out_channels: Number of output channels for all residual blocks
                in the regression part of the model.
            num_blocks: Number of residual blocks in the regression part
                of the model.
            kernel_size: Size of the kernel filter for all
                convolutional layers in the regression part of the model.
            dilation: Dilation parameter for all convolutional layers
                in the regression part of the model.
            bn: Batch normalization.
            afunc: activation function.
            num_blocks_class: Number of residual blocks in the
                classification part of the model.
            kernel_size_class: Size of the kernel filter for all
                convolutional layers in the classification part of the model.
            dilation_class: Dilation parameter for all convolutional layers
                in the classification part of the model.
            out_channels_class: Number of output channels for all residual
                blocks in the classification part of the model
            num_layers_location: Number of Linear layers applied to the
                location input.
            num_layers_joint: Number of Linear layers applied to the
                concatenated location and signal features.
            in_channels_location: Size passed as the input dimension of the
                first location layer.
            out_channels_location: Size passed as the output dimension of
                every location layer.

        """
        # Initialise nn.Module before touching any attribute.
        super(DenoisingResNetWithLocation, self).__init__()
        self.interval_size = interval_size

        self.res_blocks = nn.ModuleList()
        self.location_blocks = nn.ModuleList()
        self.joint_blocks = nn.ModuleList()
        self.res_blocks_class = nn.ModuleList()

        # NOTE: sub-modules are created in the same order as before so that
        # seeded weight initialisation and state_dict key order are unchanged.

        # Residual blocks for regression
        self.res_blocks.append(
            ResBlock(interval_size, in_channels, out_channels, kernel_size,
                     dilation=dilation, bn=bn, afunc=afunc, conv_input=True))
        for _ in range(num_blocks - 1):
            self.res_blocks.append(
                ResBlock(interval_size, out_channels, out_channels,
                         kernel_size,
                         dilation=dilation, bn=bn, afunc=afunc,
                         conv_input=False))

        # Fully connected layers for the location input
        self.location_blocks.append(
            Linear(in_channels_location, out_channels_location, afunc))
        for _ in range(num_layers_location - 1):
            self.location_blocks.append(
                Linear(out_channels_location, out_channels_location, afunc))

        # 1x1 convolution collapsing the regression channels to one
        self.regressor = ConvAct1d(interval_size, in_channels=out_channels,
                                   out_channels=1, kernel_size=1, dilation=1,
                                   bn=bn, afunc=afunc)

        # Fully connected layers over the concatenated [location, signal]
        # features. Only the first one sees the wider concatenated input.
        joint_in = interval_size + out_channels_location
        self.joint_blocks.append(
            Linear(joint_in, interval_size, afunc))
        for _ in range(num_layers_joint - 1):
            self.joint_blocks.append(
                Linear(interval_size, interval_size, afunc))

        # Residual blocks for classification
        self.res_blocks_class.append(
            ResBlock(interval_size, in_channels=1,
                     out_channels=out_channels_class,
                     kernel_size=kernel_size_class,
                     dilation=dilation_class, bn=bn,
                     afunc=afunc, conv_input=True,
                     bias=True))
        for _ in range(num_blocks_class - 1):
            self.res_blocks_class.append(
                ResBlock(interval_size, out_channels_class, out_channels_class,
                         kernel_size_class, dilation=dilation_class, bn=bn,
                         afunc=afunc, conv_input=False, bias=True))

        # The classifier consumes the output of res_blocks_class, so its
        # input width must be out_channels_class (previously out_channels,
        # which only worked while both values were equal).
        self.classifier = ConvAct1d(interval_size,
                                    in_channels=out_channels_class,
                                    out_channels=1, kernel_size=1, dilation=1,
                                    bn=bn, afunc=None, bias=True)

    def forward(self, x, l):
        """Get regression and classification forward propagated output.

        The shape comments below are examples for a batch of 64 and an
        interval_size of 6000 with 15 channels and 20 location features.

        Args:
            x : Input.
            l : Location input, passed through the location layers and
                concatenated with the flattened regression features.

        Return:
            out_reg: Regression output.
            out_cla: Classification output.

        """
        for res_block in self.res_blocks:
            x = res_block(x)  # [64, 15, 6000]
        x = self.regressor(x)  # [64, 1, 6000]

        for linear_layer in self.location_blocks:
            l = linear_layer(l)  # [64, 20]

        # Flatten everything after the batch dimension, then prepend the
        # location features along the feature axis.
        x = x.view(x.size(0), -1)  # [64, 6000]
        x = torch.cat((l, x), dim=1)  # [64, 6020]
        for linear_layer in self.joint_blocks:
            x = linear_layer(x)  # [64, 6000]
        out_reg = x  # [64, 6000]

        # Re-insert a channel axis for the classification residual blocks.
        x = x.unsqueeze(1)  # [64, 1, 6000]
        for res_block in self.res_blocks_class:
            x = res_block(x)  # [64, 15, 6000]
        out_cla = torch.sigmoid(self.classifier(x).squeeze(1))  # [64, 6000]
        return out_reg, out_cla
Any thoughts? Thank you!