I’m seeing this too. Training works with batch sizes of 192, 128 and 32, but fails with anything around 45-64.

I’m training a convolutional autoencoder on a GeForce RTX 2080 Ti with CUDA 10.0 and NVIDIA driver version 415.27. The GPU has 11 GB of memory, and when training does work, only ~3 GB are needed.

In case it’s relevant, I’m using an Adam optimizer and MSE loss with the following architecture:

```
class ConvAE(nn.Module):
    """Convolutional autoencoder built from four conv and four transposed-conv layers.

    The encoder applies ``c1``..``c4`` with leaky-ReLU activations and
    2x2 max pooling after ``c2`` and ``c3``.  The decoder applies
    ``c4_t``..``c1_t``; around every transposed convolution the feature map
    is resized with ``F.interpolate`` (default mode) to hard-coded sizes.

    Because the last decoder step resizes to ``(128, 256)``, the output's
    spatial size is always 128x256 regardless of the input's spatial size.
    """

    def __init__(self):
        super().__init__()
        # Encoder layers (1 input channel -> 128 channels).
        self.c1 = nn.Conv2d(1, 32, kernel_size=(3, 5), padding=(1, 2), stride=(2, 2))
        self.c2 = nn.Conv2d(32, 32, kernel_size=(3, 5), padding=(1, 2), stride=(2, 2))
        self.c3 = nn.Conv2d(32, 64, kernel_size=(4, 2), padding=(2, 1), stride=(2, 1))
        self.c4 = nn.Conv2d(64, 128, kernel_size=(4, 16), padding=(0, 0), stride=(4, 16))
        # Decoder layers (128 channels -> 1 output channel).
        self.c4_t = nn.ConvTranspose2d(128, 64, kernel_size=(16, 4), padding=(3, 2), stride=(16, 4))
        self.c3_t = nn.ConvTranspose2d(64, 32, kernel_size=(2, 4), padding=(1, 2), stride=(1, 2))
        self.c2_t = nn.ConvTranspose2d(32, 32, kernel_size=(5, 3), padding=(2, 2), stride=(2, 2))
        self.c1_t = nn.ConvTranspose2d(32, 1, kernel_size=(5, 3), padding=(2, 2), stride=(2, 2))

    def _encode_tensor(self, x):
        """Run the encoder half and return the 128-channel latent feature map."""
        x = F.leaky_relu(self.c1(x))
        # Pooling is applied to the raw conv output, before the activation.
        x = F.leaky_relu(F.max_pool2d(self.c2(x), (2, 2), stride=(2, 2)))
        x = F.leaky_relu(F.max_pool2d(self.c3(x), (2, 2), stride=(2, 2)))
        x = F.leaky_relu(self.c4(x))
        return x

    def _decode_tensor(self, x):
        """Run the decoder half; the result is always resized to (128, 256)."""
        # It's necessary to interpolate a bit first, since the 1x8 isn't
        # enough for the first kernel.
        # Interpolation is also applied after each ConvTranspose, to make
        # dimensions match up.
        x = F.leaky_relu(F.interpolate(self.c4_t(F.interpolate(x, (5, 9))), (4, 16)))
        x = F.leaky_relu(F.interpolate(self.c3_t(F.interpolate(x, (17, 17))), (16, 32)))
        x = F.leaky_relu(F.interpolate(self.c2_t(F.interpolate(x, (32, 59))), (64, 128)))
        # No activation on the final reconstruction layer.
        x = F.interpolate(self.c1_t(F.interpolate(x, (63, 130))), (128, 256))
        return x

    def forward(self, x):
        """Encode then decode ``x`` and return the reconstruction.

        The original author's note says x is a tensor of size 1x128x646.
        NOTE(review): ``F.interpolate`` is called with 2-element sizes, which
        implies a 4-D (batch, channel, height, width) tensor -- confirm that
        callers add the batch dimension.
        """
        return self._decode_tensor(self._encode_tensor(x))
```