Yes, in this case the difference is quite large. Deleting the dictionary does not change the memory footprint, unfortunately.
The VAE code is a bit more complex. It consists of a Conv2d encoder stack, a few fully connected (FC) layers, and a transposed-convolution (deconv) decoder stack. Maybe at some point it would be easier to look at the NLP problem, as it is more accessible.
import numpy as np
import torch.nn as nn


class VAE2(nn.Module):
    '''
    Variational Autoencoder
    '''
    def __init__(self, img_channels, img_dim, latent_dim, filters,
                 kernel_sizes, strides, activation=nn.LeakyReLU,
                 out_activation=nn.Tanh, batch_norm=True, no_samples=10,
                 sp_activation=None, public_stream=None, private_stream=None):
        '''
        img_channels (int): number of channels of the images (e.g. 1 for
            grayscale, 3 for color images).
        img_dim (int): number of pixels on each row / column of the images
            (assumes the images are square).
        latent_dim (int): dimension of the latent space.
        filters (list of length n_conv): number of filters for each conv.
            layer.
        kernel_sizes (list of length n_conv): kernel size for each conv.
            layer.
        strides (list of length n_conv): strides for each conv. layer.
        activation (subclass of nn.Module): activation used in all layers
            (default: LeakyReLU).
        out_activation (subclass of nn.Module): activation used in the
            output layer (default: Tanh).
        batch_norm (boolean): if True, batch normalization is applied in
            every layer before the activation (default: True).
        no_samples (int): number of samples to draw from the latent
            distribution (default: 10).
        sp_activation: activation passed to the SpLinear bottleneck layer
            (default: None).
        public_stream / private_stream: layer specifications converted by
            stream2dict and wrapped in nn.Sequential to build the public and
            private FC heads.
        '''
        super(VAE2, self).__init__()
        self.img_dim = img_dim
        self.img_channels = img_channels
        self.latent_dim = latent_dim
        self.filters = filters
        self.kernel_sizes = kernel_sizes
        self.strides = strides
        self.activation = activation
        self.out_activation = out_activation
        self.batch_norm = batch_norm
        self.no_samples = no_samples
        n_conv = len(self.filters)
        # compute the paddings and the flattened dimension at the output of
        # the last conv. layer
        paddings = []
        dims = [self.img_dim]
        for i in range(n_conv):
            if (dims[i] - self.kernel_sizes[i]) % strides[i] == 0:
                paddings.append((self.kernel_sizes[i] - 1)//2)
            else:
                paddings.append((self.kernel_sizes[i] - strides[i] + 1)//2)
            dims.append((dims[i] + 2*paddings[i] - self.kernel_sizes[i])
                        // self.strides[i] + 1)
        flat_dim = self.filters[-1] * (dims[-1]**2)
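        # Illustrative example (values assumed, not from the original post):
        # with img_dim=28, kernel_size=3, stride=2, (28 - 3) % 2 != 0, so
        # padding = (3 - 2 + 1)//2 = 1 and the next dim is
        # (28 + 2*1 - 3)//2 + 1 = 14; a second identical layer gives
        # 14 -> 7, and with filters[-1] = 64, flat_dim = 64 * 7**2 = 3136.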
        self.encoder = Encoder(self.img_channels, self.img_dim,
                               self.latent_dim, self.filters,
                               self.kernel_sizes, self.strides,
                               paddings, flat_dim,
                               activation=self.activation,
                               batch_norm=self.batch_norm)
        # the decoder architecture will be the transpose of the encoder's
        filters_dec = (list(reversed(self.filters[0:n_conv-1]))
                       + [img_channels])
        kernel_sizes_dec = list(reversed(self.kernel_sizes))
        strides_dec = list(reversed(self.strides))
        paddings = list(reversed(paddings))
        dims = list(reversed(dims))
        # compute the output paddings for the transposed convolutions
        out_paddings = []
        for i in range(n_conv):
            out_dim = ((dims[i] - 1)*strides_dec[i] - 2*paddings[i]
                       + kernel_sizes_dec[i])
            out_paddings.append(dims[i+1] - out_dim)
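        # Continuing the illustrative example: to map 7 -> 14 with stride 2,
        # kernel 3 and padding 1, a ConvTranspose2d produces
        # out_dim = (7 - 1)*2 - 2*1 + 3 = 13, so output_padding = 14 - 13 = 1
        # recovers the encoder's input size exactly.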
        self.decoder = Decoder(self.latent_dim, self.filters[-1], dims[0],
                               filters_dec, kernel_sizes_dec, strides_dec,
                               paddings=paddings, out_paddings=out_paddings,
                               activation=self.activation,
                               out_activation=self.out_activation,
                               batch_norm=self.batch_norm)
        self.bottleneck_fc = SpLinear(in_features=flat_dim,
                                      out_features=int(np.ceil(flat_dim/4)),
                                      bias=True, activation=sp_activation)
        # .............
        self.fc_model_public = nn.Sequential(stream2dict(public_stream))
        self.fc_model_private = nn.Sequential(stream2dict(private_stream))
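In case it helps, here is a minimal instantiation sketch. It assumes Encoder, Decoder, SpLinear, and stream2dict are defined elsewhere in the codebase, and that stream2dict accepts a list of (name, module) pairs; all concrete values (28x28 grayscale input, two conv layers, the stream contents) are made up for illustration:

import torch.nn as nn

# Hypothetical streams in the format stream2dict is assumed to accept;
# adjust to whatever your actual stream2dict expects.
public_stream = [('fc1', nn.Linear(784, 128)), ('act1', nn.LeakyReLU())]
private_stream = [('fc1', nn.Linear(784, 128)), ('act1', nn.LeakyReLU())]

vae = VAE2(img_channels=1, img_dim=28, latent_dim=16,
           filters=[32, 64], kernel_sizes=[3, 3], strides=[2, 2],
           no_samples=10, sp_activation=nn.ReLU(),
           public_stream=public_stream, private_stream=private_stream)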