What does the backward() function do?

hi @ptrblck , thank you for your time. The code now works without “retain_graph = True” flag after I moved a variable declaration inside the training loop, which I was using in training batches. But it did not resolve the high training time issue and the difference in number of trainable params. I have created two sample codes with a random data. Can you please help me with - 1) Why the number of trainable params is just half of the TF code. 2) The training takes more than double the time of the original TF code.

PYTORCH:

import torch
import torch.nn as nn
from torchvision.utils import save_image
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import argparse
import time
import matplotlib.pyplot as plt
import math
from scipy import stats
import scipy
import os
import datetime
from math import sqrt
from math import log
from torch import optim
from torch.autograd import Variable
from math import sqrt
from math import log



# from tensorflow import keras as K

# dim_red = 1  # perform PCA on the codes and plot the first two components
# plot_on = 1  # plot the results, otherwise only textual output is returned
# interp_on = 0  # interpolate data (needed if the input time series have different length)
# tied_weights = 0  # train an AE where the decoder weights are the econder weights transposed
# lin_dec = 1  # train an AE with linear activations in the decoder

# parse input data
parser = argparse.ArgumentParser()
parser.add_argument("--code_size", default=20, help="size of the code", type=int)
parser.add_argument("--w_reg", default=0.001, help="weight of the regularization in the loss function", type=float)
parser.add_argument("--a_reg", default=0.2, help="weight of the kernel alignment", type=float)
parser.add_argument("--num_epochs", default=5000, help="number of epochs in training", type=int)
parser.add_argument("--batch_size", default=25, help="number of samples in each batch", type=int)
parser.add_argument("--max_gradient_norm", default=1.0, help="max gradient norm for gradient clipping", type=float)
parser.add_argument("--learning_rate", default=0.001, help="Adam initial learning rate", type=float)
parser.add_argument("--hidden_size", default=30, help="size of the code", type=int)
args = parser.parse_args()
print(args)

# ================= DATASET =================
# (train_data, train_labels, train_len, _, K_tr,
#  valid_data, _, valid_len, _, K_vs,
#  test_data_orig, test_labels, test_len, _, K_ts) = getBlood(kernel='TCK',
#                                                             inp='zero')  # data shape is [T, N, V] = [time_steps, num_elements, num_var]

train_data = np.random.rand(9000,6)
train_labels = np.ones([9000,1])
train_len = 9000

valid_data = np.random.rand(9000,6)
valid_len = 9000

test_data = np.random.rand(1500,6)
test_labels = np.ones([1500,1])

K_tr = np.random.rand(9000,9000)
K_ts = np.random.rand(1500,1500)
K_vs =  np.random.rand(9000,9000)

#test_data = test_data_orig


print(
    '\n**** Processing Blood data: Tr{}, Vs{}, Ts{} ****\n'.format(train_data.shape, valid_data.shape, test_data.shape))

input_length = train_data.shape[1]  # same for all inputs

# ================= GRAPH =================

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

encoder_inputs = train_data
prior_k = K_tr

# ============= TENSORBOARD =============
writer = SummaryWriter()

# # ----- ENCODER -----

input_length = encoder_inputs.shape[1]
print ("INPUT ")

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()

        self.We1 = torch.nn.Parameter(torch.Tensor(input_length, args.hidden_size).uniform_(-1.0 / math.sqrt(input_length), 1.0 / math.sqrt(input_length)))
        self.We2 = torch.nn.Parameter(torch.Tensor(args.hidden_size, args.code_size).uniform_(-1.0 / math.sqrt(args.hidden_size), 1.0 / math.sqrt(args.hidden_size)))

        self.be1 = torch.nn.Parameter(torch.zeros([args.hidden_size]))
        self.be2 = torch.nn.Parameter(torch.zeros([args.code_size]))


    def encoder(self, encoder_inputs):
        hidden_1 = torch.tanh(torch.matmul(encoder_inputs.float(), self.We1) + self.be1)
        code = torch.tanh(torch.matmul(hidden_1, self.We2) + self.be2)
        #print ("CODE ENCODER SHAPE:", code.size())
        return code

    def decoder(self,encoder_inputs):
        code = self.encoder(encoder_inputs)

        # ----- DECODER -----
        # if tied_weights:
        #
        #     Wd1 = torch.transpose(We2)
        #     Wd2 = torch.transpose(We1)
        #
        # else:

        Wd1 = torch.nn.Parameter(
            torch.Tensor(args.code_size, args.hidden_size).uniform_(-1.0 / math.sqrt(args.code_size),
                                                                       1.0 / math.sqrt(args.code_size)))
        Wd2 = torch.nn.Parameter(
            torch.Tensor(args.hidden_size, input_length).uniform_(-1.0 / math.sqrt(args.hidden_size),
                                                                         1.0 / math.sqrt(args.hidden_size)))

        bd1 = torch.nn.Parameter(torch.zeros([args.hidden_size]))
        bd2 = torch.nn.Parameter(torch.zeros([input_length]))


        #if lin_dec:
        hidden_2 = torch.matmul(code, Wd1) + bd1
        #else:
        #hidden_2 = torch.tanh(torch.matmul(code, Wd1) + bd1)

        #print("hidden SHAPE:", hidden_2.size())
        dec_out = torch.matmul(hidden_2, Wd2) + bd2

        return  dec_out

    def kernel_loss(self,code, prior_K):
        # kernel on codes
        code_K = torch.mm(code, torch.t(code))

        # ----- LOSS -----
        # kernel alignment loss with normalized Frobenius norm
        code_K_norm = code_K / torch.linalg.matrix_norm(code_K, ord='fro', dim=(- 2, - 1))
        prior_K_norm = prior_K / torch.linalg.matrix_norm(prior_K, ord='fro', dim=(- 2, - 1))
        k_loss = torch.linalg.matrix_norm(torch.sub(code_K_norm,prior_K_norm), ord='fro', dim=(- 2, - 1))
        return k_loss


# Initialize model
model = Model()

# trainable parameters count
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('Total parameters: {}'.format(total_params))

#Optimizer
optimizer = torch.optim.Adam(model.parameters(),args.learning_rate)

# ================= TRAINING =================

# initialize training variables
time_tr_start = time.time()
batch_size = args.batch_size
max_batches = train_data.shape[0] // batch_size
loss_track = []
kloss_track = []
min_vs_loss = np.infty
model_dir = "logs/dkae_models/m_0.ckpt"

logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

###############################################################################
# Training code
###############################################################################

try:
    for ep in range(args.num_epochs):

        # shuffle training data
        idx = np.random.permutation(train_data.shape[0])
        train_data_s = train_data[idx, :]
        K_tr_s = K_tr[idx, :][:, idx]


        for batch in range(max_batches):
            fdtr = {}
            fdtr["encoder_inputs"] = train_data_s[(batch) * batch_size:(batch + 1) * batch_size, :]
            fdtr["prior_K"] =  K_tr_s[(batch) * batch_size:(batch + 1) * batch_size,
                             (batch) * batch_size:(batch + 1) * batch_size]

            encoder_inputs = (fdtr["encoder_inputs"].astype(float))
            encoder_inputs = torch.from_numpy(encoder_inputs)
            #print("TYPE ENCODER_INP IN TRAIN:", type(encoder_inputs))

            prior_K = (fdtr["prior_K"].astype(float))
            prior_K = torch.from_numpy(prior_K)

            dec_out = model.decoder(encoder_inputs)

            #print("DEC OUT TRAIN:", dec_out)


            reconstruct_loss = torch.mean((dec_out - encoder_inputs) ** 2)
            reconstruct_loss = reconstruct_loss.float()
            #print("RECONS LOSS TRAIN:", reconstruct_loss)

            enc_out = model.encoder(encoder_inputs)
            k_loss = model.kernel_loss(enc_out,prior_K)
            k_loss = k_loss.float()
            #print ("K_LOSS TRAIN:", k_loss)


            #print ("ENTRPY LOSS:", entrpy_loss)

            # Regularization L2 loss
            reg_loss = 0

            parameters = torch.nn.utils.parameters_to_vector(model.parameters())
            # print ("PARAMS:", (parameters))
            for tf_var in parameters:
                reg_loss += torch.mean(torch.linalg.norm(tf_var))

            tot_loss = reconstruct_loss + args.w_reg * reg_loss + args.a_reg * k_loss
            tot_loss = tot_loss.float()

            # Backpropagation
            optimizer.zero_grad()
            #tot_loss.backward(retain_graph=True)
            tot_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_gradient_norm)
            optimizer.step()

            #tot_loss = tot_loss.detach()

            loss_track.append(reconstruct_loss)
            kloss_track.append(k_loss)

        #check training progress on the validations set (in blood data valid=train)
        if ep % 100 == 0:
            print('Ep: {}'.format(ep))

            # fdvs = {"encoder_inputs": valid_data,
            #         "prior_K": K_vs}

            fdvs = {}
            fdvs["encoder_inputs"] = valid_data
            fdvs["prior_K"] = K_vs


            #dec_out_val, lossvs, klossvs, vs_code_K, summary = sess.run(
             #   [dec_out, reconstruct_loss, k_loss, code_K, merged_summary], fdvs)

            encoder_inp = (fdvs["encoder_inputs"].astype(float))
            encoder_inp = torch.from_numpy(encoder_inp)

            prior_K_vs = (fdvs["prior_K"].astype(float))
            prior_K_vs = torch.from_numpy(prior_K_vs)

            enc_out_vs = model.encoder(encoder_inp)


            dec_out_val = model.decoder(encoder_inp)
            #print ("DEC OUT VAL:", dec_out_val)


            reconstruct_loss_val = torch.mean((dec_out_val - encoder_inp) ** 2)
            #print("RECONS LOSS VAL:", reconstruct_loss)

            k_loss_val = model.kernel_loss(enc_out_vs,prior_K_vs)
            #print("K_LOSS VAL:", k_loss_val)


            writer.add_scalar("reconstruct_loss", reconstruct_loss_val, ep)
            writer.add_scalar("k_loss", k_loss_val, ep)
            #writer.add_scalar("tot_loss", tot_loss, ep)


            print('VS r_loss=%.3f, k_loss=%.3f -- TR r_loss=%.3f, k_loss=%.3f' % (
            reconstruct_loss_val, k_loss_val, torch.mean(torch.stack(loss_track[-100:])), torch.mean(torch.stack(kloss_track[-100:]))))
            #reconstruct_loss_val, k_loss_val, np.mean(loss_track[-100:].detach().numpy()), np.mean(kloss_track[-100:].detach().numpy())))


            # Save model yielding best results on validation
            if reconstruct_loss_val < min_vs_loss:
                min_vs_loss = reconstruct_loss_val
                torch.save(model, model_dir)
                torch.save(model.state_dict(), 'logs/dkae_models/best-model-parameters.pt')

                #save_path = saver.save(sess, model_name)

except KeyboardInterrupt:
    print('training interrupted')

time_tr_end = time.time()
print('Tot training time: {}'.format((time_tr_end - time_tr_start) // 60))
writer.close()

TF code:

import tensorflow as tf
import tensorflow.compat.v1 as tf
tf.compat.v1.enable_eager_execution()
tf.disable_v2_behavior()
import argparse
import time
import numpy as np
import matplotlib.pyplot as plt
import math
from scipy import stats
import scipy
import os
import datetime
from math import sqrt
from math import log
import numpy as np
import tensorflow_probability as tfp

dim_red = 1  # perform PCA on the codes and plot the first two components
plot_on = 1  # plot the results, otherwise only textual output is returned
interp_on = 0  # interpolate data (needed if the input time series have different length)
tied_weights = 0  # train an AE where the decoder weights are the econder weights transposed
lin_dec = 1  # train an AE with linear activations in the decoder

# parse input data
parser = argparse.ArgumentParser()
parser.add_argument("--code_size", default=20, help="size of the code", type=int)
parser.add_argument("--w_reg", default=0.001, help="weight of the regularization in the loss function", type=float)
parser.add_argument("--a_reg", default=0.2, help="weight of the kernel alignment", type=float)
parser.add_argument("--num_epochs", default=5000, help="number of epochs in training", type=int)
parser.add_argument("--batch_size", default=25, help="number of samples in each batch", type=int)
parser.add_argument("--max_gradient_norm", default=1.0, help="max gradient norm for gradient clipping", type=float)
parser.add_argument("--learning_rate", default=0.001, help="Adam initial learning rate", type=float)
parser.add_argument("--hidden_size", default=30, help="size of the code", type=int)
args = parser.parse_args()
print(args)

# ================= DATASET =================
# (train_data, train_labels, train_len, _, K_tr,
#  valid_data, _, valid_len, _, K_vs,
#  test_data_orig, test_labels, test_len, _, K_ts) = getBlood(kernel='TCK',
#                                                             inp='zero')  # data shape is [T,T N, V] = [time_steps, num_elements, num_var]

train_data = np.random.rand(9000,6)
train_labels = np.ones([9000,1])
train_len = 9000

valid_data = np.random.rand(9000,6)
valid_len = 9000

test_data = np.random.rand(1500,6)
test_labels = np.ones([1500,1])

K_tr = np.random.rand(9000,9000)
K_ts = np.random.rand(1500,1500)
K_vs =  np.random.rand(9000,9000)

print(
    '\n**** Processing Blood data: Tr{}, Vs{}, Ts{} ****\n'.format(train_data.shape, valid_data.shape, test_data.shape))

input_length = train_data.shape[1]  # same for all inputs

# ================= GRAPH =================

# init session
# tf.reset_default_graph() # needed when working with iPython
sess = tf.Session()


# placeholders
encoder_inputs = tf.placeholder(shape=(None, input_length), dtype=tf.float32, name='encoder_inputs')
prior_K = tf.placeholder(shape=(None, None), dtype=tf.float32, name='prior_K')

# ----- ENCODER -----
We1 = tf.Variable(
    tf.random_uniform((input_length, args.hidden_size), -1.0 / math.sqrt(input_length), 1.0 / math.sqrt(input_length)))
We2 = tf.Variable(tf.random_uniform((args.hidden_size, args.code_size), -1.0 / math.sqrt(args.hidden_size),
                                    1.0 / math.sqrt(args.hidden_size)))

be1 = tf.Variable(tf.zeros([args.hidden_size]))
be2 = tf.Variable(tf.zeros([args.code_size]))


hidden_1 = tf.nn.tanh(tf.matmul(encoder_inputs, We1) + be1)
code = tf.nn.tanh(tf.matmul(hidden_1, We2) + be2)

# kernel on codes
code_K = tf.tensordot(code, tf.transpose(code), axes=1)

print("CODE K:", code_K)

print("Shape prior_K:", (tf.shape(prior_K)))
print("Code_k shape:", tf.shape(code_K))


# ----- DECODER -----
if tied_weights:
    Wd1 = tf.transpose(We2)
    Wd2 = tf.transpose(We1)
else:
    Wd1 = tf.Variable(tf.random_uniform((args.code_size, args.hidden_size), -1.0 / math.sqrt(args.code_size),
                                        1.0 / math.sqrt(args.code_size)))
    Wd2 = tf.Variable(tf.random_uniform((args.hidden_size, input_length), -1.0 / math.sqrt(args.hidden_size),
                                        1.0 / math.sqrt(args.hidden_size)))

bd1 = tf.Variable(tf.zeros([args.hidden_size]))
bd2 = tf.Variable(tf.zeros([input_length]))



if lin_dec:
    hidden_2 = tf.matmul(code, Wd1) + bd1
else:
    hidden_2 = tf.nn.tanh(tf.matmul(code, Wd1) + bd1)

dec_out = tf.matmul(hidden_2, Wd2) + bd2

# ----- LOSS -----
# kernel alignment loss with normalized Frobenius norm
code_K_norm = code_K / tf.norm(code_K, ord='fro', axis=[-2, -1])
prior_K_norm = prior_K / tf.norm(prior_K, ord='fro', axis=[-2, -1])
k_loss = tf.norm(code_K_norm - prior_K_norm, ord='fro', axis=[-2,-1])

# reconstruction loss
parameters = tf.trainable_variables()
print ("PARAMS:", (parameters))
optimizer = tf.train.AdamOptimizer(args.learning_rate)
reconstruct_loss = tf.losses.mean_squared_error(labels=dec_out, predictions=encoder_inputs)


# L2 loss
reg_loss = 0
for tf_var in tf.trainable_variables():
    reg_loss += tf.reduce_mean(tf.nn.l2_loss(tf_var))

print ("REG_LOSS:", reg_loss)
tot_loss = reconstruct_loss + args.w_reg * reg_loss + args.a_reg * k_loss

# Calculate and clip gradients
print ("TOT LOSS:", tot_loss)

gradients = tf.gradients(tot_loss,parameters)
print ("GRADS:", gradients)
clipped_gradients, _ = tf.clip_by_global_norm(gradients, args.max_gradient_norm)
update_step = optimizer.apply_gradients(zip(clipped_gradients, parameters))

sess.run(tf.global_variables_initializer())


# trainable parameters count
total_parameters = 0
for variable in tf.trainable_variables():
    shape = variable.get_shape()
    variable_parametes = 1
    for dim in shape:
        variable_parametes *= dim.value
    total_parameters += variable_parametes
print('Total parameters: {}'.format(total_parameters))

# ============= TENSORBOARD =============
mean_grads = tf.reduce_mean([tf.reduce_mean(grad) for grad in gradients])
tf.summary.scalar('mean_grads', mean_grads)
tf.summary.scalar('reconstruct_loss', reconstruct_loss)
tf.summary.scalar('k_loss', k_loss)
tvars = tf.trainable_variables()
for tvar in tvars:
    tf.summary.histogram(tvar.name.replace(':', '_'), tvar)
merged_summary = tf.summary.merge_all()

# ================= TRAINING =================

# initialize training variables
time_tr_start = time.time()
batch_size = args.batch_size
max_batches = train_data.shape[0] // batch_size
loss_track = []
kloss_track = []
min_vs_loss = np.infty
model_name = "logs/dkae_models/m_0.ckpt"
#train_writer = tf.summary.FileWriter('/logs/tensorboard', graph=sess.graph)
saver = tf.train.Saver()


logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
train_writer = tf.compat.v1.summary.FileWriter(
    logdir,
    graph=None,
    max_queue=10,
    flush_secs=120,
    graph_def=None,
    filename_suffix=None,
    session=None
)

try:
    for ep in range(args.num_epochs):

        # shuffle training data
        idx = np.random.permutation(train_data.shape[0])
        train_data_s = train_data[idx, :]
        K_tr_s = K_tr[idx, :][:, idx]

        for batch in range(max_batches):
            fdtr = {encoder_inputs: train_data_s[(batch) * batch_size:(batch + 1) * batch_size, :],
                    prior_K: K_tr_s[(batch) * batch_size:(batch + 1) * batch_size,
                             (batch) * batch_size:(batch + 1) * batch_size]
                    }


            _, train_loss, train_kloss = sess.run([update_step, reconstruct_loss, k_loss], fdtr)

            loss_track.append(train_loss)
            kloss_track.append(train_kloss)

        # check training progress on the validations set (in blood data valid=train)
        if ep % 100 == 0:
            print('Ep: {}'.format(ep))

            fdvs = {encoder_inputs: valid_data,
                    prior_K: K_vs}
            outvs, lossvs, klossvs, vs_code_K, summary = sess.run(
                [dec_out, reconstruct_loss, k_loss, code_K, merged_summary], fdvs)
            train_writer.add_summary(summary, ep)
            print('VS r_loss=%.3f, k_loss=%.3f -- TR r_loss=%.3f, k_loss=%.3f' % (
            lossvs, klossvs, np.mean(loss_track[-100:]), np.mean(kloss_track[-100:])))

            # Save model yielding best results on validation
            if lossvs < min_vs_loss:
                min_vs_loss = lossvs
                tf.add_to_collection("encoder_inputs", encoder_inputs)
                tf.add_to_collection("dec_out", dec_out)
                tf.add_to_collection("reconstruct_loss", reconstruct_loss)
                save_path = saver.save(sess, model_name)

except KeyboardInterrupt:
    print('training interrupted')

time_tr_end = time.time()
print('Tot training time: {}'.format((time_tr_end - time_tr_start) // 60))

sess.close()

The code can be run as

!python3 filename.py --code_size 5 --w_reg 0.001 --a_reg 0.1 --num_epochs 200 --max_gradient_norm 0.5 --learning_rate 0.001 --hidden_size 30 

Appreciate your help here.