Error: One of the variables needed for gradient computation has been modified by an inplace operation

I got the following error while running the code below.
Environment used for the code:
Python 3.6
PyTorch 0.4, CUDA 9.0

Traceback (most recent call last):
  File "main.py", line 20, in <module>
    t.train()
  File "E:\Deep_learning\TSAN\TSAN_patternnet_test\Train\trainer.py", line 72, in train
    loss.backward()
  File "C:\Users\SUMIT\anaconda3\envs\TSAN\lib\site-packages\torch\tensor.py", line 93, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "C:\Users\SUMIT\anaconda3\envs\TSAN\lib\site-packages\torch\autograd\__init__.py", line 89, in backward
    allow_unreachable=True)  # allow_unreachable flag
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation
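
From what I understand, this means autograd saved some tensor for the backward pass and that tensor was then overwritten in place before loss.backward() ran. To locate the operation responsible, I have seen autograd's anomaly detection suggested; as far as I know it is only available in newer PyTorch releases (I am not sure it exists in 0.4), so this is just a sketch of how I would wrap the existing forward/backward inside train():

import torch

# Sketch: run the forward/backward under anomaly detection so the backward
# error also prints a traceback of the forward op whose saved tensor was
# modified in place. (detect_anomaly may require a newer PyTorch than 0.4.)
with torch.autograd.detect_anomaly():
    sr1, sr2 = self.model(lr, idx_scale)
    loss = self.loss(sr1, hr) + self.loss(sr2, hr)
    loss.backward()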

The code is given below:
main.py:

import torch

import utility
import data
import model
import loss
from option import args
from trainer import Trainer

torch.manual_seed(args.seed)
checkpoint = utility.checkpoint(args)

if __name__ == '__main__':
    if checkpoint.ok:
        loader = data.Data(args)
        model = model.Model(args, checkpoint)
        loss = loss.Loss(args, checkpoint) if not args.test_only else None
        t = Trainer(args, loader, model, loss, checkpoint)
        while not t.terminate():
            t.train()
            t.test()

        checkpoint.done()

trainer.py:

import os
import math
from decimal import Decimal

import utility
from imresize import *
import torch
from torch.autograd import Variable
from tqdm import tqdm
from skimage.io import imsave, imread

class Trainer():
    def __init__(self, args, loader, my_model, my_loss, ckp):
        self.args = args
        self.scale = args.scale

        self.ckp = ckp
        self.loader_train = loader.loader_train
        self.loader_test = loader.loader_test
        self.model = my_model
        self.loss = my_loss
        self.optimizer = utility.make_optimizer(args, self.model)
        self.scheduler = utility.make_scheduler(args, self.optimizer)

        if self.args.load != '.':
            self.optimizer.load_state_dict(
                torch.load(os.path.join(ckp.dir, 'optimizer.pt'))
            )
            for _ in range(len(ckp.log)): self.scheduler.step()

        self.error_last = 1e8

    def train(self):
        self.scheduler.step()
        self.loss.step()
        epoch = self.scheduler.last_epoch + 1
        lr = self.scheduler.get_lr()[0]

        self.ckp.write_log(
            '[Epoch {}]\tLearning rate: {:.2e}'.format(epoch, Decimal(lr))
        )
        self.loss.start_log()
        self.model.train()

        timer_data, timer_model = utility.timer(), utility.timer()
        for batch, (lr, hr, _, idx_scale) in enumerate(self.loader_train):
            lr, hr = self.prepare([lr, hr])
            timer_data.hold()
            timer_model.tic()

            self.optimizer.zero_grad()
            sr1, sr2 = self.model(lr, idx_scale)
            loss1 = self.loss(sr1, hr)
            loss2 = self.loss(sr2, hr)

            # bic loss
            # hrbic = torch.zeros_like(hr)
            # srbic = torch.zeros_like(hr)
            # for i in range(len(hr.shape[0])):
            #     hr_bic = hr[i,:,:,:].squeeze(0).transpose(0,2).transpose(0,1)
            #     sr_bic = sr2[i,:,:,:].squeeze(0).transpose(0,2).transpose(0,1)
            #     hr_bic = hr_bic.cpu().numpy()
            #     sr_bic = sr_bic.cpu().numpy()  
            #     hr_bic = imresize(hr_bic, output_shape=(hr.shape[2]//self.scale,hr.shape[3]//self.scale))
            #     sr_bic = imresize(sr_bic, output_shape=(hr.shape[2]//self.scale,hr.shape[3]//self.scale))
            #     hrbic[i,:,:,:] = torch.from_numpy(hr_bic).unsqueeze(0).transpose(1,3).transpose(2,3)
            #     srbic[i,:,:,:] = torch.from_numpy(sr_bic).unsqueeze(0).transpose(1,3).transpose(2,3)
            # loss3 = self.loss(hrbic.cuda(), srbic.cuda())
            loss = loss1 + loss2
            # loss = loss + loss3
            if loss.item() < self.args.skip_threshold * self.error_last:
                loss.backward()
                self.optimizer.step()
            else:
                print('Skip this batch {}! (Loss: {})'.format(
                    batch + 1, loss.item()
                ))

            timer_model.hold()

            if (batch + 1) % self.args.print_every == 0:
                self.ckp.write_log('[{}/{}]\t{}\t{:.1f}+{:.1f}s'.format(
                    (batch + 1) * self.args.batch_size,
                    len(self.loader_train.dataset),
                    self.loss.display_loss(batch),
                    timer_model.release(),
                    timer_data.release()))

            timer_data.tic()

        self.loss.end_log(len(self.loader_train))
        self.error_last = self.loss.log[-1, -1]

    def test(self):
        epoch = self.scheduler.last_epoch + 1
        self.ckp.write_log('\nEvaluation:')
        self.ckp.add_log(torch.zeros(1, len(self.scale)))
        self.model.eval()

        timer_test = utility.timer()
        with torch.no_grad():
            for idx_scale, scale in enumerate(self.scale):
                eval_acc = 0
                self.loader_test.dataset.set_scale(idx_scale)
                tqdm_test = tqdm(self.loader_test, ncols=80)
                for idx_img, (lr, hr, filename, _) in enumerate(tqdm_test):
                    filename = filename[0]
                    no_eval = (hr.nelement() == 1)
                    if not no_eval:
                        lr, hr = self.prepare([lr, hr])
                    else:
                        lr = self.prepare([lr])[0]

                    sr = self.model(lr, idx_scale)
                    sr = utility.quantize(sr, self.args.rgb_range)

                    save_list = [sr]
                    if not no_eval:
                        eval_acc += utility.calc_psnr(
                            sr, hr, scale, self.args.rgb_range,
                            benchmark=self.loader_test.dataset.benchmark
                        )
                        save_list.extend([lr, hr])

                    if self.args.save_results:
                        #self.ckp.save_results(filename, save_list, scale)
                        self.ckp.save_results_nopostfix(filename, save_list, scale)

                self.ckp.log[-1, idx_scale] = eval_acc / len(self.loader_test)
                best = self.ckp.log.max(0)
                self.ckp.write_log(
                    '[{} x{}]\tPSNR: {:.3f} (Best: {:.3f} @epoch {})'.format(
                        self.args.data_test,
                        scale,
                        self.ckp.log[-1, idx_scale],
                        best[0][idx_scale],
                        best[1][idx_scale] + 1
                    )
                )

        self.ckp.write_log(
            'Total time: {:.2f}s, ave time: {:.2f}s\n'.format(timer_test.toc(), timer_test.toc()/len(self.loader_test)), refresh=True
        )
        if not self.args.test_only:
            self.ckp.save(self, epoch, is_best=(best[1][0] + 1 == epoch))

    def prepare(self, l, volatile=False):
        device = torch.device('cpu' if self.args.cpu else 'cuda')
        def _prepare(tensor):
            if self.args.precision == 'half': tensor = tensor.half()
            return tensor.to(device)
           
        return [_prepare(_l) for _l in l]

    def terminate(self):
        if self.args.test_only:
            self.test()
            return True
        else:
            epoch = self.scheduler.last_epoch + 1
            return epoch >= self.args.epochs
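
For reference, here is a minimal snippet (not from my project) that, as far as I understand, reproduces the same RuntimeError in isolation, just to illustrate the kind of in-place modification autograd complains about; I assume something similar happens somewhere in my model or loss:

import torch

x = torch.randn(3, requires_grad=True)
y = x.exp()          # backward of exp() needs the saved output y
y += 1               # in-place op bumps y's version counter
y.sum().backward()   # RuntimeError: ... modified by an inplace operation

The only in-place candidates I am aware of in code like this are things such as +=, tensor.add_(...), or nn.ReLU(inplace=True) inside the model, but I have not been able to pin down which one causes it in my case.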