Implementing "A neural algorithm of artistic style" but the result is not good

Hi, I’m trying to learn pytorch by doing stuff directly. I have read a paper called “A neural algorithm of artistic style” and I have decided to implement the algorithm of this paper, however the result is still random and not at all as good as expected. I wonder if you guys can check my code to see if there is any problem. Thank you very much.

Here is my code:

import torch
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
import torch.nn as nn
​
from pdb import set_trace

# Hyperparameter settings
n_epoch = 5000
learning_rate = 0.001
wl_list = [1/5, 1/5, 1/5, 1/5, 1/5]
alpha = 0.0001
beta = 1
content_layer_list = ["conv4_2"]
style_layer_list = ["conv1_1", "conv2_1", "conv3_1", "conv4_1", "conv5_1"]

# Load pre-trained VGG19 model
pretrained_model = torchvision.models.vgg19(pretrained=True)
pretrained_model.classifier = nn.Identity()
pretrained_model.features.requires_grad_(False)

activation = {}
def get_activation(name):
    def hook(model, input, output):
        activation[name] = output
    return hook
pretrained_model.features[0].register_forward_hook(get_activation('conv1_1'))
pretrained_model.features[5].register_forward_hook(get_activation('conv2_1'))
pretrained_model.features[10].register_forward_hook(get_activation('conv3_1'))
pretrained_model.features[19].register_forward_hook(get_activation('conv4_1'))
pretrained_model.features[28].register_forward_hook(get_activation('conv5_1'))

pretrained_model.features[21].register_forward_hook(get_activation('conv4_2'))

# Load the style and content images
dataset_path = "/kaggle/input/datasetstyle/dataset"
content_img = cv2.imread(pjoin(dataset_path, "content_image", "Tuebingen_Neckarfront.jpg"))
style_img = cv2.imread(pjoin(dataset_path, "style_image", "van_gogh_the_scream.jpg"))

content_img = content_img.astype(np.float32)
style_img = style_img.astype(np.float32)
content_img /= 255.0
style_img /= 255.0
content_img = cv2.resize(content_img, (1024, 1024), interpolation=cv2.INTER_CUBIC)
style_img = cv2.resize(style_img, (1024, 1024), interpolation=cv2.INTER_CUBIC)
content_img = content_img[np.newaxis, :, :, :]
style_img = style_img[np.newaxis, :, :, :]

content_img_torch = torch.from_numpy(content_img)
style_img_torch = torch.from_numpy(style_img)

content_img_torch = torch.swapaxes(content_img_torch, 3, 1)
style_img_torch = torch.swapaxes(style_img_torch, 3, 1)

content_img_torch = content_img_torch.cuda()
style_img_torch = style_img_torch.cuda()

# set_trace()

# Set up trainable input parameter here
trainable_input = nn.Parameter(torch.rand((1, 3, 1024, 1024), requires_grad=True).cuda())

# Set up optimizer
opt = optim.Adam([trainable_input.requires_grad_()], lr=learning_rate)

# Dictionaries to store intermedate outputs
style_dict = {}
content_dict = {}
input_dict = {}

pretrained_model.cuda()

for epoch_idx in range(n_epoch):
    print("Epoch %d" % (epoch_idx))    
    # correct the values of updated input image
    # with torch.no_grad():
    #     trainable_input.clamp_(0, 1)

    # Let the trainable input pass through 
    out = pretrained_model(trainable_input)
    input_dict = activation.copy()

    # Next, let the content image run through, then we calculate the content loss
    out = pretrained_model(content_img_torch)
    content_dict = activation.copy()

    content_loss = 0
    for content_layer in content_layer_list:
        Pl = content_dict[content_layer]
        Fl = input_dict[content_layer]
        rms_loss = nn.MSELoss()
        loss_val = rms_loss(Pl, Fl)
        # loss_val = loss_val.cpu().detach().numpy()
        # loss_val = loss_val.item()
        content_loss += loss_val  
        # set_trace()
    # set_trace()
    # Now, let the style image run through, then we calculate the style loss
    out = pretrained_model(style_img_torch)
    style_dict = activation.copy()

    style_loss = 0
    for style_layer, wl in zip(style_layer_list, wl_list):
        Pl = style_dict[content_layer]
        Fl = input_dict[content_layer]

        Pl = Pl.reshape(Pl.shape[0], Pl.shape[1], -1)
        Fl = Fl.reshape(Fl.shape[0], Fl.shape[1], -1)

        n_filter = Pl.shape[1]
        feature_map_size = Pl.shape[2]
        # Pl = torch.squeeze(Pl)
        # Fl = torch.squeeze(Fl)

        Pl_gram = torch.bmm(Pl, torch.swapaxes(Pl, 2, 1))
        Fl_gram = torch.bmm(Fl, torch.swapaxes(Pl, 2, 1))
        # Pl_gram /= (n_filter*feature_map_size)
        # Fl_gram /= (n_filter*feature_map_size)
        # set_trace()
        rms_loss = nn.MSELoss()
        loss_val = rms_loss(Pl_gram, Fl_gram)
        # loss_val = loss_val.cpu().detach().numpy()
        # loss_val = loss_val.item()
        loss_val /= 2.0
        loss_val /= (1.0* (n_filter*feature_map_size))
        loss_val /= (1.0* (n_filter*feature_map_size))
        loss_val *= wl

        style_loss += loss_val
        # set_trace()
        pass


    # Calculate total loss and update the trainable parameters accordingly
    total_loss = alpha*content_loss + beta*style_loss
    # print("Loss:")
    # print(total_loss)
    total_loss.backward()  
    opt.step()
    print("Loss:")
    print(total_loss)
    # set_trace()
    pass