Issue implementing Laplacian-steered style transfer

Haziq_Muhammad · February 2, 2022, 7:03pm

I’m trying to implement a style transfer paper from 2017 in PyTorch and I’m running into some issues. I would really appreciate it if someone could help me identify the bugs in my code. The paper is “Laplacian-Steered Neural Style Transfer” by Li et al. I also coded the Gatys et al. paper to see the difference, but the issue is that both implementations give exactly the same result even though I added a Laplacian loss term in the second program. This is my implementation of Gatys et al.:

    # https://arxiv.org/pdf/1508.06576.pdf
    
    from traceback import print_tb
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from PIL import Image
    import torchvision.transforms as transforms
    import torchvision.models as models
    from torchvision.utils import save_image
    import os
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    
    class VGG(nn.Module):
        """Feature extractor built from a truncated, pretrained VGG-19.

        Keeps the first 29 layers of torchvision's ``vgg19().features`` stack
        and, on a forward pass, collects the activations produced by the
        layers whose indices are listed in ``chosen_features``.
        """

        def __init__(self):
            super().__init__()

            # Indices (as strings) into ``self.model`` whose outputs are
            # returned by ``forward``.
            self.chosen_features = ['0', '5', '10', '19', '28']

            # Layers after index 28 are never used, so they are dropped.
            self.model = models.vgg19(pretrained=True).features[:29]

        def forward(self, x):
            """Run ``x`` through the network and return the tapped activations.

            Returns a list with one tensor per entry of ``chosen_features``,
            in the order the layers are applied.
            """
            features = []

            for layer_num, layer in enumerate(self.model):
                x = layer(x)

                if str(layer_num) in self.chosen_features:
                    features.append(x)

            return features

def load_image(image_name):
    """Load an image file as a batched tensor on ``device``.

    The file is opened with PIL and converted to RGB, passed through the
    module-level ``loader`` transform, given a leading dimension with
    ``unsqueeze(0)`` and finally moved to the module-level ``device``.
    """
    rgb = Image.open(image_name).convert('RGB')
    tensor = loader(rgb)
    batched = tensor.unsqueeze(0)
    return batched.to(device)

# Prefer the GPU, but fall back to the CPU so the script still runs on
# machines without CUDA instead of failing on the first ``.to(device)``.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
image_size = (512, 376)  # target size handed to transforms.Resize

# Every image (content and style) is resized to the same size and converted
# to a float tensor.
# NOTE(review): torchvision documents its pretrained VGG weights as expecting
# ImageNet mean/std-normalised input; no Normalize step is applied here --
# confirm this is intended.
loader = transforms.Compose(
    [
        transforms.Resize(image_size),
        transforms.ToTensor(),
    ]
)

original_image = load_image("person.png")

# The optimised image starts as a copy of the content image (an alternative
# is random noise of the same shape); its pixels are the only trainable
# values in the script.
generated = original_image.clone().requires_grad_(True)

total_steps = 6000
lr = 0.001
alpha = 1  # weight of the content (feature-matching) loss
beta = 0.0001  # weight of the style (Gram-matrix) loss

img_list = ["cartoon.jpg"]  # style images

optimizer = optim.Adam([generated], lr=lr)

# eval() because the network is used purely as a fixed feature extractor.
model = VGG().to(device).eval()
# The network is only a feature extractor: freeze its weights so backward()
# does not compute and accumulate parameter gradients nobody reads.
model.requires_grad_(False)

# The content targets never change, so compute them once, outside autograd,
# instead of on every optimisation step.
with torch.no_grad():
    original_image_features = model(original_image)

# NOTE: ``generated`` and the optimizer state are created once before this
# loop, so with several style images each run continues from the previous
# result.
for img in img_list:
    style_image = load_image(img)

    # Style targets are fixed per style image: precompute their Gram matrices.
    with torch.no_grad():
        style_grams = []
        for style_feature in model(style_image):
            _, channel, height, width = style_feature.shape
            style_flat = style_feature.view(channel, height * width)
            style_grams.append(style_flat.mm(style_flat.t()))

    for step in range(total_steps):
        generated_features = model(generated)

        style_loss = original_loss = 0

        for gen_feature, orig_feature, A in zip(
            generated_features, original_image_features, style_grams
        ):
            # view() below assumes a batch of exactly one image.
            _, channel, height, width = gen_feature.shape

            # Content term: mean squared difference of the raw activations.
            original_loss += torch.mean((gen_feature - orig_feature) ** 2)

            # Style term: mean squared difference of the (unnormalised)
            # channel-by-channel Gram matrices.
            gen_flat = gen_feature.view(channel, height * width)
            G = gen_flat.mm(gen_flat.t())

            style_loss += torch.mean((G - A) ** 2)

        total_loss = alpha * original_loss + beta * style_loss
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        if step % 200 == 0:
            print(total_loss)
            save_image(generated, f"generated{img}.png")

Using the same content and style image that Li et al. use in the first section of the paper, the above code outputs this image:

This is my implementation of Li et al. :

# https://arxiv.org/pdf/1707.01253.pdf

import torch
import torch.nn as nn
import torch.optim as optim
from PIL import Image
import torchvision.transforms as transforms
import torchvision.models as models
from torchvision.utils import save_image
import os
import torch.nn.functional as F
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

class VGG(nn.Module):
    """Truncated pretrained VGG-19 that exposes intermediate activations.

    Only the first 29 layers of ``vgg19().features`` are kept; ``forward``
    returns the outputs of the layers named in ``chosen_features``.
    """

    def __init__(self):
        super().__init__()

        # Layer indices, stored as strings, whose outputs are collected.
        self.chosen_features = ['0', '5', '10', '19', '28']

        self.model = models.vgg19(pretrained=True).features[:29]

    def forward(self, x):
        """Return the list of activations at the chosen layers, in order."""
        taps = []

        for idx, module in enumerate(self.model):
            x = module(x)

            # Skip layers that were not selected.
            if str(idx) not in self.chosen_features:
                continue
            taps.append(x)

        return taps

def load_image(image_name):
    """Open ``image_name`` and return it as a batched tensor on ``device``.

    The image is converted to RGB, run through the module-level ``loader``
    transform, given a leading dimension via ``unsqueeze(0)`` and moved to
    the module-level ``device``.
    """
    rgb = Image.open(image_name).convert('RGB')
    return loader(rgb).unsqueeze(0).to(device)

# Prefer the GPU, but fall back to the CPU so the script still runs on
# machines without CUDA instead of failing on the first ``.to(device)``.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
image_size = (512, 376)  # target size handed to transforms.Resize

# Every image (content and style) is resized to the same size and converted
# to a float tensor.
loader = transforms.Compose(
    [
        transforms.Resize(image_size),
        transforms.ToTensor(),
    ]
)

original_image = load_image("person.png")

# The optimised image starts as a copy of the content image (an alternative
# is random noise of the same shape); its pixels are the only trainable
# values in the script.
generated = original_image.clone().requires_grad_(True)

total_steps = 6000
lr = 0.001
alpha = 5  # content loss coefficient
beta = 100  # style loss coefficient
gamma = 100  # laplacian loss coefficient

img_list = ["cartoon.jpg"]  # style images

optimizer = optim.Adam([generated], lr=lr)

# 3x3 Laplacian kernel, replicated over the three input channels. The
# resulting weight has shape (1, 3, 3, 3): one output channel fed by all
# three colour channels, so F.conv2d yields the sum of the per-channel
# Laplacians.
laplacian_filter = torch.tensor([
    [0., -1., 0.],
    [-1., 4., -1.],
    [0., -1., 0.],
]).view(1, 1, 3, 3).repeat(1, 3, 1, 1).to(device)

# Average-pooling window sizes applied before the Laplacian filter.
lap_pool_sizes = [4, 16]

# eval() because the network is used purely as a fixed feature extractor.
model = VGG().to(device).eval()
# The network is only a feature extractor: freeze its weights so backward()
# does not compute and accumulate parameter gradients nobody reads.
model.requires_grad_(False)

# Everything derived from the content image alone is constant, so compute it
# once, outside autograd, instead of on every optimisation step.
with torch.no_grad():
    original_image_features = model(original_image)
    original_laplacians = [
        F.conv2d(F.avg_pool2d(original_image, pool_size), laplacian_filter)
        for pool_size in lap_pool_sizes
    ]

# NOTE: ``generated`` and the optimizer state are created once before this
# loop, so with several style images each run continues from the previous
# result.
for img in img_list:
    style_image = load_image(img)

    # Style targets are fixed per style image: precompute their Gram matrices.
    with torch.no_grad():
        style_grams = []
        for style_feature in model(style_image):
            _, channel, height, width = style_feature.shape
            style_flat = style_feature.view(channel, height * width)
            style_grams.append(style_flat.mm(style_flat.t()))

    for step in range(total_steps):
        generated_features = model(generated)

        style_loss = original_loss = laplacian_loss = 0

        for gen_feature, orig_feature, A in zip(
            generated_features, original_image_features, style_grams
        ):
            # view() below assumes a batch of exactly one image.
            _, channel, height, width = gen_feature.shape

            # Content term: mean squared difference of the raw activations.
            original_loss += torch.mean((gen_feature - orig_feature) ** 2)

            # Style term: mean squared difference of the (unnormalised)
            # channel-by-channel Gram matrices.
            gen_flat = gen_feature.view(channel, height * width)
            G = gen_flat.mm(gen_flat.t())

            style_loss += torch.mean((G - A) ** 2)

        # Laplacian term: compare the filtered, average-pooled generated
        # image with the same quantity for the content image at each scale.
        for pool_size, target_laplacian in zip(lap_pool_sizes, original_laplacians):
            generated_laplacian = F.conv2d(
                F.avg_pool2d(generated, pool_size), laplacian_filter
            )
            laplacian_loss += torch.mean(
                (target_laplacian - generated_laplacian) ** 2
            )

        # NOTE(review): the Gram matrices are sums over height*width
        # positions and are not normalised, so beta * style_loss is
        # presumably many orders of magnitude larger than
        # gamma * laplacian_loss (a mean over pixels of a [0, 1] image).
        # If the Laplacian term appears to have no effect, log the three
        # terms separately and rebalance -- TODO confirm.
        total_loss = alpha * original_loss + beta * style_loss + gamma * laplacian_loss
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        if step % 200 == 0:
            print(total_loss)
            save_image(generated, f"laplacian_generated{img}.png")

This code outputs exactly the same image. I was hoping for something like this:

Thanks in advance

Haziq_Muhammad · February 3, 2022, 1:54pm

Any updates on this issue?

Haziq_Muhammad · February 3, 2022, 6:44pm

Should I post this somewhere else?