I’m trying to implement a style transfer paper from 2017 in pytorch and I’m running into some issues. I would really appreciate it if someone could help me identify the bugs in my code. The paper is “Laplacian-Steered Neural Style Transfer” by Li et al. I actually coded the Gatys paper as well to see the difference but the issue is that both implementations give exactly the same result even though I added a laplacian loss term in the second program. This is my implementation of Gatys et al. :
# https://arxiv.org/pdf/1508.06576.pdf
from traceback import print_tb
import torch
import torch.nn as nn
import torch.optim as optim
from PIL import Image
import torchvision.transforms as transforms
import torchvision.models as models
from torchvision.utils import save_image
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
class VGG(nn.Module):
def __init__(self):
super(VGG, self).__init__()
self.chosen_features = ['0', '5', '10', '19', '28']
self.model = models.vgg19(pretrained=True).features[:29]
def forward(self, x):
features = []
for layer_num, layer in enumerate(self.model):
x = layer(x)
if str(layer_num) in self.chosen_features:
features.append(x)
return features
def load_image(image_name):
image = Image.open(image_name).convert('RGB')
image = loader(image).unsqueeze(0)
return image.to(device)
device = torch.device('cuda')
image_size = (512, 376)
loader = transforms.Compose(
[
transforms.Resize(image_size),
transforms.ToTensor()
]
)
original_image = load_image("person.png")
# generated = torch.randn(original_image.shape, device=device, requires_grad=True)
generated = original_image.clone().requires_grad_(True)
total_steps = 6000
lr = 0.001
alpha = 1
beta = 0.0001
img_list = ["cartoon.jpg"]
optimizer = optim.Adam([generated], lr=lr)
model = VGG().to(device).eval()
for img in img_list:
style_image = load_image(img)
for step in range(total_steps):
generated_features = model(generated)
original_image_features = model(original_image)
style_features = model(style_image)
style_loss = original_loss = 0
for gen_feature, orig_feature, style_feature in zip(
generated_features, original_image_features, style_features
):
batch_size, channel, height, width = gen_feature.shape
original_loss += torch.mean((gen_feature - orig_feature) ** 2)
G = gen_feature.view(channel, height*width).mm(
gen_feature.view(channel, height*width).t()
)
A = style_feature.view(channel, height*width).mm(
style_feature.view(channel, height*width).t()
)
style_loss += torch.mean((G - A) ** 2)
total_loss = alpha * original_loss + beta * style_loss
optimizer.zero_grad()
total_loss.backward()
optimizer.step()
if step % 200 == 0:
print(total_loss)
save_image(generated, f"generated{img}.png")
Using the same content and style image that Li et al. use in the first section of the paper, the above code outputs this image:
This is my implementation of Li et al. :
# https://arxiv.org/pdf/1707.01253.pdf
import torch
import torch.nn as nn
import torch.optim as optim
from PIL import Image
import torchvision.transforms as transforms
import torchvision.models as models
from torchvision.utils import save_image
import os
import torch.nn.functional as F
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
class VGG(nn.Module):
def __init__(self):
super(VGG, self).__init__()
self.chosen_features = ['0', '5', '10', '19', '28']
self.model = models.vgg19(pretrained=True).features[:29]
def forward(self, x):
features = []
for layer_num, layer in enumerate(self.model):
x = layer(x)
if str(layer_num) in self.chosen_features:
features.append(x)
return features
def load_image(image_name):
image = Image.open(image_name).convert('RGB')
image = loader(image).unsqueeze(0)
return image.to(device)
device = torch.device('cuda')
image_size = (512, 376)
loader = transforms.Compose(
[
transforms.Resize(image_size),
transforms.ToTensor()
]
)
original_image = load_image("person.png")
# generated = torch.randn(original_image.shape, device=device, requires_grad=True)
generated = original_image.clone().requires_grad_(True)
total_steps = 6000
lr = 0.001
alpha = 5 # content loss coefficient
beta = 100 # style loss coefficient
gamma = 100 # laplacian loss coefficient
img_list = ["cartoon.jpg"]
optimizer = optim.Adam([generated], lr=lr)
laplacian_filter = torch.tensor([[
0., -1., 0.,
-1., 4., -1.,
0., -1., 0.
]]).view(1, 1, 3, 3).repeat(1, 3, 1, 1).to(device)
lap_pool_sizes = [4, 16]
model = VGG().to(device).eval()
for img in img_list:
style_image = load_image(img)
for step in range(total_steps):
generated_features = model(generated)
original_image_features = model(original_image)
style_features = model(style_image)
style_loss = original_loss = laplacian_loss = 0
for gen_feature, orig_feature, style_feature in zip(
generated_features, original_image_features, style_features
):
batch_size, channel, height, width = gen_feature.shape
original_loss += torch.mean((gen_feature - orig_feature) ** 2)
G = gen_feature.view(channel, height*width).mm(
gen_feature.view(channel, height*width).t()
)
A = style_feature.view(channel, height*width).mm(
style_feature.view(channel, height*width).t()
)
style_loss += torch.mean((G - A) ** 2)
for pool_size in lap_pool_sizes:
laplacian_loss += torch.mean(
(
F.conv2d(F.avg_pool2d(original_image, pool_size), laplacian_filter)
- F.conv2d(F.avg_pool2d(generated, pool_size), laplacian_filter)
) ** 2
)
total_loss = alpha * original_loss + beta * style_loss + gamma * laplacian_loss
optimizer.zero_grad()
total_loss.backward()
optimizer.step()
if step % 200 == 0:
print(total_loss)
save_image(generated, f"laplacian_generated{img}.png")
This code outputs exactly the same image. I was hoping for something like this:
Thanks in advance