Hi,
Related question: http://discuss.pytorch.org/t/resolved-implementing-maml-in-pytorch/4053
I’m trying to implement meta-learning algorithms such as MAML (TF implementation) and Meta-SGD in PyTorch, and I’m having difficulties understanding the gradient flow.
# (imports used by the snippets in this post)
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

train_samples, test_samples = task['train'], task['test']

# Support set (used for the inner update)
images, labels = train_samples['image'], train_samples['label']
if args.cuda:
    images, labels = images.cuda(), labels.cuda()
images, labels = Variable(images), Variable(labels)

# Query set (used for the meta update)
test_images, test_labels = test_samples['image'], test_samples['label']
if args.cuda:
    test_images, test_labels = test_images.cuda(), test_labels.cuda()
test_images, test_labels = Variable(test_images), Variable(test_labels)

# Inner gradient update
alpha = [Variable(p.data.clone().fill_(0.01), requires_grad=True) for p in base_learner.parameters()]
meta_optimizer = optim.Adam(base_learner.parameters(), lr=args.update_lr)
meta_optimizer.zero_grad()

output = base_learner(images)
loss_fn = nn.CrossEntropyLoss()
loss = loss_fn(output, labels)
loss.backward(retain_graph=True)
for param in base_learner.parameters():
    param.grad.requires_grad = True  # trying to keep the gradients in the graph

base_learner_ = BaseLearner(args.num_classes, args.num_filters)
if args.cuda:
    base_learner_.cuda()
base_learner_.train()

print(list(base_learner_.parameters())[0].data.sum())
for param_, lr, param in zip(base_learner_.parameters(), alpha, base_learner.parameters()):
    param_ = param - lr * param.grad  # only rebinds the loop variable, see the questions below
print(list(base_learner_.parameters())[0].data.sum())
# Note: the parameters of base_learner_ don't seem to get updated here.

# Meta update
base_learner_.zero_grad()
output = base_learner_(test_images)
loss = loss_fn(output, test_labels)
loss.backward()
meta_optimizer.step()
- When I set the parameters of base_learner_ from base_learner, the assignment doesn't seem to be executed. How do I do this properly?
- The gradient, while backpropagating from base_learner_, doesn't flow to the parameters of base_learner. I know that this is because the parameters of base_learner_ are not set properly based on base_learner. I want to set them such that the gradients of base_learner_ flow to the parameters of base_learner (see the sketch right after this list).
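To make the question concrete, the gradient flow I'm after looks roughly like this (a minimal sketch, not working code; it assumes a functional_forward(model, weights, x) helper that runs the network with an explicitly supplied list of weights, which BaseLearner below doesn't have yet; a possible version is sketched after the model code):

```python
# Sketch only: a differentiable inner update.
output = base_learner(images)
train_loss = loss_fn(output, labels)

# create_graph=True keeps the inner gradients in the graph, so the meta
# update can backpropagate through the inner update into theta and alpha.
grads = torch.autograd.grad(train_loss, base_learner.parameters(), create_graph=True)
fast_weights = [p - lr * g for p, lr, g in zip(base_learner.parameters(), alpha, grads)]

# Meta update on the query/test set, using the fast weights.
test_output = functional_forward(base_learner, fast_weights, test_images)  # hypothetical helper
test_loss = loss_fn(test_output, test_labels)

meta_optimizer.zero_grad()
test_loss.backward()   # gradients now reach base_learner.parameters() (and alpha,
                       # if alpha is also given to the meta optimizer, as Meta-SGD needs)
meta_optimizer.step()
```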
The algorithm I’m trying to implement can be found at the beginning of page 5 here: Meta-SGD: Learning to Learn Quickly for Few-Shot Learning. Is this the right approach? What is the pragmatic way of doing it?
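To double-check my reading of the paper, the update rule I want to express is (a rough transcription; alpha is a learned per-parameter vector, \circ is element-wise multiplication, and beta is the meta learning rate):

```latex
% inner update on each task's training set
\theta_i' = \theta - \alpha \circ \nabla_\theta \mathcal{L}_{\mathcal{T}_i}(\theta)

% meta update on the tasks' test sets
(\theta, \alpha) \leftarrow (\theta, \alpha) - \beta \, \nabla_{(\theta, \alpha)} \sum_{\mathcal{T}_i} \mathcal{L}_{\mathcal{T}_i}(\theta_i')
```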
More code:
# Model
class ConvModule(nn.Module):
    """ Conv block: 3x3 conv -> batch norm -> ReLU -> 2x2 max pool """
    def __init__(self, in_channels, out_channels):
        super(ConvModule, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
        self.bn = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        x = F.relu(x)
        return F.max_pool2d(x, 2)


class BaseLearner(nn.Module):
    """ Simple four-layer conv net """
    def __init__(self, num_classes, num_filters=64):
        super(BaseLearner, self).__init__()
        self.conv1 = ConvModule(1, num_filters)
        self.conv2 = ConvModule(num_filters, num_filters)
        self.conv3 = ConvModule(num_filters, num_filters)
        self.conv4 = ConvModule(num_filters, num_filters)
        self.fc = nn.Linear(num_filters, num_classes)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = x.view(x.size(0), -1)  # flatten; the four pooling layers reduce 28x28 inputs to 1x1
        return self.fc(x)
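For completeness, this is roughly what I imagine the functional_forward helper from the sketch above would have to look like, so that the fast weights stay in the autograd graph (only a sketch; the weight ordering assumes list(model.parameters()) order for this exact BaseLearner, and the batch-norm handling, using batch statistics, is my guess):

```python
import torch.nn.functional as F

def functional_forward(model, weights, x):
    """Run BaseLearner with an explicit list of (fast) weights instead of
    model.parameters(). `weights` must follow list(model.parameters()) order:
    [conv.weight, conv.bias, bn.weight, bn.bias] per block, then fc.weight, fc.bias."""
    w = list(weights)
    for i, block in enumerate([model.conv1, model.conv2, model.conv3, model.conv4]):
        conv_w, conv_b, bn_w, bn_b = w[4 * i: 4 * i + 4]
        x = F.conv2d(x, conv_w, conv_b, padding=1)
        # training=True normalizes with batch statistics, so the running buffers
        # are not part of the differentiable computation
        x = F.batch_norm(x, block.bn.running_mean, block.bn.running_var,
                         weight=bn_w, bias=bn_b, training=True)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
    fc_w, fc_b = w[-2], w[-1]
    x = x.view(x.size(0), -1)
    return F.linear(x, fc_w, fc_b)
```

With something like this, calling backward() on the query loss should propagate gradients into base_learner.parameters() and alpha, because each fast weight is a differentiable function of both.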