Hello @ptrblck,
Your post here helped me to create the ensemble TwoStream Inception I3D architecture.
This architecture is made up of two Inception I3D architectures that are fused after the logits step.
I’m going to paste to you the architectures script :
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
import os
import sys
from collections import OrderedDict
##################################################################
# Architectures classes
##################################################################
class MaxPool3dSamePadding(nn.MaxPool3d):
def compute_pad(self, dim, s):
if s % self.stride[dim] == 0:
return max(self.kernel_size[dim] - self.stride[dim], 0)
else:
return max(self.kernel_size[dim] - (s % self.stride[dim]), 0)
def forward(self, x):
# compute 'same' padding
(batch, channel, t, h, w) = x.size()
#print t,h,w
out_t = np.ceil(float(t) / float(self.stride[0]))
out_h = np.ceil(float(h) / float(self.stride[1]))
out_w = np.ceil(float(w) / float(self.stride[2]))
#print out_t, out_h, out_w
pad_t = self.compute_pad(0, t)
pad_h = self.compute_pad(1, h)
pad_w = self.compute_pad(2, w)
#print pad_t, pad_h, pad_w
pad_t_f = pad_t // 2
pad_t_b = pad_t - pad_t_f
pad_h_f = pad_h // 2
pad_h_b = pad_h - pad_h_f
pad_w_f = pad_w // 2
pad_w_b = pad_w - pad_w_f
pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b)
#print x.size()
#print pad
x = F.pad(x, pad)
return super(MaxPool3dSamePadding, self).forward(x)
class Unit3D(nn.Module):
def __init__(self, in_channels,
output_channels,
kernel_shape=(1, 1, 1),
stride=(1, 1, 1),
padding=0,
activation_fn=F.relu,
use_batch_norm=True,
use_bias=False,
name='unit_3d'):
"""Initializes Unit3D module."""
super(Unit3D, self).__init__()
self._output_channels = output_channels
self._kernel_shape = kernel_shape
self._stride = stride
self._use_batch_norm = use_batch_norm
self._activation_fn = activation_fn
self._use_bias = use_bias
self.name = name
self.padding = padding
self.conv3d = nn.Conv3d(in_channels=in_channels,
out_channels=self._output_channels,
kernel_size=self._kernel_shape,
stride=self._stride,
padding=0, # we always want padding to be 0 here. We will dynamically pad based on input size in forward function
bias=self._use_bias)
if self._use_batch_norm:
self.bn = nn.BatchNorm3d(self._output_channels, eps=0.001, momentum=0.01)
def compute_pad(self, dim, s):
if s % self._stride[dim] == 0:
return max(self._kernel_shape[dim] - self._stride[dim], 0)
else:
return max(self._kernel_shape[dim] - (s % self._stride[dim]), 0)
def forward(self, x):
# compute 'same' padding
(batch, channel, t, h, w) = x.size()
#print t,h,w
out_t = np.ceil(float(t) / float(self._stride[0]))
out_h = np.ceil(float(h) / float(self._stride[1]))
out_w = np.ceil(float(w) / float(self._stride[2]))
#print out_t, out_h, out_w
pad_t = self.compute_pad(0, t)
pad_h = self.compute_pad(1, h)
pad_w = self.compute_pad(2, w)
#print pad_t, pad_h, pad_w
pad_t_f = pad_t // 2
pad_t_b = pad_t - pad_t_f
pad_h_f = pad_h // 2
pad_h_b = pad_h - pad_h_f
pad_w_f = pad_w // 2
pad_w_b = pad_w - pad_w_f
pad = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b)
#print x.size()
#print pad
x = F.pad(x, pad)
#print x.size()
x = self.conv3d(x)
if self._use_batch_norm:
x = self.bn(x)
if self._activation_fn is not None:
x = self._activation_fn(x)
return x
class InceptionModule(nn.Module):
def __init__(self, in_channels, out_channels, name):
super(InceptionModule, self).__init__()
self.b0 = Unit3D(in_channels=in_channels, output_channels=out_channels[0], kernel_shape=[1, 1, 1], padding=0,
name=name+'/Branch_0/Conv3d_0a_1x1')
self.b1a = Unit3D(in_channels=in_channels, output_channels=out_channels[1], kernel_shape=[1, 1, 1], padding=0,
name=name+'/Branch_1/Conv3d_0a_1x1')
self.b1b = Unit3D(in_channels=out_channels[1], output_channels=out_channels[2], kernel_shape=[3, 3, 3],
name=name+'/Branch_1/Conv3d_0b_3x3')
self.b2a = Unit3D(in_channels=in_channels, output_channels=out_channels[3], kernel_shape=[1, 1, 1], padding=0,
name=name+'/Branch_2/Conv3d_0a_1x1')
self.b2b = Unit3D(in_channels=out_channels[3], output_channels=out_channels[4], kernel_shape=[3, 3, 3],
name=name+'/Branch_2/Conv3d_0b_3x3')
self.b3a = MaxPool3dSamePadding(kernel_size=[3, 3, 3],
stride=(1, 1, 1), padding=0)
self.b3b = Unit3D(in_channels=in_channels, output_channels=out_channels[5], kernel_shape=[1, 1, 1], padding=0,
name=name+'/Branch_3/Conv3d_0b_1x1')
self.name = name
def forward(self, x):
b0 = self.b0(x)
b1 = self.b1b(self.b1a(x))
b2 = self.b2b(self.b2a(x))
b3 = self.b3b(self.b3a(x))
return torch.cat([b0,b1,b2,b3], dim=1)
class InceptionI3d(nn.Module):
"""Inception-v1 I3D architecture.
The model is introduced in:
Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset
Joao Carreira, Andrew Zisserman
https://arxiv.org/pdf/1705.07750v1.pdf.
See also the Inception architecture, introduced in:
Going deeper with convolutions
Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed,
Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich.
http://arxiv.org/pdf/1409.4842v1.pdf.
"""
# Endpoints of the model in order. During construction, all the endpoints up
# to a designated `final_endpoint` are returned in a dictionary as the
# second return value.
VALID_ENDPOINTS = (
'Conv3d_1a_7x7',
'MaxPool3d_2a_3x3',
'Conv3d_2b_1x1',
'Conv3d_2c_3x3',
'MaxPool3d_3a_3x3',
'Mixed_3b',
'Mixed_3c',
'MaxPool3d_4a_3x3',
'Mixed_4b',
'Mixed_4c',
'Mixed_4d',
'Mixed_4e',
'Mixed_4f',
'MaxPool3d_5a_2x2',
'Mixed_5b',
'Mixed_5c',
'Logits',
'Predictions',
)
def __init__(self, num_classes=400, spatial_squeeze=True,
final_endpoint='Logits',
name='inception_i3d', in_channels=3, dropout_keep_prob=0.5):
"""Initializes I3D model instance.
Args:
num_classes: The number of outputs in the logit layer (default 400, which
matches the Kinetics dataset).
spatial_squeeze: Whether to squeeze the spatial dimensions for the logits
before returning (default True).
final_endpoint: The model contains many possible endpoints.
`final_endpoint` specifies the last endpoint for the model to be built
up to. In addition to the output at `final_endpoint`, all the outputs
at endpoints up to `final_endpoint` will also be returned, in a
dictionary. `final_endpoint` must be one of
InceptionI3d.VALID_ENDPOINTS (default 'Logits').
name: A string (optional). The name of this module.
Raises:
ValueError: if `final_endpoint` is not recognized.
"""
if final_endpoint not in self.VALID_ENDPOINTS:
raise ValueError('Unknown final endpoint %s' % final_endpoint)
super(InceptionI3d, self).__init__()
self._num_classes = num_classes
self._spatial_squeeze = spatial_squeeze
self._final_endpoint = final_endpoint
self.logits = None
self.hidden = None
if self._final_endpoint not in self.VALID_ENDPOINTS:
raise ValueError('Unknown final endpoint %s' % self._final_endpoint)
self.end_points = {}
end_point = 'Conv3d_1a_7x7'
self.end_points[end_point] = Unit3D(in_channels=in_channels, output_channels=64, kernel_shape=[7, 7, 7],
stride=(2, 2, 2), padding=(3,3,3), name=name+end_point)
if self._final_endpoint == end_point: return
end_point = 'MaxPool3d_2a_3x3'
self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2),
padding=0)
if self._final_endpoint == end_point: return
end_point = 'Conv3d_2b_1x1'
self.end_points[end_point] = Unit3D(in_channels=64, output_channels=64, kernel_shape=[1, 1, 1], padding=0,
name=name+end_point)
if self._final_endpoint == end_point: return
end_point = 'Conv3d_2c_3x3'
self.end_points[end_point] = Unit3D(in_channels=64, output_channels=192, kernel_shape=[3, 3, 3], padding=1,
name=name+end_point)
if self._final_endpoint == end_point: return
end_point = 'MaxPool3d_3a_3x3'
self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3], stride=(1, 2, 2),
padding=0)
if self._final_endpoint == end_point: return
end_point = 'Mixed_3b'
self.end_points[end_point] = InceptionModule(192, [64,96,128,16,32,32], name+end_point)
if self._final_endpoint == end_point: return
end_point = 'Mixed_3c'
self.end_points[end_point] = InceptionModule(256, [128,128,192,32,96,64], name+end_point)
if self._final_endpoint == end_point: return
end_point = 'MaxPool3d_4a_3x3'
self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[3, 3, 3], stride=(2, 2, 2),
padding=0)
if self._final_endpoint == end_point: return
end_point = 'Mixed_4b'
self.end_points[end_point] = InceptionModule(128+192+96+64, [192,96,208,16,48,64], name+end_point)
if self._final_endpoint == end_point: return
end_point = 'Mixed_4c'
self.end_points[end_point] = InceptionModule(192+208+48+64, [160,112,224,24,64,64], name+end_point)
if self._final_endpoint == end_point: return
end_point = 'Mixed_4d'
self.end_points[end_point] = InceptionModule(160+224+64+64, [128,128,256,24,64,64], name+end_point)
if self._final_endpoint == end_point: return
end_point = 'Mixed_4e'
self.end_points[end_point] = InceptionModule(128+256+64+64, [112,144,288,32,64,64], name+end_point)
if self._final_endpoint == end_point: return
end_point = 'Mixed_4f'
self.end_points[end_point] = InceptionModule(112+288+64+64, [256,160,320,32,128,128], name+end_point)
if self._final_endpoint == end_point: return
end_point = 'MaxPool3d_5a_2x2'
self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[2, 2, 2], stride=(2, 2, 2),
padding=0)
if self._final_endpoint == end_point: return
end_point = 'Mixed_5b'
self.end_points[end_point] = InceptionModule(256+320+128+128, [256,160,320,32,128,128], name+end_point)
if self._final_endpoint == end_point: return
end_point = 'Mixed_5c'
self.end_points[end_point] = InceptionModule(256+320+128+128, [384,192,384,48,128,128], name+end_point)
if self._final_endpoint == end_point: return
end_point = 'Logits'
self.avg_pool = nn.AvgPool3d(kernel_size=[2, 7, 7],
stride=(1, 1, 1))
self.dropout = nn.Dropout(dropout_keep_prob)
self.logits = Unit3D(in_channels=384+384+128+128, output_channels=self._num_classes,
kernel_shape=[1, 1, 1],
padding=0,
activation_fn=None,
use_batch_norm=False,
use_bias=True,
name='logits')
self.build()
def replace_logits(self, num_classes):
self._num_classes = num_classes
self.logits = Unit3D(in_channels=384+384+128+128, output_channels=self._num_classes,
kernel_shape=[1, 1, 1],
padding=0,
activation_fn=None,
use_batch_norm=False,
use_bias=True,
name='logits')
def build(self):
for k in self.end_points.keys():
self.add_module(k, self.end_points[k])
def forward(self, x):
for end_point in self.VALID_ENDPOINTS:
if end_point in self.end_points:
x = self._modules[end_point](x) # use _modules to work with dataparallel
x = self.logits(self.dropout(self.avg_pool(x)))
if self._spatial_squeeze:
logits = x.squeeze(3).squeeze(3)
# logits is batch X time X classes, which is what we want to work with
return logits
def extract_features(self, x):
for end_point in self.VALID_ENDPOINTS:
if end_point in self.end_points:
x = self._modules[end_point](x)
return self.avg_pool(x)
The training script :
import os
import sys
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable
import torchvision
from torchvision import datasets, transforms
import numpy as np
from pytorch_architectures import InceptionI3d
from videos_dataset import Videos as Dataset
class TwoStream_I3D(nn.Module):
def __init__(self, modelA, modelB):
super(TwoStream_I3D, self).__init__()
self.modelA = modelA
self.modelB = modelB
self.classifier = nn.Linear(2, 11)
def forward(self, x1, x2):
x1 = self.modelA(x1)
x2 = self.modelB(x2)
x = torch.cat((x1, x2), dim=0)
x = self.classifier(F.relu(x))
return x
def run(init_lr=0.1, max_steps=5, mode='rgb', root='Data/Videos/', split_link='Splits/5_splits/Split0/Split0.json',
batch_size=1, save_model='Trained_models/', architecture_type='onestream'):
transforms_size = 224
# setup dataset
print(split_link, root, mode, architecture_type)
dataset = Dataset(split_link, 'train_simple', root, mode, architecture_type)
# dataset = Dataset(split_link, 'train', root, mode)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=1, pin_memory=True)
val_dataset = Dataset(split_link, 'val_simple', root, mode, architecture_type)
# val_dataset = Dataset(split_link, 'val', root, mode)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=1, pin_memory=True)
dataloaders = {'train_simple': dataloader, 'val_simple': val_dataloader}
datasets = {'train_simple': dataset, 'val_simple': val_dataset}
# setup the model
if architecture_type == 'onestream':
if mode == 'flow':
i3d = InceptionI3d(400, in_channels=2)
i3d.load_state_dict(torch.load(os.path.join(save_model, 'flow_imagenet.pt')))
else:
i3d = InceptionI3d(400, in_channels=3)
i3d.load_state_dict(torch.load(os.path.join(save_model, 'rgb_imagenet.pt')))
i3d.replace_logits(11)
i3d.cuda()
i3d = nn.DataParallel(i3d)
lr = init_lr
optimizer = optim.SGD(i3d.parameters(), lr=lr, momentum=0.9, weight_decay=0.0000001)
lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [300, 1000])
num_steps_per_update = 4 # accum gradient
steps = 0
# train it
while steps < max_steps:#for epoch in range(num_epochs):
print('Step {}/{}'.format(steps, max_steps))
print('-' * 10)
# Each epoch has a training and validation phase
for phase in ['train_simple', 'val_simple']:
if phase == 'train_simple':
i3d.train(True)
else:
i3d.train(False) # Set model to evaluate mode
tot_loss = 0.0
tot_loc_loss = 0.0
tot_cls_loss = 0.0
num_iter = 0
optimizer.zero_grad()
# Iterate over data.
for data in dataloaders[phase]:
num_iter += 1
# get the inputs
inputs, labels = data
# wrap them in Variable
inputs = Variable(inputs.cuda())
labels = Variable(labels.cuda())
t = inputs.size(2)
per_frame_logits = i3d(inputs)
# upsample to input size
per_frame_logits = F.upsample(per_frame_logits, t, mode='linear')
# compute localization loss
loc_loss = F.binary_cross_entropy_with_logits(per_frame_logits, labels)
tot_loc_loss += loc_loss.data
# compute classification loss (with max-pooling along time B x C x T)
cls_loss = F.binary_cross_entropy_with_logits(torch.max(per_frame_logits, dim=2)[0], torch.max(labels, dim=2)[0])
tot_cls_loss += cls_loss.data
loss = (0.5*loc_loss + 0.5*cls_loss)/num_steps_per_update
tot_loss += loss.data
loss.backward()
if num_iter == num_steps_per_update and phase == 'train_simple':
steps += 1
num_iter = 0
optimizer.step()
optimizer.zero_grad()
lr_sched.step()
if steps % 10 == 0:
print('{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format(phase, tot_loc_loss/(10*num_steps_per_update), tot_cls_loss/(10*num_steps_per_update), tot_loss/10))
# save model
torch.save(twostream_i3d.module.state_dict(), save_model+str(steps).zfill(6)+'.pt')
tot_loss = tot_loc_loss = tot_cls_loss = 0.
if phase == 'val_simple':
print('{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format(phase, tot_loc_loss/num_iter, tot_cls_loss/num_iter, (tot_loss*num_steps_per_update)/num_iter))
else: #architecture_type == 'twostream'
i3d_flow = InceptionI3d(400, in_channels=2)
i3d_flow.load_state_dict(torch.load(os.path.join(save_model, 'flow_imagenet.pt')))
i3d_flow.replace_logits(11)
i3d_rgb = InceptionI3d(400, in_channels=3)
i3d_rgb.load_state_dict(torch.load(os.path.join(save_model, 'rgb_imagenet.pt')))
i3d_rgb.replace_logits(11)
twostream_i3d = TwoStream_I3D(i3d_rgb, i3d_flow)
twostream_i3d.cuda()
twostream_i3d = nn.DataParallel(twostream_i3d)
lr = init_lr
optimizer = optim.SGD(twostream_i3d.parameters(), lr=lr, momentum=0.9, weight_decay=0.0000001)
lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [300, 1000])
num_steps_per_update = 4 # accum gradient
steps = 0
# train it
while steps < max_steps:#for epoch in range(num_epochs):
print('Step {}/{}'.format(steps, max_steps))
print('-' * 10)
# Each epoch has a training and validation phase
for phase in ['train_simple', 'val_simple']:
if phase == 'train_simple':
twostream_i3d.train(True)
else:
twostream_i3d.train(False) # Set model to evaluate mode
tot_loss = 0.0
tot_loc_loss = 0.0
tot_cls_loss = 0.0
num_iter = 0
optimizer.zero_grad()
# Iterate over data.
for data in dataloaders[phase]:
num_iter += 1
# get the inputs
rgb_inputs, flow_inputs, labels = data
# wrap them in Variable
rgb_inputs = Variable(rgb_inputs.cuda())
flow_inputs = Variable(flow_inputs.cuda())
labels = Variable(labels.cuda())
t = rgb_inputs.size(2)
per_frame_logits = twostream_i3d(rgb_inputs, flow_inputs)
# upsample to input size
per_frame_logits = F.upsample(per_frame_logits, t, mode='linear')
labels = torch.cat((labels,labels), dim=0)
# compute localization loss
loc_loss = F.binary_cross_entropy_with_logits(per_frame_logits, labels)
tot_loc_loss += loc_loss.data
# compute classification loss (with max-pooling along time B x C x T)
cls_loss = F.binary_cross_entropy_with_logits(torch.max(per_frame_logits, dim=2)[0], torch.max(labels, dim=2)[0])
tot_cls_loss += cls_loss.data
loss = (0.5*loc_loss + 0.5*cls_loss)/num_steps_per_update
tot_loss += loss.data
loss.backward()
if num_iter == num_steps_per_update and phase == 'train_simple':
steps += 1
num_iter = 0
optimizer.step()
optimizer.zero_grad()
lr_sched.step()
if steps % 10 == 0:
print('{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format(phase, tot_loc_loss/(10*num_steps_per_update), tot_cls_loss/(10*num_steps_per_update), tot_loss/10))
# save model
torch.save(twostream_i3d.module.state_dict(), save_model+str(steps).zfill(6)+'.pt')
tot_loss = tot_loc_loss = tot_cls_loss = 0.
if phase == 'val_simple':
print('{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format(phase, tot_loc_loss/num_iter, tot_cls_loss/num_iter, (tot_loss*num_steps_per_update)/num_iter))
My question is : I had to fuse between the two Inception I3D at the following stage, however by merging at dimension 0. Could it represent a problem ? I couldn’t fuse at dimension 1, where was a size representing a number of classes.
x = torch.cat((x1, x2), dim=0)
Moreover, I needed to fuse the labels here, when training the TwoStream architecture :
labels = torch.cat((labels,labels), dim=0)