MSELoss backward returning runtime error: Found dtype Double but expected Float

I am getting this error and have no clue how to fix this.

I have an MSELoss output, and when I call its backward() function I get the error 'Found dtype Double but expected Float'. I've seen some posts saying I can change my model's parameters to a certain dtype (which I did, to float), and I have checked my input dtype (which is torch.cuda.FloatTensor).
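
For reference, this is the kind of mismatch that typically triggers the error (a minimal, made-up example, not my actual pipeline):

import torch
import torch.nn as nn

pred = torch.randn(4, requires_grad=True)     # float32, like a model output
target = torch.randn(4, dtype=torch.float64)  # float64, e.g. coming from NumPy/pandas

loss = nn.MSELoss()(pred, target)             # the forward pass runs (the loss is promoted to float64)
loss.backward()                               # typically raises: Found dtype Double but expected Float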

What can I do to fix this error??


Could you post an executable code snippet that reproduces this error, so that we can have a look? :slight_smile:

This is my model

import torch
import torch.nn as nn
from torch.nn import Conv2d, ELU, Flatten, Linear


class SpeedChallengeModel1(nn.Module):
    
    # no normalization layer since already done in preprocessing
    def __init__(self):
        super(SpeedChallengeModel1,self).__init__()
        
        # input shape = (batch_size, 3, 66, 220)
        self.conv2d_0 = Conv2d(3,24,5, stride = 2, padding = 1 )
        self.conv2d_1 = Conv2d(24,36,5, stride = 2, padding = 1 )
        self.conv2d_2 = Conv2d(36,48,5, stride = 2, padding = 1 )
        self.conv2d_3 = Conv2d(48,64,3, stride = 1, padding = 1 )
        self.conv2d_4 = Conv2d(64,64,3, stride = 1, padding = 1 )
        
        # elu activation function
        self.elu = ELU()
        
        # dropout layer
        self.dropout_layer = nn.Dropout2d(p = 0.5 )
        
        # flatten layer
        self.flatten_layer = Flatten()
        
        # fully connected layers
        self.linear0 = Linear(11648,100)
        self.linear1 = Linear(100,50)
        self.linear2 = Linear(50,10)
        self.linear3 = Linear(10,1)
        
        # initialize weights
        for m in self.modules():
            if isinstance(m, nn.Conv2d) or isinstance(m,nn.Linear):
                nn.init.kaiming_normal_(m.weight,a=0.1,mode="fan_in")
        
        
    
    def forward(self, x):
        #print(x[0])
        print(x.type())
        x = self.elu(self.conv2d_0(x))
        x = self.elu(self.conv2d_1(x))
        x = self.elu(self.conv2d_2(x))
        
        x = self.dropout_layer(x)
        x = self.elu(self.conv2d_3(x))
        x = self.conv2d_4(x)
        x = self.elu(self.flatten_layer(x))
        #print(x.shape)
        
        x = self.elu(self.linear0(x))
        x = self.elu(self.linear1(x))
        x = self.elu(self.linear2(x))
        
        x = self.linear3(x)
        
        return x
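
As a quick sanity check (a sketch with random input of the documented shape), the output shape and dtype can be verified like this:

model = SpeedChallengeModel1()
dummy = torch.randn(2, 3, 66, 220)   # matches the documented input shape
out = model(dummy)
print(out.shape, out.dtype)          # torch.Size([2, 1]) torch.float32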

And this is my training/validation loop

print("*************Training Model****************")

# get train and validation dataset and dataset loader
tensor_train_df = SpeedChallengeDataSet(csv_file=train_df, transforms=transforms.Compose([ToTensor()]))
tensor_val_df = SpeedChallengeDataSet(csv_file=val_df, transforms=transforms.Compose([ToTensor()]))

train_data_loader = torch.utils.data.DataLoader(tensor_train_df, batch_size=8)# ,num_workers=2)
val_data_loader = torch.utils.data.DataLoader(tensor_val_df, batch_size=8)#, num_workers=2)


# enable gpu device
use_cuda = torch.cuda.is_available()
if torch.cuda.is_available():
    dev = "cuda:0"
else:
    dev = "cpu"
    
device = torch.device(dev)

learning_rate = 1e-6
batch_size = 8

n_epoch = 100

# model init
model = SpeedChallengeModel1()
model = model.float()
model.to(device)
# optimizer and loss criterion
optimizer = torch.optim.Adam(model.parameters(), lr= learning_rate)
mse_full = nn.MSELoss(reduction="none")
#mse_mean = nn.MSELoss()
train_losses = []
val_losses = []
epoch_pbar = tqdm(total = n_epoch, desc="Epochs")
for epoch_i in range(n_epoch):
    print( f"Epoch {epoch_i+1}/{n_epoch}" )
    print("-" * 15)
    
    
    # for each epoch do both training and validation
    for phase in ["train","val"]:
        if phase == "train":
            for batch in train_data_loader:
                
                batch_datas = batch["flow"]
                batch_labels = batch["avg_speed"]
                
                #print(type(batch_datas))
                #print(batch_datas.size())
                # transfer to gpu
                batch_datas = batch_datas.to(device) 
                batch_labels = batch_labels.to(device)
                
                #print(type(batch_datas))
                #print(batch_datas.size())
                #print(batch_datas)
                
                # predict speed
                pred = model(batch_datas)
                
                loss_each = mse_full( pred.flatten(), batch_labels )
                #mean_loss = mse_mean(pred, batch_labels)
                #train_losses.append(loss_each.cpu().detach().numpy())
                loss_all = torch.mean(loss_each)
                #loss_all = loss_all.float()
                #loss_all = loss_all.type(torch.cuda.FloatTensor)
                loss_all.backward()
                train_losses.append(loss_each.detach())
                
                
                #full_loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                
                
        # Validation 
        elif phase == "val":
            with torch.set_grad_enabled(False):
                for val_batch in val_data_loader:
                    
                    batch_datas = val_batch["flow"].to(device)
                    batch_labels = val_batch["avg_speed"].to(device)
                    pred = model(batch_datas)
                    full_loss = mse_full( pred.flatten(), batch_labels )
                    val_losses.append(full_loss)
    
    epoch_pbar.update(1)

When the code reaches loss_all.backward(), I get this error.
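
Printing the dtypes right before the loss should show where the Double comes from (a quick check, assuming the loop above):

print(pred.dtype, batch_labels.dtype)   # e.g. torch.float32 vs. torch.float64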

Thanks for the code snippet.
I cannot reproduce the error using your model and this code:

model = SpeedChallengeModel1()
model = model.float()

mse_full = nn.MSELoss(reduction="none")
batch_datas = torch.randn(2, 3, 66, 220)
batch_labels = torch.randn(2)
                
# predict speed
pred = model(batch_datas)            
loss_each = mse_full( pred.flatten(), batch_labels )
loss_all = torch.mean(loss_each)
loss_all.backward()

Does that mean there could be a problem in my preprocessing, then? I used OpenCV for some of the preprocessing.

I'm not sure what might be creating the issue; we would need an executable code snippet to debug further. :confused:

Could you try to use random data, stick to your current pipeline as closely as possible, and check if this reproduces the error?
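
For example (a rough sketch that only mimics the conversion step, since SpeedChallengeDataSet and ToTensor are not shown), NumPy arrays created during OpenCV-style preprocessing default to float64, and torch.from_numpy keeps that dtype:

import numpy as np
import torch

flow = np.random.rand(3, 66, 220)     # NumPy defaults to float64
speed = np.array(12.3)                # e.g. a label read from the CSV

print(torch.from_numpy(flow).dtype)   # torch.float64
print(torch.from_numpy(speed).dtype)  # torch.float64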

Maybe you should cast loss_all to dtype=torch.float32.
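
For example (a sketch; if the Double comes from the labels, casting them to float32 before computing the loss is usually the more direct fix):

loss_all = loss_all.to(torch.float32)   # cast the loss itself, as suggested above
# or cast the target where it is created, e.g.:
# batch_labels = batch_labels.float()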