Use Pearson Correlation Coefficient as cost function

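For reference, the quantity I am using as the loss is the batch Pearson correlation: with vx = score - mean(score) and vy = target - mean(target), r = sum(vx*vy) / (sqrt(sum(vx^2)) * sqrt(sum(vy^2))). As I understand it, since the optimizer minimizes, one usually trains on -r or 1 - r rather than r itself.
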
There still seems to be a problem; my implementation is the following:

import os

import torch
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchnet import meter

# models, opt, Visualizer, STSDataset, and val() come from elsewhere in my project

def train(**kwargs):
    #torch.manual_seed(100) # 10, 100, 666, 
    opt.parse(kwargs)
    vis = Visualizer(opt.env)

    # step1: configure model
    model = getattr(models, opt.model)()
    if opt.load_model_path:
        model.load(opt.load_model_path)
    if opt.use_gpu:
        model.cuda()

    # step2: load data
    train_data = STSDataset(opt.train_data_path)
    val_data = STSDataset(opt.train_data_path)  # note: this points at the training data; should it be a separate validation path?
    train_dataloader = DataLoader(train_data, opt.batch_size,
                                  shuffle=True,
                                  num_workers=opt.num_workers)
    val_dataloader = DataLoader(val_data, opt.batch_size,
                                shuffle=False,
                                num_workers=opt.num_workers)
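    # cache the preprocessed features/targets to disk (presumably so they can be reloaded later)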
    torch.save(train_data.X, opt.train_features_path)
    torch.save(train_data.y, opt.train_targets_path)

    # step3: set criterion and optimizer
    criterion = torch.nn.MSELoss()  # defined but unused below, since the Pearson loss replaces it
    lr = opt.lr
    optimizer = torch.optim.Adam(model.parameters(), lr=lr,
                                 weight_decay=opt.weight_decay)
    #optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)

    # step4: set meters
    loss_meter = meter.MSEMeter()  # note: this tracks MSE between score and target, not the Pearson loss itself
    previous_loss = 1e100

    # train
    for epoch in range(opt.max_epoch):
        loss_meter.reset()

        for ii, (data, label) in enumerate(train_dataloader):
            # train model on a batch data
            input = Variable(data)
            target = Variable(label.float())  # same as torch.FloatTensor(label.numpy()), just simpler
            if opt.use_gpu:
                input = input.cuda()
                target = target.cuda()
            optimizer.zero_grad()
            score = model(input)
            #loss = criterion(score, target)        # use MSE loss function
            vx = score - torch.mean(score)
            vy = target - torch.mean(target)
            loss = torch.sum(vx * vy) / (torch.sqrt(torch.sum(vx ** 2)) * torch.sqrt(torch.sum(vy ** 2)))  # use Pearson correlation
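            # caution: if score or target is constant across the batch, vx or vy
            # is all zeros and this evaluates 0/0 -> NaN; once the loss is NaN,
            # backward() makes the gradients and weights NaN as well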

            loss.backward()
            optimizer.step()

            # update meters and visualize
            loss_meter.add(score.data, target.data)

            if ii % opt.print_freq == opt.print_freq - 1:
                vis.plot('loss', loss_meter.value())

                # enter debug mode
                if os.path.exists(opt.debug_file):
                    import ipdb
                    ipdb.set_trace()

        # save model for each epoch
        #model.save()

        # validate and visualize
        val_mse, val_pearsonr = val(model, val_dataloader)

        vis.plot('val_mse', val_mse)
        vis.plot('pearson', val_pearsonr)
        vis.log("epoch:{epoch},lr:{lr},\
            loss:{loss},val_mse:{val_mse},val_pearson:{val_pearson}".format(
            epoch=epoch,
            lr=lr,
            loss=loss_meter.value(),
            val_mse=str(val_mse),
            val_pearson=str(val_pearsonr)))

        # update learning rate: decay when the epoch's MSE increased
        if loss_meter.value() > previous_loss:
            lr = lr * opt.lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        previous_loss = loss_meter.value()

And when I check the output, the MSE and Pearson correlation are all NaN.
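
Is something like the following the right way to guard against this? Just a minimal sketch (the helper name pearson_loss and the eps value are my own placeholders): it adds an epsilon so that a zero-variance batch cannot produce a 0/0, and returns 1 - r so that minimizing the loss pushes the correlation toward +1 instead of -1.

import torch

def pearson_loss(score, target, eps=1e-8):
    # center both vectors within the batch
    vx = score - torch.mean(score)
    vy = target - torch.mean(target)
    # Pearson correlation; eps keeps the denominator nonzero when the
    # batch scores or targets happen to be constant (zero variance)
    r = torch.sum(vx * vy) / (torch.sqrt(torch.sum(vx ** 2)) *
                              torch.sqrt(torch.sum(vy ** 2)) + eps)
    # minimizing 1 - r maximizes the correlation; minimizing r itself
    # would drive it toward -1
    return 1.0 - r

With this, the training step above would just be loss = pearson_loss(score, target), and the loss stays in [0, 2], which also makes it easier to plot.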