While training the model, I am getting the following error, and I am unable to solve it. Could you please help me? I have provided the code and the error message below. Thank you.
Dataloader Part:
Radio_train = loaders_NTU.RadioUNet_c(phase="train")
#Radio_train_NTU = loaders.RadioUNet_c(phase="abc")
Radio_val = loaders_NTU.RadioUNet_c(phase="val")
Radio_test = loaders_NTU.RadioUNet_c(phase="test")

#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
generator = torch.Generator(device='cuda')

image_datasets = {
    'train': Radio_train, 'val': Radio_val
}

batch_size = 15

dataloaders = {
    'train': DataLoader(Radio_train, batch_size=batch_size, shuffle=True, num_workers=1, generator=generator),  #generator = torch.Generator(device='cuda')
    'val': DataLoader(Radio_val, batch_size=batch_size, shuffle=True, num_workers=1, generator=generator),  #generator = torch.Generator(device='cuda')
}

# Define the generator and add it to the dictionary
dataloaders['generator'] = generator
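
For context, this is roughly what I understand the DataLoader's RandomSampler does with the generator I pass in. This is a simplified sketch I wrote myself, based on the sampler.py frame in the traceback at the end of this post, not the actual PyTorch source, and shuffled_indices is just a name I made up:

import torch

def shuffled_indices(n, generator):
    # Simplified version of how the sampler seems to draw shuffle indices
    # with the DataLoader's generator (see the traceback below).
    return list(map(int, torch.randperm(n, generator=generator).numpy()))

print(shuffled_indices(5, torch.Generator()))  # a CPU generator should work here
# With generator = torch.Generator(device='cuda'), the randperm result appears
# to live on the GPU, so the .numpy() call raises the TypeError shown below.

Is that the right way to read the traceback, and if so, how should I be creating or passing the generator here?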
Training loop:
import torch
import torch.optim as optim
from torch.optim import lr_scheduler
import time
import copy
from collections import defaultdict
import torch.nn.functional as F
import torch.nn as nn
def calc_loss_dense(pred, target, metrics):
    criterion = nn.MSELoss()
    loss = criterion(pred, target)
    metrics['loss'] += loss.data.cpu().numpy() * target.size(0)
    return loss

def calc_loss_sparse(pred, target, samples, metrics, num_samples):
    criterion = nn.MSELoss()
    loss = criterion(samples*pred, samples*target)*(256**2)/num_samples
    metrics['loss'] += loss.data.cpu().numpy() * target.size(0)
    return loss

def print_metrics(metrics, epoch_samples, phase):
    outputs1 = []
    outputs2 = []
    for k in metrics.keys():
        outputs1.append("{}: {:4f}".format(k, metrics[k] / epoch_samples))
    print("{}: {}".format(phase, ", ".join(outputs1)))
def train_model(model, optimizer, scheduler, num_epochs=50, WNetPhase="firstU", targetType="dense", num_samples=300, device='cuda:0'):
    # WNetPhase: train the first U and freeze the second ("firstU"), or vice versa ("secondU").
    # targetType: train against dense images ("dense") or sparse measurements ("sparse")
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = 1e10

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        since = time.time()

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                scheduler.step()
                for param_group in optimizer.param_groups:
                    print("learning rate", param_group['lr'])
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            metrics = defaultdict(float)
            epoch_samples = 0

            if targetType == "dense":
                for inputs, targets in dataloaders[phase]:
                    inputs = inputs.to(device)
                    targets = targets.to(device)

                    # zero the parameter gradients
                    optimizer.zero_grad()

                    # forward
                    # track history only if in train
                    with torch.set_grad_enabled(phase == 'train'):
                        [outputs1, outputs2] = model(inputs)
                        if WNetPhase == "firstU":
                            loss = calc_loss_dense(outputs1, targets, metrics)
                        else:
                            loss = calc_loss_dense(outputs2, targets, metrics)

                        # backward + optimize only if in training phase
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()

                    # statistics
                    epoch_samples += inputs.size(0)

            elif targetType == "sparse":
                for inputs, targets, samples in dataloaders[phase]:
                    inputs = inputs.to(device)
                    targets = targets.to(device)
                    samples = samples.to(device)

                    # zero the parameter gradients
                    optimizer.zero_grad()

                    # forward
                    # track history only if in train
                    with torch.set_grad_enabled(phase == 'train'):
                        [outputs1, outputs2] = model(inputs)
                        if WNetPhase == "firstU":
                            loss = calc_loss_sparse(outputs1, targets, samples, metrics, num_samples)
                        else:
                            loss = calc_loss_sparse(outputs2, targets, samples, metrics, num_samples)

                        # backward + optimize only if in training phase
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()

                    # statistics
                    epoch_samples += inputs.size(0)

            print_metrics(metrics, epoch_samples, phase)
            epoch_loss = metrics['loss'] / epoch_samples

            # deep copy the model
            if phase == 'val' and epoch_loss < best_loss:
                print("saving best model")
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())

        time_elapsed = time.time() - since
        print('{:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

    print('Best val loss: {:4f}'.format(best_loss))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model
Training the Unet:
import torch
import torch.optim as optim
from torch.optim import lr_scheduler
import time
import copy

# Determine the device (CPU or CUDA)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)  # Move the model to the selected device

#dataloaders['generator'] = torch.Generator(device=device)
# Move the generator to the selected device
#generator = torch.Generator(device=device)
# Ensure the generator is on the same device as the model
#generator = torch.Generator(device=device)

# Define your optimizer
optimizer_ft = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4)

# Learning rate scheduler
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=30, gamma=0.1)

# Train your model
model = train_model(model, optimizer_ft, exp_lr_scheduler, device=device)
The error message:
TypeError                                 Traceback (most recent call last)
Cell In[36], line 29
     26 exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=30, gamma=0.1)
     28 # Train your model
---> 29 model = train_model(model, optimizer_ft, exp_lr_scheduler, device=device)

Cell In[35], line 62, in train_model(model, optimizer, scheduler, num_epochs, WNetPhase, targetType, num_samples, device)
     58 epoch_samples = 0
     61 if targetType=="dense":
---> 62     for inputs, targets in dataloaders[phase]:
     63         inputs = inputs.to(device)
     64         targets = targets.to(device)

File ~\AppData\Local\anaconda3\envs\gpu_env\Lib\site-packages\torch\utils\data\dataloader.py:438, in DataLoader.__iter__(self)
    436     return self._iterator
    437 else:
--> 438     return self._get_iterator()

File ~\AppData\Local\anaconda3\envs\gpu_env\Lib\site-packages\torch\utils\data\dataloader.py:386, in DataLoader._get_iterator(self)
    384 else:
    385     self.check_worker_number_rationality()
--> 386     return _MultiProcessingDataLoaderIter(self)

File ~\AppData\Local\anaconda3\envs\gpu_env\Lib\site-packages\torch\utils\data\dataloader.py:1084, in _MultiProcessingDataLoaderIter.__init__(self, loader)
   1082 _utils.signal_handling._set_SIGCHLD_handler()
   1083 self._worker_pids_set = True
-> 1084 self._reset(loader, first_iter=True)

File ~\AppData\Local\anaconda3\envs\gpu_env\Lib\site-packages\torch\utils\data\dataloader.py:1117, in _MultiProcessingDataLoaderIter._reset(self, loader, first_iter)
   1115 # prime the prefetch loop
   1116 for _ in range(self._prefetch_factor * self._num_workers):
-> 1117     self._try_put_index()

File ~\AppData\Local\anaconda3\envs\gpu_env\Lib\site-packages\torch\utils\data\dataloader.py:1351, in _MultiProcessingDataLoaderIter._try_put_index(self)
   1348 assert self._tasks_outstanding < self._prefetch_factor * self._num_workers
   1350 try:
-> 1351     index = self._next_index()
   1352 except StopIteration:
   1353     return

File ~\AppData\Local\anaconda3\envs\gpu_env\Lib\site-packages\torch\utils\data\dataloader.py:620, in _BaseDataLoaderIter._next_index(self)
    619 def _next_index(self):
--> 620     return next(self._sampler_iter)

File ~\AppData\Local\anaconda3\envs\gpu_env\Lib\site-packages\torch\utils\data\sampler.py:283, in BatchSampler.__iter__(self)
    281 batch = [0] * self.batch_size
    282 idx_in_batch = 0
--> 283 for idx in self.sampler:
    284     batch[idx_in_batch] = idx
    285     idx_in_batch += 1

File ~\AppData\Local\anaconda3\envs\gpu_env\Lib\site-packages\torch\utils\data\sampler.py:165, in RandomSampler.__iter__(self)
    163 else:
    164     for _ in range(self.num_samples // n):
--> 165         yield from map(int, torch.randperm(n, generator=generator).numpy())
    166     yield from map(int, torch.randperm(n, generator=generator)[:self.num_samples % n].numpy())

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.