Hi,
I am new to PyTorch, and I followed this tutorial to apply Ray Tune to a CNN model.
I get this error:
TypeError: '<' not supported between instances of 'Float' and 'float'
It's raised by config["lr"] in optimizer = torch.optim.SGD(cnn.parameters(), lr=config["lr"], momentum=0.9).
I have tried casting config["lr"] to float, but that doesn't work because the type of config["lr"] is ray.tune.sample.Float. Any idea how to convert it to a plain Python float?
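To rule out my training loop, this minimal snippet (just the search-space definition, nothing else) reproduces the same non-float type on my setup:

from ray import tune

lr = tune.loguniform(1e-4, 1e-1)
print(type(lr))  # <class 'ray.tune.sample.Float'> here, not <class 'float'>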
Here is my code for reference:
import os

import numpy as np
import torch
from torch import nn
from torch.nn import Sequential, Conv2d, AvgPool2d, Flatten, Linear
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision import datasets
from torchvision.transforms import ToTensor
from ray import tune
# Download the FashionMNIST training set (60,000 images)
training_data = datasets.FashionMNIST(
root="data",
train=True,
download=True,
transform=ToTensor(),
)
# Download the FashionMNIST test set (10,000 images)
test_data = datasets.FashionMNIST(
root="data",
train=False,
download=True,
transform=ToTensor(),
)
# Split the training indices into train/validation subsets
indices = list(range(len(training_data)))
np.random.shuffle(indices)
train_sample = SubsetRandomSampler(indices[:50000]) # 50000
valid_sample = SubsetRandomSampler(indices[50000:]) # 10000
print(len(train_sample))  # 50000
class CNN(nn.Module):
def __init__(self):
super(CNN, self).__init__()
self.model1=Sequential(
Conv2d(1,32,5, padding=2),
AvgPool2d(2),
Conv2d(32,32,5, padding=2),
AvgPool2d(2),
Conv2d(32,64,5, padding=2),
AvgPool2d(2),
Conv2d(64,64,5, padding=2),
AvgPool2d(2),
Flatten(),
Linear(64,64),
Linear(64,10),
)
def forward(self, x):
return self.model1(x)
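(For context: FashionMNIST images are 1x28x28, and each AvgPool2d(2) halves the spatial size, 28 -> 14 -> 7 -> 3 -> 1, so Flatten yields 64 features and the Linear(64, 64) input size matches. A quick shape check of the model:)

x = torch.randn(1, 1, 28, 28)  # dummy batch of one FashionMNIST-sized image
print(CNN()(x).shape)          # torch.Size([1, 10])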
# Trainable function (name kept from the tutorial; it trains on FashionMNIST here)
def train_cifar(config, checkpoint_dir=None, data_dir=None):
cnn = CNN()
device = "cpu"
if torch.cuda.is_available():
device = "cuda:0"
if torch.cuda.device_count() > 1:
cnn = nn.DataParallel(cnn)
cnn.to(device)
criterion = nn.CrossEntropyLoss()
    # Debugging: inspect what Ray actually passed in for "lr"
    print(config["lr"])
    print(type(config["lr"]))
    print(str(config["lr"]))
    optimizer = torch.optim.SGD(cnn.parameters(), lr=config["lr"], momentum=0.9)  # <-- the TypeError is raised here
if checkpoint_dir:
model_state, optimizer_state = torch.load(
os.path.join(checkpoint_dir, "checkpoint"))
cnn.load_state_dict(model_state)
optimizer.load_state_dict(optimizer_state)
    trainloader = torch.utils.data.DataLoader(
        training_data,
        sampler=train_sample,  # a sampler is mutually exclusive with shuffle=True, so no shuffle here
        batch_size=int(config["batch_size"]))
    valloader = torch.utils.data.DataLoader(
        training_data,
        sampler=valid_sample,
        batch_size=int(config["batch_size"]))
for epoch in range(50): # loop over the dataset multiple times
running_loss = 0.0
epoch_steps = 0
for i, data in enumerate(trainloader, 0):
# get the inputs; data is a list of [inputs, labels]
inputs, labels = data
inputs, labels = inputs.to(device), labels.to(device)
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outputs = cnn(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
# print statistics
running_loss += loss.item()
epoch_steps += 1
if i % 2000 == 1999: # print every 2000 mini-batches
print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
running_loss / epoch_steps))
running_loss = 0.0
# Validation loss
val_loss = 0.0
val_steps = 0
total = 0
correct = 0
for i, data in enumerate(valloader, 0):
with torch.no_grad():
inputs, labels = data
inputs, labels = inputs.to(device), labels.to(device)
outputs = cnn(inputs)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
loss = criterion(outputs, labels)
val_loss += loss.cpu().numpy()
val_steps += 1
with tune.checkpoint_dir(epoch) as checkpoint_dir:
path = os.path.join(checkpoint_dir, "checkpoint")
torch.save((cnn.state_dict(), optimizer.state_dict()), path)
tune.report(loss=(val_loss / val_steps), accuracy=correct / total)
print("Finished Training")
config = {
"lr": tune.loguniform(1e-4, 1e-1),
"batch_size": tune.choice([16,32,64,128,256])
}
train_cifar(config)
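For completeness, the tutorial I followed launches the trainable through tune.run instead of calling it directly, roughly like the sketch below (the num_samples and resources values are placeholders, not the tutorial's exact settings):

from functools import partial

result = tune.run(
    partial(train_cifar, data_dir=None),
    config=config,
    num_samples=10,                  # placeholder: number of trials to sample
    resources_per_trial={"cpu": 2},  # placeholder resource request
)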