Hi, I’m facing a weird issue where training slows down drastically after exactly 5000 epochs. At that point the computation appears to shift from the GPU to the CPU: GPU usage drops to 0% while the CPU is maxed out. I have an RTX 3070 Ti with 8 GB of VRAM, and the run never uses more than 1 GB of it during training, yet the work still moves to the CPU.
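To rule out an accidental move to the CPU, this is the kind of check I can drop into the epoch loop (a minimal sketch; net, x_batch, and device come from the training code below, and T is torch imported as T):

import time  # at the top of the script

# at the start of each epoch:
epoch_start = time.time()

# after the batch loop, every 500 epochs:
if epoch % 500 == 0:
    print(f"epoch {epoch + 1} took {time.time() - epoch_start:.3f}s")
    print("model on:", next(net.parameters()).device)  # expect cuda:0
    print("batch on:", x_batch.device)                 # expect cuda:0
    print("allocated:", T.cuda.memory_allocated() // 2**20, "MiB")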
Here’s my model:
import torch as T
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Simple 3-layer MLP regressor: 30 inputs -> 1 output."""

    def __init__(self, hidden_size=128):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(30, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 1)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.dropout(F.relu(self.fc2(x)))
        return self.fc3(x)
This is my training loop:
from torch.utils.tensorboard import SummaryWriter
import matplotlib.pyplot as plt

# defined earlier in my script; x_train/y_train are prepared there too,
# and x_test/y_test are float32 tensors already on the device
device = T.device("cuda" if T.cuda.is_available() else "cpu")

epochs_list = [10000]
batch_size_list = [4000]
learning_rate_list = [0.001]
models = [Net(hidden_size=128).to(device)]

for net in models:
    for epochs in epochs_list:
        for batch_size in batch_size_list:
            for learning_rate in learning_rate_list:
                optimizer = T.optim.Adam(net.parameters(), lr=learning_rate)
                batches = len(x_train) // batch_size
                writer = SummaryWriter()
                criterion = nn.MSELoss()
                for epoch in range(epochs):
                    for batch in range(batches):
                        # slice the current mini-batch and copy it to the device
                        x_batch = T.tensor(
                            x_train[batch * batch_size : (batch + 1) * batch_size],
                            dtype=T.float32,
                            device=device,
                        )
                        y_batch = T.tensor(
                            y_train[batch * batch_size : (batch + 1) * batch_size],
                            dtype=T.float32,
                            device=device,
                        )
                        optimizer.zero_grad()
                        outputs = net(x_batch)
                        loss = criterion(outputs, y_batch)
                        loss.backward()
                        optimizer.step()
                    print(f"Epoch: {epoch+1}/{epochs}, Loss: {loss.item():.8f}")
                    writer.add_scalar("Loss/train", loss.item(), epoch)
                # evaluate on the held-out set
                net.eval()
                with T.no_grad():
                    outputs = net(x_test)
                    loss = criterion(outputs, y_test)
                    print(f"Test loss: {loss.item():.8f}")
                    r2 = 1 - (
                        T.sum((y_test - outputs) ** 2)
                        / T.sum((y_test - T.mean(y_test)) ** 2)
                    )
                    print("R^2 value:", r2.item())
                    plt.scatter(y_test.cpu().numpy(), outputs.cpu().numpy())
                    plt.xlabel("Actual Amps")
                    plt.ylabel("Predicted Amps")
                    plt.title(
                        f"e_{epochs}-bs_{batch_size}-lr_{learning_rate} R^2 value: {r2.item()}"
                    )
                    writer.add_figure("Scatter", plt.gcf(), global_step=epochs)
                    writer.flush()
                    writer.close()
                net.train()
                T.save(
                    net.state_dict(),
                    f"e_{epochs}-bs_{batch_size}-lr_{learning_rate}.pt",
                )
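One thing I’m wondering about: x_batch and y_batch are rebuilt with T.tensor(..., device=device) on every step, which copies from host memory to the GPU each batch. A sketch of what I could try instead, moving the whole training set to the GPU once before the loops (assuming x_train and y_train are NumPy arrays that fit in the 8 GB of VRAM):

# move the full dataset to the device once, outside all the loops
x_train_gpu = T.tensor(x_train, dtype=T.float32, device=device)
y_train_gpu = T.tensor(y_train, dtype=T.float32, device=device)

for epoch in range(epochs):
    for batch in range(batches):
        # plain slicing of GPU tensors returns views, no host<->device copy
        x_batch = x_train_gpu[batch * batch_size : (batch + 1) * batch_size]
        y_batch = y_train_gpu[batch * batch_size : (batch + 1) * batch_size]
        optimizer.zero_grad()
        loss = criterion(net(x_batch), y_batch)
        loss.backward()
        optimizer.step()

I haven’t confirmed this fixes the slowdown; it just rules out the per-batch copies. Any idea what else could make the work fall back to the CPU after exactly 5000 epochs?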