HI folks. I’m just having an unreasonable amount of difficulty shaping outputs to use in graph. Is anyone available to just talk about data types etc?
I’m going to include a bunch of code below, and I’d like to know how best to save my predictions and the associated labels (given that I’m shuffling between epochs). In particular I’d like to make a confusion matrix, but can’t seem to get my outputs into a single structure (list/array/tensor). I’m also interested in if there is a good way to include the datum id in here somewhere, so I can compare accuracy within some of the other fields I’m not using in training my model. I’d also be happy to know what I’m making harder than it needs to be and whats just bad code.
If this sort of code dump and request for general help is inappropriate for the forum, please let me know. I’m just so frustrated with how messy this project has been… I’m about to finish a masters degree and this shouldn’t be this hard…
Many many thanks for your help.
params = {'batch_size': 200,
'shuffle': True}
batch_size=200
learning_rate = 0.001
num_epochs = 5
...
train_pd #pandasDF with
# index='id'= 5 char string,
# additional data not used in training
# label= int in range (0,8),
# size, indices, values = sparse vector information
# val_pd=same
# Prep for Data Loader
partition={'train': train_pd.index,'val': val_pd.index}
labels_train=train_pd['label'].to_dict()
labels_val=val_pd['label'].to_dict()
i_train=train_pd['indices'].to_dict()
i_val=val_pd['indices'].to_dict()
v_train=train_pd['values'].to_dict()
v_val=val_pd['values'].to_dict()
inputSize=train_pd['size'][0]
class Dataset(data.Dataset):
'Characterizes a dataset for PyTorch'
def __init__(self, list_IDs, labels, indices,values,vecSize):
'Initialization'
self.labels = labels
self.list_IDs = list_IDs
self.i= indices
self.v =values
self.vecSize=vecSize
def __len__(self):
'Denotes the total number of samples'
return len(self.list_IDs)
def __getitem__(self, index):
'Generates one sample of data'
# Select sample
ID = self.list_IDs[index]
try:
# Load data and get label
X = torch.sparse.FloatTensor(
torch.LongTensor([self.i[ID]]),
torch.FloatTensor(self.v[ID]),
torch.Size([self.vecSize])
).to_dense()
except RuntimeError:
print(ID)
i = self.labels[ID]
y=np.zeros(8)
y[i]=1
return X, torch.LongTensor([i])
training_set = Dataset(partition['train'], labels_train,i_train,v_train,inputSize)
training_generator = data.DataLoader(training_set, **params)
validation_set = Dataset(partition['val'], labels_val,i_val,v_val,inputSize)
validation_generator = data.DataLoader(validation_set, **params)
# Model
model = nn.Sequential(
nn.Linear(inputSize,16),
nn.ReLU(),
nn.Linear(16,8)#,
)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Train the model
losses=[]
accuracies=[]
comparisons=[]
total_step = len(training_generator)
for epoch in range(3):
i=0
total=0
correct=0
for local_batch, local_labels in training_generator:
i+=1
local_batch, local_labels = local_batch.to(device), local_labels.to(device)
optimizer.zero_grad()
# Forward pass
outputs = model(local_batch)
total += local_labels.size(0)
labs=local_labels.flatten()
pred=np.argmax(outputs.detach().numpy(), axis=1)
correct += (pred == labs).sum().item()
acc=correct/total
loss = criterion(outputs, local_labels.squeeze())
losses.append(loss.item())
accuracies.append(acc)
if epoch==num_epochs-1:
comparisons.append([pred,labs])
# Backward and optimize
loss.backward()
optimizer.step()
if i%100==0:
print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Accuracy:{:,.4f}'
.format(epoch+1, num_epochs,i, total_step, loss.item(), acc))
# Validate the model
Vcomparisons=[]
total_step = len(validation_generator)
total=0
correct=0
Vaccuracies=[]
alllab=[]
allpred=[]
for local_batch, local_labels in validation_generator:
i+=1
local_batch, local_labels = local_batch.to(device), local_labels.to(device)
# Forward pass
outputs = model(local_batch)
total += local_labels.size(0)
labs=local_labels.flatten()
pred=np.argmax(outputs.detach().numpy(), axis=1)
correct += (pred == labs).sum().item()
alllab.append(labs.numpy())
allpred.append(pred)
acc=correct/total
Vaccuracies.append(acc)
Vcomparisons.append([pred,labs])
print ('Accuracy:{:,.4f}' .format( acc))
labs=alllab[0]
preds=allpred[0]
for i in range(1,len(alllab)-1):
print(labs, preds)
print(alllab[i],allpred[i])
labs=np.concatenate(labs,alllab[i], axis=1) #errors
preds=np.concatenate(preds,allpred[i],axis=1) #errors
#Eventually sklearn confusion matrix here