import os
from pathlib import Path
import dill
import torch
import torchtext.data as data
from torch.utils.data import Sampler
import pandas as pd
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn import preprocessing
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
class Dataset(torch.utils.data.Dataset):
    """Characterizes a tabular CSV dataset for PyTorch.

    Loads a CSV, integer-encodes the target column 'Y', and exposes
    (features, label) pairs that torch's default_collate can batch.
    """

    def __init__(self, csv_file, dataset_name):
        """Initializes instance of class Dataset.

        Args:
            csv_file (str): Path to the csv file with the data.
            dataset_name (str): Kept for API compatibility; not used here.
        """
        df = pd.read_csv(csv_file, sep=',')
        # Encode labels in column 'Y' as integer codes 0..n-1 over the
        # sorted unique values — same mapping as sklearn's LabelEncoder,
        # but without the sklearn dependency.
        df['Y'] = df['Y'].astype('category').cat.codes
        # Save predictors (all columns except the last) and target.
        self.X = df.iloc[:, :-1]
        self.y = df['Y']

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # Convert idx from tensor to list due to pandas bug (arises when
        # using pytorch's random_split).
        if isinstance(idx, torch.Tensor):
            idx = idx.tolist()
        # BUG FIX: cast features to float32 so DataLoader's default_collate
        # never sees a dtype=object array (the reported
        # "TypeError: default_collate: ... found object").
        features = self.X.iloc[idx].values.astype('float32')
        # Positional access for the target too; plain self.y[idx] is
        # label-based and breaks when the CSV index is not 0..n-1.
        return [features, int(self.y.iloc[idx])]
def get_dataloaders(dataset_name, valid_size, batch_size, detector_name, subtrain_ratio=1.0, dataroot='.data'):
    """Build train/valid/test DataLoaders for a detector's filtered CSV.

    Args:
        dataset_name (str): Dataset folder name under "datasets".
        valid_size (int): Number of samples carved out of the training
            split for validation; 0 disables the validation loader.
        batch_size (int): Batch size for all loaders.
        detector_name (str): Detector subfolder containing filtered.csv.
        subtrain_ratio (float): Kept for API compatibility; unused here.
        dataroot (str): Kept for API compatibility; unused here.

    Returns:
        tuple: (trainloader, validloader, testloader, classes);
        validloader is None when valid_size == 0.
    """
    # NOTE(review): the CSV location is resolved relative to the current
    # working directory (four levels up) — confirm this matches the layout
    # the callers run from.
    csv_file = os.path.abspath(os.path.join(
        os.getcwd(), os.pardir, os.pardir, os.pardir, os.pardir,
        "datasets", dataset_name, detector_name, "filtered.csv"))
    dataset = Dataset(csv_file, dataset_name)

    # 80/20 train/test split.
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    trainset, testset = random_split(dataset, [train_size, test_size])

    if valid_size > 0:
        # BUG FIX: split sizes must sum to len(trainset) (== train_size).
        # The original passed [train_size, valid_size], which makes
        # random_split raise ValueError.
        trainset, validset = random_split(
            trainset, [train_size - valid_size, valid_size])
        validloader = DataLoader(validset, batch_size=batch_size, shuffle=True)
    else:
        validloader = None

    trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True)
    testloader = DataLoader(testset, batch_size=batch_size, shuffle=False)

    classes = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
    print('### Dataset ###')
    print(f'=>{dataset_name}')
    print(f' |Train size:\t{len(trainloader)}')
    if validloader is not None:
        print(f' |Valid size:\t{len(validloader)}')
    print(f' |Test size:\t{len(testloader)}')
    return trainloader, validloader, testloader, classes
def run_model(dataloader, encoder=None, device='cpu', criterion=None):
    """Run one no-grad evaluation pass over *dataloader*.

    BUG FIXES vs. the original fragment: the ``def`` line was missing its
    colon (SyntaxError), and the body referenced ``self.device`` /
    ``self.criterion`` outside any class plus an undefined ``encoder``.
    Those are now explicit keyword parameters (backward-compatible: the
    original one-argument call could never run).

    Args:
        dataloader: Iterable of (inputs, labels) batches.
        encoder: Model applied to the float-cast inputs.
        device: Device the batches are moved to.
        criterion: Loss comparing ``encoder`` outputs to long labels.

    Returns:
        list[float]: One loss value per batch.
    """
    losses = []
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = encoder(inputs.float())
            # CrossEntropy-style criteria require integer class labels.
            labels = labels.type(torch.LongTensor)
            loss = criterion(outputs, labels)
            losses.append(float(loss))
    return losses
# Original paste ended with the reported error and question text (kept as
# comments so the file parses):
# TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found object
# Any insights are helpful.