Hi,
I created a CNN in PyTorch for a multi-class, multi-label classification problem.
import torch
import torch.nn as nn

class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.ConvLayer1 = nn.Sequential(
            nn.Conv2d(3, 16, 5),
            nn.MaxPool2d(2),
            nn.ReLU(),
        )
        self.ConvLayer2 = nn.Sequential(
            nn.Conv2d(16, 32, 5),
            nn.MaxPool2d(2),
            nn.ReLU(),
        )
        self.ConvLayer3 = nn.Sequential(
            nn.Conv2d(32, 64, 5),
            nn.MaxPool2d(2),
            nn.ReLU(),
        )
        self.ConvLayer4 = nn.Sequential(
            nn.Conv2d(64, 32, 5),
            nn.MaxPool2d(2),
            nn.ReLU(),
            # nn.Dropout(0.2, inplace=True),
        )
        # 224x224 input -> 10x10 feature maps after four conv/pool blocks
        self.Linear1 = nn.Linear(32 * 10 * 10, 2048)
        self.Linear2 = nn.Linear(2048, 1024)
        self.Linear3 = nn.Linear(1024, 512)
        self.Linear4 = nn.Linear(512, 5)

    def forward(self, x):
        x = self.ConvLayer1(x)
        x = self.ConvLayer2(x)
        x = self.ConvLayer3(x)
        x = self.ConvLayer4(x)
        # print(x.shape)
        x = x.view(-1, 32 * 10 * 10)
        # print(x.shape)
        x = self.Linear1(x)
        x = self.Linear2(x)
        x = self.Linear3(x)
        x = self.Linear4(x)
        return torch.sigmoid(x)
labels = ["desert","mountains","sea","sunset","trees"]
net = Net()
#criterion = torch.nn.BCEWithLogitsLoss()
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.0001)
threshold = 0.7
n_epochs = 3
history = {"train_loss_mean":[], "train_acc_mean":[], "test_loss_mean":[], "test_acc_mean":[]}
for epoch in range(n_epochs):
""" train mode """
net.train()
train_loss = 0.0
for inputs, labels in train_dataloader:
optimizer.zero_grad()
outputs = net(inputs)
print (outputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
train_loss += loss.item()
train_loss_mean = train_loss / len(train_dataloader)
print("Epoch: {}; Loss(mean): {}".format(epoch, train_loss_mean))
history["train_loss_mean"].append(train_loss_mean)
The dataset consists of 2,000 landscape images with 5 possible labels ("desert", "mountains", "sea", "sunset", "trees"); the image data was obtained from Kaggle.
Each image is resized to 224 x 224, converted to a tensor, and normalized:
self.transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])
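For context, train_dataloader is built from a Dataset that pairs each image with a 5-element multi-hot vector of 0/1 floats. The sketch below is a simplified placeholder (LandscapeDataset, train_samples, and batch_size=32 are not my exact code), but it reflects the shape of the data the network receives:

import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader

class LandscapeDataset(Dataset):
    def __init__(self, samples, transform):
        self.samples = samples      # list of (image_path, multi-hot list of length 5)
        self.transform = transform  # the Compose shown above

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        path, target = self.samples[idx]
        img = self.transform(Image.open(path).convert("RGB"))
        # BCELoss expects float targets with one value per label
        return img, torch.tensor(target, dtype=torch.float32)

train_dataloader = DataLoader(
    LandscapeDataset(train_samples, transform), batch_size=32, shuffle=True)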
The output is a probability (0-1) for each label
(e.g. desert: 0.52, mountains: 0.83, sea: 0, sunset: 0.12, trees: 0.26),
so the output layer applies the sigmoid function:
return torch.sigmoid(x)
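The threshold = 0.7 defined above is intended to turn these probabilities into predicted label names, along these lines (a rough sketch, not part of the training loop shown):

# Rough sketch: map per-label probabilities to label names with the 0.7 threshold
label_names = ["desert", "mountains", "sea", "sunset", "trees"]
probs = net(inputs)                    # shape (batch_size, 5), each value in (0, 1)
predicted = (probs > threshold).int()  # 1 where the probability exceeds the threshold
for row in predicted:
    print([label_names[i] for i, flag in enumerate(row) if flag])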
Each batch is processed by feeding the data (image tensor, label vector) to the network and printing the result:
for inputs, labels in train_dataloader:
    optimizer.zero_grad()
    outputs = net(inputs)
    print(outputs)
The content of outputs is the probability of each label. I expected these probabilities to improve as training progresses over batches and epochs.
ex)
epoch 1:   tensor([0.52, 0.55, 0.23, 0.11, 0.32])
epoch 100: tensor([0.98, 0.83, 0, 0, 0.12])
However, in reality, they are getting worse.
iteration 1:
tensor([[0.4911, 0.5088, 0.4933, 0.4954, 0.5039],
        [0.4914, 0.5085, 0.4938, 0.4958, 0.5038],
        [0.4908, 0.5082, 0.4936, 0.4962, 0.5036],
iteration 30:
tensor([[7.1020e-01, 1.0376e-01, 2.8710e-01, 7.2361e-02, 6.9248e-02],
        [5.2053e-02, 1.5246e-01, 3.2112e-01, 5.2325e-01, 1.5300e-01],
        [4.0268e-02, 4.2312e-01, 3.4056e-01, 3.7324e-02, 2.2200e-01],
Am I making a big mistake?
Thank you!