Why would batch size not match?

baichuanzhou · January 17, 2023, 5:31pm

I tried to implement a char level RNN and used cross entropy loss as my criterion.
Here’s my RNN module:

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, n_layers=1):
        super(RNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        self.encoder = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.decoder = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden_state):
        x = self.encoder(x)
        out, hidden_state = self.gru(x.view(1, -1), hidden_state)
        print(out.shape)
        out = self.decoder(out)
        print(out.shape)
        return out, hidden_state

    def init_hidden(self):
        return Variable(torch.zeros(self.n_layers, self.hidden_size)).to(device='cuda')

Here’s my training function and eval function and others:

def evaluate(model, starter_char, hidden_state, predict_len=100, temperature=0.8, device='cuda'):
    model.eval()

    input_tensor = char2tensor(starter_char).to(device=device)

    predict_chars = starter_char

    for i in range(predict_len):
        output, hidden_state = model(input_tensor, hidden_state)

        # sample an index from softmax probs
        probs = torch.exp(output.data.view(-1).div(temperature)) / torch.sum(output.data.view(-1).div(temperature))
        idx = torch.multinomial(probs, 1)[0]

        # concatenate the char to the output and render it the next prediction
        prediction = all_characters[idx]
        predict_chars += prediction

        input_tensor = char2tensor(prediction)

    return predict_chars


def train(model, optimizer, chunk_len, loss_his, device='cuda',
          epoch=1000,
          print_every=100,
          plot_every=100):
    hidden_state = model.init_hidden()
    model.to(device=device)

    criterion = F.cross_entropy

    loss = torch.tensor([0.0], requires_grad=True)

    for i in range(epoch):

        model.train()

        inp, target = training_set(chunk_len)

        inp = inp.to(device=device)
        target = target.to(device=device)

        for c in range(chunk_len):
            output, hidden_state = model(inp[c], hidden_state)
            loss += criterion(output, target[c])

        loss /= chunk_len
        # Clear out the gradient in case they add up
        optimizer.zero_grad()
        # backward pass for loss
        loss.backward()
        # take the gradient step
        optimizer.step()

        with torch.no_grad():
            if i % print_every == 0:
                starter_idx = np.random.choice(n_characters)
                starter_char = all_characters[starter_idx]
                print(f"Current Loss:{loss.item()}")
                print(f"Input letter:{starter_char}")
                print("--------------------------------------------")
                text = evaluate(model, starter_char, hidden_state)
                print(text)
                print("--------------------------------------------")

            if i % plot_every == 0:
                loss_his.append(loss.item() / plot_every)

all_characters = string.printable
n_characters = len(all_characters)
hidden_size = 128

file = unidecode.unidecode(open("data/data.txt").read())

file_len = len(file)


def random_chunk(chunk_len):
    start_index = random.randint(0, file_len - chunk_len)
    end_index = start_index + chunk_len + 1
    return file[start_index:end_index]


def char2tensor(inputs):
    tensor = torch.zeros(len(inputs)).long()
    for c in range(len(inputs)):
        tensor[c] = all_characters.index(inputs[c])
    return Variable(tensor)

And when I tried to run these, this occurred:
ValueError: Expected input batch_size (1) to match target batch_size (0).
And when I tried to debug it, it seems that the output in the traing function is of size torch.Size([1, 100]) and target[c] is just a scalar. And whenever I tried to fix it with view() or squeeze(), it would suggest that it’s a leaf Variable and cannot use inplace operation.
Is there anything I can do to fix it?

vdw · January 18, 2023, 6:50am

What exactly is your task? A classification task?

If so, note that out contains the last hidden states (“last” w.r.t. the number of layers) for all time steps. For a classification task, you typically just use the hidden state of the last time step.

Additionally, you might want to have a look at this post to avoid any issues when using view().

baichuanzhou · January 18, 2023, 5:23pm

Thank you, I was trying to come up with a way to generate texts using character level RNN generator. This piece of code is very immature and I have found many bugs, as I’m still new to PyTorch. I found your help very helpful. Thank you very much.

Carlos_Segura · January 20, 2023, 8:07pm

I haven’t tried running your code but if you want to accumulate the loss in place, then do the following;


def train(...):
    ...
    # loss = torch.tensor([0.0], requires_grad=True) # replace this
    loss = torch.tensor([0.0])  # with this

    for i in range(epoch):
        ...
        for c in range(chunk_len):
            output, hidden_state = model(inp[c], hidden_state)
            loss += criterion(output, target[c])
        loss /= chunk_len
        ...
        # backward pass for loss
        loss.backward()  # execute all accumulated dependencies.

Carlos_Segura · January 20, 2023, 8:38pm

Here’s an example with similar semantics to your code that you can use to experiment;

import torch

data = torch.randn(3, 5)

a = data.clone().requires_grad_()
b = data.clone().requires_grad_()

loss_a = torch.relu(a).sum(0).mean()
loss_b = torch.tensor(0.)  # initialize the buffer

# Simulate accumulating the loss inside a loop with chunks.
chunks = b.unbind(0)
for i in range(len(chunks)):
    loss_b += chunks[i].relu().mean()

loss_a.backward()
loss_b.backward()

assert torch.allclose(loss_b, loss_a)
assert (b.grad == a.grad).all()

J_Johnson · January 21, 2023, 9:24am

Try changing this line:

loss += criterion(output, target[c:c+1])

There is a bug(or feature?) that drops the tensor down a dim when you use tensor[n], where n is an integer, but will keep the dim if you use tensor[n:n+1], instead.