Precision 16 run problem

This is my code, written with PyTorch Lightning and running on a Google Colab GPU. I changed it to precision 16 and it was working fine previously, but it suddenly stopped working and the following error is raised on the line x1 = self.conv_1x1(x):

RuntimeError: dot : expected both vectors to have same dtype, but found Float and Half

This is my dataset:

from datasets import load_dataset
from numpy.random import default_rng
import numpy as np
import torch
import torchvision.transforms.functional as TF


class TFDataset(torch.utils.data.Dataset):
    def __init__(self, split):
        super().__init__()
        self.reader = load_dataset(
            "openclimatefix/nimrod-uk-1km", "sample", split=split, streaming=True
        )
        self.iter_reader = self.reader

    def __len__(self):
        return 1000

    def __getitem__(self, item):
        try:
            row = next(self.iter_reader)
        except Exception:
            # Restart the streaming iterator with a fresh shuffle once it is exhausted
            rng = default_rng()
            self.iter_reader = iter(
                self.reader.shuffle(seed=rng.integers(low=0, high=100000), buffer_size=10)
            )
            row = next(self.iter_reader)
        # extract_input_and_target_frames is a helper defined elsewhere in my code
        input_frames, target_frames = extract_input_and_target_frames(row["radar_frames"])

        # Resize the input frames to 32x32 (TF.resize operates on the trailing two dims)
        d_flat = np.moveaxis(input_frames, [0, 1, 2, 3], [0, 3, 2, 1])
        d_flat = torch.from_numpy(d_flat)
        resized_d_flat = TF.resize(d_flat, (32, 32))
        resized_d_flat = resized_d_flat.detach().cpu().numpy()
        input_frames = np.moveaxis(resized_d_flat, [0, 1, 2, 3], [0, 3, 2, 1])

        # Same resize for the target frames
        d_flat = np.moveaxis(target_frames, [0, 1, 2, 3], [0, 3, 2, 1])
        d_flat = torch.from_numpy(d_flat)
        resized_d_flat = TF.resize(d_flat, (32, 32))
        resized_d_flat = resized_d_flat.detach().cpu().numpy()
        target_frames = np.moveaxis(resized_d_flat, [0, 1, 2, 3], [0, 3, 2, 1])

        return np.moveaxis(input_frames, [0, 1, 2, 3], [0, 2, 3, 1]), np.moveaxis(
            target_frames, [0, 1, 2, 3], [0, 2, 3, 1]
        )
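
For reference, whatever dtype __getitem__ returns here is the dtype the DataLoader batches carry into training_step; a quick check (illustrative only, the split name is a guess) looks like:

# Illustrative dtype check on one sample
ds = TFDataset("train")
x0, y0 = ds[0]
print(x0.dtype, y0.dtype)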

This is the main code and classes:

def get_conv_layer(conv_type: str = "standard") -> torch.nn.Module:
    if conv_type == "standard":
        conv_layer = torch.nn.Conv2d
    elif conv_type == "3d":
        conv_layer = torch.nn.Conv3d
    else:
        raise ValueError(f"{conv_type} is not a recognized Conv method")
    return conv_layer


class DBlock(torch.nn.Module):
    def __init__(
            self,
            input_channels: int = 12,
            output_channels: int = 12,
            conv_type: str = "standard",
            first_relu: bool = True,
            keep_same_output: bool = False,
    ):
        super().__init__()
        self.input_channels = input_channels
        self.output_channels = output_channels
        self.first_relu = first_relu
        self.keep_same_output = keep_same_output
        self.conv_type = conv_type
        conv2d = get_conv_layer(conv_type)
        if conv_type == "3d":
            # 3D Average pooling
            self.pooling = torch.nn.AvgPool3d(kernel_size=2, stride=2)
        else:
            self.pooling = torch.nn.AvgPool2d(kernel_size=2, stride=2)
        self.conv_1x1 = spectral_norm(
            conv2d(
                in_channels=input_channels,
                out_channels=output_channels,
                kernel_size=1,
            )
        )
        self.first_conv_3x3 = spectral_norm(
            conv2d(
                in_channels=input_channels,
                out_channels=output_channels,
                kernel_size=3,
                padding=1,
            )
        )
        self.last_conv_3x3 = spectral_norm(
            conv2d(
                in_channels=output_channels,
                out_channels=output_channels,
                kernel_size=3,
                padding=1,
                stride=1,
            )
        )
        # Downsample at end of 3x3
        self.relu = torch.nn.ReLU()
        # Concatenate to double final channels and keep reduced spatial extent

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.input_channels != self.output_channels:
            x1 = self.conv_1x1(x)
            if not self.keep_same_output:
                x1 = self.pooling(x1)
        else:
            x1 = x

        if self.first_relu:
            x = self.relu(x)

        x = self.first_conv_3x3(x)
        x = self.relu(x)
        x = self.last_conv_3x3(x)

        if not self.keep_same_output:
            x = self.pooling(x)
        x = x1 + x  # Sum the two paths; the output should have half the spatial extent and double the channels
        return x
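
As a shape sanity check (the channel numbers here are made up for illustration, and this assumes the class and its imports are already in scope), the block halves the spatial extent while mapping to the requested output channels:

# Hypothetical shape check, not part of the original model
block = DBlock(input_channels=4, output_channels=96, conv_type="standard")
dummy = torch.randn(2, 4, 32, 32)   # (batch, channels, height, width)
print(block(dummy).shape)           # torch.Size([2, 96, 16, 16])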



class CNNModel(LightningModule):

    def __init__(self):
        super(CNNModel, self).__init__()

        input_channels = 1
        output_channels = 384
        num_context_steps = 4

        self.d1 = DBlock(
            input_channels=4 * input_channels,
            output_channels=((output_channels // 4) * input_channels) // num_context_steps,
            conv_type="standard",
        )

    def forward(self, x):
        x = self.relu(self.conv_1x1(x))
        
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch

        self.space2depth = PixelUnshuffle(downscale_factor=2)
        x = self.space2depth(x)

        steps = x.size(1)
        print(torch.cuda.is_available())

        for i in range(steps):
            s1 = self.d1(x[:, i, :, :, :])


if __name__ == "__main__":
    data_module = DGMRDataModule()
    model = CNNModel()
    trainer = pl.Trainer(
        max_epochs=1,
        min_epochs=1,
        accelerator="auto",
        precision=16,
        num_sanity_val_steps=0,
    )
    trainer.fit(model, data_module)

I tried various approaches, from changing the input data type to float32 to using torch.cuda.amp.autocast, but none of them solved the problem.
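
Roughly, what I tried amounts to the following (an illustrative sketch, not my exact code):

# 1) Casting the batch to float32 before the block
s1 = self.d1(x[:, i, :, :, :].float())

# 2) Wrapping the call in autocast explicitly
with torch.cuda.amp.autocast():
    s1 = self.d1(x[:, i, :, :, :])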

The issue is raised if autocast isn’t properly used, as seen in this example:

import torch
import torch.nn as nn

device = "cuda"
conv = nn.Conv2d(3, 3, 1, 1, 1).to(device)
x = torch.randn(1, 3, 224, 224, device=device)

# works using autocast
with torch.cuda.amp.autocast():
    out = conv(x)

print(out.dtype)
# torch.float16

# fails without autocast: the conv's float32 parameters now meet a float16 input
out = conv(out)
# RuntimeError: Input type (c10::Half) and bias type (float) should be the same

I don’t know where you are using it, or whether the Lightning API adds it behind your back, but you might want to narrow down which op fails and whether it’s outside of an autocast context.
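
One way to narrow it down (just a debugging sketch) is to register forward hooks that print the dtype each submodule receives and whether autocast is active at that point:

# Debugging sketch: print input dtypes and the autocast state for every submodule
def debug_hook(name):
    def hook(module, inputs, output):
        dtypes = [t.dtype for t in inputs if torch.is_tensor(t)]
        print(name, dtypes, "autocast:", torch.is_autocast_enabled())
    return hook

for name, module in model.named_modules():
    module.register_forward_hook(debug_hook(name))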

This is also confusing, and you might want to check what changed between the working and the crashing run.
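
Since Colab environments update silently, logging the installed versions in both the old and the new runtime might show what changed (illustrative snippet):

import torch
import pytorch_lightning as pl

print("torch:", torch.__version__)
print("lightning:", pl.__version__)
print("cuda:", torch.version.cuda)
print("gpu:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu")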

Thank you for your response.

Can it not be run without autocast? Because I was running it both with and without autocast previously.

What is strange to me is that the code suddenly stopped running; I even tested a code section that was edited 4 months ago and was working then, but it does not work now. Was there any change or update in Colab in the last week?

Moreover, I should mention that I’m using PyTorch Lightning 1.6.5.