Compiled model throws an error

Hi,

I’m trying to train an image classification model (ResNet152). I compiled the model with torch.compile and ran the code below, but it throws an error during the backward pass. Any guidance will be appreciated.

Python version: 3.8.18
PyTorch version: 2.1.1

Here is the code:

import os, random, torch, torchvision, warnings
random.seed(32)
from tqdm.auto import tqdm
from typing import Dict, List, Tuple
from PIL import Image
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision import models
from torchinfo import summary
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
warnings.filterwarnings("ignore")


class ImageFolderCustom(Dataset):
    def __init__(self, image_dir: str, class_file:str ,transform=None) -> None:
        with open(image_dir) as img_file:
            self.image_filename = ['/home/food-101/images/'+line.rstrip()+'.jpg' for line in img_file]
        self.transform = transform
        with open(class_file) as file:
            classes = [line.rstrip() for line in file]
        self.class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}

    def load_image(self, index: int) -> Image.Image:
        image_path = self.image_filename[index]
        return Image.open(image_path).convert('RGB')
    
    def __len__(self) -> int:
        return len(self.image_filename)
    
    def __getitem__(self, index: int) -> Tuple[torch.Tensor, int]:
        img = self.load_image(index)
        class_name = self.image_filename[index].split('/')[-2]
        class_idx = self.class_to_idx[class_name]

        if self.transform:
            return self.transform(img), class_idx
        else:
            return transforms.ToTensor()(img), class_idx

# Transforms for train data (resize + convert to tensor; no augmentation applied yet)
train_transforms = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor()
])
test_transforms = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor()
])

train_dir = '/home/food-101/meta/train.txt'
test_dir = '/home/food-101/meta/test.txt'
class_file = '/home/food-101/meta/classes.txt'
train_data_custom = ImageFolderCustom(image_dir=train_dir, transform=train_transforms,class_file=class_file)
test_data_custom = ImageFolderCustom(image_dir=test_dir, class_file=class_file,transform=test_transforms)

train_dataloader = DataLoader(train_data_custom, batch_size=256,pin_memory=True, num_workers=10, shuffle=True)
test_dataloader = DataLoader(test_data_custom, batch_size=256, num_workers=10,pin_memory=True, shuffle=False)

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
res_model = models.resnet152(pretrained=True)
res_model.fc = nn.Sequential(nn.Linear(2048, 128),
               nn.ReLU(inplace=True),
               nn.Linear(128, 4))

res_model = res_model.to(device)
# compile the model with the default (Inductor) backend -- the traceback below
# shows train() being called with `compiled_model`
compiled_model = torch.compile(res_model)

optimizer = optim.Adam(res_model.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss()
scheduler = StepLR(optimizer, step_size=1, gamma=0.7)  # note: scheduler.step() is never called in the training loop below


def train_step(model: torch.nn.Module, 
               dataloader: torch.utils.data.DataLoader, 
               loss_fn: torch.nn.Module, 
               optimizer: torch.optim.Optimizer,
               device: torch.device) -> Tuple[float, float]:
    model.train()
    train_loss, train_acc = 0, 0

    for batch, (X, y) in enumerate(dataloader):
        optimizer.zero_grad(set_to_none=True)
        X, y = X.to(device,non_blocking=True), y.to(device,non_blocking=True)
        y_pred = model(X)
        loss = loss_fn(y_pred, y)
        train_loss += loss.item() 
        loss.backward()
        optimizer.step()
        y_pred_class = torch.argmax(torch.softmax(y_pred, dim=1), dim=1)
        train_acc += (y_pred_class == y).sum().item()/len(y_pred)
    train_loss = train_loss / len(dataloader)
    train_acc = train_acc / len(dataloader)
    return train_loss, train_acc

def test_step(model: torch.nn.Module, 
              dataloader: torch.utils.data.DataLoader, 
              loss_fn: torch.nn.Module,
              device: torch.device) -> Tuple[float, float]:
    model.eval() 
    test_loss, test_acc = 0, 0
    with torch.inference_mode():
        for batch, (X, y) in enumerate(dataloader):
            X, y = X.to(device,non_blocking=True), y.to(device,non_blocking=True)
            test_pred_logits = model(X)
            loss = loss_fn(test_pred_logits, y)
            test_loss += loss.item()
            test_pred_labels = test_pred_logits.argmax(dim=1)
            test_acc += ((test_pred_labels == y).sum().item()/len(test_pred_labels))
    test_loss = test_loss / len(dataloader)
    test_acc = test_acc / len(dataloader)
    return test_loss, test_acc

def train(model: torch.nn.Module, 
          train_dataloader: torch.utils.data.DataLoader, 
          test_dataloader: torch.utils.data.DataLoader, 
          optimizer: torch.optim.Optimizer,
          loss_fn: torch.nn.Module,
          epochs: int,
          device: torch.device) -> Dict[str, List]:

    results = {"train_loss": [],
               "train_acc": [],
               "test_loss": [],
               "test_acc": []
    }
    model.to(device)
    for epoch in tqdm(range(epochs)):
        train_loss, train_acc = train_step(model=model,
                                          dataloader=train_dataloader,
                                          loss_fn=loss_fn,
                                          optimizer=optimizer,
                                          device=device)
        test_loss, test_acc = test_step(model=model,
          dataloader=test_dataloader,
          loss_fn=loss_fn,
          device=device)
        print(
          f"Epoch: {epoch+1} | "
          f"train_loss: {train_loss:.4f} | "
          f"train_acc: {train_acc:.4f} | "
          f"test_loss: {test_loss:.4f} | "
          f"test_acc: {test_acc:.4f}"
        )
        results["train_loss"].append(train_loss)
        results["train_acc"].append(train_acc)
        results["test_loss"].append(test_loss)
        results["test_acc"].append(test_acc)

    return results


out_result = train(model=compiled_model, train_dataloader=train_dataloader, test_dataloader=test_dataloader, optimizer=optimizer, loss_fn=loss_fn, epochs=10, device=device)

Error message:

---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
Cell In[9], line 1
----> 1 out_result = train(model=compiled_model,train_dataloader= train_dataloader, test_dataloader= test_dataloader, optimizer= optimizer,loss_fn= loss_fn,epochs= 10, device=device)

Cell In[8], line 102, in train(model, train_dataloader, test_dataloader, optimizer, loss_fn, epochs, device)
    100 # Loop through training and testing steps for a number of epochs
    101 for epoch in tqdm(range(epochs)):
--> 102     train_loss, train_acc = train_step(model=model,
    103                                       dataloader=train_dataloader,
    104                                       loss_fn=loss_fn,
    105                                       optimizer=optimizer,
    106                                       device=device)
    107     test_loss, test_acc = test_step(model=model,
    108       dataloader=test_dataloader,
    109       loss_fn=loss_fn,
    110       device=device)
    112     # Print out what's happening

Cell In[8], line 35, in train_step(model, dataloader, loss_fn, optimizer, device)
     28 train_loss += loss.item() 
     30 # 3. Optimizer zero grad
     31 #optimizer.zero_grad(set_to_none=True)
     32 #optimizer.zero_grad()
     33 
     34 # 4. Loss backward
---> 35 loss.backward()
     37 # 5. Optimizer step
     38 optimizer.step()

File ~/miniconda3/envs/pytorch2/lib/python3.8/site-packages/torch/_tensor.py:492, in Tensor.backward(self, gradient, retain_graph, create_graph, inputs)
    482 if has_torch_function_unary(self):
    483     return handle_torch_function(
    484         Tensor.backward,
    485         (self,),
   (...)
    490         inputs=inputs,
    491     )
--> 492 torch.autograd.backward(
    493     self, gradient, retain_graph, create_graph, inputs=inputs
    494 )

File ~/miniconda3/envs/pytorch2/lib/python3.8/site-packages/torch/autograd/__init__.py:251, in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
    246     retain_graph = create_graph
    248 # The reason we repeat the same comment below is that
    249 # some Python versions print out the first line of a multi-line function
    250 # calls in the traceback and some print out the last line
--> 251 Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
    252     tensors,
    253     grad_tensors_,
    254     retain_graph,
    255     create_graph,
    256     inputs,
    257     allow_unreachable=True,
    258     accumulate_grad=True,
    259 )

File ~/miniconda3/envs/pytorch2/lib/python3.8/site-packages/torch/autograd/function.py:288, in BackwardCFunction.apply(self, *args)
    282     raise RuntimeError(
    283         "Implementing both 'backward' and 'vjp' for a custom "
    284         "Function is not allowed. You should only implement one "
    285         "of them."
    286     )
    287 user_fn = vjp_fn if vjp_fn is not Function.vjp else backward_fn
--> 288 return user_fn(self, *args)

File ~/miniconda3/envs/pytorch2/lib/python3.8/site-packages/torch/_functorch/aot_autograd.py:3232, in aot_dispatch_autograd.<locals>.CompiledFunction.backward(ctx, *flat_args)
   3230     out = CompiledFunctionBackward.apply(*all_args)
   3231 else:
-> 3232     out = call_compiled_backward()
   3233 return out

File ~/miniconda3/envs/pytorch2/lib/python3.8/site-packages/torch/_functorch/aot_autograd.py:3204, in aot_dispatch_autograd.<locals>.CompiledFunction.backward.<locals>.call_compiled_backward()
   3199     with tracing(saved_context), context(), track_graph_compiling(aot_config, "backward"):
   3200         CompiledFunction.compiled_bw = aot_config.bw_compiler(
   3201             bw_module, placeholder_list
   3202         )
-> 3204 out = call_func_with_args(
   3205     CompiledFunction.compiled_bw,
   3206     all_args,
   3207     steal_args=True,
   3208     disable_amp=disable_amp,
   3209 )
   3211 out = functionalized_rng_runtime_epilogue(CompiledFunction.metadata, out)
   3212 return tuple(out)

File ~/miniconda3/envs/pytorch2/lib/python3.8/site-packages/torch/_functorch/aot_autograd.py:1506, in call_func_with_args(f, args, steal_args, disable_amp)
   1504 with context():
   1505     if hasattr(f, "_boxed_call"):
-> 1506         out = normalize_as_list(f(args))
   1507     else:
   1508         # TODO: Please remove soon
   1509         # https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670
   1510         warnings.warn(
   1511             "Your compiler for AOTAutograd is returning a function that doesn't take boxed arguments. "
   1512             "Please wrap it with functorch.compile.make_boxed_func or handle the boxed arguments yourself. "
   1513             "See https://github.com/pytorch/pytorch/pull/83137#issuecomment-1211320670 for rationale."
   1514         )

File ~/miniconda3/envs/pytorch2/lib/python3.8/site-packages/torch/_dynamo/eval_frame.py:328, in _TorchDynamoContext.__call__.<locals>._fn(*args, **kwargs)
    326 dynamic_ctx.__enter__()
    327 try:
--> 328     return fn(*args, **kwargs)
    329 finally:
    330     set_eval_frame(prior)

File ~/miniconda3/envs/pytorch2/lib/python3.8/site-packages/torch/_dynamo/external_utils.py:17, in wrap_inline.<locals>.inner(*args, **kwargs)
     15 @functools.wraps(fn)
     16 def inner(*args, **kwargs):
---> 17     return fn(*args, **kwargs)

File ~/miniconda3/envs/pytorch2/lib/python3.8/site-packages/torch/_inductor/codecache.py:374, in CompiledFxGraph.__call__(self, inputs)
    373 def __call__(self, inputs) -> Any:
--> 374     return self.get_current_callable()(inputs)

File ~/miniconda3/envs/pytorch2/lib/python3.8/site-packages/torch/_inductor/compile_fx.py:628, in align_inputs_from_check_idxs.<locals>.run(new_inputs)
    626 def run(new_inputs):
    627     copy_misaligned_inputs(new_inputs, inputs_to_check)
--> 628     return model(new_inputs)

File ~/miniconda3/envs/pytorch2/lib/python3.8/site-packages/torch/_inductor/codecache.py:401, in _run_from_cache(compiled_graph, inputs)
    391     from .codecache import PyCodeCache
    393     compiled_graph.compiled_artifact = PyCodeCache.load_by_key_path(
    394         compiled_graph.cache_key,
    395         compiled_graph.artifact_path,
   (...)
    398         else (),
    399     ).call
--> 401 return compiled_graph.compiled_artifact(inputs)

File /tmp/torchinductor_iamalien/l2/cl27m74cyvyw6agpvll46v7gax6igmymkivu3ogafbumw3mgdrm3.py:5643, in call(args)
   5641 del primals_4
   5642 buf472 = buf471[0]
-> 5643 assert_size_stride(buf472, (s0, 64, 16, 16), (16384, 1, 1024, 64))
   5644 buf473 = buf471[1]
   5645 assert_size_stride(buf473, (64, 64, 1, 1), (64, 1, 64, 64))

AssertionError: expected size 64==64, stride 256==1 at dim=1

@ptrblck can you please guide me here?

I’m not an expert in the Inductor backend, but based on:

File ~/miniconda3/envs/pytorch2/lib/python3.8/site-packages/torch/_inductor/codecache.py:401, in _run_from_cache(compiled_graph, inputs)
    391     from .codecache import PyCodeCache
    393     compiled_graph.compiled_artifact = PyCodeCache.load_by_key_path(
    394         compiled_graph.cache_key,
    395         compiled_graph.artifact_path,
   (...)
    398         else (),
    399     ).call
--> 401 return compiled_graph.compiled_artifact(inputs)

it seems you are running the model from the cache. Could you try to delete it and let it recompile?

I restarted my machine and tried again, but it throws the same error. Additionally, I tried import torch; torch._dynamo.reset(), though I’m not sure whether that is the right approach.

Could you please tell me how to do that?

I don’t think torch._dynamo.reset() clears the cache; it’s used to switch to a different execution mode, if I’m not mistaken.
Check torch._inductor.codecache.CacheBase.get_local_cache_path() and .get_global_cache_path() and try to delete these directories manually.

@ptrblck I get the below error when I use the following code: torch._inductor.codecache.CacheBase.get_local_cache_path()

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[2], line 1
----> 1 torch._inductor.codecache.CacheBase.get_local_cache_path()

AttributeError: module 'torch._inductor' has no attribute 'codecache'

Import it beforehand:

torch._inductor.codecache.CacheBase.get_local_cache_path()
Traceback (most recent call last):

  Cell In[43], line 1
    torch._inductor.codecache.CacheBase.get_local_cache_path()

AttributeError: module 'torch._inductor' has no attribute 'codecache'


import torch._inductor.codecache

torch._inductor.codecache.CacheBase.get_local_cache_path()
PosixPath('/tmp/torchinductor_pbialecki/cache/12aae8ab2d00bc83aafc3a7d39533da791c68dfda7b0ee1be4000cf8c16f2ef9')
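
For reference, a minimal cleanup sketch along those lines (an illustration only, not an official workflow; it assumes the returned cache directories can simply be deleted):

import shutil
import torch._inductor.codecache as codecache  # needs to be imported explicitly

# Remove the local and (if configured) global Inductor caches so the model
# is recompiled from scratch on the next run.
for cache_path in (codecache.CacheBase.get_local_cache_path(),
                   codecache.CacheBase.get_global_cache_path()):
    if cache_path is not None and cache_path.exists():
        shutil.rmtree(cache_path, ignore_errors=True)

The generated kernels themselves appear to live under /tmp/torchinductor_<user> (see the artifact path in the traceback above), so that directory can be removed in the same way if needed.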

Thanks @ptrblck, it worked. I deleted the cache and ran the code.
Compiled model logs:

 10%|█         | 1/10 [05:48<52:19, 348.79s/it]Epoch: 1 | train_loss: 1.7183 | train_acc: 0.2524 | test_loss: 1.4440 | test_acc: 0.2725
 20%|██        | 2/10 [06:33<22:38, 169.86s/it]Epoch: 2 | train_loss: 1.3487 | train_acc: 0.3341 | test_loss: 1.3432 | test_acc: 0.3506
 30%|███       | 3/10 [07:21<13:20, 114.30s/it]Epoch: 3 | train_loss: 1.2715 | train_acc: 0.3823 | test_loss: 1.3098 | test_acc: 0.3486
 40%|████      | 4/10 [08:13<08:59, 89.86s/it] Epoch: 4 | train_loss: 1.1551 | train_acc: 0.4750 | test_loss: 1.1660 | test_acc: 0.4619
 50%|█████     | 5/10 [09:07<06:24, 76.92s/it]Epoch: 5 | train_loss: 1.0495 | train_acc: 0.5453 | test_loss: 1.3847 | test_acc: 0.4424
 60%|██████    | 6/10 [10:02<04:37, 69.26s/it]Epoch: 6 | train_loss: 1.0127 | train_acc: 0.5581 | test_loss: 0.9990 | test_acc: 0.5508
 70%|███████   | 7/10 [10:56<03:13, 64.42s/it]Epoch: 7 | train_loss: 0.9983 | train_acc: 0.5603 | test_loss: 1.0435 | test_acc: 0.5557
 80%|████████  | 8/10 [11:51<02:02, 61.28s/it]Epoch: 8 | train_loss: 0.9671 | train_acc: 0.5755 | test_loss: 1.0590 | test_acc: 0.5322
 90%|█████████ | 9/10 [12:45<00:59, 59.04s/it]Epoch: 9 | train_loss: 0.9284 | train_acc: 0.5831 | test_loss: 0.9651 | test_acc: 0.6143
100%|██████████| 10/10 [13:39<00:00, 82.00s/it]Epoch: 10 | train_loss: 0.9183 | train_acc: 0.5934 | test_loss: 0.8400 | test_acc: 0.6367

Normal (uncompiled) model logs:
 10%|█         | 1/10 [00:48<07:20, 48.89s/it]Epoch: 1 | train_loss: 1.5540 | train_acc: 0.2407 | test_loss: 1.3878 | test_acc: 0.2451
 20%|██        | 2/10 [01:35<06:22, 47.84s/it]Epoch: 2 | train_loss: 1.3866 | train_acc: 0.2592 | test_loss: 1.3847 | test_acc: 0.2451
 30%|███       | 3/10 [02:23<05:33, 47.65s/it]Epoch: 3 | train_loss: 1.3193 | train_acc: 0.3364 | test_loss: 8.3390 | test_acc: 0.3555
 40%|████      | 4/10 [03:10<04:45, 47.60s/it]Epoch: 4 | train_loss: 1.1661 | train_acc: 0.4084 | test_loss: 2.9549 | test_acc: 0.3994
 50%|█████     | 5/10 [03:58<03:57, 47.59s/it]Epoch: 5 | train_loss: 1.1333 | train_acc: 0.4169 | test_loss: 1.6769 | test_acc: 0.4629
 60%|██████    | 6/10 [04:45<03:10, 47.53s/it]Epoch: 6 | train_loss: 1.0923 | train_acc: 0.4269 | test_loss: 1.5188 | test_acc: 0.4023
 70%|███████   | 7/10 [05:33<02:22, 47.56s/it]Epoch: 7 | train_loss: 1.0276 | train_acc: 0.5352 | test_loss: 1.2394 | test_acc: 0.4580
 80%|████████  | 8/10 [06:20<01:34, 47.48s/it]Epoch: 8 | train_loss: 0.9831 | train_acc: 0.5525 | test_loss: 4.2785 | test_acc: 0.3496
 90%|█████████ | 9/10 [07:08<00:47, 47.43s/it]Epoch: 9 | train_loss: 0.9722 | train_acc: 0.5691 | test_loss: 1.8190 | test_acc: 0.4463
100%|██████████| 10/10 [07:55<00:00, 47.55s/it]Epoch: 10 | train_loss: 0.9652 | train_acc: 0.5622 | test_loss: 1.3043 | test_acc: 0.4785

Here I see that the compiled model takes longer to finish overall, and the time per epoch gradually decreases.

On the other hand, the normal (uncompiled) model completed faster, and each epoch takes a similar amount of time.

Is this something I should expect, or is there a best practice to optimize it further?

I’m hitting a similar error with a ResNet 18:

AssertionError: expected size 128==128, stride 128==1 at dim=0 

When I try @ptrblck's suggestion, I receive the below error:

>>> import torch._inductor.codecache
>>> torch._inductor.codecache.CacheBase.get_local_cache_path()
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
AttributeError: module 'torch._inductor.codecache' has no attribute 'CacheBase'

This is in PyTorch 2.0.1.

Perhaps it is related to this issue: examples/imagenet still fails · Issue #1701 · pytorch/torchdynamo · GitHub