Hi.
I’m trying to train a model from
[facebookresearch/ViewDiff: ViewDiff generates high-quality, multi-view consistent images of a real-world 3D object in authentic surroundings. (CVPR2024). (github.com)](https://github.com/facebookresearch/ViewDiff)
I hit a GPU out-of-memory (OOM) error when I ran
./viewdiff/scripts/train_small.sh <path/to/co3d> "stabilityai/stable-diffusion-2-1-base" outputs/train <category=teddybear>
so I tried again with `CUDA_LAUNCH_BLOCKING=1` set, and now I get `RuntimeError: Triton Error [CUDA]: an illegal memory access was encountered`.
Full error output below:
Traceback (most recent call last):
File "/home/jun/miniconda3/envs/viewdiff/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/home/jun/miniconda3/envs/viewdiff/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/home/jun/Documents/ViewDiff/viewdiff/train.py", line 924, in <module>
tyro.cli(train_and_test)
File "/home/jun/miniconda3/envs/viewdiff/lib/python3.10/site-packages/tyro/_cli.py", line 177, in cli
output = _cli_impl(
File "/home/jun/miniconda3/envs/viewdiff/lib/python3.10/site-packages/tyro/_cli.py", line 431, in _cli_impl
out, consumed_keywords = _calling.call_from_args(
File "/home/jun/miniconda3/envs/viewdiff/lib/python3.10/site-packages/tyro/_calling.py", line 217, in call_from_args
return unwrapped_f(*positional_args, **kwargs), consumed_keywords # type: ignore
File "/home/jun/Documents/ViewDiff/viewdiff/train.py", line 290, in train_and_test
avg_step_losses, acc_step, loss = train_step(
File "/home/jun/Documents/ViewDiff/viewdiff/train.py", line 683, in train_step
dreambooth_avg_losses, dreambooth_acc, dreambooth_loss = process_batch(dreambooth_batch)
File "/home/jun/Documents/ViewDiff/viewdiff/train.py", line 560, in process_batch
output = unet(
File "/home/jun/miniconda3/envs/viewdiff/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/jun/Documents/ViewDiff/viewdiff/model/custom_unet_2d_condition.py", line 1553, in forward
sample = self.mid_block(
File "/home/jun/miniconda3/envs/viewdiff/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/jun/Documents/ViewDiff/viewdiff/model/custom_unet_2d_blocks.py", line 745, in forward
hidden_states = attn(
File "/home/jun/miniconda3/envs/viewdiff/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/jun/Documents/ViewDiff/viewdiff/model/custom_transformer_2d.py", line 292, in forward
hidden_states = block(
File "/home/jun/miniconda3/envs/viewdiff/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/jun/Documents/ViewDiff/viewdiff/model/custom_attention.py", line 470, in forward
attn_output = self.unproj_reproj(unproj_reproj_kwargs, attn_output)
File "/home/jun/Documents/ViewDiff/viewdiff/model/custom_attention.py", line 368, in unproj_reproj
unproj_output, unproj_reproj_mask, unproj_reproj_depth = self.unproj_reproj_layer(
File "/home/jun/miniconda3/envs/viewdiff/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/jun/Documents/ViewDiff/viewdiff/model/projection/layer.py", line 334, in forward
projected_latents, projected_mask, projected_depth = self.volume_renderer(
File "/home/jun/miniconda3/envs/viewdiff/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/jun/Documents/ViewDiff/viewdiff/model/projection/fastplane/fastplane_module.py", line 117, in forward
ray_length_render, negative_log_transmittance, feature_render = fastplane(
File "/home/jun/Documents/ViewDiff/viewdiff/model/projection/fastplane/fastplane_sig_function.py", line 534, in fastplane
return FastplaneFunction.apply(
File "/home/jun/miniconda3/envs/viewdiff/lib/python3.10/site-packages/torch/autograd/function.py", line 506, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
File "/home/jun/Documents/ViewDiff/viewdiff/model/projection/fastplane/fastplane_sig_function.py", line 178, in forward
_fw_kernel[grid](
File "<string>", line 45, in _fw_kernel
RuntimeError: Triton Error [CUDA]: an illegal memory access was encountered
Any ideas on how to fix this Triton CUDA issue?
Many thanks!