Torch.compile too slow with SDXL model

Hello,

The following code runs too slowly when I enable torch.compile.

from diffusers import DiffusionPipeline, LCMScheduler
from diffusers import EulerDiscreteScheduler

import torch
import numpy as np
import cv2

def pil_opencv_image(image):
    """Convert a PIL-style RGB(A) image into an OpenCV-ready BGR(A) array.

    Args:
        image: Anything ``np.array`` can turn into an image array, e.g. a
            PIL image with channels in RGB or RGBA order.

    Returns:
        A C-contiguous ``numpy.ndarray``. For arrays with three or more
        channels the first three are reordered from RGB to BGR and any
        remaining channels (e.g. alpha) keep their position. Arrays that
        are not ``(height, width, channels>=3)`` are returned unchanged,
        since there is no colour order to swap.
    """
    npimg = np.array(image)

    # Grayscale (2-D) or other layouts without at least three colour
    # channels have nothing to reorder.
    if npimg.ndim != 3 or npimg.shape[2] < 3:
        return npimg

    # Swap R and B only; a plain [::-1] would also move alpha to the front.
    channel_order = [2, 1, 0] + list(range(3, npimg.shape[2]))

    # Return a contiguous copy rather than a negatively strided view.
    return np.ascontiguousarray(npimg[:, :, channel_order])

# Benchmark SDXL + LCM-LoRA inference with a torch.compile'd UNet and show
# the generated image in an OpenCV window.
model_id = "stabilityai/stable-diffusion-xl-base-1.0"
lcm_lora_id = "latent-consistency/lcm-lora-sdxl"

# Number of untimed pipeline calls made before measuring. The first call
# pays for torch.compile's compilation, and "reduce-overhead" mode needs
# further calls to capture CUDA graphs, so timing the first call measures
# compilation rather than inference.
warmup_runs = 3

# Load the fp16 checkpoint directly instead of fetching fp32 weights and
# casting them afterwards.
pipe = DiffusionPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    variant="fp16",
)

pipe.load_lora_weights(lcm_lora_id)
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)

# Move to the GPU before compiling so the compiled graph is built for the
# device and dtype that will actually be used.
pipe.to(device="cuda", dtype=torch.float16)

# Merge the LoRA deltas into the base weights and drop the adapter layers,
# so the compiled UNet is a plain model with no LoRA indirection.
pipe.fuse_lora()
pipe.unload_lora_weights()

pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)

prompt = "close-up photography of old man standing in the rain at night, in a street lit by lamps, leica 35mm summilux"

# Untimed warm-up: triggers compilation and CUDA-graph capture.
for _ in range(warmup_runs):
    pipe(
        prompt=prompt,
        num_inference_steps=4,
        guidance_scale=1,
    )

# Timed run: steady-state inference only.
t0 = cv2.getTickCount()
image = pipe(
    prompt=prompt,
    num_inference_steps=4,
    guidance_scale=1,
).images[0]
t1 = cv2.getTickCount()
tp = (t1 - t0) / cv2.getTickFrequency()
print("Time: ", tp)

cvimg = pil_opencv_image(image)
cv2.imshow("Image", cvimg)
cv2.waitKey(0)

Which torch version are you using? This has gotten better in the nightlies, though, granted, it'll still be slow.