Hello,
The following code runs too slow if I unable torch.compile.
from diffusers import DiffusionPipeline, LCMScheduler
from diffusers import EulerDiscreteScheduler
import torch
import numpy as np
import cv2
def pil_opencv_image( image ):
# Convert PIL image to a numpy array
npimg = np.array(image)
# Convert RGB to BGR for OpenCV
cvimg = npimg[:, :, ::-1]
return cvimg
model_id = “stabilityai/stable-diffusion-xl-base-1.0”
lcm_lora_id = “latent-consistency/lcm-lora-sdxl”
#pipe = DiffusionPipeline.from_pretrained(model_id, variant=“fp16”)
pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
pipe.load_lora_weights(lcm_lora_id)
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
pipe.unet = torch.compile(pipe.unet, mode=“reduce-overhead”, fullgraph=True)
pipe.to(device=“cuda”, dtype=torch.float16)
prompt = “close-up photography of old man standing in the rain at night, in a street lit by lamps, leica 35mm summilux”
t0 = cv2.getTickCount()
image = pipe(
prompt=prompt,
num_inference_steps=4,
guidance_scale=1,
).images[0]
t1 = cv2.getTickCount()
tp = (t1-t0)/cv2.getTickFrequency()
print("Time: ", tp )
cvimg = pil_opencv_image(image)
cv2.imshow( “Image”, cvimg)
cv2.waitKey(0)