Webcam and image classifier integration

Hello, I am using a code to classify real time image, But I am confused about it. Could you please take a look on the code and help me to figure out any error:

import numpy as np  
import torch
import torch.nn as nn
import torchvision 
from torch.autograd import Variable
from torchvision import transforms
import PIL 
import cv2

#This is the Label
Labels = ['Perfect','Defected']

# Let's preprocess the inputted frame

data_transforms = torchvision.transforms.Compose([
    torchvision.transforms.Resize(size=(352, 288)),
    torchvision.transforms.RandomHorizontalFlip(),
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])



device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Load the model and set in eval
resnet18 = torchvision.models.resnet18(pretrained=True)
resnet18.fc = torch.nn.Sequential(nn.Linear(resnet18.fc.in_features,512),
                                  nn.ReLU(),
                                  nn.Dropout(),
                                  nn.Linear(512, 2))

resnet18.load_state_dict(torch.load('defect_classifier_demo.pt', map_location=torch.device('cpu')))
resnet18.eval()



#Set the Webcam 
def Webcam_720p():
    cap.set(3,1280)
    cap.set(4,720)







def preprocess(image):
    image = PIL.Image.fromarray(image) #Webcam frames are numpy array format
                                       #Therefore transform back to PIL image
    print(image)                             
    image = data_transforms(image)
    image = image.float()
    #image = Variable(image, requires_autograd=True)
    image = image.cpu()
    image = image.unsqueeze(0) #I don't know for sure but Resnet-50 model seems to only
                               #accpets 4-D Vector Tensor so we need to squeeze another
    return image                            #dimension out of our 3-D vector Tensor
    
    
#Let's start the real-time classification process!
                                  
cap = cv2.VideoCapture(0) #Set the webcam
Webcam_720p()

fps = 0
show_score = 0
show_res = 'Nothing'
sequence = 0

while True:
    ret, frame = cap.read() #Capture each frame
    
    
    if fps == 4:
        image        = frame[100:450,150:570]
        image_data   = preprocess(image)
        #print(image_data)
        prediction   = resnet18(image_data)[0]
        torch.nn.functional.softmax(prediction, dim=0)
        #result,score = argmax(prediction)
        prediction = prediction.cpu().detach().numpy()
        predicted_class_index = np.argmax(prediction)
        predicted_class_name = Labels[predicted_class_index]
        print(prediction)
        print(predicted_class_index)
        print(predicted_class_name)
        #print(result)
        #print(score)
        #prediction = np.vectorize(prediction)
        fps = 0
        if prediction.any():
            show_res  = predicted_class_name
            show_score= prediction
        else:
            show_res   = "Nothing"
            show_score = prediction
        
    fps += 1
    cv2.putText(frame, '%s' %(show_res),(950,250), cv2.FONT_HERSHEY_SIMPLEX, 2, (255,255,255), 3)
    #cv2.putText(frame, '(score = %.5f)' %(show_score), (950,300), cv2.FONT_HERSHEY_SIMPLEX, 1,(255,255,255),2)
    cv2.rectangle(frame,(400,150),(900,550), (250,0,0), 2)
    cv2.imshow("ASL SIGN DETECTER", frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyWindow("ASL SIGN DETECTER") ```

What kind of error are you seeing? Could you post the error message with the complete stack trace, please?

The script is working wired.

When I am putting an object to predict, the prediction is changing oftenly.

Your model might be performing poorly on this data.
Have you checked the validation and final test dataset accuracy and were you using images captured by the same device?
If so, what was the overall performance of this model?

Validation Accuracy is 92% and Test Accuracy is 91%.
I am using same device and background also.

Training Statistics is like:

Epoch: 0, Training Loss: 0.67, Validation Loss: 0.38, accuracy = 0.86
Epoch: 1, Training Loss: 0.36, Validation Loss: 0.39, accuracy = 0.84
Epoch: 2, Training Loss: 0.28, Validation Loss: 0.22, accuracy = 0.91
Epoch: 3, Training Loss: 0.25, Validation Loss: 0.18, accuracy = 0.94
Epoch: 4, Training Loss: 0.21, Validation Loss: 0.25, accuracy = 0.89
Epoch: 5, Training Loss: 0.20, Validation Loss: 0.22, accuracy = 0.89
Epoch: 6, Training Loss: 0.20, Validation Loss: 0.19, accuracy = 0.93
Epoch: 7, Training Loss: 0.19, Validation Loss: 0.18, accuracy = 0.93
Epoch: 8, Training Loss: 0.16, Validation Loss: 0.24, accuracy = 0.91
Epoch: 9, Training Loss: 0.16, Validation Loss: 0.24, accuracy = 0.92

The test accuracy seems to be quite high. How are these “bad results” created? Are you using another dataset for it?

Hey guys i’m trying to deploy my fruits recognition model on webcam and this is my error that i’m facing with
Traceback (most recent call last):
File “c:/Users/Admin/Documents/AI-PYTHON/TEST/realtime_recognization.py”, line 60, in
prediction = model(image_data)
File “C:\Users\Admin\anaconda3\envs\py36\lib\site-packages\torch\nn\modules\module.py”, line 550, in call
result = self.forward(*input, **kwargs)
File “c:\Users\Admin\Documents\AI-PYTHON\TEST\training_model.py”, line 41, in forward
return self.network(xb)
File “C:\Users\Admin\anaconda3\envs\py36\lib\site-packages\torch\nn\modules\module.py”, line 550, in call
result = self.forward(*input, **kwargs)
File “C:\Users\Admin\anaconda3\envs\py36\lib\site-packages\torch\nn\modules\container.py”, line 100, in forward
input = module(input)
File “C:\Users\Admin\anaconda3\envs\py36\lib\site-packages\torch\nn\modules\module.py”, line 550, in call
result = self.forward(*input, **kwargs)
File “C:\Users\Admin\anaconda3\envs\py36\lib\site-packages\torch\nn\modules\linear.py”, line 87, in forward
return F.linear(input, self.weight, self.bias)
File “C:\Users\Admin\anaconda3\envs\py36\lib\site-packages\torch\nn\functional.py”, line 1610, in linear
ret = torch.addmm(bias, input, weight.t())
RuntimeError: size mismatch, m1: [1 x 45000], m2: [36000 x 1000] at C:/cb/pytorch_1000000000000/work/aten/src\THC/generic/THCTensorMathBlas.cu:283

And this is my code

from torch import mode

from lib import *

from utils import *

from training_model import CnnModel

from PIL import Image

from config import resize, mean, std, save_path

data_transforms = transforms.Compose(

[

    transforms.Resize(resize),

    transforms.ToTensor(),

    transforms.Normalize(mean,std)

]

)

device = get_default_device()

model = to_device(CnnModel(), device)

model = load_model(model, save_path)

model.eval()

def Webcam_720p():

cap.set(3,1280)

cap.set(4,720)

def predict_image(prediction):

prediction = prediction.cuda()

prediction = prediction.detach().numpy()

_, preds  = torch.max(prediction, dim=1)

score = np.amax(prediction)

score = '{:6f}'.format(score)

result = datasett.classes[preds[0].item()]

return result,score

def preprocess(image):

image = Image.fromarray(image) #                          

image = data_transforms(image)

image = image.float()

image = image.cuda()

image = image.unsqueeze(0) 

return image 

cap = cv2.VideoCapture(0)

Webcam_720p()

fps = 0

show_score = 0

show_res = ‘Nothing’

sequence = 0

while True:

ret, frame = cap.read()

    

if fps == 4:

    image        = frame[100:450,150:570]

    image_data   = preprocess(image)

    prediction   = model(image_data)

    result,score = predict_image(prediction)

    fps = 0

    if result >= 0.5:

        show_res  = result

        show_score= score

    else:

        show_res   = "Nothing"

        show_score = score

    

fps += 1

cv2.putText(frame, '%s' %(show_res),(950,250), cv2.FONT_HERSHEY_SIMPLEX, 2, (255,255,255), 3)

cv2.putText(frame, '(score = %.5f)' %(show_score), (950,300), cv2.FONT_HERSHEY_SIMPLEX, 1,(255,255,255),2)

cv2.rectangle(frame,(400,150),(900,550), (0,250,0), 2)

cv2.imshow("ASL SIGN DETECTER", frame)

if cv2.waitKey(1) & 0xFF == ord('q'):

    break

cap.release()

cv2.destroyWindow(“ASL SIGN DETECTER”)

Based on the error message you are running into a shape mismatch in a linear layer, which is often caused by changing the input shape without using an adaptive pooling layer.
In case your model was working before without the webcam images, you should check the spatial shapes of the training and webcam images and make sure they are equal.

Yes sir you’re right , my model totally can make predictions from JPEG images but webcam images is a different thing . I wonder these mismatch errors come from this line “image = frame[100:450,150:570]” , doesn’t it ? And i dont know what exactly it is. Could you explain it for me ?

Yes, I think the slicing operation could create an unexpected shape.
The posted line of code sliced the frame in two dimensions using the specified start and end indices.
As described before, check which shape the training images had and make sure the new images have the same.

1 Like