Hi, my manual implementation of PyTorch's 2D convolution always shows some precision difference compared with the official PyTorch `conv2d` implementation.

The code to reproduce the issue is attached below:

```
def conv2d_matmul_fp(sample_input, weight, padding, stride, dilation):
    """Compute a 2D convolution as an unfold (im2col) followed by a matmul.

    Args:
        sample_input: Input tensor whose size unpacks as (N, C, X, Y).
        weight: Kernel tensor whose size unpacks as (K, _, R, S).
        padding: Pair of paddings, one per spatial axis.
        stride: Pair of strides, one per spatial axis.
        dilation: Pair of dilations, one per spatial axis.

    Returns:
        Tensor of shape (N, K, out_x, out_y).
    """
    N, _, X, Y = sample_input.size()
    K, _, R, S = weight.size()
    # Integer floor division avoids a round trip through floating point.
    out_size = (
        (X + padding[0] * 2 - dilation[0] * (R - 1) - 1) // stride[0] + 1,
        (Y + padding[1] * 2 - dilation[1] * (S - 1) - 1) // stride[1] + 1,
    )
    unfolded = torch.nn.functional.unfold(
        sample_input,
        kernel_size=(R, S),
        dilation=dilation,
        padding=padding,
        stride=stride,
    )
    # `unfolded` is (N, C*R*S, L). Multiplying the flattened (K, C*R*S)
    # weight matrix against the whole tensor broadcasts over the batch axis,
    # so every sample is convolved instead of only the first one.
    res = torch.matmul(weight.reshape(K, -1), unfolded)
    return res.reshape(N, K, out_size[0], out_size[1])
def Conv2d_layer_matmul(sample_input, conv_layer):
    """Run `conv_layer` on `sample_input` through the unfold + matmul path.

    Args:
        sample_input: Input tensor forwarded to `conv2d_matmul_fp`.
        conv_layer: Layer providing `state_dict()` with a "weight" entry
            (and optionally "bias") plus `padding`, `stride` and `dilation`
            attributes.

    Returns:
        The result of `conv2d_matmul_fp`, with the layer's bias added when
        the layer has one.
    """
    # NOTE(review): `groups` and non-zero `padding_mode` are not handled
    # here -- confirm callers only pass plain, zero-padded convolutions.
    params = conv_layer.state_dict()
    out = conv2d_matmul_fp(
        sample_input,
        params["weight"],
        conv_layer.padding,
        conv_layer.stride,
        conv_layer.dilation,
    )
    # The state dict has a "bias" entry only when the layer was built with
    # one; without this step such layers would silently lose their bias.
    bias = params.get("bias")
    if bias is not None:
        out = out + bias.view(1, -1, 1, 1)
    return out
# Define the sample model (a single bias-free convolution layer).
sample_fp_conv2d = nn.Conv2d(3, 64, kernel_size=7, stride=(2,2), padding=(3,3), bias=False)
sample_fp_conv2d.eval()
# Extract the weights
weight = sample_fp_conv2d.state_dict()['weight']
# Define sample input data
# sample_input = val_data[0]
sample_input = torch.randn(1,3,224,224)
# Reference result from the built-in layer
sample_res = sample_fp_conv2d(sample_input)
print(sample_res.size())
# Result from the manual unfold + matmul implementation
res = Conv2d_layer_matmul(sample_input, sample_fp_conv2d)
# Compare both results. The two paths sum the products in different orders,
# so float32 outputs differ by rounding error. isclose tests
# |a - b| <= atol + rtol * |b|; with the default atol=1e-8 any output close
# to zero is reported as a mismatch, so an explicit absolute tolerance is
# required for a meaningful comparison.
mismatch = ~torch.isclose(res, sample_res, rtol=1e-3, atol=1e-5)
print("max abs diff:", (res - sample_res).abs().max().item())
print("mismatching elements:", int(mismatch.sum()))
print(torch.where(mismatch))
```

I would really appreciate it if anyone could help.