I have the test set of MNIST dataset and I want to give the images to a pre-trained encoder and then cluster the embedded images using k-means clustering but I get an error when trying to fit_predict().
This is the code:
trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))])
test_set = dset.MNIST(root=root, train=False, transform=trans, download=True)
test_loader = torch.utils.data.DataLoader(
dataset = test_set,
batch_size = 10000,
shuffle = False)
km = KMeans(k, n_init=20, n_jobs=4)
sil=[]
for data in test_loader:
x, _= data
x = model(x.cuda())
x = x.data.cpu().numpy()
#x = x.astype(int)
y_pred = km.fit_predict(x) # seems we can only get a centre from batch
sil_score = sil(x, y_pred)
print('sil score', sil_score)
sil.append(sil_score)
And this is the error I get:
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "C:\Users\samin\Anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 418, in _process_worker
r = call_item()
File "C:\Users\samin\Anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 272, in __call__
return self.fn(*self.args, **self.kwargs)
File "C:\Users\samin\Anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 567, in __call__
return self.func(*args, **kwargs)
File "C:\Users\samin\Anaconda3\lib\site-packages\joblib\parallel.py", line 225, in __call__
for func, args, kwargs in self.items]
File "C:\Users\samin\Anaconda3\lib\site-packages\joblib\parallel.py", line 225, in <listcomp>
for func, args, kwargs in self.items]
File "C:\Users\samin\Anaconda3\lib\site-packages\sklearn\cluster\k_means_.py", line 437, in _kmeans_single_elkan
x_squared_norms=x_squared_norms)
File "C:\Users\samin\Anaconda3\lib\site-packages\sklearn\cluster\k_means_.py", line 749, in _init_centroids
x_squared_norms=x_squared_norms)
File "C:\Users\samin\Anaconda3\lib\site-packages\sklearn\cluster\k_means_.py", line 81, in _k_init
centers = np.empty((n_clusters, n_features), dtype=X.dtype)
TypeError: 'float' object cannot be interpreted as an integer
"""
The above exception was the direct cause of the following exception:
TypeError Traceback (most recent call last)
<ipython-input-148-6ec8225ad810> in <module>
----> 1 k, sil_score = Test(test_loader, model, Controller)
<ipython-input-147-94d13c371d50> in Test(test_loader_0, model, Controller)
20 k = sum(k) / len(k)
21 km = KMeans(k, n_init=20, n_jobs=4)
---> 22 y_pred = km.fit_predict(obs) # seems we can only get a centre from batch
23 sil_score = sil(x, y_pred)
24
~\Anaconda3\lib\site-packages\sklearn\cluster\k_means_.py in fit_predict(self, X, y, sample_weight)
996 Index of the cluster each sample belongs to.
997 """
--> 998 return self.fit(X, sample_weight=sample_weight).labels_
999
1000 def fit_transform(self, X, y=None, sample_weight=None):
~\Anaconda3\lib\site-packages\sklearn\cluster\k_means_.py in fit(self, X, y, sample_weight)
970 tol=self.tol, random_state=random_state, copy_x=self.copy_x,
971 n_jobs=self.n_jobs, algorithm=self.algorithm,
--> 972 return_n_iter=True)
973 return self
974
~\Anaconda3\lib\site-packages\sklearn\cluster\k_means_.py in k_means(X, n_clusters, sample_weight, init, precompute_distances, n_init, max_iter, verbose, tol, random_state, copy_x, n_jobs, algorithm, return_n_iter)
397 # Change seed to ensure variety
398 random_state=seed)
--> 399 for seed in seeds)
400 # Get results with the lowest inertia
401 labels, inertia, centers, n_iters = zip(*results)
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
932
933 with self._backend.retrieval_context():
--> 934 self.retrieve()
935 # Make sure that we get a last message telling us we are done
936 elapsed_time = time.time() - self._start_time
~\Anaconda3\lib\site-packages\joblib\parallel.py in retrieve(self)
831 try:
832 if getattr(self._backend, 'supports_timeout', False):
--> 833 self._output.extend(job.get(timeout=self.timeout))
834 else:
835 self._output.extend(job.get())
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
519 AsyncResults.get from multiprocessing."""
520 try:
--> 521 return future.result(timeout=timeout)
522 except LokyTimeoutError:
523 raise TimeoutError()
~\Anaconda3\lib\concurrent\futures\_base.py in result(self, timeout)
433 raise CancelledError()
434 elif self._state == FINISHED:
--> 435 return self.__get_result()
436 else:
437 raise TimeoutError()
~\Anaconda3\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
TypeError: 'float' object cannot be interpreted as an integer
I tried adding x = x.astype(int)
:
km = KMeans(k, n_init=20, n_jobs=4)
sil=[]
for data in test_loader_0:
x, _= data
x = model(x.cuda())
x = x.data.cpu().numpy()
x = x.astype(int)
y_pred = km.fit_predict(x) # seems we can only get a centre from batch
sil_score = sil(x, y_pred)
print('sil score', sil_score)
sil.append(sil_score)
but got the same error. I find this error very strange as I have used the same dataset (training set) and the same network (model=encoder) to calculate the labels produced by k-means. I don’t think that k-means.fit_predict only accepts integer values.
I wonder if anyone has a clue about this or has encountered this issue? I appreciate a hint.