Estou usando o dataset digits do sklearn para clustering. No KMeans, defino o número de clusters com antecedência, mas isso não se aplica ao DBSCAN nem ao MeanShift. São 10 dígitos, mas só consegui obter 1 cluster...
#!/usr/bin/env python
# coding: utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline
from sklearn.cluster import DBSCAN,MeanShift
from sklearn.datasets import load_digits
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.decomposition import PCA
from time import time
# Load the scikit-learn digits dataset. Each row of `digits.data` is one
# flattened sample; it is reshaped to 8x8 below for display.
digits = load_digits()

# Module-level aliases for the feature matrix and the true digit labels.
X = digits.data
y = digits.target

# Preview the first ten samples on a 2x5 grid with the axis ticks hidden.
fig, axes = plt.subplots(2, 5, subplot_kw=dict(xticks=[], yticks=[]))
for ax, digit in zip(axes.flat, digits.data[:10]):
    # Reshape the flat sample to 8x8 so it can be drawn as an image.
    ax.imshow(digit.reshape(8, 8), cmap="gray")
# Project the digits onto their first two principal components.
# The timer starts before the fit and stops right after the transform so that
# the reported "PCA time" covers the PCA itself, not the plotting below.
t0 = time()
pca = PCA(n_components=2)
pca = pca.fit(digits.data)
digits_pca = pca.transform(digits.data)
pca_elapsed = time() - t0

# One color per digit class, indexed by the target label.
colors = ["#476A2A", "#7851B8", '#BD3430', '#4A2D4E', '#875525',
          '#A83683', '#4E655E', '#853541', '#3A3120', '#535D8E']

plt.figure(figsize=(10, 10))
# Axis limits are set explicitly from the range of the projected data.
plt.xlim(digits_pca[:, 0].min(), digits_pca[:, 0].max())
plt.ylim(digits_pca[:, 1].min(), digits_pca[:, 1].max())

# Draw every sample as its own digit character at its (PC1, PC2) position,
# colored by its true class.
for (pc1, pc2), target in zip(digits_pca, digits.target):
    plt.text(pc1, pc2, str(target),
             color=colors[target],
             fontdict={'weight': 'bold', 'size': 9})

plt.title('PCA')
plt.xlabel("first PC")
plt.ylabel("second PC")
print("PCA time: ", pca_elapsed)
plt.show()
# Feature scaling: standardize the two PCA components with StandardScaler.
# The fitted scaler stays available as `scaler`; the scaled projection is
# stored in `scaled_p`.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(digits_pca)
scaled_p = scaler.transform(digits_pca)
# # DBSCAN
# Reload the raw digits and standardize the full feature matrix.
# NOTE(review): X and labels_true are not used by the clustering below, which
# runs on the scaled 2-D PCA projection (scaled_p); they are kept in case
# later code relies on them.
X, labels_true = load_digits(return_X_y=True)
X = StandardScaler().fit_transform(X)

# # Compute DBSCAN
t2 = time()
# A sample is a core point when at least min_samples=50 samples lie within a
# neighborhood of radius eps=0.8 around it.
# NOTE(review): scaled_p was standardized, so eps=0.8 is a wide neighborhood;
# if everything collapses into a single cluster, try a smaller eps and/or
# min_samples (e.g. picked from a k-distance plot) -- to be confirmed.
db = DBSCAN(eps=0.8, min_samples=50).fit(scaled_p)

# Extract the cluster labels and flag the core samples. This must happen
# after the fit: `db` does not exist before it.
labels = db.labels_
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# Label -1 is excluded from the cluster count (DBSCAN uses it for noise).
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print("number of clusters in pca-DBSCAN: ", n_clusters_)

# Scatter the scaled projection, colored by assigned cluster label.
plt.scatter(scaled_p[:, 0], scaled_p[:, 1], c=labels, s=60, edgecolors='black')
plt.title('PCA -> DBSCAN')
plt.xlabel("first PC")
plt.ylabel("second PC")
print("DBSCAN time: ", time() - t2)
plt.show()
# # MeanShift
# Cluster the scaled 2-D PCA projection with MeanShift, constructed with no
# arguments. The fitted estimator is bound to `db`, replacing any earlier
# value of that name.
db = MeanShift()
db.fit(scaled_p)
labels = db.labels_

# Count the distinct labels, leaving out -1 if it occurs.
n_clusters_ = len(set(labels) - {-1})
print("number of clusters in pca-MeanShift: ", n_clusters_)

# Scatter the scaled projection, colored by assigned cluster label.
plt.scatter(
    x=scaled_p[:, 0],
    y=scaled_p[:, 1],
    c=labels,
    s=60,
    edgecolors='black',
)
plt.xlabel("first PC")
plt.ylabel("second PC")
plt.title('PCA -> MeanShift')
plt.show()