- Preparando os dados:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
# Load the Iris dataset and unpack the arrays and label names used below.
iris = load_iris()
X, y = iris.data, iris.target
feature_names, species_names = iris.feature_names, iris.target_names

# Build a DataFrame for inspection: the raw features, the integer species
# code, and the readable species name looked up from that code.
df_iris_clustering = pd.DataFrame(X, columns=feature_names)
df_iris_clustering['species'] = y
species_lookup = dict(enumerate(species_names))
df_iris_clustering['species_name'] = df_iris_clustering['species'].map(species_lookup)

print("Primeiras 5 linhas do DataFrame Iris (para clustering):")
display(df_iris_clustering.head())

# Standardize the features (important for distance-based algorithms).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("\nDados Iris padronizados (primeiras 5 linhas):")
scaled_preview = pd.DataFrame(X_scaled, columns=feature_names)
display(scaled_preview.head())
- Clusterização com K-Means
# Fit K-Means with three clusters on the standardized features and store
# the resulting label of every sample next to its true species.
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)  # n_init set explicitly to avoid warnings
kmeans_labels = kmeans.fit_predict(X_scaled)
df_iris_clustering['kmeans_cluster'] = kmeans_labels

print("\nDistribuição dos clusters K-Means e comparação com as espécies reais:")
kmeans_vs_species = pd.crosstab(
    df_iris_clustering['species_name'],
    df_iris_clustering['kmeans_cluster'],
)
display(kmeans_vs_species)

# Reduce the data to its first two principal components so the clusters
# can be drawn in 2D.
pca_kmeans = PCA(n_components=2)
X_pca_kmeans = pca_kmeans.fit_transform(X_scaled)
df_pca_kmeans = pd.DataFrame(
    data=X_pca_kmeans,
    columns=['principal_component_1', 'principal_component_2'],
).assign(
    kmeans_cluster=kmeans_labels,
    species_name=df_iris_clustering['species_name'],
)

# Side-by-side panels: K-Means assignment (left) versus true species (right).
# Each entry is (hue column, panel title, legend title).
kmeans_panels = [
    ('kmeans_cluster', 'K-Means Clusters (PCA-reduced)', 'Cluster'),
    ('species_name', 'True Species (PCA-reduced)', 'Species'),
]
plt.figure(figsize=(12, 6))
for panel_pos, (hue_col, panel_title, legend_title) in enumerate(kmeans_panels, start=1):
    plt.subplot(1, 2, panel_pos)
    sns.scatterplot(
        x='principal_component_1',
        y='principal_component_2',
        hue=hue_col,
        palette='viridis',
        data=df_pca_kmeans,
        s=70,
    )
    plt.title(panel_title)
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.legend(title=legend_title)
    plt.grid(True)
plt.tight_layout()
plt.show()
- Agrupamento Hierárquico
from scipy.cluster.hierarchy import dendrogram, linkage

# Fit agglomerative (hierarchical) clustering with Ward linkage on the
# standardized features and store each sample's label next to its species.
hc = AgglomerativeClustering(n_clusters=3, metric='euclidean', linkage='ward')
hc_labels = hc.fit_predict(X_scaled)
df_iris_clustering['hc_cluster'] = hc_labels

print("\nDistribuição dos clusters Hierárquicos e comparação com as espécies reais:")
hc_vs_species = pd.crosstab(
    df_iris_clustering['species_name'],
    df_iris_clustering['hc_cluster'],
)
display(hc_vs_species)

# Reduce the data to its first two principal components so the clusters
# can be drawn in 2D.
pca_hc = PCA(n_components=2)
X_pca_hc = pca_hc.fit_transform(X_scaled)
df_pca_hc = pd.DataFrame(
    data=X_pca_hc,
    columns=['principal_component_1', 'principal_component_2'],
).assign(
    hc_cluster=hc_labels,
    species_name=df_iris_clustering['species_name'],
)

# Side-by-side panels: hierarchical assignment (left) versus true species
# (right). Each entry is (hue column, panel title, legend title).
hc_panels = [
    ('hc_cluster', 'Hierarchical Clusters (PCA-reduced)', 'Cluster'),
    ('species_name', 'True Species (PCA-reduced)', 'Species'),
]
plt.figure(figsize=(12, 6))
for panel_pos, (hue_col, panel_title, legend_title) in enumerate(hc_panels, start=1):
    plt.subplot(1, 2, panel_pos)
    sns.scatterplot(
        x='principal_component_1',
        y='principal_component_2',
        hue=hue_col,
        palette='viridis',
        data=df_pca_hc,
        s=70,
    )
    plt.title(panel_title)
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.legend(title=legend_title)
    plt.grid(True)
plt.tight_layout()
plt.show()

# Optional: uncomment to draw the dendrogram for the hierarchical clustering.
# linkage_matrix = linkage(X_scaled, method='ward')
# plt.figure(figsize=(15, 7))
# dendrogram(linkage_matrix, labels=df_iris_clustering['species_name'].tolist(), leaf_rotation=90., leaf_font_size=8.)
# plt.title('Dendrograma para Hierarchical Clustering (Iris Dataset)')
# plt.xlabel('Amostras')
# plt.ylabel('Distância')
# plt.show()
Continua nos comentários.