1.
import pandas as pd
# Location of the wine dataset (CSV hosted on GitHub). It is fetched over the
# network, so this step requires internet access.
url = (
    'https://raw.githubusercontent.com/alura-cursos/'
    'classificacao_multiclasse/main/Dados/df_vinho.csv'
)
dados = pd.read_csv(filepath_or_buffer=url)
# Preview of the first rows; the value is discarded when run as a plain script.
dados.head()
2.
# Print a summary of the DataFrame (columns, non-null counts, dtypes) to check
# for missing values and column types before modelling.
dados.info()
3.
# Target is the 'qualidade' column; the features are every remaining column.
y = dados['qualidade']
X = dados.drop(columns=['qualidade'])
4.
from sklearn.model_selection import train_test_split
# Step 1: hold out 30% of the rows as the final test set (X_teste / y_teste).
# X and y are rebound to the remaining 70%, so every later use of X / y refers
# to the train+validation portion only.
# NOTE: because X and y are overwritten, executing this statement a second
# time would shrink the data again; rerun from the top instead.
X, X_teste, y, y_teste = train_test_split(X, y, test_size=0.30, random_state=40, shuffle=True, stratify=y)
# Step 2: split that remaining 70% again (70/30) into training and validation
# sets, i.e. roughly 49% / 21% of the original rows. Both splits are
# stratified on the target so each subset keeps the class proportions.
X_treino, X_val, y_treino, y_val = train_test_split(X, y, test_size=0.30, random_state=40, shuffle=True, stratify=y)
5.
from sklearn.ensemble import RandomForestClassifier
# Baseline model: random forest fitted on the training split as-is
# (fit() returns the estimator itself, so construction and fitting are chained).
rf = RandomForestClassifier(random_state=40, max_depth=50).fit(X_treino, y_treino)
# Training vs. validation accuracy; a large gap between them points to overfitting.
print(f'Acurácia de Treino: {rf.score(X_treino, y_treino)}')
print(f'Acurácia de Validação: {rf.score(X_val, y_val)}')
# Validation predictions, kept in y_pred for the per-class evaluation that follows.
y_pred = rf.predict(X_val)
6.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 of the current predictions on the validation set.
report = classification_report(y_val, y_pred)
print(f'Relatório de Classificação: \n {report}')
# Raw (count-based) confusion matrix, kept as an array.
matriz_confusao = confusion_matrix(y_val, y_pred)
# Plot the confusion matrix normalised over the true labels (each row sums to 1),
# which makes per-class recall readable despite different class sizes.
# The trailing ';' suppresses the object repr when run in a notebook.
ConfusionMatrixDisplay.from_predictions(
    y_true=y_val,
    y_pred=y_pred,
    display_labels=rf.classes_,
    normalize='true',
);
7.
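# Class distribution of the training target, i.e. the output of
# y_treino.value_counts() (kept here for reference; the classes are imbalanced):
#   qualidade
#   mediano    544
#   bom         90
#   ruim        31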
from imblearn.over_sampling import SMOTE
# Oversample with SMOTE using the training split only; the validation split is
# left untouched so it is scored on real samples.
oversample = SMOTE(random_state=40)
X_balanceado, y_balanceado = oversample.fit_resample(X_treino, y_treino)
# Class counts after resampling; the value is discarded when run as a plain script.
y_balanceado.value_counts()
# Refit the same forest configuration on the balanced data and re-evaluate
# on the validation split.
rf = RandomForestClassifier(random_state=40, max_depth=50).fit(X_balanceado, y_balanceado)
y_pred = rf.predict(X_val)
print(classification_report(y_true=y_val, y_pred=y_pred))
# Row-normalised confusion matrix; the trailing ';' suppresses the object repr
# when run in a notebook.
ConfusionMatrixDisplay.from_predictions(
    y_true=y_val,
    y_pred=y_pred,
    display_labels=rf.classes_,
    normalize='true',
);
8.
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.model_selection import cross_validate, StratifiedKFold
# Cross-validate SMOTE + random forest as a single imbalanced-learn pipeline.
# Inside the pipeline the sampler is applied only while fitting on the
# training part of each fold, so the held-out part is scored on real samples.
rf = RandomForestClassifier(max_depth=50, random_state=40)
# Seed SMOTE as well: without random_state the synthetic samples, and therefore
# the fold scores, change from run to run while every other step is seeded.
pipeline = imbpipeline([('oversample', SMOTE(random_state=40)), ('florestas', rf)])
# Stratified 5-fold split so each fold keeps the class proportions.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
# X / y are the train+validation rows left after the earlier test hold-out.
# The metric is recall averaged over classes, weighted by class support.
cv_resultados = cross_validate(pipeline, X, y, cv=skf, scoring='recall_weighted')
# Per-fold scores; the value is discarded when run as a plain script.
cv_resultados['test_score']
9.
# Final model: balance the train+validation rows (X / y as rebound by the
# earlier test hold-out) with SMOTE, fit the forest, and evaluate on the
# untouched test set.
oversample = SMOTE(random_state=0)
X_balanceado, y_balanceado = oversample.fit_resample(X, y)
# NOTE(review): max_depth=10 here differs from the max_depth=50 used in the
# cross-validation step, so those CV scores do not describe this exact
# configuration. Confirm that this is intended.
rf = RandomForestClassifier(max_depth=10, random_state=0).fit(X_balanceado, y_balanceado)
y_pred = rf.predict(X_teste)
print(classification_report(y_true=y_teste, y_pred=y_pred))
# Row-normalised confusion matrix on the test set; the trailing ';' suppresses
# the object repr when run in a notebook.
ConfusionMatrixDisplay.from_predictions(
    y_true=y_teste,
    y_pred=y_pred,
    display_labels=rf.classes_,
    normalize='true',
);