Primeiramente, o dataset California Housing será utilizado para ajustar os hiperparâmetros de um modelo de Árvore de Decisão.
# Importação de bibliotecas:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy as np
# Load the California Housing data and wrap the feature matrix in a DataFrame
# so that every column carries its feature name.
housing = fetch_california_housing()
X = pd.DataFrame(data=housing.data, columns=housing.feature_names)
y = housing.target

# Turn the continuous target into a 3-class problem: pd.qcut with q=3 bins the
# values into three quantile-based price bands, labelled from lowest to highest.
price_band_labels = ["baixo", "medio", "alto"]
y_class = pd.qcut(y, q=3, labels=price_band_labels)

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_class,
    test_size=0.3,
    random_state=42,
)

# Base estimator whose hyperparameters are tuned below.
dt_clf = DecisionTreeClassifier(random_state=42)

# Hyperparameter grid: 4 * 3 * 3 = 36 candidate combinations.
param_grid = {
    "max_depth": [3, 5, 7, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

# Grid search with 5-fold cross-validation on the training split, ranking the
# candidates by accuracy.
grid_search = GridSearchCV(
    estimator=dt_clf,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
)
grid_search.fit(X_train, y_train)
print("Melhores parâmetros:", grid_search.best_params_)

# Score the selected estimator on the held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Acurácia no teste:", test_accuracy)
Melhores parâmetros: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10}
Acurácia no teste: 0.752422480620155
Posteriormente, utilizamos o DecisionTreeRegressor, agora com o dataset Pima Indians Diabetes, para identificar as características mais importantes:
from sklearn.tree import DecisionTreeRegressor
# Load the Pima Indians Diabetes dataset (UCI) from a CSV hosted on GitHub.
# Column names are supplied explicitly through `names=`.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
    "Outcome",
]
data = pd.read_csv(url, names=columns)

# Every column except "Outcome" is a feature; "Outcome" is the target.
# NOTE(review): "Outcome" is presumably a 0/1 label, yet a regressor scored
# with R^2 is fitted to it below - confirm this is the intended setup.
y = data["Outcome"]
X = data.drop(columns="Outcome")

# Hold out 30% of the rows for testing, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Regression tree whose hyperparameters are tuned below.
dt_reg = DecisionTreeRegressor(random_state=42)

# Hyperparameter grid: 4 * 3 * 3 = 36 candidate combinations.
param_grid_reg = {
    "max_depth": [3, 5, 7, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

# Grid search with 5-fold cross-validation on the training split, ranking the
# candidates by R^2.
grid_search_reg = GridSearchCV(
    estimator=dt_reg,
    param_grid=param_grid_reg,
    cv=5,
    scoring="r2",
)
grid_search_reg.fit(X_train, y_train)
print("Melhores parâmetros:", grid_search_reg.best_params_)

# Feature importances of the selected tree, listed from most to least important.
best_reg = grid_search_reg.best_estimator_
importances = best_reg.feature_importances_
importance_df = pd.DataFrame({"Feature": X.columns, "Importance": importances})
importance_df = importance_df.sort_values(by="Importance", ascending=False)
print(importance_df)
Melhores parâmetros: {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}
Feature Importance
1 Glucose 0.628444
7 Age 0.187383
5 BMI 0.173613
6 DiabetesPedigreeFunction 0.010560
3 SkinThickness 0.000000
2 BloodPressure 0.000000
0 Pregnancies 0.000000
4 Insulin 0.000000