# --- Script 1: California Housing (decision-tree regression + grid search) ---
# =========================================
# IMPORTAÇÃO DAS BIBLIOTECAS
# =========================================
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
# =========================================
# LOAD THE DATASET
# =========================================
# Fetch the California Housing regression data (features + median house value).
housing = fetch_california_housing()
X, y = housing.data, housing.target

# =========================================
# SPLIT THE DATA (BEFORE THE GRID SEARCH)
# =========================================
# Hold out 20% for the final test set; a fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# =========================================
# DEFINE THE MODEL
# =========================================
# Base regressor; the seed makes tree construction deterministic.
modelo = DecisionTreeRegressor(random_state=42)

# =========================================
# DEFINE THE HYPERPARAMETERS
# =========================================
# Candidate values to be explored exhaustively by the grid search.
param_grid = {
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}
# =========================================
# GRID SEARCH (TRAINING SET ONLY)
# =========================================
# 5-fold cross-validation over the grid; MSE is negated because
# scikit-learn maximizes scores. n_jobs=-1 uses every available core.
grid_search = GridSearchCV(
    estimator=modelo,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# =========================================
# SHOW ONLY THE BEST PARAMETERS
# =========================================
print(grid_search.best_params_)
# --- Script 2: Diabetes (NOTE: sklearn's load_diabetes is a regression
# dataset, NOT the Pima Indians Diabetes dataset; a binary target is
# derived from it below) ---
# =========================================
# IMPORTAÇÃO DAS BIBLIOTECAS
# =========================================
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# =========================================
# LOAD THE DATASET (SKLEARN)
# =========================================
diabetes = load_diabetes()

# Wrap the feature matrix in a DataFrame, keeping the original column names.
df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)

# Binarize the continuous target: 1 when above the mean, 0 otherwise.
df['Outcome'] = (diabetes.target > diabetes.target.mean()).astype(int)
# =========================================
# SPLIT THE DATA
# =========================================
y = df["Outcome"]
X = df.drop("Outcome", axis=1)

# 80/20 train/test split; stratify keeps the class ratio equal in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# =========================================
# TRAIN THE RANDOM FOREST MODEL
# =========================================
modelo = RandomForestClassifier(n_estimators=100, random_state=42)
modelo.fit(X_train, y_train)
# =========================================
# FEATURE IMPORTANCES
# =========================================
# Impurity-based importances from the fitted forest, highest first.
importancias = modelo.feature_importances_
df_importancias = (
    pd.DataFrame({"Feature": X.columns, "Importancia": importancias})
    .sort_values(by="Importancia", ascending=False)
)

# =========================================
# BAR CHART
# =========================================
plt.figure()
plt.bar(df_importancias["Feature"], df_importancias["Importancia"])
plt.xticks(rotation=45)
plt.xlabel("Características")
plt.ylabel("Importância")
plt.title("Importância das Variáveis - Random Forest")
plt.tight_layout()
plt.show()