Estou realizando um treinamento do modelo com base no código abaixo:
# Model definitions: one (short name, estimator instance, parameter grid)
# tuple per model. Each grid is later handed to GridSearchCV, so every value
# listed must be usable when the estimator is fitted on a plain feature matrix.
models = [
    (
        "PER",  # short model name
        Perceptron(max_iter=1000),  # estimator instance
        {"penalty": ['l2', 'l1', 'elasticnet']},  # parameter values to try
    ),
    (
        "NB",
        BernoulliNB(),
        {"alpha": [1e-3, 0.5, 1]},
    ),
    (
        "DT",
        DecisionTreeClassifier(),
        {
            'criterion': ['gini', 'entropy'],
            'max_depth': [2, 3, 5, 10, 20],
            'min_samples_leaf': [5, 10, 20, 50, 100, 200],
        },
    ),
    (
        "RF",
        RandomForestClassifier(),
        {
            'criterion': ['gini', 'entropy'],
            'max_depth': [2, 3, 5, 10, 20],
            'min_samples_leaf': [5, 10, 20, 50, 100, 200],
            'n_estimators': [10, 25, 30, 50, 100, 200],
        },
    ),
    (
        "LR",  # short model name
        LogisticRegression(solver='saga', max_iter=1000),  # estimator instance
        # NOTE(review): recent scikit-learn releases expect the Python value
        # None instead of the string 'none' for "no penalty" - confirm against
        # the installed version before changing it.
        {"penalty": ['none', 'l1', 'l2']},  # parameter values to try
    ),
    (
        "KNN",
        KNeighborsClassifier(metric='euclidean'),
        {
            "n_neighbors": np.arange(1, 31, 2),  # odd values 1, 3, ..., 29
            'weights': ["uniform", "distance"],
        },
    ),
    (
        "XGB",
        XGBClassifier(),
        {
            'min_child_weight': [1, 5, 10],
            'gamma': [0.5, 1, 1.5, 2, 5],
            'subsample': [0.6, 0.8, 1.0],
            'colsample_bytree': [0.6, 0.8, 1.0],
            'max_depth': [3, 4, 5],
            # NOTE(review): `n_iter` is not among XGBClassifier's documented
            # parameters (the number of boosting rounds is `n_estimators`);
            # confirm this entry has any effect besides tripling the grid.
            'n_iter': [3, 4, 5],
            'learning_rate': [0.01, 0.05, 0.1, 0.5, 1],
        },
    ),
    (
        "SVM",
        SVC(max_iter=10000),
        {
            'C': [1, 10, 100, 1000],
            'gamma': [1, 0.1, 0.001, 0.0001],
            # 'precomputed' is intentionally absent: that kernel requires X to
            # be a square (n_samples, n_samples) kernel matrix, so every
            # candidate using it fails to fit on a raw feature matrix.
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        },
    ),
    (
        "MLP",
        MLPClassifier(),
        {
            # One tuple describes one network (one width per hidden layer).
            # Listing the same tuple twice only repeats identical candidates,
            # so it appears once.
            # NOTE(review): if separate single-layer widths were intended,
            # this should be [(5,), (10,), (15,), ...] - confirm.
            'hidden_layer_sizes': [(5, 10, 15, 20, 25, 30)],
            'activation': ['identity', 'logistic', 'tanh', 'relu'],
            'solver': ['lbfgs', 'sgd', 'adam'],
            'alpha': [0.0001, 0.01, 0.05, 0.1, 0.5, 1],
            'learning_rate': ['constant', 'adaptive'],
        },
    ),
]
# Scorers passed to cross_validate; each one is evaluated on every test split.
metrics = {
    # accuracy_score has no `average` parameter. Extra keyword arguments given
    # to make_scorer are forwarded to the metric, so passing `average` makes
    # this scorer raise on every call; it is therefore built without kwargs.
    'accuracy': make_scorer(accuracy_score),
    # zero_division=1 scores a class with no predicted/true samples as 1
    # instead of emitting a warning and using 0.
    'precision': make_scorer(precision_score, average='weighted', zero_division=1),
    'recall': make_scorer(recall_score, average='weighted', zero_division=1),
    'f1': make_scorer(f1_score, average='weighted', zero_division=1),
}
# Feature matrix: every column except the target.
X = df.drop(columns=[target_column])
# Target values flattened to a 1-D array.
y = df[[target_column]].to_numpy().reshape(-1)
# 30 random train/test partitions, each using 80% of the rows for training.
cv = ShuffleSplit(
    n_splits=30,
    train_size=0.8,
    random_state=42,
)
# Train and evaluate every configured model.
# `results` accumulates the cross_validate output of all models, one entry per
# outer split, with a 'name' entry identifying the model of each row.
results = {}
for model_name, model, model_params in models:
    print(f'{model_name} run...')

    # Hyper-parameter search used as the final pipeline step, so it is re-run
    # on the training part of every outer split.
    model_gs = GridSearchCV(model, model_params, scoring='accuracy')
    # NOTE(review): `preprocessing` is defined outside this section. If it
    # contains a OneHotEncoder, categories missing from a training split raise
    # "Found unknown categories" at prediction time unless the encoder is
    # created with handle_unknown='ignore' - confirm.
    approach = Pipeline(
        [
            ("preprocessing", preprocessing),
            ("model", model_gs),
        ]
    )

    # error_score=0 records a split whose fit or scoring raised as a score of
    # 0 (with a warning) instead of aborting the run.
    model_results = cross_validate(
        approach,
        X=X,
        y=y,
        scoring=metrics,
        cv=cv,
        n_jobs=-1,
        return_train_score=False,
        error_score=0,
    )

    # Tag each split's row with the model it belongs to.
    n_rows = len(model_results['score_time'])
    model_results['name'] = [model_name] * n_rows

    if not results:
        # First model: its result dict becomes the accumulator.
        results = model_results
        continue

    # Later models: extend every accumulated entry with the new values.
    for key, value in model_results.items():
        results[key] = np.append(results[key], value)
E, para cada modelo treinado, surgem duas mensagens de warning:
a primeira é:
UserWarning: Scoring failed. The score on this train-test partition for these parameters will be set to 0. KeyError: 'predict'
A segunda é:
ValueError: Found unknown categories
O que posso fazer para resolver esses erros?