#================================ Conexão ================================
import pandas as pd
import pyodbc as bd
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
server = "PCWT"
database = "DW_TCC"
username = "sa"
password = "spfc2010"
connection_str = 'Driver={SQL Server Native Client 11.0};Server='+server+';Database='+database+';UID='+username+';PWD='+ password
query = 'SELECT F.*, DD.COD_CURSO FROM FATO F INNER JOIN DIM_DISCIPLINA DD ON F.COD_DISCIPLINA = DD.COD_DISCIPLINA'
connection = bd.connect(connection_str)
dados = pd.read_sql_query(query,connection)
pd.set_option('display.max_columns', None) # nao limitar as colunas que aparecem no output
#=========================================================================
renomear = {'COD_ACADEMICO':'evasao', 'COD_ALUNO':'aluno', 'COD_DISCIPLINA':'disciplina', 'COD_FINANCEIRO':'financeiro',
'COD_SIT_DISCIPLINA':'sit_disciplina', 'COD_TEMPO':'tempo', 'QTD_FALTAS':'faltas', 'NOTA_MEDIA':'nota', 'SEMESTRE_ATUAL':'semestre', 'IDADE':'age', 'COD_CURSO':'curso'}
dados = dados.rename(columns = renomear)
dados.head()
trocar = {1:0,2:0,3:0,4:0,5:0,6:1,7:1,8:1,9:1,10:1,11:1,12:1,13:1,14:1}
dados['EVADIDO'] = dados.evasao.map(trocar)
x = dados[['aluno', 'disciplina', 'financeiro', 'sit_disciplina', 'tempo', 'faltas', 'nota', 'semestre', 'age', 'curso']]
y = dados['EVADIDO']
SEED = 20
treino_x, teste_x, treino_y, teste_y = train_test_split(x, y, random_state = SEED, test_size = 0.25, stratify = y)
print("Treinaremos com %d elementos e testaremos com %d elementos " % (len(treino_x), len(teste_x)))
modelo = LinearSVC()
modelo.fit(treino_x, treino_y)
previsoes = modelo.predict(teste_x)
acuracia = accuracy_score(teste_y, previsoes) * 100
print("A acurácia foi %.2f" % acuracia)
mas me devolve o erro:
Traceback (most recent call last):
File "C:/Users/Wtorres/Documents/MachineLearningTCC/Main.py", line 42, in <module>
modelo.fit(treino_x, treino_y)
File "C:\Users\Wtorres\anaconda3\envs\MachineLearningTCC\lib\site-packages\sklearn\svm\_classes.py", line 227, in fit
X, y = self._validate_data(X, y, accept_sparse='csr',
File "C:\Users\Wtorres\anaconda3\envs\MachineLearningTCC\lib\site-packages\sklearn\base.py", line 432, in _validate_data
X, y = check_X_y(X, y, **check_params)
File "C:\Users\Wtorres\anaconda3\envs\MachineLearningTCC\lib\site-packages\sklearn\utils\validation.py", line 73, in inner_f
return f(**kwargs)
File "C:\Users\Wtorres\anaconda3\envs\MachineLearningTCC\lib\site-packages\sklearn\utils\validation.py", line 796, in check_X_y
X = check_array(X, accept_sparse=accept_sparse,
File "C:\Users\Wtorres\anaconda3\envs\MachineLearningTCC\lib\site-packages\sklearn\utils\validation.py", line 73, in inner_f
return f(**kwargs)
File "C:\Users\Wtorres\anaconda3\envs\MachineLearningTCC\lib\site-packages\sklearn\utils\validation.py", line 599, in check_array
array = np.asarray(array, order=order, dtype=dtype)
File "C:\Users\Wtorres\anaconda3\envs\MachineLearningTCC\lib\site-packages\numpy\core\_asarray.py", line 83, in asarray
return array(a, dtype, copy=False, order=order)
ValueError: could not convert string to float: 'A'
Process finished with exit code 1
sabe me dizer como posso resolver?, as colunas do banco de dados são exatamente essas que estao na variavel dados