import pandas as pd
campos = vetorizar.get_feature_names()
matriz_esparsa = pd.DataFrame.sparse.from_spmatrix(bag_of_words,columns= campos)
def classificar_texto(texto, coluna_texto, coluna_classificacao):
vetorizar = CountVectorizer(lowercase=False, max_features=50)
bag_of_words = vetorizar.fit_transform([coluna_texto])
treino, teste, classe_treino, classe_teste = train_test_split(bag_of_words,texto[coluna_classificacao],random_state=42)
regressao_logistica = LogisticRegression()
regressao_logistica.fit(treino, classe_treino)
return regressao_logistica.score(teste, classe_teste)
print(classificar_texto(resenha, "text_pt","classificacao"))
O código acima está com erro abaixo. Não consigo identificar o problema.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-45-2c6398606a43> in <module>
----> 1 print(classificar_texto(resenha, "text_pt","classificacao"))
<ipython-input-44-6beb460e9c85> in classificar_texto(texto, coluna_texto, coluna_classificacao)
7 vetorizar = CountVectorizer(lowercase=False, max_features=50)
8 bag_of_words = vetorizar.fit_transform([coluna_texto])
----> 9 treino, teste, classe_treino, classe_teste = train_test_split(bag_of_words,texto[coluna_classificacao],random_state=42)
10 regressao_logistica = LogisticRegression()
11 regressao_logistica.fit(treino, classe_treino)
/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_split.py in train_test_split(*arrays, **options)
2125 raise TypeError("Invalid parameters passed: %s" % str(options))
2126
-> 2127 arrays = indexable(*arrays)
2128
2129 n_samples = _num_samples(arrays[0])
/opt/conda/lib/python3.7/site-packages/sklearn/utils/validation.py in indexable(*iterables)
290 """
291 result = [_make_indexable(X) for X in iterables]
--> 292 check_consistent_length(*result)
293 return result
294
/opt/conda/lib/python3.7/site-packages/sklearn/utils/validation.py in check_consistent_length(*arrays)
254 if len(uniques) > 1:
255 raise ValueError("Found input variables with inconsistent numbers of"
--> 256 " samples: %r" % [int(l) for l in lengths])
257
258
ValueError: Found input variables with inconsistent numbers of samples: [1, 49459]
Estou usando o KAGGLE NOTEBOOK