#!_*_ coding: utf8 _*_
import pandas as pd
from collections import Counter
import numpy as np
from sklearn.cross_validation import cross_val_score
# Sample messages; they are not referenced anywhere in the visible script.
text1 = "Se eu comprar cinco anos antecipados, eu ganho algum desconto?"
text2 = "O exercicio 15 do curso de Java 1 está com a resposta errada. Pode conferir pf?"
text3 = "Existe algum curso para cuidar do marketing da minha empresa?"

# Load the labelled data and tokenise the 'email' column: lower-case each
# entry, then split it on single spaces.
classification = pd.read_csv('emails.csv')
textPure = classification['email']
textBroke = textPure.str.lower().str.split(' ')

# Vocabulary: the set of distinct tokens seen across all rows.
dictionary = set()
for lista in textBroke:
    dictionary.update(lista)
totalOfWords = len(dictionary)

# Give every vocabulary word a unique index in 0..totalOfWords-1. The order
# follows the set's iteration order, so it is arbitrary but consistent for
# the rest of this run.
tuples = list(zip(dictionary, range(totalOfWords)))
tradutor = dict(tuples)
print(totalOfWords)
def vector_text(text, tradutor):
    """Turn a sequence of tokens into a vector of per-word counts.

    Args:
        text: Iterable of tokens (e.g. the words of one message).
        tradutor: Mapping from each known word to its position in the
            resulting vector.

    Returns:
        A list with ``len(tradutor)`` integers; the entry at position
        ``tradutor[word]`` is the number of times ``word`` appears in
        ``text``. Tokens that are not keys of ``tradutor`` are ignored.
    """
    counts = [0 for _ in range(len(tradutor))]
    for word in text:
        # Unknown words have no slot in the vector, so skip them.
        if word not in tradutor:
            continue
        counts[tradutor[word]] += 1
    return counts
# One word-count vector per tokenised e-mail, plus the matching labels.
valuesOfTExt = [vector_text(text, tradutor) for text in textBroke]
marcas = classification['classificacao']

# Feature matrix and label vector.
# Fixed: X used to be built from the *function* ``vector_text`` instead of the
# computed vectors, which yields a 0-d object array and makes the slices below
# raise IndexError.
X = np.array(valuesOfTExt)
Y = np.array(marcas.tolist())

# Hold-out split: the first 80% of the rows are used for training, the rest
# for validation. The rows are taken in file order (no shuffling).
percent_of_train = 0.8
size_of_train = int(percent_of_train * len(Y))
size_of_validation = len(Y) - size_of_train
print(size_of_train)

train_data = X[0:size_of_train]
train_marcation = Y[0:size_of_train]
validation_data = X[size_of_train:]
validation_marcation = Y[size_of_train:]
def fit_and_predict(name, model, train_data, train_marcation):
    """Cross-validate ``model`` and report its mean score.

    Only ``cross_val_score`` is called here; no separately fitted model is
    kept or returned.

    Args:
        name: Label for the model, used in the printed report line.
        model: Estimator passed to ``cross_val_score``.
        train_data: Feature rows used for the cross-validation.
        train_marcation: Labels aligned with ``train_data``.

    Returns:
        The mean of the per-fold scores returned by ``cross_val_score``.
    """
    k = 10  # number of cross-validation folds
    # Fixed: the call was misspelled ``cros_val_score`` and raised NameError;
    # the name imported at the top of the file is ``cross_val_score``.
    scores = cross_val_score(model, train_data, train_marcation, cv=k)
    hit_rate = np.mean(scores)
    msg = "Taxa de acerto do {0}: {1}".format(name, hit_rate)
    # Parenthesised form prints the same single value under Python 2.
    print(msg)
    return hit_rate