import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors
# Data
# Load the raw listening log, parse the "last_played" column into datetimes
# and discard rows that exactly repeat an earlier row.
df = pd.read_csv("user_data.csv")
df = df.assign(last_played=pd.to_datetime(df["last_played"])).drop_duplicates()
# User × song matrix
# Wide play-count table: one row per user_id, one column per song_id.
# User/song pairs that never occur in df are filled with 0.
# NOTE: repeated (user_id, song_id) rows are combined with pivot_table's
# default aggregation (mean), not summed.
matriz = pd.pivot_table(
    df,
    values="play_count",
    index="user_id",
    columns="song_id",
    fill_value=0,
)
# Item-based: song × user
# Transpose so that each row describes one song across all users, then scale
# every song row to unit L2 length (axis=1 normalises row by row).
item_user = matriz.transpose()
item_user_norm = normalize(item_user, norm="l2", axis=1)
# Model
# Brute-force cosine nearest-neighbour index over the normalised song rows.
# fit() returns the estimator itself, so construction and fitting are chained.
modelo = NearestNeighbors(
    n_neighbors=5,
    algorithm="brute",
    metric="cosine",
).fit(item_user_norm)
# Recommendation
def recomendar_para_usuario(user_id, k=3, n_neighbors=5):
    """Recommend up to ``k`` songs that the user has not played yet.

    Every song the user has played (play count > 0 in ``matriz``) is looked
    up in the fitted item-based model ``modelo``. Each neighbouring song the
    user has not played accumulates the cosine similarity ``1 - distance``
    over all played songs, and the ``k`` best-scoring songs are returned.

    Args:
        user_id: Row label of the user in ``matriz``.
        k: Maximum number of songs to return.
        n_neighbors: Neighbours requested per played song. The played song is
            part of the fitted data, so it usually occupies one of these
            slots itself. Values larger than the number of fitted songs are
            clamped to that number.

    Returns:
        A list of song ids ordered by descending score. The list is empty
        when the user is unknown, has no played songs, or no unplayed song
        shows up among the neighbours.
    """
    if user_id not in matriz.index:
        return []

    plays = matriz.loc[user_id]
    ouvidas = plays[plays > 0].index.tolist()
    if not ouvidas:
        # Nothing to query with; also avoids sending an empty batch below.
        return []

    # kneighbors raises ValueError when asked for more neighbours than there
    # are fitted samples, so cap the request at the number of songs.
    n_vizinhos = min(n_neighbors, item_user_norm.shape[0])

    # One batched query for all played songs instead of one call per song.
    linhas = item_user.index.get_indexer(ouvidas)
    dist, ind = modelo.kneighbors(item_user_norm[linhas], n_neighbors=n_vizinhos)

    ja_ouvidas = set(ouvidas)  # O(1) membership inside the scoring loop
    scores = {}
    # ravel() walks the results row by row, i.e. played song by played song.
    for d, i in zip(dist.ravel(), ind.ravel()):
        musica = item_user.index[i]
        if musica not in ja_ouvidas:
            scores[musica] = scores.get(musica, 0) + (1 - d)

    # sorted() is stable, so equal scores keep their first-seen order.
    return sorted(scores, key=scores.get, reverse=True)[:k]
# Simple validation: hide the most-played song
def _recomendar_por_vizinhos(item_norm, item_ids, ouvidas, k, n_neighbors):
    """Rank unplayed songs by summed cosine similarity to the played ones.

    Args:
        item_norm: L2-normalised song × user array, one row per song.
        item_ids: pandas Index of song ids aligned with the rows of
            ``item_norm``.
        ouvidas: Non-empty list of song ids the user has played.
        k: Maximum number of songs to return.
        n_neighbors: Neighbours requested per played song; clamped to the
            number of songs.

    Returns:
        Up to ``k`` song ids, best score first.
    """
    # kneighbors raises ValueError when asked for more neighbours than there
    # are fitted samples, so cap the request at the number of songs.
    n_vizinhos = min(n_neighbors, item_norm.shape[0])
    modelo_tmp = NearestNeighbors(
        metric="cosine", algorithm="brute", n_neighbors=n_vizinhos
    )
    modelo_tmp.fit(item_norm)

    # One batched query for all played songs instead of one call per song.
    linhas = item_ids.get_indexer(ouvidas)
    dist, ind = modelo_tmp.kneighbors(item_norm[linhas], n_neighbors=n_vizinhos)

    ja_ouvidas = set(ouvidas)  # O(1) membership inside the scoring loop
    scores = {}
    # ravel() walks the results row by row, i.e. played song by played song.
    for d, i in zip(dist.ravel(), ind.ravel()):
        musica = item_ids[i]
        if musica not in ja_ouvidas:
            scores[musica] = scores.get(musica, 0) + (1 - d)

    # sorted() is stable, so equal scores keep their first-seen order.
    return sorted(scores, key=scores.get, reverse=True)[:k]


def hit_rate_at_k(matriz, k=3, n_neighbors=5):
    """Leave-one-out hit rate of the item-based recommender.

    For every user with at least two played songs, the user's most-played
    song is hidden (its play count is set to 0 in a copy of ``matriz``), the
    item model is rebuilt from that copy, and the top-``k`` recommendations
    are computed from the remaining played songs. A hit is counted when the
    hidden song is among them.

    Args:
        matriz: User × song play-count DataFrame (users as index, songs as
            columns).
        k: Number of recommendations considered per user.
        n_neighbors: Neighbours requested per played song; values larger
            than the number of songs are clamped.

    Returns:
        Hits divided by the number of evaluated users, as a float; ``0.0``
        when no user has at least two played songs.
    """
    hits = 0
    total = 0
    for user_id in matriz.index:
        plays = matriz.loc[user_id]
        ouvidas = plays[plays > 0]
        if len(ouvidas) < 2:
            # Hiding the only played song would leave nothing to query with.
            continue

        oculta = ouvidas.idxmax()

        # Rebuild the song × user matrix with the hidden play removed.
        matriz_tmp = matriz.copy()
        matriz_tmp.loc[user_id, oculta] = 0
        item_tmp = matriz_tmp.T
        item_tmp_norm = normalize(item_tmp, norm="l2")

        # Songs still visible for this user: everything played except the
        # hidden one.
        ouvidas_tmp = [song_id for song_id in ouvidas.index if song_id != oculta]

        recs = _recomendar_por_vizinhos(
            item_tmp_norm, item_tmp.index, ouvidas_tmp, k, n_neighbors
        )
        hits += int(oculta in recs)
        total += 1

    return hits / total if total else 0.0
# Tests
# Smoke run: show the recommendations for user 1, then the leave-one-out
# metrics for the whole matrix.
print("Recomendações usuário 1:", recomendar_para_usuario(user_id=1, k=3))

hr = hit_rate_at_k(matriz=matriz, k=3, n_neighbors=5)
print("Hit Rate@3:", hr)
# Each evaluated user has exactly one hidden song, so precision@3 is the hit
# rate divided by the 3 recommended slots.
print("Precision@3:", hr / 3)