import json
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors
# =========================
# MLflow configuration
# =========================
# Runs are stored in the SQLite tracking store named by the URI below and
# grouped under one experiment name.
_TRACKING_URI = "sqlite:///mlruns.db"
_EXPERIMENT_NAME = "playcatch-recsys"

mlflow.set_tracking_uri(_TRACKING_URI)
mlflow.set_experiment(_EXPERIMENT_NAME)
# =========================
# Data
# =========================
# Load the raw interaction log, parse the timestamp column and drop rows that
# are exact duplicates of each other.
df = (
    pd.read_csv("user_data.csv")
    .assign(last_played=lambda frame: pd.to_datetime(frame["last_played"]))
    .drop_duplicates()
)

# User x song table of play counts; pairs with no interaction are filled with 0.
# NOTE: no aggfunc is given, so repeated (user, song) rows are combined with
# pivot_table's default aggregation.
matriz = df.pivot_table(
    values="play_count",
    index="user_id",
    columns="song_id",
    fill_value=0,
)
# =========================
# Training / validation function
# =========================
def treinar_avaliar(n_neighbors=5, k=3, norm="l2"):
    """Fit an item-based cosine KNN recommender and score it with Hit Rate@k.

    Reads the module-level ``matriz`` (user x song play-count table) and ``df``
    (raw interactions). As a side effect it writes four artifact files to the
    current working directory: ``top_songs.csv``, ``sample_recs.json``,
    ``song_to_idx.json`` and ``matrix_shape.json``.

    Args:
        n_neighbors: Number of nearest songs retrieved for each seed song.
            Clamped to the number of songs when querying, because
            ``kneighbors`` rejects requests larger than the fitted sample.
        k: Number of recommendations produced per user; also the cut-off of
            the hit-rate metric.
        norm: Row norm forwarded to ``sklearn.preprocessing.normalize``.

    Returns:
        ``(knn, hit_rate)``: the fitted ``NearestNeighbors`` estimator and the
        fraction of evaluated users whose held-out song appears in their
        top-``k`` recommendations (``0.0`` when no user could be evaluated).
    """
    item_user = matriz.T
    item_user_norm = normalize(item_user, norm=norm)
    knn = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=n_neighbors)
    knn.fit(item_user_norm)

    songs = item_user.index
    # tolist() yields built-in Python scalars, so the recommendations stay
    # JSON-serialisable even when song ids are integers (NumPy scalars are not).
    song_labels = songs.tolist()

    # Neighbours depend only on the song, not on the user: query every song
    # once up front instead of once per (user, song) pair.
    n_query = min(n_neighbors, len(song_labels))
    dists, inds = knn.kneighbors(item_user_norm, n_neighbors=n_query)

    def recomendar_para_usuario(user_id, top_k=k, ocultas=()):
        """Return up to ``top_k`` unheard songs ranked by summed cosine similarity.

        Songs in ``ocultas`` are treated as if the user had never played them:
        they are not used as seeds and are eligible to be recommended.
        """
        if user_id not in matriz.index:
            return []
        plays = matriz.loc[user_id]
        hidden = set(ocultas)
        # Keep seeds as a list (stable matrix order, so ties are broken
        # deterministically) and use a set only for membership tests.
        seeds = [song for song in plays[plays > 0].index.tolist() if song not in hidden]
        known = set(seeds)
        scores = {}
        for song_id in seeds:
            pos = songs.get_loc(song_id)
            for d, i in zip(dists[pos], inds[pos]):
                musica = song_labels[i]
                # Also drops the seed itself, which is its own nearest neighbour.
                if musica not in known:
                    scores[musica] = scores.get(musica, 0) + (1 - d)
        ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
        return [m for m, _ in ranked]

    # Simple Hit Rate@k: hold out each user's most-played song and check
    # whether it is recommended back. The held-out song must be passed as
    # hidden, otherwise the "already listened" filter removes it and the
    # metric is always 0.
    # NOTE: the model itself is still fitted on the full matrix, so the item
    # vectors include the held-out play; this is an optimistic estimate.
    hits, total = 0, 0
    for user_id in matriz.index:
        plays = matriz.loc[user_id]
        ouvidas = plays[plays > 0]
        if len(ouvidas) < 2:
            # Nothing would be left to seed the recommender with.
            continue
        oculta = ouvidas.idxmax()
        recs = recomendar_para_usuario(user_id, top_k=k, ocultas=(oculta,))
        hits += int(oculta in recs)
        total += 1
    hit_rate = hits / total if total else 0.0

    # Artifacts (fixed file names in the working directory; each call overwrites them).
    top_songs = (
        df.groupby("song_id")["play_count"]
        .sum()
        .sort_values(ascending=False)
        .reset_index()
    )
    top_songs.to_csv("top_songs.csv", index=False)

    sample_recs = {
        str(u): recomendar_para_usuario(u, top_k=k) for u in matriz.index[:3].tolist()
    }
    with open("sample_recs.json", "w", encoding="utf-8") as f:
        json.dump(sample_recs, f)

    song_to_idx = {str(song): i for i, song in enumerate(song_labels)}
    with open("song_to_idx.json", "w", encoding="utf-8") as f:
        json.dump(song_to_idx, f)

    with open("matrix_shape.json", "w", encoding="utf-8") as f:
        json.dump({"shape": list(matriz.shape)}, f)

    return knn, hit_rate
# =========================
# Multiple runs
# =========================
# Grid over neighbourhood size and recommendation cut-off: one MLflow run per
# (n_neighbors, k) pair, each logging its parameters, metric, artifact files
# and the fitted model.
for n_neighbors in (3, 5, 7):
    for k in (1, 3, 5):
        with mlflow.start_run():
            knn, hr = treinar_avaliar(n_neighbors=n_neighbors, k=k, norm="l2")

            run_params = {"n_neighbors": n_neighbors, "k": k, "norm": "l2"}
            for param_name, param_value in run_params.items():
                mlflow.log_param(param_name, param_value)

            mlflow.log_metric("hit_rate_at_k", hr)

            # Files written by treinar_avaliar into the working directory.
            for artifact_path in (
                "top_songs.csv",
                "sample_recs.json",
                "song_to_idx.json",
                "matrix_shape.json",
            ):
                mlflow.log_artifact(artifact_path)

            mlflow.sklearn.log_model(knn, "model")
            print(f"Run: n_neighbors={n_neighbors}, k={k}, hit_rate={hr:.4f}")