In [15]:
# !pip install surprise
In [21]:
from collections import defaultdict

import pandas as pd
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise import dump
from surprise.model_selection import train_test_split
In [6]:
# data = Dataset.load_builtin('ml-100k')
학습할 데이터 읽기¶
영화 평점 데이터를 사용해 보자¶
무비렌즈 데이터 다운로드: https://files.grouplens.org/datasets/movielens/ml-latest-small.zip¶
In [10]:
# Read the ratings CSV into a DataFrame (the path is relative to the user's home directory).
rateingsDf = pd.read_csv(filepath_or_buffer='~/SVD/ratings.csv')
In [16]:
# Preview the first five rows of the ratings table.
rateingsDf.head(n=5)
Out[16]:
학습하기¶
In [12]:
### Format the training data from the DataFrame
# Dataset.load_from_df reads the columns positionally as (user id, item id, rating),
# so select them once, in that order.
DfOrgData = rateingsDf[['userId', 'movieId', 'rating']]
r_min = DfOrgData['rating'].min()
r_max = DfOrgData['rating'].max()
# The rating scale is taken from the observed minimum and maximum ratings.
reader = Reader(rating_scale=(r_min, r_max))
data = Dataset.load_from_df(DfOrgData, reader)

# Hold out 25% of the ratings for evaluation. Building the testset from the
# trainset itself would score the model on the very ratings it was fitted on
# and report an in-sample error instead of a generalisation error.
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

# Fixed seed so that a fresh kernel run reproduces the same model.
algo = SVD(random_state=42)
### Train SVD on the trainset
algo.fit(trainset)
Out[12]:
결과 평가하기¶
In [17]:
### Measure RMSE on the testset
# Score every entry of `testset` with the fitted baseline model ...
predictions = algo.test(testset)
# ... and report the root-mean-squared error of those predictions.
accuracy.rmse(predictions, verbose=True)
Out[17]:
그리드 탐색을 통한 최적의 파라미터 출력¶
In [44]:
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate

# Candidate hyper-parameters; the search evaluates every combination (2 x 2 x 2 = 8).
# NOTE(review): lr_all=0.5 is about 100x SVD's default learning rate (0.005) --
# confirm it is intended and not a typo for 0.005.
param_grid = {
    'n_factors': [50, 75],
    'lr_all': [0.5, 0.05],
    'reg_all': [0.06, 0.04],
}

# Cross-validated search over the grid, scored by RMSE.
gs = GridSearchCV(SVD, param_grid, measures=['RMSE'])
gs.fit(data)

# Report the best score and the parameter combination that produced it.
print('\n###################')
print('Best Score :', gs.best_score['rmse'])
print('Best Parameters :', gs.best_params['rmse'])
print('#####################')
최적의 파라미터로 최종 모델 생성¶
In [47]:
# Keep the parameter combination the grid search selected for the RMSE measure.
best_params = gs.best_params["rmse"]
In [66]:
### Build the final model from the selected hyper-parameters
# Only the three tuned keys are forwarded to SVD; everything else keeps its default.
final_algo = SVD(**{
    key: best_params[key]
    for key in ('n_factors', 'lr_all', 'reg_all')
})

### Train SVD
final_algo.fit(trainset)

### Final RMSE measurement
predictions = final_algo.test(testset)
accuracy.rmse(predictions, verbose=True)
Out[66]:
모든 결과값 확인¶
In [67]:
# Tabulate the predictions, one row per entry of `predictions`.
pd.DataFrame(data=predictions)
Out[67]:
user 1, item 2에 대한 예측값 확인하기¶
In [33]:
userid = 1
movieid = 2
# Query the tuned model (the one built from the grid-search parameters) rather
# than the untuned baseline. predict() takes the raw user/item ids directly, so
# no placeholder "true" rating has to be invented just to get an estimate.
final_algo.predict(userid, movieid)
Out[33]:
상위 N개만 출력하기¶
In [19]:
def get_top_n(predictions, n=10):
    """Return each user's ``n`` highest-estimated items.

    Args:
        predictions: Iterable of 5-item records, unpacked as
            (user id, item id, true rating, estimated rating, details).
        n: Number of leading entries kept per user after ranking (default 10).

    Returns:
        A ``defaultdict(list)`` mapping each user id to a list of
        ``(item id, estimate)`` tuples ordered by estimate, highest first.
        Entries with equal estimates keep their input order.
    """
    # Group the (item, estimate) pairs by user.
    per_user = defaultdict(list)
    for user_id, item_id, _true_rating, estimate, _details in predictions:
        per_user[user_id].append((item_id, estimate))

    # Rank every user's pairs by estimate and keep only the first n.
    for user_id in list(per_user):
        ranked = sorted(per_user[user_id], key=lambda pair: pair[1], reverse=True)
        per_user[user_id] = ranked[:n]

    return per_user
In [37]:
# Keep the five highest-estimated items for every user in `predictions`.
topNresults = get_top_n(predictions, n=5)
In [42]:
# Show the top-N (item id, estimate) list stored under user id 1.
topNresults[1]
Out[42]:
In [43]:
# Show the top-N (item id, estimate) list stored under user id 2.
topNresults[2]
Out[43]: