I. Preparation
1. Rating prediction formula
Predicted rating = sum(similarity of each similar user × that user's rating for the movie) / sum of those similarities
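For example (made-up numbers, purely for illustration): three similar users with similarities 0.9, 0.6 and 0.3 who rated the movie 5, 4 and 3 give a predicted rating of (0.9*5 + 0.6*4 + 0.3*3) / (0.9 + 0.6 + 0.3) = 7.8 / 1.8 ≈ 4.33.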
2. Dataset
ml-latest-small.zip from the MovieLens datasets
URL:
https://grouplens.org/dataset...
II. Algorithm Implementation
1. Load the dataset
import os

import numpy as np
import pandas as pd

# Cache directory for intermediate results. The original post does not show where
# cache_dir is defined; point it at any writable directory.
cache_dir = r'E:\RecommendData\cache'
os.makedirs(cache_dir, exist_ok=True)


def load_data(data_path):
    r'''Load the ratings data.
    data_path is the path to the dataset, e.g.
    file_path = r'E:\RecommendData\ml-latest-small\ratings.csv'
    '''
    cache_path = os.path.join(cache_dir, 'ratings_matrix.cache')
    print('Loading dataset...')
    if os.path.exists(cache_path):
        print('Loading from cache')
        ratings_matrix = pd.read_pickle(cache_path)
        print('Finished loading data from cache')
    else:
        # Load the data from the raw dataset
        print('Loading fresh data...')
        dtype = {'userId': np.int32, 'movieId': np.int32, 'rating': np.float32}
        # Read only the first three columns (userId, movieId, rating) of the csv file
        ratings = pd.read_csv(data_path, dtype=dtype, usecols=range(3))
        # Pivot into a matrix with userId as the index and movieId as the columns
        ratings_matrix = ratings.pivot_table(index=['userId'], columns=['movieId'], values='rating')
        # Save the matrix to the cache file
        ratings_matrix.to_pickle(cache_path)
        print('Finished loading data')
    return ratings_matrix
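A minimal usage sketch (the path matches the one in the docstring above; adjust it to your own environment):

file_path = r'E:\RecommendData\ml-latest-small\ratings.csv'
ratings_matrix = load_data(file_path)
# A sparse user-by-movie matrix; for ml-latest-small roughly 610 users x 9,700 movies,
# mostly NaN because each user rates only a small fraction of the catalogue
print(ratings_matrix.shape)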
2. Compute user similarity with the Pearson correlation coefficient
def compute_pearson_similarity(ratings_matrix, based='user'):
    '''Compute Pearson correlation similarities between users or between items.'''
    user_similarity_cache_path = os.path.join(cache_dir, 'user_similarity.cache')
    item_similarity_cache_path = os.path.join(cache_dir, 'item_similarity.cache')
    if based == 'user':
        # User-user similarity
        if os.path.exists(user_similarity_cache_path):
            similarity = pd.read_pickle(user_similarity_cache_path)
        else:
            # corr() works column-wise, so transpose to correlate users
            similarity = ratings_matrix.T.corr()
            # Save the user similarity matrix to the cache
            similarity.to_pickle(user_similarity_cache_path)
    elif based == 'item':
        # Item-item similarity
        if os.path.exists(item_similarity_cache_path):
            # The item similarity cache already exists, read it
            similarity = pd.read_pickle(item_similarity_cache_path)
        else:
            # No item similarity cache yet: compute it and save it
            similarity = ratings_matrix.corr()
            # Save the item similarity matrix to the cache
            similarity.to_pickle(item_similarity_cache_path)
    else:
        raise ValueError("invalid value for 'based', expected 'user' or 'item'")
    return similarity
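A short usage sketch, reusing ratings_matrix from the previous step. The returned object is a square DataFrame whose rows and columns are userIds (or movieIds) and whose cells hold the pairwise Pearson correlations:

user_similar = compute_pearson_similarity(ratings_matrix, based='user')
item_similar = compute_pearson_similarity(ratings_matrix, based='item')
# Users most similar to user 1, excluding user 1 itself
print(user_similar[1].drop(1).sort_values(ascending=False).head())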
3. Prediction algorithm
def predict(uid, iid, ratings_matrix, user_similar):
    '''Predict the rating user uid would give to movie iid.'''
    # Users similar to uid (drop uid itself and users with no correlation value)
    similar_users = user_similar[uid].drop([uid]).dropna()
    # Keep only positively correlated users
    similar_users = similar_users.where(similar_users > 0).dropna()
    if similar_users.empty:
        raise Exception("user <%d> has no similar users" % uid)
    # Among uid's similar users, keep those who have actually rated movie iid
    ids = set(ratings_matrix[iid].dropna().index) & set(similar_users.index)
    # Get those users together with their similarity to uid
    finally_similar_users = similar_users.loc[list(ids)]
    sum_up = 0    # numerator: sum of similarity * rating
    sum_down = 0  # denominator: sum of similarities
    # Loop over every remaining similar user
    for sim_uid, similarity in finally_similar_users.items():
        # All movies this similar user has rated
        sim_user_rated_movies = ratings_matrix.loc[sim_uid].dropna()
        # This similar user's rating for the target movie
        sim_user_rating_for_item = sim_user_rated_movies[iid]
        # Accumulate similarity * rating
        sum_up += similarity * sim_user_rating_for_item
        # Accumulate the similarity itself
        sum_down += similarity
    # Predicted rating = sum(similarity * rating) / sum(similarity)
    predict_rating = sum_up / sum_down
    print("predicted rating of user <%d> for movie <%d>: %0.2f" % (uid, iid, predict_rating))
    return round(predict_rating, 2)
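For example, assuming ratings_matrix and user_similar from the previous steps are already in memory, predicting user 1's rating for movie 1 looks like this (the specific ids are only an illustration):

# Prints "predicted rating of user <1> for movie <1>: ..." and returns the rounded value
rating = predict(1, 1, ratings_matrix, user_similar)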
4. Predict ratings for all of a user's movies
def _predict_all(uid, item_ids, ratings_matrix, user_similar):
    '''Predict ratings for user uid over every movie in item_ids.'''
    for iid in item_ids:
        try:
            # Predict this user's rating for this movie
            rating = predict(uid, iid, ratings_matrix, user_similar)
        except Exception as e:
            print(e)
        else:
            yield uid, iid, rating


def predict_all(uid, rating_matrix, user_similar, filter_rule=None):
    '''Predict all ratings, pre-filtering the candidate movies according to filter_rule.'''
    if not filter_rule:
        # No filtering: consider every movie
        item_ids = rating_matrix.columns
    elif isinstance(filter_rule, str) and filter_rule == 'unhot':
        # Filter out unpopular movies
        # Count how many ratings each movie has received
        count = rating_matrix.count()
        # Keep movies rated more than 10 times as "popular" movies
        item_ids = count.where(count > 10).dropna().index
    elif isinstance(filter_rule, str) and filter_rule == 'rated':
        # Filter out movies the user has already rated
        # The user's ratings for all movies
        user_ratings = rating_matrix.loc[uid]
        # The rating scale is 1-5, so any value below 6 means the movie was rated;
        # everything else (NaN) is unrated
        _ = user_ratings < 6
        item_ids = _.where(_ == False).dropna().index
    elif isinstance(filter_rule, list) and set(filter_rule) == set(["unhot", "rated"]):
        # Apply both filters: popular movies the user has not rated yet
        count = rating_matrix.count()
        ids1 = count.where(count > 10).dropna().index
        user_ratings = rating_matrix.loc[uid]
        _ = user_ratings < 6
        ids2 = _.where(_ == False).dropna().index
        item_ids = set(ids1) & set(ids2)
    else:
        raise Exception("invalid filter_rule")
    yield from _predict_all(uid, item_ids, rating_matrix, user_similar)
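Because predict_all is a generator (it uses yield from), predictions are produced lazily and the caller decides how many to materialize. A small consumption sketch under the same assumptions as above:

# Predict ratings for user 1 over the "popular" movies only (hypothetical driver code)
for uid, iid, rating in predict_all(1, ratings_matrix, user_similar, filter_rule='unhot'):
    print("user %d, movie %d, predicted rating %.2f" % (uid, iid, rating))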
5. Return the top K recommendations
def top_k_rs_result(K):
    '''Return the K highest predicted ratings for user 1 as (uid, iid, rating) tuples.'''
    file_path = r'E:\RecommendData\ml-latest-small\ratings.csv'
    ratings_matrix = load_data(file_path)
    user_similarity = compute_pearson_similarity(ratings_matrix, based='user')
    results = predict_all(1, ratings_matrix, user_similarity, filter_rule=["unhot", "rated"])
    return sorted(results, key=lambda x: x[2], reverse=True)[:K]
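The list in the next section appears to come from a driver call along these lines (K=20 matches the number of tuples shown; the __main__ guard is an assumption, not part of the original post):

if __name__ == '__main__':
    print(top_k_rs_result(20))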
III. Prediction Results
[(1, 1041, 4.76), (1, 714, 4.72), (1, 80906, 4.7), (1, 1235, 4.63), (1, 3030, 4.63), (1, 65261, 4.63), (1, 1178, 4.57), (1, 1217, 4.56), (1, 318, 4.55), (1, 1104, 4.55), (1, 3451, 4.55), (1, 280, 4.54), (1, 168252, 4.52), (1, 3246, 4.5), (1, 58, 4.49), (1, 290, 4.49), (1, 115569, 4.49), (1, 1243, 4.48), (1, 142488, 4.47), (1, 800, 4.45)]