一.准备

1. 评分预测公式


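预测分数 = (用户相似度 × 相似用户对电影的评分)之和 / 相似度之和,即:

$$\hat{r}_{u,i} = \frac{\sum_{v \in N(u,i)} \mathrm{sim}(u,v)\, r_{v,i}}{\sum_{v \in N(u,i)} \mathrm{sim}(u,v)}$$

其中 $N(u,i)$ 为与用户 $u$ 正相关且对电影 $i$ 有评分的用户集合。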

2. 数据集

movielens数据集 中 ml-latest-small.zip
地址:
https://grouplens.org/datasets/movielens/

二.算法实现

1. 加载数据集

def load_data(data_path):
    """Load the ratings CSV and return it as a user x movie rating matrix.

    The pivoted matrix is cached as a pickle named ``ratings_matrix.cache``
    inside the module-level ``cache_dir``. When that cache file exists it is
    returned as-is and ``data_path`` is not read at all.

    Args:
        data_path: Path to the ratings CSV (for example the ``ratings.csv``
            file of the ml-latest-small dataset). Only the first three
            columns are read; they are parsed as ``userId``, ``movieId``
            and ``rating``.

    Returns:
        A DataFrame indexed by ``userId`` with one column per ``movieId``;
        cells without a rating are NaN.
    """
    cache_path = os.path.join(cache_dir, 'ratings_matrix.cache')
    print('开始加载数据集...')

    if os.path.exists(cache_path):
        print('加载缓存中')
        # NOTE(review): unpickling runs arbitrary code, so the cache file
        # must only ever come from a trusted location.
        ratings_matrix = pd.read_pickle(cache_path)
        print("从缓存加载数据结束")
        return ratings_matrix

    # No cache yet: build the matrix from the raw CSV.
    print("加载新数据中...")
    dtype = {'userId': np.int32, 'movieId': np.int32, 'rating': np.float32}
    ratings = pd.read_csv(data_path, dtype=dtype, usecols=range(3))
    # Pivot the long (user, movie, rating) table into a wide matrix.
    ratings_matrix = ratings.pivot_table(
        index=['userId'], columns=['movieId'], values="rating"
    )

    # Make sure the cache directory exists; otherwise to_pickle would fail
    # after all the work above has already been done.
    os.makedirs(os.path.dirname(cache_path) or '.', exist_ok=True)
    ratings_matrix.to_pickle(cache_path)
    print("加载数据结束")
    return ratings_matrix

2. 使用皮尔逊相关系数计算用户相似度

def compute_pearson_similarity(ratings_matrix, based='user'):
    """Compute (or load from cache) a pairwise correlation matrix.

    The result is cached as a pickle inside the module-level ``cache_dir``
    (``user_similarity.cache`` or ``item_similarity.cache``). When the cache
    file exists it is returned directly and ``ratings_matrix`` is not used.

    Args:
        ratings_matrix: DataFrame with users as rows and items as columns.
        based: ``'user'`` to correlate users with each other (rows), or
            ``'item'`` to correlate items with each other (columns).

    Returns:
        A square DataFrame of correlation coefficients as produced by
        ``DataFrame.corr()``.

    Raises:
        ValueError: If ``based`` is neither ``'user'`` nor ``'item'``.
    """
    # Validate first: the previous implementation only printed a message and
    # then crashed with UnboundLocalError on the return statement.
    if based == 'user':
        cache_name = 'user_similarity.cache'
    elif based == 'item':
        cache_name = 'item_similarity.cache'
    else:
        raise ValueError("传入based 值谬误: %r" % (based,))

    cache_path = os.path.join(cache_dir, cache_name)
    if os.path.exists(cache_path):
        # NOTE(review): unpickling runs arbitrary code, so the cache file
        # must only ever come from a trusted location.
        return pd.read_pickle(cache_path)

    # corr() correlates columns, so transpose to compare users.
    if based == 'user':
        similarity = ratings_matrix.T.corr()
    else:
        similarity = ratings_matrix.corr()

    # Make sure the cache directory exists before writing into it.
    os.makedirs(os.path.dirname(cache_path) or '.', exist_ok=True)
    similarity.to_pickle(cache_path)
    return similarity

3.预测算法

def predict(uid, iid, ratings_matrix, user_similar):
    """Predict the rating user ``uid`` would give item ``iid``.

    The prediction is the similarity-weighted average of the ratings given
    to ``iid`` by the users that are positively correlated with ``uid``:
    ``sum(similarity * rating) / sum(similarity)``.

    Args:
        uid: Id of the target user (a label of ``user_similar``).
        iid: Id of the target item (a column of ``ratings_matrix``).
        ratings_matrix: DataFrame with users as rows and items as columns;
            NaN marks a missing rating.
        user_similar: Square user-by-user similarity DataFrame.

    Returns:
        The predicted rating rounded to two decimals.

    Raises:
        Exception: If ``uid`` has no positively correlated user, or none of
            those users has rated ``iid``.
    """
    # Similarity of every other user to uid; NaN entries are dropped.
    similar_users = user_similar[uid].drop([uid]).dropna()
    # Keep positively correlated users only.
    similar_users = similar_users[similar_users > 0]
    if similar_users.empty:
        raise Exception("用户<%d>没有类似的用户" % uid)

    # Restrict the neighbours to those who actually rated iid.
    item_ratings = ratings_matrix[iid].dropna()
    common_ids = similar_users.index.intersection(item_ratings.index)
    if common_ids.empty:
        # Previously this fell through to 0 / 0 and surfaced as an opaque
        # ZeroDivisionError.
        raise Exception(
            "用户<%d>的相似用户中没有人对电影<%d>评过分" % (uid, iid)
        )

    weights = similar_users.loc[common_ids]
    ratings = item_ratings.loc[common_ids]
    sum_up = (weights * ratings).sum()
    sum_down = weights.sum()

    predict_rating = sum_up / sum_down
    print("预测出用户<%d>对电影<%d>的评分:%0.2f" % (uid, iid, predict_rating))
    return round(predict_rating, 2)

4. 对用户所有电影进行评分预测

def _predict_all(uid, item_ids, ratings_matrix, user_similar):
    """Yield ``(uid, iid, rating)`` for every item that can be predicted.

    Items for which ``predict`` raises are reported with ``print`` and
    skipped, so one unpredictable item does not abort the whole run.
    """
    for iid in item_ids:
        try:
            rating = predict(uid, iid, ratings_matrix, user_similar)
        except Exception as e:
            # Deliberate best effort: report and move on to the next item.
            print(e)
        else:
            yield uid, iid, rating


def _select_item_ids(uid, rating_matrix, filter_rule):
    """Return the item ids that remain after applying ``filter_rule``."""
    known_rules = {"unhot", "rated"}

    if not filter_rule:
        # No filtering requested: every item is a candidate.
        return rating_matrix.columns

    if isinstance(filter_rule, str):
        rules = {filter_rule}
    elif isinstance(filter_rule, (list, tuple, set, frozenset)):
        rules = set(filter_rule)
    else:
        raise ValueError("无效的过滤参数: %r" % (filter_rule,))
    if not rules <= known_rules:
        raise ValueError("无效的过滤参数: %r" % (filter_rule,))

    keep = None
    if "unhot" in rules:
        # Keep only items that received more than 10 ratings.
        keep = rating_matrix.count() > 10
    if "rated" in rules:
        # Keep only items the user has not rated yet (NaN in the user row).
        unrated = rating_matrix.loc[uid].isna()
        keep = unrated if keep is None else keep & unrated
    return keep[keep].index


def predict_all(uid, rating_matrix, user_similar, filter_rule=None):
    """Predict ratings of user ``uid`` for all items passing ``filter_rule``.

    This is a generator: nothing, including validation of ``filter_rule``,
    happens until it is iterated.

    Args:
        uid: Id of the target user.
        rating_matrix: DataFrame with users as rows and items as columns.
        user_similar: Square user-by-user similarity DataFrame.
        filter_rule: ``None`` (or any falsy value) for no filtering,
            ``'unhot'`` to drop items with 10 or fewer ratings, ``'rated'``
            to drop items the user already rated, or a list/tuple/set
            combining those rule names.

    Yields:
        ``(uid, iid, rating)`` tuples for every item that could be predicted.

    Raises:
        ValueError: If ``filter_rule`` is not one of the supported values.
    """
    item_ids = _select_item_ids(uid, rating_matrix, filter_rule)
    yield from _predict_all(uid, item_ids, rating_matrix, user_similar)

5. 返回 K 个推荐结果

def top_k_rs_result(K, uid=1,
                    file_path=r'E:\RecommendData\ml-latest-small\ratings.csv'):
    """Return the ``K`` highest predicted ratings for one user.

    Loads the rating matrix, computes user-user similarity, predicts ratings
    with both the ``'unhot'`` and ``'rated'`` filters applied, and keeps the
    ``K`` best predictions.

    Args:
        K: Number of recommendations to return.
        uid: Id of the user to recommend for. Defaults to ``1``, the value
            that used to be hard-coded.
        file_path: Path to the ratings CSV. Defaults to the location that
            used to be hard-coded.

    Returns:
        A list of at most ``K`` ``(uid, movie_id, predicted_rating)`` tuples
        sorted by predicted rating, highest first.
    """
    ratings_matrix = load_data(file_path)
    user_similarity = compute_pearson_similarity(ratings_matrix, based='user')
    results = predict_all(
        uid, ratings_matrix, user_similarity, filter_rule=["unhot", "rated"]
    )
    return sorted(results, key=lambda x: x[2], reverse=True)[:K]

三.预测结果

[(1, 1041,  4.76), (1, 714,   4.72), (1, 80906, 4.7), (1, 1235,  4.63), (1, 3030,  4.63), (1, 65261, 4.63), (1, 1178,  4.57), (1, 1217,  4.56), (1, 318,   4.55), (1, 1104,  4.55), (1, 3451,  4.55), (1, 280,   4.54), (1, 168252, 4.52), (1, 3246,   4.5), (1, 58,     4.49), (1, 290,    4.49), (1, 115569, 4.49), (1, 1243,   4.48), (1, 142488, 4.47), (1, 800,    4.45)]