共计 2185 个字符,预计需要花费 6 分钟才能阅读完成。
一. 实现思路
- 准备测试数据
- 计算物品相似度：使用 Jaccard 系数计算相似度
- 获取每个物品对应的相似物品
- 获取最终的推荐数据
二. 代码实现
1. 准备测试数据
# Toy user-item matrix: one row per user, one column per item.
# A 1 marks an item as part of that user's history, 0 means it is absent
# (the later steps treat every non-zero cell as "user already has this item").
users = ['User1', 'User2', 'User3', 'User4', 'User5']
items = ['ItemA', 'ItemB', 'ItemC', 'ItemD', 'ItemE']
datasets = [
    [1, 0, 1, 1, 0],
    [1, 0, 0, 1, 1],
    [1, 0, 1, 0, 0],
    [0, 1, 0, 1, 1],
    [1, 1, 1, 0, 1],
]
# Label rows with user names and columns with item names.
df = pd.DataFrame(datasets, columns=items, index=users)
2. 计算物品相似度
# Item-item Jaccard similarity.
# df.values.T puts one item per row (its 0/1 vector over all users), so the
# pairwise result is an items x items matrix. pairwise_distances yields the
# Jaccard *distance*, hence similarity = 1 - distance.
# The matrix is cast to bool first because 'jaccard' is a boolean metric;
# NOTE(review): with integer input scikit-learn is expected to convert the
# data itself and emit a DataConversionWarning - confirm for your version.
items_similar = 1 - pairwise_distances(df.values.T.astype(bool), metric='jaccard')
# Re-attach item labels on both axes so rows/columns can be looked up by name.
items_similar = pd.DataFrame(items_similar, columns=items, index=items)
3. 获取每个物品对应的相似物品
# For every item, keep the two most similar *other* items.
topN_items = {}
for i in items_similar.index:
    # Drop the item's own entry so it cannot be returned as its own neighbour.
    _df = items_similar.loc[i].drop([i])
    # Highest similarity first.
    # NOTE(review): some similarities are tied (e.g. ItemA-ItemD and
    # ItemA-ItemE are both 0.4), so which tied item lands in the top 2 may
    # depend on the pandas sort implementation - confirm if exact output matters.
    _df_sorted = _df.sort_values(ascending=False)
    topN_items[i] = list(_df_sorted[:2].index)
print(topN_items)
运行结果
{'ItemA': ['ItemC', 'ItemE'],
 'ItemB': ['ItemE', 'ItemD'],
 'ItemC': ['ItemA', 'ItemB'],
 'ItemD': ['ItemE', 'ItemA'],
 'ItemE': ['ItemB', 'ItemD']}
4. 获取最终的推荐数据
# Build the recommendation set for every user.
rs_results = {}
for user in df.index:
    # Items the user already has: turn zeros into NaN, drop them, and keep
    # the labels of the remaining (non-zero) cells.
    user_item = df.loc[user].replace(0, np.nan).dropna().index
    rs_result = set()
    for item in user_item:
        # Candidates are the top similar items of everything the user has.
        rs_result = rs_result.union(topN_items[item])
    # Never recommend something the user already has (reuses user_item
    # instead of recomputing the same selection a second time).
    rs_result -= set(user_item)
    rs_results[user] = rs_result
print(rs_results)
推荐结果
{'User1': {'ItemB', 'ItemE'},
 'User2': {'ItemB', 'ItemC'},
 'User3': {'ItemB', 'ItemE'},
 'User4': {'ItemA'},
 'User5': {'ItemD'}}
完整代码
import pandas as pd
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np

# --- 1. Test data -----------------------------------------------------------
# Toy user-item matrix: one row per user, one column per item.
# A 1 marks an item as part of that user's history, 0 means it is absent.
users = ['User1', 'User2', 'User3', 'User4', 'User5']
items = ['ItemA', 'ItemB', 'ItemC', 'ItemD', 'ItemE']
datasets = [
    [1, 0, 1, 1, 0],
    [1, 0, 0, 1, 1],
    [1, 0, 1, 0, 0],
    [0, 1, 0, 1, 1],
    [1, 1, 1, 0, 1],
]
df = pd.DataFrame(datasets, columns=items, index=users)
# print(df)

# --- 2. Item-item similarity ------------------------------------------------
# df.values.T puts one item per row; pairwise_distances yields the Jaccard
# *distance*, hence similarity = 1 - distance.
# The matrix is cast to bool first because 'jaccard' is a boolean metric;
# NOTE(review): with integer input scikit-learn is expected to convert the
# data itself and emit a DataConversionWarning - confirm for your version.
items_similar = 1 - pairwise_distances(df.values.T.astype(bool), metric='jaccard')
items_similar = pd.DataFrame(items_similar, columns=items, index=items)

# --- 3. Top-2 similar items per item ----------------------------------------
topN_items = {}
for i in items_similar.index:
    # Drop the item's own entry so it cannot be returned as its own neighbour.
    _df = items_similar.loc[i].drop([i])
    # Highest similarity first.
    # NOTE(review): some similarities are tied (e.g. ItemA-ItemD and
    # ItemA-ItemE are both 0.4), so which tied item lands in the top 2 may
    # depend on the pandas sort implementation - confirm if exact output matters.
    _df_sorted = _df.sort_values(ascending=False)
    topN_items[i] = list(_df_sorted[:2].index)
print(topN_items)

# --- 4. Recommendations per user --------------------------------------------
rs_results = {}
for user in df.index:
    # Items the user already has: labels of the non-zero cells in the row.
    user_item = df.loc[user].replace(0, np.nan).dropna().index
    rs_result = set()
    for item in user_item:
        # Candidates are the top similar items of everything the user has.
        rs_result = rs_result.union(topN_items[item])
    # Never recommend something the user already has (reuses user_item
    # instead of recomputing the same selection a second time).
    rs_result -= set(user_item)
    rs_results[user] = rs_result
print(rs_results)
{'User1': {'ItemB', 'ItemE'},
 'User2': {'ItemB', 'ItemC'},
 'User3': {'ItemB', 'ItemE'},
 'User4': {'ItemA'},
 'User5': {'ItemD'}}
正文完