排序模型 通过召回的操作,我们已经进行了问题规模的缩减,对于每个用户,选择出了N篇文章作为候选集,并基于召回的候选集构建了与用户历史相关的特征,以及用户本身的属性特征、文章本身的属性特征,以及用户与文章之间的特征,下面就是使用机器学习模型对构造好的特征进行学习,然后对测试集进行预测,得到测试集中每个候选文章被用户点击的概率,返回点击概率最大的topk篇文章,作为最终的结果。
排序阶段选择了三个比较有代表性的排序模型,它们分别是:
LGB的排序模型、LGB的分类模型、深度学习的分类模型DIN。得到了最终的排序模型输出的结果之后,还选择了两种比较经典的模型集成的方法:
# Ranking stage (reconstructed from a flattened notebook export).
#
# Pipeline: load the per-(user, candidate-article) ranking features built in
# the feature-engineering stage, train an LGBMRanker and an LGBMClassifier
# (DIN follows later in the file), run a 5-fold user-based CV to produce
# out-of-fold score/rank features for stacking, and write both the stacking
# features and the top-5 submission files.
#
# Ensemble methods used downstream: weighted score fusion, and stacking
# (feeding the per-model outputs into a simple second-level model).

import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
import gc, os
import time
from datetime import datetime
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

# ---------------------------------------------------------------------------
# Paths and mode flags
# ---------------------------------------------------------------------------
data_path = './data_raw'
# FIX: the original used './temp_results' with no trailing separator, so every
# `save_path + 'xxx.csv'` concatenation produced './temp_resultsxxx.csv'.
save_path = './temp_results/'
offline = False  # True: hold out a validation split; False: train on all data

# ---------------------------------------------------------------------------
# Load ranking features
# ---------------------------------------------------------------------------
# click_article_id comes back from CSV as float, so cast it to int.
trn_user_item_feats_df = pd.read_csv(save_path + 'trn_user_item_feats_df.csv')
trn_user_item_feats_df['click_article_id'] = \
    trn_user_item_feats_df['click_article_id'].astype(int)

if offline:
    val_user_item_feats_df = pd.read_csv(save_path + 'val_user_item_feats_df.csv')
    val_user_item_feats_df['click_article_id'] = \
        val_user_item_feats_df['click_article_id'].astype(int)
else:
    val_user_item_feats_df = None

tst_user_item_feats_df = pd.read_csv(save_path + 'tst_user_item_feats_df.csv')
tst_user_item_feats_df['click_article_id'] = \
    tst_user_item_feats_df['click_article_id'].astype(int)

# The test set was given a dummy label during feature building; drop it here.
del tst_user_item_feats_df['label']


def submit(recall_df, topk=5, model_name=None):
    """Write a wide top-k submission CSV from per-(user, article) scores.

    Parameters
    ----------
    recall_df : DataFrame with columns ['user_id', 'click_article_id',
        'pred_score'] (one row per candidate).
    topk : number of articles kept per user (the rename below assumes 5).
    model_name : prefix for the output file name.
    """
    recall_df = recall_df.sort_values(by=['user_id', 'pred_score'])
    recall_df['rank'] = recall_df.groupby(['user_id'])['pred_score'] \
        .rank(ascending=False, method='first')

    # Sanity check: every user must have at least topk candidates.
    tmp = recall_df.groupby('user_id').apply(lambda x: x['rank'].max())
    assert tmp.min() >= topk

    del recall_df['pred_score']
    submit = recall_df[recall_df['rank'] <= topk].set_index(
        ['user_id', 'rank']).unstack(-1).reset_index()
    submit.columns = [int(col) if isinstance(col, int) else col
                      for col in submit.columns.droplevel(0)]
    # Column names required by the submission format.
    submit = submit.rename(columns={'': 'user_id', 1: 'article_1',
                                    2: 'article_2', 3: 'article_3',
                                    4: 'article_4', 5: 'article_5'})

    save_name = save_path + model_name + '_' + \
        datetime.today().strftime('%m-%d') + '.csv'
    submit.to_csv(save_name, index=False, header=True)


def norm_sim(sim_df, weight=0.0):
    """Min-max normalize a score Series into [0, 1], then add `weight`.

    If all scores are equal, every score becomes 1.0 (avoids divide-by-zero).
    """
    min_sim = sim_df.min()
    max_sim = sim_df.max()
    if max_sim == min_sim:
        sim_df = sim_df.apply(lambda sim: 1.0)
    else:
        sim_df = sim_df.apply(
            lambda sim: 1.0 * (sim - min_sim) / (max_sim - min_sim))
    sim_df = sim_df.apply(lambda sim: sim + weight)  # optional offset
    return sim_df


# ---------------------------------------------------------------------------
# LGB ranking model (learning-to-rank)
# ---------------------------------------------------------------------------
trn_user_item_feats_df_rank_model = trn_user_item_feats_df.copy()
if offline:
    val_user_item_feats_df_rank_model = val_user_item_feats_df.copy()
tst_user_item_feats_df_rank_model = tst_user_item_feats_df.copy()

# Feature columns fed to LightGBM.
lgb_cols = ['sim0', 'time_diff0', 'word_diff0', 'sim_max', 'sim_min',
            'sim_sum', 'sim_mean', 'score', 'click_size', 'time_diff_mean',
            'active_level', 'click_environment', 'click_deviceGroup',
            'click_os', 'click_country', 'click_region',
            'click_referrer_type', 'user_time_hob1', 'user_time_hob2',
            'words_hbo', 'category_id', 'created_at_ts', 'words_count']

# LGBMRanker needs group sizes (#candidates per user), which requires the
# rows to be sorted by user first.
trn_user_item_feats_df_rank_model.sort_values(by=['user_id'], inplace=True)
g_train = trn_user_item_feats_df_rank_model.groupby(
    ['user_id'], as_index=False).count()["label"].values
if offline:
    val_user_item_feats_df_rank_model.sort_values(by=['user_id'], inplace=True)
    g_val = val_user_item_feats_df_rank_model.groupby(
        ['user_id'], as_index=False).count()["label"].values

lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31,
                            reg_alpha=0.0, reg_lambda=1, max_depth=-1,
                            n_estimators=100, subsample=0.7,
                            colsample_bytree=0.7, subsample_freq=1,
                            learning_rate=0.01, min_child_weight=50,
                            random_state=2018, n_jobs=16)

if offline:
    lgb_ranker.fit(trn_user_item_feats_df_rank_model[lgb_cols],
                   trn_user_item_feats_df_rank_model['label'], group=g_train,
                   eval_set=[(val_user_item_feats_df_rank_model[lgb_cols],
                              val_user_item_feats_df_rank_model['label'])],
                   eval_group=[g_val], eval_at=[1, 2, 3, 4, 5],
                   eval_metric=['ndcg', ], early_stopping_rounds=50, )
else:
    lgb_ranker.fit(trn_user_item_feats_df[lgb_cols],
                   trn_user_item_feats_df['label'], group=g_train)

# Predict on the test set. best_iteration_ is None without early stopping,
# which LightGBM treats as "use all trees".
tst_user_item_feats_df['pred_score'] = lgb_ranker.predict(
    tst_user_item_feats_df[lgb_cols], num_iteration=lgb_ranker.best_iteration_)

# Keep a copy of these scores for the later model-fusion step.
tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']].to_csv(
    save_path + 'lgb_ranker_score.csv', index=False)

# Re-rank and generate the submission file.
rank_results = tst_user_item_feats_df[
    ['user_id', 'click_article_id', 'pred_score']].copy()
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_ranker')


# ---------------------------------------------------------------------------
# 5-fold cross-validation for the ranker (folds split BY USER, independent of
# the single train/validate run above). The out-of-fold predictions become
# new features for stacking.
# ---------------------------------------------------------------------------
def get_kfold_users(trn_df, n=5):
    """Round-robin split of the unique user ids into n folds."""
    user_ids = trn_df['user_id'].unique()
    user_set = [user_ids[i::n] for i in range(n)]
    return user_set


k_fold = 5
trn_df = trn_user_item_feats_df_rank_model
user_set = get_kfold_users(trn_df, n=k_fold)

score_list = []
score_df = trn_df[['user_id', 'click_article_id', 'label']]
sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])

for n_fold, valid_user in enumerate(user_set):
    train_idx = trn_df[~trn_df['user_id'].isin(valid_user)].copy()
    valid_idx = trn_df[trn_df['user_id'].isin(valid_user)].copy()

    # Per-user group sizes for the ranker objective.
    train_idx.sort_values(by=['user_id'], inplace=True)
    g_train = train_idx.groupby(['user_id'], as_index=False).count()["label"].values
    valid_idx.sort_values(by=['user_id'], inplace=True)
    g_val = valid_idx.groupby(['user_id'], as_index=False).count()["label"].values

    lgb_ranker = lgb.LGBMRanker(boosting_type='gbdt', num_leaves=31,
                                reg_alpha=0.0, reg_lambda=1, max_depth=-1,
                                n_estimators=100, subsample=0.7,
                                colsample_bytree=0.7, subsample_freq=1,
                                learning_rate=0.01, min_child_weight=50,
                                random_state=2018, n_jobs=16)
    lgb_ranker.fit(train_idx[lgb_cols], train_idx['label'], group=g_train,
                   eval_set=[(valid_idx[lgb_cols], valid_idx['label'])],
                   eval_group=[g_val], eval_at=[1, 2, 3, 4, 5],
                   eval_metric=['ndcg', ], early_stopping_rounds=50, )

    # Out-of-fold predictions, min-max normalized per fold.
    valid_idx['pred_score'] = lgb_ranker.predict(
        valid_idx[lgb_cols], num_iteration=lgb_ranker.best_iteration_)
    valid_idx['pred_score'] = valid_idx[['pred_score']].transform(
        lambda x: norm_sim(x))
    valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'] \
        .rank(ascending=False, method='first')
    score_list.append(
        valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])

    # Online mode: accumulate test-set predictions across folds and average.
    # FIX: the original passed best_iteration_ positionally, which landed on
    # the `raw_score` parameter of predict(); it must be num_iteration=.
    if not offline:
        sub_preds += lgb_ranker.predict(
            tst_user_item_feats_df_rank_model[lgb_cols],
            num_iteration=lgb_ranker.best_iteration_)

score_df_ = pd.concat(score_list, axis=0)
score_df = score_df.merge(score_df_, how='left',
                          on=['user_id', 'click_article_id'])
# Save the out-of-fold features for stacking.
score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank',
          'label']].to_csv(save_path + 'trn_lgb_ranker_feats.csv', index=False)

# Test-set features: average of the k fold predictions, then score + rank.
tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold
tst_user_item_feats_df_rank_model['pred_score'] = \
    tst_user_item_feats_df_rank_model['pred_score'].transform(
        lambda x: norm_sim(x))
tst_user_item_feats_df_rank_model['pred_rank'] = \
    tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'] \
    .rank(ascending=False, method='first')
tst_user_item_feats_df_rank_model[
    ['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(
        save_path + 'tst_lgb_ranker_feats.csv', index=False)

# Single-model submission from the CV-averaged scores.
rank_results = tst_user_item_feats_df_rank_model[
    ['user_id', 'click_article_id', 'pred_score']].copy()
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_ranker')


# ---------------------------------------------------------------------------
# LGB classification model (binary click / no-click)
# ---------------------------------------------------------------------------
# NOTE: the original defined this model twice back-to-back; kept once.
lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31,
                                       reg_alpha=0.0, reg_lambda=1,
                                       max_depth=-1, n_estimators=500,
                                       subsample=0.7, colsample_bytree=0.7,
                                       subsample_freq=1, learning_rate=0.01,
                                       min_child_weight=50, random_state=2018,
                                       n_jobs=16, verbose=10)

if offline:
    lgb_Classfication.fit(trn_user_item_feats_df_rank_model[lgb_cols],
                          trn_user_item_feats_df_rank_model['label'],
                          eval_set=[(val_user_item_feats_df_rank_model[lgb_cols],
                                     val_user_item_feats_df_rank_model['label'])],
                          eval_metric=['auc', ], early_stopping_rounds=50, )
else:
    lgb_Classfication.fit(trn_user_item_feats_df_rank_model[lgb_cols],
                          trn_user_item_feats_df_rank_model['label'])

# Probability of the positive class is the ranking score.
tst_user_item_feats_df['pred_score'] = lgb_Classfication.predict_proba(
    tst_user_item_feats_df[lgb_cols])[:, 1]

# Keep a copy of these scores for the later model-fusion step.
tst_user_item_feats_df[['user_id', 'click_article_id', 'pred_score']].to_csv(
    save_path + 'lgb_cls_score.csv', index=False)

rank_results = tst_user_item_feats_df[
    ['user_id', 'click_article_id', 'pred_score']].copy()
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_cls')


# ---------------------------------------------------------------------------
# 5-fold cross-validation for the classifier (same user-based folds)
# ---------------------------------------------------------------------------
k_fold = 5
trn_df = trn_user_item_feats_df_rank_model
user_set = get_kfold_users(trn_df, n=k_fold)

score_list = []
score_df = trn_df[['user_id', 'click_article_id', 'label']]
sub_preds = np.zeros(tst_user_item_feats_df_rank_model.shape[0])

for n_fold, valid_user in enumerate(user_set):
    train_idx = trn_df[~trn_df['user_id'].isin(valid_user)].copy()
    valid_idx = trn_df[trn_df['user_id'].isin(valid_user)].copy()

    lgb_Classfication = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31,
                                           reg_alpha=0.0, reg_lambda=1,
                                           max_depth=-1, n_estimators=100,
                                           subsample=0.7, colsample_bytree=0.7,
                                           subsample_freq=1, learning_rate=0.01,
                                           min_child_weight=50,
                                           random_state=2018, n_jobs=16,
                                           verbose=10)
    lgb_Classfication.fit(train_idx[lgb_cols], train_idx['label'],
                          eval_set=[(valid_idx[lgb_cols], valid_idx['label'])],
                          eval_metric=['auc', ], early_stopping_rounds=50, )

    # Out-of-fold probabilities. No normalization needed here: the classifier
    # already outputs probabilities in [0, 1].
    valid_idx['pred_score'] = lgb_Classfication.predict_proba(
        valid_idx[lgb_cols],
        num_iteration=lgb_Classfication.best_iteration_)[:, 1]
    valid_idx['pred_rank'] = valid_idx.groupby(['user_id'])['pred_score'] \
        .rank(ascending=False, method='first')
    score_list.append(
        valid_idx[['user_id', 'click_article_id', 'pred_score', 'pred_rank']])

    # Online mode: accumulate test-set predictions across folds and average.
    if not offline:
        sub_preds += lgb_Classfication.predict_proba(
            tst_user_item_feats_df_rank_model[lgb_cols],
            num_iteration=lgb_Classfication.best_iteration_)[:, 1]

score_df_ = pd.concat(score_list, axis=0)
score_df = score_df.merge(score_df_, how='left',
                          on=['user_id', 'click_article_id'])
# Save the out-of-fold features for stacking.
score_df[['user_id', 'click_article_id', 'pred_score', 'pred_rank',
          'label']].to_csv(save_path + 'trn_lgb_cls_feats.csv', index=False)

# Test-set features: average of the k fold predictions, then score + rank.
tst_user_item_feats_df_rank_model['pred_score'] = sub_preds / k_fold
tst_user_item_feats_df_rank_model['pred_score'] = \
    tst_user_item_feats_df_rank_model['pred_score'].transform(
        lambda x: norm_sim(x))
tst_user_item_feats_df_rank_model['pred_rank'] = \
    tst_user_item_feats_df_rank_model.groupby(['user_id'])['pred_score'] \
    .rank(ascending=False, method='first')
tst_user_item_feats_df_rank_model[
    ['user_id', 'click_article_id', 'pred_score', 'pred_rank']].to_csv(
        save_path + 'tst_lgb_cls_feats.csv', index=False)

# Re-rank and generate the submission file.
rank_results = tst_user_item_feats_df_rank_model[
    ['user_id', 'click_article_id', 'pred_score']].copy()
rank_results['click_article_id'] = rank_results['click_article_id'].astype(int)
submit(rank_results, topk=5, model_name='lgb_cls')

# ---------------------------------------------------------------------------
# DIN model — the user's historical click sequence built next serves as the
# behavior-list input for DIN (continues below).
# ---------------------------------------------------------------------------
...