Building a BERT-Based Item Collaborative Filtering Recommendation Model
Encoder-based self-attention Transformers are very good at predicting the next token in natural language generation tasks, because they can attend to how important the surrounding tokens are for a given token. Why not apply the same idea to predict the next item in a sequence of items that a given user likes? This kind of recommendation problem can be classified as item-based collaborative filtering.
In item-based collaborative filtering, we try to find relationships or patterns between a given set of items and the preferences of different users. As an example, suppose we have two users, Alice and Bob. Every time Alice comes to our site to buy her monthly groceries, she buys milk, bread, cheese, pasta, and tomato sauce.
Now a completely unknown user, call him Guest, arrives and adds bread to his cart. Having observed that the Guest user added bread, we can suggest that he also add milk or cheese, because we know from other users' purchase histories that these items tend to go together.
We do not care what kind of user this is: their background, where they order from, or their gender. We only look at the set of items each user has bought or liked.
We reformulate the recommendation problem as predicting the next item for a given sequence of items, which makes it similar, or even identical, to next-token prediction and language modeling. We can go one step further by randomly masking items in a given sequence and training an encoder-based Transformer model to predict the masked items. Such a model can use context from both the left and the right to predict an item.
Why would we want to predict in both directions? Let's go back to the example above.
Suppose the Guest user adds cheese to their cart first. If we could only predict in the forward direction, we would recommend tomato sauce or pasta, but buying those items next does not make much sense for this user.
However, if our model is trained to predict masked items anywhere in a given sequence, it can predict in both directions, and for the Guest user we can also suggest adding milk, bread, or eggs.
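To make this concrete, here is a minimal sketch of how one training example looks when an item in the middle of the sequence is masked; the item ids and names here are made up for illustration, while the post's real ids come from the MovieLens preprocessing below:

# toy vocabulary: 0 = [PAD], 1 = [MASK], 2 = milk, 3 = bread, 4 = cheese, 5 = pasta, 6 = tomato sauce
sequence = [2, 3, 4, 5, 6]

# mask the middle item ("cheese"); the model sees items on both sides of the mask
source = [2, 3, 1, 5, 6]    # model input
target = [0, 0, 4, 0, 0]    # loss is computed only at the masked position

# masking the last item instead reduces to ordinary next-item prediction
source_next = [2, 3, 4, 5, 1]
target_next = [0, 0, 0, 0, 6]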
Let's use this idea to build and train a model that predicts the masked items in a given sequence. We will walk through the code below with a few abstractions. The dataset used here is MovieLens-25M.
Data Preprocessing
Just as we assigned ids to characters in our earlier example, we convert each unique movie into an id, starting from 2 up to the number of movies. Id 0 is reserved for [PAD] and id 1 for [MASK].
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import sys

sys.path.append("../")
from constants import *

movies_df = pd.read_csv("../data/ml-25m/ml-25m/movies.csv")
ratings_df = pd.read_csv("../data/ml-25m/ml-25m/ratings.csv")

ratings_df.sort_values(by=["timestamp"], inplace=True)
grouped_ratings = ratings_df.groupby(by="userId").agg(list)

movieIdMapping = {
    k: i + 2
    for i, k in enumerate(sorted(list(ratings_df.movieId.unique())))
}
ratings_df["movieId_mapped"] = ratings_df.movieId.map(movieIdMapping)
movies_df["movieId_mapped"] = movies_df.movieId.map(movieIdMapping)
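The training pipeline later in this post consumes these mapped ids through a Bert4RecDataset class that is imported but not shown here. As a rough, hypothetical sketch of what such a dataset could look like (the truncation length, masking probability, and right-padding behaviour are assumptions, not the author's exact implementation):

import random
import torch as T
from torch.utils.data import Dataset


class SimpleBert4RecDataset(Dataset):
    """Hypothetical, minimal stand-in for the Bert4RecDataset used later."""

    def __init__(self, user_sequences, history=120, mask_prob=0.2,
                 pad_id=0, mask_id=1):
        # user_sequences: list of per-user lists of mapped movie ids, e.g.
        # ratings_df.groupby("userId")["movieId_mapped"].agg(list).tolist()
        self.user_sequences = user_sequences
        self.history = history
        self.mask_prob = mask_prob
        self.pad_id = pad_id
        self.mask_id = mask_id

    def __len__(self):
        return len(self.user_sequences)

    def __getitem__(self, idx):
        seq = self.user_sequences[idx][-self.history:]   # keep the most recent items
        source, target = [], []
        for item in seq:
            if random.random() < self.mask_prob:
                source.append(self.mask_id)              # hide the item from the model
                target.append(item)                      # ...and ask it to recover it
            else:
                source.append(item)
                target.append(self.pad_id)               # ignored by the loss
        padding = [self.pad_id] * (self.history - len(source))  # right padding
        return (T.LongTensor(source + padding),
                T.LongTensor(target + padding))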
Model
import os

import torch as T
import torch.nn as nn
import torch.nn.functional as F

from modules import Encoder, Decoder


class RecommendationTransformer(nn.Module):
    """Sequential recommendation model architecture"""
    def __init__(self,
                 vocab_size,
                 heads=4,
                 layers=6,
                 emb_dim=256,
                 pad_id=0,
                 num_pos=128):
        super().__init__()
        """Recommendation model initializer

        Args:
            vocab_size (int): Number of unique tokens/items
            heads (int, optional): Number of heads in the Multi-Head Self Attention Transformers. Defaults to 4.
            layers (int, optional): Number of Layers. Defaults to 6.
            emb_dim (int, optional): Embedding Dimension. Defaults to 256.
            pad_id (int, optional): Token used to pad tensors. Defaults to 0.
            num_pos (int, optional): Positional Embedding, fixed sequence. Defaults to 128
        """
        self.emb_dim = emb_dim
        self.pad_id = pad_id
        self.num_pos = num_pos
        self.vocab_size = vocab_size
        self.encoder = Encoder(source_vocab_size=vocab_size,
                               emb_dim=emb_dim,
                               layers=layers,
                               heads=heads,
                               dim_model=emb_dim,
                               dim_inner=4 * emb_dim,
                               dim_value=emb_dim,
                               dim_key=emb_dim,
                               pad_id=self.pad_id,
                               num_pos=num_pos)

        self.rec = nn.Linear(emb_dim, vocab_size)

    def forward(self, source, source_mask):
        enc_op = self.encoder(source, source_mask)
        op = self.rec(enc_op)
        return op.permute(0, 2, 1)
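A quick, hypothetical smoke test of the model's output shape, assuming the author's Encoder module accepts a None mask (as the prediction code below does); the vocabulary size and sequence length here are arbitrary:

import torch as T

model = RecommendationTransformer(vocab_size=1000, heads=4, layers=2,
                                  emb_dim=64, pad_id=0, num_pos=16)
source = T.randint(2, 1000, (8, 16))   # batch of 8 sequences of length 16
logits = model(source, None)           # mask handling is left to the Encoder
print(logits.shape)                    # expected: (8, 1000, 16) = (batch, vocab, seq_len)

The permute in forward puts the vocabulary dimension second, which is the layout nn.CrossEntropyLoss expects for sequence targets.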
Training Pipeline
import os
import re
import random

import numpy as np
import pandas as pd
from tqdm import trange, tnrange

import torch as T
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch import cuda

from bert4rec_dataset import Bert4RecDataset
from bert4rec_model import RecommendationModel, RecommendationTransformer
from train_validate import train_step, validate_step

from rich.table import Column, Table
from rich import box
from rich.console import Console

from sklearn.model_selection import train_test_split
from AttentionTransformer.ScheduledOptimizer import ScheduledOptimizer
from AttentionTransformer.utilities import count_model_parameters
from IPython.display import clear_output

device = T.device('cuda') if cuda.is_available() else T.device('cpu')


def trainer(data_params,
            model_params,
            loggers,
            optimizer_params=None,
            warmup_steps=False,
            output_dir="./models/",
            modify_last_fc=False,
            validation=5):

    # console instance
    console = loggers.get("CONSOLE")

    # tables
    train_logger = loggers.get("TRAIN_LOGGER")
    valid_logger = loggers.get("VALID_LOGGER")

    # check if output_dir/model_files available; if not create
    if not os.path.exists(output_dir):
        console.log(f"OUTPUT DIRECTORY DOES NOT EXIST. CREATING...")
        os.mkdir(output_dir)
        os.mkdir(os.path.join(output_dir, "model_files"))
        os.mkdir(os.path.join(output_dir, "model_files_initial"))
    else:
        console.log(f"OUTPUT DIRECTORY EXISTS. CHECKING CHILD DIRECTORY...")
        if not os.path.exists(os.path.join(output_dir, "model_files")):
            os.mkdir(os.path.join(output_dir, "model_files"))
            os.mkdir(os.path.join(output_dir, "model_files_initial"))

    # seed
    console.log("SEED WITH: ", model_params.get("SEED"))
    T.manual_seed(model_params["SEED"])
    T.cuda.manual_seed(model_params["SEED"])
    np.random.seed(model_params.get("SEED"))
    random.seed(model_params.get("SEED"))
    T.backends.cudnn.deterministic = True

    # initialize model
    console.log("MODEL PARAMS: ", model_params)
    console.log("INITIALIZING MODEL: ", model_params)
    model = RecommendationTransformer(
        vocab_size=model_params.get("VOCAB_SIZE"),
        heads=model_params.get("heads", 4),
        layers=model_params.get("layers", 6),
        emb_dim=model_params.get("emb_dim", 512),
        pad_id=model_params.get("pad_id", 0),
        num_pos=model_params.get("history", 120))

    if model_params.get("trained"):
        # load the already trained model
        console.log("TRAINED MODEL AVAILABLE. LOADING...")
        model.load_state_dict(
            T.load(model_params.get("trained"))["state_dict"])
        console.log("MODEL LOADED")

    console.log(f'MOVING MODEL TO DEVICE: {device}')

    if modify_last_fc:
        new_word_embedding = nn.Embedding(model_params.get("NEW_VOCAB_SIZE"),
                                          model_params.get("emb_dim"), 0)
        new_word_embedding.weight.requires_grad = False
        console.log(
            f"REQUIRES GRAD for `NEW WORD EMBEDDING` set to {new_word_embedding.weight.requires_grad}"
        )
        new_word_embedding.weight[:model.encoder.word_embedding.weight.size(
            0)] = model.encoder.word_embedding.weight.clone().detach()
        model.encoder.word_embedding = new_word_embedding
        console.log(
            f"WORD EMBEDDING MODIFIED TO `{model.encoder.word_embedding}`")
        model.encoder.word_embedding.weight.requires_grad = True

        new_lin_layer = nn.Linear(model_params.get("emb_dim"),
                                  model_params.get("NEW_VOCAB_SIZE"))
        new_lin_layer.weight.requires_grad = False
        new_lin_layer.weight[:model.lin_op.weight.size(0)] = \
            model.lin_op.weight.clone().detach()
        model.lin_op = new_lin_layer
        model.lin_op.weight.requires_grad = True
        console.log("MODEL LIN OP: ", model.lin_op.out_features)

    model = model.to(device)
    console.log(
        f"TOTAL NUMBER OF MODEL PARAMETERS: {round(count_model_parameters(model)/1e6, 2)} Million"
    )

    optim_name = optimizer_params.get("OPTIM_NAME")
    if optim_name == "SGD":
        optimizer = T.optim.SGD(params=model.parameters(),
                                **optimizer_params.get("PARAMS"))
    elif optim_name == "ADAM":
        optimizer = T.optim.Adam(params=model.parameters(),
                                 **optimizer_params.get("PARAMS"))
    else:
        optimizer = T.optim.SGD(params=model.parameters(),
                                lr=model_params.get("LEARNING_RATE"),
                                momentum=0.8,
                                nesterov=True)

    if warmup_steps:
        optimizer = ScheduledOptimizer(optimizer, 1e-6,
                                       model_params.get("emb_dim"))

    console.log("OPTIMIZER AND MODEL DONE")

    console.log("CONFIGURING DATASET AND DATALOADER")
    console.log("DATA PARAMETERS: ", data_params)

    data = pd.read_csv(data_params.get("path"))
    train_data, valid_data = train_test_split(
        data, test_size=0.25, random_state=model_params.get("SEED"))
    console.log("LEN OF TRAIN DATASET: ", len(train_data))
    console.log("LEN OF VALID DATASET: ", len(valid_data))

    train_dataset = Bert4RecDataset(train_data,
                                    data_params.get("group_by_col"),
                                    data_params.get("data_col"),
                                    data_params.get("train_history", 120),
                                    data_params.get("valid_history", 5),
                                    data_params.get("padding_mode", "right"),
                                    "train",
                                    data_params.get("threshold_column"),
                                    data_params.get("threshold"),
                                    data_params.get("timestamp_col"))
    train_dl = DataLoader(train_dataset,
                          **data_params.get("LOADERS").get("TRAIN"))

    console.save_text(os.path.join(output_dir,
                                   "logs_model_initialization.txt"),
                      clear=True)

    losses = []
    for epoch in tnrange(1, model_params.get("EPOCHS") + 1):
        if epoch % 3 == 0:
            clear_output(wait=True)

        train_loss, train_acc = train_step(model, device, train_dl, optimizer,
                                           warmup_steps,
                                           data_params.get("MASK"),
                                           model_params.get("CLIP"),
                                           data_params.get("chunkify"))
        train_logger.add_row(str(epoch), str(train_loss), str(train_acc))
        console.log(train_logger)

        if epoch == 1:
            console.log(f"Saving Initial Model")
            T.save(
                model,
                os.path.join(output_dir, "model_files_initial",
                             model_params.get("SAVE_NAME")))
            T.save(
                dict(state_dict=model.state_dict(),
                     epoch=epoch,
                     train_loss=train_loss,
                     train_acc=train_acc,
                     optimizer_dict=optimizer._optimizer.state_dict()
                     if warmup_steps else optimizer.state_dict()),
                os.path.join(output_dir, "model_files_initial",
                             model_params.get("SAVE_STATE_DICT_NAME")))

        if epoch > 1 and min(losses) > train_loss:
            console.log("SAVING BEST MODEL AT EPOCH -> ", epoch)
            console.log("LOSS OF BEST MODEL: ", train_loss)
            console.log("ACCURACY OF BEST MODEL: ", train_acc)
            T.save(
                model,
                os.path.join(output_dir, "model_files",
                             model_params.get("SAVE_NAME")))
            T.save(
                dict(state_dict=model.state_dict(),
                     epoch=epoch,
                     train_acc=train_acc,
                     train_loss=train_loss,
                     optimizer_dict=optimizer._optimizer.state_dict()
                     if warmup_steps else optimizer.state_dict()),
                os.path.join(output_dir, "model_files",
                             model_params.get("SAVE_STATE_DICT_NAME")))

        losses.append(train_loss)

        if validation and epoch > 1 and epoch % validation == 0:
            valid_dataset = Bert4RecDataset(
                valid_data, data_params.get("group_by_col"),
                data_params.get("data_col"),
                data_params.get("train_history", 120),
                data_params.get("valid_history", 5),
                data_params.get("padding_mode", "right"), "valid")
            valid_dl = DataLoader(valid_dataset,
                                  **data_params.get("LOADERS").get("VALID"))
            valid_loss, valid_acc = validate_step(model, valid_dl, device,
                                                  data_params.get("MASK"))
            valid_logger.add_row(str(epoch), str(valid_loss), str(valid_acc))
            console.log(valid_logger)
            del valid_dataset, valid_dl
            console.log("VALIDATION DONE AT EPOCH ", epoch)
            console.save_text(os.path.join(output_dir, "logs_training.txt"),
                              clear=True)

    console.save_text(os.path.join(output_dir, "logs_training.txt"),
                      clear=True)
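The train_step and validate_step helpers imported from train_validate are not shown in this post. As a rough sketch of what a masked-item training step could look like, assuming a plain torch optimizer and batches of (masked source, target) pairs like the dataset sketch earlier produces (the argument list is simplified and does not match the author's exact signature, and it ignores the ScheduledOptimizer and chunkify options):

import torch as T
import torch.nn.functional as F


def simple_train_step(model, device, loader, optimizer, clip=2.0):
    """One pass over `loader`; loss is computed only at masked positions."""
    model.train()
    total_loss, total_correct, total_masked = 0.0, 0, 0
    for source, target in loader:
        source, target = source.to(device), target.to(device)
        optimizer.zero_grad()
        logits = model(source, None)                     # (batch, vocab, seq_len)
        # non-masked positions carry target id 0 and are excluded from the loss
        loss = F.cross_entropy(logits, target, ignore_index=0)
        loss.backward()
        T.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        with T.no_grad():
            preds = logits.argmax(dim=1)                 # (batch, seq_len)
            masked = target != 0
            total_correct += (preds[masked] == target[masked]).sum().item()
            total_masked += masked.sum().item()
            total_loss += loss.item()
    return total_loss / len(loader), total_correct / max(total_masked, 1)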
Training
from train_pipeline import trainer
from constants import TRAIN_CONSTANTS

from rich.table import Column, Table
from rich import box
from rich.console import Console

console = Console(record=True)

training_logger = Table(
    Column("Epoch", justify="center"),
    Column("Loss", justify="center"),
    Column("Accuracy", justify="center"),
    title="Training Status",
    pad_edge=False,
    box=box.ASCII,
)

valid_logger = Table(
    Column("Epoch", justify="center"),
    Column("Loss", justify="center"),
    Column("Accuracy", justify="center"),
    title="Validation Status",
    pad_edge=False,
    box=box.ASCII,
)

loggers = dict(CONSOLE=console,
               TRAIN_LOGGER=training_logger,
               VALID_LOGGER=valid_logger)

model_params = dict(
    SEED=3007,
    VOCAB_SIZE=59049,
    heads=4,
    layers=6,
    emb_dim=256,
    pad_id=TRAIN_CONSTANTS.PAD,
    history=TRAIN_CONSTANTS.HISTORY,
    # trained="/content/drive/MyDrive/bert4rec/models/rec-transformer-model-9/model_files/bert4rec-state-dict.pth",
    trained=None,
    LEARNING_RATE=0.1,
    EPOCHS=5000,
    SAVE_NAME="bert4rec.pt",
    SAVE_STATE_DICT_NAME="bert4rec-state-dict.pth",
    CLIP=2
    # NEW_VOCAB_SIZE=59049
)

data_params = dict(
    # path="/content/bert4rec/data/ratings_mapped.csv",
    # path="drive/MyDrive/bert4rec/data/ml-25m/ratings_mapped.csv",
    path="/content/drive/MyDrive/bert4rec/data/ml-25m/ratings_mapped.csv",
    group_by_col="userId",
    data_col="movieId_mapped",
    train_history=TRAIN_CONSTANTS.HISTORY,
    valid_history=5,
    padding_mode="right",
    MASK=TRAIN_CONSTANTS.MASK,
    chunkify=False,
    threshold_column="rating",
    threshold=3.5,
    timestamp_col="timestamp",
    LOADERS=dict(TRAIN=dict(batch_size=512, shuffle=True, num_workers=0),
                 VALID=dict(batch_size=32, shuffle=False, num_workers=0)))

optimizer_params = {
    "OPTIM_NAME": "SGD",
    "PARAMS": {
        "lr": 0.142,
        "momentum": 0.85,
    }
}

output_dir = "/content/drive/MyDrive/bert4rec/models/rec-transformer-model-10/"

trainer(data_params=data_params,
        model_params=model_params,
        loggers=loggers,
        warmup_steps=False,
        output_dir=output_dir,
        modify_last_fc=False,
        validation=False,
        optimizer_params=optimizer_params)
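The TRAIN_CONSTANTS object imported from constants is not shown in the post. Based on how it is used above and in the prediction code below, it presumably looks roughly like the sketch here; apart from PAD = 0, MASK = 1, and VOCAB_SIZE = 59049, which appear in the post, every value (including the path) is an assumption:

# constants.py -- hypothetical reconstruction based on how it is used in this post
class TRAIN_CONSTANTS:
    PAD = 0             # padding token id (stated in the preprocessing section)
    MASK = 1            # mask token id (stated in the preprocessing section)
    VOCAB_SIZE = 59049  # unique movies plus the two special tokens
    HISTORY = 120       # assumed fixed sequence length fed to the model
    HEADS = 4           # assumed, matches model_params above
    LAYERS = 6          # assumed, matches model_params above
    EMB_DIM = 256       # assumed, matches model_params above
    MODEL_PATH = "./models/model_files/bert4rec-state-dict.pth"  # assumed path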
Prediction
import os
import re
import random
from typing import List, Dict, Tuple

import numpy as np
import torch as T
import torch.nn as nn
import torch.nn.functional as F

from bert4rec_model import RecommendationTransformer
from constants import TRAIN_CONSTANTS

T.manual_seed(3007)
T.cuda.manual_seed(3007)


class Recommender:
    """Recommender Object"""
    def __init__(self, model_path: str):
        """Recommender object to predict sequential recommendation

        Args:
            model_path (str): Path to the model
        """
        self.model = RecommendationTransformer(
            vocab_size=TRAIN_CONSTANTS.VOCAB_SIZE,
            heads=TRAIN_CONSTANTS.HEADS,
            layers=TRAIN_CONSTANTS.LAYERS,
            emb_dim=TRAIN_CONSTANTS.EMB_DIM,
            pad_id=TRAIN_CONSTANTS.PAD,
            num_pos=TRAIN_CONSTANTS.HISTORY)

        state_dict = T.load(model_path, map_location="cpu")
        self.model.load_state_dict(state_dict["state_dict"])
        self.model.eval()
        self.max_length = 25

    def predict(self, inp_tnsr: T.LongTensor, mode="post"):
        """Predict and return next or prev item in the sequence based on the mode

        Args:
            inp_tnsr (T.LongTensor): Input Tensor of items in the sequence
            mode (str, optional): Predict the start or end item based on the mode. Defaults to "post".

        Returns:
            int: Item ID
        """
        with T.no_grad():
            op = self.model(inp_tnsr.unsqueeze(0), None)
            _, pred = op.max(1)
            if mode == "post":
                pred = pred.flatten().tolist()[-1]
            elif mode == "pre":
                pred = pred.flatten().tolist()[0]
            else:
                pred = pred.flatten().tolist()[-1]
        return pred

    def recommendPre(self, sequence: List[int], num_recs: int = 5):
        """Predict item at start

        Args:
            sequence (List[int]): Input list of items
            num_recs (int, optional): Total number of items to predict. Defaults to 5.

        Returns:
            Tuple: Returns the sequence and history if more predictions than max length
        """
        history = []
        predict_hist = 0
        while predict_hist < num_recs:
            if len(sequence) > TRAIN_CONSTANTS.HISTORY - 1:
                history.extend(sequence)
                sequence = sequence[:TRAIN_CONSTANTS.HISTORY - 1]
            inp_seq = T.LongTensor(sequence)
            inp_tnsr = T.ones((inp_seq.size(0) + 1), dtype=T.long)
            inp_tnsr[1:] = inp_seq
            pred = self.predict(inp_tnsr, mode="pre")
            sequence = [pred] + sequence
            predict_hist += 1
        return sequence, history

    def recommendPost(self, sequence: List[int], num_recs: int = 5):
        """Predict item at end

        Args:
            sequence (List[int]): Input list of items
            num_recs (int, optional): Total number of items to predict. Defaults to 5.

        Returns:
            Tuple: Returns the sequence and history if more predictions than max length
        """
        history = []
        predict_hist = 0
        while predict_hist < num_recs:
            if len(sequence) > TRAIN_CONSTANTS.HISTORY - 1:
                history.extend(sequence)
                sequence = sequence[::-1][:TRAIN_CONSTANTS.HISTORY - 1][::-1]
            inp_seq = T.LongTensor(sequence)
            inp_tnsr = T.ones((inp_seq.size(0) + 1), dtype=T.long)
            inp_tnsr[:inp_seq.size(0)] = inp_seq
            pred = self.predict(inp_tnsr)
            sequence.append(pred)
            predict_hist += 1
        return sequence, history

    def recommendSequential(self, sequence: List[int], num_recs: int = 5):
        """Predicts both start and end items randomly

        Args:
            sequence (List[int]): Input list of items
            num_recs (int, optional): Total number of items to predict. Defaults to 5.

        Returns:
            Tuple: Returns the sequence and history (empty always)
        """
        assert num_recs < (
            self.max_length / 2
        ) - 1, f"Can only recommend: {num_recs < (self.max_length / 2) - 1} with sequential recommendation"
        history = []
        predict_hist = 0
        while predict_hist < num_recs:
            if bool(random.choice([0, 1])):
                sequence, hist = self.recommendPost(sequence, 1)
                if len(hist) > 0:
                    history.extend(hist)
            else:
                sequence, hist = self.recommendPre(sequence, 1)
                if len(hist) > 0:
                    history.extend(hist)
            predict_hist += 1
        return sequence, []

    def cleanHistory(self, history: List[int]):
        """History might have multiple repetitions, we clean the history and maintain the sequence

        Args:
            history (List[int]): Predicted item ids

        Returns:
            List[int]: Returns cleaned item ids
        """
        history = history[::-1]
        history = [
            h for ix, h in enumerate(history) if h not in history[ix + 1:]
        ]
        return history[::-1]

    def recommend(self,
                  sequence: List[int],
                  num_recs: int = 5,
                  mode: str = "post"):
        """Recommend Items

        Args:
            sequence (List[int]): Input list of items
            num_recs (int, optional): Total number of items to predict. Defaults to 5.
            mode (str, optional): Predict start or end items or creates a random sequence around the input sequence. Defaults to "post".

        Returns:
            List[int]: Recommended items
        """
        if mode == "post":
            seq, hist = self.recommendPost(sequence, num_recs)
        elif mode == "pre":
            seq, hist = self.recommendPre(sequence, num_recs)
        else:
            seq, hist = self.recommendSequential(sequence, num_recs)
        hist = self.cleanHistory(hist)
        if len(hist) > 0 and len(hist) > len(seq):
            return hist
        return seq


if __name__ == "__main__":
    rec_obj = Recommender(TRAIN_CONSTANTS.MODEL_PATH)
    rec = rec_obj.recommend(sequence=[2, 3], num_recs=10)
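The ids returned by recommend are the mapped ids created during preprocessing, so to read the results as movie titles we have to invert that mapping. A small hedged usage sketch, assuming the preprocessing objects (movies_df with its movieId_mapped column) and the Recommender instance are available in the same session:

# reverse lookup from mapped id back to movie title
id_to_title = dict(zip(movies_df["movieId_mapped"], movies_df["title"]))

recommended = rec_obj.recommend(sequence=[2, 3], num_recs=10)
titles = [id_to_title.get(i, "<unknown>") for i in recommended if i > 1]  # skip [PAD]/[MASK]
print(titles)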
Conclusion
In the code above we saw how a Transformer model, a staple of the NLP field, can be used to build an item-based collaborative filtering model, and how to train it from scratch.
https://avoid.overfit.cn/post/bd856efdfdb24f5e9d8f2b095a6f8b11
Author: Vatsal Saglani